cpp11 icon indicating copy to clipboard operation
cpp11 copied to clipboard

Improving string vector performance for push_back and subscript assignment

Open traversc opened this issue 1 year ago • 0 comments
trafficstars

When performing subscript assignment of a std::string to writable::strings there's first a conversion to the r_string class which includes a protection. However, that protection shouldn't be necessary if the result is immediately assigned and there is no translation.

Here's a benchmark compared to Rcpp, where I am generating random strings and assigning them to a pre-allocated vector.

microbenchmark(
  rcpp = assign_rcpp(n=1e5, -1),   # Assign random strings to Rcpp::CharacterVector
  cpp11 = assign_cpp11(n=1e5, -1), # Assign random strings to cpp11::writable::strings
  times = 20, setup = gc(full=TRUE))

Unit: milliseconds
  expr       min        lq      mean   median        uq       max neval
  rcpp  35.09063  36.53801  37.74236  37.3688  39.03084  41.29021    20
 cpp11 145.24590 148.24958 154.64878 151.7362 158.87924 177.13784    20

(The performance difference depends a lot on platform, but there's a 4x difference on a mac laptop.)

One possibility is to add some sort of overload or specialization for std::string for subscript assignment and push_back to short circuit the conversion to r_string.

Here's an example for push_back (https://github.com/traversc/cpp11/blob/main/inst/include/cpp11/strings.hpp#L133)

microbenchmark::microbenchmark(
  grow_strings_cpp11 = grow_strings_cpp11(1e5, -1),           # push_back 1e5 random strings, existing implementation
  grow_strings_fast_cpp11 = grow_strings_fast_cpp11(1e5, -1), # push_back_fast implementation
  grow_strings_manual = grow_strings_manual(1e5, -1),         # manual re-implementation using only SEXP
  times = 20, setup = gc(full=TRUE))

Unit: milliseconds
                    expr       min        lq      mean    median        uq       max neval
      grow_strings_cpp11 148.06594 152.42953 158.78716 156.06482 163.46553 183.88168    20
 grow_strings_fast_cpp11  38.22086  39.39721  40.47245  40.07249  41.43170  44.69328    20
     grow_strings_manual  38.65436  40.02137  41.00465  41.10390  41.81826  46.18455    20

assign_rcpp / assign_cpp11 benchmark functions

#include <random>
std::random_device rd;
std::mt19937 gen(rd());

int random_int(int min, int max) {
  std::uniform_int_distribution<int> dist(min, max);
  return dist(gen);
}

std::string random_string() {
  std::string s(10, '\0');
  for (size_t i = 0; i < 10; i++) {
    s[i] = random_int(0, 25) + 'a';
  }
  return s;
}

#include <Rcpp.h>
using namespace Rcpp;

# Rcpp
// [[Rcpp::export(rng=false)]]
CharacterVector assign_rcpp(size_t n, int seed) {
  CharacterVector x(n);
  if(seed != -1) gen.seed(seed);
  for (size_t i = 0; i < n; ++i) {
    x[i] = random_string();
  }
  return x;
}

#include "cpp11.hpp"
using namespace cpp11;
namespace writable = cpp11::writable;

# cpp11
[[cpp11::register]]
writable::strings assign_cpp11(size_t n, int seed) {
  if(seed != -1) gen.seed(seed);
  writable::strings x(n);
  for (size_t i = 0; i < n; ++i) {
    x[i] = random_string();
  }
  return x;
}

push_back benchmark functions

#include "cpp11.hpp"
#include <vector>
#include <random>
using namespace cpp11;
namespace writable = cpp11::writable;

std::random_device rd;
std::mt19937 gen(rd());

double random_double() {
  std::uniform_real_distribution<double> dist(0.0, 1.0);
  return dist(gen);
}

int random_int(int min, int max) {
  std::uniform_int_distribution<int> dist(min, max);
  return dist(gen);
}

std::string random_string() {
  std::string s(10, '\0');
  for (size_t i = 0; i < 10; i++) {
    s[i] = random_int(0, 25) + 'a';
  }
  return s;
}

[[cpp11::register]]
writable::strings grow_strings_cpp11(size_t n, int seed) {
  if(seed != -1) gen.seed(seed);
  writable::strings x;
  for (size_t i = 0; i < n; ++i) {
    x.push_back(random_string());
  }
  return x;
}

[[cpp11::register]]
writable::strings grow_strings_fast_cpp11(size_t n, int seed) {
  if(seed != -1) gen.seed(seed);
  writable::strings x;
  for (size_t i = 0; i < n; ++i) {
    x.push_back_fast(random_string());
  }
  return x;
}

[[cpp11::register]]
SEXP grow_strings_manual(size_t n, int seed) {
  if(seed != -1) gen.seed(seed);
  SEXP data_ = PROTECT(Rf_allocVector(STRSXP, 0));
  size_t size_ = 0;
  size_t capacity_ = 0;
  for (size_t i = 0; i < n; ++i) {
    if(size_ == capacity_) {
      capacity_ = capacity_ == 0 ? 1 : capacity_ * 2;
      SEXP new_data_ = PROTECT(Rf_allocVector(STRSXP, capacity_));
      for(size_t j = 0; j < size_; ++j) {
        SET_STRING_ELT(new_data_, j, STRING_ELT(data_, j));
      }
      UNPROTECT(2);
      data_ = PROTECT(new_data_);
    }
    SET_STRING_ELT(data_, size_++, Rf_mkChar(random_string().c_str()));
  }
  // copy back down to size
  if(size_ < capacity_) {
    SEXP new_data_ = PROTECT(Rf_allocVector(STRSXP, size_));
    for(size_t j = 0; j < size_; ++j) {
      SET_STRING_ELT(new_data_, j, STRING_ELT(data_, j));
    }
    UNPROTECT(2);
    return new_data_;
  } else {
    UNPROTECT(1);
    return data_;
  }
}

traversc avatar Oct 30 '24 23:10 traversc