20 Rewriting R code in C++

20.1 Getting started with C++

Q1: With the basics of C++ in hand, it’s now a great time to practice by reading and writing some simple C++ functions. For each of the following functions, read the code and figure out what the corresponding base R function is. You might not understand every part of the code yet, but you should be able to figure out the basics of what the function does.

double f1(NumericVector x) {
  int n = x.size();
  double y = 0;
  
  for(int i = 0; i < n; ++i) {
    y += x[i] / n;
  }
  return y;
}

NumericVector f2(NumericVector x) {
  int n = x.size();
  NumericVector out(n);
  
  out[0] = x[0];
  for(int i = 1; i < n; ++i) {
    out[i] = out[i - 1] + x[i];
  }
  return out;
}

bool f3(LogicalVector x) {
  int n = x.size();
  
  for(int i = 0; i < n; ++i) {
    if (x[i]) return true;
  }
  return false;
}

int f4(Function pred, List x) {
  int n = x.size();
  
  for(int i = 0; i < n; ++i) {
    LogicalVector res = pred(x[i]);
    if (res[0]) return i + 1;
  }
  return 0;
}

NumericVector f5(NumericVector x, NumericVector y) {
  int n = std::max(x.size(), y.size());
  NumericVector x1 = rep_len(x, n);
  NumericVector y1 = rep_len(y, n);
  
  NumericVector out(n);
  
  for (int i = 0; i < n; ++i) {
    out[i] = std::min(x1[i], y1[i]);
  }
  
  return out;
}

A: The code above corresponds to the following base R functions:

f1: mean()
f2: cumsum()
f3: any()
f4: Position()
f5: pmin()

Q2: To practice your function writing skills, convert the following functions into C++. For now, assume the inputs have no missing values.

all().
cumprod(), cummin(), cummax().
diff(). Start by assuming lag 1, and then generalise for lag n.
range().
var(). Read about the approaches you can take on Wikipedia. Whenever implementing a numerical algorithm, it’s always good to check what is already known about the problem.

A: Let’s port these functions to C++.

all()

bool allC(LogicalVector x) {
  int n = x.size();

  for (int i = 0; i < n; ++i) {
    if (!x[i]) return false;
  }
  return true;
}

cumprod(), cummin(), cummax().

NumericVector cumprodC(NumericVector x) {
  int n = x.size();
  NumericVector out(n);

  out[0] = x[0];
  for (int i = 1; i < n; ++i) {
    out[i]  = out[i - 1] * x[i];
  }
  return out;
}

NumericVector cumminC(NumericVector x) {
  int n = x.size();
  NumericVector out(n);

  out[0] = x[0];
  for (int i = 1; i < n; ++i) {
    out[i]  = std::min(out[i - 1], x[i]);
  }
  return out;
}

NumericVector cummaxC(NumericVector x) {
  int n = x.size();
  NumericVector out(n);

  out[0] = x[0];
  for (int i = 1; i < n; ++i) {
    out[i]  = std::max(out[i - 1], x[i]);
  }
  return out;
}

diff() (Start by assuming lag 1, and then generalise for lag n.)

NumericVector diffC(NumericVector x) {
  int n = x.size();
  NumericVector out(n - 1);

  for (int i = 1; i < n; i++) {
    out[i - 1] = x[i] - x[i - 1];
  }
  return out ;
}

NumericVector difflagC(NumericVector x, int lag = 1) {
  int n = x.size();

  if (lag >= n) stop("`lag` must be less than `length(x)`.");

  NumericVector out(n - lag);

  for (int i = lag; i < n; i++) {
    out[i - lag] = x[i] - x[i - lag];
  }
  return out;
}

range()

NumericVector rangeC(NumericVector x) {
  double omin = x[0], omax = x[0];
  int n = x.size();

  if (n == 0) stop("`length(x)` must be greater than 0.");

  for (int i = 1; i < n; i++) {
    omin = std::min(x[i], omin);
    omax = std::max(x[i], omax);
  }

  NumericVector out(2);
  out[0] = omin;
  out[1] = omax;
  return out;
}

var()

double varC(NumericVector x) {
  int n = x.size();

  if (n < 2) {
    return NA_REAL;
  }

  double mx = 0;
  for (int i = 0; i < n; ++i) {
    mx += x[i] / n;
  }

  double out = 0;
  for (int i = 0; i < n; ++i) {
    out += pow(x[i] - mx, 2);
  }

  return out / (n - 1);
}

20.2 Missing values

Q1: Rewrite any of the functions from the first exercise to deal with missing values. If na.rm is true, ignore the missing values. If na.rm is false, return a missing value if the input contains any missing values. Some good functions to practice with are min(), max(), range(), mean(), and var().

A: For this exercise we start with minC() and extend it so it can deal with missing values. We introduce an na_rm argument to make minC() aware of NAs. In case x contains exclusively NA values minC() should return Inf for na_rm = TRUE. For the return values vector data types are used to avoid irregular type conversions.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector minC(NumericVector x, bool na_rm = false) {
  int n = x.size();
  NumericVector out = NumericVector::create(R_PosInf);
  
  if (na_rm) {
    for (int i = 0; i < n; ++i) {
      if (x[i] == NA_REAL) {
        continue;
      }
      if (x[i] < out[0]) {
        out[0] = x[i];
      }
    }
  } else {
    for (int i = 0; i < n; ++i) {
      if (NumericVector::is_na(x[i])) {
        out[0] = NA_REAL;
        return out;
      }
      if (x[i] < out[0]) {
        out[0] = x[i];
      }
    }
  }
  
  return out;
}

minC(c(2:4, NA))
#> [1] NA
minC(c(2:4, NA), na_rm = TRUE)
#> [1] 2
minC(c(NA, NA), na_rm = TRUE)
#> [1] Inf

We also extend anyC() so it can deal with missing values. Please note that this (again) introduces some code duplication. This could be avoided by moving the check for missing values to the inner loop at the expense of a slight decrease of performance. Here we use LogicalVector as return type. If we would use bool instead, the C++ NA_LOGICAL would be converted into R’s logical TRUE.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
LogicalVector anyC(LogicalVector x, bool na_rm = false) {
  int n = x.size();
  LogicalVector out = LogicalVector::create(false);

  if (na_rm == false) {
    for (int i = 0; i < n; ++i) {
      if (LogicalVector::is_na(x[i])) {
        out[0] = NA_LOGICAL;
        return out;
      } else {
        if (x[i]) {
          out[0] = true;
        }
      }
    }
  }

  if (na_rm) {
    for (int i = 0; i < n; ++i) {
      if (LogicalVector::is_na(x[i])) {
        continue;
      }
      if (x[i]) {
        out[0] = true;
        return out;
      }
    }
  }
  
  return out;
}

anyC(c(NA, TRUE))  # any(c(NA, TRUE)) would return TRUE in this case
#> [1] NA
anyC(c(NA, TRUE), na_rm = TRUE)
#> [1] TRUE

Q2: Rewrite cumsum() and diff() so they can handle missing values. Note that these functions have slightly more complicated behaviour.

A: Our NA-aware cumsumC() function will return a vector of the same length as x. By default (na_rm = FALSE) all values following the first NA input value will be set to NA, because they depend on the unknown missing value. In case of na_rm = FALSE the NA values are treated like zeros.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector cumsumC(NumericVector x, bool na_rm = false) {
  int n = x.size();
  NumericVector out(n);
  LogicalVector is_missing = is_na(x);
  
  if (!na_rm) {
    out[0] = x[0];
    for (int i = 1; i < n; ++i) {
      if (is_missing[i - 1]) {
        out[i] = NA_REAL;
      } else{
        out[i] = out[i - 1] + x[i];
      }
    }
  }
  
  if (na_rm) {
    if (is_missing[0]) {
      out[0] = 0;
    } else {
      out[0] = x[0];
    } 
    for (int i = 1; i < n; ++i) {
      if (is_missing[i]) {
        out[i] = out[i-1] + 0;
      } else {
        out[i] = out[i-1] + x[i];
      } 
    }
  }
  
  return out;
}

cumsumC(c(1, NA, 2, 4))
#> [1]  1 NA NA NA
cumsumC(c(1, NA, 2, 4), na_rm = TRUE)
#> [1] 1 1 3 7

The diffC() implementation will return an NA vector of length length(x) - lag, if the input vector contains a missing value. In case of na_rm = TRUE, the function will return an NA for every difference with at least one NA as input.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector diffC(NumericVector x, int lag = 1,
                    bool na_rm = false) {
  int n = x.size();
  
  if (lag >= n) stop("`lag` must be less than `length(x)`.");
  
  NumericVector out(n - lag);
  
  for (int i = lag; i < n; i++) {
    if (NumericVector::is_na(x[i]) ||
        NumericVector::is_na(x[i - lag])) {
      if (!na_rm) {
        return rep(NumericVector::create(NA_REAL), n - lag);
      }
      out[i - lag] = NA_REAL;
      continue;
    }
    out[i - lag] = x[i] - x[i - lag];
  }
  
  return out;
}

diffC(c(1, 3, NA, 10))
#> [1] NA NA NA
diffC(c(1, 3, NA, 10), na_rm = TRUE)
#> [1] 2 NA NA

20.3 Standard Template Library

To practice using the STL algorithms and data structures, implement the following using R functions in C++, using the hints provided:

Q1: median.default() using partial_sort.

A: The median is computed differently for even or odd vectors, which we allow for in the function below.

#include <algorithm>
#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
double medianC(NumericVector x) {
  int n = x.size();

  if (n % 2 == 0) {
    std::partial_sort (x.begin(), x.begin() + n / 2 + 1, x.end());
    return (x[n / 2 - 1] + x[n / 2]) / 2;
  } else {
    std::partial_sort (x.begin(), x.begin() + (n + 1) / 2, x.end());
    return x[(n + 1) / 2 - 1];
  }
}

Q2: %in% using unordered_set and the find() or count() methods.

A: We use the find() method and loop through the unordered_set until we find a match or have scanned the entire set.

#include <Rcpp.h>
#include <unordered_set>
using namespace Rcpp;

// [[Rcpp::export]]
LogicalVector inC(CharacterVector x, CharacterVector table) {
  std::unordered_set<String> seen;
  seen.insert(table.begin(), table.end());
  
  int n = x.size();
  LogicalVector out(n);
  for (int i = 0; i < n; ++i) {
    out[i] = seen.find(x[i]) != seen.end();
  }
  
  return out;
}

Q3: unique() using an unordered_set (challenge: do it in one line!).

A: The insert()-method will return if an equivalent element already exists. If a new element is inserted, we will add it to the (unique) return vector of our function.

#include <Rcpp.h>
#include <unordered_set>
using namespace Rcpp;

// [[Rcpp::export]]
NumericVector uniqueC(NumericVector x) {
  std::unordered_set<int> seen;
  int n = x.size();
  
  std::vector<double> out;
  for (int i = 0; i < n; ++i) {
    if (seen.insert(x[i]).second) out.push_back(x[i]);
  }
  
  return wrap(out);
}


// As a one-liner
// [[Rcpp::export]]
std::unordered_set<double> uniqueCC(NumericVector x) {
  return std::unordered_set<double>(x.begin(), x.end());
}

Q4: min() using std::min(), or max() using std::max().

A: We will implement min() by iterating over the vector and recursively comparing each element to the current minimum value.

#include <Rcpp.h>
using namespace Rcpp;

// [[Rcpp::export]]
double minC(NumericVector x) {
  int n = x.size();
  double out = x[0];
  
  for (int i = 0; i < n; i++) {
    out = std::min(out, x[i]);
  }
  
  return out;
}

Q5: which.min() using min_element, or which.max() using max_element.

A: To implement which.min(), we will first locate the min_element and then compute the distance() to it (starting from the beginning of the vector).

#include <Rcpp.h>
#include <algorithm>
#include <iterator>
using namespace Rcpp;

// [[Rcpp::export]]
double which_minC(NumericVector x) {
  int out = std::distance(
    x.begin(), std::min_element(x.begin(), x.end())
  );

  return out + 1;
}

Q6: setdiff(), union(), and intersect() for integers using sorted ranges and set_union, set_intersection and set_difference.

A: The structure of the three functions will be very similar.

We first sort both input vectors. Then we apply the respective set_union, set_intersection or set_difference function. After that, the result will be between the iterators tmp.begin() and out_end. To retrieve the result, we loop once through the range between tmp.begin() and out_end in the last part of each function.

The set operations in base R will discard duplicated values in the arguments. We achieve a similar behaviour by introducing a deduplication step, which omits values that match their predecessor. For the symmetric set functions unionC and intersectC this step is implemented for the output vector. For setdiffC the deduplication is applied to the first input vector.

#include <Rcpp.h>
#include <unordered_set>
#include <algorithm>
using namespace Rcpp;

// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
IntegerVector unionC(IntegerVector x, IntegerVector y) {
  int nx = x.size();
  int ny = y.size();
  
  IntegerVector tmp(nx + ny);
  
  std::sort(x.begin(), x.end()); // unique
  std::sort(y.begin(), y.end());
  
  IntegerVector::iterator out_end = std::set_union(
    x.begin(), x.end(), y.begin(), y.end(), tmp.begin()
  );
  
  int prev_value = 0;
  IntegerVector out;
  for (IntegerVector::iterator it = tmp.begin();
       it != out_end; ++it) {
    if ((it != tmp.begin())  && (prev_value == *it)) continue;
    
    out.push_back(*it);
    
    prev_value = *it;
  }
  
  return out;
}

// [[Rcpp::export]]
IntegerVector intersectC(IntegerVector x, IntegerVector y) {
  int nx = x.size();
  int ny = y.size();
  
  IntegerVector tmp(std::min(nx, ny));
  
  std::sort(x.begin(), x.end());
  std::sort(y.begin(), y.end());
  
  IntegerVector::iterator out_end = std::set_intersection(
    x.begin(), x.end(), y.begin(), y.end(), tmp.begin()
  );
  
  int prev_value = 0;  
  IntegerVector out;
  for (IntegerVector::iterator it = tmp.begin();
       it != out_end; ++it) {
    if ((it != tmp.begin()) && (prev_value == *it)) continue;
    
    out.push_back(*it);
    
    prev_value = *it;
  }
  
  return out;
}

// [[Rcpp::export]]
IntegerVector setdiffC(IntegerVector x, IntegerVector y) {
  int nx = x.size();
  int ny = y.size();
  
  IntegerVector tmp(nx);
  
  std::sort(x.begin(), x.end());
  
  int prev_value = 0;
  IntegerVector x_dedup;
  for (IntegerVector::iterator it = x.begin();
       it != x.end(); ++it) {
    if ((it != x.begin()) && (prev_value == *it)) continue;
    
    x_dedup.push_back(*it);
    
    prev_value = *it;
  }
  
  std::sort(y.begin(), y.end());
  
  IntegerVector::iterator out_end = std::set_difference(
    x_dedup.begin(), x_dedup.end(), y.begin(), y.end(), tmp.begin()
  );
  
  IntegerVector out;
  for (IntegerVector::iterator it = tmp.begin();
       it != out_end; ++it) {
    out.push_back(*it);
  }
  
  return out;
}

Let’s verify, that these functions work as intended.

# input vectors include duplicates
x <- c(1, 2, 3, 3, 3)
y <- c(3, 3, 2, 5)

union(x, y)
#> [1] 1 2 3 5
unionC(x, y)
#> [1] 1 2 3 5

intersect(x, y)
#> [1] 2 3
intersectC(x, y)
#> [1] 2 3

setdiff(x, y)
#> [1] 1
setdiffC(x, y)
#> [1] 1

19 Improving performance