20 Rewriting R code in C++
20.1 Getting started with C++
Q1: With the basics of C++ in hand, it’s now a great time to practice by reading and writing some simple C++ functions. For each of the following functions, read the code and figure out what the corresponding base R function is. You might not understand every part of the code yet, but you should be able to figure out the basics of what the function does.
double f1(NumericVector x) {
int n = x.size();
double y = 0;
for(int i = 0; i < n; ++i) {
+= x[i] / n;
y }
return y;
}
(NumericVector x) {
NumericVector f2int n = x.size();
(n);
NumericVector out
[0] = x[0];
outfor(int i = 1; i < n; ++i) {
[i] = out[i - 1] + x[i];
out}
return out;
}
bool f3(LogicalVector x) {
int n = x.size();
for(int i = 0; i < n; ++i) {
if (x[i]) return true;
}
return false;
}
int f4(Function pred, List x) {
int n = x.size();
for(int i = 0; i < n; ++i) {
= pred(x[i]);
LogicalVector res if (res[0]) return i + 1;
}
return 0;
}
(NumericVector x, NumericVector y) {
NumericVector f5int n = std::max(x.size(), y.size());
= rep_len(x, n);
NumericVector x1 = rep_len(y, n);
NumericVector y1
(n);
NumericVector out
for (int i = 0; i < n; ++i) {
[i] = std::min(x1[i], y1[i]);
out}
return out;
}
A: The code above corresponds to the following base R functions:
- f1:
mean()
- f2:
cumsum()
- f3:
any()
- f4:
Position()
- f5:
pmin()
Q2: To practice your function writing skills, convert the following functions into C++. For now, assume the inputs have no missing values.
diff()
. Start by assuming lag 1, and then generalise for lagn
.var()
. Read about the approaches you can take on Wikipedia. Whenever implementing a numerical algorithm, it’s always good to check what is already known about the problem.
A: Let’s port these functions to C++.
-
bool allC(LogicalVector x) { int n = x.size(); for (int i = 0; i < n; ++i) { if (!x[i]) return false; } return true; }
-
cumprod()
,cummin()
,cummax()
.(NumericVector x) { NumericVector cumprodCint n = x.size(); (n); NumericVector out [0] = x[0]; outfor (int i = 1; i < n; ++i) { [i] = out[i - 1] * x[i]; out} return out; } (NumericVector x) { NumericVector cumminCint n = x.size(); (n); NumericVector out [0] = x[0]; outfor (int i = 1; i < n; ++i) { [i] = std::min(out[i - 1], x[i]); out} return out; } (NumericVector x) { NumericVector cummaxCint n = x.size(); (n); NumericVector out [0] = x[0]; outfor (int i = 1; i < n; ++i) { [i] = std::max(out[i - 1], x[i]); out} return out; }
-
diff()
(Start by assuming lag 1, and then generalise for lagn
.)(NumericVector x) { NumericVector diffCint n = x.size(); (n - 1); NumericVector out for (int i = 1; i < n; i++) { [i - 1] = x[i] - x[i - 1]; out} return out ; } (NumericVector x, int lag = 1) { NumericVector difflagCint n = x.size(); if (lag >= n) stop("`lag` must be less than `length(x)`."); (n - lag); NumericVector out for (int i = lag; i < n; i++) { [i - lag] = x[i] - x[i - lag]; out} return out; }
-
(NumericVector x) { NumericVector rangeCdouble omin = x[0], omax = x[0]; int n = x.size(); if (n == 0) stop("`length(x)` must be greater than 0."); for (int i = 1; i < n; i++) { = std::min(x[i], omin); omin = std::max(x[i], omax); omax } (2); NumericVector out[0] = omin; out[1] = omax; outreturn out; }
-
double varC(NumericVector x) { int n = x.size(); if (n < 2) { return NA_REAL; } double mx = 0; for (int i = 0; i < n; ++i) { += x[i] / n; mx } double out = 0; for (int i = 0; i < n; ++i) { += pow(x[i] - mx, 2); out } return out / (n - 1); }
20.2 Missing values
Q1: Rewrite any of the functions from the first exercise to deal with missing values. If na.rm
is true, ignore the missing values. If na.rm
is false, return a missing value if the input contains any missing values. Some good functions to practice with are min()
, max()
, range()
, mean()
, and var()
.
A: For this exercise we start with minC()
and extend it so it can deal with missing values. We introduce an na_rm
argument to make minC()
aware of NA
s. In case x
contains exclusively NA
values minC()
should return Inf
for na_rm = TRUE
. For the return values vector data types are used to avoid irregular type conversions.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
(NumericVector x, bool na_rm = false) {
NumericVector minCint n = x.size();
= NumericVector::create(R_PosInf);
NumericVector out
if (na_rm) {
for (int i = 0; i < n; ++i) {
if (x[i] == NA_REAL) {
continue;
}
if (x[i] < out[0]) {
[0] = x[i];
out}
}
} else {
for (int i = 0; i < n; ++i) {
if (NumericVector::is_na(x[i])) {
[0] = NA_REAL;
outreturn out;
}
if (x[i] < out[0]) {
[0] = x[i];
out}
}
}
return out;
}
minC(c(2:4, NA))
#> [1] NA
minC(c(2:4, NA), na_rm = TRUE)
#> [1] 2
minC(c(NA, NA), na_rm = TRUE)
#> [1] Inf
We also extend anyC()
so it can deal with missing values. Please note that this (again) introduces some code duplication. This could be avoided by moving the check for missing values to the inner loop at the expense of a slight decrease of performance. Here we use LogicalVector
as return type. If we would use bool
instead, the C++ NA_LOGICAL
would be converted into R’s logical TRUE
.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
(LogicalVector x, bool na_rm = false) {
LogicalVector anyCint n = x.size();
= LogicalVector::create(false);
LogicalVector out
if (na_rm == false) {
for (int i = 0; i < n; ++i) {
if (LogicalVector::is_na(x[i])) {
[0] = NA_LOGICAL;
outreturn out;
} else {
if (x[i]) {
[0] = true;
out}
}
}
}
if (na_rm) {
for (int i = 0; i < n; ++i) {
if (LogicalVector::is_na(x[i])) {
continue;
}
if (x[i]) {
[0] = true;
outreturn out;
}
}
}
return out;
}
anyC(c(NA, TRUE)) # any(c(NA, TRUE)) would return TRUE in this case
#> [1] NA
anyC(c(NA, TRUE), na_rm = TRUE)
#> [1] TRUE
Q2: Rewrite cumsum()
and diff()
so they can handle missing values. Note that these functions have slightly more complicated behaviour.
A: Our NA
-aware cumsumC()
function will return a vector of the same length as x
. By default (na_rm = FALSE
) all values following the first NA
input value will be set to NA
, because they depend on the unknown missing value. In case of na_rm = FALSE
the NA
values are treated like zeros.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
(NumericVector x, bool na_rm = false) {
NumericVector cumsumCint n = x.size();
(n);
NumericVector out= is_na(x);
LogicalVector is_missing
if (!na_rm) {
[0] = x[0];
outfor (int i = 1; i < n; ++i) {
if (is_missing[i - 1]) {
[i] = NA_REAL;
out} else{
[i] = out[i - 1] + x[i];
out}
}
}
if (na_rm) {
if (is_missing[0]) {
[0] = 0;
out} else {
[0] = x[0];
out}
for (int i = 1; i < n; ++i) {
if (is_missing[i]) {
[i] = out[i-1] + 0;
out} else {
[i] = out[i-1] + x[i];
out}
}
}
return out;
}
The diffC()
implementation will return an NA
vector of length length(x) - lag
, if the input vector contains a missing value. In case of na_rm = TRUE
, the function will return an NA
for every difference with at least one NA
as input.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
(NumericVector x, int lag = 1,
NumericVector diffCbool na_rm = false) {
int n = x.size();
if (lag >= n) stop("`lag` must be less than `length(x)`.");
(n - lag);
NumericVector out
for (int i = lag; i < n; i++) {
if (NumericVector::is_na(x[i]) ||
::is_na(x[i - lag])) {
NumericVectorif (!na_rm) {
return rep(NumericVector::create(NA_REAL), n - lag);
}
[i - lag] = NA_REAL;
outcontinue;
}
[i - lag] = x[i] - x[i - lag];
out}
return out;
}
20.3 Standard Template Library
To practice using the STL algorithms and data structures, implement the following using R functions in C++, using the hints provided:
Q1: median.default()
using partial_sort
.
A: The median is computed differently for even or odd vectors, which we allow for in the function below.
#include <algorithm>
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double medianC(NumericVector x) {
int n = x.size();
if (n % 2 == 0) {
std::partial_sort (x.begin(), x.begin() + n / 2 + 1, x.end());
return (x[n / 2 - 1] + x[n / 2]) / 2;
} else {
std::partial_sort (x.begin(), x.begin() + (n + 1) / 2, x.end());
return x[(n + 1) / 2 - 1];
}
}
Q2: %in%
using unordered_set
and the find()
or count()
methods.
A: We use the find()
method and loop through the unordered_set
until we find a match or have scanned the entire set.
#include <Rcpp.h>
#include <unordered_set>
using namespace Rcpp;
// [[Rcpp::export]]
(CharacterVector x, CharacterVector table) {
LogicalVector inCstd::unordered_set<String> seen;
.insert(table.begin(), table.end());
seen
int n = x.size();
(n);
LogicalVector outfor (int i = 0; i < n; ++i) {
[i] = seen.find(x[i]) != seen.end();
out}
return out;
}
Q3: unique()
using an unordered_set
(challenge: do it in one line!).
A: The insert()
-method will return if an equivalent element already exists. If a new element is inserted, we will add it to the (unique) return vector of our function.
#include <Rcpp.h>
#include <unordered_set>
using namespace Rcpp;
// [[Rcpp::export]]
(NumericVector x) {
NumericVector uniqueCstd::unordered_set<int> seen;
int n = x.size();
std::vector<double> out;
for (int i = 0; i < n; ++i) {
if (seen.insert(x[i]).second) out.push_back(x[i]);
}
return wrap(out);
}
// As a one-liner
// [[Rcpp::export]]
std::unordered_set<double> uniqueCC(NumericVector x) {
return std::unordered_set<double>(x.begin(), x.end());
}
Q4: min()
using std::min()
, or max()
using std::max()
.
A: We will implement min()
by iterating over the vector and recursively comparing each element to the current minimum value.
#include <Rcpp.h>
using namespace Rcpp;
// [[Rcpp::export]]
double minC(NumericVector x) {
int n = x.size();
double out = x[0];
for (int i = 0; i < n; i++) {
= std::min(out, x[i]);
out }
return out;
}
Q5: which.min()
using min_element
, or which.max()
using max_element
.
A: To implement which.min()
, we will first locate the min_element
and then compute the distance()
to it (starting from the beginning of the vector).
#include <Rcpp.h>
#include <algorithm>
#include <iterator>
using namespace Rcpp;
// [[Rcpp::export]]
double which_minC(NumericVector x) {
int out = std::distance(
.begin(), std::min_element(x.begin(), x.end())
x);
return out + 1;
}
Q6: setdiff()
, union()
, and intersect()
for integers using sorted ranges and set_union
, set_intersection
and set_difference
.
A: The structure of the three functions will be very similar.
We first sort both input vectors. Then we apply the respective set_union
, set_intersection
or set_difference
function. After that, the result will be between the iterators tmp.begin()
and out_end
. To retrieve the result, we loop once through the range between tmp.begin()
and out_end
in the last part of each function.
The set operations in base R will discard duplicated values in the arguments. We achieve a similar behaviour by introducing a deduplication step, which omits values that match their predecessor. For the symmetric set functions unionC
and intersectC
this step is implemented for the output vector. For setdiffC
the deduplication is applied to the first input vector.
#include <Rcpp.h>
#include <unordered_set>
#include <algorithm>
using namespace Rcpp;
// [[Rcpp::plugins(cpp11)]]
// [[Rcpp::export]]
(IntegerVector x, IntegerVector y) {
IntegerVector unionCint nx = x.size();
int ny = y.size();
(nx + ny);
IntegerVector tmp
std::sort(x.begin(), x.end()); // unique
std::sort(y.begin(), y.end());
::iterator out_end = std::set_union(
IntegerVector.begin(), x.end(), y.begin(), y.end(), tmp.begin()
x);
int prev_value = 0;
;
IntegerVector outfor (IntegerVector::iterator it = tmp.begin();
!= out_end; ++it) {
it if ((it != tmp.begin()) && (prev_value == *it)) continue;
.push_back(*it);
out
= *it;
prev_value }
return out;
}
// [[Rcpp::export]]
(IntegerVector x, IntegerVector y) {
IntegerVector intersectCint nx = x.size();
int ny = y.size();
(std::min(nx, ny));
IntegerVector tmp
std::sort(x.begin(), x.end());
std::sort(y.begin(), y.end());
::iterator out_end = std::set_intersection(
IntegerVector.begin(), x.end(), y.begin(), y.end(), tmp.begin()
x);
int prev_value = 0;
;
IntegerVector outfor (IntegerVector::iterator it = tmp.begin();
!= out_end; ++it) {
it if ((it != tmp.begin()) && (prev_value == *it)) continue;
.push_back(*it);
out
= *it;
prev_value }
return out;
}
// [[Rcpp::export]]
(IntegerVector x, IntegerVector y) {
IntegerVector setdiffCint nx = x.size();
int ny = y.size();
(nx);
IntegerVector tmp
std::sort(x.begin(), x.end());
int prev_value = 0;
;
IntegerVector x_dedupfor (IntegerVector::iterator it = x.begin();
!= x.end(); ++it) {
it if ((it != x.begin()) && (prev_value == *it)) continue;
.push_back(*it);
x_dedup
= *it;
prev_value }
std::sort(y.begin(), y.end());
::iterator out_end = std::set_difference(
IntegerVector.begin(), x_dedup.end(), y.begin(), y.end(), tmp.begin()
x_dedup);
;
IntegerVector outfor (IntegerVector::iterator it = tmp.begin();
!= out_end; ++it) {
it .push_back(*it);
out}
return out;
}
Let’s verify, that these functions work as intended.