The essence of propensity scoring in pseudo-code is as follows:

  • a distance metric is defined between vectors of arbitrary dimensions
  • for each classified vector c:
    • for each unclassified vector u:
      • measure the distance between u and c
    • retain the smallest distance for u. In a kNN approach the actual specific c would matter but not in this case.

At the end of all this looping you get a list of customers with a score (corresponding to the shortest distance to a classified one).

The typical picture you get out of this looks like the following:

Propensity Spikes.

 

A sharp amount of customers being close to the existing customer and the rest. That is, there always appears a small set of almost-buying ones. Upon plotting the distance versus the amount of customers that you get for that distance you see the following sigmoid curve

Propensity Curve.

which again emphasizes the small set of almost-classifieds and a large distance to the rest.

Rcpp code

This code is really a straightforward implementation of the pseudo-code above. There are technical tricks related to Rcpp but that’s akin to the Rcpp language and not to the algorithm at hand.

include <Rcpp.h>
#include <iostream>
using namespace Rcpp;


// [[Rcpp::export]]
double distance(NumericVector a, NumericVector b){
  double r = 0;
  for (int k=0; k<a.size(); k++) {
    r +=  abs(a[k] - b[k]);
  }
  return r;
}
// [[Rcpp::export]]
SEXP gathered(DataFrame centroids, DataFrame other, double threshold){

  int centroidCount = centroids.size();
  int otherCount = other.size();


  NumericVector a;
  NumericVector b;
  double d;
  IntegerVector indices;
  NumericVector distances;
  for(int y=0; y<otherCount; y++){
    NumericVector currentDistances;
    for(int x=0; x<centroidCount; x++){
      a = centroids[x];
      b = other[y];
      d = distance(a,b);
      if(d<=threshold){
        currentDistances.push_back(d);
      }
    }
    if(currentDistances.size()>0){
      distances.push_back(mean(currentDistances));
      indices.push_back(y);
    }
  }
  List l = List::create(_["Index"]=indices, _["AvgDistance"]=distances);
  DataFrame z(l);
  return z;
}
// [[Rcpp::export]]
SEXP withinDistance(DataFrame centroids, DataFrame other, double threshold){

  int centroidCount = centroids.size();
  int otherCount = other.size();
  List l(centroidCount);

  NumericVector a;
  NumericVector b;
  double d;
  for(int x=0; x<centroidCount; x++){
    IntegerVector collector;
    NumericVector distances;
    for(int y=0; y<otherCount; y++){
      a = centroids[x];
      b = other[y];
      d = distance(a,b);
      if(d<=threshold){
        collector.push_back(y);
        distances.push_back(d);
      }
    }

    l[x] = List::create(_["Index"]=collector, _["Distance"]=distances);
  }
  return l;
}


/*
 * This will measure the distances between the given centroids and and another dataset.
 * The vector should be columns since rows are somewhat difficult to loop over in Rcpp.
 *
 */
// [[Rcpp::export]]
NumericMatrix measure(DataFrame centroids, DataFrame other){

  int centroidCount = centroids.size();
  int otherCount = other.size();
  NumericMatrix m(centroidCount, otherCount);

  NumericVector a;
  NumericVector b;
  for(int x=0; x<centroidCount; x++){
    for(int y=0; y<otherCount; y++){
      a = centroids[x];
      b = other[y];
      m(x, y) = distance(a,b);
    }
  }
  return m;
}

// [[Rcpp::export]]
CharacterVector test(DataFrame frame) {
  GenericVector v = frame[0];
  CharacterVector r(v.size());
  for (int k=0; k<v.size(); k++) {
    String a = v[k];
    r[k] =  a;
  }
  return r;
}

// [[Rcpp::export]]
SEXP  byName( DataFrame df, std::string s ){
  return df[s];
}

// [[Rcpp::export]]
void byrow( DataFrame df ){
  int nCols = df.size();
  int nRows = df.nrows();
  for(int j=0; j< nCols; j++){
    CharacterVector b = df[j];
    std::cout << "Column " << j << std::endl;
    for(int i=0; i< nRows; i++){
      std::string s = as<std::string>(b[i]);
      std::cout << s << " ";
    }
    std::cout << std::endl;
  }
}

// [[Rcpp::export]]
void bycolumn( DataFrame df ){
  int nCols = df.size();
  int nRows = df.nrows();
  List dataframeList(nCols);
  for (int i=0;i<nCols; i++) {
    dataframeList[i] = df[i];
  }

  for(int i=0; i<nRows; i++){
    std::cout << "Row " << i << std::endl;
    for(int j=0; j<nCols; j++){
      CharacterVector v = dataframeList[j];
      std::string s = as<std::string>(v[i]);
      std::cout << s << " ";
    }
    std::cout << std::endl;
  }
}


//[[Rcpp::export]]
DataFrame copier(DataFrame x) {
  int nCols = x.length(); // x.size() would work as well
  List dataframeList(nCols); // every dataframe column corresponds to an item in the list

  for (int i=0;i<nCols; i++) {
    dataframeList[i] = x[i]; // copy column by column
  }
  dataframeList.attr("names") = x.attr("names"); // copy the column names
  DataFrame z(dataframeList); // convert list to DataFrame
  z.attr("row.names") = x.attr("row.names"); // copy the rownames if any
  return z;
}


//[[Rcpp::export]]
NumericVector creater(){

  NumericVector v(10);
  for(int k=0; k<10;k++){
    v[k] = R::runif(10,20);
  }
  return wrap(v);
}

Using it in R

With this Rcpp algorithm one can move on to creating the scores for actual data. You will need a frame containing your existing customers and a frame with the visitor or to-be customers:

 

library(dplyr)
library(ggplot2)
existingCustomers = as.data.frame( frame[which(frame$isCustomer==1),])
potentialCustomers = as.data.frame( frame[which(frame$isCustomer!=1),])
 
# only the feature vectors
existingCustomers = dplyr::select(existingCustomers,  starts_with("F"))
potentialCustomers = dplyr::select(potentialCustomers,  starts_with("F"))
#binarize
existingCustomers = ifelse(existingCustomers > 0,1,0)
potentialCustomers = ifelse(potentialCustomers > 0,1,0)
 
existingCustomers = as.data.frame(t(existingCustomers))
potentialCustomers = as.data.frame(t(potentialCustomers))
 
threshold = 10
stamp = Sys.time()
result = Propensity::gathered(existingCustomers, potentialCustomers, threshold)
Sys.time() - stamp
 
hist(result$AvgDistance, main="Propensity distribution", xlab="distance")
 
cumul = c()
for(k in 1:100){
  x=length(which(result$AvgDistance<=k*threshold/100))
  cumul = c(cumul, round(100.0*x/ncol(potentialCustomers),2))
  #print(paste0("There are ", x, " clients (", round(100.0*x/ncol(potentialCustomers),2),"%)", " less than ", k," metafeatures away."))
}
 
plot(seq(0, length.out = 100, by = threshold/100), cumul, type="l", main="Proportion of potentialCustomers vs distance", ylab="percent", xlab="distance")
 
 
ggplot(result, aes(AvgDistance)) +  geom_freqpoly(bins = 500)+xlab("Average distance") + ylab("Amount")
ggplot(data=data.frame(x=1:100, y=cumul), aes(x=x,y=y)) + geom_line()+xlab("distance") + ylab("percent")+labs(title = "Proportion of potentialCustomers vs distance")

One important note here is that the gathered method uses the transpose of the frame. Simply because looping over columns is much easier in Rcpp than looping over rows. Hence the “t” of the transposition when passing the data.