Let’s say you have data that mixes random noise with genuine signals. This could be, for example, radio signals against white background noise, or visitors to an e-commerce website where a portion are genuine customers and the rest are just randomly browsing. Can one filter the real data out of the noise? The answer is yes, and it rests on the fact that if you have enough data, the portion that follows a statistical distribution becomes ‘visible’. Technically speaking, a distribution shows itself once you have enough samples, and the chi-square (chi2) test detects exactly this. The central limit theorem justifies the underlying normal approximation, under the assumption that you have a lot of data. Whether one can do something similar for a small data set, or for, say, a Poisson distribution, I don’t know.
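
To make the mechanics concrete before diving in, here is a minimal sketch of the test on a made-up 2x2 contingency table: a roughly even split of an attribute across two classes yields a large p-value (no association), while a skewed split yields a tiny one (clear association).

# made-up counts of an attribute's 0/1 values in two classes
chisq.test(data.frame(c1 = c(52, 48), c2 = c(49, 51)))$p.value  # large p-value: no association
chisq.test(data.frame(c1 = c(80, 20), c2 = c(45, 55)))$p.value  # tiny p-value: clear association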

Below you will find a synthetic example of the effectiveness of chi2. We create a mix of noise and signals. In the first example the signals are mixed into the data in bands: three bands are randomly added on top of the pure noise. In the second example the signals are added at random positions rather than in bands. We show that in both cases the chi2 test detects them. It’s both a ‘proof’ and a nice code example that can be reused in projects.

You can find the gist here.


Signals in bands

We create a synthetic frame of data where three bands of signals are mixed randomly:

probabilityInsideCluster = 0.5   # P(value = 1) inside a client's band
probabilityOutsideCluster = 0.2  # P(value = 1) outside the band, for a clustered client
probabilityNoise = 0.3           # P(value = 1) for a pure-noise client
clusterBoundaries = c(10, 25, 56, 81, 151, 293) # three bands: (10,25], (56,81], (151,293]
clientTypeCount = c(0, 0, 0, 0)  # bookkeeping: counts of client types 0 to 3
library(dplyr)
# this creates a frame of clients
makeClientsFrame = function(attribCount = 1000,
                            clientCount = 1000,
                            typeDistribution = c(0.0, 0.3, 0.3, 0.4)) {
  # a stochastic cluster variable
  # if k is null or 0 the noise is uniform
  stoch = function(k, i) {
    if (is.null(k) || k == 0)
    {
      return(ifelse(runif(1) < probabilityNoise, 1, 0))
    }
    if (clusterBoundaries[2 * k - 1] < i &&
        i <= clusterBoundaries[2 * k]) {
      return(ifelse(runif(1) < probabilityInsideCluster, 1, 0))
    }
    return(ifelse(runif(1) < probabilityOutsideCluster, 1, 0))
  }
  # make a single row of values
  makeClient = function(clus) {
    if (is.null(clus) || clus == 0)
      clientTypeCount[1] <<- clientTypeCount[1] + 1
    else
      clientTypeCount[clus + 1] <<- clientTypeCount[clus + 1] + 1 # slot 1 is type 0
    r = sapply(1:attribCount, function(x) {
      stoch(clus, x)
    })
    r = c(r, clus) # keep the type of client at the end
    return(r)
  }
  
  # create the initial frame, one client per column (transposed at the end)
  df = data.frame(makeClient(0))
  addClient = function(clus) {
    df <<- data.frame(df, makeClient(clus))
  }
  
  invisible(replicate(clientCount, addClient(sample(0:3, 1, prob = typeDistribution))))
  df = data.frame(t(df))
  # name the attribute columns; the last column holds the client type
  colnames(df) = c(paste("A", 1:attribCount, sep = ""), "Target")
  return(df)
}
baseFrame = makeClientsFrame()
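
Before testing, a quick (optional) sanity check shows how the client types came out:

# distribution of the generated client types (the Target column)
table(baseFrame$Target)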

The actual chi2 test is as follows:

chi = function(frame, index) {
  colName = paste("A", index, sep = "")
  colIndex = as.numeric(which(colnames(frame) == colName))
  targetIndex = as.numeric(which(colnames(frame) == "Target"))
  q = frame[, c(colIndex, targetIndex)]
  # count the attribute's 0/1 values within each client type
  c1 = count(filter(q, Target == 1), .data[[colName]])$n
  c2 = count(filter(q, Target == 2), .data[[colName]])$n
  c3 = count(filter(q, Target == 3), .data[[colName]])$n
  df = data.frame(c1, c2, c3)
  chisq.test(df)$p.value
}

Note that the chi2 test is implemented with different defaults in Excel, MATLAB, Mathematica and R.
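
For instance, R applies Yates' continuity correction to 2x2 tables by default, which other tools may not; on a made-up 2x2 table you can see the difference by toggling the correct flag:

tbl = data.frame(c1 = c(30, 10), c2 = c(20, 20))
chisq.test(tbl)$p.value                   # with Yates' continuity correction (R's default for 2x2)
chisq.test(tbl, correct = FALSE)$p.value  # without the correction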

If you apply the test to the synthetic data and plot the result

# compute the chi2 p-value for each of the 1000 attributes
chis = sapply(1:1000, function(x) {
  chi(baseFrame, x)
})
plot(1:500, chis[1:500], xlab = "Feature #", ylab = "chi2 p-value")
abline(v = clusterBoundaries, col = "red")

# an extremely small p-value means a strong association with the target
goodchis = ifelse(chis < 0.003, chis, NA)
# at which positions do they sit?
which(!is.na(goodchis))

plot(500:1000, chis[500:1000], xlab = "Feature #", ylab = "chi2 p-value")

(Figure: chi2 filtering of the banded signals; the red lines mark the band boundaries.)

which is massively convincing but at the same time not a surprise.
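
As a quick numerical cross-check (a small sketch reusing the 0.003 threshold from above), one can count how many of the detected attributes actually fall inside one of the three bands:

# fraction of detected attributes that lie inside one of the three bands
detected = which(!is.na(goodchis))
inBand = sapply(detected, function(i) {
  any(clusterBoundaries[c(1, 3, 5)] < i & i <= clusterBoundaries[c(2, 4, 6)])
})
mean(inBand)  # should be close to 1 if the test works as claimed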

Signals everywhere

We can perform the same experiment but with the signals spread across the whole attribute set rather than confined to bands:

options = list(
  attribCount = 1000,          # number of attributes
  clientCount = 15000,         # number of clients
  foldingAttribCount = 50,     # number of significant attributes
  targetCount = 3,             # number of targets or classes
  percentageUnclassified = 50  # percent of clientCount not classified, i.e. Target 0
)

makeRandomPartition = function(options) {
  splitIntervalTo = ceiling((1 - options$percentageUnclassified / 100) * options$clientCount)
  starts = as.vector(sort(sample(
    2:(splitIntervalTo - 1), options$targetCount - 1
  )))
  starts = c(1, starts, splitIntervalTo)
  r = c()
  for (k in 1:(options$targetCount)) {
    r = c(r, rep(k, starts[k + 1] - starts[k]))
  }
  r = c(r, rep(0, options$clientCount - splitIntervalTo + 1))
  return(r)
}
# creates one attrib vector for all clients
makeAttrib = function(target, partition, options) {
  if (is.null(target) || target == 0) {
    sample(0:1, options$clientCount, replace = TRUE)
  } else{
    series = lapply(partition, function(x) {
      if (x == target) {
        sample(0:1, 1, prob = c(0.3, 0.7))
      }
      else{
        sample(0:1, 1)
      }
    })
    return(as.numeric(series))
  }
}

makeRandomFrame = function(options){
  mem = list() # keeps track of attribs contributing to which target
  significantColumns = sample(1:options$attribCount, options$foldingAttribCount)
  partition = makeRandomPartition(options) # the target column
  df = data.frame(matrix(NA, nrow = options$clientCount, ncol = 0))
  for(k in 1:options$attribCount){
    if(is.element(k, significantColumns)) {
      whichTarget = sample(1:options$targetCount,1)
      memName = paste("T", whichTarget, sep="")
      mem[[memName]] = c(mem[[memName]], k) # col k leads to target whichTarget
      mem[["All"]] = c(mem[["All"]], k)
      attrib = makeAttrib(whichTarget, partition, options)
      df = bind_cols(df, as.data.frame(attrib))
    }
    else{
      attrib = makeAttrib(0, partition, options)
      df = bind_cols(df, as.data.frame(attrib))
    }
  }
  df = bind_cols(df, as.data.frame(partition))
  colnames(df) = c(paste("A" , 1:options$attribCount, sep=""), "Target")
  result = list(data = df, contrib = mem)
  return(result)
}
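
A side note on performance: makeAttrib draws one sample per client in an lapply, which is slow for 15000 clients times 1000 attributes. A vectorized sketch with the same 0.7/0.5 probabilities (makeAttribFast is my name for it, not part of the original) does the same in one draw per column:

# vectorized alternative to makeAttrib: identical distribution, one runif call per column
makeAttribFast = function(target, partition, options) {
  if (is.null(target) || target == 0) {
    p = rep(0.5, options$clientCount)           # pure noise: fair coin
  } else {
    p = ifelse(partition == target, 0.7, 0.5)   # biased towards 1 inside the target class
  }
  as.numeric(runif(options$clientCount) < p)
}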

The chi2 test and the result are just as easily extracted:

chi = function(frame, index, options) {
  colName = paste("A", index, sep = "")
  colIndex = as.numeric(which(colnames(frame) == colName))
  targetIndex = as.numeric(which(colnames(frame) == "Target"))
  q = frame[, c(colIndex, targetIndex)]
  df = data.frame(matrix(NA, nrow = 2, ncol = 0))
  # one column of 0/1 counts per target class
  for (k in 1:options$targetCount) {
    count01s = count(filter(q, Target == k), .data[[colName]])$n
    appen = as.data.frame(count01s)
    colnames(appen) = c(paste("T", k, sep = ""))
    df = bind_cols(df, appen)
  }
  chisq.test(df)$p.value
}


frame = makeRandomFrame(options)
frameData = frame$data
contrib = frame$contrib
chis = sapply(1:options$attribCount, function(x) { chi(frameData, x, options) })
# hooray!
plot(1:options$attribCount, chis, xlab = "Feature #", ylab = "chi2 p-value", ylim = c(0, 0.05))
abline(v = contrib[["All"]], col = "red")

(Figure: chi2 detection with signals spread everywhere; the red lines mark the truly significant attributes.)

which is just as convincing as the previous synthetic example.
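
Since makeRandomFrame also returns the ground truth in contrib, the claim can be quantified directly; a small sketch, again borrowing the 0.003 cut-off from the first example:

detected = which(chis < 0.003)
# how many of the truly significant attributes were recovered...
length(intersect(detected, contrib[["All"]]))
# ...and how many false alarms?
length(setdiff(detected, contrib[["All"]]))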