Contents cited from Coursera class Data Analysis from Johns Hopkins University by Jeff Leek. The dataset that Jeff was working with comes from the package kernlab (kernal based machine learning lab).

library(kernlab); data(spam); set.seed(3435);

trainIndicator = rbinom(4601, size=1, prob=0.5)

table(trainIndicator)

0 1

2314 2287

# table command here is called: Cross Tabulation and Table Creation which comes very handy

1. Look at the training set with the commands: names(data), head(data), table(data$col)

2. Plot

plot(log10(trainSpam$capitalAve+1) ~ trainSpam$type)

# you can also use pairs command here

plot(log10(trainSpam[,1:4]+1))

plot(hclust(dist(t(log10(trainSpam[, 1:57]+1)))))

# the code below demonstrate a basic process for statistical prediction/modeling

trainSpam$numType <- as.numeric(trainSpam$type) – 1

costFunction <- function(x,y) {sum(x!=(y>0.5))}

cvError = rep(NA, 55)

library(boot)

for(i in 1:55){

lmFormula = as.formula(paste(“numType~”, names(trainSpam)[i], sep=””))

glmFit = glm(lmFormula, family=”binomial”, data=trainSpam)

cvError[i] <- cv.glm(trainSpam, glmFit, costFunction, 2)$delta[2]

}

# measure of uncertainty

predictionModel <- glm(numType ~ charDollar, family =”binomial”, data=trainSpam)

predictionTest <- predict(predictionModel, testSpam)

predictedSpam <- rep(“nonspam”, dim(testSpam)[1])

predictedSpam[predictionModel$fitted > 0.5] = “spam”

table(predictedSpam, testSpam$type)

– predictedSpam nonspam spam

– nonspam 1348 398

– spam 81 481

(We can see the spam classifier we built only using the dollar sign did a pretty good job for non spam emails but for spam emails, the output turned out to be half and half)

And the error rate is about 22%