[1] "Created: Wed Sep  3 16:26:14 2014"

We use only Random Forest. We eliminate terms in order to improve prediction accuracy.

As the name suggests, the Random Forest generates trees randomly. This means that if you repeat these calculations, you will probably get a slightly different result. We could avoid this reproducibility issue by setting the random number seed or (better) increasing the number of trees generated at the cost of a substantially longer computation. However, it may be better to accept that this procedure does not produce an exact definitive ordering of the features because close neighbours may perform about as well as each other and with a slightly different dataset, the ordering might come out slightly differently. Nevertheless, we can be confident that the model-based features are generally superior to the sample-based ones. So when we consider developing new features for new problems, we will tend to do better if we take a modelling rather than sample-based approach.

See featlcdb.Rmd for how the data feat.rda was prepared for this analysis.

# Load the prepared feature data (see featlcdb.Rmd) and helper functions.
load("feat.rda")
source("funcs.R")
# library() errors immediately if the package is missing, unlike require(),
# which only returns FALSE and lets the script fail later.
library(randomForest)
library(ggplot2)  # used for the classification-rate plots below

Set up the predictors that we will use throughout. Note that monotone transformations of the variables would make no difference to a partitioning method. I have not used data_num as this is the number of observations in the lightcurve file (might be cheating to use this?).

I have also excluded location (ra,dec,longitude,latitude) because it is difficult to use the non-variable objects with these variables. Also it is questionable whether it makes any sense to use them anyway.

# Predictors used throughout: the sample/model-based features plus the
# "rm."-prefixed Richards et al. feature set.
base_feats <- c(
  "shov", "maxdiff", "dscore", "totvar", "quadvar", "famp", "fslope",
  "outl", "gscore", "lsd", "gtvar"
)
rm_feats <- paste0("rm.", c(
  "amplitude", "beyond1std", "fpr20", "fpr35", "fpr50", "fpr80",
  "maxslope", "mad", "medbuf", "pairslope", "peramp", "pdfp",
  "skew", "kurtosis", "std", "rcorbor"
))
preds <- c(base_feats, rm_feats)
npreds <- length(preds)

All types

Apply random forest to all predictors and compute the training and testing classification rates:

# Fit a random forest for the multi-class response `type` using every
# predictor, then report accuracy on training (out-of-bag) and test data.
tpredform <- paste(preds, collapse = "+")
tallform <- as.formula(paste("type ~", tpredform))
fmod <- randomForest(tallform, data = trains)
# predict() with no newdata returns the out-of-bag class predictions.
pv <- predict(fmod)
cm <- xtabs(~ pv + trains$type)
nomrate <- sum(diag(cm)) / sum(cm)
# Accuracy on the held-out test set.
pv <- predict(fmod, newdata = tests)
cm <- xtabs(~ pv + tests$type)
truerate <- sum(diag(cm)) / sum(cm)
rr <- c(train = nomrate, test = truerate)
rr
 train   test 
0.8121 0.8024 

Look at the importance of each predictor

# Gini-based variable importance (MeanDecreaseGini) for each predictor
# in the full multi-class forest fitted above.
importance(fmod)
              MeanDecreaseGini
shov                     94.29
maxdiff                  67.61
dscore                   98.67
totvar                  121.73
quadvar                 157.22
famp                    174.92
fslope                  133.99
outl                     59.37
gscore                  121.54
lsd                      66.81
gtvar                    99.16
rm.amplitude             63.30
rm.beyond1std            27.19
rm.fpr20                 29.83
rm.fpr35                 29.75
rm.fpr50                 29.87
rm.fpr80                 28.54
rm.maxslope              47.59
rm.mad                   33.49
rm.medbuf                29.88
rm.pairslope             21.59
rm.peramp                46.28
rm.pdfp                  39.42
rm.skew                  46.81
rm.kurtosis              37.74
rm.std                   80.26
rm.rcorbor               10.23

Drop predictors sequentially according to importance

# Backward elimination on the multi-class model: repeatedly refit the
# forest after dropping the least important remaining predictor,
# recording the training (OOB) and test classification rates.
# NOTE: the first iteration reads `fmod` fitted in the previous chunk.
crm <- matrix(NA, npreds, 2)
vpreds <- preds
vnames <- rep("", npreds)
for (i in seq_len(npreds - 1)) {
  vimp <- importance(fmod)       # one-column matrix: MeanDecreaseGini
  vari <- which.min(vimp[, 1])   # least important remaining predictor
  # Rows of vimp are in the same order as vpreds, so the index matches.
  vpreds <- vpreds[-vari]
  vnames[i] <- rownames(vimp)[vari]
  tpredform <- paste(vpreds, collapse = "+")
  tallform <- as.formula(paste("type ~", tpredform))
  fmod <- randomForest(tallform, data = trains)
  # Nominal (OOB) rate on the training data.
  pv <- predict(fmod)
  cm <- xtabs(~ pv + trains$type)
  nomrate <- sum(diag(cm)) / sum(cm)
  # True rate on the held-out test data.
  pv <- predict(fmod, newdata = tests)
  cm <- xtabs(~ pv + tests$type)
  truerate <- sum(diag(cm)) / sum(cm)
  crm[i, ] <- c(nomrate, truerate)
}
# Final row: the base rate from always predicting the majority class 'nv',
# labelled with the single predictor that was never dropped.
nomrate <- mean(trains$type == 'nv')
truerate <- mean(tests$type == 'nv')
crm[npreds, ] <- c(nomrate, truerate)
vnames[npreds] <- vpreds
dimnames(crm) <- list(vnames, c("train", "test"))
crm
               train   test
rm.rcorbor    0.8105 0.8081
rm.pairslope  0.8133 0.8137
rm.beyond1std 0.8117 0.8056
rm.fpr20      0.8117 0.8089
rm.fpr80      0.8109 0.8065
rm.fpr35      0.8129 0.8073
rm.medbuf     0.8113 0.8105
rm.mad        0.8125 0.8073
rm.fpr50      0.8133 0.8089
rm.kurtosis   0.8198 0.8129
rm.pdfp       0.8121 0.8113
rm.peramp     0.8153 0.8121
rm.maxslope   0.8141 0.8032
rm.skew       0.8073 0.8032
outl          0.8065 0.8048
rm.amplitude  0.7992 0.8000
lsd           0.7972 0.7968
maxdiff       0.7931 0.7968
dscore        0.7903 0.7911
rm.std        0.7867 0.7823
gtvar         0.7794 0.7685
quadvar       0.7798 0.7629
gscore        0.7540 0.7484
shov          0.6815 0.6871
fslope        0.6306 0.6524
totvar        0.4940 0.4806
famp          0.4706 0.4500

Plot the rates: (solid=train, dashed=test)

# Plot the classification rate as predictors are dropped
# (solid = train, dashed = test).
crm <- data.frame(crm)
p <- ggplot(aes(x = seq_len(nrow(crm)), y = train), data = crm) +
  ylab("Classification rate") +
  xlab("") +
  geom_line() +
  scale_x_continuous(breaks = seq_len(nrow(crm)), labels = row.names(crm)) +
  theme(
    axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
    axis.text = element_text(size = rel(1.2)),
    axis.title = element_text(size = rel(1.2))
  ) +
  geom_line(aes(y = test), linetype = "dashed")
p

plot of chunk vs

transients vs. non-variables:

# Collapse the multi-class response to a binary one:
# "nv" (non-variable) vs "tr" (transient).
trains$vtype <- factor(ifelse(trains$type == "nv", "nv", "tr"))
tests$vtype <- factor(ifelse(tests$type == "nv", "nv", "tr"))
# Refit the forest on the binary response using all predictors.
tpredform <- paste(preds, collapse = "+")
tallform <- as.formula(paste("vtype ~", tpredform))
fmod <- randomForest(tallform, data = trains)
# Out-of-bag accuracy on the training set.
pv <- predict(fmod)
cm <- xtabs(~ pv + trains$vtype)
nomrate <- sum(diag(cm)) / sum(cm)
# Accuracy on the held-out test set.
pv <- predict(fmod, newdata = tests)
cm <- xtabs(~ pv + tests$vtype)
truerate <- sum(diag(cm)) / sum(cm)
rr <- c(train = nomrate, test = truerate)
rr
 train   test 
0.9282 0.9194 

Look at the importance of each predictor

# Gini-based variable importance (MeanDecreaseGini) for each predictor
# in the binary (nv vs tr) forest fitted above.
importance(fmod)
              MeanDecreaseGini
shov                    29.288
maxdiff                 16.330
dscore                  95.589
totvar                  65.896
quadvar                186.157
famp                   207.285
fslope                 149.558
outl                    22.457
gscore                 138.073
lsd                     42.311
gtvar                   42.732
rm.amplitude            27.800
rm.beyond1std           10.821
rm.fpr20                11.571
rm.fpr35                11.521
rm.fpr50                12.636
rm.fpr80                13.075
rm.maxslope             21.738
rm.mad                  13.150
rm.medbuf               10.378
rm.pairslope             7.825
rm.peramp               17.798
rm.pdfp                 18.634
rm.skew                 15.014
rm.kurtosis             16.761
rm.std                  28.604
rm.rcorbor               2.108

Drop predictors sequentially according to importance

# Backward elimination on the binary (nv vs tr) model: repeatedly refit
# the forest after dropping the least important remaining predictor,
# recording the training (OOB) and test classification rates.
# NOTE: the first iteration reads `fmod` fitted in the previous chunk.
crm <- matrix(NA, npreds, 2)
vpreds <- preds
vnames <- rep("", npreds)
for (i in seq_len(npreds - 1)) {
  vimp <- importance(fmod)       # one-column matrix: MeanDecreaseGini
  vari <- which.min(vimp[, 1])   # least important remaining predictor
  # Rows of vimp are in the same order as vpreds, so the index matches.
  vpreds <- vpreds[-vari]
  vnames[i] <- rownames(vimp)[vari]
  tpredform <- paste(vpreds, collapse = "+")
  tallform <- as.formula(paste("vtype ~", tpredform))
  fmod <- randomForest(tallform, data = trains)
  # Nominal (OOB) rate on the training data.
  pv <- predict(fmod)
  cm <- xtabs(~ pv + trains$vtype)
  nomrate <- sum(diag(cm)) / sum(cm)
  # True rate on the held-out test data.
  pv <- predict(fmod, newdata = tests)
  cm <- xtabs(~ pv + tests$vtype)
  truerate <- sum(diag(cm)) / sum(cm)
  crm[i, ] <- c(nomrate, truerate)
}
# Final row: the base rate from always predicting the majority class 'nv',
# labelled with the single predictor that was never dropped.
nomrate <- mean(trains$vtype == 'nv')
truerate <- mean(tests$vtype == 'nv')
crm[npreds, ] <- c(nomrate, truerate)
vnames[npreds] <- vpreds
dimnames(crm) <- list(vnames, c("train", "test"))
crm
               train   test
rm.rcorbor    0.9266 0.9210
rm.medbuf     0.9274 0.9210
rm.pairslope  0.9254 0.9194
rm.beyond1std 0.9258 0.9242
rm.fpr20      0.9286 0.9226
rm.fpr35      0.9266 0.9250
rm.fpr80      0.9246 0.9202
rm.mad        0.9262 0.9210
rm.skew       0.9246 0.9194
rm.fpr50      0.9262 0.9218
maxdiff       0.9282 0.9194
rm.pdfp       0.9258 0.9202
rm.kurtosis   0.9254 0.9194
rm.peramp     0.9230 0.9218
rm.maxslope   0.9234 0.9218
rm.amplitude  0.9246 0.9194
rm.std        0.9246 0.9218
outl          0.9206 0.9129
gtvar         0.9177 0.9056
shov          0.9129 0.8984
totvar        0.9089 0.8960
lsd           0.9032 0.8879
dscore        0.8964 0.8774
gscore        0.8786 0.8710
fslope        0.8730 0.8589
quadvar       0.8149 0.8024
famp          0.4706 0.4500

Plot the rates: (solid=train, dashed=test)

# Plot the classification rate as predictors are dropped
# (solid = train, dashed = test).
crm <- data.frame(crm)
p <- ggplot(aes(x = seq_len(nrow(crm)), y = train), data = crm) +
  ylab("Classification rate") +
  xlab("") +
  geom_line() +
  scale_x_continuous(breaks = seq_len(nrow(crm)), labels = row.names(crm)) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  geom_line(aes(y = test), linetype = "dashed")
p