---
title: Selection of Features
author: Julian Faraway
output:
  html_document:
    toc: true
    theme: cosmo
---

```{r global_options, include=FALSE}
library(knitr)
opts_chunk$set(comment=NA, fig.path='/tmp/Figs/', warning=FALSE, message=FALSE)
```

```{r echo=FALSE}
paste("Created:",date())
```

We use only the Random Forest and eliminate terms with the aim of improving prediction accuracy. As the name suggests, the Random Forest generates trees randomly, so repeating these calculations will probably give a slightly different result. We could avoid this reproducibility issue by setting the random number seed or (better) by increasing the number of trees generated, at the cost of a substantially longer computation. However, it may be better to accept that this procedure does not produce an exact, definitive ordering of the features: close neighbours may perform about as well as each other, and with a slightly different dataset the ordering might come out slightly differently. Even so, we can be confident that the model-based features are generally superior to the sample-based ones. So when we consider developing new features for new problems, we will tend to do better if we take a modelling rather than a sample-based approach.

See [featlcdb.Rmd](newfeat.Rmd) for how the data [feat.rda](feat.rda) was prepared for this analysis.

```{r}
load("feat.rda")
source("funcs.R")
require(randomForest)
require(ggplot2)  # used for the classification rate plots below
```

Set up the predictors that we will use throughout. Note that monotone transformations of the variables make no difference to a partitioning method (a small illustration on toy data follows below). I have not used data_num, as this is the number of observations in the lightcurve file (it might be cheating to use this). I have also excluded the location variables (ra, dec, longitude, latitude) because it is difficult to use the non-variable objects with these variables, and it is questionable whether it makes any sense to use them anyway.
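As a brief aside, here is a minimal sketch of the two points above, using the built-in `iris` data rather than our features (it is not part of the analysis proper): fixing the seed makes the forest reproducible, and a monotone transformation of a predictor leaves the induced partitions, and hence the predictions, unchanged.

```{r}
# Toy illustration: log() is monotone, so with the same seed the forest
# partitions the data identically at every split and the predictions agree.
set.seed(1)
f1 <- randomForest(Species ~ Sepal.Length + Sepal.Width, data=iris)
iris2 <- transform(iris, Sepal.Length=log(Sepal.Length))
set.seed(1)
f2 <- randomForest(Species ~ Sepal.Length + Sepal.Width, data=iris2)
identical(predict(f1), predict(f2))  # expected TRUE
```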
```{r}
preds <- c("shov","maxdiff","dscore","totvar","quadvar","famp","fslope","outl",
           "gscore","lsd","gtvar","rm.amplitude","rm.beyond1std","rm.fpr20",
           "rm.fpr35","rm.fpr50","rm.fpr80","rm.maxslope","rm.mad","rm.medbuf",
           "rm.pairslope","rm.peramp","rm.pdfp","rm.skew","rm.kurtosis",
           "rm.std","rm.rcorbor")
npreds <- length(preds)
```

## All types

Apply the random forest to all predictors and compute the training and test classification rates. Note that `predict(fmod)` with no new data returns the out-of-bag predictions, so the "train" rate is an out-of-bag estimate rather than a resubstitution rate.

```{r}
tpredform <- paste(preds,collapse="+")
tallform <- as.formula(paste("type ~",tpredform))
fmod <- randomForest(tallform, data=trains)
pv <- predict(fmod)
cm <- xtabs( ~ pv + trains$type)
nomrate <- sum(diag(cm))/sum(cm)
pv <- predict(fmod, newdata=tests)
cm <- xtabs( ~ pv + tests$type)
truerate <- sum(diag(cm))/sum(cm)
rr <- c(nomrate,truerate)
names(rr) <- c("train","test")
rr
```

Look at the importance of each predictor:

```{r}
importance(fmod)
```

Drop predictors sequentially, removing the least important one and refitting at each step:

```{r}
crm <- matrix(NA,npreds,2)
vpreds <- preds
vnames <- rep("",npreds)
for(i in 1:(npreds-1)){
  vimp <- importance(fmod)
  vari <- which.min(vimp)
  vpreds <- vpreds[-vari]
  vnames[i] <- rownames(vimp)[vari]
  tpredform <- paste(vpreds,collapse="+")
  tallform <- as.formula(paste("type ~",tpredform))
  fmod <- randomForest(tallform, data=trains)
  pv <- predict(fmod)
  cm <- xtabs( ~ pv + trains$type)
  nomrate <- sum(diag(cm))/sum(cm)
  pv <- predict(fmod, newdata=tests)
  cm <- xtabs( ~ pv + tests$type)
  truerate <- sum(diag(cm))/sum(cm)
  crm[i,] <- c(nomrate,truerate)
}
# Baseline with no predictors at all: classify every object as 'nv'
nomrate <- mean(trains$type == 'nv')
truerate <- mean(tests$type == 'nv')
crm[npreds,] <- c(nomrate,truerate)
vnames[npreds] <- vpreds
dimnames(crm) <- list(vnames,c("train","test"))
crm
```

Each row gives the rates after the named predictor has been dropped; the last row, labelled with the final surviving predictor, corresponds to dropping that one too, i.e. the baseline of classifying every object as `nv`.

Plot the rates (solid=train, dashed=test):

```{r vs}
crm <- data.frame(crm)
p <- ggplot(aes(x=1:nrow(crm),y=train),data=crm)+ylab("Classification rate")+xlab("")
p <- p + geom_line()
p <- p + scale_x_continuous(breaks=1:nrow(crm),labels=row.names(crm))
p <- p + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
               axis.text=element_text(size=rel(1.2)),
               axis.title=element_text(size=rel(1.2)))
p <- p + geom_line(aes(y=test),linetype="dashed")
p
```
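Given the run-to-run variation in the importance ordering noted at the outset, one optional cross-check (not part of the analysis above) is to average the importance scores over several refits of the full model; a minimal sketch:

```{r}
# Hypothetical check: average the Gini importance over a few refits of the
# full model to damp the randomness in the ordering.
fullform <- as.formula(paste("type ~", paste(preds, collapse="+")))
imps <- replicate(5, importance(randomForest(fullform, data=trains))[,1])
sort(rowMeans(imps), decreasing=TRUE)
```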
## Transients vs. non-variables

Construct a binary response: transient (`tr`) vs. non-variable (`nv`):

```{r}
trains$vtype <- ifelse(trains$type == "nv","nv","tr")
trains$vtype <- factor(trains$vtype)
tests$vtype <- ifelse(tests$type == "nv","nv","tr")
tests$vtype <- factor(tests$vtype)
```

```{r}
tpredform <- paste(preds,collapse="+")
tallform <- as.formula(paste("vtype ~",tpredform))
fmod <- randomForest(tallform, data=trains)
pv <- predict(fmod)
cm <- xtabs( ~ pv + trains$vtype)
nomrate <- sum(diag(cm))/sum(cm)
pv <- predict(fmod, newdata=tests)
cm <- xtabs( ~ pv + tests$vtype)
truerate <- sum(diag(cm))/sum(cm)
rr <- c(nomrate,truerate)
names(rr) <- c("train","test")
rr
```

Look at the importance of each predictor:

```{r}
importance(fmod)
```

Drop predictors sequentially according to importance:

```{r}
crm <- matrix(NA,npreds,2)
vpreds <- preds
vnames <- rep("",npreds)
for(i in 1:(npreds-1)){
  vimp <- importance(fmod)
  vari <- which.min(vimp)
  vpreds <- vpreds[-vari]
  vnames[i] <- rownames(vimp)[vari]
  tpredform <- paste(vpreds,collapse="+")
  tallform <- as.formula(paste("vtype ~",tpredform))
  fmod <- randomForest(tallform, data=trains)
  pv <- predict(fmod)
  cm <- xtabs( ~ pv + trains$vtype)
  nomrate <- sum(diag(cm))/sum(cm)
  pv <- predict(fmod, newdata=tests)
  cm <- xtabs( ~ pv + tests$vtype)
  truerate <- sum(diag(cm))/sum(cm)
  crm[i,] <- c(nomrate,truerate)
}
# Baseline with no predictors: classify every object as 'nv'
nomrate <- mean(trains$vtype == 'nv')
truerate <- mean(tests$vtype == 'nv')
crm[npreds,] <- c(nomrate,truerate)
vnames[npreds] <- vpreds
dimnames(crm) <- list(vnames,c("train","test"))
crm
```

Plot the rates (solid=train, dashed=test):

```{r}
crm <- data.frame(crm)
p <- ggplot(aes(x=1:nrow(crm),y=train),data=crm)+ylab("Classification rate")+xlab("")
p <- p + geom_line()
p <- p + scale_x_continuous(breaks=1:nrow(crm),labels=row.names(crm))
p <- p + theme(axis.text.x=element_text(angle=90, hjust=1))
p <- p + geom_line(aes(y=test),linetype="dashed")
p
```

## Transients only

Restrict the response to the transient classes by setting the non-variables to `NA`:

```{r}
trains$ttype <- trains$type
trains$ttype[trains$ttype == "nv"] <- NA
trains$ttype <- factor(trains$ttype)
tests$ttype <- tests$type
tests$ttype[tests$ttype == "nv"] <- NA
tests$ttype <- factor(tests$ttype)
```

Note that `na.omit()` drops the rows with missing values, in particular the non-variables, whose `ttype` is `NA`:

```{r}
tpredform <- paste(preds,collapse="+")
tallform <- as.formula(paste("ttype ~",tpredform))
fmod <- randomForest(tallform, data=na.omit(trains))
pv <- predict(fmod)
jta <- na.omit(trains$ttype)
cm <- xtabs( ~ pv + jta)
nomrate <- sum(diag(cm))/sum(cm)
pv <- predict(fmod, newdata=na.omit(tests))
jtr <- na.omit(tests$ttype)
cm <- xtabs( ~ pv + jtr)
truerate <- sum(diag(cm))/sum(cm)
rr <- c(nomrate,truerate)
names(rr) <- c("train","test")
rr
```

Look at the importance of each predictor:

```{r}
importance(fmod)
```

Drop predictors sequentially according to importance:

```{r}
crm <- matrix(NA,npreds,2)
vpreds <- preds
vnames <- rep("",npreds)
for(i in 1:(npreds-1)){
  vimp <- importance(fmod)
  vari <- which.min(vimp)
  vpreds <- vpreds[-vari]
  vnames[i] <- rownames(vimp)[vari]
  tpredform <- paste(vpreds,collapse="+")
  tallform <- as.formula(paste("ttype ~",tpredform))
  fmod <- randomForest(tallform, data=na.omit(trains))
  pv <- predict(fmod)
  cm <- xtabs( ~ pv + jta)
  nomrate <- sum(diag(cm))/sum(cm)
  pv <- predict(fmod, newdata=na.omit(tests))
  cm <- xtabs( ~ pv + jtr)
  truerate <- sum(diag(cm))/sum(cm)
  crm[i,] <- c(nomrate,truerate)
}
# Baseline with no predictors: classify every transient as 'sn'
nomrate <- mean(jta == 'sn')
truerate <- mean(jtr == 'sn')
crm[npreds,] <- c(nomrate,truerate)
vnames[npreds] <- vpreds
dimnames(crm) <- list(vnames,c("train","test"))
crm
```

Plot the rates (solid=train, dashed=test):

```{r}
crm <- data.frame(crm)
p <- ggplot(aes(x=1:nrow(crm),y=train),data=crm)+ylab("Classification rate")+xlab("")
p <- p + geom_line()
p <- p + scale_x_continuous(breaks=1:nrow(crm),labels=row.names(crm))
p <- p + theme(axis.text.x=element_text(angle=90, hjust=1))
p <- p + geom_line(aes(y=test),linetype="dashed")
p
```
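The overall rate can conceal weak performance on the rarer transient classes, so a possible follow-up (again, not part of the analysis above) is to look at the per-class test rates for the full transient model; a minimal sketch:

```{r}
# Hypothetical follow-up: per-class test classification rates (recall) for
# the transient-only model refitted with all predictors.
fullform <- as.formula(paste("ttype ~", paste(preds, collapse="+")))
ffit <- randomForest(fullform, data=na.omit(trains))
pv <- predict(ffit, newdata=na.omit(tests))
cm <- xtabs( ~ pv + jtr)
diag(cm)/colSums(cm)
```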