---
title: Selection of Features
author: Julian Faraway
output:
  html_document:
    toc: true
    theme: cosmo
---

```{r global_options, include=FALSE}
library(knitr)
opts_chunk$set(comment=NA, fig.path='/tmp/Figs/', warning=FALSE, message=FALSE)
```

```{r echo=FALSE}
paste("Created:",date())
```

We use only the Random Forest and eliminate terms with the aim of improving prediction accuracy. As the name suggests, the Random Forest generates trees randomly, so repeating these calculations will probably give a slightly different result. We could avoid this reproducibility issue by setting the random number seed or (better) by increasing the number of trees generated, at the cost of a substantially longer computation. However, it may be better to accept that this procedure does not produce an exact, definitive ordering of the features: close neighbours may perform about as well as each other, and with a slightly different dataset the ordering might come out slightly differently. Even so, we can be confident that the model-based features are generally superior to the sample-based ones. So when we consider developing new features for new problems, we will tend to do better if we take a modelling rather than a sample-based approach.

See [featlcdb.Rmd](newfeat.Rmd) for how the data [feat.rda](feat.rda) was prepared for this analysis.

```{r}
load("feat.rda")
source("funcs.R")
require(randomForest)
require(ggplot2)  # used for the classification rate plots below
```

Set up the predictors that we will use throughout. Note that monotone transformations of the variables make no difference to a partitioning method (a small illustration on toy data follows below). I have not used data_num, as this is the number of observations in the lightcurve file (it might be cheating to use this). I have also excluded the location variables (ra, dec, longitude, latitude) because it is difficult to use the non-variable objects with these variables, and it is questionable whether it makes any sense to use them anyway.
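As a brief aside, here is a minimal sketch of the two points above, using the built-in `iris` data rather than our features (it is not part of the analysis proper): fixing the seed makes the forest reproducible, and a monotone transformation of a predictor leaves the induced partitions, and hence the predictions, unchanged.

```{r}
# Toy illustration: log() is monotone, so with the same seed the forest
# partitions the data identically at every split and the predictions agree.
set.seed(1)
f1 <- randomForest(Species ~ Sepal.Length + Sepal.Width, data=iris)
iris2 <- transform(iris, Sepal.Length=log(Sepal.Length))
set.seed(1)
f2 <- randomForest(Species ~ Sepal.Length + Sepal.Width, data=iris2)
identical(predict(f1), predict(f2))  # expected TRUE
```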
```{r}
preds <- c("shov","maxdiff","dscore","totvar","quadvar","famp","fslope","outl",
           "gscore","lsd","gtvar","rm.amplitude","rm.beyond1std","rm.fpr20",
           "rm.fpr35","rm.fpr50","rm.fpr80","rm.maxslope","rm.mad","rm.medbuf",
           "rm.pairslope","rm.peramp","rm.pdfp","rm.skew","rm.kurtosis",
           "rm.std","rm.rcorbor")
npreds <- length(preds)
```

## All types

Apply the random forest to all predictors and compute the training and test classification rates. Note that `predict(fmod)` with no new data returns the out-of-bag predictions, so the "train" rate is an out-of-bag estimate rather than a resubstitution rate.

```{r}
tpredform <- paste(preds,collapse="+")
tallform <- as.formula(paste("type ~",tpredform))
fmod <- randomForest(tallform, data=trains)
pv <- predict(fmod)
cm <- xtabs( ~ pv + trains$type)
nomrate <- sum(diag(cm))/sum(cm)
pv <- predict(fmod, newdata=tests)
cm <- xtabs( ~ pv + tests$type)
truerate <- sum(diag(cm))/sum(cm)
rr <- c(nomrate,truerate)
names(rr) <- c("train","test")
rr
```

Look at the importance of each predictor:

```{r}
importance(fmod)
```

Drop predictors sequentially, removing the least important one and refitting at each step:

```{r}
crm <- matrix(NA,npreds,2)
vpreds <- preds
vnames <- rep("",npreds)
for(i in 1:(npreds-1)){
  vimp <- importance(fmod)
  vari <- which.min(vimp)
  vpreds <- vpreds[-vari]
  vnames[i] <- rownames(vimp)[vari]
  tpredform <- paste(vpreds,collapse="+")
  tallform <- as.formula(paste("type ~",tpredform))
  fmod <- randomForest(tallform, data=trains)
  pv <- predict(fmod)
  cm <- xtabs( ~ pv + trains$type)
  nomrate <- sum(diag(cm))/sum(cm)
  pv <- predict(fmod, newdata=tests)
  cm <- xtabs( ~ pv + tests$type)
  truerate <- sum(diag(cm))/sum(cm)
  crm[i,] <- c(nomrate,truerate)
}
# Baseline with no predictors at all: classify every object as 'nv'
nomrate <- mean(trains$type == 'nv')
truerate <- mean(tests$type == 'nv')
crm[npreds,] <- c(nomrate,truerate)
vnames[npreds] <- vpreds
dimnames(crm) <- list(vnames,c("train","test"))
crm
```

Each row gives the rates after the named predictor has been dropped; the last row, labelled with the final surviving predictor, corresponds to dropping that one too, i.e. the baseline of classifying every object as `nv`.

Plot the rates (solid=train, dashed=test):

```{r vs}
crm <- data.frame(crm)
p <- ggplot(aes(x=1:nrow(crm),y=train),data=crm)+ylab("Classification rate")+xlab("")
p <- p + geom_line()
p <- p + scale_x_continuous(breaks=1:nrow(crm),labels=row.names(crm))
p <- p + theme(axis.text.x=element_text(angle=90, hjust=1, vjust=0.5),
               axis.text=element_text(size=rel(1.2)),
               axis.title=element_text(size=rel(1.2)))
p <- p + geom_line(aes(y=test),linetype="dashed")
p
```
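Given the run-to-run variation in the importance ordering noted at the outset, one optional cross-check (not part of the analysis above) is to average the importance scores over several refits of the full model; a minimal sketch:

```{r}
# Hypothetical check: average the Gini importance over a few refits of the
# full model to damp the randomness in the ordering.
fullform <- as.formula(paste("type ~", paste(preds, collapse="+")))
imps <- replicate(5, importance(randomForest(fullform, data=trains))[,1])
sort(rowMeans(imps), decreasing=TRUE)
```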
## Transients vs. non-variables

Construct a binary response: transient (`tr`) vs. non-variable (`nv`):

```{r}
trains$vtype <- ifelse(trains$type == "nv","nv","tr")
trains$vtype <- factor(trains$vtype)
tests$vtype <- ifelse(tests$type == "nv","nv","tr")
tests$vtype <- factor(tests$vtype)
```

```{r}
tpredform <- paste(preds,collapse="+")
tallform <- as.formula(paste("vtype ~",tpredform))
fmod <- randomForest(tallform, data=trains)
pv <- predict(fmod)
cm <- xtabs( ~ pv + trains$vtype)
nomrate <- sum(diag(cm))/sum(cm)
pv <- predict(fmod, newdata=tests)
cm <- xtabs( ~ pv + tests$vtype)
truerate <- sum(diag(cm))/sum(cm)
rr <- c(nomrate,truerate)
names(rr) <- c("train","test")
rr
```

Look at the importance of each predictor:

```{r}
importance(fmod)
```

Drop predictors sequentially according to importance:

```{r}
crm <- matrix(NA,npreds,2)
vpreds <- preds
vnames <- rep("",npreds)
for(i in 1:(npreds-1)){
  vimp <- importance(fmod)
  vari <- which.min(vimp)
  vpreds <- vpreds[-vari]
  vnames[i] <- rownames(vimp)[vari]
  tpredform <- paste(vpreds,collapse="+")
  tallform <- as.formula(paste("vtype ~",tpredform))
  fmod <- randomForest(tallform, data=trains)
  pv <- predict(fmod)
  cm <- xtabs( ~ pv + trains$vtype)
  nomrate <- sum(diag(cm))/sum(cm)
  pv <- predict(fmod, newdata=tests)
  cm <- xtabs( ~ pv + tests$vtype)
  truerate <- sum(diag(cm))/sum(cm)
  crm[i,] <- c(nomrate,truerate)
}
# Baseline with no predictors: classify every object as 'nv'
nomrate <- mean(trains$vtype == 'nv')
truerate <- mean(tests$vtype == 'nv')
crm[npreds,] <- c(nomrate,truerate)
vnames[npreds] <- vpreds
dimnames(crm) <- list(vnames,c("train","test"))
crm
```

Plot the rates (solid=train, dashed=test):

```{r}
crm <- data.frame(crm)
p <- ggplot(aes(x=1:nrow(crm),y=train),data=crm)+ylab("Classification rate")+xlab("")
p <- p + geom_line()
p <- p + scale_x_continuous(breaks=1:nrow(crm),labels=row.names(crm))
p <- p + theme(axis.text.x=element_text(angle=90, hjust=1))
p <- p + geom_line(aes(y=test),linetype="dashed")
p
```

## Transients only

Restrict the response to the transient classes by setting the non-variables to `NA`:

```{r}
trains$ttype <- trains$type
trains$ttype[trains$ttype == "nv"] <- NA
trains$ttype <- factor(trains$ttype)
tests$ttype <- tests$type
tests$ttype[tests$ttype == "nv"] <- NA
tests$ttype <- factor(tests$ttype)
```

Note that `na.omit()` drops the rows with missing values, in particular the non-variables, whose `ttype` is `NA`:

```{r}
tpredform <- paste(preds,collapse="+")
tallform <- as.formula(paste("ttype ~",tpredform))
fmod <- randomForest(tallform, data=na.omit(trains))
pv <- predict(fmod)
jta <- na.omit(trains$ttype)
cm <- xtabs( ~ pv + jta)
nomrate <- sum(diag(cm))/sum(cm)
pv <- predict(fmod, newdata=na.omit(tests))
jtr <- na.omit(tests$ttype)
cm <- xtabs( ~ pv + jtr)
truerate <- sum(diag(cm))/sum(cm)
rr <- c(nomrate,truerate)
names(rr) <- c("train","test")
rr
```

Look at the importance of each predictor:

```{r}
importance(fmod)
```

Drop predictors sequentially according to importance:

```{r}
crm <- matrix(NA,npreds,2)
vpreds <- preds
vnames <- rep("",npreds)
for(i in 1:(npreds-1)){
  vimp <- importance(fmod)
  vari <- which.min(vimp)
  vpreds <- vpreds[-vari]
  vnames[i] <- rownames(vimp)[vari]
  tpredform <- paste(vpreds,collapse="+")
  tallform <- as.formula(paste("ttype ~",tpredform))
  fmod <- randomForest(tallform, data=na.omit(trains))
  pv <- predict(fmod)
  cm <- xtabs( ~ pv + jta)
  nomrate <- sum(diag(cm))/sum(cm)
  pv <- predict(fmod, newdata=na.omit(tests))
  cm <- xtabs( ~ pv + jtr)
  truerate <- sum(diag(cm))/sum(cm)
  crm[i,] <- c(nomrate,truerate)
}
# Baseline with no predictors: classify every transient as 'sn'
nomrate <- mean(jta == 'sn')
truerate <- mean(jtr == 'sn')
crm[npreds,] <- c(nomrate,truerate)
vnames[npreds] <- vpreds
dimnames(crm) <- list(vnames,c("train","test"))
crm
```

Plot the rates (solid=train, dashed=test):

```{r}
crm <- data.frame(crm)
p <- ggplot(aes(x=1:nrow(crm),y=train),data=crm)+ylab("Classification rate")+xlab("")
p <- p + geom_line()
p <- p + scale_x_continuous(breaks=1:nrow(crm),labels=row.names(crm))
p <- p + theme(axis.text.x=element_text(angle=90, hjust=1))
p <- p + geom_line(aes(y=test),linetype="dashed")
p
```
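The overall rate can conceal weak performance on the rarer transient classes, so a possible follow-up (again, not part of the analysis above) is to look at the per-class test rates for the full transient model; a minimal sketch:

```{r}
# Hypothetical follow-up: per-class test classification rates (recall) for
# the transient-only model refitted with all predictors.
fullform <- as.formula(paste("ttype ~", paste(preds, collapse="+")))
ffit <- randomForest(fullform, data=na.omit(trains))
pv <- predict(ffit, newdata=na.omit(tests))
cm <- xtabs( ~ pv + jtr)
diag(cm)/colSums(cm)
```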