[1] "Created: Thu Jun 19 13:57:22 2014"
# Restore the objects stored in lcdb.rda. The code below reads `lcdb` and
# `lcdbi`, which are presumably provided by this file -- TODO confirm.
load("lcdb.rda")
# Pull in helper definitions. `lcstats` and `functran` are used below and are
# presumably defined in funcs.R -- TODO confirm.
source("funcs.R")
# Flag passed as the second argument to lcstats() for every entry of `lcdb`.
GPR <- TRUE
Set the kernel type to squared exponential and set the inverse width:
# Kernel type and width value, handed to lcstats() as its third and fourth
# arguments.
# NOTE(review): the accompanying text says "squared exponential" while the
# value set here is "exponential" -- confirm how lcstats() interprets it.
kerntype <- "exponential"
wvec <- 2e-4
Measures can be classified into groups.
We have not used all of these measures for classification. Some, such as nobs, were excluded because they would distort classification performance, as explained in the article. Others were unsuccessful experiments: they did not incrementally improve classification performance. We have not reported these in the article but keep them here for future reference.
# Settings forwarded by name to lcstats() for every object.
# NOTE(review): the units and meaning of these constants are not visible here
# (presumably survey dates and a magnitude limit) -- confirm against funcs.R.
firstdate <- 53464
daterange <- 2764
detection.limit <- 20.5

uids <- lcdbi$id
nuids <- length(uids)

# Collect the per-object results in preallocated containers and bind them once
# after the loop; growing a matrix with rbind() on every iteration copies the
# whole matrix each time.
stat_rows <- vector("list", nuids)
types <- character(nuids)
ids <- character(nuids)
keep <- logical(nuids)

# seq_len() iterates zero times when there are no objects (1:0 would not).
for (i in seq_len(nuids)) {
  olc <- lcdb[[i]]
  type <- as.character(lcdbi$type[i])
  id <- as.character(lcdbi$id[i])
  aa <- lcstats(
    olc, GPR, kerntype, wvec,
    daterange = daterange,
    firstdate = firstdate,
    detection.limit = detection.limit
  )
  # Objects whose first returned statistic is NA are left out entirely.
  if (is.na(aa[1])) next
  stat_rows[[i]] <- aa
  types[i] <- type
  ids[i] <- id
  keep[i] <- TRUE
}

# One row per retained object, in the original order.
statmat <- do.call(rbind, stat_rows[keep])
types <- types[keep]
ids <- ids[keep]
# Name the 41 measure columns of the statistics matrix.
colnames(statmat) <- c(
  "meds", "iqr", "shov", "maxdiff", "dscore", "wander", "moveloc", "nobs",
  "totvar", "quadvar", "famp", "fslope", "trend",
  "outl", "skewres", "std", "shapwilk", "lsd", "gscore", "mdev", "gtvar",
  "rm.amplitude", "rm.mean",
  "rm.stddev", "rm.beyond1std", "rm.fpr20", "rm.fpr35", "rm.fpr50",
  "rm.fpr65", "rm.fpr80", "rm.lintrend", "rm.maxslope", "rm.mad",
  "rm.medbuf", "rm.pairslope", "rm.peramp", "rm.pdfp", "rm.skew",
  "rm.kurtosis", "rm.std", "rm.rcorbor"
)

# Combine the measures with the class label and identifier, one row per object.
# stringsAsFactors is set explicitly so that `type` and `id` are factors on
# every R version (the default changed to FALSE in R 4.0.0); the summary and
# the per-type box plots below depend on `type` being a factor.
cmdb <- data.frame(
  statmat,
  type = types,
  id = ids,
  row.names = ids,
  stringsAsFactors = TRUE
)

# Replace missing or infinite lsd values by one less than the smallest finite
# lsd value, so they sit just below the observed range.
cmdb$lsd[!is.finite(cmdb$lsd)] <- min(cmdb$lsd[is.finite(cmdb$lsd)]) - 1
# Infinite fslope values are set to 1.
cmdb$fslope[is.infinite(cmdb$fslope)] <- 1
summary(cmdb)
meds iqr shov maxdiff
Min. :10.3 Min. :0.010 Min. :0.010 Min. : 0.040
1st Qu.:17.4 1st Qu.:0.220 1st Qu.:0.136 1st Qu.: 0.729
Median :18.8 Median :0.430 Median :0.297 Median : 1.190
Mean :18.3 Mean :0.552 Mean :0.327 Mean : 1.547
3rd Qu.:19.8 3rd Qu.:0.685 3rd Qu.:0.452 3rd Qu.: 1.930
Max. :22.3 Max. :5.188 Max. :3.685 Max. :11.780
dscore wander moveloc nobs
Min. :0.000 Min. :1.88e-05 Min. :0.000095 Min. : 5
1st Qu.:0.129 1st Qu.:1.14e-04 1st Qu.:0.000462 1st Qu.: 21
Median :0.215 Median :2.14e-04 Median :0.000725 Median : 57
Mean :0.208 Mean :3.13e-04 Mean :0.001196 Mean :103
3rd Qu.:0.284 3rd Qu.:3.85e-04 3rd Qu.:0.001644 3rd Qu.:167
Max. :0.387 Max. :2.95e-03 Max. :0.005611 Max. :641
totvar quadvar famp fslope
Min. :0.0003 Min. :0.00000 Min. : 0.032 Min. :0.0025
1st Qu.:0.0054 1st Qu.:0.00008 1st Qu.: 0.393 1st Qu.:0.0295
Median :0.0120 Median :0.00040 Median : 0.897 Median :0.0675
Mean :0.0234 Mean :0.00319 Mean : 1.448 Mean :0.1300
3rd Qu.:0.0287 3rd Qu.:0.00247 3rd Qu.: 2.042 3rd Qu.:0.1817
Max. :0.3257 Max. :0.17403 Max. :14.429 Max. :1.7435
trend outl skewres std
Min. :-1.17e-03 Min. : 1.04 Min. :-6.872 Min. :0.0085
1st Qu.:-5.30e-05 1st Qu.: 2.19 1st Qu.:-0.426 1st Qu.:0.1512
Median :-1.00e-05 Median : 2.88 Median :-0.010 Median :0.2704
Mean :-2.14e-05 Mean : 3.22 Mean : 0.045 Mean :0.3169
3rd Qu.: 1.67e-05 3rd Qu.: 3.81 3rd Qu.: 0.433 3rd Qu.:0.4108
Max. : 9.03e-04 Max. :15.63 Max. :12.692 Max. :2.4160
shapwilk lsd gscore mdev
Min. :0.245 Min. :-5.47 Min. :0.000 Min. :0.003
1st Qu.:0.892 1st Qu.:-2.31 1st Qu.:0.116 1st Qu.:0.278
Median :0.949 Median :-1.59 Median :0.266 Median :0.480
Mean :0.913 Mean :-1.74 Mean :0.227 Mean :0.582
3rd Qu.:0.976 3rd Qu.:-1.10 3rd Qu.:0.332 3rd Qu.:0.775
Max. :0.998 Max. : 1.13 Max. :0.399 Max. :4.782
gtvar rm.amplitude rm.mean rm.stddev
Min. : 0.00 Min. :0.035 Min. :0.00e+00 Min. :0.00e+00
1st Qu.: 1.58 1st Qu.:0.500 1st Qu.:1.00e-08 1st Qu.:0.00e+00
Median : 4.46 Median :0.809 Median :4.00e-08 Median :1.00e-08
Mean : 10.81 Mean :1.020 Mean :1.01e-06 Mean :4.00e-07
3rd Qu.: 15.44 3rd Qu.:1.355 3rd Qu.:1.30e-07 3rd Qu.:6.00e-08
Max. :154.34 Max. :6.225 Max. :8.57e-05 Max. :1.24e-04
rm.beyond1std rm.fpr20 rm.fpr35 rm.fpr50
Min. :0.000 Min. :0.0000 Min. :0.0001 Min. :0.0002
1st Qu.:0.200 1st Qu.:0.0978 1st Qu.:0.1968 1st Qu.:0.3157
Median :0.273 Median :0.1392 Median :0.2581 Median :0.3949
Mean :0.266 Mean :0.1543 Mean :0.2715 Mean :0.4011
3rd Qu.:0.333 3rd Qu.:0.1852 3rd Qu.:0.3261 3rd Qu.:0.4788
Max. :0.667 Max. :0.9532 Max. :0.9728 Max. :0.9827
rm.fpr65 rm.fpr80 rm.lintrend rm.maxslope
Min. :0.0003 Min. :0.0006 Min. :-0.1758 Min. : 0.0
1st Qu.:0.4669 1st Qu.:0.7067 1st Qu.:-0.0001 1st Qu.: 62.3
Median :0.5606 Median :0.7851 Median : 0.0000 Median : 125.0
Mean :0.5515 Mean :0.7606 Mean : 0.0022 Mean : 169.1
3rd Qu.:0.6511 3rd Qu.:0.8604 3rd Qu.: 0.0001 3rd Qu.: 225.0
Max. :0.9931 Max. :1.0000 Max. : 0.3413 Max. :1756.1
rm.mad rm.medbuf rm.pairslope rm.peramp
Min. :0.004 Min. :0.000 Min. :0.000 Min. : 0
1st Qu.:0.105 1st Qu.:1.000 1st Qu.:0.433 1st Qu.: 1
Median :0.200 Median :1.000 Median :0.500 Median : 1
Mean :0.249 Mean :0.977 Mean :0.496 Mean : 16
3rd Qu.:0.326 3rd Qu.:1.000 3rd Qu.:0.533 3rd Qu.: 2
Max. :2.506 Max. :1.000 Max. :1.000 Max. :34671
rm.pdfp rm.skew rm.kurtosis rm.std
Min. : 0.0 Min. :-13.955 Min. : 0.70 Min. :0.014
1st Qu.: 0.6 1st Qu.: -0.665 1st Qu.: 2.13 1st Qu.:0.222
Median : 1.0 Median : -0.002 Median : 3.11 Median :0.375
Mean : 3.7 Mean : -0.180 Mean : 6.04 Mean :0.472
3rd Qu.: 1.6 3rd Qu.: 0.516 3rd Qu.: 5.36 3rd Qu.:0.616
Max. :1735.4 Max. : 11.105 Max. :208.94 Max. :3.429
rm.rcorbor type id
Min. :0.0000 nv :1725 1001072061085: 1
1st Qu.:0.0000 sn : 536 1001072061310: 1
Median :0.0000 cv : 461 1001072061380: 1
Mean :0.0141 downes : 376 1001072061435: 1
3rd Qu.:0.0000 rrlyr : 292 1001072061602: 1
Max. :0.5000 agn : 140 1001072061641: 1
(Other): 190 (Other) :3714
# Draw one plot per measure, split by object type. The last two columns of
# cmdb (`type` and `id`) are not measures and are skipped.
mnames <- names(cmdb)
for (i in seq_len(ncol(cmdb) - 2)) {
  vname <- mnames[i]
  # Name of the transformation registered for this measure in `functran`.
  tranf <- functran[[vname]]
  # Apply the transformation element by element.
  y <- vapply(cmdb[[i]], tranf, numeric(1))
  ylab <- if (tranf == "identity") vname else paste0(tranf, "(", vname, ")")
  # Plot the transformed values so the plot agrees with its axis label
  # (previously the untransformed column was drawn and `y` was unused).
  plot(y ~ as.factor(cmdb$type), ylab = ylab, xlab = "Type")
}
Split the data into a training (2/3) and a test (1/3) sample in the same way as before. The split will not be identical to that used on the Richards measures, because not all of the Richards statistics are computed for the NV group, which results in some objects being discarded from that calculation.
# Reproducible random split: one third of the rows (rounded) form the test
# set, the remaining rows form the training set.
set.seed(123)
n <- nrow(cmdb)
# Draws the same indices as sample(1:n, round(n / 3)).
isel <- sample.int(n, round(n / 3))
# Select the training rows with a logical mask: cmdb[-isel, ] would return
# zero rows instead of all rows if `isel` were empty.
trains <- cmdb[!(seq_len(n) %in% isel), ]
tests <- cmdb[isel, ]
There are 1240 observations in the test set and 2480 observations in the training set.
Save for future use.
# Write the full feature table and both subsets to feat.rda.
save(cmdb,trains,tests,file="feat.rda")