The choice of the smoothing parameter in nonparametric regression is critical to the form of the estimated curve and any inference that follows. Many methods are available that will generate a single choice for this parameter. Here we argue that the considerable uncertainty in this choice should be explicitly represented.
The construction of standard simultaneous confidence bands in nonparametric regression often requires difficult mathematical arguments. We question their practical utility, presenting several deficiencies. We propose a new kind of confidence band that reflects the uncertainty regarding the smoothness of the estimate.
We use the following:
Download these if you want to reproduce the following:
Read in AGN data
agn <- read.table("agn.dat",header=TRUE)Compute default SCBs using lowess:
datarange=2764
agn$mjd <- agn$mjd - 53464
library(ggplot2,quietly=TRUE)
p <- ggplot(agn ,aes(x=mjd, y=mag))
p <- p+xlab("Day")+ ylab("Magnitude") + scale_y_reverse()+xlim(0,datarange)
p <- p+geom_point(size=0.75)
slow <- p + geom_smooth()
slowNow use smoothing splines via mgcv:
sspl <- p + geom_smooth(method="gam", formula = y ~ s(x, bs = "cs"))
ssplSet up implausible curves:
library(mgcv)
amod <- gam(mag ~ s(mjd, bs="cs"), data=agn)
ap <- plot(amod)[[1]]bdf <- data.frame(Day=ap$x, Magnitude=ap$fit+mean(agn$mag))
bdf$ucb <- bdf$Magnitude - 2 * ap$se
bdf$lcb <- bdf$Magnitude + 2 * ap$se
bdf$zig <- bdf$Magnitude + (((1:nrow(bdf)) %% 2)*2-1) * ap$se
bdf$jl <- bdf$ucb+0.1Set up the plots:
p <- ggplot(bdf,aes(x=Day, y=Magnitude))
p <- p+ scale_y_reverse()+xlim(0,datarange)
p <- p+geom_line(aes(y=ucb),color="grey")
p <- p+geom_line(aes(y=lcb),color="grey")
pzig <- p+geom_line(aes(y=zig))
pjl <-  p+geom_line(aes(y=jl))First plot of the second Figure in the paper:
pzigSecond plot of the second Figure in the paper:
pjlLoad in R functions. Set the seed for reproducibility (may turn out differently on other OS).
source("gpr.R")
set.seed(123)Set up the base plot:
p <- ggplot(agn, aes(mjd, mag)) + geom_point() + xlab("Day")+ylab("Magnitude")+scale_y_reverse()+xlim(0,datarange)
xj <- agn$mjd
yj <- agn$magInitialize the full Bayes fit process:
stanmod <- gpinit()Run the chains:
medianresponse <- median(yj)
priorfun <- function(x) rep(medianresponse,length=length(x))
bgp <- gpfit(xj, yj, stanmod, priory=priorfun)Plot the posterior mean at the median values of posteriors on the tuning parameters:
parext <- extract(bgp, pars=c("eta_sq", "rho_sq", "sigma_sq"))
xgrid <- creategrid(xj)
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*median(parext$rho_sq), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf <- data.frame(x=gpbm$grid, y=gpbm$predict)
p+geom_line(data=ldf, aes(x,y))Plot the 95% smoothness bands:
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*quantile(parext$rho_sq,0.025), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf1 <- data.frame(x=gpbm$grid, y=gpbm$predict)
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*quantile(parext$rho_sq,0.975), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf2 <- data.frame(x=gpbm$grid, y=gpbm$predict)
p+geom_line(data=ldf1, aes(x,y),color='red')+geom_line(data=ldf2, aes(x,y),color='blue')Set up and plot the data:
fossil <- read.table("fossil.dat",header=TRUE)
fossil$sr <- (fossil$sr-0.7)*100
p <- ggplot(fossil, aes(age, sr)) + geom_point() + xlab("Age")+ylab("Strontium Ratio")
xj <- fossil$age
yj <- fossil$srRun the chains:
medianresponse <- median(yj)
priorfun <- function(x) rep(medianresponse,length=length(x))
bgp <- gpfit(xj, yj, stanmod, priory=priorfun)Plot the posterior mean at the median values of posteriors on the tuning parameters:
parext <- extract(bgp, pars=c("eta_sq", "rho_sq", "sigma_sq"))
xgrid <- creategrid(xj)
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*median(parext$rho_sq), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf <- data.frame(x=gpbm$grid, y=gpbm$predict)
p+geom_line(data=ldf, aes(x,y))Plot the 95% smoothness bands:
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*quantile(parext$rho_sq,0.025), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf1 <- data.frame(x=gpbm$grid, y=gpbm$predict)
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*quantile(parext$rho_sq,0.975), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf2 <- data.frame(x=gpbm$grid, y=gpbm$predict)
p+geom_line(data=ldf1, aes(x,y),color='red')+geom_line(data=ldf2, aes(x,y),color='blue')