ABSTRACT

The choice of the smoothing parameter in nonparametric regression is critical to the form of the estimated curve and any inference that follows. Many methods are available that will generate a single choice for this parameter. Here we argue that the considerable uncertainty in this choice should be explicitly represented.

The construction of standard simultaneous confidence bands in nonparametric regression often requires difficult mathematical arguments. We question their practical utility, presenting several deficiencies. We propose a new kind of confidence band that reflects the uncertainty regarding the smoothness of the estimate.

Code and Data

We use the following:

Download these if you want to reproduce the following:

Active Galactic Nucleus Data

Read in AGN data

agn <- read.table("agn.dat",header=TRUE)

Compute default SCBs using lowess:

datarange=2764
agn$mjd <- agn$mjd - 53464
library(ggplot2,quietly=TRUE)
p <- ggplot(agn ,aes(x=mjd, y=mag))
p <- p+xlab("Day")+ ylab("Magnitude") + scale_y_reverse()+xlim(0,datarange)
p <- p+geom_point(size=0.75)
slow <- p + geom_smooth()
slow

Now use smoothing splines via mgcv:

sspl <- p + geom_smooth(method="gam", formula = y ~ s(x, bs = "cs"))
sspl

Implausible curves between the bands

Set up implausible curves:

library(mgcv)
amod <- gam(mag ~ s(mjd, bs="cs"), data=agn)
ap <- plot(amod)[[1]]

bdf <- data.frame(Day=ap$x, Magnitude=ap$fit+mean(agn$mag))
bdf$ucb <- bdf$Magnitude - 2 * ap$se
bdf$lcb <- bdf$Magnitude + 2 * ap$se
bdf$zig <- bdf$Magnitude + (((1:nrow(bdf)) %% 2)*2-1) * ap$se
bdf$jl <- bdf$ucb+0.1

Set up the plots:

p <- ggplot(bdf,aes(x=Day, y=Magnitude))
p <- p+ scale_y_reverse()+xlim(0,datarange)
p <- p+geom_line(aes(y=ucb),color="grey")
p <- p+geom_line(aes(y=lcb),color="grey")
pzig <- p+geom_line(aes(y=zig))
pjl <-  p+geom_line(aes(y=jl))

First plot of the second Figure in the paper:

pzig

Second plot of the second Figure in the paper:

pjl

Smoothness bands construction

Load in R functions. Set the seed for reproducibility (may turn out differently on other OS).

source("gpr.R")
set.seed(123)

AGN Data

Set up the base plot:

p <- ggplot(agn, aes(mjd, mag)) + geom_point() + xlab("Day")+ylab("Magnitude")+scale_y_reverse()+xlim(0,datarange)
xj <- agn$mjd
yj <- agn$mag

Initialize the full Bayes fit process:

stanmod <- gpinit()

Run the chains:

medianresponse <- median(yj)
priorfun <- function(x) rep(medianresponse,length=length(x))
bgp <- gpfit(xj, yj, stanmod, priory=priorfun)

Plot the posterior mean at the median values of posteriors on the tuning parameters:

parext <- extract(bgp, pars=c("eta_sq", "rho_sq", "sigma_sq"))
xgrid <- creategrid(xj)
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*median(parext$rho_sq), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf <- data.frame(x=gpbm$grid, y=gpbm$predict)
p+geom_line(data=ldf, aes(x,y))

Plot the 95% smoothness bands:

gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*quantile(parext$rho_sq,0.025), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf1 <- data.frame(x=gpbm$grid, y=gpbm$predict)
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*quantile(parext$rho_sq,0.975), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf2 <- data.frame(x=gpbm$grid, y=gpbm$predict)
p+geom_line(data=ldf1, aes(x,y),color='red')+geom_line(data=ldf2, aes(x,y),color='blue')

Fossil data

Set up and plot the data:

fossil <- read.table("fossil.dat",header=TRUE)
fossil$sr <- (fossil$sr-0.7)*100
p <- ggplot(fossil, aes(age, sr)) + geom_point() + xlab("Age")+ylab("Strontium Ratio")
xj <- fossil$age
yj <- fossil$sr

Run the chains:

medianresponse <- median(yj)
priorfun <- function(x) rep(medianresponse,length=length(x))
bgp <- gpfit(xj, yj, stanmod, priory=priorfun)

Plot the posterior mean at the median values of posteriors on the tuning parameters:

parext <- extract(bgp, pars=c("eta_sq", "rho_sq", "sigma_sq"))
xgrid <- creategrid(xj)
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*median(parext$rho_sq), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf <- data.frame(x=gpbm$grid, y=gpbm$predict)
p+geom_line(data=ldf, aes(x,y))

Plot the 95% smoothness bands:

gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*quantile(parext$rho_sq,0.025), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf1 <- data.frame(x=gpbm$grid, y=gpbm$predict)
gpbm <- gpreg(xj, yj, grid=xgrid, invwid=2*quantile(parext$rho_sq,0.975), noisevar=median(parext$sigma_sq), sigvar=median(parext$eta_sq), priormean=priorfun(xgrid))
ldf2 <- data.frame(x=gpbm$grid, y=gpbm$predict)
p+geom_line(data=ldf1, aes(x,y),color='red')+geom_line(data=ldf2, aes(x,y),color='blue')