Cross-validation

Comparing cross-validation methods

The simulation below generates data with ten true segments of ten data points each, then compares K-fold cross-validation (10-fold and 2-fold) with repeated random splits (10% and 50% validation sets), giving each method the same budget of 100 validation sets for selecting the number of segments.

library(data.table)
## Simulated data: 10 segments, each with a different mean and 10 data
## points, for 100 data points total.
n.segments <- 10
seg.mean.vec <- 1:n.segments
data.per.segment <- 10
data.mean.vec <- rep(seg.mean.vec, each=data.per.segment)
n.data <- length(data.mean.vec)
## Every split method gets the same budget of 100 validation sets.
n.validation.sets <- 100
n.folds.vec <- c(10, 2)
prop.valid.vec <- 1/n.folds.vec
sim.result.list <- list()
for(data.seed in 1:100){
  set.seed(data.seed)
  data.vec <- rnorm(n.data, data.mean.vec, 0.1)
  is.valid.vec.list <- list()
  ## K-fold CV: each fold serves once as the validation set, and the
  ## fold assignment is re-drawn with a new seed until the budget of
  ## validation sets is used up.
  for(n.folds in n.folds.vec){
    uniq.folds <- 1:n.folds
    n.seeds <- n.validation.sets/n.folds
    split.type <- sprintf("%d-fold %d times", n.folds, n.seeds)
    for(seed in 1:n.seeds){
      set.seed(seed)
      fold.vec <- sample(rep(uniq.folds, length.out=n.data))
      for(valid.fold in uniq.folds){
        is.valid.vec.list[[split.type]][[paste(seed, valid.fold)]] <-
          fold.vec==valid.fold
      }
    }
  }
  ## Random splits: each validation set is an independent random sample
  ## with the given validation proportion.
  for(prop.valid in prop.valid.vec){
    split.type <- sprintf("%d%% %d times", 100*prop.valid, n.validation.sets)
    prop.vec <- c(subtrain=1-prop.valid, validation=prop.valid)
    for(split.i in 1:n.validation.sets){
      set.seed(split.i)
      is.valid.vec.list[[split.type]][[split.i]] <- binsegRcpp::random_set_vec(
        n.data, prop.vec) == "validation"
    }
  }
  ## Fit binary segmentation once per split, recording the validation
  ## loss of each model size (number of segments).
  loss.dt <- CJ(split.i=1:n.validation.sets, type=names(is.valid.vec.list))[, {
    is.valid <- is.valid.vec.list[[type]][[split.i]]
    bs.model <- binsegRcpp::binseg_normal(data.vec, is.validation.vec=is.valid)
    bs.model$splits[, data.table(
      segments,
      validation.loss)]
  }, by=.(split.i, type)]
  ## Rule "mean err, min err": the model size which minimizes the mean
  ## validation loss over splits.
  loss.stats <- loss.dt[, .(
    mean.valid.loss=mean(validation.loss)
  ), by=.(type, segments)]
  ## For each split, the model size with minimum validation loss.
  select.each.split <- loss.dt[
  , .SD[which.min(validation.loss)],
    by=.(type, split.i)]
  ## Number of splits on which each model size was selected.
  selected.times <- select.each.split[, .(
    times=.N
  ), by=.(type, segments)]
  ## Three model selection rules, applied to each split type.
  selected.segments <- rbind(
    select.each.split[, .(
      selected=min(segments)
    ), by=.(method=paste(type, "min err, min segs"))],
    selected.times[, .(
      selected=segments[which.max(times)]
    ), by=.(method=paste(type, "min err, max times"))],
    loss.stats[, .(
      selected=segments[which.min(mean.valid.loss)]
    ), by=.(method=paste(type, "mean err, min err"))]
  )
  sim.result.list[[data.seed]] <- data.table(
    data.seed, selected.segments, n.segments)
}
sim.result <- do.call(rbind, sim.result.list)
(sim.err <- sim.result[, .(
  zero.one.loss=sum(selected != n.segments),
  L1.loss=sum(abs(selected-n.segments)),
  L2.loss=sum((selected-n.segments)^2)
), by=method][order(zero.one.loss)])
#>                                  method zero.one.loss L1.loss L2.loss
#>                                  <char>         <int>   <num>   <num>
#>  1:    10% 100 times min err, max times             2       4      10
#>  2: 10-fold 10 times min err, max times             2       5      13
#>  3:  2-fold 50 times min err, max times             3       4       6
#>  4:    50% 100 times min err, max times             5       6       8
#>  5:   2-fold 50 times min err, min segs             7       7       7
#>  6:     50% 100 times min err, min segs            27      27      27
#>  7:     50% 100 times mean err, min err            42     128    1166
#>  8:     10% 100 times mean err, min err            45     232    3310
#>  9:  10-fold 10 times mean err, min err            46     198    2128
#> 10:   2-fold 50 times mean err, min err            47     108     898
#> 11:     10% 100 times min err, min segs           100     263     829
#> 12:  10-fold 10 times min err, min segs           100     321    1135
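
As an aside, the random splits above rely on binsegRcpp::random_set_vec, which (as its use above implies) returns one set name per data point, with counts approximately proportional to the requested proportions. A minimal sketch of a single split:

set.seed(1)
## One random split of 10 data points into 90% subtrain, 10% validation;
## table() counts how many data points landed in each set.
split.vec <- binsegRcpp::random_set_vec(10, c(subtrain=0.9, validation=0.1))
table(split.vec)
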
plot(data.vec)

Figure: scatterplot of data.vec, the simulated data from the last seed, showing the ten segment means.

The code above compares several types of cross-validation for selecting the number of segments in simulated random normal data with ten true segments. The table above shows three error rates (zero-one, L1, and L2) which compare the selected number of segments to the true number of segments, summed over 100 simulated data sets. The best methods appear to be the ones which use the min err, max times rule: for each split, select the number of segments with minimum validation error, then keep the number of segments selected most often across splits.
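
To make the three selection rules concrete, here is a minimal sketch on a small hypothetical loss table (the toy.loss values below are invented, and chosen so that the rules can disagree):

library(data.table)
## Hypothetical validation losses: 3 splits x 3 model sizes.
toy.loss <- data.table(
  split.i=rep(1:3, each=3),
  segments=rep(1:3, times=3),
  validation.loss=c(5, 1, 2.5, 1, 3, 2, 6, 2, 3))
## For each split, the model size with minimum validation loss.
best.per.split <- toy.loss[, .SD[which.min(validation.loss)], by=split.i]
rbind(
  best.per.split[, .(
    rule="min err, min segs", selected=min(segments))],
  best.per.split[, .N, by=segments][, .(
    rule="min err, max times", selected=segments[which.max(N)])],
  toy.loss[, .(mean.loss=mean(validation.loss)), by=segments][, .(
    rule="mean err, min err", selected=segments[which.min(mean.loss)])])
#>                   rule selected
#>                 <char>    <int>
#> 1:  min err, min segs        1
#> 2: min err, max times        2
#> 3:  mean err, min err        2

Here min err, min segs picks 1 segment because a single split happens to have its minimum there, while the two other rules aggregate over splits before deciding, and both pick 2.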

How many times is sufficient?

The simulation below uses harder data (20 true segments with only 5 data points each) and varies the number of random splits from 1 to 200, for validation proportions between 1% and 50%.

n.segments <- 20
seg.mean.vec <- 1:n.segments
data.per.segment <- 5
data.mean.vec <- rep(seg.mean.vec, each=data.per.segment)
n.data <- length(data.mean.vec)
n.validation.sets <- 200
prop.valid <- c(0.01, 0.05, 0.1, 0.25, 0.5)
sim.result <- data.table(data.seed=1:100)[, {
  set.seed(data.seed)
  data.vec <- rnorm(n.data, data.mean.vec, 0.1)
  ## For each random split and validation proportion, the model size
  ## with minimum validation loss.
  select.each.split <- CJ(split.i=1:n.validation.sets, prop.valid)[, {
    set.seed(split.i)
    prop.sets <- c(subtrain=1-prop.valid, validation=prop.valid)
    is.valid <- binsegRcpp::random_set_vec(
      n.data, prop.sets)=="validation"
    bs.model <- binsegRcpp::binseg_normal(
      data.vec, is.validation.vec=is.valid)
    bs.model$splits[, .(selected=segments[which.min(validation.loss)])]
  }, by=.(split.i, prop.valid)]
  ## For each possible number of splits, the model size selected most
  ## often among the first n.splits splits.
  data.table(n.splits=1:n.validation.sets)[, {
    select.each.split[split.i <= n.splits, .(
      times=.N
    ),
    by=.(prop.valid, selected)
    ][, .SD[which.max(times), .(selected)], by=prop.valid]
  }, by=n.splits]
}, by=data.seed]

if(require(ggplot2)){
  ggplot()+
    scale_color_gradient(low="red", high="black")+
    geom_line(aes(
      n.splits, selected,
      group=paste(data.seed, prop.valid),
      color=prop.valid),
      data=sim.result)+
    scale_y_continuous(breaks=seq(0, 100, by=10))
}
#> Loading required package: ggplot2

Figure: selected number of segments versus number of splits, one line per simulated data set and validation proportion.


accuracy.dt <- sim.result[, .(
  correct=sum(selected==n.segments)
), by=.(prop.valid, n.splits)]
if(require(ggplot2)){
  gg <- ggplot()+
    geom_line(aes(
      n.splits, correct,
      group=prop.valid,
      color=prop.valid),
      linewidth=2,
      data=accuracy.dt)+
    scale_y_continuous("number of correctly chosen data sets")+
    scale_color_gradient(low="red", high="black")
  if(require(directlabels)){
    direct.label(gg, "right.polygons")
  }else{
    gg
  }
}
#> Loading required package: directlabels

Figure: number of correctly chosen data sets versus number of splits, one labeled curve per validation proportion.

The plot above suggests that 100 validation sets is sufficient: for each validation proportion, the number of correctly chosen data sets stops increasing by about 100 splits, so additional splits bring little improvement.
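
To check that claim quantitatively, one possible sketch (reusing the accuracy.dt table computed above) reports, for each validation proportion, the smallest number of splits that attains the maximum accuracy; the printed values depend on the simulated data, so no output is shown here.

## Smallest number of splits whose accuracy equals the best accuracy
## attained at any number of splits, per validation proportion.
accuracy.dt[, .(
  min.splits.at.max=min(n.splits[correct==max(correct)])
), by=prop.valid]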