## See Advanced R 17.3 by Wickham for more on the microbenchmark package.
library(microbenchmark)

## Create data sets with sizes that are powers of 2 (2^11 .. 2^15).
ns <- 2^(11:15)
x <- vector("list", length(ns))
for (ii in seq_along(x)) {
  x[[ii]] <- rnorm(ns[ii])
}

## Computes the mean of data set x[[ii]].
MeanTest <- function(ii) {
  mean(x[[ii]])
}

## Time the mean computation at each sample size.
out <- microbenchmark(
  MeanTest(1), MeanTest(2), MeanTest(3), MeanTest(4), MeanTest(5)
)
summary(out)
summary(out)$median

## Plot median execution time against sample size (mean is O(n)).
options(repr.plot.height = 4, repr.plot.width = 5)
plot(ns, summary(out)$median,
     xlab = "n (sample size)", ylab = "Median Execution Time")

## Compare mean() and sd() timing at two sample sizes.
x <- rnorm(1e6)
y <- rnorm(1e7)
out <- microbenchmark(mean(x), mean(y), sd(x), sd(y))
summary(out)

## Compare mean() and median() on the same data.
x <- rnorm(1e6)
out <- microbenchmark(mean(x), median(x))
summary(out)

## Generate data from a 2-dimensional Gaussian mixture model.
library(MixSim)
set.seed(1)
Q <- MixSim(BarOmega = 0.01, K = 2, p = 2)
A <- simdataset(n = 500, Pi = Q$Pi, Mu = Q$Mu, S = Q$S)
plot(A$X)

## Computes Euclidean distance between all pairs of observations.
D <- dist(A$X)
## Finds groups using distances (agglomerative hierarchical clustering).
out <- hclust(D, method = "complete", members = NULL)
## Find two groups, get membership, and color points by group.
group <- cutree(out, k = 2)
plot(A$X, col = group)

## How does this algorithm scale?
## Analyze just the computation of D.
## Create data sets with sizes that are powers of 2 (2^6 .. 2^10).
ns <- 2^(6:10)
X <- vector("list", length(ns))
for (ii in seq_along(X)) {
  X[[ii]] <- simdataset(n = ns[ii], Pi = Q$Pi, Mu = Q$Mu, S = Q$S)$X
}

## Computes the pairwise distance matrix for data set X[[ii]].
DistanceTest <- function(ii) {
  dist(X[[ii]])
}

## Time the distance computation at each sample size (expected O(n^2)).
out <- microbenchmark(
  DistanceTest(1), DistanceTest(2), DistanceTest(3),
  DistanceTest(4), DistanceTest(5),
  times = 20
)
summary(out)
plot(ns, summary(out)$median,
     xlab = "n (sample size)", ylab = "Median Execution Time")