Basic Statistics in Julia

In [ ]:
# Load the Stats package.
using Stats
In [ ]:
# Seed the random number generator so the draws below are repeatable.
# NOTE(review): `srand` is the spelling of older Julia releases — confirm the target version.
srand(1)
In [ ]:
# Draw 100 random numbers; `x` is the sample summarized by the following cells.
x = rand(100)
In [ ]:
# Smallest value of the sample.
# NOTE(review): relies on `min` accepting a whole array; newer Julia releases
# spell this `minimum(x)` — confirm the target version.
min(x)
In [ ]:
# Median of the sample.
median(x)
In [ ]:
# Largest value of the sample (same caveat as `min` above; newer spelling is `maximum(x)`).
max(x)
In [ ]:
# The 0%, 50% and 100% quantiles in a single call; these correspond to the
# minimum, median and maximum computed one at a time above.
quantile(x, [0.0, 0.5, 1.0])
In [ ]:
# Print a summary of `x`.
describe(x)

Probability Distributions in Julia

In [ ]:
# Load the Distributions package; the distribution types used from here on
# (Gamma, Normal, Categorical, ...) are not defined anywhere in this notebook.
using Distributions
In [ ]:
# Rebind `x` to 100 random draws from a Gamma(1, 2) distribution.
x = rand(Gamma(1, 2), 100)

Standard R Functions with Simpler Names

In [ ]:
# Distribution object reused by the cells below: a Normal built with arguments 0 and 1.
d = Normal(0, 1)
In [ ]:
# Density of `d` evaluated at 0.
pdf(d, 0.0)
In [ ]:
# Cumulative distribution function of `d` evaluated at 0.
cdf(d, 0.0)
In [ ]:
# Quantile of `d` at probability 0.1.
quantile(d, 0.1)
In [ ]:
# One random draw from `d`.
rand(d)
In [ ]:
# One draw from a categorical distribution built from the probabilities 0.1 and 0.9.
rand(Categorical([0.1, 0.9]))
In [ ]:
# The same kind of draw, but made through the object returned by `sampler`.
rand(sampler(Categorical([0.5, 0.5])))
In [ ]:
# Display the distribution object itself ...
Categorical([0.5, 0.5])
In [ ]:
# ... and the sampler object built from it, for comparison.
sampler(Categorical([0.5, 0.5]))

Additional Abstractions around PDFs, CDFs, etc.

In [ ]:
# Quantiles of `d` at several probabilities in one call.
quantile(d, [0.25, 0.75])
In [ ]:
# Negative log-likelihood of 100,000 fresh draws from `d`, divided by the
# number of draws: a per-observation average that can be compared with
# `entropy(d)` in the next section.
-loglikelihood(d, rand(d, 100_000)) / 100_000

Theoretical Properties of Distributions

In [ ]:
# Each cell below queries one theoretical property of the distribution object
# `d`; no sample data is involved.
# Entropy of `d`.
entropy(d)
In [ ]:
# Mean of `d`.
mean(d)
In [ ]:
# Skewness of `d`.
skewness(d)
In [ ]:
# Kurtosis of `d`.
kurtosis(d)
In [ ]:
# Variance of `d`.
var(d)
In [ ]:
# Mode(s) of `d`.
modes(d)

Fit Distributions to Data

In [ ]:
# Rebind `x` to 1,000 draws from `d`; this is the data the fit below is run on.
x = rand(d, 1_000)
In [ ]:
# Fit a Normal distribution to the sample (`fit_mle`: maximum-likelihood fit).
fit_mle(Normal, x)
In [ ]:
# Theoretical (mean, std) of `d` next to the sample (mean, std) of `x`.
(mean(d), std(d)), (mean(x), std(x))
In [ ]:
# List every method of `mean`; the previous cell called it on both a
# distribution and an array.
methods(mean)

Bayesian Updating with Conjugate Priors

In [ ]:
# Rebind `x` to 10,000 draws from a Bernoulli distribution built with parameter 0.9.
x = rand(Bernoulli(0.9), 10_000)
In [ ]:
# Combine a Beta(3, 3) prior with the Bernoulli observations in `x`.
posterior(Beta(3, 3), Bernoulli, x)

Kernel Density Estimation

In [ ]:
# Load the Gadfly plotting package used for the plot at the end of this section.
using Gadfly
In [ ]:
# Rebind `x` to 100,000 draws from a Gamma(3, 3) distribution.
x = rand(Gamma(3, 3), 100_000)
In [ ]:
# Kernel density estimate of the sample; the result is kept in `k`.
k = kde(x)
In [ ]:
# Inspect the names associated with the KDE result type — presumably to show
# the `x` and `density` fields read by the plot below (confirm against the
# installed Distributions version).
names(Distributions.UnivariateKDE)
In [ ]:
# Set the default plot size to 25cm by 15cm.
set_default_plot_size(25cm, 15cm)
In [ ]:
# Line plot of the estimated density (`k.density`) against `k.x`, with axis labels.
plot(x = k.x, y = k.density,
     Guide.XLabel("x"), Guide.YLabel("Estimated Density"),
     Geom.line)

Tabular Data and Missing Values in Julia

Representing Missing Values

In [ ]:
# Load the DataFrames package.
using DataFrames
In [ ]:
# Arithmetic involving the missing-value marker `NA`.
NA + 1
In [ ]:
# Wrap an ordinary integer array in a DataArray so that entries can be set to NA.
x = DataArray([1, 2, 3])
In [ ]:
# Brace literal mixing integers and NA.
# NOTE(review): `{...}` array literals are syntax of older Julia releases only —
# confirm the target version.
{1, 2, NA}
In [ ]:
# Mark the first entry of `x` as missing.
x[1] = NA
In [ ]:
# Mean of a vector that now contains an NA.
mean(x)
In [ ]:
# Keep only the entries of `x` that are not NA.
x[!isna(x)]
In [ ]:
# Mean over the non-missing entries only.
mean(x[!isna(x)])

Factor-Like Variables

In [ ]:
# Build a pooled array from values that contain a repeat (1 appears twice).
y = PooledDataArray([1, 1, 2, 3])
In [ ]:
# Levels of the pooled array.
levels(y)

Representing Tabular Data

In [ ]:
# Ten-row table: column A is 1:10 converted to floating point, column B is
# ten random numbers. `df` is reused by later sections of the notebook.
df = DataFrame(A = float(1:10), B = rand(10))
In [ ]:
# First rows of the table.
head(df)
In [ ]:
# Last rows of the table.
tail(df)
In [ ]:
# Add a string column C built by repeating each of "G1" and "G2" five times.
df["C"] = repeat(["G1", "G2"], inner = [5])
In [ ]:
# Pool column C in place (see the pooled-array section above).
pool!(df, ["C"])
In [ ]:
# Display column C after pooling.
df["C"]
In [ ]:
# Levels of the pooled column.
levels(df["C"])
In [ ]:
# Aside: `repeat` applied to a 2x2 matrix with per-dimension `inner` and `outer` counts.
repeat([1 2; 3 4], inner = [2, 1], outer = [1, 2],)
In [ ]:
# A DataArray holding a complex number ...
z = DataArray([1 + 2im])
In [ ]:
# ... whose entry is then set to NA.
z[1] = NA
In [ ]:
# A DataFrame column whose elements are themselves DataFrames.
DataFrame(A = [DataFrame(B = 1:2), DataFrame(C = 3:4)])
In [ ]:
# Index rows 1 through 10 and all columns.
df[1:10, :]
In [ ]:
# Group by column C and take the mean of column B within each group.
by(df, "C", df -> mean(df["B"]))
In [ ]:
# The next three cells express the condition C == "G1" in three forms:
# `select` with a quoted expression, row indexing with a quoted expression,
# and the plain elementwise comparison on the column.
select(:(C .== "G1"), df)
In [ ]:
df[:(C .== "G1"), :]
In [ ]:
df["C"] .== "G1"
In [ ]:
# Evaluate the quoted expression A + B against `df` (presumably with A and B
# resolved to the columns of that name — confirm against the DataFrames version in use).
with(df, :(A + B))

Accessing Classical Datasets

In [ ]:
# Load the RDatasets package.
using RDatasets
In [ ]:
# Load the "iris" dataset from the "datasets" collection into `iris`.
iris = data("datasets", "iris")
In [ ]:
# First rows of the dataset.
head(iris)
In [ ]:
# Point plot of Petal.Width against Petal.Length, colored by Species.
# NOTE(review): `plot` and `Geom` are not defined in this cell; they rely on
# Gadfly having been loaded in the kernel density section.
plot(iris,
     x = "Petal.Length", y = "Petal.Width", color = "Species",
     Geom.point)

Converting DataFrames to Design Matrices

In [ ]:
# Build a ModelFrame from the formula A ~ B and the table `df`, then turn it
# into a ModelMatrix (the design matrix named in the section heading).
ModelMatrix(ModelFrame(:(A ~ B), df))

DataFrame I/O

In [ ]:
# Write `df` to the file "df.csv" in the current working directory.
writetable("df.csv", df)
In [ ]:
# Display the in-memory table for comparison with what is read back below.
df
In [ ]:
# Read the file back into a new table `df2`.
df2 = readtable("df.csv")

Merging Data Sets

In [ ]:
# Two small tables sharing the key column Z. The key "C" occurs only in `A`,
# and the key "B" occurs twice in `B`, so the join kinds below can be told apart.
A = DataFrame(X = 1:3, Z = ["A", "B", "C"])
In [ ]:
B = DataFrame(Y = 4:6, Z = ["A", "B", "B"])
In [ ]:
# Join on Z without naming a kind.
join(A, B, on = "Z")
In [ ]:
# Join on Z, kind :inner.
join(A, B, on = "Z", kind = :inner)
In [ ]:
# Join on Z, kind :left.
join(A, B, on = "Z", kind = :left)
In [ ]:
# Join on Z, kind :right.
join(A, B, on = "Z", kind = :right)
In [ ]:
# Join on Z, kind :outer.
join(A, B, on = "Z", kind = :outer)

Split-Apply-Combine Operations

In [ ]:
# Group `iris` by Species and apply `nrow` to each group.
by(iris, "Species", nrow)
In [ ]:
# Group by Species and take the mean of Petal.Length within each group.
by(iris, "Species", df -> mean(df["Petal.Length"]))
In [ ]:
# Same grouping, with the per-group computation given as a quoted expression;
# `_DF` presumably stands for the group's sub-table — confirm against the
# DataFrames version in use.
by(iris, "Species", :(N = size(_DF, 1)))

GLMs in Julia

In [ ]:
# Load the GLM package.
using GLM
In [ ]:
# Fit a generalized linear model of B on A from `df`, using a Binomial family.
glm(:(B ~ A), df, Binomial())
In [ ]:
# Fit a generalized linear model of A on B from `df`, using a Poisson family.
glm(:(A ~ B), df, Poisson())

Optimization in Julia

In [ ]:
using Optim
In [ ]:
# Objective function for the optimization examples: a sum of two even powers,
# so its value is never negative and is exactly 0 at x = [10.73, 1134.29].
# Accepts any one-dimensional array (plain vectors, views, ranges, ...) rather
# than only `Vector`; only x[1] and x[2] are read.
f(x::AbstractVector) = (10.73 - x[1])^2 + (1134.29 - x[2])^4
In [ ]:
# Value of the objective at the starting point used below.
f([0.0, 0.0])
In [ ]:
# Minimize `f` starting from [0.0, 0.0], without naming a method.
optimize(f, [0.0, 0.0])
In [ ]:
# Same problem and starting point, explicitly selecting the :l_bfgs method.
optimize(f, [0.0, 0.0], method = :l_bfgs)

Maximum Likelihood Estimation in Julia

In [ ]:
# Rebind `x` to 1,000 draws from Normal(11, 3); this is the data whose mean
# parameter is estimated in the rest of this section.
x = rand(Normal(11, 3), 1_000)
In [ ]:
# Build a negative log-likelihood function for the data `x`.
# The returned closure takes a parameter vector, passes params[1] as the first
# argument of Normal (the second is fixed at 3), and returns minus the
# log-likelihood of `x` under that distribution.
function makenll(x)
    function nll(params::Vector)
        return -loglikelihood(Normal(params[1], 3), x)
    end
    return nll
end
In [ ]:
# Bind the closure over the sample `x` drawn at the top of this section.
nll = makenll(x)
In [ ]:
# Negative log-likelihood with the first parameter at 0, far from the 11 used
# to generate the data ...
nll([0.0])
In [ ]:
# ... and at 10, close to it.
nll([10.0])
In [ ]:
# Minimize the negative log-likelihood over its single parameter, starting from 0.
optimize(nll, [0.0])
In [ ]:
# Sample mean of the data, for comparison with the optimizer's result.
mean(x)

More resources:

  • NLopt
  • JuMP

ML Algorithms

In [ ]:
# RDatasets and `iris` were already loaded in the "Accessing Classical
# Datasets" section; they are repeated here so this section can run on its own.
using RDatasets
In [ ]:
iris = data("datasets", "iris")
In [ ]:
# Load the Clustering package.
using Clustering
In [ ]:
# Take columns 2 through 5 of `iris` (presumably the four measurement columns —
# confirm the column layout), convert them to a plain matrix, transpose it,
# and run k-means with 3 clusters.
kmeans(matrix(iris[:, 2:5])', 3)
In [ ]:
# Per-species means of columns 2 through 5, returned as one small DataFrame
# (columns A-D) per group.
by(iris, "Species", df -> DataFrame(A = mean(df[2]),
                                    B = mean(df[3]),
                                    C = mean(df[4]),
                                    D = mean(df[5])))