# Tour of the Julia language followed by statistics, data-frame, plotting and
# machine-learning package demos.
#
# The script is written in the pre-1.0 Julia dialect (`type` blocks,
# `Union(...)`, `int`, `array`, `parse(s)`); that dialect is kept on purpose.
# A trailing `;` after the last statement of a section presumably suppresses
# the echoed value when the code is run cell-by-cell in a notebook or REPL.

# ---------------------------------------------------------------------------
# Type hierarchy and numeric types
# ---------------------------------------------------------------------------
subtypes(Number)
subtypes(Real)
subtypes(Integer)

# Floating Point
@show 5/3
# Mathematical Constant
@show pi
# Rational
@show 2//3 + 1
# BigInt
@show big(2) ^ 1000;

# ---------------------------------------------------------------------------
# Strings
# ---------------------------------------------------------------------------
subtypes(String)
s = "Hello World"
@show typeof(s)
@show s[7];

# Unicode Names and Values
你好 = "(｡◕_◕｡)ﾉ "
@show typeof(你好)
@show 你好 ^ 3;

# ---------------------------------------------------------------------------
# User-defined composite type
# ---------------------------------------------------------------------------
# Record with an integer field `i` and a string field `s`.
type NewType
    i::Integer
    s::String
end

new_t = NewType(33, "this is a NewType")
@show new_t.i
@show new_t.s;

# ---------------------------------------------------------------------------
# Vectors and matrices
# ---------------------------------------------------------------------------
# Vectors
v = [1, 1]

# Vector Operations
@show v + [2, 0]  # vector addition
@show v + 1       # same as v + [1,1]
@show 5*v         # scalar multiplication
println( "Dot Product : ", dot(v, v) )
println( "Norm : ", norm(v) )

# Matrices
M = [1 1 ; 0 1]

# Matrix Addition
M + 1 , M + [0 0 ; 5 5]

# Matrix Multiplication
2M , M ^ 2 , M * v

# Gaussian Elimination
b = M * v
M \ b  # solve back for v

# ---------------------------------------------------------------------------
# Functions
# ---------------------------------------------------------------------------
# Named functions
f(x) = 10x
function g(x)
    return x * 10
end
@show f(5)
@show g(5);

# Anonymous functions assigned to variables
h = x -> x * 10
i = function(x) x * 10 end
@show h(5)
@show i(5);

# Operators are functions
+(4,5)
p = +
p(2,3)

# ---------------------------------------------------------------------------
# Multiple dispatch
# ---------------------------------------------------------------------------
# `bar` has one method per argument type: strings are echoed, integers are
# multiplied by ten, and a NewType prints its `s` field.
bar(x::String) = println("You entered the string: $x")
bar(x::Integer) = x * 10
bar(x::NewType) = println(x.s)
methods(bar)
bar("Hello")
bar(new_t)
bar(5)

# Adding strings. The String method of `+` is only defined further down, so
# this call is expected to fail here; report the MethodError instead of letting
# it abort the rest of the script. Anything else is rethrown.
try
    "Hello" + "World"
catch err
    isa(err, MethodError) || rethrow(err)
    println("\"Hello\" + \"World\" is not defined yet: ", err)
end

# But the addition operator is a function, so we can apply multi-dispatch.
# NOTE(review): this extends an operator on types this script does not own
# (type piracy); fine for a demo, not something to do in library code.
+(a::String, b::String) = a * b
"Hello" + "World"

+(a::Number, b::String) = string(a) + b
+(a::String, b::Number) = a + string(b)
99 + "bottles"

# Method Overloading
# Object-style wrapper: `set` holds a closure with one method per accepted
# value type; each method announces itself and stores the value in `data`.
type SimpleObject
    data::Union(Integer, String)
    set::Function

    function SimpleObject()
        this = new()
        this.data = ""

        function setter(x::Integer)
            println("Setting an integer")
            this.data = x
        end

        function setter(x::String)
            println("Setting a string")
            this.data = x
        end

        this.set = setter
        return this
    end
end

obj = SimpleObject()
obj.set(99)
obj.set("hello")

# ---------------------------------------------------------------------------
# Map / filter / reduce
# ---------------------------------------------------------------------------
# Sum of odd integers between 1 and 5
values = 1:5
myMapper = x -> x
myFilter = x -> x % 2 == 1
myReducer = (x,y) -> x + y
mapped = map( myMapper, values )
filtered = filter( myFilter, mapped )
reduced = reduce( myReducer, filtered )

# ---------------------------------------------------------------------------
# Code generation and macros
# ---------------------------------------------------------------------------
# Code Generation
# Functions for exponentiating to the powers of 1 to 5.
# The definition text is built with string interpolation (`$n`), printed,
# parsed into an expression and evaluated, defining power1 ... power5.
for n in 1:5
    s = "power$n(x) = x ^ $n"
    println(s)
    expression = parse(s)
    eval(expression)
end
power5( 2 )

# Macros: Crude Timer Example
# Evaluates `expression`, prints the elapsed wall-clock time and yields the
# expression's value.
macro timeit(expression)
    quote
        t = time()
        # `esc` makes the user's expression resolve names in the caller's scope.
        result = $(esc(expression))  # evaluation
        elapsed = time() - t
        println( "elapsed time: ", elapsed )
        # Last expression is the value of the macro call; a `return` here would
        # return from whatever function the macro is used in.
        result
    end
end
@timeit cos(2pi)
@timeit cos(2pi)

# ---------------------------------------------------------------------------
# Descriptive statistics and distributions
# ---------------------------------------------------------------------------
using StatsBase
x = rand(100)  # uniform distribution [0,1)
println( "mean: ", mean(x) )
println( "variance: ", var(x) )
println( "skewness: ", skewness(x) )
println( "kurtosis: ", kurtosis(x) )
describe(x)

using Distributions
distr = Normal(0, 2)
println( "pdf @ origin = ", pdf(distr, 0.0) )
println( "cdf @ origin = ", cdf(distr, 0.0) )
x = rand(distr, 1000)
fit_mle(Normal, x)

# ---------------------------------------------------------------------------
# Data frames
# ---------------------------------------------------------------------------
using DataFrames
df = DataFrame(
    A = [6, 3, 4],
    B = ["a", "b", "c"],
    C = [1//2, 3//4, 5//6],
    D = [true, true, false]
)
df[:C][2] = NA
df

# Joins
names = DataFrame(ID = [5, 4], Name = ["Jack", "Jill"])
jobs = DataFrame(ID = [5, 4], Job = ["Lawyer", "Doctor"])
full = join(names, jobs, on = :ID)

using RDatasets
iris = dataset("datasets", "iris")
head(iris)

# Group by Species, then compute mean of PetalLength per group
by( iris, :Species, df -> mean(df[:PetalLength]) )

# ---------------------------------------------------------------------------
# Plotting
# ---------------------------------------------------------------------------
using ASCIIPlots
x = iris[:PetalLength]
y = iris[:PetalWidth]
scatterplot(x, y)

using Winston
scatter(x, y, ".")
xlabel("PetalLength")
ylabel("PetalWidth")

using Gadfly
set_default_plot_size(20cm, 12cm)
# NOTE(review): Winston is loaded as well and presumably exports its own
# `plot`; the Gadfly calls are module-qualified so the intended one is used.
Gadfly.plot(iris, x = "PetalLength", y = "PetalWidth", color = "Species", Geom.point)

# ---------------------------------------------------------------------------
# Unsupervised learning
# ---------------------------------------------------------------------------
# K-means Clustering
using Clustering
features = array(iris[:, 1:4])'  # use matrix() on Julia v0.2
result = kmeans( features, 3 )   # onto 3 clusters
Gadfly.plot(iris, x = "PetalLength", y = "PetalWidth",
            color = result.assignments, Geom.point)

# Principal Component Analysis
using MultivariateStats
pc = fit(PCA, features; maxoutdim = 2)
reduced = transform(pc, features)
@show size(reduced)
# `vec` flattens each row slice so the plot receives plain 1-D vectors
# (a row slice may come back as a 1xN matrix in this Julia dialect).
Gadfly.plot(iris, x = vec(reduced[1,:]), y = vec(reduced[2,:]),
            color = "Species", Geom.point)

# ---------------------------------------------------------------------------
# Regression
# ---------------------------------------------------------------------------
using MultivariateStats

# Generate a noisy linear system
features = rand(1000, 3)                         # feature matrix
coeffs = rand(3)                                 # ground truth of weights
targets = features * coeffs + 0.1 * randn(1000)  # generate response

# Linear Least Square Regression
coeffs_llsq = llsq(features, targets; bias=false)

# Ridge Regression
coeffs_ridge = ridge(features, targets, 0.1; bias=false)  # regularization coef = 0.1

@show coeffs
@show coeffs_llsq
@show coeffs_ridge;

# ---------------------------------------------------------------------------
# Cross Validation: K-Fold Example
# ---------------------------------------------------------------------------
using MLBase, MultivariateStats
n = length(targets)

# Define training and error evaluation functions

# Fit ridge coefficients (regularization 0.1, no bias term) on the rows of the
# global `features` / `targets` selected by `inds`.
function training(inds)
    return ridge(features[inds, :], targets[inds], 0.1; bias=false)
end

# Root-mean-square error of `coeffs` on the rows selected by `inds`.
function error_evaluation(coeffs, inds)
    y = features[inds, :] * coeffs
    return sqrt(mean(abs2(targets[inds] .- y)))
end

# Cross validate. The two functions are passed directly; wrapping them in
# forwarding lambdas adds nothing.
scores = cross_validate(
    training,
    error_evaluation,
    n,            # total number of samples
    Kfold(n, 3))  # cross validation plan: 3-fold

# Get the mean and std of scores
@show scores
@show mean_and_std(scores);

# ---------------------------------------------------------------------------
# Model Tuning: Grid Search
# ---------------------------------------------------------------------------
using MLBase, MultivariateStats

# Hold out 20% of records for testing.
# `train_rows` is a shuffled boolean mask with `n_test` false entries.
n_test = int(length(targets) * 0.2)
train_rows = shuffle([1:length(targets)] .> n_test)
features_train, features_test = features[train_rows, :], features[!train_rows, :]
targets_train, targets_test = targets[train_rows], targets[!train_rows]

# Define estimation function.
# Returns a (weights, intercept) pair; when `bias` is true the last ridge
# coefficient is split off as the intercept, otherwise the intercept is 0.0.
function estfun(regcoef, bias)
    coeffs = ridge(features_train, targets_train, regcoef; bias=bias)
    return bias ? (coeffs[1:end-1], coeffs[end]) : (coeffs, 0.0)
end

# Define error evaluation function as mean squared deviation
evalfun(coeffs) = msd(features_test * coeffs[1] + coeffs[2], targets_test)

result = gridtune(estfun, evalfun,
                  ("regcoef", [0.01, 0.1, 1.0]),
                  ("bias", [true, false]);
                  ord=Reverse,    # smaller msd value indicates better model
                  verbose=true)   # show progress information

best_model, best_config, best_score = result

# Print results
coeffs, bias = best_model
println("Best model:")
println(" coeffs = $(coeffs')")
println(" bias = $bias")
println("Best config: regcoef = $(best_config[1]), bias = $(best_config[2])")
println("Best score: $(best_score)")

# ---------------------------------------------------------------------------
# Regression Tree
# ---------------------------------------------------------------------------
using DecisionTree

# Train model, make predictions on test records
model = build_tree(targets_train, features_train)
predictions = apply_tree(model, features_test)
@show cor(targets_test, predictions)
@show R2(targets_test, predictions)
scatter(targets_test, predictions, ".")
xlabel("actual"); ylabel("predicted")

# ---------------------------------------------------------------------------
# Support Vector Machine
# ---------------------------------------------------------------------------
using LIBSVM
features = array(iris[:, 1:4])
labels = array(iris[:Species])

# Hold out 20% of records for testing
n_test = int(length(labels) * 0.2)
train_rows = shuffle([1:length(labels)] .> n_test)
features_train, features_test = features[train_rows, :], features[!train_rows, :]
labels_train, labels_test = labels[train_rows], labels[!train_rows]

model = svmtrain(labels_train, features_train')
(predictions, decision_values) = svmpredict(model, features_test')
confusion_matrix(labels_test, predictions)

# ---------------------------------------------------------------------------
# Random Forest
# ---------------------------------------------------------------------------
using DecisionTree

# Train forest using 2 random features per split and 10 trees
model = build_forest(labels_train, features_train, 2, 10)
predictions = apply_forest(model, features_test)

# Pretty print of one tree in forest
print_tree(model.trees[1])
confusion_matrix(labels_test, predictions)