using DataFrames, Random;

# Small demo table. Column C holds random normal draws, so its values differ between runs.
df = DataFrame(A = 1:4, B = ["M", "F", "F", "M"], C = randn(4))

# A matrix literal mixing integers and strings.
mat = [1 "M"; 2 "M"]

# Three equivalent ways to read column A. `df[!, :A]` is used instead of the bare
# single-index form `df[:A]`, which DataFrames deprecated and later removed.
df[!, :A] == df[:, :A] == df.A

# Ten-row demo table with an integer, a float, a categorical and a string column.
df = DataFrame(
    Int = collect(1:1:10),
    Float = collect(1.0:1.0:10.0),
    Categorical = CategoricalArray(["A", "B", "C", "A", "B", "C", "A", "B", "C", "D"]),
    String = ["A", "B", "C", "A", "B", "C", "A", "B", "C", "D"],
)
# Summarize every column of the table.
describe(df)

using RDatasets
# Load the "iris" table from the "datasets" collection of RDatasets.
iris = RDatasets.dataset("datasets", "iris")
# Summarize with selected statistics plus a custom column named `sum`.
describe(iris, :std, :q25, :q75, :sum => sum)

# Select the columns whose names match "Sepal", then apply `first` to the result.
first(iris[:, r"Sepal"])

# Score table; the Japanese column contains one missing entry.
A = DataFrame(
    ID = [1, 2, 3, 4, 5, 8, 9, 10],
    Japanese = [10, missing, 30, 40, 50, 80, 90, 100],
    Math = [30, 40, 20, 50, 10, 60, 90, 60],
)
# Background table keyed by the same ID column; only some IDs overlap with A.
B = DataFrame(
    ID = [7, 6, 5, 4, 3, 2, 1],
    Books = [20, 1, 10, 5, 7, 3, missing],
    Parents = CategoricalArray(["大卒", "大卒", "高卒", missing, "大卒", "高卒", "高卒"]),
)
# Inner join on ID.
join(A, B; on = :ID, kind = :inner)

# Same Books data as B, but the key column is named Dokusho instead of ID.
C = DataFrame(Dokusho = [7, 6, 5, 4, 3, 2, 1], Books = [20, 1, 10, 5, 7, 3, missing])
# Join on differently named key columns using a `left => right` pair.
join(A, C; on = :ID => :Dokusho)

# Join using A as the base table (left join).
join(A, B; on = :ID, kind = :left)

# Join using B as the base table (right join), sorted in place.
sort!(join(A, B; on = :ID, kind = :right))

# Outer join, sorted in place.
sort!(join(A, B; on = :ID, kind = :outer))

# Semi join of A against B on ID.
join(A, B; on = :ID, kind = :semi)

# Anti join of A against B on ID.
join(A, B; on = :ID, kind = :anti)

# Outer-join the two tables, then reshape the Japanese and Math columns into long format:
# one Subject column naming the source column and one Score column holding its value.
D = join(A, B; on = :ID, kind = :outer)
longD = DataFrames.stack(
    D, [:Japanese, :Math]; variable_name = :Subject, value_name = :Score
);
first(longD, 5)

using Gadfly
# Histogram of Score in a subplot grid, one panel per Subject (result suppressed by `;`).
plot(longD, Geom.subplot_grid(Geom.histogram); xgroup = "Subject", x = "Score");

# `melt` is given the columns to keep as identifiers (ID, Books, Parents);
# show the first five rows of the result.
first(melt(D, [:ID, :Books, :Parents]; variable_name = :Subject, value_name = :Score), 5)

# Reshape the long table back, spreading Subject into columns filled from Score.
unstack(longD, :Subject, :Score)

using RCall
# Run a tidyverse pipeline on the R side through RCall. Nothing is interpolated from Julia
# (no `$` is used), so `iris` here is whatever object of that name R itself resolves.
# The pipeline keeps rows with 3.2 <= Sepal.Width <= 3.8, groups and nests by Species,
# fits `lm(Sepal.Width ~ Sepal.Length)` per group, calls `plot` on each fit, and finally
# pulls out the `ols` column of fitted models.
R"""
library(tidyverse)
iris %>% 
    filter(Sepal.Width >= 3.2 & Sepal.Width <= 3.8) %>%
    group_by(Species) %>%
    nest %>%
    mutate(ols = map(data, ~lm(data = .x, formula = Sepal.Width ~ Sepal.Length)), 
           plt = map(ols, ~plot(.x))) %>%
    pull(ols)
"""

using DataFramesMeta, Statistics, StatsBase

# Inside `with`, the symbol :SepalLength stands for that column of `iris`; the expression
# selects the rows where it exceeds 6 and pipes the result into `first(5)`.
@linq iris |> with(iris[:SepalLength .> 6., :]) |> first(5)

# Keep rows with 3.0 ≤ SepalWidth ≤ 5.0, group by Species, add the z-scored SepalLength
# as StdSepalLn, then summarize that column per Species.
# The `var` column is computed with `var` (it previously called `std` under the `var`
# label), so the column name matches the statistic, as in the MEAN/VAR summary later on.
@linq iris |>
   where(5.0 .≥ :SepalWidth .≥ 3.0) |>
   groupby(:Species) |>
   transform(StdSepalLn = zscore(:SepalLength)) |>
   by(:Species, mean = mean(:StdSepalLn), var = var(:StdSepalLn))

# Keep rows with SepalLength ≤ 7.0, group by Species and order by SepalLength.
OrderedIris = @linq iris |>
    where(7.0 .≥ :SepalLength) |>
    groupby(:Species) |>
    orderby(:SepalLength)
# Take the first three rows of each element of the result.
map(OrderedIris) do grp
    first(grp, 3)
end

# Per-Species summary: mean and variance of SepalLength.
@linq iris |> groupby(:Species) |> based_on(MEAN = mean(:SepalLength), VAR = var(:SepalLength))

# Combine a per-group scalar (MEAN) with an element-wise expression (DOUBLE); show 5 rows.
@linq iris |> groupby(:Species) |> based_on(MEAN = mean(:SepalLength), DOUBLE = 2*:SepalLength) |> first(5)

# Keep the grouped table and look at the first 3 rows of its first group.
GroupedIris = @linq iris |> groupby(:Species)
@linq GroupedIris[1] |> first(3)

using GLM
# Fit SepalLength ~ SepalWidth separately on each group and collect the coefficients.
res = map(GroupedIris) do grp
    coef(lm(@formula(SepalLength ~ SepalWidth), grp))
end

# Draw a SepalLength-vs-SepalWidth scatter plot for each group and convert the mapped
# result to a DataFrame; the plots are then read back from its `:x1` column.
plt = @linq map(df -> plot(df, x="SepalLength", y="SepalWidth", Geom.point), GroupedIris) |> DataFrame
# Lay the first three plots out side by side (a 1×3 matrix passed to `gridstack`).
gridstack([plt[1,:x1] plt[2,:x1] plt[3,:x1]])