https://archive.ics.uci.edu/ml/datasets/wine ←このデータから、赤ワインの評価と白ワインの評価に差があるかを調べる
require 'daru'
require 'rbplotly'
require 'daru/plotly'
require 'statsample'
true
# Load the combined red/white wine-quality CSV into a Daru data frame.
wine = Daru::DataFrame.from_csv('./winequality-both.csv')
wine.head 10 # show the first 10 rows
Daru::DataFrame(10x13) | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
0 | red | 7.4 | 0.7 | 0 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
1 | red | 7.8 | 0.88 | 0 | 2.6 | 0.098 | 25 | 67 | 0.9968 | 3.2 | 0.68 | 9.8 | 5 |
2 | red | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15 | 54 | 0.997 | 3.26 | 0.65 | 9.8 | 5 |
3 | red | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17 | 60 | 0.998 | 3.16 | 0.58 | 9.8 | 6 |
4 | red | 7.4 | 0.7 | 0 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
5 | red | 7.4 | 0.66 | 0 | 1.8 | 0.075 | 13 | 40 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
6 | red | 7.9 | 0.6 | 0.06 | 1.6 | 0.069 | 15 | 59 | 0.9964 | 3.3 | 0.46 | 9.4 | 5 |
7 | red | 7.3 | 0.65 | 0 | 1.2 | 0.065 | 15 | 21 | 0.9946 | 3.39 | 0.47 | 10 | 7 |
8 | red | 7.8 | 0.58 | 0.02 | 2 | 0.073 | 9 | 18 | 0.9968 | 3.36 | 0.57 | 9.5 | 7 |
9 | red | 7.5 | 0.5 | 0.36 | 6.1 | 0.071 | 17 | 102 | 0.9978 | 3.35 | 0.8 | 10.5 | 5 |
# List the distinct values that appear in the 'type' column.
wine['type'].uniq.to_a
["red", "white"]
# Keep only the rows whose 'type' column equals 'red'.
reds = wine.where(wine['type'].eq('red'))
reds.head 10 # show the first 10 red-wine rows
Daru::DataFrame(10x13) | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
0 | red | 7.4 | 0.7 | 0 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
1 | red | 7.8 | 0.88 | 0 | 2.6 | 0.098 | 25 | 67 | 0.9968 | 3.2 | 0.68 | 9.8 | 5 |
2 | red | 7.8 | 0.76 | 0.04 | 2.3 | 0.092 | 15 | 54 | 0.997 | 3.26 | 0.65 | 9.8 | 5 |
3 | red | 11.2 | 0.28 | 0.56 | 1.9 | 0.075 | 17 | 60 | 0.998 | 3.16 | 0.58 | 9.8 | 6 |
4 | red | 7.4 | 0.7 | 0 | 1.9 | 0.076 | 11 | 34 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
5 | red | 7.4 | 0.66 | 0 | 1.8 | 0.075 | 13 | 40 | 0.9978 | 3.51 | 0.56 | 9.4 | 5 |
6 | red | 7.9 | 0.6 | 0.06 | 1.6 | 0.069 | 15 | 59 | 0.9964 | 3.3 | 0.46 | 9.4 | 5 |
7 | red | 7.3 | 0.65 | 0 | 1.2 | 0.065 | 15 | 21 | 0.9946 | 3.39 | 0.47 | 10 | 7 |
8 | red | 7.8 | 0.58 | 0.02 | 2 | 0.073 | 9 | 18 | 0.9968 | 3.36 | 0.57 | 9.5 | 7 |
9 | red | 7.5 | 0.5 | 0.36 | 6.1 | 0.071 | 17 | 102 | 0.9978 | 3.35 | 0.8 | 10.5 | 5 |
# Keep only the rows whose 'type' column equals 'white'.
whites = wine.where(wine['type'].eq('white'))
whites.head(10) # show the first 10 white-wine rows
Daru::DataFrame(10x13) | |||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|---|
type | fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
1599 | white | 7 | 0.27 | 0.36 | 20.7 | 0.045 | 45 | 170 | 1.001 | 3 | 0.45 | 8.8 | 6 |
1600 | white | 6.3 | 0.3 | 0.34 | 1.6 | 0.049 | 14 | 132 | 0.994 | 3.3 | 0.49 | 9.5 | 6 |
1601 | white | 8.1 | 0.28 | 0.4 | 6.9 | 0.05 | 30 | 97 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
1602 | white | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47 | 186 | 0.9956 | 3.19 | 0.4 | 9.9 | 6 |
1603 | white | 7.2 | 0.23 | 0.32 | 8.5 | 0.058 | 47 | 186 | 0.9956 | 3.19 | 0.4 | 9.9 | 6 |
1604 | white | 8.1 | 0.28 | 0.4 | 6.9 | 0.05 | 30 | 97 | 0.9951 | 3.26 | 0.44 | 10.1 | 6 |
1605 | white | 6.2 | 0.32 | 0.16 | 7 | 0.045 | 30 | 136 | 0.9949 | 3.18 | 0.47 | 9.6 | 6 |
1606 | white | 7 | 0.27 | 0.36 | 20.7 | 0.045 | 45 | 170 | 1.001 | 3 | 0.45 | 8.8 | 6 |
1607 | white | 6.3 | 0.3 | 0.34 | 1.6 | 0.049 | 14 | 132 | 0.994 | 3.3 | 0.49 | 9.5 | 6 |
1608 | white | 8.1 | 0.22 | 0.43 | 1.5 | 0.044 | 28 | 129 | 0.9938 | 3.22 | 0.45 | 11 | 6 |
include Daru::Plotly::Methods # plot, generate_data
Object
# Build one histogram trace per wine type from its 'quality' column, then
# draw both traces in a single figure on a silver background.
red_trace_opts = { name: 'red', marker: { color: '#80273F' } }
white_trace_opts = { name: 'white', marker: { color: 'white' } }

red_qualities = generate_data(reds['quality'], type: :histogram, opts: red_trace_opts)
white_qualities = generate_data(whites['quality'], type: :histogram, opts: white_trace_opts)

silver_layout = {
  paper_bgcolor: 'silver',
  plot_bgcolor: 'silver',
  font: { color: 'white' }
}

Plotly::Plot.new(data: red_qualities + white_qualities, layout: silver_layout).show
#<CZTop::Socket::PUB:0x5651b1889ed0 last_endpoint="tcp://127.0.0.1:60555">
↑ なんとなく違いがありそう
reds['quality'].mean # mean quality score of the red wines
5.6360225140712945
whites['quality'].mean # mean quality score of the white wines
5.87790935075541
平均値を見ると、白ワインの方が評価が高いように見える。
たまたま高くなっただけじゃないのか? → t検定で検証する
# Register an analysis that runs a two-sample independent t-test on the red
# and white quality scores and prints its summary, then run it in batch mode.
Statsample::Analysis.store(Statsample::Test::T) do
  red_scores = reds['quality']
  white_scores = whites['quality']
  summary(Statsample::Test.t_two_samples_independent(red_scores, white_scores))
end
Statsample::Analysis.run_batch
NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:291. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:292. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/levene.rb:51. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/levene.rb:51. NOTE: Daru::Vector#only_valid is deprecated; use reject_values instead. It will be removed on or after 2016-10-01. Daru::Vector#only_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/levene.rb:60. NOTE: Daru::Vector#only_valid is deprecated; use reject_values instead. It will be removed on or after 2016-10-01. Daru::Vector#only_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/levene.rb:60. NOTE: Daru::Vector#only_valid is deprecated; use reject_values instead. It will be removed on or after 2016-10-01. Daru::Vector#only_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/levene.rb:71. NOTE: Daru::Vector#only_valid is deprecated; use reject_values instead. It will be removed on or after 2016-10-01. Daru::Vector#only_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/levene.rb:71. 
/usr/local/lib/ruby/gems/2.4.0/gems/distribution-0.7.3/lib/distribution/math_extension/incomplete_beta.rb:14: warning: constant ::Fixnum is deprecated NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:267. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:267. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:269. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:269. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:271. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:271. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:272. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:272. 
NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:281. NOTE: Daru::Vector#n_valid is deprecated; use count_values instead. It will be removed on or after 2016-10-01. Daru::Vector#n_valid called from /usr/local/lib/ruby/gems/2.4.0/gems/statsample-2.0.2/lib/statsample/test/t.rb:282.
Analysis 2017-05-22 16:20:08 +0000 = Statsample::Test::T == Two Sample T Test Mean and standard deviation +----------+--------+--------+------+ | Variable | mean | sd | n | +----------+--------+--------+------+ | quality | 5.6360 | 0.8076 | 1599 | | quality | 5.8779 | 0.8856 | 4898 | +----------+--------+--------+------+ Levene test for equality of variances : F(1, 6495) = 0.6213 , p = 0.4306 T statistics +--------------------+----------+-----------+----------------+ | Type | t | df | p (both tails) | +--------------------+----------+-----------+----------------+ | Equal variance | -9.6856 | 6495 | 0.0000 | | Non equal variance | -10.1494 | 2950.7505 | 0.0000 | +--------------------+----------+-----------+----------------+ Effect size +-------+---------+ | x1-x2 | -0.2419 | | d | -0.2599 | +-------+---------+
p値が十分小さい(p < 0.0001。出力上は丸められて 0.0000 と表示されている)ので、統計的に有意な差がある → 白ワインの方が総じて評価が高い
※あくまでこのデータからは有意な差があるというだけ
だいたいのことができますが、まだ不十分なところもあります。
Daru::DataFrame#corr
相関係数を出すメソッド → 遅い
# Correlation matrix between the columns of the data frame.
wine.corr
Daru::DataFrame(12x12) | ||||||||||||
---|---|---|---|---|---|---|---|---|---|---|---|---|
fixed acidity | volatile acidity | citric acid | residual sugar | chlorides | free sulfur dioxide | total sulfur dioxide | density | pH | sulphates | alcohol | quality | |
fixed acidity | 1.0 | 0.21900825635100266 | 0.32443572544730104 | -0.11198128107823678 | 0.29819477170273795 | -0.28273542836956794 | -0.32905390129522527 | 0.45890998228044255 | -0.2527004683162333 | 0.29956774438249995 | -0.09545152256332995 | -0.07674320790962286 |
volatile acidity | 0.21900825635100266 | 1.0000000000000002 | -0.37798131705526566 | -0.19601117434765503 | 0.3771242764338664 | -0.35255730641340716 | -0.4144761946507196 | 0.2712956478511821 | 0.261454402742256 | 0.2259836797410744 | -0.03764038583468137 | -0.26569947761148033 |
citric acid | 0.32443572544730104 | -0.37798131705526566 | 1.0 | 0.14245122598675725 | 0.038998014089851825 | 0.13312580951823125 | 0.19524197598145304 | 0.09615392906417021 | -0.32980819113172016 | 0.0561973001349725 | -0.01049349217337923 | 0.08553171718367827 |
residual sugar | -0.11198128107823678 | -0.19601117434765503 | 0.14245122598675725 | 0.9999999999999999 | -0.12894049990326722 | 0.4028706400566557 | 0.4954815870066449 | 0.552516950293483 | -0.2673198368768104 | -0.1859274052901841 | -0.3594147708159969 | -0.03698048458576987 |
chlorides | 0.29819477170273795 | 0.3771242764338664 | 0.038998014089851825 | -0.12894049990326722 | 1.0000000000000002 | -0.1950447852077002 | -0.27963044744333765 | 0.3626146565578091 | 0.04470797955050543 | 0.395593306547331 | -0.2569155799729125 | -0.20066550043510553 |
free sulfur dioxide | -0.28273542836956794 | -0.35255730641340716 | 0.13312580951823125 | 0.4028706400566557 | -0.1950447852077002 | 1.0000000000000002 | 0.7209340813785245 | 0.025716842144663805 | -0.14585389640016552 | -0.18845724880121598 | -0.1798384348893394 | 0.05546305861663346 |
total sulfur dioxide | -0.32905390129522527 | -0.4144761946507196 | 0.19524197598145304 | 0.4954815870066449 | -0.27963044744333765 | 0.7209340813785245 | 1.0 | 0.03239451234680212 | -0.23841310290340784 | -0.27572681991620573 | -0.26573963910715914 | -0.04138545385560974 |
density | 0.45890998228044255 | 0.2712956478511821 | 0.09615392906417021 | 0.552516950293483 | 0.3626146565578091 | 0.025716842144663805 | 0.03239451234680212 | 1.0 | 0.011686080687174244 | 0.25947849534575335 | -0.6867454216813362 | -0.3058579060694189 |
pH | -0.2527004683162333 | 0.261454402742256 | -0.32980819113172016 | -0.2673198368768104 | 0.04470797955050543 | -0.14585389640016552 | -0.23841310290340784 | 0.011686080687174244 | 1.0000000000000002 | 0.19212340657115304 | 0.12124846709464465 | 0.019505703714435736 |
sulphates | 0.29956774438249995 | 0.2259836797410744 | 0.0561973001349725 | -0.1859274052901841 | 0.395593306547331 | -0.18845724880121598 | -0.27572681991620573 | 0.25947849534575335 | 0.19212340657115304 | 0.9999999999999999 | -0.003029194944255261 | 0.038485445876515374 |
alcohol | -0.09545152256332995 | -0.03764038583468137 | -0.01049349217337923 | -0.3594147708159969 | -0.2569155799729125 | -0.1798384348893394 | -0.26573963910715914 | -0.6867454216813362 | 0.12124846709464465 | -0.003029194944255261 | 1.0 | 0.44431852000752226 |
quality | -0.07674320790962286 | -0.26569947761148033 | 0.08553171718367827 | -0.03698048458576987 | -0.20066550043510553 | 0.05546305861663346 | -0.04138545385560974 | -0.3058579060694189 | 0.019505703714435736 | 0.038485445876515374 | 0.44431852000752226 | 1.0 |
# Render the correlation matrix as a heatmap. Note that wine.corr is computed
# again here rather than reusing an earlier result.
plot(wine.corr, type: :heatmap).show
#<CZTop::Socket::PUB:0x5651b1889ed0 last_endpoint="tcp://127.0.0.1:60555">
Daru:
> ruby corr.rb
user system total real
correlation 5.330000 0.100000 5.430000 ( 5.613104)
pandas:
> python3 corr.py
## benchmarker: release 4.0.1 (for python)
## python version: 3.5.2
## python compiler: GCC 4.2.1 Compatible Apple LLVM 7.3.0 (clang-703.0.31)
## python platform: Darwin-16.4.0-x86_64-i386-64bit
## python executable: /usr/local/opt/python3/bin/python3.5
## cpu model: Intel(R) Core(TM) i7-3520M CPU @ 2.90GHz
## parameters: loop=1, cycle=1, extra=0
## real (total = user + sys)
corr 0.0032 0.0000 0.0000 0.0000
## Ranking real
corr 0.0032 (100.0) ********************
## Matrix real [01]
[01] corr 0.0032 100.0
pandasでは約0.003秒(real)で終わる処理が、daruでは約5.6秒(real)もかかる。 $\because$ pandasは内部実装がCythonで高速化されているため
DataFrame#where
pandas:
df[(df['quality'] >= 6) & (df['alcohol'] <= 10)] # qualityが6以上でalcoholが10以下の行を抽出
df.loc[(df['quality'] >= 6) & (df['alcohol'] <= 10), 'pH'] = 0 # 上の条件の行のpHを0にする
df # pHが上書きされている
daru:
df[df['quality'] >= 6 && df['alcohol'] <= 10] # qualityが6以上でalcoholが10以下の行を抽出
df[df['quality'] >= 6 && df['alcohol'] <= 10]['pH'] = 0 # fails!
pandas
と比較すると
など
pandasと同じものを目指すなら同じ程度の労力が必要だが、pandasほどのユーザー数は見込めない?
pandasに対抗するためには、Rubyの魅力を活かす必要がある。
Rubyでシュッとグラフを書くみたいなことができるだけでも十分?
※個人の感想です