import sys
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
%matplotlib inline
CRU TS4.01: Climatic Research Unit (CRU) Time-Series (TS) version 4.01 of high-resolution gridded data of month-by-month variation in climate (Jan. 1901 - Dec. 2016)
You can download the CRU TS4.01 data set from the CEDA Archive (registration required):
http://data.ceda.ac.uk/badc/cru/data/cru_ts/cru_ts_4.01/data/tmp
Note that use of these data is covered by the following licence:
http://www.nationalarchives.gov.uk/doc/open-government-licence/version/3/
# read the .dat file (whitespace-separated monthly temperature grids)
data = pd.read_csv("../data/cru_ts4.01.1901.2016.tmp.dat", delim_whitespace=True, header=None).values
# reshape to one row per month, one column per grid cell (360 lat x 720 lon)
data = data.reshape((-1, 360*720))
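As a quick shape check (a small sketch; Jan. 1901 - Dec. 2016 is 116 years, i.e. 1392 monthly grids):
# 116 years * 12 months = 1392 monthly grids, each 360 (latitude) x 720 (longitude) cells
assert data.shape == (1392, 360*720)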
# count, per grid cell, how many months carry the missing-value code -999
vvv = (data == -999).sum(axis=0)
# number of grid cells with no missing months -- matches the paper
(vvv == 0).sum()
67420
# keep only the always-valid cells: one row per grid cell, 1392 monthly values
data_live = data[:, vvv == 0].transpose().astype("int32")
data_live.shape
(67420, 1392)
# values appear to be tenths of a degree Celsius:
# -595 means -59.5 degC, 395 means 39.5 degC
data_live.max(), data_live.min()
(395, -595)
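To sanity-check the units, we can plot the annual mean of one grid cell (a minimal sketch; cell 0 is an arbitrary choice):
# annual mean temperature of one (arbitrary) valid grid cell,
# converted from tenths of a degree Celsius to degrees Celsius
cell = data_live[0].reshape(-1, 12).mean(axis=1) / 10.0
plt.plot(range(1901, 1901 + len(cell)), cell)
plt.xlabel("year")
plt.ylabel("annual mean temperature (deg C)")
plt.show()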
We want to reproduce a situation similar to MODEL4 in the paper, which reports 97% accuracy.
Here we randomly pick a grid cell and a start year, around 500K samples in total.
sample_size = 519718
np.random.seed(71)
sample_indices = np.random.choice(67420*77, sample_size, replace=False)
train_size = 519718 * 3 // 4
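Why 67420*77 above? Each sample needs 30 years of features plus 10 years for the label, so 116 years of data leave 116 - 30 - 10 + 1 = 77 valid start years per grid cell (a small check):
# number of valid start years per grid cell
n_years = data_live.shape[1] // 12   # 116
n_starts = n_years - 30 - 10 + 1     # 77
assert n_starts == 77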
The definition of RISE/FALL in the paper is not clear to me, so I simply compare the 30-year mean against the mean of the following 10 years
(the paper says the label is based on the mean temperature for the 10 years after the training period).
X = np.zeros((sample_size, 360))
y = np.zeros(sample_size).astype("int")
for i in range(sample_size):
    sample_ind = sample_indices[i]
    grid_ind = sample_ind // 77   # which of the 67420 grid cells
    year_ind = sample_ind % 77    # which start year (0 -> 1901, 76 -> 1977)
    # features: 30 years (360 months) of temperature
    X[i, :] = data_live[grid_ind, (year_ind*12):(year_ind*12+360)]
    mean30 = X[i, :].mean()
    # label: does the mean of the next 10 years (120 months) exceed the 30-year mean?
    mean10 = data_live[grid_ind, (year_ind*12+360):(year_ind*12+480)].mean()
    if mean10 > mean30:
        y[i] = 1
    else:
        y[i] = 0
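The same X and y can also be built without the Python loop; an equivalent vectorized sketch (X_vec and y_vec are hypothetical names, not used below):
# vectorized equivalent of the loop above
grid_inds = sample_indices // 77
month0 = (sample_indices % 77) * 12
feat_cols = month0[:, None] + np.arange(360)
label_cols = month0[:, None] + np.arange(360, 480)
X_vec = data_live[grid_inds[:, None], feat_cols].astype(float)
y_vec = (data_live[grid_inds[:, None], label_cols].mean(axis=1) > X_vec.mean(axis=1)).astype(int)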
# class balance (FALL, RISE) -- around the same distribution as in the paper
(1-y).sum(), y.sum()
(156818, 362900)
train_data = lgb.Dataset(X[:train_size, :], y[:train_size])
valid_data = lgb.Dataset(X[train_size:, :], y[train_size:])
params = {
    "objective": "binary",
    "metric": "binary_error"
}
lgb.train(params, train_set=train_data, valid_sets=[valid_data], early_stopping_rounds=200, num_boost_round=10000, verbose_eval=100)
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's binary_error: 0.222997
[1000]	valid_0's binary_error: 0.117663
[2000]	valid_0's binary_error: 0.0911953
[3000]	valid_0's binary_error: 0.0801047
[4000]	valid_0's binary_error: 0.0738013
[5000]	valid_0's binary_error: 0.0696221
[6000]	valid_0's binary_error: 0.0668129
[7000]	valid_0's binary_error: 0.064681
...
Early stopping, best iteration is:
[7467]	valid_0's binary_error: 0.0635573
<lightgbm.basic.Booster at 0x1162cb470>
1 - 0.0635573
0.9364427
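About 93.6% accuracy with raw features. For reference, the majority-class baseline on this split is much lower (a quick check):
# always-predict-RISE baseline on the random split's validation rows
print(y[train_size:].mean())   # roughly 0.70, so the model clearly beats it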
# per-sample min-max scaling: shift each 30-year series to start at 0, then scale to [0, 1],
# so the model sees the shape of the series rather than its absolute level
X_st = X - X.min(axis=1, keepdims=True)
X_st = X_st / X_st.max(axis=1, keepdims=True)
train_data = lgb.Dataset(X_st[:train_size, :], y[:train_size])
valid_data = lgb.Dataset(X_st[train_size:, :], y[train_size:])
params = {
    "objective": "binary",
    "metric": "binary_error"
}
lgb.train(params, train_set=train_data, valid_sets=[valid_data], early_stopping_rounds=200, num_boost_round=10000, verbose_eval=100)
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's binary_error: 0.167321
[1000]	valid_0's binary_error: 0.0637959
[2000]	valid_0's binary_error: 0.0456861
[3000]	valid_0's binary_error: 0.0390903
[4000]	valid_0's binary_error: 0.0360194
[5000]	valid_0's binary_error: 0.0345032
[6000]	valid_0's binary_error: 0.0335027
[7000]	valid_0's binary_error: 0.0326483
...
Early stopping, best iteration is:
[7037]	valid_0's binary_error: 0.0324559
<lightgbm.basic.Booster at 0x1162b5b00>
1 - 0.0324559
0.9675441
Per-sample normalization brings the accuracy to about 96.8%, close to the 97% reported for MODEL4 in the paper.
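The train calls above discard the returned Booster; a minimal sketch of recomputing the accuracy from its predictions, assuming the return value is captured as bst (a hypothetical name):
# capture the trained model and score the validation rows at the early-stopped best iteration
bst = lgb.train(params, train_set=train_data, valid_sets=[valid_data],
                num_boost_round=10000, early_stopping_rounds=200, verbose_eval=100)
pred = bst.predict(X_st[train_size:, :], num_iteration=bst.best_iteration)
print(((pred > 0.5).astype(int) == y[train_size:]).mean())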
Now we sample the train and valid data in a time-series manner and check the accuracy.
For the train data, we randomly pick a grid cell and a start year 0-54 (i.e. start years 1901-1955), 375K samples in total.
For the valid data, we randomly pick a grid cell and a start year 64-76 (i.e. start years 1965-1977), 125K samples in total.
The train data (both X and y) include no data from 1995 onward.
valid_y is computed entirely from data from 1995 onward.
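A quick index check that the labels split cleanly in time (a sketch using the month arithmetic above, with months counted from 0 = Jan. 1901):
# last month a training sample can touch (features or labels): start year 54
# -> months [648, 1128), i.e. through Dec. 1994
# first month a validation label can touch: start year 64 -> 64*12 + 360 = 1128, i.e. Jan. 1995
assert 54*12 + 480 == 64*12 + 360 == 1128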
train_size = 375000
np.random.seed(71)
train_sample_indices = np.random.choice(67420*55, train_size, replace=False)
valid_size = 125000
np.random.seed(71)
valid_sample_indices = np.random.choice(67420*13, valid_size, replace=False)
train_X = np.zeros((train_size, 360))
train_y = np.zeros(train_size).astype("int")
for i in range(train_size):
    sample_ind = train_sample_indices[i]
    grid_ind = sample_ind // 55
    year_ind = sample_ind % 55    # start years 0-54 (1901-1955)
    train_X[i, :] = data_live[grid_ind, (year_ind*12):(year_ind*12+360)]
    mean30 = train_X[i, :].mean()
    mean10 = data_live[grid_ind, (year_ind*12+360):(year_ind*12+480)].mean()
    if mean10 > mean30:
        train_y[i] = 1
    else:
        train_y[i] = 0
valid_X = np.zeros((valid_size, 360))
valid_y = np.zeros(valid_size).astype("int")
for i in range(valid_size):
    sample_ind = valid_sample_indices[i]
    grid_ind = sample_ind // 13
    year_ind = sample_ind % 13 + 64    # start years 64-76 (1965-1977)
    valid_X[i, :] = data_live[grid_ind, (year_ind*12):(year_ind*12+360)]
    mean30 = valid_X[i, :].mean()
    mean10 = data_live[grid_ind, (year_ind*12+360):(year_ind*12+480)].mean()
    if mean10 > mean30:
        valid_y[i] = 1
    else:
        valid_y[i] = 0
(1-train_y).sum(), train_y.sum(), (1-valid_y).sum(), valid_y.sum()
(162195, 212805, 2613, 122387)
train_y.mean(), valid_y.mean()
(0.56748, 0.979096)
We can see global warming here: about 98% of the validation labels are RISE,
so a trivial model that always predicts 1 already achieves 98% accuracy.
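That baseline is just the label mean (a trivial check):
# always-predict-RISE (all-ones) baseline accuracy on the validation set
print(valid_y.mean())   # ~0.979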
train_data = lgb.Dataset(train_X, train_y)
valid_data = lgb.Dataset(valid_X, valid_y)
params = {
    "objective": "binary",
    "metric": "binary_error"
}
lgb.train(params, train_set=train_data, valid_sets=[valid_data], early_stopping_rounds=200, num_boost_round=10000, verbose_eval=100)
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's binary_error: 0.145256
[200]	valid_0's binary_error: 0.141512
[300]	valid_0's binary_error: 0.140848
[400]	valid_0's binary_error: 0.143792
Early stopping, best iteration is:
[288]	valid_0's binary_error: 0.139184
<lightgbm.basic.Booster at 0x1162cbc18>
# same per-sample min-max scaling as before, applied separately to train and valid
train_X_st = train_X - train_X.min(axis=1, keepdims=True)
train_X_st = train_X_st / train_X_st.max(axis=1, keepdims=True)
valid_X_st = valid_X - valid_X.min(axis=1, keepdims=True)
valid_X_st = valid_X_st / valid_X_st.max(axis=1, keepdims=True)
train_data = lgb.Dataset(train_X_st, train_y)
valid_data = lgb.Dataset(valid_X_st, valid_y)
params = {
    "objective": "binary",
    "metric": "binary_error"
}
lgb.train(params, train_set=train_data, valid_sets=[valid_data], early_stopping_rounds=200, num_boost_round=10000, verbose_eval=100)
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's binary_error: 0.112464
[200]	valid_0's binary_error: 0.114136
[300]	valid_0's binary_error: 0.117384
Early stopping, best iteration is:
[119]	valid_0's binary_error: 0.111232
<lightgbm.basic.Booster at 0x1162cb080>
With the time-series split the best binary error is 0.111232, i.e. about 88.9% accuracy (86.1% without normalization). Both models fall well below the 97.9% all-ones baseline, so the ~97% accuracy from the random split does not reflect real forecasting skill.