#!/usr/bin/env python
# coding: utf-8

# GHCN V2 Temperatures ANOM (C) CR 1200KM 1880-present
#
# GLOBAL Temperature Anomalies in .01 C base period: 1951-1980
#
# http://climatecode.org/

# In[1]:


import os

import git

# Clone ccc-gistemp and madqc if they are not already present.
if not os.path.exists('ccc-gistemp'):
    git.Git().clone('https://github.com/ClimateCodeFoundation/ccc-gistemp.git')

if not os.path.exists('madqc'):
    git.Git().clone('https://github.com/ClimateCodeFoundation/madqc.git')


# It seems that
#
# http://data.giss.nasa.gov/gistemp/sources_v3/GISTEMPv3_sources.tar.gz
#
# and
#
# http://data.giss.nasa.gov/pub/gistemp/SBBX.ERSST.gz
#
# are down, so let's use a local copy instead.

# In[2]:


get_ipython().system('mkdir -p ccc-gistemp/input')
get_ipython().system('cp GISTEMPv3_sources.tar.gz SBBX.ERSST.gz ccc-gistemp/input')


# In[3]:


get_ipython().run_line_magic('cd', 'ccc-gistemp/')


# We don't really need `pypy` for the fetch phase, but the code is Python 2 and the notebook is Python 3, so this is just a lazy way to call py2k code from a py3k notebook ;-p
#
# PS: we are also using the International Surface Temperature Initiative data (ISTI).

# In[4]:


get_ipython().system('pypy tool/fetch.py isti')


# QC the ISTI data.

# In[5]:


get_ipython().system('../madqc/mad.py --progress input/isti.merged.dat')


# We need to copy the ISTI data into the `input` directory.

# In[6]:


get_ipython().system('cp isti.merged.qc.dat input/isti.merged.qc.dat')
get_ipython().system('cp input/isti.merged.inv input/isti.merged.qc.inv')


# Here is where `pypy` is really needed: this step takes ~35 minutes on vanilla `python` but only ~100 seconds on `pypy`.

# In[7]:


# Run gistemp steps 0-1 and 3-5, using the QC'd ISTI data as the land source.
get_ipython().system("pypy tool/run.py -p 'data_sources=isti.merged.qc.dat;element=TAVG' -s 0-1,3-5")


# Python `gistemp` saves the results in the same format as the Fortran program, but it ships with `gistemp2csv.py` to make it easier to read the data with `pandas`.

# In[8]:


get_ipython().system('pypy tool/gistemp2csv.py result/*.txt')


# In[9]:


import pandas as pd

# The CSV has a 3-line header and uses '*****'/'****' as missing-value markers.
df = pd.read_csv(
    'result/landGLB.Ts.GHCN.CL.PA.csv',
    skiprows=3,
    index_col=0,
    na_values=('*****', '****'),
)


# Let's use `sklearn` to compute the full trend...

# In[10]:


from sklearn import linear_model
from sklearn.metrics import mean_squared_error, r2_score

reg0 = linear_model.LinearRegression()

series0 = df['J-D'].dropna()
y = series0.values
X = series0.index.values[:, None]

reg0.fit(X, y)
y_pred0 = reg0.predict(X)

# mean_squared_error gives the MSE of the fit; r2_score gives the R² used in the plot legend.
mse0 = mean_squared_error(y, y_pred0)
var0 = r2_score(y, y_pred0)


# ... and the trend over the past 30 years.

# In[11]:


reg1 = linear_model.LinearRegression()

# series1 already contains only the last 30 years, so we fit and score on it directly.
series1 = df['J-D'].dropna().iloc[-30:]
y = series1.values
X = series1.index.values[:, None]

reg1.fit(X, y)
y_pred1 = reg1.predict(X)

mse1 = mean_squared_error(y, y_pred1)
var1 = r2_score(y, y_pred1)


# In[12]:


get_ipython().run_line_magic('matplotlib', 'inline')

ax = df.plot.line(y='J-D', figsize=(9, 9), legend=None)
ax.plot(series0.index, y_pred0, 'r--')
ax.plot(series1.index, y_pred1, 'r')
ax.set_xlim([1879, 2018])

leg = f"""Trend in ℃/century (R²)
Full: {reg0.coef_[0]*100:0.2f} ({var0:0.2f})
30-year: {reg1.coef_[0]*100:0.2f} ({var1:0.2f})
"""

ax.text(0.10, 0.75, leg, transform=ax.transAxes);
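
# As a quick follow-up (not part of the original notebook), we can echo the fitted slopes and R² values as plain text, as a sanity check on the numbers shown in the plot legend. This sketch only uses variables already defined above (`reg0`, `reg1`, `var0`, `var1`) and mirrors the legend's per-century scaling.

# In[13]:


# Print the fitted trends computed above; `coef_[0] * 100` converts the
# per-year slope to a per-century value, with the same scaling as the legend.
print(f'Full record trend: {reg0.coef_[0] * 100:0.2f} per century (R² = {var0:0.2f})')
print(f'30-year trend:     {reg1.coef_[0] * 100:0.2f} per century (R² = {var1:0.2f})')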