import pandas as pd
from datetime import datetime
import scipy.stats as sts
def parse(yr, mon):
date = datetime(year=int(yr), day=2, month=int(mon))
return date
url='http://www.cpc.ncep.noaa.gov/data/indices/sstoi.indices'
data = pd.read_table(url, sep=' ', header=0, skiprows=0, parse_dates = [['YR', 'MON']], skipinitialspace=True, index_col=0, date_parser=parse)
print data.describe
print '\n'
print data.index
<bound method DataFrame.describe of <class 'pandas.core.frame.DataFrame'> DatetimeIndex: 377 entries, 1982-01-02 00:00:00 to 2013-05-02 00:00:00 Data columns (total 9 columns): NINO1+2 377 non-null values ANOM 377 non-null values NINO3 377 non-null values ANOM.1 377 non-null values NINO4 377 non-null values ANOM.2 377 non-null values NINO3.4 377 non-null values ANOM.3 377 non-null values Unnamed: 10 0 non-null values dtypes: float64(9)> <class 'pandas.tseries.index.DatetimeIndex'> [1982-01-02 00:00:00, ..., 2013-05-02 00:00:00] Length: 377, Freq: None, Timezone: None
grouped = data.groupby(lambda x: x.year)
grouped
<pandas.core.groupby.DataFrameGroupBy at 0x1095e4e10>
zscore = lambda x: (x - x.mean()) / x.std()
transformed = grouped.transform(zscore)
print transformed['ANOM.3']
YR_MON 1982-01-02 -0.986922 1982-02-02 -1.179216 1982-03-02 -1.179216 1982-04-02 -0.885119 1982-05-02 -0.376105 1982-06-02 0.087664 1982-07-02 -0.161188 1982-08-02 0.098975 1982-09-02 0.415695 1982-10-02 1.049134 1982-11-02 1.286674 1982-12-02 1.829622 1983-01-02 1.715072 1983-02-02 1.428598 1983-03-02 0.976272 ... 2012-03-02 -0.999284 2012-04-02 -0.663736 2012-05-02 -0.063283 2012-06-02 0.572491 2012-07-02 0.961020 2012-08-02 1.314227 2012-09-02 0.925699 2012-10-02 0.537170 2012-11-02 0.660793 2012-12-02 -0.169245 2013-01-02 -1.001483 2013-02-02 -0.924445 2013-03-02 0.462223 2013-04-02 1.386668 2013-05-02 0.077037 Name: ANOM.3, Length: 377, dtype: float64
npdata = np.genfromtxt(url, skip_header=1)
unique_enso_year = [int(value) for value in set(npdata[:, 0])]
nin34 = np.zeros(len(unique_enso_year))
for ind, year in enumerate(unique_enso_year):
indexes = np.flatnonzero(npdata[:, 0]==year)
if len(indexes) == 12:
nin34[ind] = np.mean(npdata[indexes, 9])
else:
nin34[ind] = np.nan
nin34x = (nin34 - sts.nanmean(nin34)) / sts.nanstd(nin34)
nin34y = np.vstack((unique_enso_year, nin34x))
nin34y
array([[ 1.98200000e+03, 1.98300000e+03, 1.98400000e+03, 1.98500000e+03, 1.98600000e+03, 1.98700000e+03, 1.98800000e+03, 1.98900000e+03, 1.99000000e+03, 1.99100000e+03, 1.99200000e+03, 1.99300000e+03, 1.99400000e+03, 1.99500000e+03, 1.99600000e+03, 1.99700000e+03, 1.99800000e+03, 1.99900000e+03, 2.00000000e+03, 2.00100000e+03, 2.00200000e+03, 2.00300000e+03, 2.00400000e+03, 2.00500000e+03, 2.00600000e+03, 2.00700000e+03, 2.00800000e+03, 2.00900000e+03, 2.01000000e+03, 2.01100000e+03, 2.01200000e+03, 2.01300000e+03], [ 1.52721525e+00, 7.79877334e-01, -9.70046905e-01, -1.01299736e+00, 1.93296839e-01, 1.97880860e+00, -1.60325932e+00, -1.17375478e+00, 2.44837384e-01, 9.67632179e-01, 9.77449426e-01, 5.68806528e-01, 6.17892762e-01, -2.70568072e-01, -6.84119592e-01, 1.85732017e+00, -1.90802942e-01, -1.71861197e+00, -1.28788027e+00, -3.49106046e-01, 1.10630079e+00, 4.66952593e-01, 5.85986710e-01, 1.96978306e-01, 2.73061969e-01, -7.51613164e-01, -1.06085644e+00, 5.73715152e-01, -6.42396294e-01, -1.20075220e+00, 6.33370760e-04, nan]])
attempt_cheating = (grouped.mean()['ANOM.3'][:-1] - sts.nanmean(grouped.mean()['ANOM.3'][:-1])) / sts.nanstd(grouped.mean()['ANOM.3'][:-1])
attempt_cheating - nin34x[:-1]
1982 0 1983 0 1984 0 1985 0 1986 0 1987 0 1988 0 1989 0 1990 0 1991 0 1992 0 1993 0 1994 0 1995 0 1996 0 1997 0 1998 0 1999 0 2000 0 2001 0 2002 0 2003 0 2004 0 2005 0 2006 0 2007 0 2008 0 2009 0 2010 0 2011 0 2012 0 Name: ANOM.3, dtype: float64
grouped2 = data.groupby(lambda x: x.year).apply(lambda sdf: sdf if len(sdf) > 11 else None).reset_index(drop=True)
print grouped2
print '\n'
print grouped2.index
<class 'pandas.core.frame.DataFrame'> Int64Index: 372 entries, 0 to 371 Data columns (total 9 columns): NINO1+2 372 non-null values ANOM 372 non-null values NINO3 372 non-null values ANOM.1 372 non-null values NINO4 372 non-null values ANOM.2 372 non-null values NINO3.4 372 non-null values ANOM.3 372 non-null values Unnamed: 10 0 non-null values dtypes: float64(9) Int64Index([0 , 1 , 2 , 3 , 4 , 5 , 6 , 7 , 8 , 9 , 10 , 11 , 12 , 13 , 14 , 15 , 16 , 17 , 18 , 19 , 20 , 21 , 22 , 23 , 24 , 25 , 26 , 27 , 28 , 29 , 30 , 31 , 32 , 33 , 34 , 35 , 36 , 37 , 38 , 39 , 40 , 41 , 42 , 43 , 44 , 45 , 46 , 47 , 48 , 49 , 50 , 51 , 52 , 53 , 54 , 55 , 56 , 57 , 58 , 59 , 60 , 61 , 62 , 63 , 64 , 65 , 66 , 67 , 68 , 69 , 70 , 71 , 72 , 73 , 74 , 75 , 76 , 77 , 78 , 79 , 80 , 81 , 82 , 83 , 84 , 85 , 86 , 87 , 88 , 89 , 90 , 91 , 92 , 93 , 94 , 95 , 96 , 97 , 98 , 99 , 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 224, 225, 226, 227, 228, 229, 230, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 278, 279, 280, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 317, 318, 319, 320, 321, 322, 323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371], dtype=int64)
(grouped.mean()['ANOM.3'][:-1] - sts.nanmean(grouped.mean()['ANOM.3'][:-1])) / sts.nanstd(grouped.mean()['ANOM.3'][:-1])
1982 1.527215 1983 0.779877 1984 -0.970047 1985 -1.012997 1986 0.193297 1987 1.978809 1988 -1.603259 1989 -1.173755 1990 0.244837 1991 0.967632 1992 0.977449 1993 0.568807 1994 0.617893 1995 -0.270568 1996 -0.684120 1997 1.857320 1998 -0.190803 1999 -1.718612 2000 -1.287880 2001 -0.349106 2002 1.106301 2003 0.466953 2004 0.585987 2005 0.196978 2006 0.273062 2007 -0.751613 2008 -1.060856 2009 0.573715 2010 -0.642396 2011 -1.200752 2012 0.000633 Name: ANOM.3, dtype: float64
gist = !jist -p test.ipynb
nbviewer = gist[0].replace('https://gist.github.com/','http://nbviewer.ipython.org/')
print nbviewer