In [1]:
import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
ro=['5765358043206','9812808043220','9576658043223','1699958043227','7225068043228','265208043229']
hu=['8073718043234','2087988043232','6247548043235']
In [26]:
p='C:/Users/csala/Onedrive - Lancaster University/Datarepo/szekelydata/klima/'
In [44]:
stations=[]
for i in ro:
    stations.append(pd.read_csv(p+'high_res/raw/ro/'+i+'stn+.txt',delimiter= '+',skiprows=2,header=None))
    print(i)
5765358043206
9812808043220
9576658043223
1699958043227
7225068043228
265208043229
In [45]:
for i in hu:
    stations.append(pd.read_csv(p+'high_res/raw/hu/'+i+'stn+.txt',delimiter= '+',skiprows=2,header=None))
    print(i)
8073718043234
2087988043232
6247548043235
In [47]:
station=pd.concat(stations)
station=station.drop_duplicates()
station[2]=station[2].str.strip()
station[3]=station[3].str.strip()
station=station[[0,2,3,4,5,6]]
station.columns=['ID','LOC','COUNTRY','LAT','LON','ELEVATION']
station.to_csv(p+'stations.csv')

!!! 16G memory required at least, 64G recommended

RO

In [8]:
dfs=[]
for i in ro:
    df=pd.read_csv(p+'high_res/raw/ro/'+i+'dat.txt',delimiter= '\s+')
    dfs.append(df)
    print(i)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
5765358043206
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
9812808043220
9576658043223
1699958043227
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,6,11,12,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
7225068043228
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
265208043229
In [10]:
dfz=pd.concat(dfs)
In [11]:
dfs=None #free memory
df=None #free memory
In [12]:
dfz.head()
Out[12]:
USAF WBAN YR--MODAHRMN DIR SPD GUS CLG SKC L M ... SLP ALT STP MAX MIN PCP01 PCP06 PCP24 PCPXX SD
0 119000 99999 201205010000 040 2 *** *** *** * * ... 1017.6 ***** 984.8 *** *** ***** 0.00 ***** ***** **
1 119000 99999 201205010100 040 2 *** *** *** * * ... 1017.8 ***** 984.9 *** *** ***** ***** ***** ***** **
2 119000 99999 201205010200 *** 0 *** *** *** * * ... 1018.1 ***** 985 *** *** ***** ***** ***** ***** **
3 119000 99999 201205010300 030 2 *** *** *** * * ... 1018.3 ***** 985.2 *** *** ***** ***** ***** ***** **
4 119000 99999 201205010400 *** 0 *** *** *** * * ... 1018.8 ***** 985.6 *** *** ***** ***** ***** ***** **

5 rows × 33 columns

In [13]:
#!!!! DO NOT DO THIS
#dfz=dfz.drop_duplicates()
In [14]:
dfz['time']=pd.to_datetime(dfz['YR--MODAHRMN'],format='%Y%m%d%H%M')
dfz['year']=dfz['time'].dt.year
dfz['month']=dfz['time'].dt.month
dfz['day']=dfz['time'].dt.day
dfz['hour']=dfz['time'].dt.hour
In [15]:
#keep only months with at least 6 days to avoid anomalies (20%)
filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()
filt2=filt[filt>5].dropna()
#keep only years with at least 3 months to avoid anomalies (20%)
filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()
filt4=filt3[filt3>3].dropna()
In [17]:
filt4.head()
Out[17]:
month
USAF year
119000 2012 8.0
2013 12.0
2014 12.0
2015 12.0
2016 4.0
In [ ]:
for stn in filt4.index.unique(0):
    years=filt4.loc[stn].index.unique()
    d=dfz[dfz['USAF']==stn]
    d=d[d['year'].isin(years)]
    d.to_csv(p+'high_res/export/'+str(stn)+'.csv')
    print(stn)
119000
150000
150001
150002
150010
150040
150070
150090
150100
150105
150140
150150
150200
150230
150235
150250
150320
150330
150400
150410
150420
150440
150470
150520
150550
150560
150630
150690
150730
150750
150800
150830
150850
150880
150890
150900
150940
150950
150990
151070
151080
151090
151110
151130
151170
151180
151190
151200
151205
151230
151240
151270
151320
151340
151360
151380
151400
151430
151450
151455
151480
151500
151540
151580
151590
151600
151620
151630
151650
151680
151700
151740
151790
151820
151840
151890
151940
151970
151990
152000
152005
152040
152060
152080
152090
152120
152150
152170
152190
152210
152300
152310
152350
152380
152410
152450
152470
152540
152590
152600
152610
152620
152640
152650
152670
152700
152730
152770
152790
152800
152820
152840
152850
152870
152890
152920
152960
152970
152980
152990
153000
153010
153020
153070
153100
153140
153150
153160
153170
153190
153200
153210
153240
153250
153280
153330
153350
153355
153360
153370
153380
153400
153410
153440
153450
153460
153470
153490
153500
153550
153560
153600
153630
153640
153660
153690
153730
153750
153770
153870
153880
153890
153950
154020
154050
154060
154080
154090
154100
154120
154160
154190

HU

In [51]:
dfs=[]
for i in hu:
    df=pd.read_csv(p+'high_res/raw/hu/'+i+'dat.txt',delimiter= '\s+')
    dfs.append(df)
    print(i)
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,11,12,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
8073718043234
2087988043232
C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,11,12,20,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)
6247548043235
In [52]:
dfz=pd.concat(dfs)
In [53]:
dfs=None #free memory
df=None #free memory
In [54]:
dfz['time']=pd.to_datetime(dfz['YR--MODAHRMN'],format='%Y%m%d%H%M')
dfz['year']=dfz['time'].dt.year
dfz['month']=dfz['time'].dt.month
dfz['day']=dfz['time'].dt.day
dfz['hour']=dfz['time'].dt.hour
In [55]:
#keep only months with at least 6 days to avoid anomalies (20%)
filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()
filt2=filt[filt>5].dropna()
#keep only years with at least 3 months to avoid anomalies (20%)
filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()
filt4=filt3[filt3>3].dropna()
In [ ]:
for stn in filt4.index.unique(0):
    years=filt4.loc[stn].index.unique()
    d=dfz[dfz['USAF']==stn]
    d=d[d['year'].isin(years)]
    d.to_csv(p+'high_res/export/'+str(stn)+'.csv')
    print(stn)
127560
127660
127720
127860
128050
128120