In [1]:

import pandas as pd, numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:

ro=['5765358043206','9812808043220','9576658043223','1699958043227','7225068043228','265208043229']
hu=['8073718043234','2087988043232','6247548043235']

In [26]:

p='C:/Users/csala/Onedrive - Lancaster University/Datarepo/szekelydata/klima/'

In [44]:

stations=[]
for i in ro:
    stations.append(pd.read_csv(p+'high_res/raw/ro/'+i+'stn+.txt',delimiter= '+',skiprows=2,header=None))
    print(i)

5765358043206
9812808043220
9576658043223
1699958043227
7225068043228
265208043229

In [45]:

for i in hu:
    stations.append(pd.read_csv(p+'high_res/raw/hu/'+i+'stn+.txt',delimiter= '+',skiprows=2,header=None))
    print(i)

8073718043234
2087988043232
6247548043235

In [47]:

station=pd.concat(stations)
station=station.drop_duplicates()
station[2]=station[2].str.strip()
station[3]=station[3].str.strip()
station=station[[0,2,3,4,5,6]]
station.columns=['ID','LOC','COUNTRY','LAT','LON','ELEVATION']
station.to_csv(p+'stations.csv')

!!! 16G memory required at least, 64G recommended

RO

In [8]:

dfs=[]
for i in ro:
    df=pd.read_csv(p+'high_res/raw/ro/'+i+'dat.txt',delimiter= '\s+')
    dfs.append(df)
    print(i)

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

5765358043206

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (21) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

9812808043220
9576658043223
1699958043227

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,6,11,12,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

7225068043228

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

265208043229

In [10]:

dfz=pd.concat(dfs)

In [11]:

dfs=None #free memory
df=None #free memory

In [12]:

dfz.head()

Out[12]:

	USAF	WBAN	YR--MODAHRMN	DIR	SPD	GUS	CLG	SKC	L	M	...	SLP	ALT	STP	MAX	MIN	PCP01	PCP06	PCP24	PCPXX	SD
0	119000	99999	201205010000	040	2	***	***	***	*	*	...	1017.6	*****	984.8	***	***	*****	0.00	*****	*****	**
1	119000	99999	201205010100	040	2	***	***	***	*	*	...	1017.8	*****	984.9	***	***	*****	*****	*****	*****	**
2	119000	99999	201205010200	***	0	***	***	***	*	*	...	1018.1	*****	985	***	***	*****	*****	*****	*****	**
3	119000	99999	201205010300	030	2	***	***	***	*	*	...	1018.3	*****	985.2	***	***	*****	*****	*****	*****	**
4	119000	99999	201205010400	***	0	***	***	***	*	*	...	1018.8	*****	985.6	***	***	*****	*****	*****	*****	**

5 rows × 33 columns

In [13]:

#!!!! DO NOT DO THIS
#dfz=dfz.drop_duplicates()

In [14]:

dfz['time']=pd.to_datetime(dfz['YR--MODAHRMN'],format='%Y%m%d%H%M')
dfz['year']=dfz['time'].dt.year
dfz['month']=dfz['time'].dt.month
dfz['day']=dfz['time'].dt.day
dfz['hour']=dfz['time'].dt.hour

In [15]:

#keep only months with at least 6 days to avoid anomalies (20%)
filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()
filt2=filt[filt>5].dropna()
#keep only years with at least 3 months to avoid anomalies (20%)
filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()
filt4=filt3[filt3>3].dropna()

In [17]:

filt4.head()

Out[17]:

		month
USAF	year
119000	2012	8.0
	2013	12.0
	2014	12.0
	2015	12.0
	2016	4.0

In [ ]:

for stn in filt4.index.unique(0):
    years=filt4.loc[stn].index.unique()
    d=dfz[dfz['USAF']==stn]
    d=d[d['year'].isin(years)]
    d.to_csv(p+'high_res/export/'+str(stn)+'.csv')
    print(stn)

HU

In [51]:

dfs=[]
for i in hu:
    df=pd.read_csv(p+'high_res/raw/hu/'+i+'dat.txt',delimiter= '\s+')
    dfs.append(df)
    print(i)

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,11,12,21,22,23,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

8073718043234
2087988043232

C:\ProgramData\Anaconda3\lib\site-packages\IPython\core\interactiveshell.py:3058: DtypeWarning: Columns (4,11,12,20,21,22,23,24,25) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

6247548043235

In [52]:

dfz=pd.concat(dfs)

In [53]:

dfs=None #free memory
df=None #free memory

In [54]:

dfz['time']=pd.to_datetime(dfz['YR--MODAHRMN'],format='%Y%m%d%H%M')
dfz['year']=dfz['time'].dt.year
dfz['month']=dfz['time'].dt.month
dfz['day']=dfz['time'].dt.day
dfz['hour']=dfz['time'].dt.hour

In [55]:

#keep only months with at least 6 days to avoid anomalies (20%)
filt=dfz.groupby(['USAF','year','month'])[['day']].nunique()
filt2=filt[filt>5].dropna()
#keep only years with at least 3 months to avoid anomalies (20%)
filt3=filt2.reset_index().groupby(['USAF','year'])[['month']].nunique()
filt4=filt3[filt3>3].dropna()

In [ ]:

for stn in filt4.index.unique(0):
    years=filt4.loc[stn].index.unique()
    d=dfz[dfz['USAF']==stn]
    d=d[d['year'].isin(years)]
    d.to_csv(p+'high_res/export/'+str(stn)+'.csv')
    print(stn)