#!/usr/bin/env python
# coding: utf-8

# In[7]:


import pandas as pd, numpy as np
from scipy import stats
import matplotlib.pyplot as plt
get_ipython().run_line_magic('matplotlib', 'inline')


# In[88]:


stations = pd.read_csv('data/stations.csv').set_index('ID')


# Setup plot params

# In[89]:


def get_country(c, h='hs', plot=False):
    """Load the daily station data for country code `c` and add calendar columns."""
    if c == 'huro':
        hu = pd.read_csv('data/hu_' + h + '.csv')  # daily data, Hungary
        ro = pd.read_csv('data/ro_' + h + '.csv')  # daily data, Romania
        df = pd.concat([hu, ro])
    else:
        df = pd.read_csv('data/' + c + '_' + h + '.csv')  # daily data
    # df = pd.read_csv('data/' + c + '_hs.csv')  # high-res data
    df['time'] = pd.to_datetime(df['time'])
    df['year'] = df['time'].dt.year
    df['month'] = df['time'].dt.month
    df['day'] = df['time'].dt.day
    df['hour'] = df['time'].dt.hour
    df = df.set_index('time').sort_index()
    if plot:
        # number of distinct stations reporting per year
        df.groupby('year').nunique()['ID'].plot()
    return df


# Re-run this section for `ro`, `hu` and `huro`

# In[380]:


# Run first with c='ro' and c='hu' to populate df_ro / df_hu, then with 'huro' to combine.
c = 'huro'
if c != 'huro':
    df = get_country(c, plot=True)
    df_ro = df.copy()
    # df_hu = df.copy()
else:
    df = pd.concat([df_ro, df_hu])


# In[382]:


import math

def haversine(coord1, coord2):
    """Great-circle distance between two (lat, lon) pairs, in kilometers."""
    R = 6372800  # Earth radius in meters
    lat1, lon1 = coord1
    lat2, lon2 = coord2
    phi1, phi2 = math.radians(lat1), math.radians(lat2)
    dphi = math.radians(lat2 - lat1)
    dlambda = math.radians(lon2 - lon1)
    a = math.sin(dphi / 2)**2 + \
        math.cos(phi1) * math.cos(phi2) * math.sin(dlambda / 2)**2
    return 2 * R * math.atan2(math.sqrt(a), math.sqrt(1 - a)) / 1000


# In[383]:


# Map stations closer than 10 km to a single canonical ID.
coerce = {}
for i1 in range(len(stations.index)):
    station1 = stations.index[i1]
    for i2 in range(i1, len(stations.index)):
        station2 = stations.index[i2]
        if station1 != station2:
            if haversine(stations.loc[station1][['LAT', 'LON']].values,
                         stations.loc[station2][['LAT', 'LON']].values) < 10:
                print(station1, station2, stations.loc[station1]['LOC'], stations.loc[station2]['LOC'])
                if station2 not in coerce:
                    if station1 in coerce:
                        coerce[station2] = coerce[station1]
                    else:
                        coerce[station2] = station1


# In[384]:


df['ID'] = df['ID'].replace(coerce)


# In[385]:


station_list = df['ID'].unique()


# In[386]:


# Repeat the near-duplicate check on the coerced ID list.
coerce = {}
for i1 in range(len(station_list)):
    station1 = station_list[i1]
    for i2 in range(i1, len(station_list)):
        station2 = station_list[i2]
        if station1 != station2:
            if haversine(stations.loc[station1][['LAT', 'LON']].values,
                         stations.loc[station2][['LAT', 'LON']].values) < 10:
                print(station1, station2, stations.loc[station1]['LOC'], stations.loc[station2]['LOC'])
                if station2 not in coerce:
                    if station1 in coerce:
                        coerce[station2] = coerce[station1]
                    else:
                        coerce[station2] = station1


# If nothing is found, continue; otherwise loop back and re-run the coercion.
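# The two near-duplicate passes above can also be folded into a single fixpoint
# loop, so the "re-run until nothing is found" step is automatic. A minimal
# sketch, assuming the `stations`, `df` and `haversine` defined above; the
# helper name `coerce_pass` is illustrative, not part of the original notebook.

# In[ ]:


def coerce_pass(ids):
    # One merge pass: map each station to an earlier station within 10 km.
    found = {}
    for i1 in range(len(ids)):
        for i2 in range(i1 + 1, len(ids)):
            s1, s2 = ids[i1], ids[i2]
            if haversine(stations.loc[s1][['LAT', 'LON']].values,
                         stations.loc[s2][['LAT', 'LON']].values) < 10:
                if s2 not in found:
                    found[s2] = found.get(s1, s1)  # chain to s1's canonical ID
    return found


mapping = coerce_pass(list(df['ID'].unique()))
while mapping:  # each pass strictly shrinks the set of IDs, so this terminates
    df['ID'] = df['ID'].replace(mapping)
    mapping = coerce_pass(list(df['ID'].unique()))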
# In[398]:


keys = ['XTEMP', 'XSPD', 'XVSB']
dz = df.groupby(['ID', 'month', 'hour', 'year']).median()[keys]  # per-station medians by year
dzm = dz.groupby(['ID', 'month', 'hour']).median()[keys]         # baseline over all years
# ...or a baseline over the last decade (2009-2019)
dzm10 = df[df['year'].isin(range(2009, 2020))].groupby(['ID', 'month', 'hour']).median()[keys]
dzn = dz - dzm      # anomalies against the all-years baseline
dzn10 = dz - dzm10  # anomalies against the 2009-2019 baseline
dzp = dzn.groupby(['ID', 'year']).median()
dzp10 = dzn10.groupby(['ID', 'year']).median()

# get outliers: drop station-years whose temperature anomaly z-score exceeds 3
dws = []
for station in dzp.index.get_level_values(0).unique():
    dw = dzp.loc[station][['XTEMP']]
    dw['z'] = np.abs(stats.zscore(dw).flatten())
    dw = dw[dw['z'] < 3]
    dw['ID'] = station
    dws.append(dw)
dws = pd.concat(dws)

# slice under new index
dzp = dzp.loc[dws.reset_index().set_index(['ID', 'year']).index]
dzp10 = dzp10.loc[dws.reset_index().set_index(['ID', 'year']).index]

# export: rescale each station's anomalies to [-1, 1]
dzr = (((dzp - dzp.groupby('ID').min()) / (dzp.groupby('ID').max() - dzp.groupby('ID').min())) * 2) - 1
dzr.columns = [i.replace('X', 'N') for i in dzr.columns]
dzp10.columns = [i + '10' for i in dzp10.columns]
dzq = dzp.join(dzp10).join(dzr).join(stations[['LOC', 'LAT', 'LON', 'ELEVATION']])


# In[399]:


# Export stations with more than `i` years of data; the 40-year cut is the default set.
for i in [10, 20, 30, 40, 50, 60]:
    dk = dzq.reset_index().set_index('ID').loc[dzp.groupby(['ID']).count()['XTEMP'] > i]
    dk.to_csv('stripes/' + c + '_' + str(i) + '.csv')
    if i == 40:
        dk.to_csv('stripes/' + c + '.csv')
        translated_stations = dk.index.unique()


# Stations - only for `huro`

# In[370]:


import json


# In[371]:


namer = json.loads(open('data/namer.json', 'r').read())


# In[393]:


# Print template entries for station names not yet translated.
for i in stations.loc[translated_stations]['LOC'].values:
    if i not in namer:
        print('"' + i + '":"' + i.capitalize() + '",')


# In[394]:


namer2 = {
    "BAISOARA": "Băișoara",
    "AVRAMENI": "Avrameni",
    "BAIA MARE/MAGHERUSI": "Nagybánya - Miszmogyorós",
    "BISTRITA": "Beszterce",
    "BARNOVA": "Iași / Bârnova",
    "BATOS": "Bátos",
    "BARLAD": "Bârlad",
    "ARAD": "Arad",
    "ALBA IULIA": "Gyulafehérvár",
    "BLAJ": "Balázsfalva",
    "BARAOLT": "Barót",
    "ADJUD": "Adjud",
    "BALINTESTI": "Bălintești",
    "BALEA LAC": "Bâlea-tó",
    "BISOCA": "Bisoca",
    "BANLOC": "Bánlak",
    "APA NEAGRA": "Apa Neagră",
    "BAILE HERCULANE": "Herkulesfürdő",
    "BERZASCA": "Berzasca",
    "BACLES": "Bâcleș",
    "BAILESTI": "Băilești",
    "ADAMCLISI": "Adamclisi",
    "ALEXANDRIA": "Alexandria",
    "BECHET": "Bechet",
    "DARABANI": "Darabani",
    "BORCEA FETESTI AIR BASE": "Fetești légibázis",
    "SIGHETUL MARMATIEI": "Máramarossziget",
    "RADAUTI": "Rădăuți",
    "DOROHOI": "Dorohoi",
    "SUCEAVA/SALCEA": "Suceava",
    "BALATON": "Balaton",
    "PECS": "Pécs",
    "CEAHLAU": "Csalhó",
    "POSTAVARU": "Postăvaru-csúcs",
    "CRAIOVA": "Craiova",
    "TURNU-MAGURELE": "Turnu Măgurele"}


# In[395]:


namer.update(namer2)


# In[397]:


open('data/namer.json', 'w').write(json.dumps(namer))


# Carpet plots

# In[404]:


# Quick look at one station's monthly anomalies (uses `station` from the loop above).
dzn10.loc[station].groupby(['year', 'month']).median()[['XTEMP']].reset_index()


# In[410]:


import seaborn as sns


# In[499]:


for station in df['ID'].unique():
    # year x month matrix of temperature anomalies for one station
    dy = dzn.loc[station].groupby(['year', 'month']).median()[['XTEMP']].unstack().T
    # fill gap years with NaN so the carpet has a continuous year axis
    for i in range(min(dy.columns) + 1, max(dy.columns)):
        if i not in dy.columns:
            dy[i] = np.nan
    dy = dy[range(min(dy.columns), max(dy.columns) + 1)]
    dy = dy[dy > dy.min().min() * 0.7]
    dy=dy[dy
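# For reference, a minimal sketch of how one station's year-by-month anomaly
# matrix can be drawn as a carpet plot with seaborn, assuming the `dzn` frame
# built above; the colormap, figure size and labels are illustrative choices,
# not the original notebook's.

# In[ ]:


station = df['ID'].unique()[0]
# rows = month, columns = year, values = median XTEMP anomaly
dy = dzn.loc[station].groupby(['year', 'month']).median()['XTEMP'].unstack().T
fig, ax = plt.subplots(figsize=(12, 4))
sns.heatmap(dy, cmap='RdBu_r', center=0, ax=ax,
            cbar_kws={'label': 'XTEMP anomaly'})
ax.set_title(str(stations.loc[station]['LOC']))
ax.set_xlabel('year')
ax.set_ylabel('month')
plt.show()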