#Ithaca (Cornell Univ) monthly weather history, scraped from weather-warehouse.com
import re
from scipy import stats
from pylab import *   #plot, array, mean, etc.; or run under ipython --pylab
#handy reminders:
#r,p = stats.pearsonr(xdata,ydata)
#slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)

months = ('January','February','March','April','May','June',
          'July','August','September','October','November','December')

#get the data files and store them locally (run once, then work from the saved copies)
#from urllib2 import urlopen
#rawpage={}
#for m in months:
#    url='http://weather-warehouse.com/WeatherHistory/PastWeatherData_IthacaCornellUniv_Ithaca_NY_'+m+'.html'
#    rawpage[m]=urlopen(url).read()
#    with open('wdata/'+m+'.html','w') as f: f.write(rawpage[m])

def getdata(file):
    #parse one saved page (files in subdirectory, e.g., wdata/March.html)
    #NOTE: the HTML tag strings marked 'assumed' below are stand-ins chosen to
    #match the split points and slice lengths used; the exact markup of the
    #source page is not shown here.
    page = open(file).read()
    data = []
    skipped = []
    ids = []
    titles = []
    for row in page.split('\n'):             #assumed: one table row per line
        s = row.split('</td><td>')           #split the table entries (assumed cell delimiter)
        if len(s) == 1: continue
        if s[0].find('The official station') > -1: continue
        if s[0].startswith('<th'):           #assumed: the table's header row
            header = row
            ids = re.findall("id='(.*?)'",header)[1:]       #column ids
            titles = re.findall("title='(.*?)\.?'",header)  #column descriptions
            continue
        if s[0].startswith('<tr><td>'): s[0] = s[0][8:]        #strip leading markup (assumed tag)
        if s[-1].endswith('</td></tr>'): s[-1] = s[-1][:-10]   #strip trailing markup (assumed tag)
        #skip the entire year if any value is missing
        if '' in s:
            skipped.append(s[0])
            continue
        data += [ [int(s[0])] + map(float,s[1:]) ]
    print file,'skipping:',' '.join(skipped)
    #print ', '.join([str(i)+':'+ids[i] for i in range(len(ids))])
    return data
#getdata returns one row per year: [[1900, 11.0, 6.0, ...], [1901, ...], ...]

##http://weather-warehouse.com/WeatherHistory/PastWeatherData_IthacaCornellUniv_Ithaca_NY_March.html
data = getdata('wdata/March.html')   #quick check on a single month

#read every month and transpose the rows into 12 columns:
#yd[m] = [[1900 .. 2012], [11.0, 6.0, ...], ... ]
yd={}
for m in months:
    data = getdata('wdata/'+m+'.html')
    yd[m] = [[yr[j] for yr in data] for j in range(12)]

#find all the pearson correlations (with year) for the temperature columns
cor={}
for m in months:
    cor[m]=[]
    for j in range(1,8):
        slope, intercept, r_value, p_value, std_err = stats.linregress(yd[m][0],yd[m][j])
        cor[m].append(r_value)
#find the biggest correlations
#column indices: 0:year, 1:lotemp, 2:hitemp, 3:himin, 4:lomax, 5:avgmin, 6:avgmax, 7:mean
[(m,i+1,cor[m][i]) for m in cor for i in range(len(cor[m])) if cor[m][i] > .15]

#February low max vs year
xdata=yd['February'][0]
ydata=yd['February'][4]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
print r_value, p_value, std_err
line = slope*array(xdata)+intercept
plot(xdata,ydata,'go',xdata,line,'r-')
xlim(1900,2013)
grid('on')

figure(figsize=(6,9))
#pick Aug, half a year away, to compare against Feb
xdata=yd['August'][0]
ydata=yd['August'][4]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
print r_value, p_value, std_err
line = slope*array(xdata)+intercept
plot(xdata,ydata,'o',xdata,line,'m-')
xdata=yd['February'][0]
ydata=yd['February'][4]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
print r_value, p_value, std_err
line = slope*array(xdata)+intercept
plot(xdata,ydata,'o',xdata,line,'r-')
yticks(linspace(-10,80,19,endpoint=True))
xlim(1900,2013)
#legend labels alternate: data points, then the fitted line for that series
legend(['Aug low max','r=.04, p=.74','Feb low max','r=.23, p=.017'],'center left')
grid('on')

#find all the pearson correlations (with year) for the rain and snow columns
cor={}
for m in months:
    cor[m]=[]
    for j in range(8,12):
        slope, intercept, r_value, p_value, std_err = stats.linregress(yd[m][0],yd[m][j])
        cor[m].append(r_value)
#find the large correlations
#column indices: 8:precip, 9:snow, 10:maxprecip, 11:maxsnow
[(m,i+8,cor[m][i]) for m in cor for i in range(len(cor[m])) if cor[m][i] > .2]

#September precipitation vs year
xdata=yd['September'][0]
ydata=yd['September'][8]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
print r_value, p_value, std_err
line = slope*array(xdata)+intercept
plot(xdata,ydata,'o',xdata,line,'m-')
ydata=yd['September'][10]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
print r_value, p_value, std_err
line = slope*array(xdata)+intercept
plot(xdata,ydata,'o',xdata,line,'r-')
grid('on')
xlim(1900,2013)
legend(['Sep precip','correl=.29, p=.007','Sep maxprecip','correl=.31, p=.004'],'upper left')

#find the large anti-correlations
[(m,i+8,cor[m][i]) for m in cor for i in range(len(cor[m])) if cor[m][i] < -.1]
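#The regress / print / plot steps above and below repeat for every series; a
#small helper like this sketch could factor them out.  fitplot is a name
#introduced here for illustration only and is not used elsewhere in these notes.
def fitplot(xdata, ydata, pointstyle, linestyle):
    slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
    print r_value, p_value, std_err
    plot(xdata, ydata, pointstyle, xdata, slope*array(xdata)+intercept, linestyle)
    return r_value, p_value
#e.g. fitplot(yd['April'][0], yd['April'][9], 'o', 'm-') would reproduce the
#first of the April plots below.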
#April snow and max snow vs year (the anti-correlated columns found above)
xdata=yd['April'][0]
ydata=yd['April'][9]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
print r_value, p_value, std_err
line = slope*array(xdata)+intercept
plot(xdata,ydata,'o',xdata,line,'m-')
ydata=yd['April'][11]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
print r_value, p_value, std_err
line = slope*array(xdata)+intercept
plot(xdata,ydata,'o',xdata,line,'r-')
grid('on')
xlim(1900,2013)
legend(['April snow','correl=-.13, p=.2','April maxsnow','correl=-.11, p=.26'])

#annual mean temperature: collect each year's monthly means
#(1919-1925 are excluded; those years do not appear in the monthly tables)
yrdata = {yr:[] for yr in range(1900,2013) if yr not in range(1919,1926)}
for data in [yd[m] for m in months]:
    yrs = data[0]
    avg = data[7]   #column 7 is the monthly mean temperature
    for i in range(len(yrs)):
        yrdata[yrs[i]].append(avg[i])
#fit a trend through the years that have all 12 months
xdata=[yr for yr in yrdata if len(yrdata[yr]) == 12]
ydata=[mean(yrdata[yr]) for yr in yrdata if len(yrdata[yr]) == 12]
slope, intercept, r_value, p_value, std_err = stats.linregress(xdata,ydata)
print r_value, p_value, std_err
line = slope*array(xdata)+intercept
plot(xdata,ydata,'o',xdata,line,'r-')
xlim(1926,2012)
grid('on')
#spot-check a couple of years' monthly means
yrdata[2011]
yrdata[1990]
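#Sketch only: express the fitted annual-mean slope as a change per century with
#a rough 2-sigma band.  slope and std_err come from the linregress call just
#above; the units are whatever the source tables report.
print 'annual-mean trend: %.2f +/- %.2f per century' % (slope*100, 2*std_err*100)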