import urllib2
from scipy import stats
from pandas import Series, DataFrame
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
# URL of the forest-fires CSV that is downloaded below.
path = 'https://archive.ics.uci.edu/ml/machine-learning-databases/forest-fires/forestfires.csv'

# Column-name bookkeeping. read_csv below is called without `names=`, so the
# DataFrame's columns come from the file itself, not from these tuples.
feature_names = ('X', 'Y', 'month', 'day', "FFMC", "DMC", 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain')
target_name = 'area'
all_names = feature_names + (target_name,)

# Download and parse the CSV. The HTTP response is closed explicitly, even if
# parsing raises, so the connection is not left open.
raw_csv = urllib2.urlopen(path)
try:
    df = pd.read_csv(raw_csv)
finally:
    raw_csv.close()

# Preview the first five rows and rows 140-144, separated by a blank line.
print(df.head(5))
print('')
print(df[140:145])
X Y month day FFMC DMC DC ISI temp RH wind rain area 0 7 5 mar fri 86.2 26.2 94.3 5.1 8.2 51 6.7 0.0 0.0 1 7 4 oct tue 90.6 35.4 669.1 6.7 18.0 33 0.9 0.0 0.0 2 7 4 oct sat 90.6 43.7 686.9 6.7 14.6 33 1.3 0.0 0.0 3 8 6 mar fri 91.7 33.3 77.5 9.0 8.3 97 4.0 0.2 0.0 4 8 6 mar sun 89.3 51.3 102.2 9.6 11.4 99 1.8 0.0 0.0 X Y month day FFMC DMC DC ISI temp RH wind rain area 140 2 5 sep mon 90.9 126.5 686.5 7.0 21.9 39 1.8 0.0 0.47 141 1 2 aug wed 95.5 99.9 513.3 13.2 23.3 31 4.5 0.0 0.55 142 8 6 aug fri 90.1 108.0 529.8 12.5 21.2 51 8.9 0.0 0.61 143 1 2 jul sat 90.0 51.3 296.3 8.7 16.6 53 5.4 0.0 0.71 144 2 5 aug wed 95.5 99.9 513.3 13.2 23.8 32 5.4 0.0 0.77
# Narrow the frame to the X/Y columns plus the `area` column.
sdf = df[['X', 'Y', 'area']]

# Show the first five rows, a blank separator line, then rows 140-144.
print(sdf.iloc[:5])
print('')
print(sdf.iloc[140:145])
X Y area 0 7 5 0.0 1 7 4 0.0 2 7 4 0.0 3 8 6 0.0 4 8 6 0.0 X Y area 140 2 5 0.47 141 1 2 0.55 142 8 6 0.61 143 1 2 0.71 144 2 5 0.77
# Narrow the frame to the `month` and `area` columns.
sdf = df[['month', 'area']]

# Show the first five rows, a blank separator line, then rows 140-144.
print(sdf.iloc[:5])
print('')
print(sdf.iloc[140:145])
month area 0 mar 0.0 1 oct 0.0 2 oct 0.0 3 mar 0.0 4 mar 0.0 month area 140 sep 0.47 141 aug 0.55 142 aug 0.61 143 jul 0.71 144 aug 0.77
# Narrow the frame to the temp / RH / wind / rain columns.
sdf = df[['temp', 'RH', 'wind', 'rain']]

# Show the first five rows.
print(sdf.iloc[:5])
temp RH wind rain 0 8.2 51 6.7 0.0 1 18.0 33 0.9 0.0 2 14.6 33 1.3 0.0 3 8.3 97 4.0 0.2 4 11.4 99 1.8 0.0
# Narrow the frame to the temp / RH / wind / rain columns plus `area`.
sdf = df[['temp', 'RH', 'wind', 'rain', 'area']]

# Show the first five rows, a blank separator line, then rows 140-144.
print(sdf.iloc[:5])
print('')
print(sdf.iloc[140:145])
temp RH wind rain area 0 8.2 51 6.7 0.0 0.0 1 18.0 33 0.9 0.0 0.0 2 14.6 33 1.3 0.0 0.0 3 8.3 97 4.0 0.2 0.0 4 11.4 99 1.8 0.0 0.0 temp RH wind rain area 140 21.9 39 1.8 0.0 0.47 141 23.3 31 4.5 0.0 0.55 142 21.2 51 8.9 0.0 0.61 143 16.6 53 5.4 0.0 0.71 144 23.8 32 5.4 0.0 0.77
# `sdf` is expected to be a column selection taken from `df` in an earlier
# cell. Take an explicit copy first so that overwriting `area` below is an
# unambiguous write to our own frame and does not raise pandas'
# SettingWithCopyWarning.
sdf = sdf.copy()

# Replace `area` with log(1 + area).
# NOTE: this reads and overwrites sdf['area'], so re-running the cell without
# rebuilding `sdf` applies the transform a second time.
log1p_val = np.log1p(sdf['area'])
sdf['area'] = log1p_val

# Show the first five rows, a blank separator line, then rows 140-144.
print(sdf.head())
print('')
print(sdf[140:145])
FFMC DMC DC ISI temp RH wind rain area 0 86.2 26.2 94.3 5.1 8.2 51 6.7 0.0 0.0 1 90.6 35.4 669.1 6.7 18.0 33 0.9 0.0 0.0 2 90.6 43.7 686.9 6.7 14.6 33 1.3 0.0 0.0 3 91.7 33.3 77.5 9.0 8.3 97 4.0 0.2 0.0 4 89.3 51.3 102.2 9.6 11.4 99 1.8 0.0 0.0 FFMC DMC DC ISI temp RH wind rain area 140 90.9 126.5 686.5 7.0 21.9 39 1.8 0.0 0.385262 141 95.5 99.9 513.3 13.2 23.3 31 4.5 0.0 0.438255 142 90.1 108.0 529.8 12.5 21.2 51 8.9 0.0 0.476234 143 90.0 51.3 296.3 8.7 16.6 53 5.4 0.0 0.536493 144 95.5 99.9 513.3 13.2 23.8 32 5.4 0.0 0.570980
# Select the FFMC/DMC/DC/ISI columns, the weather columns and `area`.
# .copy() makes `sdf` independent of `df`, so the column overwrite below does
# not raise pandas' SettingWithCopyWarning and `df` is left as loaded.
sdf = df[['FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain', 'area']].copy()

# Replace `area` with log(1 + area).
log1p_val = np.log1p(sdf['area'])
sdf['area'] = log1p_val

# Show the first five rows, a blank separator line, then rows 140-144.
print(sdf.head())
print('')
print(sdf[140:145])
FFMC DMC DC ISI temp RH wind rain area 0 86.2 26.2 94.3 5.1 8.2 51 6.7 0.0 0.0 1 90.6 35.4 669.1 6.7 18.0 33 0.9 0.0 0.0 2 90.6 43.7 686.9 6.7 14.6 33 1.3 0.0 0.0 3 91.7 33.3 77.5 9.0 8.3 97 4.0 0.2 0.0 4 89.3 51.3 102.2 9.6 11.4 99 1.8 0.0 0.0 FFMC DMC DC ISI temp RH wind rain area 140 90.9 126.5 686.5 7.0 21.9 39 1.8 0.0 0.385262 141 95.5 99.9 513.3 13.2 23.3 31 4.5 0.0 0.438255 142 90.1 108.0 529.8 12.5 21.2 51 8.9 0.0 0.476234 143 90.0 51.3 296.3 8.7 16.6 53 5.4 0.0 0.536493 144 95.5 99.9 513.3 13.2 23.8 32 5.4 0.0 0.570980
# Toy example: correlation coefficient between two short sequences.
a = [1, 2, 3, 4, 5]
b = [2, 5, 9, 1, 2]

# np.corrcoef returns the full 2x2 correlation matrix; compute it once and
# reuse it for both the matrix printout and the scalar extraction.
corr_matrix = np.corrcoef(a, b)
print(corr_matrix)
print('')

# The off-diagonal entry is the correlation between `a` and `b`.
corr = corr_matrix[0, 1]
print(corr)
[[ 1. -0.1933473] [-0.1933473 1. ]] -0.193347297809