cd /home/anshul/
/home/anshul
import pandas as pd
import numpy as np
from sklearn import tree
from sklearn.externals.six import StringIO
import pydot
# Read the weather observations from the CSV file in the current working
# directory into a DataFrame.
weather = pd.read_csv(filepath_or_buffer='weather.csv')
# Preview the first ten rows to get a feel for the columns.
weather.head(n=10)
Unnamed: 0 | Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | ... | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RISK_MM | RainTomorrow | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 2007-11-01 | Canberra | 8.0 | 24.3 | 0.0 | 3.4 | 6.3 | NW | 30 | ... | 29 | 1019.7 | 1015.0 | 7 | 7 | 14.4 | 23.6 | No | 3.6 | Yes |
1 | 2 | 2007-11-02 | Canberra | 14.0 | 26.9 | 3.6 | 4.4 | 9.7 | ENE | 39 | ... | 36 | 1012.4 | 1008.4 | 5 | 3 | 17.5 | 25.7 | Yes | 3.6 | Yes |
2 | 3 | 2007-11-03 | Canberra | 13.7 | 23.4 | 3.6 | 5.8 | 3.3 | NW | 85 | ... | 69 | 1009.5 | 1007.2 | 8 | 7 | 15.4 | 20.2 | Yes | 39.8 | Yes |
3 | 4 | 2007-11-04 | Canberra | 13.3 | 15.5 | 39.8 | 7.2 | 9.1 | NW | 54 | ... | 56 | 1005.5 | 1007.0 | 2 | 7 | 13.5 | 14.1 | Yes | 2.8 | Yes |
4 | 5 | 2007-11-05 | Canberra | 7.6 | 16.1 | 2.8 | 5.6 | 10.6 | SSE | 50 | ... | 49 | 1018.3 | 1018.5 | 7 | 7 | 11.1 | 15.4 | Yes | 0.0 | No |
5 | 6 | 2007-11-06 | Canberra | 6.2 | 16.9 | 0.0 | 5.8 | 8.2 | SE | 44 | ... | 57 | 1023.8 | 1021.7 | 7 | 5 | 10.9 | 14.8 | No | 0.2 | No |
6 | 7 | 2007-11-07 | Canberra | 6.1 | 18.2 | 0.2 | 4.2 | 8.4 | SE | 43 | ... | 47 | 1024.6 | 1022.2 | 4 | 6 | 12.4 | 17.3 | No | 0.0 | No |
7 | 8 | 2007-11-08 | Canberra | 8.3 | 17.0 | 0.0 | 5.6 | 4.6 | E | 41 | ... | 57 | 1026.2 | 1024.2 | 6 | 7 | 12.1 | 15.5 | No | 0.0 | No |
8 | 9 | 2007-11-09 | Canberra | 8.8 | 19.5 | 0.0 | 4.0 | 4.1 | S | 48 | ... | 48 | 1026.1 | 1022.7 | 7 | 7 | 14.1 | 18.9 | No | 16.2 | Yes |
9 | 10 | 2007-11-10 | Canberra | 8.4 | 22.8 | 16.2 | 5.4 | 7.7 | E | 31 | ... | 32 | 1024.1 | 1020.7 | 7 | 1 | 13.3 | 21.7 | Yes | 0.0 | No |
10 rows × 25 columns
# Inspect the dtype pandas inferred for every column; the `object` columns
# are the non-numeric ones that need encoding before they can be fed to a model.
weather.dtypes
Unnamed: 0 int64 Date object Location object MinTemp float64 MaxTemp float64 Rainfall float64 Evaporation float64 Sunshine float64 WindGustDir object WindGustSpeed float64 WindDir9am object WindDir3pm object WindSpeed9am float64 WindSpeed3pm int64 Humidity9am int64 Humidity3pm int64 Pressure9am float64 Pressure3pm float64 Cloud9am int64 Cloud3pm int64 Temp9am float64 Temp3pm float64 RainToday object RISK_MM float64 RainTomorrow object dtype: object
**Now we need to convert the categorical features to integer values. To do this, we define the encoding function below.**
def encode_target(df, target_columns):
    """Return a copy of df with the given columns integer-encoded.

    Every distinct value of a listed column is assigned an integer code in
    order of first appearance (0, 1, 2, ...), and the column is overwritten
    with those codes in the returned copy. A missing value (NaN) counts as a
    distinct value and receives its own code.

    Args
    ----
    df -- pandas DataFrame; it is left unmodified.
    target_columns -- iterable of column names to map to int.

    Returns
    -------
    df_mod -- modified copy of df with the listed columns replaced by
              their integer codes.
    """
    df_mod = df.copy()
    for target_column in target_columns:
        column = df_mod[target_column]
        # unique() preserves first-appearance order, so the codes are stable
        # for a given input ordering.
        map_to_int = {name: n for n, name in enumerate(column.unique())}
        # Series.map looks every value up in the mapping and yields an
        # integer column directly. Series.replace only produced integers via
        # pandas' deprecated implicit downcast of object columns; without it
        # the column stays `object` and is no longer treated as numeric.
        df_mod[target_column] = column.map(map_to_int)
    return df_mod
# Integer-encode every string-valued column, including the label column.
weather_new = encode_target(
    weather,
    ["RainToday", "Location", "WindGustDir", "WindDir9am", "WindDir3pm",
     "RainTomorrow"],
)

# Candidate features are all columns after the first three (per the preview
# above: the unnamed row counter, Date and Location).
features = list(weather_new.columns[3:])
# NOTE(review): in the preview, RISK_MM on one row equals Rainfall on the next
# row, so it looks like the next-day rainfall amount; keeping it would leak
# the label into the features -- confirm against the dataset description.
features.remove("RISK_MM")
# The last remaining column is used as the label to predict.
target = features.pop()
y = weather_new[target]
X = weather_new[features]

# Keep only numeric columns that contain no missing values. select_dtypes is
# the public equivalent of the private DataFrame._get_numeric_data().
good_columns = X.select_dtypes(include=["number", "bool"]).dropna(axis=1)
features = list(good_columns.columns)
# Function-call form prints the same text on Python 2 and also runs on Python 3.
print(features)
['MinTemp', 'MaxTemp', 'Rainfall', 'Evaporation', 'WindGustDir', 'WindDir9am', 'WindDir3pm', 'WindSpeed3pm', 'Humidity9am', 'Humidity3pm', 'Pressure9am', 'Pressure3pm', 'Cloud9am', 'Cloud3pm', 'Temp9am', 'Temp3pm', 'RainToday']
# Fit a decision tree on the selected feature columns. min_samples_split=20
# stops the tree from splitting nodes with fewer than 20 samples, and the
# fixed random_state makes the fitted tree reproducible.
dt = tree.DecisionTreeClassifier(min_samples_split=20, random_state=99)
dt = dt.fit(good_columns, y)

# Write the fitted tree as Graphviz DOT text into an in-memory buffer.
dot_data = StringIO()
tree.export_graphviz(dt, feature_names=features, out_file=dot_data)

# Older pydot releases return a single graph object here, while newer ones
# return a list of graphs; unwrap the list so write_pdf works with both.
graph = pydot.graph_from_dot_data(dot_data.getvalue())
if isinstance(graph, (list, tuple)):
    graph = graph[0]
graph.write_pdf("weather.pdf")
# Display a previously saved image inline in the notebook.
# NOTE(review): the absolute path points into the author's home directory, so
# this cell only works on that machine; presumably the file is a capture of
# the rendered tree (weather.pdf) -- confirm, and prefer a path relative to
# the notebook.
from IPython.display import Image
Image(filename='/home/anshul/Pictures/Screenshot from 2015-12-18 18:02:55.png')