In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

BIKES¶

In [22]:

day = pd.read_csv("data/day.csv")

In [2]:

data = day.drop(["dteday", "instant", "casual", 'registered', 'cnt', 'yr'], axis=1)

In [3]:

data.columns

Out[3]:

Index(['season', 'mnth', 'holiday', 'weekday', 'workingday', 'weathersit',
       'temp', 'atemp', 'hum', 'windspeed'],
      dtype='object')

In [4]:

data_raw = data.copy()

In [5]:

data.season = data.season.map({1: "spring", 2: "summer", 3: "fall", 4: 'winter'})
data.weathersit = data.weathersit.map({1: "clear, partly cloudy", 2: 'mist, cloudy', 3: 'light snow, light rain', 4:'heavy rain, snow and fog'})
data.mnth = pd.to_datetime(data.mnth, format="%m").dt.strftime("%b")
data.weekday = pd.to_datetime(data.weekday, format="%w").dt.strftime("%a")

In [6]:

data_dummies = pd.get_dummies(data, columns=['season', 'mnth', 'weekday', 'weathersit'])

In [20]:

data_dummies.head()

Out[20]:

	workingday	temp	atemp	hum	windspeed	season_spring	...	weekday_Mon	weathersit_clear, partly cloudy	weathersit_mist, cloudy	cnt
0	0	0.344167	0.363625	0.805833	0.160446	1.0	...	1.0	0.0	1.0	985
1	0	0.363478	0.353739	0.696087	0.248539	1.0	...	1.0	0.0	1.0	801
2	1	0.196364	0.189405	0.437273	0.248309	1.0	...	1.0	1.0	0.0	1349
3	1	0.200000	0.212122	0.590435	0.160296	1.0	...	1.0	1.0	0.0	1562
4	1	0.226957	0.229270	0.436957	0.186900	1.0	...	1.0	1.0	0.0	1600

5 rows × 27 columns

In [8]:

from sklearn.cross_validation import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data_raw.values, day.cnt.values, random_state=0)

/home/andy/checkout/scikit-learn/sklearn/cross_validation.py:43: DeprecationWarning: This module has been deprecated in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [9]:

from sklearn.linear_model import RidgeCV
ridge = RidgeCV().fit(X_train, y_train)

In [10]:

from sklearn.metrics import r2_score

In [11]:

ridge.score(X_train, y_train)

Out[11]:

0.52947089808569869

In [12]:

ridge.score(X_test, y_test)

Out[12]:

0.51677323867022462

In [13]:

from sklearn.tree import DecisionTreeRegressor

In [14]:

tree = DecisionTreeRegressor(max_depth=5).fit(X_train, y_train)
print(tree.score(X_train, y_train))
print(tree.score(X_test, y_test))

0.668775389005
0.516835225432

In [15]:

from sklearn.ensemble import RandomForestRegressor
forest = RandomForestRegressor(n_estimators=500).fit(X_train, y_train)
print(forest.score(X_train, y_train))
print(forest.score(X_test, y_test))

0.947307995615
0.595926183484

In [19]:

data_raw.cnt = day.cnt
data_dummies.cnt = day.cnt

data_raw.to_csv("data/bike_day_raw.csv", index=None)
data_dummies.to_csv("data/bike_day_dummies.csv", index=None)

LOANS¶

In [6]:

data = pd.read_csv("data/loan.csv")[::23]

/home/andy/anaconda3/lib/python3.5/site-packages/IPython/core/interactiveshell.py:2723: DtypeWarning: Columns (19,55) have mixed types. Specify dtype option on import or set low_memory=False.
  interactivity=interactivity, compiler=compiler, result=result)

In [7]:

data.shape

Out[7]:

(38582, 74)

In [8]:

data.head()

Out[8]:

	id	member_id	loan_amnt	funded_amnt	funded_amnt_inv	term	int_rate	installment	grade	sub_grade	...	total_bal_il	il_util	open_rv_12m	open_rv_24m	max_bal_bc	all_util	total_rev_hi_lim	inq_fi	total_cu_tl	inq_last_12m
0	1077501	1296599	5000.0	5000.0	4975.0	36 months	10.65	162.87	B	B2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
23	1069700	1304810	10000.0	10000.0	10000.0	36 months	11.71	330.76	B	B3	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
46	1069465	1304521	5000.0	5000.0	5000.0	36 months	8.90	158.77	A	A5	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
69	1069073	1303718	15000.0	15000.0	15000.0	36 months	14.65	517.42	C	C3	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
92	1065674	1299785	8200.0	8200.0	8200.0	60 months	19.42	214.62	E	E3	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

5 rows × 74 columns

In [9]:

counts = data.notnull().sum(axis=0).sort_values(ascending=False)

In [10]:

columns = counts[:52].index

In [11]:

data = data[columns]

In [12]:

data = data.dropna()

In [13]:

data.head()

Out[13]:

	out_prncp	total_rec_late_fee	total_rec_prncp	total_pymnt_inv	total_pymnt	out_prncp_inv	initial_list_status	revol_bal	dti	addr_state	...	pub_rec	open_acc	title	revol_util	last_pymnt_d	emp_title	total_rev_hi_lim	tot_cur_bal	tot_coll_amt
42550	4145.19	15.0	3854.81	4990.03	4990.030000	4145.19	f	7203.0	15.75	CO	...	1.0	9.0	My Life Saving Loan :)	34.6	Oct-2015	PARTS MANAGER	20800.0	15949.0	0.0
42573	0.00	0.0	16000.00	16900.62	16900.617682	0.00	w	6868.0	7.39	MA	...	0.0	12.0	Debt Consolidation	10.3	Nov-2014	Senior Software Engineer	66800.0	369614.0	0.0
42596	13859.72	0.0	6140.28	11734.08	11734.080000	13859.72	w	13772.0	17.03	TX	...	0.0	14.0	Credit card refinancing	41.6	Jan-2016	Project Manager	33100.0	75993.0	3712.0
42619	0.00	0.0	4631.49	12732.64	12732.640000	0.00	w	27597.0	8.58	NY	...	3.0	19.0	Consolidation for Self-Publication	69.9	May-2015	Administrator	39500.0	181757.0	0.0
42642	0.00	0.0	30000.00	34218.71	34218.711966	0.00	f	17756.0	9.41	MN	...	0.0	12.0	Credit card refinancing	61.0	Mar-2015	Engineer/Conductor	29100.0	161166.0	0.0

5 rows × 52 columns

In [14]:

bad_statuses = ["Charged Off ", "Default", "Does not meet the credit policy. Status:Charged Off", "In Grace Period",
                "Default Receiver", "Late (16-30 days)", "Late (31-120 days)"]

In [15]:

data['bad_status'] = data.loan_status.isin(bad_statuses)

In [16]:

data = data.drop(["url", "title", "id", "emp_title", "loan_status"], axis=1)

In [17]:

data.columns

Out[17]:

Index(['out_prncp', 'total_rec_late_fee', 'total_rec_prncp', 'total_pymnt_inv',
       'total_pymnt', 'out_prncp_inv', 'initial_list_status', 'revol_bal',
       'dti', 'addr_state', 'zip_code', 'purpose', 'pymnt_plan', 'issue_d',
       'verification_status', 'annual_inc', 'home_ownership', 'emp_length',
       'sub_grade', 'grade', 'installment', 'int_rate', 'term',
       'funded_amnt_inv', 'funded_amnt', 'loan_amnt', 'member_id',
       'total_rec_int', 'recoveries', 'last_pymnt_amnt',
       'collection_recovery_fee', 'application_type', 'policy_code',
       'inq_last_6mths', 'acc_now_delinq', 'last_credit_pull_d', 'total_acc',
       'delinq_2yrs', 'earliest_cr_line', 'pub_rec', 'open_acc',
       'collections_12_mths_ex_med', 'revol_util', 'last_pymnt_d',
       'total_rev_hi_lim', 'tot_cur_bal', 'tot_coll_amt', 'bad_status'],
      dtype='object')

In [18]:

data.dtypes

Out[18]:

out_prncp                     float64
total_rec_late_fee            float64
total_rec_prncp               float64
total_pymnt_inv               float64
total_pymnt                   float64
out_prncp_inv                 float64
initial_list_status            object
revol_bal                     float64
dti                           float64
addr_state                     object
zip_code                       object
purpose                        object
pymnt_plan                     object
issue_d                        object
verification_status            object
annual_inc                    float64
home_ownership                 object
emp_length                     object
sub_grade                      object
grade                          object
installment                   float64
int_rate                      float64
term                           object
funded_amnt_inv               float64
funded_amnt                   float64
loan_amnt                     float64
member_id                       int64
total_rec_int                 float64
recoveries                    float64
last_pymnt_amnt               float64
collection_recovery_fee       float64
application_type               object
policy_code                   float64
inq_last_6mths                float64
acc_now_delinq                float64
last_credit_pull_d             object
total_acc                     float64
delinq_2yrs                   float64
earliest_cr_line               object
pub_rec                       float64
open_acc                      float64
collections_12_mths_ex_med    float64
revol_util                    float64
last_pymnt_d                   object
total_rev_hi_lim              float64
tot_cur_bal                   float64
tot_coll_amt                  float64
bad_status                       bool
dtype: object

In [70]:

data.purpose.value_counts()

Out[70]:

debt_consolidation    489197
credit_card           195527
home_improvement       46956
other                  36707
major_purchase         14143
medical                 7457
small_business          7451
car                     6666
moving                  4549
vacation                4140
house                   3001
wedding                  876
renewable_energy         432
educational                1
Name: purpose, dtype: int64

In [37]:

float_columns = data.dtypes[data.dtypes == "float64"].index

In [38]:

data_float = data[float_columns]

In [39]:

data_float.shape

Out[39]:

(32719, 30)

In [40]:

X = data_float.values
y = data.bad_status.values

In [68]:

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.972329760789
0.975794621027

In [69]:

lr.coef_.shape

Out[69]:

(1, 28)

In [71]:

plt.figure(figsize=(8, 8))
plt.barh(range(X.shape[1]), lr.coef_.ravel())
plt.yticks(np.arange(X.shape[1]) + .5, data_float_hard.columns.tolist(), va="center");

In [72]:

data_float_hard = data_float.drop(['total_rec_late_fee', "revol_util"], axis=1)

In [67]:

X = data_float_hard.values

SHELTER ANIMALS¶

In [114]:

train = pd.read_csv("data/shelter_train.csv")
test = pd.read_csv("data/shelter_test.csv")

In [115]:

train.head()

Out[115]:

	AnimalID	Name	DateTime	OutcomeType	OutcomeSubtype	AnimalType	SexuponOutcome	AgeuponOutcome	Breed	Color
0	A671945	Hambone	2014-02-12 18:22:00	Return_to_owner	NaN	Dog	Neutered Male	1 year	Shetland Sheepdog Mix	Brown/White
1	A656520	Emily	2013-10-13 12:44:00	Euthanasia	Suffering	Cat	Spayed Female	1 year	Domestic Shorthair Mix	Cream Tabby
2	A686464	Pearce	2015-01-31 12:28:00	Adoption	Foster	Dog	Neutered Male	2 years	Pit Bull Mix	Blue/White
3	A683430	NaN	2014-07-11 19:09:00	Transfer	Partner	Cat	Intact Male	3 weeks	Domestic Shorthair Mix	Blue Cream
4	A667013	NaN	2013-11-15 12:52:00	Transfer	Partner	Dog	Neutered Male	2 years	Lhasa Apso/Miniature Poodle	Tan

Bank marketing¶

In [116]:

data = pd.read_csv("data/bank-additional/bank-additional-full.csv", sep=";")

In [117]:

data.head()

Out[117]:

	age	job	marital	education	default	housing	loan	contact	month	day_of_week	...	campaign	pdays	poutcome	emp.var.rate	cons.price.idx	cons.conf.idx	euribor3m	nr.employed	y
0	56	housemaid	married	basic.4y	no	no	no	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no
1	57	services	married	high.school	unknown	no	no	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no
2	37	services	married	high.school	no	yes	no	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no
3	40	admin.	married	basic.6y	no	no	no	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no
4	56	services	married	high.school	no	no	yes	telephone	may	mon	...	1	999	nonexistent	1.1	93.994	-36.4	4.857	5191.0	no

5 rows × 21 columns

In [118]:

data.job.value_counts()

Out[118]:

admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64

In [119]:

data.columns

Out[119]:

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [120]:

data.dtypes

Out[120]:

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
duration            int64
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [121]:

target = data.y
data = data.drop("y", axis=1)
bla = pd.get_dummies(data)

In [122]:

bla.columns

Out[122]:

Index(['age', 'duration', 'campaign', 'pdays', 'previous', 'emp.var.rate',
       'cons.price.idx', 'cons.conf.idx', 'euribor3m', 'nr.employed',
       'job_admin.', 'job_blue-collar', 'job_entrepreneur', 'job_housemaid',
       'job_management', 'job_retired', 'job_self-employed', 'job_services',
       'job_student', 'job_technician', 'job_unemployed', 'job_unknown',
       'marital_divorced', 'marital_married', 'marital_single',
       'marital_unknown', 'education_basic.4y', 'education_basic.6y',
       'education_basic.9y', 'education_high.school', 'education_illiterate',
       'education_professional.course', 'education_university.degree',
       'education_unknown', 'default_no', 'default_unknown', 'default_yes',
       'housing_no', 'housing_unknown', 'housing_yes', 'loan_no',
       'loan_unknown', 'loan_yes', 'contact_cellular', 'contact_telephone',
       'month_apr', 'month_aug', 'month_dec', 'month_jul', 'month_jun',
       'month_mar', 'month_may', 'month_nov', 'month_oct', 'month_sep',
       'day_of_week_fri', 'day_of_week_mon', 'day_of_week_thu',
       'day_of_week_tue', 'day_of_week_wed', 'poutcome_failure',
       'poutcome_nonexistent', 'poutcome_success'],
      dtype='object')

In [123]:

X = bla.values
y = target.values

In [124]:

from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
lr = LogisticRegression()
lr.fit(X_train, y_train)
print(lr.score(X_train, y_train))
print(lr.score(X_test, y_test))

0.908905506458
0.91201320773

In [130]:

plt.figure(figsize=(10, 12))
plt.barh(range(X.shape[1]), lr.coef_.ravel())
plt.yticks(np.arange(X.shape[1]) + .5, bla.columns.tolist(), va="center");

In [131]:

from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
rf.score(X_train, y_train)

Out[131]:

0.99993525622349555

In [132]:

rf.score(X_test, y_test)

Out[132]:

0.91317859570748761

In [135]:

bla['target'] = target
bla.to_csv("data/bank-campaign.csv", index=None)

In [ ]: