#!/usr/bin/env python
# coding: utf-8

# Below, I implement a quick and dirty first ML pipeline: import a subset of the training data, split it into training/test sets, choose some features, and run a random forest on it.

# In[1]:

get_ipython().system(' ls')

# Import a subset of the training data for the first model pipeline:

# In[72]:

# Import libraries:
import numpy as np
import pandas as pd
#import xgboost as xgb
import time

# Read only the first rows of each file to keep this first pass fast:
train = pd.read_csv("train.csv", nrows=10000)
test = pd.read_csv("test.csv", nrows=1000)

# In[73]:

train.head()

# In[74]:

test.head()

# ___
# Plot histograms of the numerical features:

# In[75]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
plt.style.use('ggplot')
train.hist(figsize=(15, 15));

# ___
# Drop non-numerical features in order to run a random forest:

# In[76]:

# Collect the names of all object-dtype (non-numerical) columns, plus the
# target column itself, so they can be dropped from the feature matrix:
drop_list = []
for train_name, train_series in train.items():
    print(train_name, train_series.dtype)
    if train_series.dtype == 'object':
        drop_list.append(train_name)
drop_list.append('hotel_cluster')
print(drop_list)

# ___
# Split the data into predictor variables (X) and the target variable (y):

# In[77]:

# Fill missing values with a sentinel value so the forest can handle them:
X = train.drop(drop_list, axis=1).fillna(-1)
y = train['hotel_cluster']

# Split into train & test sets, then fit a random forest on the train set:

# In[78]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# See here: http://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
from sklearn.ensemble import RandomForestClassifier

# Build a forest and score it on the held-out test set:
forest = RandomForestClassifier(n_estimators=10)
forest.fit(X_train, y_train)
score = forest.score(X_test, y_test)

# Print precision:

# In[79]:

from sklearn.metrics import precision_score

precision_score(y_test, forest.predict(X_test), average="micro")

# Note: this is not quite the metric of model performance we need to use for the competition, but it's close. I think the two are equal when we make only one prediction for each row. **Discuss**. This score would put us in roughly the top 50% of the current leaderboard.
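
# ___
# As a check on the note above: the competition leaderboard is scored, as far as I recall, with Mean Average Precision at 5 (MAP@5). Below is a minimal sketch of that metric; `apk` and `mapk` are my own illustrative helpers, not library functions, and I assume each row has exactly one true hotel_cluster. With a single prediction per row, MAP@5 reduces to plain accuracy, which matches the micro-averaged precision printed above — supporting the equivalence claimed in the note.

# In[ ]:

def apk(actual, predicted, k=5):
    """Average precision at k for one row: `actual` is the single true
    hotel_cluster, `predicted` is a ranked list of up to k guesses."""
    for i, p in enumerate(predicted[:k]):
        if p == actual:
            return 1.0 / (i + 1)  # credit decays with the rank of the hit
    return 0.0

def mapk(actuals, predictions, k=5):
    """Mean of apk over all rows."""
    return np.mean([apk(a, p, k) for a, p in zip(actuals, predictions)])

# Rank the 5 most probable clusters per row using the fitted forest:
proba = forest.predict_proba(X_test)
top5 = forest.classes_[np.argsort(proba, axis=1)[:, ::-1][:, :5]]
print(mapk(y_test.values, top5, k=5))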