#!/usr/bin/env python
# coding: utf-8

# ## A different approach
#
# The steps involved in data cleanup, with some examples, were covered in great detail in the Linear Regression Data Cleanup.
#
# In this dataset most of the cleanup is in the strings in the column names; other than that there is nothing of major significance. Changing column names is something we attempt here as well, so those lessons will be useful.
#
# In this exercise we are going to do something quite different, something more daring and more educational than the traditional data cleanup: we are going to avoid cleaning up the column names entirely. We are simply going to rename them x0, x1, ... all the way to x561, keeping the last two columns (x562 and x563) as 'subject' and 'activity'.
#
# Then we are going to use a "black box" approach to Random Forests, i.e. one where we don't really understand the variables or the model but can still do useful things. We will then look at the top 10 variables we get and compare them with the previous approach.
#
# This will allow us to compare and contrast the two methods -- one where we used a lot of domain knowledge, and this one, where we use a "black box" approach.
#
# In practice we prefer the former approach, but it is useful to see the latter in action as well.

# In[1]:

get_ipython().run_line_magic('pylab', 'inline')

import pandas as pd
from numpy import nan as NA

# load the raw Samsung activity dataset
samsungdata = pd.read_csv('../datasets/samsung/samsungdata.csv')

# In[6]:

len(samsungdata.columns)

# In[9]:

samsungdata.columns[-2:]

# In[10]:

sambak = samsungdata  # note: this binds a second name to the same DataFrame (it is not a copy), so renaming sambak's columns also renames samsungdata's

# In[12]:

sambak.columns[0]

# In[16]:

cols = list(sambak.columns)  # get the column names and coerce them to a mutable list -- a pandas Index is immutable

# In[17]:

cols[0] = 'Renamed'

# In[18]:

cols[0:5]

# In[35]:

newcols = ["x%d" % (k) for k in range(0, len(cols))]  # make a list with a new set of column names: x0, x1, ...

# In[20]:

newcols[0:5]

# In[25]:

newcols[-2:]

# In[28]:

newcols[-1]

# In[36]:

newcols[-2:] = cols[-2:]  # replace the last two items with the human-readable column names from the original dataset

# In[30]:

newcols[-2:]

# In[37]:

sambak.columns = newcols  # replace the original columns with newcols

# In[38]:

sambak.columns[0]  # check the name

# In[39]:

sambak['x0']  # check the values -- this first column is just the row index

# In[49]:

sambak2 = sambak[sambak.columns[1:-2]]  # drop the first column and the last two columns to get the independent variables

# In[54]:

sambak2['x1']  # check that it's there

# Try this:
#
#     sambak2['x0']
#
# We want this to fail and give an exception like:
#
# ---------------------------------------------------------------------------
# KeyError                                  Traceback (most recent call last)
# <ipython-input-...> in <module>()
# ----> 1 sambak2['x0'] # check
#
# /Applications/anaconda15/lib/python2.7/site-packages/pandas/core/frame.pyc in __getitem__(self, key)
#    1926         else:
#    1927             # get column
# -> 1928             return self._get_item_cache(key)
#    1929
#    1930     def _getitem_slice(self, key):
#
# /Applications/anaconda15/lib/python2.7/site-packages/pandas/core/generic.pyc in _get_item_cache(self, item)
#     568             return cache[item]
#     569         except Exception:
# --> 570             values = self._data.get(item)
#     571             res = self._box_item_values(item, values)
#     572             cache[item] = res
#
# /Applications/anaconda15/lib/python2.7/site-packages/pandas/core/internals.pyc in get(self, item)
#    1381
#    1382     def get(self, item):
# -> 1383         _, block = self._find_block(item)
#    1384         return block.get(item)
#    1385
#
# /Applications/anaconda15/lib/python2.7/site-packages/pandas/core/internals.pyc in _find_block(self, item)
#    1523
#    1524     def _find_block(self, item):
# -> 1525         self._check_have(item)
#    1526         for i, block in enumerate(self.blocks):
#    1527             if item in block:
#
# /Applications/anaconda15/lib/python2.7/site-packages/pandas/core/internals.pyc in _check_have(self, item)
#    1530     def _check_have(self, item):
#    1531         if item not in self.items:
# -> 1532             raise KeyError('no item named %s' % com.pprint_thing(item))
#    1533
#    1534     def reindex_axis(self, new_axis, method=None, axis=0, copy=True):
#
# KeyError: u'no item named x0'
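# Aside (an added sketch, not part of the original notebook): the `sambak = samsungdata` cell above binds a
# second name to the same DataFrame rather than copying it, which is why renaming sambak's columns above also
# renamed samsungdata's. A minimal illustration on a toy frame, using only standard pandas:

# In[ ]:

import pandas as pd

toy = pd.DataFrame({'a': [1, 2], 'b': [3, 4]})
alias = toy          # a second name bound to the same object
clone = toy.copy()   # an independent copy

alias.columns = ['x0', 'x1']
print(toy.columns.tolist())    # ['x0', 'x1'] -- the original was renamed too
print(clone.columns.tolist())  # ['a', 'b']   -- the copy keeps its own names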
# In[55]:

sambak2[sambak2.columns[-1]]  # check what the last column is

# In[56]:

samsungdata['subject'].value_counts()

# In[57]:

samsungdata['subject'].unique()

# In[59]:

samtest = samsungdata[samsungdata['subject'] >= 27]  # subjects 27 and above -> test set

# In[60]:

samtrain = samsungdata[samsungdata['subject'] <= 6]  # subjects 1 to 6 -> training set

# In[66]:

samval2 = samsungdata[samsungdata['subject'] < 27]

# In[68]:

samval = samval2[samval2['subject'] >= 21]  # subjects 21 to 26 -> validation set (filter on samval2's own column so the boolean mask lines up with its rows)

# In[69]:

samval['subject'].unique()

# In[70]:

samtrain['subject'].unique()

# In[71]:

samtest['subject'].unique()

# Now we are ready to create the model, then validate and test it.

# In[72]:

import randomforests as rf

# remap the text activity labels to integer codes using the helper module from the earlier exercise
samtrain = rf.remap_col(samtrain, 'activity')
samval = rf.remap_col(samval, 'activity')
samtest = rf.remap_col(samtest, 'activity')

# In[73]:

samtrain['activity'].unique()

# In[103]:

import sklearn.ensemble as sk

# note: compute_importances was removed in later scikit-learn releases, where
# feature_importances_ is always available; drop the argument if this call errors
rfc = sk.RandomForestClassifier(n_estimators=50, compute_importances=True, oob_score=True)
train_data = samtrain[samtrain.columns[1:-2]]
train_truth = samtrain['activity']
model = rfc.fit(train_data, train_truth)

# In[104]:

rfc.oob_score_  # out-of-bag estimate of the model's accuracy

# In[120]:

fi = enumerate(rfc.feature_importances_)
cols = train_data.columns  # the importances are indexed by the training columns, i.e. samtrain.columns[1:-2]
[(value, cols[i]) for (i, value) in fi if value > 0.017]

# In[124]:

fi = enumerate(rfc.feature_importances_)  # re-create the iterator -- the one above is exhausted
cols = train_data.columns
top10 = [cols[i] for (i, value) in fi if value > 0.017]

# In[125]:

top10

# So here we see a set of top 10 features, but we have absolutely no idea what they are or what they mean.
#
# We arrived at the 0.017 figure by trial and error, adjusting it until we got 10 features. The threshold here is 0.017, whereas in the earlier method it was 0.04. What does this mean? Since we are using many more features (all of them, in fact), the importance measure is spread more widely, so individual importance scores are smaller. But we can still rank them and pick the top 10. (An aside after the confusion matrix below shows how to pick the top 10 directly by sorting, without guessing a threshold.)

# In[113]:

# the pandas data frame has a spurious unnamed column in position 0, hence we start at column 1;
# we are not using the subject column, and activity (the target) is in the last column, hence we drop the last 2 columns
val_data = samval[samval.columns[1:-2]]
val_truth = samval['activity']
val_pred = rfc.predict(val_data)

test_data = samtest[samtest.columns[1:-2]]
test_truth = samtest['activity']
test_pred = rfc.predict(test_data)

# In[114]:

print("mean accuracy score for validation set = %f" % (rfc.score(val_data, val_truth)))
print("mean accuracy score for test set = %f" % (rfc.score(test_data, test_truth)))

# In[115]:

import sklearn.metrics as skm
test_cm = skm.confusion_matrix(test_truth, test_pred)

# In[116]:

import pylab as pl
pl.matshow(test_cm)
pl.title('Confusion matrix for test data')
pl.colorbar()
pl.show()
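# Aside (an added sketch, not part of the original notebook): rather than hand-tuning the 0.017 importance
# threshold until exactly 10 features survive, we can sort the importances and take the 10 largest directly.
# This assumes the `rfc` and `train_data` objects created in the cells above.

# In[ ]:

import numpy as np

order = np.argsort(rfc.feature_importances_)[::-1]          # feature indices, most important first
top10_sorted = [train_data.columns[i] for i in order[:10]]  # names of the 10 most important features
top10_sorted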
# As before, we now compute some commonly used measures of prediction "goodness".

# In[117]:

# Accuracy
print("Accuracy = %f" % (skm.accuracy_score(test_truth, test_pred)))
# Precision
print("Precision = %f" % (skm.precision_score(test_truth, test_pred)))
# Recall
print("Recall = %f" % (skm.recall_score(test_truth, test_pred)))
# F1 Score
print("F1 score = %f" % (skm.f1_score(test_truth, test_pred)))
# (note: newer scikit-learn releases require an explicit average= argument,
# e.g. average='weighted', for multiclass precision, recall and F1)

# As we somewhat expected, this model is more accurate.
#
# BUT ... we have no idea what the variables are, and we gain no knowledge or intuition from this model.
#
# HOWEVER ... we did save a lot of tedious work wrangling text columns.
#
# Perhaps we can go back and try to understand these features by mapping them back to the original dataset and extracting those columns.

# In[126]:

origindx = [int(x[1:]) for x in top10]  # strip the leading 'x' to recover each feature's column position

# In[127]:

origindx

# In[128]:

samsungdata.columns[origindx]  # still the renamed columns -- samsungdata was renamed through the sambak alias

# In[129]:

samorig = pd.read_csv('../datasets/samsung/samsungdata.csv')  # re-read the raw file to recover the original column names

# In[130]:

samorig.columns[origindx]

# So these are the column names in the raw data set. But we still have no immediate intuition about what they represent.
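# Aside (an added sketch, not part of the original notebook): the same lookup can also be done by building an
# explicit mapping from the generated names (x0, x1, ...) back to the raw column names, so any feature the
# model picks can be translated directly. This assumes `newcols`, `samorig` and `top10` from the cells above.

# In[ ]:

name_map = dict(zip(newcols, samorig.columns))  # maps each generated name to the raw column name at the same position
[name_map[x] for x in top10]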