#!/usr/bin/env python
# coding: utf-8

# # AUTONOMIO INTRODUCTION - Capability Overview
#
# 30th of August, 2017
#
# DOWNLOAD THIS NOTEBOOK

# ## About this Demo
#
# Using a regular server / laptop, this demo can be executed in roughly 10 minutes, including the hyperparameter scan. Nothing is hidden; everything actually happens with single-line commands.

# ## Installing Autonomio
#
# pip install git+https://github.com/autonomio/core-module.git

# ## Downloading this Notebook
#
# 1) If you're working from a desktop machine, click here.
#
# 2) If you're working from a server:
#
# wget link

# ## Ingesting and Wrangling a Dataset
#
# In this example we are going to ingest the Titanic dataset from the Kaggle challenge. Looking at the kernel examples for this challenge, the ingestion alone typically takes between 20 and 100 lines of code to get to an acceptable result.

# In[2]:

# first we import the data() and wrangler() tools
from autonomio.commands import data, wrangler

# next we load the data as it's provided in Kaggle
titanic = data('train.csv', 'file')

# then we prepare the data in a single command
titanic = wrangler(titanic,
                   y='Survived',
                   starts_with_col='Cabin',
                   first_fill_cols='Cabin',
                   treshold=.8)

# In the later examples, this exact dataframe will yield a >85% result on the provided test dataset.

# ## Training a Simple Neural Network
#
# We will continue with the Titanic dataset as we prepared it in the previous section. Now we just have to train a model, so we can move on to making predictions on the test dataset.

# In[211]:

from autonomio.commands import train

get_ipython().run_line_magic('matplotlib', 'inline')

train([2,9], 'Survived', titanic,
      dropout=0,
      epoch=150,
      flatten='none',
      loss='logcosh')

# At this point we're not too interested in the accuracy of the model, but given how little work we've done, 80% is not a bad start. More or less, we're working "out of the box". Before going deeper, let's go over a few simple examples.

# ## Running an LSTM Model to Predict Time Series

# In[213]:

from autonomio.commands import data, train

# loads the bitcoin price dataset from the autonomio datasets
btc = data('bitcoin_price')

# make plots visible in the notebook
get_ipython().run_line_magic('matplotlib', 'inline')

# run a train command using the LSTM model
train(btc['price'], model='lstm')

# Again, we are getting a good starting-point result, this time with zero parameter configuration. We just input the data (daily historical bitcoin prices), choose LSTM as the model we want to use, and that's it.

# ## Working with Unstructured Data
#
# One of the bigger headaches in data science is dealing with various kinds of text data. In this example we'll ingest tweets for sentiment prediction.

# In[214]:

from autonomio.commands import data, train

# loads the tweet sentiment dataset from the autonomio datasets
tweets = data('tweet_sentiment')

# remove tweets that are not negative and sample down to ~2,000 tweets
tweets = tweets[tweets.neg > 0].sample(frac=.1)

tweets.head()

# We now have a dataset with just negative tweets, where one column is text and two columns are values related to sentiment. The sentiment classification is provided by NLTK, a solution widely used in academic literature and industry alike.

# In[215]:

train('text', 'neg', tweets,
      loss='logcosh',
      dropout=0.2,
      epoch=150,
      flatten=.9,
      w_reg_values=[0.02, 0.02])

# Generally 85% accuracy is considered the upper boundary of the sentiment classification problem, as even human coders rarely agree with each other above 85%. In fact, studies have shown that even a single human coder working on sentiment classification may show up to 20% disagreement across a larger sample without frequent calibration intervals.
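# If you want to run the same sentiment example on your own tweets, a dataframe with the same 'text' and 'neg' columns can be put together with NLTK's VADER scorer. The cell below is a minimal sketch rather than part of Autonomio itself; the example tweets are made up, and the vader_lexicon resource needs to be downloaded once.

# In[ ]:

import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# nltk.download('vader_lexicon') may be needed on the first run

# a tiny dataframe in the same shape as the tweet_sentiment dataset;
# the raw tweets here are hypothetical placeholders
raw_tweets = ["the service was terrible and painfully slow",
              "absolutely loved every minute of the show"]
own_tweets = pd.DataFrame({'text': raw_tweets})

# score each tweet with VADER and keep the negative-sentiment component
sia = SentimentIntensityAnalyzer()
own_tweets['neg'] = [sia.polarity_scores(t)['neg'] for t in raw_tweets]

# the result could then be passed to train('text', 'neg', own_tweets, ...) as above
own_tweets.head()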
# ## Performing a Grid Hyperparameter Search

# In[216]:

from autonomio.commands import data, hyperscan

# loads the pima indians diabetes dataset from a file
diabetes = data('diabetes.csv', 'file', header=None)

# run a grid search covering some obvious options
result = hyperscan([0,8], 'i', diabetes,
                   epochs=150,
                   scan_mode='selective',
                   batch_sizes=[12,14],
                   layers=[2,5],
                   losses='logcosh',
                   optimizers=['adam','nadam'],
                   activations=['linear','softsign','elu'],
                   shapes=['brick','diamond'])

# In[217]:

result.head()

# In[17]:

print(result.columns.values)

# As you can see, in return we get a dataframe with the results of the hyperparameter scan. Let's take a look at the findings from this small scan.

# ## Analyzing Hyperparameter Grid Search Results
#
# One part of hyperparameter scanning (grid search) is the convenience with which the initial configuration can be made; another is how the results can be analyzed. Below are some examples of the built-in capabilities that streamline the analysis part.

# In[218]:

from autonomio.hyperstats import hyper_descriptive

# provides a groupby table based on selections
hyper_descriptive(result, ['activation','batch_size'], 'test_acc_max', 'max')

# In[219]:

# comparing any number of parameters in a grid
from autonomio.plots.paramgrid import paramgrid

paramgrid(result, 'layers')

# In[220]:

# comparing two values of a parameter side-by-side
from autonomio.plots.duaparam import duaparam

duaparam(result, 'activation', 'elu', 'softsign')

# In[21]:

# comparing four dimensions of data in one plot
from autonomio.plots.quadparam import quadparam

get_ipython().run_line_magic('matplotlib', 'inline')

quadparam(result, 'test_acc_max', 'train_acc_max', 'test_loss_max', 'train_loss_max')

# One way to understand parameters is to also see what is not working, as opposed to just what is working. It's also generally more valuable to understand how combinations of parameters work rather than single parameters in isolation.

# In[222]:

# provides a groupby table based on selections
hyper_descriptive(result, ['activation','layers','batch_size'], 'test_acc_max', 'max')

# ## Training and Saving a Model for Predictions

# ### Saving a Trained Model
#
# Probably the most common use case for deep learning is training a model, saving it, and then using it as part of some solution outside of the training environment / data. Let's go back to the Titanic example to do that.

# In[3]:

from autonomio.commands import train

train([2,9], 'Survived', titanic,
      dropout=0.1,
      epoch=500,
      layers=6,
      batch_size=12,
      flatten='none',
      loss='logcosh',
      activation='elu',
      shape='brick',
      validation_split=.1,
      save_model='titanic_raw')

# As you can see, this time we have invoked a few additional parameters in the hope of improving the model slightly, and we are also saving the model to a file. Even though the score is ~80%, getting a similar score in Kaggle typically requires 50 to 100 lines of code.

# ### Using a Saved Model for Predictions

# In[6]:

from autonomio.commands import predictor, data, wrangler

# loading the Titanic Kaggle test data from disk
titanic_test = data('test.csv', 'file')

# transforming the data in the same way as the data for the trained model
titanic_test = wrangler(titanic_test,
                        starts_with_col='Cabin',
                        first_fill_cols='Cabin',
                        treshold=.7)

# making predictions with the saved model
predictions = predictor(titanic_test, 'titanic_raw', labels='PassengerId')
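# From here, a Kaggle submission is just a matter of reshaping the predictor() output into the expected two-column CSV. The cell below is a minimal sketch; it assumes predictions is a dataframe with 'PassengerId' and 'Prediction' columns, as the scatterz() call in the next section suggests, and the 0.5 cut-off is an arbitrary choice rather than anything prescribed by Autonomio.

# In[ ]:

# turn the predicted values into the 0/1 'Survived' labels Kaggle expects;
# the 'Prediction' column name is assumed from the scatterz() example below
submission = predictions[['PassengerId', 'Prediction']].copy()
submission['Survived'] = (submission['Prediction'] > .5).astype(int)

# write the two required columns to a submission file
submission[['PassengerId', 'Survived']].to_csv('titanic_submission.csv', index=False)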
# ### Interactive Plotting for Predictions
#
# Sometimes it's useful for prediction evaluation to be able to see how specific items are ranked; for this, Autonomio provides an interactive plot.

# In[10]:

from autonomio.plots.scatterz import scatterz

# making predictions with interactive plot data output
predictions = predictor(titanic_test, 'titanic_raw',
                        labels='PassengerId',
                        interactive=True,
                        interactive_x='Fare')

# Then, using the output from the predictor, we can separately invoke the interactive plot.

# In[11]:

# invoking the interactive plot
scatterz('Fare', 'Prediction', predictions, labels='PassengerId')

# More or less everything in the deep learning workflow is as easy as that with Autonomio. Go ahead and try it with your own dataset next! :)

# ## Install Autonomio
#
# pip install git+https://github.com/autonomio/core-module.git

# # Autonomio Website
#
# # On Github