In [ ]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 16, 9

Data analytics and machine learning with Python

I - Acquiring data

A simple HTTP request

In [ ]:
import requests

print(requests.get("http://example.com").text)
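
The response object also exposes the status code and headers; a quick look:

In [ ]:
response = requests.get("http://example.com")
print(response.status_code)              # 200 means the request succeeded
print(response.headers['Content-Type'])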

Communicating with APIs

In [ ]:
response = requests.get("https://www.googleapis.com/books/v1/volumes", params={"q":"machine learning"})
raw_data = response.json()
titles = [item['volumeInfo']['title'] for item in raw_data['items']]
titles
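
APIs can return errors or partial records, so it pays to be defensive; a small sketch using raise_for_status() and dict.get() with a fallback:

In [ ]:
response.raise_for_status()  # raises an HTTPError for 4xx/5xx responses
# Not every volume lists its authors, so fall back to a default
[item['volumeInfo'].get('authors', ['Unknown']) for item in raw_data['items']]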

Parsing websites

In [ ]:
import lxml.html

page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple")
# ^ This is probably illegal. Blocket, please don't sue me!
items_data = []
for el in page.getroot().find_class("item_row"):
    links = el.find_class("item_link")
    images = el.find_class("item_image")
    prices = el.find_class("list_price")
    if links and images and prices and prices[0].text:
        items_data.append({"name": links[0].text,
                           "image": images[0].attrib['src'],
                           "price": int(prices[0].text.split(":")[0].replace(" ", ""))})
items_data
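
Records like these are plain dicts, so they drop straight into a pandas DataFrame (introduced next):

In [ ]:
import pandas
pandas.DataFrame(items_data)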

Reading local files (CSV/JSON)

In [ ]:
import pandas

df = pandas.read_csv('sample.csv')
In [ ]:
# Display the DataFrame
df
In [ ]:
# DataFrame's columns
df.columns
In [ ]:
# Values of a given column
df.Model
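
Reading JSON is just as direct; a sketch assuming a records-style file (sample.json is hypothetical here):

In [ ]:
# 'sample.json' is a hypothetical file containing a list of records
pandas.read_json('sample.json')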

Analyzing the dataframe

In [ ]:
# Any missing values?
df['Price']
In [ ]:
df['Description']
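
A quick way to answer the question is to count missing values per column:

In [ ]:
df.isnull().sum()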
In [ ]:
# Fill missing descriptions with a placeholder,
# and missing prices by linear interpolation
df['Description'] = df['Description'].fillna("No description is available.")
df['Price'] = df['Price'].interpolate()

df

II - Exploring data

In [ ]:
import matplotlib.pyplot as plt

df = pandas.read_csv('sample2.csv')

df
In [ ]:
# This table has 3 columns: Office, Year, Sales
df.columns
In [ ]:
# It's really easy to query data with Pandas:
df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]
In [ ]:
# It's also easy to do aggregations...
aggregated_stockholm_sales = df[df.Office == 'Stockholm'].groupby('Year')[['Sales']].sum()
aggregated_stockholm_sales
In [ ]:
aggregated_ny_sales = df[df.Office == 'New York'].groupby('Year')[['Sales']].sum()
# ... and generate plots
aggregated_stockholm_sales.plot(kind='bar')
aggregated_ny_sales.plot(kind='bar', color='g')
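
To compare both offices in a single chart, one option is to group on both columns and unstack (a sketch, assuming the same three columns):

In [ ]:
df.groupby(['Year', 'Office'])['Sales'].sum().unstack().plot(kind='bar')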

III - Machine learning

Feature extraction

In [ ]:
from sklearn import feature_extraction

Extracting features from text

In [ ]:
corpus = ['Cats? I love cats!',
          'I love dogs.',
          'I hate cats :(',
          'I love trains',
          ]

tfidf = feature_extraction.text.TfidfVectorizer()

print(tfidf.fit_transform(corpus).toarray())
print(tfidf.get_feature_names_out())  # get_feature_names() in older scikit-learn
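
Once fitted, the vectorizer can project unseen documents onto the same vocabulary:

In [ ]:
print(tfidf.transform(['I love cats and dogs']).toarray())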

Dict vectorizer

In [ ]:
import json


data = [json.loads("""{"weight": 194.0, "sex": "female", "student": true}"""),
        {"weight": 60., "sex": 'female', "student": True},
        {"weight": 80.1, "sex": 'male', "student": False},
        {"weight": 65.3, "sex": 'male', "student": True},
        {"weight": 58.5, "sex": 'female', "student": False}]

vectorizer = feature_extraction.DictVectorizer(sparse=False)

vectors = vectorizer.fit_transform(data)
print(vectors)
print(vectorizer.get_feature_names_out())  # get_feature_names() in older scikit-learn

Pre-processing

Scaling
In [ ]:
from sklearn import preprocessing

data = [[10., 2345., 0., 2.],
        [3., -3490., 0.1, 1.99],
        [13., 3903., -0.2, 2.11]]

# normalize() rescales each sample (row) to unit norm
preprocessing.normalize(data)
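
For the more common per-feature standardization (zero mean, unit variance), StandardScaler is the usual tool:

In [ ]:
scaler = preprocessing.StandardScaler()
scaler.fit_transform(data)
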
Dimensionality reduction
In [ ]:
from sklearn import decomposition

data = [[0.3, 0.2, 0.4,  0.32],
        [0.3, 0.5, 1.0, 0.19],
        [0.3, -0.4, -0.8, 0.22]]

pca = decomposition.PCA()
print(pca.fit_transform(data))
print(pca.explained_variance_ratio_)
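
explained_variance_ratio_ shows how much variance each component captures; passing n_components keeps only the strongest directions:

In [ ]:
pca = decomposition.PCA(n_components=2)
print(pca.fit_transform(data))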

Machine learning models

Classification (SVM)

In [ ]:
from sklearn import datasets
from sklearn import svm
In [ ]:
iris = datasets.load_iris()

X = iris.data[:, :2]
y = iris.target

plt.scatter(X[:, 0], X[:, 1], color=['rgb'[v] for v in y])

to_predict = np.array([[4.35, 3.1], [5.61, 2.42]])
plt.scatter(to_predict[:, 0], to_predict[:, 1], color='purple')
In [ ]:
# Training the model
clf = svm.SVC(kernel='rbf')
clf.fit(X, y)

# Doing predictions
print(clf.predict(to_predict))
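
A quick sanity check is the mean accuracy on the training data (optimistic by construction; see cross-validation below):

In [ ]:
clf.score(X, y)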

Regression (linear regression)

In [ ]:
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

def f(x):
    return x + np.random.random() * 3.

X = np.arange(0, 5, 0.5)
X = X.reshape((len(X), 1))
y = list(map(f, X))

clf = linear_model.LinearRegression()
clf.fit(X, y)
In [ ]:
new_X = np.arange(0.2, 5.2, 0.3)
new_X = new_X.reshape((len(new_X), 1))
new_y = clf.predict(new_X)

plt.scatter(X, y, color='g', label='Training data')

plt.plot(new_X, new_y, '.-', label='Predicted')
plt.legend()
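
The fitted model exposes the learned line directly:

In [ ]:
# Slope and intercept of the fitted line
print(clf.coef_, clf.intercept_)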

Clustering (DBSCAN)

In [ ]:
from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs  # samples_generator was removed in newer scikit-learn
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.3,
                            random_state=0)
plt.scatter(X[:, 0], X[:, 1])
In [ ]:
# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
db.labels_
In [ ]:
# Noise points get the label -1, which negative indexing maps to 'w' (white)
plt.scatter(X[:, 0], X[:, 1], c=['rgbw'[v] for v in db.labels_])
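
DBSCAN discovers the number of clusters on its own; counting them (excluding noise):

In [ ]:
n_clusters = len(set(db.labels_)) - (1 if -1 in db.labels_ else 0)
n_clusters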

Cross-validation

In [ ]:
from sklearn import svm, model_selection, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

model = svm.SVC()
# sklearn.cross_validation was renamed sklearn.model_selection in scikit-learn 0.18+
print(model_selection.cross_val_score(model, X, y, scoring='precision_weighted'))
print(model_selection.cross_val_score(model, X, y, scoring='neg_mean_squared_error'))
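
Another common pattern is a single train/test split; a minimal sketch with model_selection:

In [ ]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.3, random_state=0)
model.fit(X_train, y_train)
model.score(X_test, y_test)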

A more complex Machine Learning pipeline: "what's cooking?"

This is a basic solution I wrote for the Kaggle competition "What's cooking?", where the goal is to predict which type of cuisine a meal belongs to based on its list of ingredients.

You'll need more advanced features and methods to win a Kaggle competition, but this already gets you 90% of the way there.

In [ ]:
from collections import Counter
import json

import pandas as pd
import scipy.sparse
import sklearn.pipeline
import sklearn.model_selection
import sklearn.feature_extraction
import sklearn.naive_bayes

def open_dataset(path):
    with open(path) as file:
        data = json.load(file)
    df = pd.DataFrame(data).set_index('id')
    return df

df = open_dataset('train.json')

pipeline = sklearn.pipeline.make_pipeline(
    sklearn.feature_extraction.DictVectorizer(),
    sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))
pipeline_bis = sklearn.pipeline.make_pipeline(
    sklearn.feature_extraction.DictVectorizer(),
    sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))

def map_term_count(ingredients):
    return Counter(sum((i.split(' ') for i in ingredients), []))

X = pipeline.fit_transform(df.ingredients.apply(Counter))
X = scipy.sparse.hstack([X, pipeline_bis.fit_transform(df.ingredients.apply(map_term_count))])
y = df.cuisine.values

model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)

# Cross-validation
score = sklearn.model_selection.cross_val_score(model, X, y, cv=2)
print(score)

# Running on the test dataset
t_df = open_dataset('test.json')
X_test = pipeline.transform(t_df.ingredients.apply(Counter))
X_test = scipy.sparse.hstack([X_test, pipeline_bis.transform(t_df.ingredients.apply(map_term_count))])

model.fit(X, y)

predictions = model.predict(X_test)
result_df = pd.DataFrame(index=t_df.index)
result_df['cuisine'] = pd.Series(predictions, index=result_df.index)

result_df['ingredients'] = t_df['ingredients']
result_df
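
For an actual submission, Kaggle expects a CSV with the id and predicted cuisine (the filename here is arbitrary):

In [ ]:
result_df[['cuisine']].to_csv('submission.csv')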

Thanks for following! I hope you learned a thing or two :-)

Feel free to ask any questions, or contact me at kachkach.com / @halflings