#!/usr/bin/env python
# coding: utf-8

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 16, 9

# # Data analytics and machine learning with Python

# # I - Acquiring data

# ### A simple HTTP request

# In[ ]:

import requests

print(requests.get("http://example.com").text)

# ### Communicating with APIs

# In[ ]:

response = requests.get("https://www.googleapis.com/books/v1/volumes", params={"q": "machine learning"})
raw_data = response.json()
titles = [item['volumeInfo']['title'] for item in raw_data['items']]
titles

# ### Parsing websites

# In[ ]:

import lxml.html

page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple")
# ^ This is probably illegal. Blocket, please don't sue me!

items_data = []
for el in page.getroot().find_class("item_row"):
    links = el.find_class("item_link")
    images = el.find_class("item_image")
    prices = el.find_class("list_price")
    if links and images and prices and prices[0].text:
        items_data.append({"name": links[0].text,
                           "image": images[0].attrib['src'],
                           "price": int(prices[0].text.split(":")[0].replace(" ", ""))})
items_data

# ### Reading local files (CSV/JSON)

# In[ ]:

import pandas

df = pandas.read_csv('sample.csv')

# In[ ]:

# Display the DataFrame
df

# In[ ]:

# DataFrame's columns
df.columns

# In[ ]:

# Values of a given column
df.Model

# # Analyzing the DataFrame

# In[ ]:

# Any missing values?
df['Price']

# In[ ]:

df['Description']

# In[ ]:

# Fill missing descriptions with a placeholder, and missing prices by a linear interpolation
df['Description'] = df['Description'].fillna("No description is available.")
df['Price'] = df['Price'].interpolate()
df

# # II - Exploring data

# In[ ]:

import matplotlib.pyplot as plt

df = pandas.read_csv('sample2.csv')
df

# In[ ]:

# This table has 3 columns: Office, Year, Sales
df.columns

# In[ ]:

# It's really easy to query data with Pandas:
df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]

# In[ ]:

# It's also easy to do aggregations...
aggregated_stockholm_sales = df[df.Office == 'Stockholm'].groupby('Year').sum()
aggregated_stockholm_sales

# In[ ]:

aggregated_ny_sales = df[df.Office == 'New York'].groupby('Year').sum()

# ... and generate plots
aggregated_stockholm_sales.plot(kind='bar')
aggregated_ny_sales.plot(kind='bar', color='g')
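# The two calls above draw Stockholm and New York on separate figures. As a minimal sketch (assuming `sample2.csv` keeps the same `Office`, `Year` and `Sales` columns shown above), a pivot table puts both offices side by side on a single chart:

# In[ ]:

# Sum sales per (Year, Office) pair and plot both offices on one grouped bar chart.
pivoted_sales = df.pivot_table(index='Year', columns='Office', values='Sales', aggfunc='sum')
pivoted_sales.plot(kind='bar')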
# # Machine learning

# ## Feature extraction

# In[ ]:

from sklearn import feature_extraction

# ### Extracting features from text

# In[ ]:

corpus = ['Cats? I love cats!',
          'I love dogs.',
          'I hate cats :(',
          'I love trains',
          ]

tfidf = feature_extraction.text.TfidfVectorizer()
print(tfidf.fit_transform(corpus).toarray())
print(tfidf.get_feature_names_out())

# ### Dict vectorizer

# In[ ]:

import json

data = [json.loads("""{"weight": 194.0, "sex": "female", "student": true}"""),
        {"weight": 60., "sex": 'female', "student": True},
        {"weight": 80.1, "sex": 'male', "student": False},
        {"weight": 65.3, "sex": 'male', "student": True},
        {"weight": 58.5, "sex": 'female', "student": False}]

vectorizer = feature_extraction.DictVectorizer(sparse=False)
vectors = vectorizer.fit_transform(data)
print(vectors)
print(vectorizer.get_feature_names_out())

# ### Pre-processing

# ##### Scaling

# In[ ]:

from sklearn import preprocessing

data = [[10., 2345., 0., 2.],
        [3., -3490., 0.1, 1.99],
        [13., 3903., -0.2, 2.11]]

# Rescale each sample (row) to unit norm
preprocessing.normalize(data)

# ##### Dimensionality reduction

# In[ ]:

from sklearn import decomposition

data = [[0.3, 0.2, 0.4, 0.32],
        [0.3, 0.5, 1.0, 0.19],
        [0.3, -0.4, -0.8, 0.22]]

# Project the data onto its principal components
pca = decomposition.PCA()
print(pca.fit_transform(data))
print(pca.explained_variance_ratio_)

# # Machine learning models

# ## Classification (SVM)

# In[ ]:

from sklearn import datasets
from sklearn import svm

# In[ ]:

iris = datasets.load_iris()

# Keep only the first two features so the data can be plotted in 2D
X = iris.data[:, :2]
y = iris.target

plt.scatter(X[:, 0], X[:, 1], c=['rgb'[v] for v in y])

to_predict = np.array([[4.35, 3.1], [5.61, 2.42]])
plt.scatter(to_predict[:, 0], to_predict[:, 1], color='purple')

# In[ ]:

# Training the model
clf = svm.SVC(kernel='rbf')
clf.fit(X, y)

# Doing predictions
print(clf.predict(to_predict))

# ## Regression (linear regression)

# In[ ]:

import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt


def f(x):
    return x + np.random.random() * 3.

X = np.arange(0, 5, 0.5)
X = X.reshape((len(X), 1))
y = list(map(f, X))

clf = linear_model.LinearRegression()
clf.fit(X, y)

# In[ ]:

new_X = np.arange(0.2, 5.2, 0.3)
new_X = new_X.reshape((len(new_X), 1))
new_y = clf.predict(new_X)

plt.scatter(X, y, color='g', label='Training data')
plt.plot(new_X, new_y, '.-', label='Predicted')
plt.legend()

# ## Clustering (DBSCAN)

# In[ ]:

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.3, random_state=0)

plt.scatter(X[:, 0], X[:, 1])

# In[ ]:

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
db.labels_

# In[ ]:

import matplotlib.pyplot as plt

# Noise points are labeled -1 and drawn in white
plt.scatter(X[:, 0], X[:, 1], c=['rgbw'[v] for v in db.labels_])

# ## Cross-validation

# In[ ]:

from sklearn import svm, model_selection, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

model = svm.SVC()
print(model_selection.cross_val_score(model, X, y, scoring='precision_weighted'))
print(model_selection.cross_val_score(model, X, y, scoring='neg_mean_squared_error'))

# # A more complex machine learning pipeline: "What's cooking?"

# This is a basic solution I wrote for the Kaggle competition "What's cooking?", where the goal is to predict which type of cuisine a meal belongs to, based on its list of ingredients.
# 
# You'll need more advanced features and methods to win a Kaggle competition, but this already gets you 90% of the way there.
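# Before diving into the full pipeline, here is a rough sketch of the data it expects. The record below is made up for illustration (it is not taken from the actual Kaggle files): each entry in `train.json` should carry an `id`, a `cuisine` label and a list of `ingredients`, while `test.json` has the same shape minus `cuisine`.

# In[ ]:

from collections import Counter

# Hypothetical training record, shown only to illustrate the expected schema.
sample_record = {"id": 123,
                 "cuisine": "greek",
                 "ingredients": ["romaine lettuce", "black olives", "feta cheese"]}

# Counting ingredients produces the dict-of-counts that the DictVectorizer below consumes.
Counter(sample_record["ingredients"])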
# In[ ]:

from collections import Counter
import json

import pandas as pd
import scipy.sparse
import sklearn.pipeline
import sklearn.model_selection
import sklearn.feature_extraction
import sklearn.naive_bayes


def open_dataset(path):
    with open(path) as f:
        data = json.load(f)
    df = pd.DataFrame(data).set_index('id')
    return df

df = open_dataset('train.json')

# One pipeline vectorizes whole ingredients, the other individual terms (words)
pipeline = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(),
                                          sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))
pipeline_bis = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(),
                                              sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))


def map_term_count(ingredients):
    # Split each ingredient into words and count the occurrences of every term
    return Counter(sum((i.split(' ') for i in ingredients), []))

X = pipeline.fit_transform(df.ingredients.apply(Counter))
X = scipy.sparse.hstack([X, pipeline_bis.fit_transform(df.ingredients.apply(map_term_count))])
y = df.cuisine.values

model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)

# Cross-validation
score = sklearn.model_selection.cross_val_score(model, X, y, cv=2)
print(score)

# Running on the test dataset
t_df = open_dataset('test.json')
X_test = pipeline.transform(t_df.ingredients.apply(Counter))
X_test = scipy.sparse.hstack([X_test, pipeline_bis.transform(t_df.ingredients.apply(map_term_count))])

model.fit(X, y)
predictions = model.predict(X_test)

result_df = pd.DataFrame(index=t_df.index)
result_df['cuisine'] = pd.Series(predictions, index=result_df.index)
result_df['ingredients'] = t_df['ingredients']
result_df

# ## Thanks for following! I hope you learned a thing or two :-)
# 
# Feel free to ask any questions, or contact me on [kachkach.com](www.kachkach.com) / [@halflings](http://github.com/halflings)