#!/usr/bin/env python
# coding: utf-8

# In[ ]:

get_ipython().run_line_magic('matplotlib', 'inline')
import matplotlib.pyplot as plt
import numpy as np

plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 16, 9

# # Data analytics and machine learning with Python

# # I - Acquiring data

# ### A simple HTTP request

# In[ ]:

import requests

print(requests.get("http://example.com").text)

# ### Communicating with APIs

# In[ ]:

response = requests.get("https://www.googleapis.com/books/v1/volumes", params={"q": "machine learning"})
raw_data = response.json()
titles = [item['volumeInfo']['title'] for item in raw_data['items']]
titles

# ### Parsing websites

# In[ ]:

import lxml.html

page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple")
# ^ This is probably illegal. Blocket, please don't sue me!

items_data = []
for el in page.getroot().find_class("item_row"):
    links = el.find_class("item_link")
    images = el.find_class("item_image")
    prices = el.find_class("list_price")
    if links and images and prices and prices[0].text:
        items_data.append({"name": links[0].text,
                           "image": images[0].attrib['src'],
                           "price": int(prices[0].text.split(":")[0].replace(" ", ""))})
items_data

# ### Reading local files (CSV/JSON)

# In[ ]:

import pandas

df = pandas.read_csv('sample.csv')

# In[ ]:

# Display the DataFrame
df

# In[ ]:

# DataFrame's columns
df.columns

# In[ ]:

# Values of a given column
df.Model

# # Analyzing the DataFrame

# In[ ]:

# Any missing values?
df['Price']

# In[ ]:

df['Description']

# In[ ]:

# Fill missing descriptions with a placeholder, and missing prices by a linear interpolation
df['Description'] = df['Description'].fillna("No description is available.")
df['Price'] = df['Price'].interpolate()
df

# # II - Exploring data

# In[ ]:

import matplotlib.pyplot as plt

df = pandas.read_csv('sample2.csv')
df

# In[ ]:

# This table has 3 columns: Office, Year, Sales
df.columns

# In[ ]:

# It's really easy to query data with Pandas:
df[(df['Office'] == 'Stockholm') & (df['Sales'] > 260)]

# In[ ]:

# It's also easy to do aggregations...
aggregated_stockholm_sales = df[df.Office == 'Stockholm'].groupby('Year').sum()
aggregated_stockholm_sales

# In[ ]:

aggregated_ny_sales = df[df.Office == 'New York'].groupby('Year').sum()

# ... and generate plots
aggregated_stockholm_sales.plot(kind='bar')
aggregated_ny_sales.plot(kind='bar', color='g')
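# The two calls above draw Stockholm and New York on separate figures. As a minimal sketch (assuming `sample2.csv` keeps the same `Office`, `Year` and `Sales` columns shown above), a pivot table puts both offices side by side on a single chart:

# In[ ]:

# Sum sales per (Year, Office) pair and plot both offices on one grouped bar chart.
pivoted_sales = df.pivot_table(index='Year', columns='Office', values='Sales', aggfunc='sum')
pivoted_sales.plot(kind='bar')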
# # Machine learning

# ## Feature extraction

# In[ ]:

from sklearn import feature_extraction

# ### Extracting features from text

# In[ ]:

corpus = ['Cats? I love cats!',
          'I love dogs.',
          'I hate cats :(',
          'I love trains',
          ]

tfidf = feature_extraction.text.TfidfVectorizer()
print(tfidf.fit_transform(corpus).toarray())
print(tfidf.get_feature_names_out())

# ### Dict vectorizer

# In[ ]:

import json

data = [json.loads("""{"weight": 194.0, "sex": "female", "student": true}"""),
        {"weight": 60., "sex": 'female', "student": True},
        {"weight": 80.1, "sex": 'male', "student": False},
        {"weight": 65.3, "sex": 'male', "student": True},
        {"weight": 58.5, "sex": 'female', "student": False}]

vectorizer = feature_extraction.DictVectorizer(sparse=False)
vectors = vectorizer.fit_transform(data)
print(vectors)
print(vectorizer.get_feature_names_out())

# ### Pre-processing

# ##### Scaling

# In[ ]:

from sklearn import preprocessing

data = [[10., 2345., 0., 2.],
        [3., -3490., 0.1, 1.99],
        [13., 3903., -0.2, 2.11]]

# Rescale each sample (row) to unit norm
preprocessing.normalize(data)

# ##### Dimensionality reduction

# In[ ]:

from sklearn import decomposition

data = [[0.3, 0.2, 0.4, 0.32],
        [0.3, 0.5, 1.0, 0.19],
        [0.3, -0.4, -0.8, 0.22]]

# Project the data onto its principal components
pca = decomposition.PCA()
print(pca.fit_transform(data))
print(pca.explained_variance_ratio_)

# # Machine learning models

# ## Classification (SVM)

# In[ ]:

from sklearn import datasets
from sklearn import svm

# In[ ]:

iris = datasets.load_iris()

# Keep only the first two features so the data can be plotted in 2D
X = iris.data[:, :2]
y = iris.target

plt.scatter(X[:, 0], X[:, 1], c=['rgb'[v] for v in y])

to_predict = np.array([[4.35, 3.1], [5.61, 2.42]])
plt.scatter(to_predict[:, 0], to_predict[:, 1], color='purple')

# In[ ]:

# Training the model
clf = svm.SVC(kernel='rbf')
clf.fit(X, y)

# Doing predictions
print(clf.predict(to_predict))

# ## Regression (linear regression)

# In[ ]:

import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt


def f(x):
    return x + np.random.random() * 3.

X = np.arange(0, 5, 0.5)
X = X.reshape((len(X), 1))
y = list(map(f, X))

clf = linear_model.LinearRegression()
clf.fit(X, y)

# In[ ]:

new_X = np.arange(0.2, 5.2, 0.3)
new_X = new_X.reshape((len(new_X), 1))
new_y = clf.predict(new_X)

plt.scatter(X, y, color='g', label='Training data')
plt.plot(new_X, new_y, '.-', label='Predicted')
plt.legend()

# ## Clustering (DBSCAN)

# In[ ]:

from sklearn.cluster import DBSCAN
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

# Generate sample data
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.3, random_state=0)

plt.scatter(X[:, 0], X[:, 1])

# In[ ]:

# Compute DBSCAN
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
db.labels_

# In[ ]:

import matplotlib.pyplot as plt

# Noise points are labeled -1 and drawn in white
plt.scatter(X[:, 0], X[:, 1], c=['rgbw'[v] for v in db.labels_])

# ## Cross-validation

# In[ ]:

from sklearn import svm, model_selection, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target

model = svm.SVC()
print(model_selection.cross_val_score(model, X, y, scoring='precision_weighted'))
print(model_selection.cross_val_score(model, X, y, scoring='neg_mean_squared_error'))

# # A more complex machine learning pipeline: "What's cooking?"

# This is a basic solution I wrote for the Kaggle competition "What's cooking?", where the goal is to predict which type of cuisine a meal belongs to, based on its list of ingredients.
# 
# You'll need more advanced features and methods to win a Kaggle competition, but this already gets you 90% of the way there.
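# Before diving into the full pipeline, here is a rough sketch of the data it expects. The record below is made up for illustration (it is not taken from the actual Kaggle files): each entry in `train.json` should carry an `id`, a `cuisine` label and a list of `ingredients`, while `test.json` has the same shape minus `cuisine`.

# In[ ]:

from collections import Counter

# Hypothetical training record, shown only to illustrate the expected schema.
sample_record = {"id": 123,
                 "cuisine": "greek",
                 "ingredients": ["romaine lettuce", "black olives", "feta cheese"]}

# Counting ingredients produces the dict-of-counts that the DictVectorizer below consumes.
Counter(sample_record["ingredients"])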
# In[ ]:

from collections import Counter
import json

import pandas as pd
import scipy.sparse
import sklearn.pipeline
import sklearn.model_selection
import sklearn.feature_extraction
import sklearn.naive_bayes


def open_dataset(path):
    with open(path) as f:
        data = json.load(f)
    df = pd.DataFrame(data).set_index('id')
    return df

df = open_dataset('train.json')

# One pipeline vectorizes whole ingredients, the other individual terms (words)
pipeline = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(),
                                          sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))
pipeline_bis = sklearn.pipeline.make_pipeline(sklearn.feature_extraction.DictVectorizer(),
                                              sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))


def map_term_count(ingredients):
    # Split each ingredient into words and count the occurrences of every term
    return Counter(sum((i.split(' ') for i in ingredients), []))

X = pipeline.fit_transform(df.ingredients.apply(Counter))
X = scipy.sparse.hstack([X, pipeline_bis.fit_transform(df.ingredients.apply(map_term_count))])
y = df.cuisine.values

model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)

# Cross-validation
score = sklearn.model_selection.cross_val_score(model, X, y, cv=2)
print(score)

# Running on the test dataset
t_df = open_dataset('test.json')
X_test = pipeline.transform(t_df.ingredients.apply(Counter))
X_test = scipy.sparse.hstack([X_test, pipeline_bis.transform(t_df.ingredients.apply(map_term_count))])

model.fit(X, y)
predictions = model.predict(X_test)

result_df = pd.DataFrame(index=t_df.index)
result_df['cuisine'] = pd.Series(predictions, index=result_df.index)
result_df['ingredients'] = t_df['ingredients']
result_df

# ## Thanks for following! I hope you learned a thing or two :-)
# 
# Feel free to ask any questions, or contact me on [kachkach.com](www.kachkach.com) / [@halflings](http://github.com/halflings)