# NOTE(review): this file is a flat export of a Jupyter notebook; the
# `%matplotlib inline` line is an IPython magic and is not valid plain
# Python -- it only runs inside IPython/Jupyter.
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
# Use ggplot styling and large default figures for every plot below.
plt.style.use('ggplot')
plt.rcParams['figure.figsize'] = 16, 9
import requests
# Minimal HTTP GET demo: fetch a page and print its raw HTML body.
print(requests.get("http://example.com").text)
# Query the Google Books API for volumes matching "machine learning".
response = requests.get("https://www.googleapis.com/books/v1/volumes", params={"q":"machine learning"})
raw_data = response.json()
# Pull just the book titles out of the JSON payload.
titles = [item['volumeInfo']['title'] for item in raw_data['items']]
# Bare expression: displays the list in a notebook; a no-op in a script.
titles
import lxml.html

# Scrape Blocket listings matching "apple" in Stockholm.
# NOTE(review): the loop/if bodies below had lost their indentation in the
# notebook export (invalid Python); structure restored here. The CSS class
# names are fragile and break whenever the site's markup changes.
page = lxml.html.parse("http://www.blocket.se/stockholm?q=apple")
# ^ This is probably illegal. Blocket, please don't sue me!
items_data = []
for el in page.getroot().find_class("item_row"):
    links = el.find_class("item_link")
    images = el.find_class("item_image")
    prices = el.find_class("list_price")
    # Skip incomplete rows: missing link, image, or an empty price tag.
    if links and images and prices and prices[0].text:
        items_data.append({"name": links[0].text,
                           "image": images[0].attrib['src'],
                           # Price text looks like "1 234:-": drop the ":-"
                           # suffix and the thousands-separator spaces.
                           "price": int(prices[0].text.split(":")[0].replace(" ", ""))})
items_data
import pandas
# Load a small demo CSV (expects at least Model, Price, Description columns).
df = pandas.read_csv('sample.csv')
# Display the DataFrame (bare expressions render only inside a notebook)
df
# DataFrame's columns
df.columns
# Values of a given column (attribute access works for identifier-like names)
df.Model
# Any missing values? Inspect the two columns that may contain NaNs.
df['Price']
df['Description']
# Replace missing descriptions with a placeholder string...
df['Description'] = df['Description'].fillna("No description is available.")
# ...and fill missing prices by a linear interpolation
df['Price'] = df['Price'].interpolate()
df
import matplotlib.pyplot as plt

# Office sales table: one row per (Office, Year) with a Sales figure.
df = pandas.read_csv('sample2.csv')
df
# This table has 3 columns: Office, Year, Sales
df.columns
# Row filtering composes boolean masks with `&`:
stockholm_mask = df['Office'] == 'Stockholm'
df[stockholm_mask & (df['Sales'] > 260)]
# Aggregations: total sales per year for each office...
stockholm_by_year = df[stockholm_mask].groupby('Year').sum()
stockholm_by_year
ny_by_year = df[df['Office'] == 'New York'].groupby('Year').sum()
# ... and bar plots straight from the aggregated frames
stockholm_by_year.plot(kind='bar')
ny_by_year.plot(kind='bar', color='g')
from sklearn import feature_extraction

# TF-IDF demo: vectorize a tiny corpus and show the dense weight matrix.
corpus = ['Cats? I love cats!',
          'I love dogs.',
          'I hate cats :(',
          'I love trains',
          ]
tfidf = feature_extraction.text.TfidfVectorizer()
print(tfidf.fit_transform(corpus).toarray())
# `get_feature_names()` was removed in scikit-learn 1.2; the replacement
# (available since 1.0) is `get_feature_names_out()`.
print(tfidf.get_feature_names_out())
import json

# DictVectorizer demo: turn dicts with mixed categorical/numeric/boolean
# values into numeric feature vectors (categoricals become one-hot columns).
# The first record goes through json.loads to show JSON and dict literals
# produce equivalent inputs.
data = [json.loads("""{"weight": 194.0, "sex": "female", "student": true}"""),
        {"weight": 60., "sex": 'female', "student": True},
        {"weight": 80.1, "sex": 'male', "student": False},
        {"weight": 65.3, "sex": 'male', "student": True},
        {"weight": 58.5, "sex": 'female', "student": False}]
vectorizer = feature_extraction.DictVectorizer(sparse=False)
vectors = vectorizer.fit_transform(data)
print(vectors)
# `get_feature_names()` was removed in scikit-learn 1.2;
# use `get_feature_names_out()` instead.
print(vectorizer.get_feature_names_out())
from sklearn import preprocessing

# Normalization demo: rescale each row (sample) to unit L2 norm.
samples = [[10., 2345., 0., 2.],
           [3., -3490., 0.1, 1.99],
           [13., 3903., -0.2, 2.11]]
preprocessing.normalize(samples)
from sklearn import decomposition

# PCA demo: project the samples onto their principal components and show
# the fraction of variance each component explains.
pca_input = [[0.3, 0.2, 0.4, 0.32],
             [0.3, 0.5, 1.0, 0.19],
             [0.3, -0.4, -0.8, 0.22]]
pca = decomposition.PCA()
print(pca.fit_transform(pca_input))
print(pca.explained_variance_ratio_)
from sklearn import datasets
from sklearn import svm

# Iris demo: keep only the first two features so the data fits on a
# 2-D scatter plot.
iris = datasets.load_iris()
X = iris.data[:, :2]
y = iris.target
# Color each point by its class label (0/1/2 -> 'r'/'g'/'b').
plt.scatter(X[:, 0], X[:, 1], color=['rgb'[label] for label in y])
# Two unlabeled points to classify, drawn in purple.
to_predict = np.array([[4.35, 3.1], [5.61, 2.42]])
plt.scatter(to_predict[:, 0], to_predict[:, 1], color='purple')
# Training the model (fit returns the estimator itself)
clf = svm.SVC(kernel='rbf').fit(X, y)
# Doing predictions
print(clf.predict(to_predict))
import numpy as np
from sklearn import linear_model
import matplotlib.pyplot as plt

# NOTE(review): the body of `f` had lost its indentation in the notebook
# export (invalid Python); restored here.
def f(x):
    """Noisy identity: x plus uniform noise drawn from [0, 3)."""
    return x + np.random.random() * 3.

# Training data: x = 0, 0.5, ..., 4.5 as a column vector.
X = np.arange(0, 5, 0.5)
X = X.reshape((len(X), 1))
y = list(map(f, X))
# Fit an ordinary least-squares line to the noisy samples.
clf = linear_model.LinearRegression()
clf.fit(X, y)
# Predict on a slightly shifted grid of new x values.
new_X = np.arange(0.2, 5.2, 0.3)
new_X = new_X.reshape((len(new_X), 1))
new_y = clf.predict(new_X)
plt.scatter(X, y, color='g', label='Training data')
plt.plot(new_X, new_y, '.-', label='Predicted')
plt.legend()
from sklearn.cluster import DBSCAN
# `sklearn.datasets.samples_generator` was removed in scikit-learn 0.24;
# `make_blobs` is importable directly from `sklearn.datasets`.
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler
# Generate sample data: 200 points around three cluster centers.
centers = [[1, 1], [-1, -1], [1, -1]]
X, labels_true = make_blobs(n_samples=200, centers=centers, cluster_std=0.3,
                            random_state=0)
plt.scatter(X[:, 0], X[:, 1])
# Compute DBSCAN (eps = neighborhood radius, min_samples = core-point density).
db = DBSCAN(eps=0.3, min_samples=10).fit(X)
db.labels_
import matplotlib.pyplot as plt
# Color points by cluster id; noise points are labeled -1, and Python's
# negative indexing maps them to 'w' (white) in 'rgbw'.
plt.scatter(X[:, 0], X[:, 1], c=['rgbw'[v] for v in db.labels_])
# `sklearn.cross_validation` was removed in scikit-learn 0.20; the
# cross-validation helpers now live in `sklearn.model_selection`.
from sklearn import svm, model_selection, datasets

iris = datasets.load_iris()
X, y = iris.data, iris.target
model = svm.SVC()
# Weighted precision, averaged across the CV folds.
print(model_selection.cross_val_score(model, X, y, scoring='precision_weighted'))
# Scorers follow a "greater is better" convention since 0.18, so the MSE
# scorer is named 'neg_mean_squared_error' (plain 'mean_squared_error' is
# no longer accepted).
print(model_selection.cross_val_score(model, X, y, scoring='neg_mean_squared_error'))
This is a basic solution I wrote for the Kaggle competition "What's cooking?", where the goal is to predict which type of cuisine a meal belongs to based on its list of ingredients.
You'll need more advanced features and methods to win a Kaggle competition, but this already gets you 90% of the way there.
from collections import Counter
import json

import pandas as pd
import scipy.sparse
import sklearn.cross_validation
import sklearn.feature_extraction
import sklearn.model_selection
import sklearn.naive_bayes
import sklearn.pipeline
def open_dataset(path):
    """Load a Kaggle "What's cooking?" JSON file into a DataFrame.

    The file is a JSON array of records, each carrying an 'id' key;
    the returned DataFrame is indexed by that id.
    (Review note: the body had lost its indentation in the notebook
    export, and the old file-handle name shadowed the historical
    builtin `file` -- both fixed.)
    """
    with open(path) as fp:
        data = json.load(fp)
    df = pd.DataFrame(data).set_index('id')
    return df
df = open_dataset('train.json')
# Two parallel vectorization pipelines: one over raw ingredient counts,
# one over counts of the individual words inside each ingredient string.
pipeline = sklearn.pipeline.make_pipeline(
    sklearn.feature_extraction.DictVectorizer(),
    sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))
pipeline_bis = sklearn.pipeline.make_pipeline(
    sklearn.feature_extraction.DictVectorizer(),
    sklearn.feature_extraction.text.TfidfTransformer(sublinear_tf=True))

# NOTE(review): this function's body had lost its indentation in the
# notebook export; restored here.
def map_term_count(ingredients):
    """Count every whitespace-separated word across a recipe's ingredients."""
    return Counter(sum((i.split(' ') for i in ingredients), []))

# Feature matrix: whole-ingredient TF-IDF side by side with word TF-IDF.
X = pipeline.fit_transform(df.ingredients.apply(Counter))
X = scipy.sparse.hstack([X, pipeline_bis.fit_transform(df.ingredients.apply(map_term_count))])
y = df.cuisine.values
model = sklearn.naive_bayes.MultinomialNB(alpha=0.1)
# Cross-validation.
# `sklearn.cross_validation` was removed in scikit-learn 0.20; use the
# local import below so this cell works on modern releases.
import sklearn.model_selection
score = sklearn.model_selection.cross_val_score(model, X, y, cv=2)
print(score)
# Running on the test dataset
t_df = open_dataset('test.json')
# Re-use the pipelines fitted on the training data (transform only, no fit).
test_counts = t_df.ingredients.apply(Counter)
test_terms = t_df.ingredients.apply(map_term_count)
X_test = scipy.sparse.hstack([pipeline.transform(test_counts),
                              pipeline_bis.transform(test_terms)])
# Train on all labeled data, then label every test recipe.
model.fit(X, y)
predictions = model.predict(X_test)
# Build the result frame: predicted cuisine plus the original ingredients
# for eyeballing the output.
result_df = pd.DataFrame(index=t_df.index)
result_df['cuisine'] = pd.Series(predictions, index=result_df.index)
result_df['ingredients'] = t_df['ingredients']
result_df
Feel free to ask any questions, or contact me at kachkach.com / @halflings.