# Walkthrough of the scikit-learn estimator API: the create/fit pattern,
# predict/transform, fitted attributes (trailing underscore), Pipelines,
# sklearn-pandas DataFrameMapper, and grid / randomized hyper-parameter search.
#
# Repairs relative to the original notebook export:
#   * Python 2 `print` statements -> print() calls (valid on both 2 and 3).
#   * sklearn.lda, sklearn.cross_validation and sklearn.grid_search were
#     removed from scikit-learn; modern module paths are used instead.
#   * IPython `%timeit` magics (invalid in a plain .py file) replaced with an
#     explicit timer helper.
#   * `grid_scores_` (removed) -> `cv_results_`; 1-D sample passed to
#     predict() reshaped to the required 2-D form.

from random import shuffle            # retained from the original (unused here)
from timeit import default_timer

from numpy import arange, logical_or  # `arange` retained from the original
import pandas as pd

import sklearn.decomposition
import sklearn.linear_model
import sklearn.metrics
import sklearn.pipeline
import sklearn.preprocessing
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_selection import SelectKBest
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix, matthews_corrcoef
from sklearn.model_selection import GridSearchCV, KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn_pandas import DataFrameMapper, cross_val_score

# --- Two-class subset (setosa vs. versicolor) of the iris data -------------
iris = datasets.load_iris()
subset = logical_or(iris.target == 0, iris.target == 1)
X = iris.data[subset]
y = iris.target[subset]
print(X[0:5, :])
print(y[0:5])

# Linear Discriminant Analysis.  With two classes there is only one
# discriminant axis, so n_components must be 1 (the removed sklearn.lda.LDA
# silently accepted 2; the modern class raises for n_components > n_classes-1).
lda = LDA(n_components=1)
lda.fit(X, y)
confusion_matrix(y, lda.predict(X))

# --- Every estimator follows the same create-then-fit pattern --------------
# Create Model
model = LogisticRegression()
# Fit Model
model.fit(X, y)

# Create Model (clustering is unsupervised: fit takes X only)
kmeans = KMeans(n_clusters=2)
# Fit Model
kmeans.fit(X)

# Create Model
pca = PCA(n_components=2)
# Fit Model
pca.fit(X)

# Transformers also accept (and ignore) y, so they slot into supervised
# pipelines uniformly.
pca = PCA(n_components=2)
pca.fit(X, y)

# Create Model (univariate feature selection)
kbest = SelectKBest(k=3)
# Fit Model
kbest.fit(X, y)

model = LogisticRegression()
model.fit(X, y)
kbest = SelectKBest(k=1)
kbest.fit(X, y)
kmeans = KMeans(n_clusters=2)
kmeans.fit(X, y)
pca = PCA(n_components=2)
pca.fit(X, y)

# --- Fitted attributes end in a trailing underscore ------------------------
model = LogisticRegression()
model.fit(X, y)
print(model.coef_)
kmeans = KMeans(n_clusters=2)
kmeans.fit(X)
print(kmeans.cluster_centers_)
pca = PCA(n_components=2)
pca.fit(X, y)
print(pca.explained_variance_)
kbest = SelectKBest(k=1)
kbest.fit(X, y)
print(kbest.get_support())

# --- Prediction on new samples (the two per-class mean rows) ---------------
model = LogisticRegression()
model.fit(X, y)
X_test = [[5.006, 3.418, 1.464, 0.244],
          [5.936, 2.77, 4.26, 1.326]]
model.predict(X_test)
print(model.predict_proba(X_test))

# --- transform / fit_transform ---------------------------------------------
pca = PCA(n_components=2)
pca.fit(X)
print(pca.transform(X)[0:5, :])
pca = PCA(n_components=2)
print(pca.fit_transform(X)[0:5, :])
kbest = SelectKBest(k=1)
kbest.fit(X, y)
print(kbest.transform(X)[0:5, :])

# --- Baseline classifier and scoring ---------------------------------------
model = DummyClassifier()
model.fit(X, y)
model.score(X, y)

# --- Chaining steps with a Pipeline ----------------------------------------
pipe = Pipeline([
    ("select", SelectKBest(k=3)),
    ("pca", PCA(n_components=1)),
    ("classify", LogisticRegression()),
])
pipe.fit(X, y)
pipe.predict(X)

# --- Text classification on 20 newsgroups (downloads the corpus) -----------
news = fetch_20newsgroups()
data = news.data
category = news.target
len(data)
print(" ".join(news.target_names))
print(data[8])
pipe = Pipeline([
    ('vect', CountVectorizer(max_features=100)),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
])
pipe.fit(data, category)

# --- sklearn-pandas: per-column transformations on a DataFrame -------------
data = pd.DataFrame({
    'pet': ['cat', 'dog', 'dog', 'fish', 'cat', 'dog', 'cat', 'fish'],
    'children': [4., 6, 3, 3, 2, 3, 5, 4],
    'salary': [90, 24, 44, 27, 32, 59, 36, 27]
})
mapper = DataFrameMapper([
    ('pet', sklearn.preprocessing.LabelBinarizer()),
    ('children', sklearn.preprocessing.StandardScaler()),
    ('salary', None)
])
mapper.fit_transform(data)

mapper = DataFrameMapper([
    ('pet', sklearn.preprocessing.LabelBinarizer()),
    ('children', sklearn.preprocessing.StandardScaler()),
    ('salary', None)
])
pipe = Pipeline([
    ("mapper", mapper),
    ("pca", PCA(n_components=2))
])
pipe.fit_transform(data)  # 'data' is a data frame, not a numpy array!

# --- Hyper-parameter search -------------------------------------------------
# Create sample dataset
X, y = datasets.make_classification(
    n_samples=1000, n_features=40, n_informative=6, n_classes=2)

# Pipeline for Feature Selection to Random Forest
pipe = Pipeline([
    ("select", SelectKBest()),
    ("classify", RandomForestClassifier()),
])

# Define parameter grid; step__param names address pipeline components.
param_grid = {
    "select__k": [1, 6, 20, 40],
    "classify__n_estimators": [1, 10, 100],
}
gs = GridSearchCV(pipe, param_grid)
# Search over grid
gs.fit(X, y)
gs.best_params_
# predict() requires a 2-D (n_samples, n_features) array, so the single
# mean sample is reshaped to one row.
print(gs.best_estimator_.predict(X.mean(axis=0).reshape(1, -1)))
gs.cv_results_  # replaces the removed grid_scores_ attribute


def _time_fit(search, X, y):
    # Plain-Python stand-in for the IPython %timeit magic: time one fit.
    start = default_timer()
    search.fit(X, y)
    print("fit took %.2f s" % (default_timer() - start))


param_grid = {
    "select__k": [1, 5, 10, 15, 20, 25, 30, 35, 40],
    "classify__n_estimators": [1, 5, 10, 25, 50, 75, 100],
}
gs = GridSearchCV(pipe, param_grid, n_jobs=1)
_time_fit(gs, X, y)
print()
gs = GridSearchCV(pipe, param_grid, n_jobs=7)
_time_fit(gs, X, y)
print()

param_grid = {
    "select__k": range(1, 40),
    "classify__n_estimators": range(1, 100),
}
# Exhaustive search over the full grid (slow!)...
gs = GridSearchCV(pipe, param_grid, n_jobs=7)
gs.fit(X, y)
print("Best CV score", gs.best_score_)
print(gs.best_params_)

# ...versus a randomized search over just 10 sampled candidates.
gs = RandomizedSearchCV(pipe, param_grid, n_jobs=7, n_iter=10)
gs.fit(X, y)
print("Best CV score", gs.best_score_)
print(gs.best_params_)