import pandas as pd

# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/AV1611text.zip
# if you need unzip....
!sudo apt-get install -y unzip

!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/AV1611text.zip
!unzip AV1611text.zip
!ls -l AV1611text/

import glob
import pandas as pd

# Read every book of the King James Bible into a list of dictionaries
paths = glob.glob("AV1611text/*")
books = []
for path in paths:
    # For fear of unicode: read as ASCII and drop anything that won't decode
    contents = open(path, encoding="ascii", errors="ignore").read()
    contents = contents.lower()
    book = {}
    book['contents'] = contents
    # Strip the "AV1611text/" prefix and the ".txt" extension to get the book name
    filename = path[11:]
    book['name'] = filename[:-4]
    books.append(book)

books_df = pd.DataFrame(books)
books_df.head()

from sklearn.feature_extraction.text import TfidfVectorizer

# You'll need to change new_stopwords to 'english' if you haven't
# run the code down below about nltk and creating a new stopwords list
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)
X = vectorizer.fit_transform(books_df['contents'])

from sklearn.cluster import KMeans

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()  # on scikit-learn >= 1.0, use get_feature_names_out()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print('')

additional_stopwords = ['shall', 'ye', 'thee', 'thou', 'thy', 'unto']

import nltk
# nltk.download('stopwords')  # uncomment if you haven't fetched the stopwords corpus yet
english_stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = additional_stopwords + english_stopwords

# You should already have the data, so you can skip pulling/unzipping it
# Data is from http://www.cs.cornell.edu/home/llee/data/convote.html
#!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz
#!tar -zxvf convote_v1.1.tar.gz

import re
import glob

# Build one dictionary per Congressional speech; the filename encodes the
# bill number, speaker number, party and vote
paths = glob.glob("convote_v1.1/data_stage_one/development_set/*")
speeches = []
for path in paths:
    speech = {}
    filename = path[-26:]
    speech['filename'] = filename
    speech['bill_no'] = filename[:3]
    speech['speaker_no'] = filename[4:10]
    speech['bill_vote'] = filename[-5]
    speech['party'] = filename[-7]

    # Open the file
    speech_file = open(path, 'r')
    # Read the stuff out of it
    speech['contents'] = speech_file.read()

    # Strip punctuation, collapse repeated spaces, and count the words
    cleaned_contents = re.sub(r"[^ \w]", '', speech['contents'])
    cleaned_contents = re.sub(r" +", ' ', cleaned_contents)
    cleaned_contents = cleaned_contents.strip()
    words = cleaned_contents.split(' ')
    speech['word_count'] = len(words)

    speeches.append(speech)

speeches[:5]

speeches_df = pd.DataFrame(speeches)
speeches_df.head()

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
# .copy() so we can add a label column later without a SettingWithCopyWarning
longer_speeches = speeches_df[speeches_df["word_count"] > 92].copy()
X = vectorizer.fit_transform(longer_speeches['contents'])

number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

longer_speeches["k-means label"] = km.labels_
longer_speeches.head()

# Pull out the speeches that landed in cluster 2 and re-cluster just those
epa_speeches = longer_speeches[longer_speeches["k-means label"] == 2]

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(epa_speeches['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind], end='')
    print('')

speeches_df["word_count"].describe()

speeches_df[speeches_df["word_count"] < 17]["contents"][:10]

!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/hp.zip
!unzip -o hp.zip

# Read in the Harry Potter fanfics
paths = glob.glob("hp/*")
fanfics = []
for path in paths:
    # For fear of unicode: read as ASCII and drop anything that won't decode
    contents = open(path, encoding="ascii", errors="ignore").read()
    contents = contents.lower()
    fanfic = {}
    fanfic['contents'] = contents
    # Strip the "hp/" prefix and the ".txt" extension
    filename = path[3:]
    fanfic['name'] = filename[:-4]
    fanfics.append(fanfic)

fanfics_df = pd.DataFrame(fanfics)
fanfics_df.head()

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(fanfics_df['contents'])

number_of_clusters = 2
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

fanfics_df["k-means labels"] = km.labels_
# .copy() so we can add a label column later without a SettingWithCopyWarning
harrys_friends_df = fanfics_df[fanfics_df["k-means labels"] == 0].copy()

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(harrys_friends_df['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

harrys_friends_df["friends label"] = km.labels_
draco_herm_df = harrys_friends_df[harrys_friends_df["friends label"] == 0]

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(draco_herm_df['contents'])

number_of_clusters = 3
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/twilight.zip
!unzip -o twilight.zip
!ls twilight
!cat twilight/10016071.txt

# Read in the Twilight fanfics
paths = glob.glob("twilight/*")
fanfics = []
for path in paths:
    # For fear of unicode: read as ASCII and drop anything that won't decode
    contents = open(path, encoding="ascii", errors="ignore").read()
    contents = contents.lower()
    fanfic = {}
    fanfic['contents'] = contents
    # Strip the "twilight/" prefix (9 characters) and the ".txt" extension
    filename = path[9:]
    fanfic['name'] = filename[:-4]
    fanfics.append(fanfic)

fanfics_df = pd.DataFrame(fanfics)
fanfics_df.head()

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(fanfics_df['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

# More data to try the same thing on:
# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/tweets.csv.zip
# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/tweets-ukraine.csv.zip
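# A minimal sketch of running the exact same TF-IDF + k-means pipeline on the tweet
# datasets linked above. Two assumptions you should check before running it: that the
# zip unpacks to a file named tweets.csv, and that the tweet text lives in a column
# called 'text'. Adjust the filename and column name to match the actual data.

#!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/tweets.csv.zip
#!unzip -o tweets.csv.zip

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

tweets_df = pd.read_csv("tweets.csv")  # assumed filename inside the zip

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(tweets_df['text'])  # assumed column name

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

# Same "top terms per cluster" readout as above, one line per cluster
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :8]]
    print("Cluster %d: %s" % (i, ' '.join(top_terms)))

# How many tweets landed in each cluster
tweets_df["k-means label"] = km.labels_
tweets_df["k-means label"].value_counts()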