import pandas as pd

# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/AV1611text.zip
# if you need unzip....
!sudo apt-get install -y unzip

!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/AV1611text.zip
!unzip AV1611text.zip
!ls -l AV1611text/

import glob
import pandas as pd

# Read every book of the King James Bible into a list of dictionaries
paths = glob.glob("AV1611text/*")
books = []
for path in paths:
    # For fear of unicode: read as ASCII and drop anything that won't decode
    contents = open(path, encoding="ascii", errors="ignore").read()
    contents = contents.lower()
    book = {}
    book['contents'] = contents
    # Strip the "AV1611text/" prefix and the ".txt" extension to get the book name
    filename = path[11:]
    book['name'] = filename[:-4]
    books.append(book)

books_df = pd.DataFrame(books)
books_df.head()

from sklearn.feature_extraction.text import TfidfVectorizer

# You'll need to change new_stopwords to 'english' if you haven't
# run the code down below about nltk and creating a new stopwords list
vectorizer = TfidfVectorizer(max_features=10000, stop_words=new_stopwords)
X = vectorizer.fit_transform(books_df['contents'])

from sklearn.cluster import KMeans

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()  # on scikit-learn >= 1.0, use get_feature_names_out()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind], end='')
    print('')

additional_stopwords = ['shall', 'ye', 'thee', 'thou', 'thy', 'unto']

import nltk
# nltk.download('stopwords')  # uncomment if you haven't fetched the stopwords corpus yet
english_stopwords = nltk.corpus.stopwords.words('english')
new_stopwords = additional_stopwords + english_stopwords

# You should already have the data, so you can skip pulling/unzipping it
# Data is from http://www.cs.cornell.edu/home/llee/data/convote.html
#!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz
#!tar -zxvf convote_v1.1.tar.gz

import re
import glob

# Build one dictionary per Congressional speech; the filename encodes the
# bill number, speaker number, party and vote
paths = glob.glob("convote_v1.1/data_stage_one/development_set/*")
speeches = []
for path in paths:
    speech = {}
    filename = path[-26:]
    speech['filename'] = filename
    speech['bill_no'] = filename[:3]
    speech['speaker_no'] = filename[4:10]
    speech['bill_vote'] = filename[-5]
    speech['party'] = filename[-7]

    # Open the file
    speech_file = open(path, 'r')
    # Read the stuff out of it
    speech['contents'] = speech_file.read()

    # Strip punctuation, collapse repeated spaces, and count the words
    cleaned_contents = re.sub(r"[^ \w]", '', speech['contents'])
    cleaned_contents = re.sub(r" +", ' ', cleaned_contents)
    cleaned_contents = cleaned_contents.strip()
    words = cleaned_contents.split(' ')
    speech['word_count'] = len(words)

    speeches.append(speech)

speeches[:5]

speeches_df = pd.DataFrame(speeches)
speeches_df.head()

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
# .copy() so we can add a label column later without a SettingWithCopyWarning
longer_speeches = speeches_df[speeches_df["word_count"] > 92].copy()
X = vectorizer.fit_transform(longer_speeches['contents'])

number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

longer_speeches["k-means label"] = km.labels_
longer_speeches.head()

# Pull out the speeches that landed in cluster 2 and re-cluster just those
epa_speeches = longer_speeches[longer_speeches["k-means label"] == 2]

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(epa_speeches['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :5]:
        print(' %s' % terms[ind], end='')
    print('')

speeches_df["word_count"].describe()

speeches_df[speeches_df["word_count"] < 17]["contents"][:10]

!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/hp.zip
!unzip -o hp.zip

# Read in the Harry Potter fanfics
paths = glob.glob("hp/*")
fanfics = []
for path in paths:
    # For fear of unicode: read as ASCII and drop anything that won't decode
    contents = open(path, encoding="ascii", errors="ignore").read()
    contents = contents.lower()
    fanfic = {}
    fanfic['contents'] = contents
    # Strip the "hp/" prefix and the ".txt" extension
    filename = path[3:]
    fanfic['name'] = filename[:-4]
    fanfics.append(fanfic)

fanfics_df = pd.DataFrame(fanfics)
fanfics_df.head()

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(fanfics_df['contents'])

number_of_clusters = 2
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

fanfics_df["k-means labels"] = km.labels_
# .copy() so we can add a label column later without a SettingWithCopyWarning
harrys_friends_df = fanfics_df[fanfics_df["k-means labels"] == 0].copy()

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(harrys_friends_df['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

harrys_friends_df["friends label"] = km.labels_
draco_herm_df = harrys_friends_df[harrys_friends_df["friends label"] == 0]

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(draco_herm_df['contents'])

number_of_clusters = 3
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/twilight.zip
!unzip -o twilight.zip
!ls twilight
!cat twilight/10016071.txt

# Read in the Twilight fanfics
paths = glob.glob("twilight/*")
fanfics = []
for path in paths:
    # For fear of unicode: read as ASCII and drop anything that won't decode
    contents = open(path, encoding="ascii", errors="ignore").read()
    contents = contents.lower()
    fanfic = {}
    fanfic['contents'] = contents
    # Strip the "twilight/" prefix (9 characters) and the ".txt" extension
    filename = path[9:]
    fanfic['name'] = filename[:-4]
    fanfics.append(fanfic)

fanfics_df = pd.DataFrame(fanfics)
fanfics_df.head()

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(fanfics_df['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    print("Cluster %d:" % i, end='')
    for ind in order_centroids[i, :8]:
        print(' %s' % terms[ind], end='')
    print('')

# More data to try the same thing on:
# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/tweets.csv.zip
# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/tweets-ukraine.csv.zip
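# A minimal sketch of running the exact same TF-IDF + k-means pipeline on the tweet
# datasets linked above. Two assumptions you should check before running it: that the
# zip unpacks to a file named tweets.csv, and that the tweet text lives in a column
# called 'text'. Adjust the filename and column name to match the actual data.

#!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/tweets.csv.zip
#!unzip -o tweets.csv.zip

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

tweets_df = pd.read_csv("tweets.csv")  # assumed filename inside the zip

vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(tweets_df['text'])  # assumed column name

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

# Same "top terms per cluster" readout as above, one line per cluster
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :8]]
    print("Cluster %d: %s" % (i, ' '.join(top_terms)))

# How many tweets landed in each cluster
tweets_df["k-means label"] = km.labels_
tweets_df["k-means label"].value_counts()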