import pandas as pd
# Download and unpack the AV1611 text archive used by the cells below.
# Source: https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/AV1611text.zip
# If unzip is not installed, run: !sudo apt-get install -y unzip
!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/AV1611text.zip
!unzip AV1611text.zip
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 1430k 100 1430k 0 0 1588k 0 --:--:-- --:--:-- --:--:-- 1587k Archive: AV1611text.zip creating: AV1611text/ inflating: AV1611text/.DS_Store creating: __MACOSX/ creating: __MACOSX/AV1611text/ inflating: __MACOSX/AV1611text/._.DS_Store inflating: AV1611text/1Chronicles.txt inflating: __MACOSX/AV1611text/._1Chronicles.txt inflating: AV1611text/1Corinthians.txt inflating: __MACOSX/AV1611text/._1Corinthians.txt inflating: AV1611text/1John.txt inflating: __MACOSX/AV1611text/._1John.txt inflating: AV1611text/1Kings.txt inflating: __MACOSX/AV1611text/._1Kings.txt inflating: AV1611text/1Peter.txt inflating: __MACOSX/AV1611text/._1Peter.txt inflating: AV1611text/1Samuel.txt inflating: __MACOSX/AV1611text/._1Samuel.txt inflating: AV1611text/1Thessalonians.txt inflating: __MACOSX/AV1611text/._1Thessalonians.txt inflating: AV1611text/1Timothy.txt inflating: __MACOSX/AV1611text/._1Timothy.txt inflating: AV1611text/2Chronicles.txt inflating: __MACOSX/AV1611text/._2Chronicles.txt inflating: AV1611text/2Corinthians.txt inflating: __MACOSX/AV1611text/._2Corinthians.txt inflating: AV1611text/2John.txt inflating: __MACOSX/AV1611text/._2John.txt inflating: AV1611text/2Kings.txt inflating: __MACOSX/AV1611text/._2Kings.txt inflating: AV1611text/2Peter.txt inflating: __MACOSX/AV1611text/._2Peter.txt inflating: AV1611text/2Samuel.txt inflating: __MACOSX/AV1611text/._2Samuel.txt inflating: AV1611text/2Thessalonians.txt inflating: __MACOSX/AV1611text/._2Thessalonians.txt inflating: AV1611text/2Timothy.txt inflating: __MACOSX/AV1611text/._2Timothy.txt inflating: AV1611text/3John.txt inflating: __MACOSX/AV1611text/._3John.txt inflating: AV1611text/Acts.txt inflating: __MACOSX/AV1611text/._Acts.txt inflating: AV1611text/Amos.txt inflating: __MACOSX/AV1611text/._Amos.txt inflating: AV1611text/Colossians.txt inflating: __MACOSX/AV1611text/._Colossians.txt inflating: AV1611text/Daniel.txt 
inflating: __MACOSX/AV1611text/._Daniel.txt inflating: AV1611text/Dedicatory.txt inflating: __MACOSX/AV1611text/._Dedicatory.txt inflating: AV1611text/Deuteronomy.txt inflating: __MACOSX/AV1611text/._Deuteronomy.txt inflating: AV1611text/Ecclesiastes.txt inflating: __MACOSX/AV1611text/._Ecclesiastes.txt inflating: AV1611text/Ephesians.txt inflating: __MACOSX/AV1611text/._Ephesians.txt inflating: AV1611text/Esther.txt inflating: __MACOSX/AV1611text/._Esther.txt inflating: AV1611text/Exodus.txt inflating: __MACOSX/AV1611text/._Exodus.txt inflating: AV1611text/Ezekiel.txt inflating: __MACOSX/AV1611text/._Ezekiel.txt inflating: AV1611text/Ezra.txt inflating: __MACOSX/AV1611text/._Ezra.txt inflating: AV1611text/Galatians.txt inflating: __MACOSX/AV1611text/._Galatians.txt inflating: AV1611text/Genesis.txt inflating: __MACOSX/AV1611text/._Genesis.txt inflating: AV1611text/Habakkuk.txt inflating: __MACOSX/AV1611text/._Habakkuk.txt inflating: AV1611text/Haggai.txt inflating: __MACOSX/AV1611text/._Haggai.txt inflating: AV1611text/Hebrews.txt inflating: __MACOSX/AV1611text/._Hebrews.txt inflating: AV1611text/Hosea.txt inflating: __MACOSX/AV1611text/._Hosea.txt inflating: AV1611text/Isaiah.txt inflating: __MACOSX/AV1611text/._Isaiah.txt inflating: AV1611text/James.txt inflating: __MACOSX/AV1611text/._James.txt inflating: AV1611text/Jeremiah.txt inflating: __MACOSX/AV1611text/._Jeremiah.txt inflating: AV1611text/Job.txt inflating: __MACOSX/AV1611text/._Job.txt inflating: AV1611text/Joel.txt inflating: __MACOSX/AV1611text/._Joel.txt inflating: AV1611text/John.txt inflating: __MACOSX/AV1611text/._John.txt inflating: AV1611text/Jonah.txt inflating: __MACOSX/AV1611text/._Jonah.txt inflating: AV1611text/Joshua.txt inflating: __MACOSX/AV1611text/._Joshua.txt inflating: AV1611text/Jude.txt inflating: __MACOSX/AV1611text/._Jude.txt inflating: AV1611text/Judges.txt inflating: __MACOSX/AV1611text/._Judges.txt inflating: AV1611text/Lamentations.txt inflating: 
__MACOSX/AV1611text/._Lamentations.txt inflating: AV1611text/Leviticus.txt inflating: __MACOSX/AV1611text/._Leviticus.txt inflating: AV1611text/Luke.txt inflating: __MACOSX/AV1611text/._Luke.txt inflating: AV1611text/Malachi.txt inflating: __MACOSX/AV1611text/._Malachi.txt inflating: AV1611text/Mark.txt inflating: __MACOSX/AV1611text/._Mark.txt inflating: AV1611text/Matthew.txt inflating: __MACOSX/AV1611text/._Matthew.txt inflating: AV1611text/Micah.txt inflating: __MACOSX/AV1611text/._Micah.txt inflating: AV1611text/Nahum.txt inflating: __MACOSX/AV1611text/._Nahum.txt inflating: AV1611text/Nehemiah.txt inflating: __MACOSX/AV1611text/._Nehemiah.txt inflating: AV1611text/Numbers.txt inflating: __MACOSX/AV1611text/._Numbers.txt inflating: AV1611text/Obadiah.txt inflating: __MACOSX/AV1611text/._Obadiah.txt inflating: AV1611text/Philemon.txt inflating: __MACOSX/AV1611text/._Philemon.txt inflating: AV1611text/Philippians.txt inflating: __MACOSX/AV1611text/._Philippians.txt inflating: AV1611text/Preface.txt inflating: __MACOSX/AV1611text/._Preface.txt inflating: AV1611text/Preface_w_footnotes.txt inflating: __MACOSX/AV1611text/._Preface_w_footnotes.txt inflating: AV1611text/Proverbs.txt inflating: __MACOSX/AV1611text/._Proverbs.txt inflating: AV1611text/Psalms.txt inflating: __MACOSX/AV1611text/._Psalms.txt inflating: AV1611text/Revelation.txt inflating: __MACOSX/AV1611text/._Revelation.txt inflating: AV1611text/Romans.txt inflating: __MACOSX/AV1611text/._Romans.txt inflating: AV1611text/Ruth.txt inflating: __MACOSX/AV1611text/._Ruth.txt inflating: AV1611text/SongofSolomon.txt inflating: __MACOSX/AV1611text/._SongofSolomon.txt inflating: AV1611text/Titus.txt inflating: __MACOSX/AV1611text/._Titus.txt inflating: AV1611text/Zechariah.txt inflating: __MACOSX/AV1611text/._Zechariah.txt inflating: AV1611text/Zephaniah.txt inflating: __MACOSX/AV1611text/._Zephaniah.txt inflating: __MACOSX/._AV1611text
# Sanity check: list the extracted files and their sizes.
!ls -l AV1611text/
total 9016 -rwxr-xr-x 1 soma staff 116614 Nov 9 2003 1Chronicles.txt -rwxr-xr-x 1 soma staff 51441 Nov 13 2003 1Corinthians.txt -rwxr-xr-x 1 soma staff 13289 Nov 14 2003 1John.txt -rwxr-xr-x 1 soma staff 132016 Nov 9 2003 1Kings.txt -rwxr-xr-x 1 soma staff 14081 Nov 14 2003 1Peter.txt -rwxr-xr-x 1 soma staff 134405 Nov 9 2003 1Samuel.txt -rwxr-xr-x 1 soma staff 10247 Nov 13 2003 1Thessalonians.txt -rwxr-xr-x 1 soma staff 13419 Nov 14 2003 1Timothy.txt -rwxr-xr-x 1 soma staff 144096 Nov 9 2003 2Chronicles.txt -rwxr-xr-x 1 soma staff 33679 Nov 13 2003 2Corinthians.txt -rwxr-xr-x 1 soma staff 1641 Nov 14 2003 2John.txt -rwxr-xr-x 1 soma staff 125212 Nov 9 2003 2Kings.txt -rwxr-xr-x 1 soma staff 9063 Nov 14 2003 2Peter.txt -rwxr-xr-x 1 soma staff 110670 Nov 9 2003 2Samuel.txt -rwxr-xr-x 1 soma staff 5812 Nov 13 2003 2Thessalonians.txt -rwxr-xr-x 1 soma staff 9842 Nov 14 2003 2Timothy.txt -rwxr-xr-x 1 soma staff 1688 Nov 14 2003 3John.txt -rwxr-xr-x 1 soma staff 134948 Nov 13 2003 Acts.txt -rwxr-xr-x 1 soma staff 22719 Nov 13 2003 Amos.txt -rwxr-xr-x 1 soma staff 11333 Nov 13 2003 Colossians.txt -rwxr-xr-x 1 soma staff 64065 Nov 13 2003 Daniel.txt -rwxr-xr-x 1 soma staff 5751 Feb 24 2004 Dedicatory.txt -rwxr-xr-x 1 soma staff 152225 Nov 5 2003 Deuteronomy.txt -rwxr-xr-x 1 soma staff 29934 Nov 13 2003 Ecclesiastes.txt -rwxr-xr-x 1 soma staff 17136 Nov 13 2003 Ephesians.txt -rwxr-xr-x 1 soma staff 31308 Nov 9 2003 Esther.txt -rwxr-xr-x 1 soma staff 176725 Nov 5 2003 Exodus.txt -rwxr-xr-x 1 soma staff 213314 Nov 13 2003 Ezekiel.txt -rwxr-xr-x 1 soma staff 42038 Nov 9 2003 Ezra.txt -rwxr-xr-x 1 soma staff 17067 Nov 13 2003 Galatians.txt -rwxr-xr-x 1 soma staff 205748 Nov 5 2003 Genesis.txt -rwxr-xr-x 1 soma staff 8304 Nov 13 2003 Habakkuk.txt -rwxr-xr-x 1 soma staff 5987 Nov 13 2003 Haggai.txt -rwxr-xr-x 1 soma staff 38903 Nov 14 2003 Hebrews.txt -rwxr-xr-x 1 soma staff 28373 Nov 13 2003 Hosea.txt -rwxr-xr-x 1 soma staff 202179 Nov 13 2003 Isaiah.txt -rwxr-xr-x 1 soma staff 
12711 Nov 14 2003 James.txt -rwxr-xr-x 1 soma staff 231838 Nov 13 2003 Jeremiah.txt -rwxr-xr-x 1 soma staff 100264 Nov 10 2003 Job.txt -rwxr-xr-x 1 soma staff 11124 Nov 13 2003 Joel.txt -rwxr-xr-x 1 soma staff 102311 Nov 13 2003 John.txt -rwxr-xr-x 1 soma staff 6914 Nov 13 2003 Jonah.txt -rwxr-xr-x 1 soma staff 104045 Nov 5 2003 Joshua.txt -rwxr-xr-x 1 soma staff 3672 Nov 14 2003 Jude.txt -rwxr-xr-x 1 soma staff 102418 Nov 5 2003 Judges.txt -rwxr-xr-x 1 soma staff 19010 Nov 13 2003 Lamentations.txt -rwxr-xr-x 1 soma staff 132373 Nov 5 2003 Leviticus.txt -rwxr-xr-x 1 soma staff 140547 Nov 13 2003 Luke.txt -rwxr-xr-x 1 soma staff 9571 Nov 13 2003 Malachi.txt -rwxr-xr-x 1 soma staff 82513 Nov 13 2003 Mark.txt -rwxr-xr-x 1 soma staff 129900 Nov 13 2003 Matthew.txt -rwxr-xr-x 1 soma staff 16999 Nov 13 2003 Micah.txt -rwxr-xr-x 1 soma staff 7217 Nov 13 2003 Nahum.txt -rwxr-xr-x 1 soma staff 59451 Nov 9 2003 Nehemiah.txt -rwxr-xr-x 1 soma staff 183087 Nov 5 2003 Numbers.txt -rwxr-xr-x 1 soma staff 3727 Nov 13 2003 Obadiah.txt -rwxr-xr-x 1 soma staff 2533 Nov 14 2003 Philemon.txt -rwxr-xr-x 1 soma staff 12217 Nov 13 2003 Philippians.txt -rwxr-xr-x 1 soma staff 63160 Dec 1 2003 Preface.txt -rwxr-xr-x 1 soma staff 67128 Nov 23 2003 Preface_w_footnotes.txt -rwxr-xr-x 1 soma staff 85466 Nov 10 2003 Proverbs.txt -rwxr-xr-x 1 soma staff 243509 Nov 13 2003 Psalms.txt -rwxr-xr-x 1 soma staff 64205 Nov 14 2003 Revelation.txt -rwxr-xr-x 1 soma staff 52602 Nov 13 2003 Romans.txt -rwxr-xr-x 1 soma staff 13561 Nov 6 2003 Ruth.txt -rwxr-xr-x 1 soma staff 14584 Nov 13 2003 SongofSolomon.txt -rwxr-xr-x 1 soma staff 5512 Nov 14 2003 Titus.txt -rwxr-xr-x 1 soma staff 34273 Nov 13 2003 Zechariah.txt -rwxr-xr-x 1 soma staff 8841 Nov 13 2003 Zephaniah.txt
import glob
import pandas as pd
import os


def read_text_files(paths):
    """Load each file in *paths* as a lowercased, ASCII-only record.

    Returns a list with one dict per path, holding two keys:
    ``contents`` is the file's text with non-ASCII bytes dropped and the
    rest lowercased; ``name`` is the file's base name without its extension.
    """
    records = []
    for path in paths:
        # Read raw bytes inside `with` so the handle is closed promptly.
        # Decoding bytes behaves the same on Python 2 and 3, whereas
        # str.decode() does not exist on Python 3.
        with open(path, "rb") as handle:
            raw = handle.read()
        # For fear of unicode: silently drop anything that is not ASCII.
        contents = raw.decode("ascii", "ignore").lower()
        # Derive the name from the path itself instead of slicing by the
        # hard-coded lengths of the directory prefix and the extension.
        name = os.path.splitext(os.path.basename(path))[0]
        records.append({"contents": contents, "name": name})
    return records


paths = glob.glob("AV1611text/*")
books = read_text_files(paths)
books_df = pd.DataFrame(books)
books_df.head()
contents | name | |
---|---|---|
0 | the first book of the chronicles\r\n\r\nchapte... | 1Chronicles |
1 | the first epistle of paul the apostle to the c... | 1Corinthians |
2 | the first general epistle of john\r\n\r\nchapt... | 1John |
3 | the first book of the kings\r\n\r\ncommonly ca... | 1Kings |
4 | the first epistle general of peter\r\n\r\nchap... | 1Peter |
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

# `new_stopwords` is built by the nltk cell further down. If that cell has
# not been run yet, fall back to scikit-learn's built-in English list
# instead of failing with a NameError.
try:
    stop_words = new_stopwords
except NameError:
    stop_words = 'english'

# TF-IDF matrix over the book texts, capped at the 10,000 most frequent terms.
vectorizer = TfidfVectorizer(max_features=10000, stop_words=stop_words)
X = vectorizer.fit_transform(books_df['contents'])

# Group the books into a fixed number of clusters.
number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=5, n_init=10, n_jobs=1, precompute_distances=True, random_state=None, tol=0.0001, verbose=0)
# Report the highest-weighted TF-IDF terms of every cluster centroid.
print("Top terms per cluster:")
# argsort() is ascending; reversing each row puts the largest weights first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new
# installations.
try:
    terms = vectorizer.get_feature_names_out()
except AttributeError:
    terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :10]]
    # A single print call per cluster works on Python 2 and 3; the previous
    # `print(...),` / `print ''` forms are Python 2 only.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))
Top terms per cluster: Cluster 0: lord hath upon god like man day hast thine wicked Cluster 1: christ god jesus lord things also us faith man hath Cluster 2: lord king said israel god upon people house son land Cluster 3: said jesus disciples man came saying god went peter lord Cluster 4: god man faith things us st good hath christ jesus
import nltk

# Extra words to ignore on top of nltk's English stopword list.
additional_stopwords = [
    'shall',
    'ye',
    'thee',
    'thou',
    'thy',
    'unto',
]
english_stopwords = nltk.corpus.stopwords.words('english')

# Combined list: the extra words first, then the standard English ones.
new_stopwords = list(additional_stopwords)
new_stopwords.extend(english_stopwords)
# You should already have the data, so you can skip pulling/unzipping it
# Data is from http://www.cs.cornell.edu/home/llee/data/convote.html
#!curl -O http://www.cs.cornell.edu/home/llee/data/convote/convote_v1.1.tar.gz
#!tar -zxvf convote_v1.1.tar.gz
import re
import glob
import os


def parse_speech(path):
    """Read one speech file and return its metadata, text and word count.

    The metadata fields are sliced out of the file's base name at fixed
    positions: ``bill_no`` is the first three characters, ``speaker_no``
    is characters 4-9, ``party`` is the 7th character from the end and
    ``bill_vote`` the 5th from the end. ``contents`` is the raw file text
    and ``word_count`` counts the words left after stripping everything
    except spaces and word characters.
    """
    # Use the real base name rather than assuming it is exactly 26
    # characters long.
    filename = os.path.basename(path)
    # `with` guarantees the handle is closed even if reading fails.
    with open(path, 'r') as speech_file:
        contents = speech_file.read()
    # Drop punctuation and newlines, then collapse runs of spaces.
    cleaned_contents = re.sub(r"[^ \w]", '', contents)
    cleaned_contents = re.sub(r" +", ' ', cleaned_contents).strip()
    # split() with no argument returns [] for an empty string, so an empty
    # speech counts 0 words (split(' ') would report 1).
    words = cleaned_contents.split()
    return {
        'filename': filename,
        'bill_no': filename[:3],
        'speaker_no': filename[4:10],
        'bill_vote': filename[-5],
        'party': filename[-7],
        'contents': contents,
        'word_count': len(words),
    }


paths = glob.glob("convote_v1.1/data_stage_one/development_set/*")
speeches = [parse_speech(path) for path in paths]
speeches[:5]
[{'bill_no': '052', 'bill_vote': 'N', 'contents': "mr. chairman , i thank the gentlewoman for yielding me this time . \nmy good colleague from california raised the exact and critical point . \nthe question is , what happens during those 45 days ? \nwe will need to support elections . \nthere is not a single member of this house who has not supported some form of general election , a special election , to replace the members at some point . \nbut during that 45 days , what happens ? \nthe chair of the constitution subcommittee says this is what happens : martial law . \nwe do not know who would fill the vacancy of the presidency , but we do know that the succession act most likely suggests it would be an unelected person . \nthe sponsors of the bill before us today insist , and i think rightfully so , on the importance of elections . \nbut to then say that during a 45-day period we would have none of the checks and balances so fundamental to our constitution , none of the separation of powers , and that the presidency would be filled by an unelected member of the cabinet who not a single member of this country , not a single citizen , voted to fill that position , and that that person would have no checks and balances from congress for a period of 45 days i find extraordinary . \ni find it inconsistent . \ni find it illogical , and , frankly , i find it dangerous . \nthe gentleman from wisconsin refused earlier to yield time , but i was going to ask him , if virginia has those elections in a shorter time period , they should be commended for that . \nso now we have a situation in the congress where the virginia delegation has sent their members here , but many other states do not have members here . \ndo they at that point elect a speaker of the house in the absence of other members ? \nand then three more states elect their representatives , temporary replacements , or full replacements at that point . \nthey come in . \ndo they elect a new speaker ? 
\nand if that happens , who becomes the president under the succession act ? \nthis bill does not address that question . \nthis bill responds to real threats with fantasies . \nit responds with the fantasy , first of all , that a lot of people will still survive ; but we have no guarantee of that . \nit responds with the fantasy that those who do survive will do the right thing . \nwe are here having this debate , we have debates every day , because people differ on what the right thing is to do . \ni have been in very traumatic situations with people in severe car wrecks and mountain climbing accidents . \nmy experience has not been that crisis imbues universal sagacity and fairness . \nit has not been that . \npeople respond in extraordinary ways , and we must preserve an institution that has the deliberative body and the checks and balances to meet those challenges . \nmany of our states are going increasingly to mail-in ballots . \nwe in this body were effectively disabled by an anthrax attack not long after september 11 . \ni would ask my dear friends , will you conduct this election in 45 days if there is anthrax in the mail and still preserve the franchise of the american people ? \nhow will you do that ? \nyou have no answer to that question . \ni find it extraordinary , frankly , that while saying you do not want to amend the constitution , we began this very congress by amending the constitution through the rule , by undermining the principle that a quorum is 50 percent of the body and instead saying it is however many people survive . \nand if that rule applies , who will designate it , who will implement it ? \nthe speaker , or the speaker 's designee ? \nagain , not an elected person , as you say is so critical and i believe is critical , but a temporary appointee , frankly , who not a single other member of this body knows who they are . 
\nso we not only have an unelected person , we have an unknown person who will convene this body , and who , by the way , could conceivably convene it for their own election to then become the president of the united states under the succession act . \nyou have refused steadfastly to debate this real issue broadly . \nyou had a mock debate in the committee on the judiciary in which the distinguished chairman presented my bill without allowing me the courtesy or dignity to defend it myself . \nand on that , you proudly say you defend democracy . \nsir , i think you dissemble in that regard . \nhere is the fundamental question for us , my friends , and it is this : the american people are watching television and an announcement comes on and says the congress has been destroyed in a nuclear attack , the president and vice president are killed and the supreme court is dead and thousands of our citizens in this town are . \nwhat happens next ? \nunder your bill , 45 days of chaos . \napparently , according to the committee on the judiciary subcommittee on the constitution chairman , 45 days of marshal law , rule of this country by an unelected president with no checks and balances . \nor an alternative , an alternative which says quite simply that the people have entrusted the representatives they send here to make profound decisions , war , taxation , a host of other things , and those representatives would have the power under the bill of the gentleman from california ( mr. rohrabacher ) xz4003430 bill or mine to designate temporary successors , temporary , only until we can have a real election . \nthe american people , in one scenario , are told we do not know who is going to run the country , we have no representatives ; where in another you will have temporary representatives carrying your interests to this great body while we deliberate and have real elections . \nthat is the choice . 
\nyou are making the wrong choice today if you think you have solved this problem . \n", 'filename': '052_400011_0327014_DON.txt', 'party': 'D', 'speaker_no': '400011', 'word_count': 974}, {'bill_no': '052', 'bill_vote': 'N', 'contents': "mr. chairman , i want to thank my good friend from california ( mr. rohrabacher ) xz4003430 . \ni will always remember that day , as we all will . \nhis point is well taken . \ni understand there is good intent behind the bill before us today and the amendment , but it is not enough . \nit simply is not . \nit leaves our country vulnerable for 45 days and that is too long . \nthe distinguished chairman of the committee on the judiciary made some comments recently that suggested that somehow terrorists would oppose this bill and by some implication would favor the bill the gentleman from california ( mr. rohrabacher ) xz4003430 and i have put forward because it seems to support their autocratic views of government . \nnothing could be further from the truth . \nin fact , what our bill would do is tell the terrorists , you could come on a single day and set off a nuclear weapon in this town and kill every single member of us ; and though we would be missed , the very next day the congress would be up and functioning with every single state , every single district having full representation by statesmen and stateswomen at a time of national crisis . \nthat is what the gentleman from california ( mr. rohrabacher ) xz4003430 and i are trying to do . \nwe are trying to tell the terrorists , you can kill all of us as individuals , but you will not defeat this institution . \nyou will not defeat the principle of representation . \nyou will not defeat the principles of checks and balances . \nyou will not impose martial law . \nhere is the irony . \nif terrorists hit us today when we finally vote on this , let us suppose a few democrats do not make it over here . \nyou are leaving this country vulnerable to change in power . 
\nif the terrorists were to strike your conference retreat where the president speaks to the republican house and senate members and kill hundreds of house and senate members on the republican side , the democrats at that point claim the majority . \nthe democrats at that point elect a speaker of the house . \ni am a democrat , for goodness sakes ; but that is not the way to leave our country vulnerable . \nyou are leaving your own party , you are leaving the will of the people through their elections vulnerable . \nif we have temporary replacements , you immediately reconstitute the house ; you immediately ensure representation ; you assure that you maintain the balance of political power ; and you do it in an orderly , structured way with no chaos , in a way that is constitutionally valid by definition . \nwhat you have proposed is not necessarily constitutionally valid . \nit leaves the terrorists able to change our system of government . \nit depends on a fantasy immediate or quick election . \nit does not allow really qualified people necessarily to get here and act in time . \nthere are so many things you have left undone . \nyou are going to try to say that at the start of this year we have solved this problem ; let us go home . \nyou have not solved the problem , and it is a doggone disgrace , and it is a danger to this country . \nthe other day a gentleman testified before the committee on the budget and said this : `` the lack of preparation for continuity , for true continuity invites attack. '' you are inviting attack . \nnot preventing attack . \n", 'filename': '052_400011_0327025_DON.txt', 'party': 'D', 'speaker_no': '400011', 'word_count': 556}, {'bill_no': '052', 'bill_vote': 'N', 'contents': 'mr. chairman , i rise to make two fundamental points before we proceed to vote on this . \nthe two points are these : this resolution does not solve the real problem and it may create more problems than it purports to solve , and we have to understand that . 
\nit does not solve the problem for this reason : by leaving us without a congress for 45 days , we essentially impose the opportunity for the executive branch to exert marshal law , and that is not what the framers of this country had in mind . \nthis bill , if we do not provide some mechanism for prompt replacement other than this bill , will leave this country governed by an unelected executive , a cabinet member most likely who not a single american elected to that office . \nfurthermore , it has a host of problems . \nit does not address the possibility that one delegation will elect its representatives more promptly than another . \nthey will come to this body , choose one of its members as speaker . \nthat person could move on to become the president . \nthen another delegation comes in , et cetera . \nyou are essentially leaving this country without a house of representatives , without checks and balances , without separation of powers , for at least 45 days , assuming an election can be held in 45 days and assuming that the terrorists through an anthrax attack , like they subjected this very capitol to , will not somehow undermine that ability . \nthis is reality . \nwe have seen the reality here . \nwe saw those airplanes hit the buildings , we saw the anthrax , and yet we are not truly acting to solve this . \nmr. chairman , i yield to my distinguished friend , the gentleman from california ( mr. rohrabacher ) xz4003430 . \n', 'filename': '052_400011_0327044_DON.txt', 'party': 'D', 'speaker_no': '400011', 'word_count': 282}, {'bill_no': '052', 'bill_vote': 'N', 'contents': 'mr. 
chairman , reclaiming my time , let me make two final points : one , the majority party must understand this : if you are at a republican conference retreat and terrorists should strike you and kill the president and vice president and significant numbers of your side of the aisle , the democrats under your proposed law will obtain the majority , will elect a speaker of the house , and that person will then become the president of the united states of america . \nyou are leaving this country vulnerable to that . \nyou must not do it . \nyou must not . \nthis matter must be taken seriously . \nit deserves full debate . \nwhether it is the proposal of the gentleman from california ( mr. rohrabacher ) xz4003430 and mine or others , we should commit to having this full house seriously consider this . \nif we do not and we are not fortunate , history will not look kindly upon the jeopardy in which we have left this great nation . \nvote no on this bill and insist on true debate on true continuity of congress in a responsible way that protects the balance of power , assures real succession to the presidency , and , most importantly , assures that your constituents will have representation at a time when our nation may well go to nuclear war , institute a draft , appropriate trillions of dollars , suspend habeas corpus and impose marshal law . \nyou do not want that . \nbut if you stop at this bill , you leave this nation vulnerable . \nmr. chairman , if there is no one to speak in opposition , i ask unanimous consent to withdraw my preferential motion . \n', 'filename': '052_400011_0327046_DON.txt', 'party': 'D', 'speaker_no': '400011', 'word_count': 261}, {'bill_no': '052', 'bill_vote': 'N', 'contents': "mr. chairman , i thank my distinguished colleague , and i appreciate his leadership on this issue . \nthe gentleman from california ( mr. 
rohrabacher ) xz4003430 spoke eloquently about the need for the rohrabacher/baird amendment ; and i would like to address it briefly , if i may . \nmadison is quoted on this topic , but let me quote madison from federalist 47 . \nhe said : `` the accumulation of all powers , legislative , executive , and judiciary in the same hands , whether of one , a few , or many , and whether hereditary , self-appointed , or elected , may justly be pronounced the very definition of tyranny. '' now , i would like , if i may , to ask my colleagues , before we pass this appropriations bill with legislative language in it alleging to maintain continuity , to maybe address a couple of questions , before my colleagues vote on this , and i will yield time . \nnot for a filibuster , but just to address some questions . \nhow will we , given madison 's concern , maintain checks and balances during the 49-day period until we have the special elections ? \ni would be happy to yield 30 seconds to anyone who plans to vote for this bill to address that question . \n", 'filename': '052_400011_1479036_DON.txt', 'party': 'D', 'speaker_no': '400011', 'word_count': 189}]
# One row per speech dict; the dict keys become the columns.
speeches_df = pd.DataFrame(data=speeches)
# Preview the first five rows.
speeches_df.head(n=5)
bill_no | bill_vote | contents | filename | party | speaker_no | word_count | |
---|---|---|---|---|---|---|---|
0 | 052 | N | mr. chairman , i thank the gentlewoman for yie... | 052_400011_0327014_DON.txt | D | 400011 | 974 |
1 | 052 | N | mr. chairman , i want to thank my good friend ... | 052_400011_0327025_DON.txt | D | 400011 | 556 |
2 | 052 | N | mr. chairman , i rise to make two fundamental ... | 052_400011_0327044_DON.txt | D | 400011 | 282 |
3 | 052 | N | mr. chairman , reclaiming my time , let me mak... | 052_400011_0327046_DON.txt | D | 400011 | 261 |
4 | 052 | N | mr. chairman , i thank my distinguished collea... | 052_400011_1479036_DON.txt | D | 400011 | 189 |
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
# Keep only speeches with more than 92 words. Take an explicit copy: a later
# cell adds a "k-means label" column to this frame, and assigning to a
# filtered slice of speeches_df triggers pandas' SettingWithCopyWarning.
longer_speeches = speeches_df[speeches_df["word_count"] > 92].copy()
X = vectorizer.fit_transform(longer_speeches['contents'])

number_of_clusters = 7
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

# Report the highest-weighted TF-IDF terms of every cluster centroid.
print("Top terms per cluster:")
# argsort() is ascending; reversing each row puts the largest weights first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both.
try:
    terms = vectorizer.get_feature_names_out()
except AttributeError:
    terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :8]]
    # A single print call per cluster works on Python 2 and 3; the previous
    # `print(...),` / `print ''` forms are Python 2 only.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))
Top terms per cluster: Cluster 0: start head children program school programs amendment parents Cluster 1: china trade speaker cafta chinese currency jobs madam Cluster 2: treatment blending water sewage epa policy wastewater amendment Cluster 3: religious faith organizations based start head civil rights Cluster 4: veterans care va billion health budget money war Cluster 5: mr amendment chairman time gentleman house horses speaker Cluster 6: frivolous lawsuits court rule courts sanctions 11 lawsuit
# Work on an independent copy: longer_speeches was produced by filtering
# speeches_df, and adding a column to such a slice triggers pandas'
# SettingWithCopyWarning.
longer_speeches = longer_speeches.copy()
longer_speeches["k-means label"] = km.labels_
longer_speeches.head()

# K-means numbers its clusters arbitrarily on every run, so a hard-coded
# label (it was 2 in the run recorded below) is not reproducible. Instead,
# pick the cluster whose centroid gives the term "epa" the most weight.
# This must happen before `vectorizer` and `km` are re-fitted below.
epa_column = vectorizer.vocabulary_["epa"]
epa_label = km.cluster_centers_[:, epa_column].argmax()
epa_speeches = longer_speeches[longer_speeches["k-means label"] == epa_label]

# Re-cluster just the EPA speeches to look for sub-topics.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(epa_speeches['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
# argsort() is ascending; reversing each row puts the largest weights first.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both.
try:
    terms = vectorizer.get_feature_names_out()
except AttributeError:
    terms = vectorizer.get_feature_names()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :5]]
    # A single print call per cluster works on Python 2 and 3; the previous
    # `print(...),` / `print ''` forms are Python 2 only.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))
Top terms per cluster: Cluster 0: subcommittee chairman amendment water policy Cluster 1: sewage epa amendment treatment policy Cluster 2: going rule epa saying anytime Cluster 3: issue fact trying matter obviously Cluster 4: blending treatment use communities weather
# Summary statistics (count, mean, quartiles, ...) of the per-speech word counts.
speeches_df.loc[:, "word_count"].describe()
count 702.000000 mean 273.216524 std 698.807057 min 3.000000 25% 17.000000 50% 92.500000 75% 368.750000 max 15402.000000 dtype: float64
# Peek at the first ten speeches shorter than 17 words.
speeches_df.loc[speeches_df["word_count"] < 17, "contents"].head(10)
5 i yield to the gentleman from illinois . \n 7 i yield to the gentleman from illinois . \n 14 mr. chairman , i demand a recorded vote . \n 24 i am , mr. speaker , in its present form . \n 27 mr. speaker , i demand a recorded vote . \n 28 mr. chairman , i offer an amendment . \n 30 mr. chairman , how much time do i have remaini... 32 mr. chairman , i demand a recorded vote . \n 37 mr. chairman , i yield back the balance of my ... 39 mr. chairman , i yield 2 minutes to the gentle... Name: contents, dtype: object
# Download the hp.zip archive that the cells below read from the hp/ directory.
!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/hp.zip
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 9226k 100 9226k 0 0 4007k 0 0:00:02 0:00:02 --:--:-- 4006k
!unzip -y hp.zip
UnZip 5.52 of 28 February 2005, by Info-ZIP. Maintained by C. Spieler. Send bug reports using http://www.info-zip.org/zip-bug.html; see README for details. Usage: unzip [-Z] [-opts[modifiers]] file[.zip] [list] [-x xlist] [-d exdir] Default action is to extract files in list, except those in xlist, to exdir; file[.zip] may be a wildcard. -Z => ZipInfo mode ("unzip -Z" for usage). -p extract files to pipe, no messages -l list files (short format) -f freshen existing files, create none -t test compressed archive data -u update files, create if necessary -z display archive comment -x exclude files that follow (in xlist) -d extract files into exdir modifiers: -q quiet mode (-qq => quieter) -n never overwrite existing files -a auto-convert any text files -o overwrite files WITHOUT prompting -aa treat ALL files as text -j junk paths (do not make directories) -v be verbose/print version info -C match filenames case-insensitively -L make (some) names lowercase -X restore UID/GID info -V retain VMS version numbers -K keep setuid/setgid/tacky permissions -M pipe through "more" pager Examples (see unzip.txt for more info): unzip data1 -x joe => extract all files except joe from zipfile data1.zip unzip -p foo | more => send contents of foo.zip via pipe into program more unzip -fo foo ReadMe => quietly replace existing ReadMe if archive file newer
import os


def load_fanfics(paths):
    """Read the files at *paths* into a list of fanfic records.

    Each record is a dict with two keys:

    - ``contents``: the file's text with non-ASCII bytes dropped, lower-cased.
    - ``name``: the file name without its directory or extension.

    Records are returned in the same order as *paths*.
    """
    records = []
    for path in paths:
        # Read bytes and decode explicitly: this gives the same text on
        # Python 2 and 3, and the file is closed when the block exits.
        with open(path, "rb") as handle:
            contents = handle.read().decode("ascii", "ignore").lower()
        # Derive the name from the path itself instead of slicing off a
        # fixed number of characters.
        name = os.path.splitext(os.path.basename(path))[0]
        records.append({"contents": contents, "name": name})
    return records


# glob returns paths in arbitrary order; sort so row order is reproducible.
paths = sorted(glob.glob("hp/*"))
fanfics = load_fanfics(paths)
# Naming the columns keeps them present even when no files were found.
fanfics_df = pd.DataFrame(fanfics, columns=["contents", "name"])
fanfics_df.head()
contents | name | |
---|---|---|
0 | prologue: the missiondisclaimer: all character... | 10001898 |
1 | blackdisclaimer: i do not own harry potterauth... | 10004131 |
2 | chapter 1"i'm pregnant.""""mum please say some... | 10004927 |
3 | author's note: hey, just so you know, this is ... | 10007980 |
4 | disclaimer: i do not own harry potter and frie... | 10010343 |
# Turn every story into a TF-IDF vector (vocabulary capped at 10,000 terms,
# English stop words removed).
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(fanfics_df['contents'])

number_of_clusters = 2
# NOTE(review): no random_state is given, so which cluster gets which number
# can differ from run to run.
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
# For each centroid, term indices ordered from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
terms = get_terms()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :8]]
    # One print call per cluster works on both Python 2 and Python 3.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))
Top terms per cluster: Cluster 0: harry hermione draco said just ron like ginny Cluster 1: lily james sirius remus said harry just eyes
fanfics_df["k-means labels"] = km.labels_
# Keep only the stories in cluster 0. Take an explicit copy so that adding
# columns to this subset later does not raise pandas' SettingWithCopyWarning.
# NOTE(review): KMeans is unseeded, so confirm from the printed top terms that
# cluster 0 is the one you want before relying on this filter.
harrys_friends_df = fanfics_df[fanfics_df["k-means labels"] == 0].copy()

# Re-vectorize on the subset so term weights reflect only these stories.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(harrys_friends_df['contents'])

number_of_clusters = 5
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
# For each centroid, term indices ordered from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
terms = get_terms()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :8]]
    # One print call per cluster works on both Python 2 and Python 3.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))
Top terms per cluster: Cluster 0: draco hermione harry malfoy just said like granger Cluster 1: harry said hermione ron ginny dumbledore potter just Cluster 2: hermione ron harry ginny said just fred like Cluster 3: rose scorpius albus said al just malfoy hugo Cluster 4: said like just eyes severus know harry time
# harrys_friends_df was selected out of fanfics_df; work on an explicit copy
# so the column assignment below does not raise SettingWithCopyWarning.
harrys_friends_df = harrys_friends_df.copy()
harrys_friends_df["friends label"] = km.labels_
# NOTE(review): KMeans is unseeded, so confirm from the printed top terms that
# cluster 0 is the one you want before relying on this filter.
draco_herm_df = harrys_friends_df[harrys_friends_df["friends label"] == 0]

# Re-vectorize on the subset so term weights reflect only these stories.
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(draco_herm_df['contents'])

number_of_clusters = 3
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
# For each centroid, term indices ordered from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
terms = get_terms()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :8]]
    # One print call per cluster works on both Python 2 and Python 3.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))
Top terms per cluster: Cluster 0: draco blaise fred pansy george like theo know Cluster 1: draco hermione said just like granger didn malfoy Cluster 2: harry draco potter just malfoy like man eyes
!curl -O https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/twilight.zip
!unzip -o twilight.zip
% Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 2831k 100 2831k 0 0 2784k 0 0:00:01 0:00:01 --:--:-- 2787k Archive: twilight.zip inflating: twilight/10016071.txt inflating: twilight/10016524.txt inflating: twilight/10019441.txt inflating: twilight/10021891.txt inflating: twilight/10029139.txt inflating: twilight/10029797.txt inflating: twilight/10030147.txt inflating: twilight/10042940.txt inflating: twilight/10046233.txt inflating: twilight/10047055.txt inflating: twilight/10050374.txt inflating: twilight/10050718.txt inflating: twilight/10058807.txt inflating: twilight/10060217.txt inflating: twilight/10060881.txt inflating: twilight/10061496.txt inflating: twilight/10064909.txt inflating: twilight/10078293.txt inflating: twilight/10079909.txt inflating: twilight/10082628.txt inflating: twilight/10086525.txt inflating: twilight/10086845.txt inflating: twilight/10087055.txt inflating: twilight/10088968.txt inflating: twilight/10090541.txt inflating: twilight/10103988.txt inflating: twilight/10109144.txt inflating: twilight/10109176.txt inflating: twilight/10109917.txt inflating: twilight/10120506.txt inflating: twilight/10130957.txt inflating: twilight/10134818.txt inflating: twilight/10135294.txt inflating: twilight/10140051.txt inflating: twilight/10141874.txt inflating: twilight/10145528.txt inflating: twilight/10150704.txt inflating: twilight/10152474.txt inflating: twilight/10156755.txt inflating: twilight/10161589.txt inflating: twilight/10170411.txt inflating: twilight/10170441.txt inflating: twilight/10171111.txt inflating: twilight/10177219.txt inflating: twilight/10180541.txt inflating: twilight/10187563.txt inflating: twilight/10189248.txt inflating: twilight/10189525.txt inflating: twilight/10191037.txt inflating: twilight/10191255.txt inflating: twilight/10191633.txt inflating: twilight/10192934.txt inflating: twilight/10194603.txt inflating: twilight/10195667.txt inflating: twilight/10196304.txt 
inflating: twilight/10199678.txt inflating: twilight/10200439.txt inflating: twilight/10204063.txt inflating: twilight/10204235.txt inflating: twilight/10204244.txt inflating: twilight/10207563.txt inflating: twilight/10208579.txt inflating: twilight/10212570.txt inflating: twilight/10216557.txt inflating: twilight/10216605.txt inflating: twilight/10222122.txt inflating: twilight/10232528.txt inflating: twilight/10235795.txt inflating: twilight/10241550.txt inflating: twilight/10246172.txt inflating: twilight/10246462.txt inflating: twilight/10248062.txt inflating: twilight/10250458.txt inflating: twilight/10251763.txt inflating: twilight/10251981.txt inflating: twilight/10258946.txt inflating: twilight/10262365.txt inflating: twilight/10262385.txt inflating: twilight/10262983.txt inflating: twilight/10268896.txt inflating: twilight/10269161.txt inflating: twilight/10273916.txt inflating: twilight/10280960.txt inflating: twilight/10281896.txt inflating: twilight/10284689.txt inflating: twilight/10284801.txt inflating: twilight/10286275.txt inflating: twilight/10294367.txt inflating: twilight/10295939.txt inflating: twilight/10297442.txt inflating: twilight/10298933.txt inflating: twilight/10301389.txt inflating: twilight/10302586.txt inflating: twilight/10307650.txt inflating: twilight/10312333.txt inflating: twilight/10314566.txt inflating: twilight/10323040.txt inflating: twilight/10323045.txt inflating: twilight/10325720.txt inflating: twilight/10327214.txt inflating: twilight/10328544.txt inflating: twilight/10333204.txt inflating: twilight/10334017.txt inflating: twilight/10335946.txt inflating: twilight/10336553.txt inflating: twilight/10336818.txt inflating: twilight/10337605.txt inflating: twilight/10339410.txt inflating: twilight/10343797.txt inflating: twilight/10344814.txt inflating: twilight/10345258.txt inflating: twilight/10348194.txt inflating: twilight/10352165.txt inflating: twilight/10357506.txt inflating: twilight/10358658.txt inflating: 
twilight/10362789.txt inflating: twilight/10363568.txt inflating: twilight/10365151.txt inflating: twilight/10370676.txt inflating: twilight/10370942.txt inflating: twilight/10371789.txt inflating: twilight/10372056.txt inflating: twilight/10374514.txt inflating: twilight/10376925.txt inflating: twilight/10379848.txt inflating: twilight/10381590.txt inflating: twilight/10387254.txt inflating: twilight/10391280.txt inflating: twilight/10395072.txt inflating: twilight/10395577.txt inflating: twilight/10396002.txt inflating: twilight/10397311.txt inflating: twilight/10397579.txt inflating: twilight/10398693.txt inflating: twilight/10403468.txt inflating: twilight/10404430.txt inflating: twilight/10407405.txt inflating: twilight/10412446.txt inflating: twilight/10414259.txt inflating: twilight/10415883.txt inflating: twilight/10417349.txt inflating: twilight/10418008.txt inflating: twilight/10418103.txt inflating: twilight/10425518.txt inflating: twilight/10426753.txt inflating: twilight/10427603.txt inflating: twilight/10428183.txt inflating: twilight/10430881.txt inflating: twilight/10432467.txt inflating: twilight/10433266.txt inflating: twilight/10433536.txt inflating: twilight/10434458.txt inflating: twilight/10434895.txt inflating: twilight/10438356.txt inflating: twilight/10438858.txt inflating: twilight/10446328.txt inflating: twilight/10451419.txt inflating: twilight/10453301.txt inflating: twilight/10453491.txt inflating: twilight/10453825.txt inflating: twilight/10457477.txt inflating: twilight/10457506.txt inflating: twilight/10462777.txt inflating: twilight/10472264.txt inflating: twilight/10473271.txt inflating: twilight/10473280.txt inflating: twilight/10474456.txt inflating: twilight/10474580.txt inflating: twilight/10474855.txt inflating: twilight/10475090.txt inflating: twilight/10477604.txt inflating: twilight/10477950.txt inflating: twilight/10478310.txt inflating: twilight/10481398.txt inflating: twilight/10481550.txt inflating: 
twilight/10483597.txt inflating: twilight/10485026.txt inflating: twilight/10487872.txt inflating: twilight/10488717.txt inflating: twilight/10490963.txt inflating: twilight/10491290.txt inflating: twilight/10491955.txt inflating: twilight/10491999.txt inflating: twilight/10492279.txt inflating: twilight/10493288.txt inflating: twilight/10493501.txt inflating: twilight/10494243.txt inflating: twilight/10496508.txt inflating: twilight/10500426.txt inflating: twilight/10502406.txt inflating: twilight/10504088.txt inflating: twilight/10505005.txt inflating: twilight/10509496.txt inflating: twilight/10509795.txt inflating: twilight/10510031.txt inflating: twilight/10510951.txt inflating: twilight/10511805.txt inflating: twilight/10513794.txt inflating: twilight/10514795.txt inflating: twilight/10517876.txt inflating: twilight/10520140.txt inflating: twilight/10521227.txt inflating: twilight/10523218.txt inflating: twilight/10524772.txt inflating: twilight/10526181.txt inflating: twilight/10527292.txt inflating: twilight/10527617.txt inflating: twilight/10528547.txt inflating: twilight/10528594.txt inflating: twilight/10529345.txt inflating: twilight/10531169.txt inflating: twilight/10533355.txt inflating: twilight/10534458.txt inflating: twilight/10534532.txt inflating: twilight/10535653.txt inflating: twilight/10536476.txt inflating: twilight/10537199.txt inflating: twilight/10537772.txt inflating: twilight/10541448.txt inflating: twilight/10541579.txt inflating: twilight/10543643.txt inflating: twilight/10546687.txt inflating: twilight/10547113.txt inflating: twilight/10547343.txt inflating: twilight/10547370.txt inflating: twilight/10547780.txt inflating: twilight/10548403.txt inflating: twilight/10548632.txt inflating: twilight/10550216.txt inflating: twilight/10551522.txt inflating: twilight/10552181.txt inflating: twilight/10553063.txt inflating: twilight/10554639.txt inflating: twilight/10555069.txt inflating: twilight/10555577.txt inflating: 
twilight/10555968.txt inflating: twilight/10556127.txt inflating: twilight/10556168.txt inflating: twilight/10558082.txt inflating: twilight/10558233.txt inflating: twilight/10558446.txt inflating: twilight/10558662.txt inflating: twilight/10559054.txt inflating: twilight/10559764.txt inflating: twilight/10560953.txt inflating: twilight/10562121.txt inflating: twilight/10562706.txt inflating: twilight/10562816.txt inflating: twilight/10563237.txt inflating: twilight/10563689.txt inflating: twilight/10564873.txt inflating: twilight/10565056.txt inflating: twilight/10567010.txt inflating: twilight/10567223.txt inflating: twilight/10567394.txt inflating: twilight/10568543.txt inflating: twilight/10569116.txt inflating: twilight/10569143.txt inflating: twilight/10569246.txt inflating: twilight/10571217.txt inflating: twilight/10572009.txt inflating: twilight/10573849.txt inflating: twilight/10574055.txt inflating: twilight/10575483.txt inflating: twilight/10575699.txt inflating: twilight/10575743.txt inflating: twilight/10576242.txt inflating: twilight/10577724.txt inflating: twilight/10578186.txt inflating: twilight/10579028.txt inflating: twilight/10579440.txt inflating: twilight/10579738.txt inflating: twilight/10581551.txt inflating: twilight/10582449.txt inflating: twilight/10583173.txt inflating: twilight/10583990.txt inflating: twilight/10584271.txt inflating: twilight/10584313.txt inflating: twilight/10585251.txt inflating: twilight/10585363.txt inflating: twilight/10585807.txt inflating: twilight/10585949.txt inflating: twilight/10586787.txt inflating: twilight/10586903.txt inflating: twilight/10587095.txt inflating: twilight/10587412.txt inflating: twilight/10587486.txt inflating: twilight/10588117.txt inflating: twilight/10590030.txt inflating: twilight/10590413.txt inflating: twilight/10590578.txt inflating: twilight/10591164.txt inflating: twilight/10591786.txt inflating: twilight/10592328.txt inflating: twilight/10594034.txt inflating: 
twilight/10594298.txt inflating: twilight/10594448.txt inflating: twilight/10594891.txt inflating: twilight/10595084.txt inflating: twilight/10595448.txt inflating: twilight/10595722.txt inflating: twilight/10596133.txt inflating: twilight/10596177.txt inflating: twilight/10596192.txt inflating: twilight/10596374.txt inflating: twilight/10597666.txt inflating: twilight/10598064.txt inflating: twilight/10598593.txt inflating: twilight/10598938.txt inflating: twilight/10598968.txt inflating: twilight/10599652.txt inflating: twilight/10599692.txt inflating: twilight/10599794.txt inflating: twilight/10600073.txt inflating: twilight/10600300.txt inflating: twilight/10600566.txt inflating: twilight/10600676.txt inflating: twilight/10600792.txt inflating: twilight/10600868.txt inflating: twilight/10601348.txt inflating: twilight/10601367.txt inflating: twilight/10601482.txt inflating: twilight/10601786.txt inflating: twilight/10602048.txt inflating: twilight/10602382.txt inflating: twilight/10602676.txt inflating: twilight/10602697.txt inflating: twilight/10602788.txt inflating: twilight/10602822.txt inflating: twilight/10603050.txt inflating: twilight/10603099.txt inflating: twilight/10603119.txt inflating: twilight/10603423.txt inflating: twilight/10603438.txt inflating: twilight/10603800.txt inflating: twilight/10603842.txt inflating: twilight/10603961.txt inflating: twilight/10604332.txt inflating: twilight/10604342.txt inflating: twilight/10604632.txt inflating: twilight/10604633.txt inflating: twilight/10604662.txt inflating: twilight/10604771.txt inflating: twilight/10604944.txt inflating: twilight/10605257.txt inflating: twilight/10605627.txt inflating: twilight/10605754.txt inflating: twilight/10606009.txt inflating: twilight/10606063.txt inflating: twilight/10606134.txt inflating: twilight/10606239.txt inflating: twilight/10606349.txt inflating: twilight/10606357.txt inflating: twilight/10606404.txt inflating: twilight/10606406.txt inflating: 
twilight/10606596.txt inflating: twilight/10606710.txt inflating: twilight/10606764.txt inflating: twilight/10606787.txt inflating: twilight/10606793.txt inflating: twilight/10606808.txt inflating: twilight/10607110.txt inflating: twilight/10607388.txt inflating: twilight/10607430.txt inflating: twilight/10607571.txt inflating: twilight/10607776.txt inflating: twilight/10607787.txt inflating: twilight/10607912.txt inflating: twilight/10607918.txt inflating: twilight/10607932.txt inflating: twilight/10607972.txt inflating: twilight/10608013.txt inflating: twilight/10608017.txt inflating: twilight/10608103.txt inflating: twilight/10608139.txt inflating: twilight/10608163.txt inflating: twilight/10608220.txt inflating: twilight/10608285.txt inflating: twilight/10608534.txt inflating: twilight/10608813.txt inflating: twilight/10608939.txt inflating: twilight/10609029.txt inflating: twilight/10609365.txt inflating: twilight/10609722.txt inflating: twilight/10609791.txt inflating: twilight/10609918.txt inflating: twilight/10610013.txt inflating: twilight/10610032.txt inflating: twilight/10610179.txt inflating: twilight/10610348.txt inflating: twilight/10610356.txt inflating: twilight/10610486.txt inflating: twilight/10610852.txt inflating: twilight/10610855.txt inflating: twilight/10610917.txt inflating: twilight/10611126.txt inflating: twilight/10611205.txt inflating: twilight/10611269.txt inflating: twilight/10611278.txt inflating: twilight/10611406.txt inflating: twilight/10611504.txt inflating: twilight/10611704.txt inflating: twilight/10611856.txt inflating: twilight/10611900.txt inflating: twilight/10612025.txt inflating: twilight/10612026.txt inflating: twilight/10612073.txt inflating: twilight/10612120.txt inflating: twilight/10612223.txt inflating: twilight/10612262.txt inflating: twilight/10612396.txt inflating: twilight/10612469.txt inflating: twilight/10612522.txt inflating: twilight/10612543.txt inflating: twilight/10612604.txt inflating: 
twilight/10612720.txt inflating: twilight/10612742.txt inflating: twilight/10612987.txt inflating: twilight/10612989.txt inflating: twilight/10613463.txt inflating: twilight/10613499.txt inflating: twilight/10613520.txt inflating: twilight/10613525.txt inflating: twilight/10613572.txt inflating: twilight/10613578.txt inflating: twilight/10613584.txt inflating: twilight/10613687.txt inflating: twilight/10613710.txt inflating: twilight/10613723.txt inflating: twilight/10613937.txt inflating: twilight/10614061.txt inflating: twilight/10614126.txt inflating: twilight/10614171.txt inflating: twilight/10614312.txt inflating: twilight/10614343.txt inflating: twilight/10614353.txt inflating: twilight/10614445.txt inflating: twilight/4869815.txt inflating: twilight/5018846.txt inflating: twilight/5065602.txt inflating: twilight/5107651.txt inflating: twilight/5222864.txt inflating: twilight/5512555.txt inflating: twilight/5539671.txt inflating: twilight/5590332.txt inflating: twilight/5723175.txt inflating: twilight/5741779.txt inflating: twilight/5763619.txt inflating: twilight/5832550.txt inflating: twilight/5897650.txt inflating: twilight/5982525.txt inflating: twilight/6027176.txt inflating: twilight/6643551.txt inflating: twilight/6663669.txt inflating: twilight/6669949.txt inflating: twilight/6686525.txt inflating: twilight/6710619.txt inflating: twilight/6834202.txt inflating: twilight/6909951.txt inflating: twilight/6953804.txt inflating: twilight/7007740.txt inflating: twilight/7049842.txt inflating: twilight/7072736.txt inflating: twilight/7111096.txt inflating: twilight/7147261.txt inflating: twilight/7180599.txt inflating: twilight/7215043.txt inflating: twilight/7241648.txt inflating: twilight/7312315.txt inflating: twilight/7451302.txt inflating: twilight/7461067.txt inflating: twilight/7470526.txt inflating: twilight/7482748.txt inflating: twilight/7514383.txt inflating: twilight/7544799.txt inflating: twilight/7585406.txt inflating: twilight/7657479.txt 
inflating: twilight/7710461.txt inflating: twilight/7807895.txt inflating: twilight/7819564.txt inflating: twilight/7850576.txt inflating: twilight/7868518.txt inflating: twilight/7907513.txt inflating: twilight/7931275.txt inflating: twilight/8005615.txt inflating: twilight/8023905.txt inflating: twilight/8051177.txt inflating: twilight/8062991.txt inflating: twilight/8177305.txt inflating: twilight/8187260.txt inflating: twilight/8234864.txt inflating: twilight/8259942.txt inflating: twilight/8282455.txt inflating: twilight/8312984.txt inflating: twilight/8317307.txt inflating: twilight/8348608.txt inflating: twilight/8353504.txt inflating: twilight/8360306.txt inflating: twilight/8361607.txt inflating: twilight/8384503.txt inflating: twilight/8419554.txt inflating: twilight/8444455.txt inflating: twilight/8509717.txt inflating: twilight/8616601.txt inflating: twilight/8666106.txt inflating: twilight/8676067.txt inflating: twilight/8693686.txt inflating: twilight/8706065.txt inflating: twilight/8727994.txt inflating: twilight/8728238.txt inflating: twilight/8768557.txt inflating: twilight/8837147.txt inflating: twilight/8842965.txt inflating: twilight/8853059.txt inflating: twilight/8856038.txt inflating: twilight/8856278.txt inflating: twilight/8865881.txt inflating: twilight/8895989.txt inflating: twilight/8897114.txt inflating: twilight/8925733.txt inflating: twilight/8933287.txt inflating: twilight/8962912.txt inflating: twilight/8981010.txt inflating: twilight/8981302.txt inflating: twilight/8996197.txt inflating: twilight/8997361.txt inflating: twilight/9038159.txt inflating: twilight/9048088.txt inflating: twilight/9053802.txt inflating: twilight/9097248.txt inflating: twilight/9119196.txt inflating: twilight/9141251.txt inflating: twilight/9145261.txt inflating: twilight/9148990.txt inflating: twilight/9174762.txt inflating: twilight/9176270.txt inflating: twilight/9187817.txt inflating: twilight/9235498.txt inflating: twilight/9245557.txt inflating: 
twilight/9258328.txt inflating: twilight/9260002.txt inflating: twilight/9291120.txt inflating: twilight/9307320.txt inflating: twilight/9307324.txt inflating: twilight/9320161.txt inflating: twilight/9324745.txt inflating: twilight/9350379.txt inflating: twilight/9363528.txt inflating: twilight/9364380.txt inflating: twilight/9366781.txt inflating: twilight/9372612.txt inflating: twilight/9376361.txt inflating: twilight/9382021.txt inflating: twilight/9384370.txt inflating: twilight/9388253.txt inflating: twilight/9449313.txt inflating: twilight/9451138.txt inflating: twilight/9457516.txt inflating: twilight/9460911.txt inflating: twilight/9463656.txt inflating: twilight/9477911.txt inflating: twilight/9497589.txt inflating: twilight/9507423.txt inflating: twilight/9519349.txt inflating: twilight/9542579.txt inflating: twilight/9544658.txt inflating: twilight/9547136.txt inflating: twilight/9564350.txt inflating: twilight/9570770.txt inflating: twilight/9573360.txt inflating: twilight/9584935.txt inflating: twilight/9599527.txt inflating: twilight/9602556.txt inflating: twilight/9613741.txt inflating: twilight/9621627.txt inflating: twilight/9632720.txt inflating: twilight/9643272.txt inflating: twilight/9643999.txt inflating: twilight/9669120.txt inflating: twilight/9670479.txt inflating: twilight/9684794.txt inflating: twilight/9691507.txt inflating: twilight/9691564.txt inflating: twilight/9698577.txt inflating: twilight/9712607.txt inflating: twilight/9713348.txt inflating: twilight/9717709.txt inflating: twilight/9724618.txt inflating: twilight/9730929.txt inflating: twilight/9731152.txt inflating: twilight/9735154.txt inflating: twilight/9741363.txt inflating: twilight/9745552.txt inflating: twilight/9752376.txt inflating: twilight/9759104.txt inflating: twilight/9761533.txt inflating: twilight/9764168.txt inflating: twilight/9775624.txt inflating: twilight/9775934.txt inflating: twilight/9779069.txt inflating: twilight/9785972.txt inflating: 
twilight/9788953.txt inflating: twilight/9797824.txt inflating: twilight/9821325.txt inflating: twilight/9821671.txt inflating: twilight/9844398.txt inflating: twilight/9858083.txt inflating: twilight/9862476.txt inflating: twilight/9879884.txt inflating: twilight/9881221.txt inflating: twilight/9899121.txt inflating: twilight/9919438.txt inflating: twilight/9944397.txt inflating: twilight/9946445.txt inflating: twilight/9952305.txt inflating: twilight/9956100.txt inflating: twilight/9979704.txt inflating: twilight/9984045.txt inflating: twilight/9989843.txt inflating: twilight/9989965.txt inflating: twilight/9990417.txt inflating: twilight/9990870.txt inflating: twilight/9993318.txt
# List the extracted story files, then print one of them to inspect the raw text.
!ls twilight
!cat twilight/10016071.txt
import os


def load_fanfics(paths):
    """Read the files at *paths* into a list of fanfic records.

    Each record is a dict with two keys:

    - ``contents``: the file's text with non-ASCII bytes dropped, lower-cased.
    - ``name``: the file name without its directory or extension.

    Records are returned in the same order as *paths*.
    """
    records = []
    for path in paths:
        # Read bytes and decode explicitly: this gives the same text on
        # Python 2 and 3, and the file is closed when the block exits.
        with open(path, "rb") as handle:
            contents = handle.read().decode("ascii", "ignore").lower()
        # Derive the name from the path itself. Slicing off a fixed three
        # characters only suits a two-letter directory and would leave
        # "light/" in front of every name here.
        name = os.path.splitext(os.path.basename(path))[0]
        records.append({"contents": contents, "name": name})
    return records


# glob returns paths in arbitrary order; sort so row order is reproducible.
paths = sorted(glob.glob("twilight/*"))
fanfics = load_fanfics(paths)
# Naming the columns keeps them present even when no files were found.
fanfics_df = pd.DataFrame(fanfics, columns=["contents", "name"])
fanfics_df.head()
contents | name | |
---|---|---|
0 | what follows is the original one-shot-no edits... | light/10016071 |
1 | a/n: hey guys this is my new story hope you e... | light/10016524 |
2 | i saw him. him and bella. but who cares about ... | light/10019441 |
3 | disclaimer: all publicly recognizable characte... | light/10021891 |
4 | thou art a heartless monsterdisclaimer: all re... | light/10029139 |
# Turn every story into a TF-IDF vector (vocabulary capped at 10,000 terms,
# English stop words removed).
vectorizer = TfidfVectorizer(max_features=10000, stop_words='english')
X = vectorizer.fit_transform(fanfics_df['contents'])

number_of_clusters = 5
# NOTE(review): no random_state is given, so which cluster gets which number
# can differ from run to run.
km = KMeans(n_clusters=number_of_clusters)
km.fit(X)

print("Top terms per cluster:")
# For each centroid, term indices ordered from highest to lowest weight.
order_centroids = km.cluster_centers_.argsort()[:, ::-1]
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it.
get_terms = getattr(vectorizer, "get_feature_names_out", None) or vectorizer.get_feature_names
terms = get_terms()
for i in range(number_of_clusters):
    top_terms = [terms[ind] for ind in order_centroids[i, :8]]
    # One print call per cluster works on both Python 2 and Python 3.
    print("Cluster %d: %s" % (i, " ".join(top_terms)))
Top terms per cluster: Cluster 0: just like said eyes didn know looked bella Cluster 1: story like just know love bella edward don Cluster 2: edward alice bella jasper said carlisle just emmett Cluster 3: jacob bella edward jake seth billy like charlie Cluster 4: sam leah emily paul says just like embry
# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/tweets.csv.zip
# https://raw.githubusercontent.com/ledeprogram/courses/master/algorithms/data/tweets-ukraine.csv.zip