%matplotlib inline
import pandas as pd
import matplotlib.pyplot as plt
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)
import os
import sys
import traceback
Reading at scale:
- Martha Ballard's Diary: http://dohistory.org/diary/index.html (topic-modeled by Cameron Blevins: http://www.cameronblevins.org/posts/topic-modeling-martha-ballards-diary/)
- the Richmond Dispatch
from IPython.display import Image
Image("http://journalofdigitalhumanities.org/wp-content/uploads/2013/02/blei_lda_illustration.png")
This assumes that you are storing your data in a directory in the same place as your IPython notebook.
First, we'll need to read in plain text files.
def readtextfiles(our_directory):
    """reads in plain text files and puts them in order in a list"""
    current_dir = os.getcwd()  # better practice: build up file names using os.path.join()
    os.chdir(our_directory)
    files = [file for file in os.listdir(".") if not file.startswith('.')]  # defeat hidden files
    files = [file for file in files if not os.path.isdir(file)]  # defeat directories
    articles = []
    for file in files:
        with open(file) as plaintext:  # not doing anything about encoding; could be a problem
            lines = plaintext.readlines()
            article = " ".join(lines)  # alter lines here if you want to skip some
        articles.append(article)  # you might want to extract the file name to use; how would you do that?
    os.chdir(current_dir)
    return articles, files
our_texts, names=readtextfiles("text_examples/british-fiction-corpus")
names
['Dickens_David.txt', 'Austen_Sense.txt', 'Dickens_Bleak.txt', 'CBronte_Jane.txt', 'CBronte_Villette.txt', 'ABronte_Agnes.txt', 'Austen_Emma.txt', 'Austen_Pride.txt', 'ABronte_Tenant.txt', 'CBronte_Professor.txt', 'Dickens_Hard.txt']
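As the comments in readtextfiles hint, a version that builds full paths with os.path.join avoids changing directories altogether. A minimal sketch of that variant (the name readtextfiles_nochdir is ours, not part of the original):

def readtextfiles_nochdir(our_directory):
    """same idea as readtextfiles, but builds full paths instead of using chdir"""
    files = [f for f in os.listdir(our_directory)
             if not f.startswith('.')                                   # skip hidden files
             and not os.path.isdir(os.path.join(our_directory, f))]     # skip directories
    articles = []
    for f in files:
        with open(os.path.join(our_directory, f)) as plaintext:
            articles.append(" ".join(plaintext.readlines()))
    return articles, files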
def data_cleanse(docs_to_clean):
    import re
    D = len(docs_to_clean)
    for d in range(0, D):
        docs_to_clean[d] = docs_to_clean[d].lower()
        docs_to_clean[d] = re.sub(r'-', ' ', docs_to_clean[d])
        docs_to_clean[d] = re.sub(r'[^a-zA-Z0-9 ]', '', docs_to_clean[d])
        docs_to_clean[d] = re.sub(r' +', ' ', docs_to_clean[d])
        docs_to_clean[d] = re.sub(r'\s\w\s', ' ', docs_to_clean[d])  # eliminate single letters
    return docs_to_clean
#You'll need to modify this as necessary!
our_texts=data_cleanse(our_texts)
#more necessary when you have messy text
#e.g., you may need to eliminate escaped characters
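For instance, if escaped characters survive in scraped or OCR'd text, a small helper along these lines could run before data_cleanse (a sketch; strip_escapes is a hypothetical name, and which escapes matter depends on your source):

import re

def strip_escapes(text):
    """replace literal backslash escapes (\\n, \\t, \\r) and real
    newlines/tabs/returns with single spaces"""
    text = re.sub(r'\\[ntr]', ' ', text)   # literal two-character escape sequences
    text = re.sub(r'[\n\t\r]', ' ', text)  # actual control characters
    return text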
The vectorizer from scikit-learn
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer=TfidfVectorizer(min_df=0.5, stop_words='english', use_idf=True)  # min_df=0.5: keep only terms that appear in at least half the documents
document_term_matrix=vectorizer.fit_transform(our_texts)
# now let's get our vocabulary--the names corresponding to the columns
vocab=vectorizer.get_feature_names()
len(vocab)
7102
document_term_matrix.shape
(11, 7102)
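To get a feel for what those 7102 weights mean, here is the same vectorizer with default settings on a made-up three-document corpus (the toy names are ours): a term scores high when it is frequent within a document but rare across documents. Note that get_feature_names is the call used above; newer scikit-learn renames it get_feature_names_out.

toy = ["the cat sat on the mat",
       "the dog sat on the log",
       "cats and dogs and cats"]
toy_vectorizer = TfidfVectorizer()  # defaults: no min_df cutoff, no stoplist
toy_dtm = toy_vectorizer.fit_transform(toy)
pd.DataFrame(toy_dtm.toarray(), columns=toy_vectorizer.get_feature_names())
# "cats" appears in only one document, so it gets a high weight there;
# "the" appears everywhere, so idf pushes its weight down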
document_term_matrix_dense=document_term_matrix.toarray()
dtmdf=pd.DataFrame(document_term_matrix_dense, columns=vocab)
dtmdf
| | 18 | abandoned | abashed | abhorred | abhorrence | abide | abilities | ability | able | abode | ... | youd | youll | young | younger | youngest | youre | youth | youthful | youve | zeal |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000267 | 0.002905 | 0.001869 | 0.000000 | 0.000488 | 0.000223 | 0.002235 | 0.001219 | 0.007115 | 0.001320 | ... | 0.004916 | 0.015798 | 0.049804 | 0.003124 | 0.001788 | 0.012290 | 0.005380 | 0.004309 | 0.004146 | 0.000244 |
1 | 0.000989 | 0.000827 | 0.000000 | 0.001805 | 0.003611 | 0.000000 | 0.007444 | 0.002708 | 0.029546 | 0.003491 | ... | 0.000000 | 0.000000 | 0.066158 | 0.003854 | 0.003308 | 0.000000 | 0.004496 | 0.001519 | 0.000000 | 0.001805 |
2 | 0.000000 | 0.002917 | 0.001341 | 0.000000 | 0.000245 | 0.000000 | 0.000898 | 0.000980 | 0.012721 | 0.000758 | ... | 0.003366 | 0.013598 | 0.074407 | 0.003137 | 0.001571 | 0.021317 | 0.005750 | 0.003296 | 0.004163 | 0.000000 |
3 | 0.000000 | 0.004706 | 0.000000 | 0.003210 | 0.000000 | 0.001176 | 0.000588 | 0.000000 | 0.010506 | 0.003972 | ... | 0.002353 | 0.004321 | 0.037001 | 0.006395 | 0.000588 | 0.002941 | 0.007766 | 0.001620 | 0.005778 | 0.002568 |
4 | 0.000724 | 0.001212 | 0.001448 | 0.000661 | 0.001323 | 0.000606 | 0.000000 | 0.001323 | 0.007529 | 0.002558 | ... | 0.000606 | 0.001669 | 0.053176 | 0.000941 | 0.003636 | 0.000606 | 0.012706 | 0.002782 | 0.000000 | 0.001323 |
5 | 0.000000 | 0.000000 | 0.001862 | 0.000000 | 0.001700 | 0.000000 | 0.003115 | 0.003400 | 0.025401 | 0.011833 | ... | 0.004673 | 0.010011 | 0.081041 | 0.015724 | 0.001558 | 0.017133 | 0.003629 | 0.000000 | 0.011900 | 0.000000 |
6 | 0.000000 | 0.000000 | 0.000000 | 0.000570 | 0.000000 | 0.000522 | 0.001567 | 0.000000 | 0.029214 | 0.000441 | ... | 0.000000 | 0.000000 | 0.077904 | 0.000811 | 0.002090 | 0.000000 | 0.004463 | 0.000959 | 0.000000 | 0.002281 |
7 | 0.000910 | 0.000000 | 0.000000 | 0.000000 | 0.004986 | 0.000761 | 0.004568 | 0.000000 | 0.031930 | 0.005142 | ... | 0.000000 | 0.000699 | 0.076278 | 0.017739 | 0.009898 | 0.000000 | 0.005322 | 0.000000 | 0.000000 | 0.000000 |
8 | 0.000789 | 0.005939 | 0.000000 | 0.002161 | 0.004322 | 0.001980 | 0.000000 | 0.000000 | 0.017937 | 0.004457 | ... | 0.010559 | 0.021208 | 0.047150 | 0.004612 | 0.000000 | 0.027717 | 0.006662 | 0.001818 | 0.015846 | 0.000720 |
9 | 0.003283 | 0.001373 | 0.001641 | 0.005996 | 0.000000 | 0.001373 | 0.000000 | 0.001499 | 0.008532 | 0.006956 | ... | 0.004120 | 0.016393 | 0.061858 | 0.002133 | 0.000000 | 0.016480 | 0.012798 | 0.006305 | 0.011992 | 0.000000 |
10 | 0.000000 | 0.002337 | 0.001862 | 0.000850 | 0.000000 | 0.000779 | 0.000779 | 0.000850 | 0.006048 | 0.000000 | ... | 0.008567 | 0.025029 | 0.078025 | 0.003024 | 0.000000 | 0.018692 | 0.004839 | 0.002145 | 0.002550 | 0.000850 |
11 rows × 7102 columns
#easy to program, but let's use a robust version from sklearn!
from sklearn.metrics.pairwise import cosine_similarity
similarity=cosine_similarity(document_term_matrix)
#Note here that `cosine_similarity` can take
#an entire matrix as its argument
similarity_df=pd.DataFrame(similarity, index=names, columns=names)
similarity_df.ix[1].order(ascending=False)  # in newer pandas: .iloc[1].sort_values(ascending=False)
Austen_Sense.txt         1.000000
Austen_Pride.txt         0.828285
Austen_Emma.txt          0.801833
ABronte_Tenant.txt       0.777003
ABronte_Agnes.txt        0.750176
CBronte_Jane.txt         0.739302
CBronte_Villette.txt     0.713728
Dickens_David.txt        0.704389
Dickens_Hard.txt         0.698309
CBronte_Professor.txt    0.671603
Dickens_Bleak.txt        0.666095
Name: Austen_Sense.txt, dtype: float64
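Under the hood, the cosine similarity of two documents is just the dot product of their TF-IDF rows divided by the product of their norms. A quick check against the sklearn result:

import numpy as np

a = document_term_matrix_dense[0]  # Dickens_David
b = document_term_matrix_dense[1]  # Austen_Sense
np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))  # should equal similarity[0, 1]

(TfidfVectorizer L2-normalizes each row by default, so here the dot product alone would give the same number.)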
You've already seen hierarchical clustering.
#here's the blackbox
from sklearn.manifold import MDS
mds = MDS(n_components=2, dissimilarity="precomputed", random_state=1)
positions = mds.fit_transform(1 - similarity)  # MDS wants dissimilarities, so subtract the similarities from 1
positions.shape
(11, 2)
It's an 11 × 2 matrix: simply an (x, y) coordinate pair for each of our texts.
def plot_mds(positions, names):
    xs, ys = positions[:, 0], positions[:, 1]
    for x, y, name in zip(xs, ys, names):
        plt.scatter(x, y)
        plt.text(x, y, name)
    plt.show()
#let's plot it: I've set up a black box
plot_mds(positions,names)
names=[name.replace(".txt", "") for name in names]
plot_mds(positions,names)
What has this gotten us?
It suggests that even this crude measure of similarity captures something significant: notice, for instance, that the Austen novels end up closest to one another, just as they did in the cosine-similarity ranking above.
Note: the axes don't really mean anything
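Since the coordinates themselves are arbitrary, one sanity check is the stress value that scikit-learn's MDS exposes after fitting: a measure of how much the 2-D distances distort the input dissimilarities (lower is better; only comparable across runs on the same data).

mds.stress_  # sum of squared differences between input dissimilarities and embedded distances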
The Python package gensim
"corpora" = a collection of documents or texts
gensim likes its documents to be a list of lists of words, not a list of strings
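Concretely, the difference looks like this (toy example):

docs_as_strings = ["the cat sat", "the dog ran"]        # what TfidfVectorizer accepted
docs_for_gensim = [doc.split() for doc in docs_as_strings]
# [['the', 'cat', 'sat'], ['the', 'dog', 'ran']]        # what gensim wants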
Get the stoplist from the data directory in my GitHub.
our_texts, names=readtextfiles("text_examples/PCCIPtext")
our_texts=data_cleanse(our_texts)
#improved stoplist--may be too complete
stop = []
with open('stoplist-multilingual') as f:
    stop = f.readlines()
stop = set(word.strip('\n') for word in stop)  # a set makes the membership test below much faster
texts = [[word for word in document.lower().split() if word not in stop]
         for document in our_texts]  # gensim requires a list of lists of words
from gensim import corpora, models, similarities, matutils
"""gensim includes its own vectorizing tools"""
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
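To see what Dictionary and doc2bow are doing, here is a hypothetical mini-corpus: the dictionary maps each token to an integer id, and doc2bow turns a list of words into sparse (token id, count) pairs.

mini = [["cats", "and", "dogs"], ["dogs", "and", "dogs"]]
mini_dict = corpora.Dictionary(mini)  # assigns each distinct token an integer id
mini_dict.doc2bow(mini[1])            # sparse (token id, count) pairs
# e.g. [(0, 1), (2, 2)] -- one "and", two "dogs"; the ids depend on the dictionary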
number_topics=40
model = models.LdaMulticore(corpus, id2word=dictionary, num_topics=number_topics, passes=10)
#multicore faster but seems to crash more in my experience
# alternatively, for single core process
#model=models.LdaModel(corpus, id2word=dictionary, num_topics=number_topics, passes=10)
model.show_topics()
[u'0.012*military + 0.012*personnel + 0.008*training + 0.007*fbi + 0.007*support + 0.007*department + 0.007*counterterrorism + 0.006*navy + 0.006*costs + 0.006*activities',
 u'0.000*vought + 0.000*renical + 0.000*regi + 0.000*regul + 0.000*rei + 0.000*rej + 0.000*rel + 0.000*remi + 0.000*reoubl + 0.000*responsi',
 u'0.022*financial + 0.017*services + 0.011*systems + 0.010*service + 0.009*industry + 0.008*institutions + 0.008*market + 0.007*technology + 0.006*credit + 0.006*funds',
 u'0.004*reproduction + 0.004*prohibited + 0.004*reproduced + 0.004*owner + 0.004*copyright + 0.001*pg + 0.001*51 + 0.001*directive + 0.001*presidential + 0.001*identification',
 u'0.025*security + 0.012*systems + 0.009*national + 0.008*data + 0.007*federal + 0.007*government + 0.006*communications + 0.006*access + 0.005*policy + 0.005*key',
 u'0.020*security + 0.009*key + 0.008*privacy + 0.007*nist + 0.007*standards + 0.007*encryption + 0.006*federal + 0.006*national + 0.006*government + 0.005*1994',
 u'0.010*task + 0.007*force + 0.006*standardization + 0.005*computers + 0.004*members + 0.004*firms + 0.004*instruction + 0.003*interests + 0.003*embedded + 0.003*financial',
 u'0.011*organizations + 0.009*attacks + 0.008*financial + 0.007*market + 0.007*markets + 0.007*operations + 0.006*securities + 0.006*security + 0.006*telecommunications + 0.005*government',
 u'0.016*market + 0.015*trading + 0.014*stock + 0.013*markets + 0.013*futures + 0.012*securities + 0.009*exchange + 0.008*clearing + 0.007*options + 0.006*price',
 u'0.024*security + 0.022*supply + 0.020*chain + 0.010*container + 0.009*port + 0.007*rail + 0.007*layer + 0.006*global + 0.006*cargo + 0.006*shipping']
topics_indexed=[[b for (a,b) in topics] for topics in model.show_topics(number_topics,10,formatted=False)]
topics_indexed=pd.DataFrame(topics_indexed)
topics_indexed
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|---|
0 | dod | security | homeland | defense | guard | national | study | force | support | missions |
1 | infrastructure | national | critical | infrastructures | industry | gas | oil | natural | risk | government |
2 | security | systems | national | data | federal | government | communications | access | policy | key |
3 | federal | pki | agencies | gao | agency | government | key | plans | security | emergency |
4 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
5 | security | aviation | healthcare | public | health | hospitals | industry | aircraft | bioterrorism | attack |
6 | security | supply | chain | container | port | rail | layer | global | cargo | shipping |
7 | network | networks | technology | internet | applications | data | services | high | packet | testbeds |
8 | software | miesau | embedded | weapons | fa | handlers | 18 | officials | problems | costs |
9 | reproduction | prohibited | reproduced | owner | copyright | pg | 51 | directive | presidential | identification |
10 | defense | bmd | soviet | nuclear | space | systems | weapons | strategic | software | missile |
11 | market | trading | stock | markets | futures | securities | exchange | clearing | options | price |
12 | military | personnel | training | fbi | support | department | counterterrorism | navy | costs | activities |
13 | terrorism | biological | food | 1999 | national | federal | bioterrorism | chemical | weapons | agriculture |
14 | task | force | standardization | computers | members | firms | instruction | interests | embedded | financial |
15 | technology | defense | industry | government | software | program | military | acquisition | programs | development |
16 | federal | data | agencies | government | systems | technology | privacy | agency | security | public |
17 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
18 | security | key | privacy | nist | standards | encryption | federal | national | government | 1994 |
19 | stockpile | dod | materials | national | material | assumptions | sensitivity | percent | data | year |
20 | organizations | attacks | financial | market | markets | operations | securities | security | telecommunications | government |
21 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
22 | financial | services | systems | service | industry | institutions | market | technology | credit | funds |
23 | secretary | policy | department | national | affairs | security | issues | staff | office | international |
24 | security | infrastructure | critical | sector | national | federal | government | protection | private | homeland |
25 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
26 | insurance | catastrophe | insurers | risk | terrorism | coverage | losses | reinsurance | companies | industry |
27 | cyber | security | government | national | 6633 | sjud4 | systems | 09 | po | psn |
28 | navy | plan | consolidation | technology | consolidated | report | cost | personnel | itf | impact |
29 | air | force | systems | equipment | warfare | electronic | test | gao | tactical | report |
30 | nder | fema | members | program | agencies | training | data | office | units | enclosurei |
31 | electronic | surveillance | law | service | communication | systems | enforcement | data | communications | technology |
32 | intelligence | national | dod | defense | foreign | production | program | support | security | military |
33 | industrial | planning | defense | program | production | preparedness | base | dod | mobilization | requirements |
34 | rwr | navy | alr | commonality | rwrs | testing | programs | 167 | common | production |
35 | defense | production | base | future | military | industrial | dtib | technology | government | manufacturing |
36 | internet | law | enforcement | government | privacy | crime | industry | security | 2000 | 11 |
37 | materials | hazardous | transportation | training | regulations | dot | federal | response | emergency | local |
38 | sharing | private | sector | government | telecommunications | barriers | military | ncc | foia | material |
39 | classified | agreement | security | agreements | nondisclosure | united | form | access | agency | employees |
So which topics are most significant for each document? Pass a bag-of-words version of each document to the model; it returns (topic id, proportion) pairs for the topics above a minimal probability.
model[dictionary.doc2bow(texts[1])]
[(0, 0.050292287075779665), (10, 0.013742397808748223), (15, 0.86210039114844894), (20, 0.02102820792464356), (32, 0.016294480609844526), (35, 0.019644289396883754)]
Let's find them for every document--with a list comprehension, of course
primarytopics=[model[dictionary.doc2bow(text)] for text in texts]
Make it pretty with another list comprehension, densifying each sparse topic vector with `sparse2full`:
import numpy as np
primarytopics_matrix=pd.DataFrame(np.matrix([matutils.sparse2full(primarytopic, number_topics) for primarytopic in primarytopics]))
primarytopics_matrix.ix[18].plot(kind="bar")
<matplotlib.axes._subplots.AxesSubplot at 0x7ff30b6d90d0>
primarytopics_matrix.ix[18].plot(kind="bar", title=names[18])
<matplotlib.axes._subplots.AxesSubplot at 0x7ff2ff9ac890>
topics_indexed.ix[[6, 21, 27, 38]]
| | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 |
|---|---|---|---|---|---|---|---|---|---|---|
6 | security | supply | chain | container | port | rail | layer | global | cargo | shipping |
21 | vought | renical | regi | regul | rei | rej | rel | remi | reoubl | responsi |
27 | cyber | security | government | national | 6633 | sjud4 | systems | 09 | po | psn |
38 | sharing | private | sector | government | telecommunications | barriers | military | ncc | foia | material |
topics_indexed.ix[27]
0         cyber
1      security
2    government
3      national
4          6633
5         sjud4
6       systems
7            09
8            po
9           psn
Name: 27, dtype: object
primarytopics_matrix[27].plot(kind="bar")
<matplotlib.axes._subplots.AxesSubplot at 0x7ff2ff8b75d0>
primarytopics_matrix[17].plot(kind="bar", title=str(topics_indexed.ix[17]))
<matplotlib.axes._subplots.AxesSubplot at 0x7ff2ff7dcdd0>
#that's ugly!
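One way to tidy it up (a sketch): join the topic's ten words into a single-line string for the title, rather than passing the whole Series repr.

title_words = " ".join(topics_indexed.ix[17])  # e.g. "vought renical regi ..."
primarytopics_matrix[17].plot(kind="bar", title=title_words)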