cd /home/anshul/data/text
/home/anshul/data/text
import glob
# Show the name of every .txt file in the current working directory.
for files in glob.glob("*.txt"):
    print(files)
austen-sense.txt austen-persuasion.txt bible-kjv.txt austen-emma.txt
def textReader(fname):
    """Return the contents of a UTF-8 text file as one line of text.

    The file is opened in binary mode; every line is decoded as UTF-8,
    stripped of leading/trailing whitespace, and the pieces are joined with
    single spaces (so a blank line shows up as two consecutive spaces).

    Args:
        fname: Path of the file to read.

    Returns:
        The decoded text with all line breaks replaced by spaces. An empty
        file yields an empty string.

    Raises:
        IOError / OSError: If the file cannot be opened.
        UnicodeDecodeError: If a line is not valid UTF-8.
    """
    # The context manager closes the handle even if decoding fails; iterating
    # the handle avoids materialising every line up front via readlines().
    with open(fname, 'rb') as doc:
        return ' '.join(line.decode('utf-8').strip() for line in doc)
# Load each .txt file in the working directory as one long string, in the
# order glob returns them.
documents = [textReader(path) for path in glob.glob("*.txt")]
from nltk.tokenize import RegexpTokenizer
# Tokenise on the regex \w+ so that each token is a run of word characters;
# punctuation and whitespace are discarded.
nltk_tokenizer = RegexpTokenizer(r'\w+')
# Slice assignment keeps the same list object, exactly like the per-index
# assignment it replaces.
documents[:] = [nltk_tokenizer.tokenize(raw_text) for raw_text in documents]
from nltk.corpus import stopwords
# Build the stop-word lookup once; a set makes each membership test O(1).
stop_words = set(stopwords.words("english"))
# The NLTK English list is lowercase and the tokens have not been lowercased
# yet at this stage, so compare on the lowercased token. Otherwise
# capitalised stop words ("The", "His", "This") slip through and show up
# later as stems such as "hi" and "thi". The token itself is kept unchanged.
documents[:] = [
    [w for w in document if w.lower() not in stop_words]
    for document in documents
]
from nltk.stem import PorterStemmer
# Reduce every token to its lowercased Porter stem and, in the same pass,
# collect all stems of all documents into one flat list.
ps = PorterStemmer()
all_words = []
for index, document in enumerate(documents):
    # A list comprehension always produces a real list; map() only does so on
    # Python 2, and its Python 3 iterator cannot be appended to a list with +.
    documents[index] = [ps.stem(token).lower() for token in document]
    # extend() grows the accumulator in place instead of copying everything
    # gathered so far on each iteration.
    all_words.extend(documents[index])
# Dump the processed token list of each document, one list per print call.
for token_list in documents:
    print(token_list)
[u'sen', u'sensibl', u'jane', u'austen', u'1811', u'chapter', u'1', u'famili', u'dashwood', u'long', u'settl', u'sussex', u'estat', u'larg', u'resid', u'norland', u'park', u'centr', u'properti', u'mani', u'gener', u'live', u'respect', u'manner', u'engag', u'gener', u'good', u'opinion', u'surround', u'acquaint', u'late', u'owner', u'estat', u'singl', u'man', u'live', u'advanc', u'age', u'mani', u'year', u'life', u'constant', u'companion', u'housekeep', u'sister', u'death', u'happen', u'ten', u'year', u'produc', u'great', u'alter', u'home', u'suppli', u'loss', u'invit', u'receiv', u'hou', u'famili', u'nephew', u'mr', u'henri', u'dashwood', u'legal', u'inheritor', u'norland', u'estat', u'person', u'intend', u'bequeath', u'societi', u'nephew', u'niec', u'children', u'old', u'gentleman', u'day', u'comfort', u'spent', u'hi', u'attach', u'increa', u'constant', u'attent', u'mr', u'mr', u'henri', u'dashwood', u'wish', u'proceed', u'mere', u'interest', u'good', u'heart', u'gave', u'everi', u'degr', u'solid', u'comfort', u'age', u'could', u'receiv', u'cheer', u'children', u'ad', u'relish', u'exist', u'former', u'marriag', u'mr', u'henri', u'dashwood', u'one', u'son', u'present', u'ladi', u'three', u'daughter', u'son', u'steadi', u'respect', u'young', u'man', u'ampli', u'provid', u'fortun', u'mother', u'larg', u'half', u'devolv', u'come', u'age', u'marriag', u'likewi', u'happen', u'soon', u'afterward', u'ad', u'wealth', u'therefor', u'success', u'norland', u'estat', u'realli', u'import', u'sister', u'fortun', u'independ', u'might', u'ari', u'father', u'inherit', u'properti', u'could', u'small', u'mother', u'noth', u'father', u'seven', u'thousand', u'pound', u'dispo', u'remain', u'moieti', u'first', u'wife', u'fortun', u'also', u'secur', u'child', u'life', u'interest'] [u'persua', u'jane', u'austen', u'1818', u'chapter', u'1', u'sir', u'walter', u'elliot', u'kellynch', u'hall', u'somersetshir', u'man', u'amu', u'never', u'took', u'book', u'baronetag', u'found', u'occup', 
u'idl', u'hour', u'consol', u'distress', u'one', u'faculti', u'rou', u'admir', u'respect', u'contempl', u'limit', u'remnant', u'earliest', u'patent', u'unwelcom', u'sensat', u'ari', u'domest', u'affair', u'chang', u'natur', u'piti', u'contempt', u'turn', u'almost', u'endless', u'creation', u'last', u'centuri', u'everi', u'leaf', u'powerless', u'could', u'read', u'histori', u'interest', u'never', u'fail', u'thi', u'page', u'favourit', u'volum', u'alway', u'open', u'elliot', u'kellynch', u'hall', u'walter', u'elliot', u'born', u'march', u'1', u'1760', u'marri', u'juli', u'15', u'1784', u'elizabeth', u'daughter', u'jame', u'stevenson', u'esq', u'south', u'park', u'counti', u'gloucest', u'ladi', u'die', u'1800', u'issu', u'elizabeth', u'born', u'june', u'1', u'1785', u'ann', u'born', u'august', u'9', u'1787', u'still', u'born', u'son', u'novemb', u'5', u'1789', u'mari', u'born', u'novemb', u'20', u'1791', u'preci', u'paragraph', u'origin', u'stood', u'printer', u'hand', u'sir', u'walter', u'improv', u'ad', u'inform', u'famili', u'word', u'date', u'mari', u'birth', u'marri', u'decemb', u'16', u'1810', u'charl', u'son', u'heir', u'charl', u'musgrov', u'esq', u'uppercross', u'counti', u'somerset', u'insert', u'accur', u'day', u'month', u'lost', u'wife', u'follow', u'histori', u'rise', u'ancient', u'respect', u'famili', u'usual', u'term', u'first', u'settl', u'cheshir', u'mention', u'dugdal', u'serv', u'offic', u'high', u'sheriff', u'repr', u'borough', u'three', u'success', u'parliament', u'exert', u'loyalti', u'digniti', u'baronet', u'first', u'year', u'charl', u'ii', u'mari', u'elizabeth', u'marri', u'form', u'altogeth', u'two', u'handsom', u'duodecimo', u'page', u'conclud', u'arm', u'motto', u'princip', u'seat', u'kellynch', u'hall', u'counti', u'somerset', u'sir', u'walter', u'handwrit', u'final', u'heir', u'presumpt', u'william', u'walter', u'elliot', u'esq', u'great', u'grandson', u'second', u'sir', u'walter'] [u'king', u'jame', u'bibl', u'old', u'testament', 
u'king', u'jame', u'bibl', u'first', u'book', u'mose', u'call', u'genesi', u'1', u'1', u'begin', u'god', u'creat', u'heaven', u'earth', u'1', u'2', u'earth', u'without', u'form', u'void', u'dark', u'upon', u'face', u'deep', u'spirit', u'god', u'move', u'upon', u'face', u'water', u'1', u'3', u'god', u'said', u'let', u'light', u'light', u'1', u'4', u'god', u'saw', u'light', u'good', u'god', u'divid', u'light', u'dark', u'1', u'5', u'god', u'call', u'light', u'day', u'dark', u'call', u'night', u'even', u'morn', u'first', u'day', u'1', u'6', u'god', u'said', u'let', u'firmament', u'midst', u'water', u'let', u'divid', u'water', u'water', u'1', u'7', u'god', u'made', u'firmament', u'divid', u'water', u'firmament', u'water', u'firmament', u'1', u'8', u'god', u'call', u'firmament', u'heaven', u'even', u'morn', u'second', u'day', u'1', u'9', u'god', u'said', u'let', u'water', u'heaven', u'gather', u'togeth', u'unto', u'one', u'place', u'let', u'dri', u'land', u'appear', u'1', u'10', u'god', u'call', u'dri', u'land', u'earth', u'gather', u'togeth', u'water', u'call', u'sea', u'god', u'saw', u'good', u'1', u'11', u'god', u'said', u'let', u'earth', u'bring', u'forth', u'grass', u'herb', u'yield', u'seed', u'fruit', u'tree', u'yield', u'fruit', u'kind', u'whose', u'seed', u'upon', u'earth', u'1', u'12', u'earth', u'brought', u'forth', u'grass', u'herb', u'yield', u'seed', u'kind', u'tree', u'yield', u'fruit', u'whose', u'seed', u'kind', u'god', u'saw', u'good', u'1', u'13', u'even', u'morn', u'third', u'day', u'1', u'14', u'god', u'said', u'let', u'light', u'firmament', u'heaven', u'divid', u'day', u'night', u'let', u'sign', u'season', u'day', u'year', u'1', u'15', u'let', u'light', u'firmament', u'heaven', u'give', u'light', u'upon', u'earth', u'1', u'16', u'god', u'made', u'two', u'great', u'light', u'greater', u'light', u'rule', u'day', u'lesser', u'light', u'rule', u'night', u'made', u'star', u'also', u'1', u'17', u'god', u'set', u'firmament', u'heaven', u'give', u'light', 
u'upon', u'earth', u'1', u'18', u'rule', u'day', u'night', u'divid', u'light', u'dark', u'god', u'saw', u'good', u'1', u'19', u'even', u'morn', u'fourth', u'day'] [u'emma', u'jane', u'austen', u'1816', u'volum', u'chapter', u'emma', u'woodhou', u'handsom', u'clever', u'rich', u'comfort', u'home', u'happi', u'disposit', u'seem', u'unit', u'best', u'bless', u'exist', u'live', u'nearli', u'twenti', u'one', u'year', u'world', u'littl', u'distress', u'vex', u'youngest', u'two', u'daughter', u'affect', u'indulg', u'father', u'consequ', u'sister', u'marriag', u'mistress', u'hou', u'earli', u'period', u'mother', u'die', u'long', u'ago', u'indistinct', u'remembr', u'caress', u'place', u'suppli', u'excel', u'woman', u'gover', u'fallen', u'littl', u'short', u'mother', u'affect', u'sixteen', u'year', u'miss', u'taylor', u'mr', u'woodhou', u'famili', u'less', u'gover', u'friend', u'fond', u'daughter', u'particularli', u'emma', u'_them_', u'intimaci', u'sister', u'even', u'miss', u'taylor', u'cea', u'hold', u'nomin', u'offic', u'gover', u'mild', u'temper', u'hardli', u'allow', u'impo', u'restraint', u'shadow', u'author', u'long', u'pass', u'away', u'live', u'togeth', u'friend', u'friend', u'mutual', u'attach', u'emma', u'like', u'highli', u'esteem', u'miss', u'taylor', u'judgment', u'direct', u'chiefli', u'real', u'evil', u'ind', u'emma', u'situat', u'power', u'rather', u'much', u'way', u'disposit', u'think', u'littl', u'well', u'disadvantag', u'threaten', u'alloy', u'mani', u'enjoy', u'danger', u'howev', u'present', u'unperceiv', u'mean', u'rank', u'misfortun', u'sorrow', u'came', u'gentl', u'sorrow', u'shape', u'disagr', u'consciou', u'miss', u'taylor', u'marri', u'miss', u'taylor', u'loss', u'first', u'brought', u'grief', u'wed', u'day', u'belov', u'friend', u'emma', u'first', u'sat', u'mourn', u'thought', u'continu', u'wed', u'bride', u'peopl', u'gone', u'father', u'left', u'dine', u'togeth', u'prospect', u'third', u'cheer', u'long', u'even', u'father', u'compo', u'sleep', 
u'dinner', u'usual', u'sit', u'think', u'lost']
import nltk
# Rebind all_words from the flat stem list to a frequency distribution over
# those stems; the code that follows uses it as a FreqDist.
all_words = nltk.FreqDist(all_words)
# Show the ten most frequent stems with their counts.
print(all_words.most_common(10))
[(u'1', 24), (u'god', 18), (u'light', 13), (u'day', 12), (u'let', 9), (u'water', 8), (u'earth', 8), (u'firmament', 8), (u'first', 7), (u'heaven', 6)]
# Plot the frequency distribution, limited to 20 samples.
all_words.plot(20)
# Image is used here and again further down to display PNG files.
from IPython.display import Image
# Display a PNG from a hard-coded absolute path.
# NOTE(review): presumably a saved copy of the plot above; nothing visible
# here writes this file, so confirm where it comes from.
Image(filename='/home/anshul/plot.png')
# Number of most-frequent stems to keep as features (matrix columns).
col = 100
# Keep only the stems, dropping the counts, ordered from most to least frequent.
word_features = [word for word, _count in all_words.most_common(col)]
import numpy as np
import pandas as pd
def find_Features(document, vocabulary=None):
    """Count how often each vocabulary word occurs in a tokenised document.

    Args:
        document: Iterable of hashable tokens (the pipeline passes a list of
            stemmed words).
        vocabulary: Sequence of words to count. Defaults to the module-level
            ``word_features`` list, which is what existing callers rely on.

    Returns:
        A list of integer counts, one per vocabulary word and in vocabulary
        order; words absent from the document get 0.
    """
    from collections import Counter

    if vocabulary is None:
        vocabulary = word_features
    # Tally the document once instead of rescanning it with list.count() for
    # every vocabulary word.
    counts = Counter(document)
    # A Counter returns 0 for missing keys, matching list.count() semantics.
    return [counts[w] for w in vocabulary]
def termDocumentMatrix(documents):
    """Build a document-by-term count matrix over ``word_features``.

    Args:
        documents: Sequence of tokenised documents (each an iterable of
            words).

    Returns:
        A pandas DataFrame with one row per document, in input order, and one
        column per word in the module-level ``word_features`` list. Each cell
        holds that word's count in that document, stored as a float because
        the matrix is allocated with ``np.zeros``.
    """
    # Size the matrix from the vocabulary actually in use rather than the
    # global `col`: when the corpus has fewer than `col` distinct words the
    # two differ and the row assignment / column labelling would fail.
    feature_sets = np.zeros((len(documents), len(word_features)))
    for row, document in enumerate(documents):
        # list() hands find_Features a concrete list, as the original did.
        feature_sets[row, :] = find_Features(list(document))
    return pd.DataFrame(feature_sets, columns=word_features)
# Term-document matrix for the whole corpus: rows are documents, columns are
# the selected feature words.
tdm = termDocumentMatrix(documents)
print(tdm)
1 god light day let water earth firmament first heaven ... \ 0 1 0 0 1 0 0 0 0 1 0 ... 1 3 0 0 1 0 0 0 0 2 0 ... 2 20 18 13 9 9 8 8 8 2 6 ... 3 0 0 0 1 0 0 0 0 2 0 ... woodhou histori give hou third offic happen forth place think 0 0 0 0 1 0 0 2 0 0 0 1 0 2 0 0 0 1 0 0 0 0 2 0 0 2 0 1 0 0 2 1 0 3 2 0 0 1 1 1 0 0 1 2 [4 rows x 100 columns]
# Correlation coefficients between terms: transposing the matrix makes each
# term a row, and np.corrcoef treats rows as the variables to correlate.
term_correlations = np.corrcoef(tdm.T)
# Label both axes with the feature words so terms can be looked up by name.
similarity_matrix = pd.DataFrame(
    term_correlations,
    index=word_features,
    columns=word_features,
)
print(similarity_matrix)
1 god light day let water \ 1 1.000000 0.991189 0.991189 0.991189 0.991189 0.991189 god 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 light 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 day 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 let 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 water 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 earth 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 firmament 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 first 0.353996 0.333333 0.333333 0.333333 0.333333 0.333333 heaven 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 even 0.813421 0.870388 0.870388 0.870388 0.870388 0.870388 call 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 good 0.850395 0.870388 0.870388 0.870388 0.870388 0.870388 year -0.674453 -0.577350 -0.577350 -0.577350 -0.577350 -0.577350 walter -0.212398 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 emma -0.424795 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 father -0.660793 -0.555556 -0.555556 -0.555556 -0.555556 -0.555556 famili -0.813421 -0.870388 -0.870388 -0.870388 -0.870388 -0.870388 mr -0.486215 -0.440225 -0.440225 -0.440225 -0.440225 -0.440225 divid 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 miss -0.424795 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 said 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 upon 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 born -0.212398 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 taylor -0.424795 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 marri -0.375470 -0.471405 -0.471405 -0.471405 -0.471405 -0.471405 daughter -0.867110 -0.816497 -0.816497 -0.816497 -0.816497 -0.816497 live -0.674453 -0.577350 -0.577350 -0.577350 -0.577350 -0.577350 sir -0.212398 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 dashwood -0.353996 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 ... ... ... ... ... ... ... 
austen -0.991189 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 jane -0.991189 -1.000000 -1.000000 -1.000000 -1.000000 -1.000000 made 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 ad -0.480658 -0.522233 -0.522233 -0.522233 -0.522233 -0.522233 age -0.353996 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 rule 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 children -0.353996 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 whose 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 everi -0.490511 -0.577350 -0.577350 -0.577350 -0.577350 -0.577350 volum -0.551825 -0.577350 -0.577350 -0.577350 -0.577350 -0.577350 affect -0.424795 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 die -0.551825 -0.577350 -0.577350 -0.577350 -0.577350 -0.577350 second 0.674453 0.577350 0.577350 0.577350 0.577350 0.577350 nephew -0.353996 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 herb 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 gener -0.353996 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 never -0.212398 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 dri 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 brought 0.490511 0.577350 0.577350 0.577350 0.577350 0.577350 heir -0.212398 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 woodhou -0.424795 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 histori -0.212398 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 give 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 hou -0.674453 -0.577350 -0.577350 -0.577350 -0.577350 -0.577350 third 0.490511 0.577350 0.577350 0.577350 0.577350 0.577350 offic -0.551825 -0.577350 -0.577350 -0.577350 -0.577350 -0.577350 happen -0.353996 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 forth 0.991189 1.000000 1.000000 1.000000 1.000000 1.000000 place 0.490511 0.577350 0.577350 0.577350 0.577350 0.577350 think -0.424795 -0.333333 -0.333333 -0.333333 -0.333333 -0.333333 earth firmament first heaven ... woodhou \ 1 0.991189 0.991189 0.353996 0.991189 ... 
-0.424795 god 1.000000 1.000000 0.333333 1.000000 ... -0.333333 light 1.000000 1.000000 0.333333 1.000000 ... -0.333333 day 1.000000 1.000000 0.333333 1.000000 ... -0.333333 let 1.000000 1.000000 0.333333 1.000000 ... -0.333333 water 1.000000 1.000000 0.333333 1.000000 ... -0.333333 earth 1.000000 1.000000 0.333333 1.000000 ... -0.333333 firmament 1.000000 1.000000 0.333333 1.000000 ... -0.333333 first 0.333333 0.333333 1.000000 0.333333 ... 0.333333 heaven 1.000000 1.000000 0.333333 1.000000 ... -0.333333 even 0.870388 0.870388 0.522233 0.870388 ... 0.174078 call 1.000000 1.000000 0.333333 1.000000 ... -0.333333 good 0.870388 0.870388 -0.174078 0.870388 ... -0.522233 year -0.577350 -0.577350 -0.577350 -0.577350 ... 0.577350 walter -0.333333 -0.333333 0.333333 -0.333333 ... -0.333333 emma -0.333333 -0.333333 0.333333 -0.333333 ... 1.000000 father -0.555556 -0.555556 -0.333333 -0.555556 ... 0.777778 famili -0.870388 -0.870388 -0.522233 -0.870388 ... -0.174078 mr -0.440225 -0.440225 -0.968496 -0.440225 ... -0.088045 divid 1.000000 1.000000 0.333333 1.000000 ... -0.333333 miss -0.333333 -0.333333 0.333333 -0.333333 ... 1.000000 said 1.000000 1.000000 0.333333 1.000000 ... -0.333333 upon 1.000000 1.000000 0.333333 1.000000 ... -0.333333 born -0.333333 -0.333333 0.333333 -0.333333 ... -0.333333 taylor -0.333333 -0.333333 0.333333 -0.333333 ... 1.000000 marri -0.471405 -0.471405 0.471405 -0.471405 ... 0.000000 daughter -0.816497 -0.816497 0.000000 -0.816497 ... 0.816497 live -0.577350 -0.577350 -0.577350 -0.577350 ... 0.577350 sir -0.333333 -0.333333 0.333333 -0.333333 ... -0.333333 dashwood -0.333333 -0.333333 -1.000000 -0.333333 ... -0.333333 ... ... ... ... ... ... ... austen -1.000000 -1.000000 -0.333333 -1.000000 ... 0.333333 jane -1.000000 -1.000000 -0.333333 -1.000000 ... 0.333333 made 1.000000 1.000000 0.333333 1.000000 ... -0.333333 ad -0.522233 -0.522233 -0.870388 -0.522233 ... -0.522233 age -0.333333 -0.333333 -1.000000 -0.333333 ... 
-0.333333 rule 1.000000 1.000000 0.333333 1.000000 ... -0.333333 children -0.333333 -0.333333 -1.000000 -0.333333 ... -0.333333 whose 1.000000 1.000000 0.333333 1.000000 ... -0.333333 everi -0.577350 -0.577350 -0.577350 -0.577350 ... -0.577350 volum -0.577350 -0.577350 0.577350 -0.577350 ... 0.577350 affect -0.333333 -0.333333 0.333333 -0.333333 ... 1.000000 die -0.577350 -0.577350 0.577350 -0.577350 ... 0.577350 second 0.577350 0.577350 0.577350 0.577350 ... -0.577350 nephew -0.333333 -0.333333 -1.000000 -0.333333 ... -0.333333 herb 1.000000 1.000000 0.333333 1.000000 ... -0.333333 gener -0.333333 -0.333333 -1.000000 -0.333333 ... -0.333333 never -0.333333 -0.333333 0.333333 -0.333333 ... -0.333333 dri 1.000000 1.000000 0.333333 1.000000 ... -0.333333 brought 0.577350 0.577350 0.577350 0.577350 ... 0.577350 heir -0.333333 -0.333333 0.333333 -0.333333 ... -0.333333 woodhou -0.333333 -0.333333 0.333333 -0.333333 ... 1.000000 histori -0.333333 -0.333333 0.333333 -0.333333 ... -0.333333 give 1.000000 1.000000 0.333333 1.000000 ... -0.333333 hou -0.577350 -0.577350 -0.577350 -0.577350 ... 0.577350 third 0.577350 0.577350 0.577350 0.577350 ... 0.577350 offic -0.577350 -0.577350 0.577350 -0.577350 ... 0.577350 happen -0.333333 -0.333333 -1.000000 -0.333333 ... -0.333333 forth 1.000000 1.000000 0.333333 1.000000 ... -0.333333 place 0.577350 0.577350 0.577350 0.577350 ... 0.577350 think -0.333333 -0.333333 0.333333 -0.333333 ... 
1.000000 histori give hou third offic happen \ 1 -0.212398 0.991189 -0.674453 0.490511 -0.551825 -0.353996 god -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 light -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 day -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 let -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 water -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 earth -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 firmament -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 first 0.333333 0.333333 -0.577350 0.577350 0.577350 -1.000000 heaven -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 even -0.522233 0.870388 -0.301511 0.904534 -0.301511 -0.522233 call -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 good -0.522233 0.870388 -0.301511 0.301511 -0.904534 0.174078 year -0.577350 -0.577350 1.000000 0.000000 0.000000 0.577350 walter 1.000000 -0.333333 -0.577350 -0.577350 0.577350 -0.333333 emma -0.333333 -0.333333 0.577350 0.577350 0.577350 -0.333333 father -0.555556 -0.555556 0.962250 0.192450 0.192450 0.333333 famili 0.522233 -0.870388 0.301511 -0.904534 0.301511 0.522233 mr -0.440225 -0.440225 0.762493 -0.457496 -0.457496 0.968496 divid -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 miss -0.333333 -0.333333 0.577350 0.577350 0.577350 -0.333333 said -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 upon -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 born 1.000000 -0.333333 -0.577350 -0.577350 0.577350 -0.333333 taylor -0.333333 -0.333333 0.577350 0.577350 0.577350 -0.333333 marri 0.942809 -0.471405 -0.408248 -0.408248 0.816497 -0.471405 daughter 0.000000 -0.816497 0.707107 0.000000 0.707107 0.000000 live -0.577350 -0.577350 1.000000 0.000000 0.000000 0.577350 sir 1.000000 -0.333333 -0.577350 -0.577350 0.577350 -0.333333 dashwood -0.333333 -0.333333 0.577350 -0.577350 -0.577350 1.000000 ... ... ... ... ... ... ... 
austen 0.333333 -1.000000 0.577350 -0.577350 0.577350 0.333333 jane 0.333333 -1.000000 0.577350 -0.577350 0.577350 0.333333 made -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 ad 0.174078 -0.522233 0.301511 -0.904534 -0.301511 0.870388 age -0.333333 -0.333333 0.577350 -0.577350 -0.577350 1.000000 rule -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 children -0.333333 -0.333333 0.577350 -0.577350 -0.577350 1.000000 whose -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 everi 0.577350 -0.577350 0.000000 -1.000000 0.000000 0.577350 volum 0.577350 -0.577350 0.000000 0.000000 1.000000 -0.577350 affect -0.333333 -0.333333 0.577350 0.577350 0.577350 -0.333333 die 0.577350 -0.577350 0.000000 0.000000 1.000000 -0.577350 second 0.577350 0.577350 -1.000000 0.000000 0.000000 -0.577350 nephew -0.333333 -0.333333 0.577350 -0.577350 -0.577350 1.000000 herb -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 gener -0.333333 -0.333333 0.577350 -0.577350 -0.577350 1.000000 never 1.000000 -0.333333 -0.577350 -0.577350 0.577350 -0.333333 dri -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 brought -0.577350 0.577350 0.000000 1.000000 0.000000 -0.577350 heir 1.000000 -0.333333 -0.577350 -0.577350 0.577350 -0.333333 woodhou -0.333333 -0.333333 0.577350 0.577350 0.577350 -0.333333 histori 1.000000 -0.333333 -0.577350 -0.577350 0.577350 -0.333333 give -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 hou -0.577350 -0.577350 1.000000 0.000000 0.000000 0.577350 third -0.577350 0.577350 0.000000 1.000000 0.000000 -0.577350 offic 0.577350 -0.577350 0.000000 0.000000 1.000000 -0.577350 happen -0.333333 -0.333333 0.577350 -0.577350 -0.577350 1.000000 forth -0.333333 1.000000 -0.577350 0.577350 -0.577350 -0.333333 place -0.577350 0.577350 0.000000 1.000000 0.000000 -0.577350 think -0.333333 -0.333333 0.577350 0.577350 0.577350 -0.333333 forth place think 1 0.991189 0.490511 -0.424795 god 1.000000 0.577350 -0.333333 light 1.000000 0.577350 
-0.333333 day 1.000000 0.577350 -0.333333 let 1.000000 0.577350 -0.333333 water 1.000000 0.577350 -0.333333 earth 1.000000 0.577350 -0.333333 firmament 1.000000 0.577350 -0.333333 first 0.333333 0.577350 0.333333 heaven 1.000000 0.577350 -0.333333 even 0.870388 0.904534 0.174078 call 1.000000 0.577350 -0.333333 good 0.870388 0.301511 -0.522233 year -0.577350 0.000000 0.577350 walter -0.333333 -0.577350 -0.333333 emma -0.333333 0.577350 1.000000 father -0.555556 0.192450 0.777778 famili -0.870388 -0.904534 -0.174078 mr -0.440225 -0.457496 -0.088045 divid 1.000000 0.577350 -0.333333 miss -0.333333 0.577350 1.000000 said 1.000000 0.577350 -0.333333 upon 1.000000 0.577350 -0.333333 born -0.333333 -0.577350 -0.333333 taylor -0.333333 0.577350 1.000000 marri -0.471405 -0.408248 0.000000 daughter -0.816497 0.000000 0.816497 live -0.577350 0.000000 0.577350 sir -0.333333 -0.577350 -0.333333 dashwood -0.333333 -0.577350 -0.333333 ... ... ... ... austen -1.000000 -0.577350 0.333333 jane -1.000000 -0.577350 0.333333 made 1.000000 0.577350 -0.333333 ad -0.522233 -0.904534 -0.522233 age -0.333333 -0.577350 -0.333333 rule 1.000000 0.577350 -0.333333 children -0.333333 -0.577350 -0.333333 whose 1.000000 0.577350 -0.333333 everi -0.577350 -1.000000 -0.577350 volum -0.577350 0.000000 0.577350 affect -0.333333 0.577350 1.000000 die -0.577350 0.000000 0.577350 second 0.577350 0.000000 -0.577350 nephew -0.333333 -0.577350 -0.333333 herb 1.000000 0.577350 -0.333333 gener -0.333333 -0.577350 -0.333333 never -0.333333 -0.577350 -0.333333 dri 1.000000 0.577350 -0.333333 brought 0.577350 1.000000 0.577350 heir -0.333333 -0.577350 -0.333333 woodhou -0.333333 0.577350 1.000000 histori -0.333333 -0.577350 -0.333333 give 1.000000 0.577350 -0.333333 hou -0.577350 0.000000 0.577350 third 0.577350 1.000000 0.577350 offic -0.577350 0.000000 0.577350 happen -0.333333 -0.577350 -0.333333 forth 1.000000 0.577350 -0.333333 place 0.577350 1.000000 0.577350 think -0.333333 0.577350 1.000000 [100 rows x 
100 columns]
def association(word_list, corr_coef):
    """Print the terms most strongly correlated with each given word.

    For every word in ``word_list`` the word itself is printed, followed by
    the entries of its column in the module-level ``similarity_matrix`` whose
    correlation is strictly greater than ``corr_coef``, sorted from highest
    to lowest.

    Args:
        word_list: Iterable of words; each must be a column label of
            ``similarity_matrix``.
        corr_coef: Correlation threshold (exclusive lower bound).

    Returns:
        None. The results are written to standard output.
    """
    for value in word_list:
        print(value)
        correlations = similarity_matrix[value]
        strongest = correlations[correlations > corr_coef].sort_values(ascending=False)
        # The " \n" suffix reproduces what `print series, "\n"` emits: the
        # series, a separating space, the newline string, then print's own
        # trailing newline.
        print("%s \n" % (strongest,))


# Example: terms whose correlation with "god" and with "light" exceeds 0.8.
association(["god", "light"], 0.8)
god forth 1.000000 give 1.000000 god 1.000000 light 1.000000 day 1.000000 let 1.000000 water 1.000000 earth 1.000000 firmament 1.000000 heaven 1.000000 call 1.000000 divid 1.000000 said 1.000000 upon 1.000000 seed 1.000000 saw 1.000000 yield 1.000000 dark 1.000000 night 1.000000 morn 1.000000 kind 1.000000 fruit 1.000000 made 1.000000 rule 1.000000 whose 1.000000 herb 1.000000 dri 1.000000 1 0.991189 good 0.870388 even 0.870388 jame 0.870388 Name: god, dtype: float64 light forth 1.000000 give 1.000000 god 1.000000 light 1.000000 day 1.000000 let 1.000000 water 1.000000 earth 1.000000 firmament 1.000000 heaven 1.000000 call 1.000000 divid 1.000000 said 1.000000 upon 1.000000 seed 1.000000 saw 1.000000 yield 1.000000 dark 1.000000 night 1.000000 morn 1.000000 kind 1.000000 fruit 1.000000 made 1.000000 rule 1.000000 whose 1.000000 herb 1.000000 dri 1.000000 1 0.991189 good 0.870388 even 0.870388 jame 0.870388 Name: light, dtype: float64
# Raw (unprocessed) text of one novel, used as input for the word cloud.
text = textReader("austen-sense.txt")

# The search path must be extended before the import below.
# NOTE(review): presumably wordcloud is installed in this system-wide
# Python 2.7 directory; confirm.
import sys
sys.path.append("/usr/local/lib/python2.7/dist-packages")
from wordcloud import WordCloud

# Configure the cloud for at most 100 words, then build it from the text.
cloud_builder = WordCloud(max_words=100)
wordcloud = cloud_builder.generate(text)

# Draw the cloud as an image with the axes hidden.
import matplotlib.pyplot as plt
plt.imshow(wordcloud)
plt.axis("off")
plt.show()

# Display a PNG from a hard-coded absolute path (Image is imported earlier
# in the file).
Image(filename='/home/anshul/word_cloud.png')