In [1]:
import nltk
from nltk.corpus import gutenberg
In [2]:
#list every file in the Gutenberg corpus with its token count (Python 2 print statement)
for file in gutenberg.fileids():
    print file,len(gutenberg.words(file))
austen-emma.txt 192427
austen-persuasion.txt 98171
austen-sense.txt 141576
bible-kjv.txt 1010654
blake-poems.txt 8354
bryant-stories.txt 55563
burgess-busterbrown.txt 18963
carroll-alice.txt 34110
chesterton-ball.txt 96996
chesterton-brown.txt 86063
chesterton-thursday.txt 69213
edgeworth-parents.txt 210663
melville-moby_dick.txt 260819
milton-paradise.txt 96825
shakespeare-caesar.txt 25833
shakespeare-hamlet.txt 37360
shakespeare-macbeth.txt 23140
whitman-leaves.txt 154883
In [3]:
#the raw word stream still contains punctuation tokens (shown in the first 20 words)
print gutenberg.words('milton-paradise.txt')[:20]
['[', 'Paradise', 'Lost', 'by', 'John', 'Milton', '1667', ']', 'Book', 'I', 'Of', 'Man', "'", 's', 'first', 'disobedience', ',', 'and', 'the', 'fruit']
In [4]:
#pick out the three Shakespeare play files from the corpus listing
shf = [fid for fid in gutenberg.fileids() if fid.startswith('shakespeare')]
shf
Out[4]:
['shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt']
In [5]:
                                                           #no punctuation
shw = [w.lower() for file in shf for w in gutenberg.words(file) if w[0].isalpha()]
mpw = [w.lower() for w in gutenberg.words('milton-paradise.txt') if w[0].isalpha()]
len(shw),len(mpw) #numbers of words
Out[5]:
(69340, 80493)
In [6]:
#ten most frequent words per author
#NOTE(review): relies on NLTK 2.x, where FreqDist.items() is ordered by decreasing
#frequency; under NLTK 3 this would need fdist.most_common(10) instead
print 'shakespeare:',nltk.FreqDist(shw).items()[:10]
print 'milton:',nltk.FreqDist(mpw).items()[:10]
shakespeare: [('the', 2222), ('and', 2036), ('to', 1515), ('i', 1455), ('of', 1302), ('you', 1124), ('a', 1019), ('my', 914), ('that', 904), ('in', 826)]
milton: [('and', 3395), ('the', 2968), ('to', 2228), ('of', 2050), ('in', 1366), ('his', 1170), ('with', 1160), ('or', 715), ('that', 704), ('all', 700)]
In [7]:
fdist=nltk.FreqDist(shw + mpw) #combined counts for both shakespeare and milton
#NOTE(review): NLTK 2.x FreqDist.keys() is ordered by decreasing frequency, so this
#takes the 50 most common words; under NLTK 3 use fdist.most_common(50) instead
top50=fdist.keys()[:50] #just use top 50 from combined both
top50[:10]
Out[7]:
['and', 'the', 'to', 'of', 'in', 'i', 'his', 'with', 'that', 'a']
In [8]:
#make a data matrix, one line for each 5000 word block, with the counts of topwords
M=[]
for corp in [shw,mpw]:
  #use explicit floor division // (identical to Py2 int /, but also correct under Py3)
  for i in range(len(corp)//5000):  # 13 blocks of shakespeare, 16 of milton
    words = [w for w in corp[5000*i:5000*(i+1)] if w in top50]
    fdist= nltk.FreqDist(words)
    M.append([fdist[w] for w in top50])
In [9]:
#overlay the raw count profiles of every 5000-word block
for row in M[:13]:
    plot(row, 'b')  #shakespeare blocks in blue
for row in M[13:]:
    plot(row, 'g')  #milton blocks in green
In [10]:
#compare the two authors' mean count profiles over the top-50 words
figure(figsize=(12,4))
plot(mean(M[:13],axis=0),'b')
plot(mean(M[13:],axis=0),'g')
for i in range(50): text(i,mean(M[13:],axis=0)[i],top50[i]) #label each word at milton's mean height
legend(['Shakespeare','Milton'])
title('means')
Out[10]:
<matplotlib.text.Text at 0x2b393c40a3d0>
In [11]:
M = array(M)
M = M - mean(M,axis=0)[None,:]  # subtract off means from columns
                           #(only interested in deviations from means)
M = M/std(M,axis=0)   #divide columns by standard deviation
                         #(make all 50 words equally important)
In [12]:
#re-plot the block profiles after column normalization
for row in M[:13]:
    plot(row, 'b')  #shakespeare
for row in M[13:]:
    plot(row, 'g')  #milton
In [13]:
M = M - mean(M,axis=1)[:,None]  #subtract off means from rows
                            # (interested in covariance of rows)
U,S,Vt= linalg.svd(M)        # M = U S V^T (principal component decomposition)
US=U.dot(diag(S))            # rescale columns of U by singular values (block coordinates)
VS=Vt.T[:,:len(S)].dot(diag(S))  #rescale columns of V by singular values (word coordinates)
In [14]:
L = S*S   #squared singular values = variance captured by each component
bar(array(range(len(L)))+.5,L)
xlim(.5,29.5)
print 'percentage of variance in top two:',
print '({:.2f} + {:.2f}) / {:.2f} = {:.2f}'.format(L[0],L[1],sum(L),(L[0]+L[1])/sum(L))
percentage of variance in top two: (616.90 + 136.90) / 1425.10 = 0.53
In [15]:
#project the 5000-word blocks onto the first two principal components
plot(US[:13,0],US[:13,1],'ob') #shakespeare blocks in blue
plot(US[13:,0],US[13:,1],'og') #milton blocks in green
xlim(-8,8)
ylim(-6,6)
#first principal component gives good separation
title('Milton                            Shakespeare')
Out[15]:
<matplotlib.text.Text at 0x2b393c465310>
In [16]:
#scatter the top-50 words in the first two principal components, colored by
#the first-component coordinate (which separates the two authors)
#plot(VS[:,0],VS[:,1],'o')
from matplotlib.colors import LinearSegmentedColormap
figure(figsize=(8,6))
scatter(VS[:,0],VS[:,1], c=VS[:,0], s=50, alpha=0.9,
        cmap=LinearSegmentedColormap.from_list('BlGr',[(0,.6,0),(0,1,1),(0,0,1)]))
for k in range(len(top50)): #+[25,42]:
    text(VS[k][0]+.05,VS[k][1]+.05,top50[k])
title('Milton words                       Shakespeare words')
#words characteristic of shakespeare: i, your, is, have, you, it...
#words characteristic of milton: from, with, and, in, their, or
Out[16]:
<matplotlib.text.Text at 0x2b393c42cb50>
In [17]:
from nltk.cluster import KMeansClusterer, euclidean_distance
# see http://nltk.org/_modules/nltk/cluster/kmeans.html
In [18]:
#hold out first and last blocks as test points; keep only the top-2 PCA coordinates
vectors = US[1:-1,:2]  #omit first and last to test classifier
In [19]:
#kmeans using the euclidean distance metric, 2 means
# and repeat clustering 10 times with random seeds
clusterer=KMeansClusterer(2, euclidean_distance, repeats=10)
clusters = clusterer.cluster(vectors,True)  #True (assign_clusters) makes cluster() return the labels
means=array(clusterer.means())
print 'clustered as:', clusters
print 'means:', means
#note that shakespeare came out as cluster 1, milton as 0
None [[ 6.10957627 -0.48350392]
 [ 5.7416273  -2.20727748]
 [ 6.00670222  2.95922885]
 [ 3.8810451   1.1076569 ]
 [ 4.97310725  1.39614103]
 [ 5.06314372 -1.3631577 ]
 [ 5.73100054  0.80835337]
 [ 4.80400237 -1.22943347]
 [ 5.64165313 -0.02519376]
 [ 3.07144712  0.26834736]
 [ 3.81492092 -1.38432696]
 [ 4.49589965 -0.32360417]
 [-4.90086593 -4.06099333]
 [-2.68583027 -4.00370228]
 [-4.00090023  0.35327201]
 [-4.14990068 -0.64793723]
 [-2.91868508  0.89905002]
 [-4.3347406   1.4765603 ]
 [-3.73774188  0.16413739]
 [-5.42616263  0.08016156]
 [-5.90247106 -2.15759176]
 [-4.06971036  4.55170334]
 [-3.55207792  3.04860845]
 [-2.038942    4.09185734]
 [-4.82803085  1.72197546]
 [-3.48441023  1.34994605]
 [-4.5194215  -0.81134903]]
clustered as: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
means: [[-4.03665942  0.40371322]
 [ 4.94451047 -0.03973083]]
In [20]:
#visualize the clustering; only 12 shakespeare blocks here since the first row was held out
figure(figsize=(8,6))
plot(vectors[:12,0],vectors[:12,1],'ob') #shakespeare blue
plot(vectors[12:,0],vectors[12:,1],'og') #milton green
plot(means[:,0],means[:,1],'or') # the means in red
for j in [0,-1]: plot(US[j,0],US[j,1],'oc') #plot first and last point cyan
xlim(-8,8)
ylim(-6,6)
legend(['Shakespeare','Milton','means','test'],'lower right')
Out[20]:
<matplotlib.legend.Legend at 0x2b393c4fba90>
In [21]:
#classify the two held-out blocks (first = shakespeare, last = milton) by nearest mean
print clusterer.classify(US[0,0:2]),clusterer.classify(US[-1,0:2])
#gets both in correct cluster...
1 0
In [23]:
#now try three authors: rebuild the lowercased, punctuation-free word lists and add austen
shw = [w.lower() for file in shf for w in gutenberg.words(file) if w[0].isalpha()]
mpw = [w.lower() for w in gutenberg.words('milton-paradise.txt') if w[0].isalpha()]
auw= [w.lower() for w in gutenberg.words('austen-persuasion.txt') if w[0].isalpha()]
len(shw),len(mpw),len(auw) #numbers of words
Out[23]:
(69340, 80493, 84121)
In [24]:
fdist=nltk.FreqDist(shw + mpw + auw) #for all three: shakespeare, milton and austen
top50=fdist.keys()[:50] #top 50 from the combined corpora (NLTK 2.x keys() is frequency-sorted)
top50[:10]
Out[24]:
['the', 'and', 'to', 'of', 'in', 'i', 'a', 'that', 'his', 'with']
In [73]:
#make a data matrix, one line for each 5000 word block, with the counts of topwords
M=[]
for corp in [shw,mpw,auw]:
  #use explicit floor division // (identical to Py2 int /, but also correct under Py3)
  for i in range(len(corp)//5000):  # 13 blocks of shakespeare, 16 of milton and of austen
    words = [w for w in corp[5000*i:5000*(i+1)] if w in top50]
    fdist= nltk.FreqDist(words)
    M.append([fdist[w] for w in top50])
In [27]:
#overlay the raw count profiles, one color per author
for row in M[:13]:
    plot(row, 'b')  #shakespeare blocks in blue
for row in M[13:29]:
    plot(row, 'g')  #milton blocks in green
for row in M[29:]:
    plot(row, 'r')  #austen blocks in red
In [28]:
M = array(M)
M = M - mean(M,axis=0)[None,:]  # subtract off means from columns
                           #(only interested in deviations from means)
M = M/std(M,axis=0)   #divide columns by standard deviation
                         #(make all 50 words equally important)

#re-plot the normalized block profiles
for line in M[:13]: plot(line,'b') #shakespeare blocks in blue
for line in M[13:29]: plot(line,'g') #milton blocks in green
for line in M[29:]: plot(line,'r') #austen blocks in red
In [33]:
#compare the three authors' mean (normalized) profiles over the top-50 words
figure(figsize=(12,4))
plot(mean(M[:13],axis=0),'b')
plot(mean(M[13:29],axis=0),'g')
plot(mean(M[29:],axis=0),'r')
for i in range(50): text(i,.2*(i%2),top50[i]) #alternate label heights to reduce overlap
legend(['Shakespeare','Milton','Austen'])
title('means')
Out[33]:
<matplotlib.text.Text at 0x2b393d33ced0>
In [34]:
M = M - mean(M,axis=1)[:,None]  #subtract off means from rows
                            # (interested in covariance of rows)
U,S,Vt= linalg.svd(M)        # M = U S V^T (principal component decomposition)
US=U.dot(diag(S))            # rescale columns of U by singular values (block coordinates)
VS=Vt.T[:,:len(S)].dot(diag(S))  #rescale columns of V by singular values (word coordinates)
In [35]:
L = S*S   #squared singular values = variance captured by each component
bar(array(range(len(L)))+.5,L)
xlim(.5,29.5)
print 'percentage of variance in top two:',
print '({:.2f} + {:.2f}) / {:.2f} = {:.2f}'.format(L[0],L[1],sum(L),(L[0]+L[1])/sum(L))
percentage of variance in top two: (689.43 + 613.17) / 2211.97 = 0.59
In [216]:
#project the blocks onto the first two principal components, one color per author
plot(US[:13,0],US[:13,1],'ob') #shakespeare blocks in blue
plot(US[13:29,0],US[13:29,1],'og') #milton blocks in green
plot(US[29:,0],US[29:,1],'or') #austen blocks in red
xlim(-8,8)
ylim(-6,6)
#first two principal components give good separation
legend(['Shakespeare','Milton','Austen'],numpoints=1)
Out[216]:
<matplotlib.legend.Legend at 0x2b3967cdf1d0>
In [72]:
#plot(VS[:,0],VS[:,1],'o')
def ac(v):
    """Signed angle (radians, in [-pi, pi]) of 2-vector v against the +x axis:
    arccos of the normalized x-coordinate, negated when v points below the axis."""
    angle = arccos(v[0] / norm(v))
    return angle if v[1] >= 0 else -angle

#scatter the top-50 words, colored by angular position in the first two components
#(a roughly cyclic colormap, so each author's direction gets its own hue)
from matplotlib.colors import LinearSegmentedColormap
figure(figsize=(8,6))
scatter(VS[:,0],VS[:,1], c=map(ac,VS[:,0:2]), s=50, alpha=0.9,
        cmap=LinearSegmentedColormap.from_list('wrgb',
       [(1,1,1),(1,0,0),(1,0,1),(0,0,1),(0,1,1),(0,.6,0),(1,1,1)]))
for k in range(len(top50)): #+[25,42]:
    text(VS[k][0]+.05,VS[k][1]+.05,top50[k])
#title('Milton words                       Shakespeare words')
#words characteristic of shakespeare: i, your, is, have, you, it...
#words characteristic of milton: from, with, and, in, their, or
#words characteristic of austen: was, her, had, she
In [74]:
#now try four authors: same lowercased, punctuation-free word lists, adding melville
shw = [w.lower() for file in shf for w in gutenberg.words(file) if w[0].isalpha()]
mpw = [w.lower() for w in gutenberg.words('milton-paradise.txt') if w[0].isalpha()]
auw= [w.lower() for w in gutenberg.words('austen-persuasion.txt') if w[0].isalpha()]
mdw=[w.lower() for w in gutenberg.words('melville-moby_dick.txt') if w[0].isalpha()]
len(shw),len(mpw),len(auw),len(mdw) #numbers of words
Out[74]:
(69340, 80493, 84121, 218365)
In [75]:
mdw=mdw[:80000] #truncate moby dick to roughly the size of the other corpora (16 blocks)
In [77]:
fdist=nltk.FreqDist(shw + mpw + auw + mdw) #for all four
top50=fdist.keys()[:50] #top 50 from combined all (NLTK 2.x keys() is frequency-sorted)
top50[:10]
Out[77]:
['the', 'and', 'of', 'to', 'in', 'a', 'i', 'that', 'his', 'with']
In [84]:
#make a data matrix, one line for each 5000 word block, with the counts of topwords
M=[]
for corp in [shw,mpw,auw,mdw]:
  #use explicit floor division // (identical to Py2 int /, but also correct under Py3)
  for i in range(len(corp)//5000):  # 13 blocks of shakespeare, 16 of milton and of austen and of moby
    words = [w for w in corp[5000*i:5000*(i+1)] if w in top50]
    fdist= nltk.FreqDist(words)
    M.append([fdist[w] for w in top50])
In [85]:
#overlay the raw count profiles, one color per author
for row in M[:13]:
    plot(row, 'b')  #shakespeare blocks in blue
for row in M[13:29]:
    plot(row, 'g')  #milton blocks in green
for row in M[29:45]:
    plot(row, 'r')  #austen blocks in red
for row in M[45:]:
    plot(row, 'c')  #moby dick in cyan
In [86]:
M = array(M)
M = M - mean(M,axis=0)[None,:]  # subtract off means from columns
                           #(only interested in deviations from means)
M = M/std(M,axis=0)   #divide columns by standard deviation
                         #(make all 50 words equally important)
M = M - mean(M,axis=1)[:,None]  #subtract off means from rows
                            # (interested in covariance of rows)
U,S,Vt= linalg.svd(M)        # M = U S V^T
#now more blocks (rows) than words (columns), so U must be sliced to the first len(S) columns
US=U[:,:len(S)].dot(diag(S))            # rescale columns of U by singular values
VS=Vt.T[:,:len(S)].dot(diag(S))  #rescale columns of V by singular values
In [237]:
#variance-per-component bar chart, annotated with cumulative fractions
L = S*S   #squared singular values = variance captured by each component
bar(array(range(len(L)))+.5,L)
xlim(.5,45.5)
ylabel("variance of i'th component")
t2= '({:.1f} + {:.1f}) / {:.1f} = {:.2f}'.format(L[0],L[1],sum(L),(L[0]+L[1])/sum(L))
t3= '( ... + {:.1f}) / {:.1f} = {:.2f}'.format(L[2],sum(L),sum(L[:3])/sum(L))
text(6,700,'%variance top two: '+t2)
text(6,600,'%variance top three: '+t3)
print 'percentage of variance in top two:',t2
print 'percentage of variance in top three:',t3
title("Four Authors (Shks, Milt, Aust, Melv)")
#savefig('var4.png',dpi=72)
percentage of variance in top two: (708.8 + 657.5) / 2973.8 = 0.46
percentage of variance in top three: ( ... + 321.4) / 2973.8 = 0.57
In [208]:
#kmeans using the euclidean distance metric, 4 means 
# and repeat clustering 10 times with random seeds
clusterer=KMeansClusterer(4, euclidean_distance, repeats=10)
clusters = clusterer.cluster(US[:,:3])
means=array(clusterer.means())
print 'clustered as:', clusters
print 'means:', means
#note that shakespeare came out as cluster 1, milton as 0
clustered as: None
means: [[-4.73144117 -2.52568168 -0.74112017]
 [ 4.32038396 -2.07162856 -1.90737629]
 [-0.85628193  5.3517964  -1.04793381]
 [ 1.10678628  0.24897567  3.49994268]]
In [247]:
#2-d view of the four authors' blocks with the kmeans centers overlaid
fig=figure(figsize=(5,5))
plot(US[:13,0],US[:13,1],'ob') #shakespeare
plot(US[13:29,0],US[13:29,1],'og') #milton
plot(US[29:45,0],US[29:45,1],'or') #austen
plot(US[45:,0],US[45:,1],'o',color='yellow') #melville
scatter(*means.T[0:2,:],color='k',marker='*',s=100,zorder=10) #kmeans centers as black stars
xlim(-8,8)
#ylim(-6,6)
#first two principal components give good separation
legend(['Shakespeare','Milton','Austen','Melville'],numpoints=1)
#savefig("smam2d.png",dpi=72)
In [315]:
#3-d scatter of the blocks in the first three principal components
#http://matplotlib.org/mpl_toolkits/mplot3d/tutorial.html
#http://stackoverflow.com/questions/5490288/plotting-3d-scatter-in-matplotlib

from mpl_toolkits.mplot3d import Axes3D
fig=figure(figsize=(6,6))
#ax = fig.add_subplot(111, projection='3d')
ax=fig.gca(projection='3d')

ax.scatter(*US.T[0:3,:13],c='b',s=40) #shakespeare
ax.scatter(*US.T[0:3,13:29],c='g',s=40) #milton
ax.scatter(*US.T[0:3,29:45],c='r',s=40) #austen
ax.scatter(*US.T[0:3,45:],c='yellow',s=40) #melville
ax.scatter(*means.T,color='k',marker='*',s=100) #kmeans centers

#hack to create legend (scatter3d points don't appear in legends directly)
for c in ['b','g','r','yellow']: ax.plot([],[],'o',c=c)
ax.legend(['Shakespeare','Milton','Austen','Melville'],numpoints=1)
#savefig("smam3d.png",dpi=72)
Out[315]:
<matplotlib.legend.Legend at 0x2b396d6ddd90>
In [323]:
#now try four more authors
bibw = [w.lower() for w in gutenberg.words('bible-kjv.txt') if w[0].isalpha()]
chw= [w.lower() for w in gutenberg.words('chesterton-ball.txt') if w[0].isalpha()]
edw= [w.lower() for w in gutenberg.words('edgeworth-parents.txt') if w[0].isalpha()]
whw= [w.lower() for w in gutenberg.words('whitman-leaves.txt') if w[0].isalpha()]
In [331]:
#raw sizes of the four new corpora before truncation
print len(bibw),len(chw),len(edw),len(whw)
791842 82725 170737 126276
In [332]:
#truncate every corpus to 80000 words so each author contributes 16 blocks of 5000
bibw=bibw[:80000]
chw=chw[:80000]
edw=edw[:80000]
whw=whw[:80000]
In [334]:
fdist=nltk.FreqDist(shw + mpw + auw + mdw + bibw + chw + edw + whw) #for all eight
top50=fdist.keys()[:50] #top 50 from combined all (NLTK 2.x keys() is frequency-sorted)
top50[:10]
Out[334]:
['the', 'and', 'of', 'to', 'a', 'in', 'i', 'that', 'he', 'his']
In [336]:
#add to data matrix, one line for each 5000 word block, with the counts of topwords
M=[]
for corp in [shw,mpw,auw,mdw,bibw,chw,edw,whw]:
  #use explicit floor division // (identical to Py2 int /, but also correct under Py3)
  for i in range(len(corp)//5000):  # 13 blocks for shakespeare, 16 for rest
    words = [w for w in corp[5000*i:5000*(i+1)] if w in top50]
    fdist= nltk.FreqDist(words)
    M.append([fdist[w] for w in top50])
In [338]:
M = array(M)
M = M - mean(M,axis=0)[None,:]  # subtract off means from columns
                           #(only interested in deviations from means)
M = M/std(M,axis=0)   #divide columns by standard deviation
                         #(make all 50 words equally important)
M = M - mean(M,axis=1)[:,None]  #subtract off means from rows
                            # (interested in covariance of rows)
U,S,Vt= linalg.svd(M)        # M = U S V^T
#more blocks (rows) than words (columns), so U must be sliced to the first len(S) columns
US=U[:,:len(S)].dot(diag(S))            # rescale columns of U by singular values
VS=Vt.T[:,:len(S)].dot(diag(S))  #rescale columns of V by singular values
In [344]:
#variance-per-component bar chart for eight authors, annotated with cumulative fractions
L = S*S   #squared singular values = variance captured by each component
bar(array(range(len(L)))+.5,L)
xlim(.5,45.5)
ylabel("variance of i'th component")
t2= '({:.1f} + {:.1f}) / {:.1f} = {:.2f}'.format(L[0],L[1],sum(L),(L[0]+L[1])/sum(L))
t3= '( ... + {:.1f}) / {:.1f} = {:.2f}'.format(L[2],sum(L),sum(L[:3])/sum(L))
t4= '( ... + {:.1f}) / {:.1f} = {:.2f}'.format(L[3],sum(L),sum(L[:4])/sum(L))
t5= '( ... + {:.1f}) / {:.1f} = {:.2f}'.format(L[4],sum(L),sum(L[:5])/sum(L))
t6= '( ... + {:.1f}) / {:.1f} = {:.2f}'.format(L[5],sum(L),sum(L[:6])/sum(L))
t7= '( ... + {:.1f}) / {:.1f} = {:.2f}'.format(L[6],sum(L),sum(L[:7])/sum(L))
text(6,1000,'%variance top two: '+t2)
text(6,900,'%variance top three: '+t3)
text(6,800,'%variance top four: '+t4)
text(6,700,'%variance top five: '+t5)
text(6,600,'%variance top six: '+t6)
text(6,500,'%variance top seven: '+t7)
title("Eight Authors (Shks, Milt, Aust, Melv, Bib, Ches, Edge, Whit)")
#savefig('var4.png',dpi=72)
Out[344]:
<matplotlib.text.Text at 0x2b3976671b50>
In [352]:
#kmeans using the euclidean distance metric, 4 means 
# and repeat clustering 10 times with random seeds
clusterer=KMeansClusterer(8, euclidean_distance, repeats=10)
clusters = clusterer.cluster(US[:,:5])  #use top five dimensions
means=array(clusterer.means())
print 'clustered as:', clusters
print 'means:', means
clustered as: None
means: [[ 1.17681128 -1.7227154  -5.24331226 -1.84328931  0.07486046]
 [-4.77431674  0.76032457 -1.10799887  0.02803771 -2.09976064]
 [-0.30971294 -5.36045998  3.60474594 -1.45417768 -0.44894318]
 [-2.9348402   2.62015477  0.30647196  0.15736091 -2.08899215]
 [-1.33858777  0.33202064  0.14031761  1.31835119  2.50596166]
 [ 0.09280307  3.53896233  2.08264806 -0.67663425 -0.24531875]
 [ 4.58316814 -0.6571071  -0.23894832  2.89166826 -1.52555994]
 [ 3.76878895  3.2349442   1.41085038 -2.41116751  0.54046512]]
In [354]:
#3-d scatter of all eight authors' blocks in the first three principal components
from mpl_toolkits.mplot3d import Axes3D
fig=figure(figsize=(6,6))
#ax = fig.add_subplot(111, projection='3d')
ax=fig.gca(projection='3d')

ax.scatter(*US.T[0:3,:13],c='b',s=40) #shakespeare (13 blocks)
ax.scatter(*US.T[0:3,13:29],c='g',s=40) #milton
ax.scatter(*US.T[0:3,29:45],c='r',s=40) #austen
ax.scatter(*US.T[0:3,45:61],c='yellow',s=40) #melville
ax.scatter(*US.T[0:3,61:77],c='grey',s=40) #bible
ax.scatter(*US.T[0:3,77:93],c='violet',s=40) #chesterton
ax.scatter(*US.T[0:3,93:109],c='brown',s=40) #edgeworth
ax.scatter(*US.T[0:3,109:],c='orange',s=40) #whitman
ax.scatter(*means[:,:3].T,color='k',marker='*',s=100) #kmeans centers (first 3 dims of 5)

#hack to create legend (scatter3d points don't appear in legends directly)
#for c in ['b','g','r','yellow','grey','violet','brown','orange']: ax.plot([],[],'o',c=c)
#ax.legend(['Shakespeare','Milton','Austen','Melville','Bible','Ches','Edge','Whit'],numpoints=1)
#savefig("8auth3d.png",dpi=72)
Out[354]:
<mpl_toolkits.mplot3d.art3d.Patch3DCollection at 0x2b396bb29cd0>
In [ ]: