In [1]:
#word -> occurrence count, accumulated over all input files by the next cell
count={}
In [2]:
#tally word occurrences over 40 input files
for n in range(40):
    #NOTE(review): `l` is undefined here -- a line opening and iterating the
    #n-th input file (e.g. `for l in open(<file n>):`) appears to have been
    #lost when the notebook was exported; restore it before re-running.
    for w in l.rstrip().lower().split():
        if w not in count: count[w]=0
        count[w] += 1
In [3]:
#sort according to highest rank: counts in descending order
ydata = sorted(count.values())[::-1]
In [4]:
#top values of word counts (the ten largest counts, descending)
ydata[:10]
Out[4]:
[1648, 794, 651, 623, 576, 518, 357, 273, 265, 176]
In [5]:
#find which words those top counts belong to: rank words by their count
topwords = sorted(count.keys(), key=lambda w: count[w], reverse=True)
topwords[:10]
Out[5]:
['the', 'of', 'to', 'a', 'and', 'in', 'that', 'is', 'for', 'are']
In [6]:
#reference zipf distribution: count proportional to 1/rank
zipf = [1.0 / rank for rank in range(1, len(ydata) + 1)]
In [7]:
#log-log scatter of word count vs. rank, with a y/rank Zipf reference line
#(figure/plot/text/... are pylab names -- presumably brought in by a %pylab
# magic not visible in this export; confirm)
figure(figsize=(6,6))
plot(range(1,len(ydata)+1),ydata,'o')
yscale('log')
xscale('log')
grid('on')
ylim(1,10000)
xlabel('word rank')
ylabel('word count')
#y=1000  # guess why intercept
#for i in range(5):
#    plot(range(1,len(ydata)+1),y*array(zipf))
#    y*=2
y= 2500 #y-intercept
#fit with zipf
plot(range(1,len(ydata)+1),y*array(zipf))
#annotate top 10 words
for i in range(10):
    text(i+1,ydata[i]+100,topwords[i])
#and sample from rest of range (ranks 16, 32, 64, ..., 2048)
for k in map(lambda x:2**(x+4),range(8)):
    text(k+1,ydata[k]+5,topwords[k])
#plus the very last (rarest) word
text(len(ydata),ydata[-1]+2,topwords[-1])
Out[7]:
<matplotlib.text.Text at 0x10a1bd510>
In [8]:
#size of vocab (number of distinct words seen)
len(count)
Out[8]:
5077
In [9]:
#total size (40 files, roughly 600 words / file, i.e. *small*)
#i.e. the total number of word tokens, counting repeats
sum(count.values())
Out[9]:
25155
In [10]:
import gzip
import re  #for regular expressions
In [11]:
#this time read in gzipped file
#BUG FIX: the original cell opened the file but never read it, so `words`
#was used before assignment on the re.sub line.  Read the whole text and
#lowercase it (the Out[13] top words are lowercase, so the original
#normalized case too -- confirm).  Under Python 3, gzip.open returns bytes:
#use textfile.read().decode('utf-8').lower() there.
with gzip.open("oz.txt.gz") as textfile:
    words = textfile.read().lower()
#strip out all punctuation and split
words = re.sub(r'[^\w\s]','',words).split()
In [12]:
#same as before, but now define function for everything
def zipfplot(words, source):
    """Plot word count vs. rank on log-log axes with a 1/rank Zipf reference.

    words  -- sequence of word tokens (already lowercased / punctuation-free)
    source -- label used as the plot title

    Side effects: creates a new pylab figure (figure/plot/text/... are
    presumably supplied by a %pylab magic -- confirm) and prints vocabulary
    statistics to stdout.
    """
    #tally occurrences of each distinct word
    count = {}
    for w in words:
        count[w] = count.get(w, 0) + 1
    ydata = sorted(count.values(), reverse=True)       #counts, descending by rank
    zipf = [1. / i for i in range(1, len(ydata) + 1)]  #reference 1/rank curve
    topwords = sorted(count, reverse=True, key=count.get)

    figure(figsize=(6, 6))
    plot(range(1, len(ydata) + 1), ydata, 'o')
    ylim(1, 10000)
    yscale('log')
    xscale('log')
    grid('on')
    xlabel('word rank')
    ylabel('word count')
    title(source)
    y = 300 * ydata[300]  #fit to zipf at rank 300
    plot(range(1, len(ydata) + 1), y * array(zipf))
    #annotate top 10 words
    for i in range(10):
        text(i + 1, ydata[i] + 1000. / (i + 1), topwords[i])
    #and sample from rest of range (ranks 16, 32, ..., 2048)
    for k in map(lambda x: 2 ** (x + 4), range(8)):
        text(k + 1, ydata[k] + 5, topwords[k])
    #plus the last
    text(len(ydata), ydata[-1] + 2, topwords[-1])
    #parenthesized %-format prints emit identical text under Python 2 and 3
    #(the original Python-2-only `print a, b` statements were the fix target)
    print('vocab size = %d , total terms = %d' % (len(count), sum(count.values())))
    print('topwords =  %s' % topwords[:10])
    print('lastwords =  %s' % topwords[-5:])
In [13]:
#run the whole pipeline on the Oz text
zipfplot(words,'Oz')
vocab size = 2919 , total terms = 39256
topwords =  ['the', 'and', 'to', 'of', 'a', 'i', 'was', 'you', 'in', 'he']
lastwords =  ['dangerthat', 'gruffly', 'accidents', 'baked', 'gardens']
In [14]:
#now sherlock holmes, an even bigger text
#BUG FIX: as in the Oz cell, the original opened the file but never read it,
#so `words` was used before assignment.  Read and lowercase the whole text
#(the Out[14] top words are lowercase -- confirm).  Under Python 3 add a
#.decode('utf-8') before .lower(), since gzip.open yields bytes.
with gzip.open("sherlock.txt.gz") as textfile:
    words = textfile.read().lower()
#strip out all punctuation and split
words = re.sub(r'[^\w\s]','',words).split()
zipfplot(words,'Sherlock')
vocab size = 8410 , total terms = 104410
topwords =  ['the', 'and', 'i', 'to', 'of', 'a', 'in', 'that', 'it', 'you']
lastwords =  ['glint', 'illegally', 'accomplish', 'volumes', 'confronted']
In [15]:
#exp and log functions, side by side
#BUG FIX: `l` and `r` were used but never defined in the exported cell --
#the axis-creation lines were evidently lost.  Restored as a 1x2 grid
#(left = exp, right = log), which matches the wide 10x4.5 figure and the
#Axes-method calls below (.plot/.set_title/.set_ylim) -- confirm against
#the original notebook.
fig=figure(figsize=(10,4.5))
l = fig.add_subplot(121)  #left panel: exp(x)
r = fig.add_subplot(122)  #right panel: log(x)

X=np.linspace(0, 5, 256, endpoint=True)
l.plot(X,exp(X))
l.set_title('exp(x)')
l.set_ylim(0,100)

X=np.linspace(1, 100, 256, endpoint=True)
r.plot(X,log(X))
r.set_title('log(x)')
Out[15]:
<matplotlib.text.Text at 0x10a2f6490>
In [16]:
#see that power laws y=kx^b are linear in log-log plots
fig=figure(figsize=(10,9.5))
#NOTE(review): `l` and `r` are used below but never created in this cell --
#axis-creation lines (fig.add_subplot(...)) appear to have been lost in
#export.  `X` is reused from the previous cell (linspace(1,100,256)).

#left/linear panel: x**2, x, sqrt(x)
l.grid('on')
l.set_ylim(0,100)
l.plot(X,X*X)
l.plot(X,X)
l.plot(X,sqrt(X))
l.legend(['x**2','x','sqrt(x)'])

#right panel: same curves on log-log axes, where they become straight lines
r.grid('on')
r.set_ylim(1,100)
r.plot(X,X*X)
r.plot(X,X)
r.plot(X,sqrt(X))
r.legend(['x**2','x','sqrt(x)'])
r.set_xscale('log')
r.set_yscale('log')

#NOTE(review): this second batch draws into `l` again, which would overplot
#the first panel as written -- presumably another pair of add_subplot lines
#(a second row of panels, given the tall 10x9.5 figure) was lost; confirm.
l.grid('on')
l.set_ylim(0,100)
l.plot(X,100/sqrt(X),label='100/sqrt(x)',color='r')
l.plot(X,100/X,label='100/x',color='g')
l.plot(X,100/(X*X),label='100/x**2',color='b')
l.legend()