#word -> number of occurrences, accumulated over the whole corpus
count={}

#read in file00.txt through file39.txt
for n in range(40):
    #the with-block closes each file as soon as it has been tallied,
    #rather than leaving 40 open handles for the garbage collector
    with open("test/file%02d.txt"%n) as infile:
        #iterate the file lazily instead of loading every line up front
        for line in infile:
            #split() with no argument already ignores leading/trailing
            #whitespace (newline included), so no rstrip() is needed
            for w in line.lower().split():
                count[w]=count.get(w,0)+1

#word counts from largest to smallest, i.e. ydata[0] belongs to rank 1
ydata=sorted(count.values(),reverse=True)

#the ten largest counts
#(a bare expression: the value is displayed only in an interactive session)
ydata[:10]

#the words themselves in the same descending-count order; negating the
#count sorts descending while keeping tied words in dictionary order
topwords=sorted(count,key=lambda word:-count[word])
topwords[:10]

#reference zipf distribution: the value at rank r is 1/r
zipf=[1.0/rank for rank in range(1,len(ydata)+1)]

figure(figsize=(6,6))
#measured counts, one marker per rank, on log-log axes
plot(range(1,len(ydata)+1),ydata,'o')
xscale('log')
yscale('log')
grid('on')
ylim(1,10000)
ylabel('word count')
xlabel('word rank')
#y=1000  # guess the y intercept
#for i in range(5):
#    plot(range(1,len(ydata)+1),y*array(zipf))
#    y*=2
y=2500 #y-intercept
#overlay the zipf curve y/rank
plot(range(1,len(ydata)+1),array(zipf)*y)
#label the ten most frequent words, slightly above their markers
for rank in range(1,11):
    text(rank,ydata[rank-1]+100,topwords[rank-1])
#label a sample from the rest of the range: list positions 2**4 ... 2**11
for k in (16,32,64,128,256,512,1024,2048):
    text(k+1,ydata[k]+5,topwords[k])
#label the last-ranked word
text(len(ydata),ydata[-1]+2,topwords[-1])

#size of vocab: number of distinct keys in count
#NOTE: a bare expression; its value is displayed only when evaluated
#interactively and is otherwise discarded
len(count)

#total size: sum of all per-word counts, i.e. number of word occurrences
#(40 files, roughly 600 words / file, i.e. *small*)
sum(count.values())

import gzip
import re  #for regular expressions

#this time the text comes from a gzipped file
with gzip.open("oz.txt.gz") as textfile:
    #lower-case the whole text, delete every character that is neither a
    #word character nor whitespace (i.e. punctuation), then split on whitespace
    words = re.sub(r'[^\w\s]','',textfile.read().lower()).split()

#same as before, but now define function for everything
def zipfplot(words,source):
    """Plot word count against word rank on log-log axes with a Zipf reference.

    Opens a new 6x6 figure, plots the descending counts of the distinct
    items of `words` against their rank (1 = most frequent), overlays a
    1/rank curve scaled to pass through the data point at rank 300 (or at
    the last rank when there are fewer than 300 distinct words), labels a
    sample of the points with their words, and prints summary statistics.

    Parameters:
      words  -- iterable of hashable tokens (e.g. a list of strings);
                an empty iterable yields an empty plot and is not an error.
      source -- text used as the figure title.

    Returns None.
    """
    count={}
    for w in words:
        count[w]=count.get(w,0)+1

    #rank the vocabulary once and read the counts off it, so that
    #ydata[i] is by construction the count of topwords[i]
    topwords=sorted(count,reverse=True,key=count.get)
    ydata=[count[w] for w in topwords]
    nranks=len(ydata)
    ranks=range(1,nranks+1)
    #reference zipf distribution: 1/rank
    zipf=[1./rank for rank in ranks]

    figure(figsize=(6,6))
    plot(ranks,ydata,'o')
    #fixed y range: counts above 10000 fall outside the visible area
    ylim(1,10000)
    yscale('log')
    xscale('log')
    grid('on')
    xlabel('word rank')
    ylabel('word count')
    title(source)

    #everything below indexes into the ranking, so skip it for empty input
    if nranks:
        #fit to zipf at rank 300; rank r lives at list index r-1.
        #Small vocabularies are anchored at their last rank instead.
        fitrank=min(300,nranks)
        y=fitrank*ydata[fitrank-1]
        plot(ranks,y*array(zipf))
        #annotate top 10 words (fewer if the vocabulary is smaller); the
        #label offset shrinks with rank to stay readable on the log axis
        for i in range(min(10,nranks)):
            text(i+1,ydata[i]+1000./(i+1),topwords[i])
        #and sample from rest of range: list positions 2**4 ... 2**11,
        #ignoring positions beyond the end of the vocabulary
        for k in (16,32,64,128,256,512,1024,2048):
            if k<nranks:
                text(k+1,ydata[k]+5,topwords[k])
        #plus the last
        text(nranks,ydata[-1]+2,topwords[-1])

    #single-argument print(...) produces the same text on Python 2 and 3
    print('vocab size = %s , total terms = %s'%(len(count),sum(count.values())))
    print('topwords =  %s'%(topwords[:10],))
    print('lastwords =  %s'%(topwords[-5:],))

#plot the word list currently held in words under the title 'Oz'
zipfplot(words,'Oz')

#now sherlock holmes, an even bigger text
with gzip.open("sherlock.txt.gz") as textfile:
    #strip out all punctuation (anything that is neither a word character
    #nor whitespace) from the lower-cased text, then split into words
    words = re.compile(r'[^\w\s]').sub('',textfile.read().lower()).split()
zipfplot(words,'Sherlock')

#exp and log functions, side by side in one figure
fig=figure(figsize=(10,4.5))

#left panel: exp(x) sampled at 256 points on [0,5], y axis limited to [0,100]
X=np.linspace(0,5,256)
l=fig.add_subplot(1,2,1)
l.set_title('exp(x)')
l.plot(X,exp(X))
l.set_ylim(0,100)

#right panel: log(x) sampled at 256 points on [1,100]
X=np.linspace(1,100,256)
r=fig.add_subplot(1,2,2)
r.set_title('log(x)')
r.plot(X,log(X))

#see that power laws y=kx^b are linear in log-log plots:
#the left column uses linear axes, the right column draws the same
#curves on log-log axes
fig=figure(figsize=(10,9.5))

#top row: x**2, x and sqrt(x), labelled through an explicit legend list
for cell,ymin,loglog in ((221,0,False),(222,1,True)):
    ax=fig.add_subplot(cell)
    ax.grid('on')
    #lower limit is 1 rather than 0 on the panel that becomes logarithmic
    ax.set_ylim(ymin,100)
    for curve in (X*X,X,sqrt(X)):
        ax.plot(X,curve)
    ax.legend(['x**2','x','sqrt(x)'])
    if loglog:
        ax.set_xscale('log')
        ax.set_yscale('log')

#bottom row: 100/sqrt(x), 100/x and 100/x**2 with per-curve label and color
falling=(('100/sqrt(x)','r',sqrt(X)),
         ('100/x','g',X),
         ('100/x**2','b',X*X))
bottom_row=[]
for cell,ymin,loglog in ((223,0,False),(224,1,True)):
    ax=fig.add_subplot(cell)
    ax.grid('on')
    ax.set_ylim(ymin,100)
    for name,shade,denominator in falling:
        ax.plot(X,100/denominator,label=name,color=shade)
    ax.legend()
    if loglog:
        ax.set_xscale('log')
        ax.set_yscale('log')
    bottom_row.append(ax)

#l and r end up bound to the bottom-left and bottom-right axes
l,r=bottom_row