# Zipf's-law word-frequency plots, followed by exp/log and power-law demos.
#
# NOTE(review): the plotting/math names used below (figure, plot, text, array,
# exp, log, sqrt, np, ...) are never imported in this file; presumably it runs
# in a pylab-style namespace (e.g. IPython `%pylab`) -- confirm.
import gzip
import re  # for regular expressions
from collections import Counter


def _rank_counts(words):
    """Count the words and rank them by frequency.

    Returns a tuple (count, ydata, topwords):
      count    -- mapping word -> number of occurrences
      ydata    -- the counts sorted from highest to lowest
      topwords -- the words sorted from most to least frequent, so that
                  topwords[i] occurs ydata[i] times
    """
    count = Counter(words)
    ydata = sorted(count.values(), reverse=True)
    topwords = sorted(count, reverse=True, key=count.get)
    return count, ydata, topwords


def _label_ranks(ydata, topwords, top_offset):
    """Annotate the current plot with the words found at selected ranks.

    Labels the top 10 words, then the words at indices 16, 32, ..., 2048,
    then the last word.  top_offset(i) is the vertical offset added to the
    label of the i-th top word.  Ranks beyond the vocabulary size are
    skipped so that a small vocabulary does not raise IndexError.
    """
    nranks = len(ydata)
    # annotate top 10 words
    for i in range(min(10, nranks)):
        text(i + 1, ydata[i] + top_offset(i), topwords[i])
    # and sample from rest of range
    for k in (2 ** (x + 4) for x in range(8)):
        if k >= nranks:
            break
        text(k + 1, ydata[k] + 5, topwords[k])
    # plus the last
    if nranks:
        text(nranks, ydata[-1] + 2, topwords[-1])


def _read_gz_words(path):
    """Read a gzipped text file and return its lower-cased words.

    All punctuation (anything that is neither a word character nor
    whitespace) is stripped before splitting on whitespace.
    """
    with gzip.open(path) as textfile:
        raw = textfile.read()
    if not isinstance(raw, str):
        # Python 3 returns bytes here; the str regex below needs text.
        # NOTE(review): assumes the texts are UTF-8 -- confirm.
        raw = raw.decode('utf-8', 'replace')
    # strip out all punctuation and split
    return re.sub(r'[^\w\s]', '', raw.lower()).split()


def zipfplot(words, source):
    """Plot word count against word rank on log-log axes with a Zipf fit.

    words  -- iterable of word tokens
    source -- string used as the plot title

    Draws the ranked counts, overlays a 1/rank reference line, labels
    selected ranks with their words, and prints the vocabulary size, the
    total number of terms, and the most and least frequent words.
    """
    count, ydata, topwords = _rank_counts(words)
    ranks = range(1, len(ydata) + 1)
    # reference zipf distribution
    zipf = [1. / i for i in ranks]

    figure(figsize=(6, 6))
    plot(ranks, ydata, 'o')
    ylim(1, 10000)
    yscale('log')
    xscale('log')
    grid('on')
    xlabel('word rank')
    ylabel('word count')
    title(source)

    if ydata:
        # fit to zipf at rank 300 (rank r lives at index r-1), or at the
        # last rank when the vocabulary has fewer than 300 words
        fit_rank = min(300, len(ydata))
        y = fit_rank * ydata[fit_rank - 1]
        plot(ranks, y * array(zipf))

    _label_ranks(ydata, topwords, lambda i: 1000. / (i + 1))

    # single-argument print() calls: same output on Python 2 and Python 3
    print('vocab size = %d , total terms = %d'
          % (len(count), sum(count.values())))
    print('topwords =  %s' % (topwords[:10],))
    print('lastwords =  %s' % (topwords[-5:],))


# ---------------------------------------------------------------------------
# Small corpus: file00.txt through file39.txt
# ---------------------------------------------------------------------------

# read in file00.txt through file39.txt
corpus_words = []
for n in range(40):
    # `with` closes each file as soon as it has been read
    with open("test/file%02d.txt" % n) as infile:
        for line in infile:
            corpus_words.extend(line.rstrip().lower().split())

# sort according to highest rank
count, ydata, topwords = _rank_counts(corpus_words)

# The bare expressions below only display a value when evaluated as the last
# statement of an interactive/notebook cell; in a script they have no effect.

# top values of word counts
ydata[:10]

# find which they are
topwords[:10]

# reference zipf distribution
zipf = [1. / i for i in range(1, len(ydata) + 1)]

figure(figsize=(6, 6))
plot(range(1, len(ydata) + 1), ydata, 'o')
yscale('log')
xscale('log')
grid('on')
ylim(1, 10000)
xlabel('word rank')
ylabel('word count')

#y=1000 # guess why intercept
#for i in range(5):
#    plot(range(1,len(ydata)+1),y*array(zipf))
#    y*=2

y = 2500  # y-intercept
# fit with zipf
plot(range(1, len(ydata) + 1), y * array(zipf))

_label_ranks(ydata, topwords, lambda i: 100)

# size of vocab
len(count)

# total size (40 files, roughly 600 words / file, i.e. *small*)
sum(count.values())

# ---------------------------------------------------------------------------
# Larger texts, read from gzipped files
# ---------------------------------------------------------------------------

# this time read in gzipped file
words = _read_gz_words("oz.txt.gz")
zipfplot(words, 'Oz')

# now sherlock holmes, an even bigger text
words = _read_gz_words("sherlock.txt.gz")
zipfplot(words, 'Sherlock')

# ---------------------------------------------------------------------------
# exp and log functions
# ---------------------------------------------------------------------------
fig = figure(figsize=(10, 4.5))

X = np.linspace(0, 5, 256, endpoint=True)
l = fig.add_subplot(121)
l.plot(X, exp(X))
l.set_title('exp(x)')
l.set_ylim(0, 100)

X = np.linspace(1, 100, 256, endpoint=True)
r = fig.add_subplot(122)
r.plot(X, log(X))
r.set_title('log(x)')

# ---------------------------------------------------------------------------
# see that power laws y=kx^b are linear in log-log plots
# (left column: linear axes, right column: the same curves on log-log axes)
# ---------------------------------------------------------------------------
fig = figure(figsize=(10, 9.5))

l = fig.add_subplot(221)
l.grid('on')
l.set_ylim(0, 100)
l.plot(X, X * X)
l.plot(X, X)
l.plot(X, sqrt(X))
l.legend(['x**2', 'x', 'sqrt(x)'])

r = fig.add_subplot(222)
r.grid('on')
r.set_ylim(1, 100)
r.plot(X, X * X)
r.plot(X, X)
r.plot(X, sqrt(X))
r.legend(['x**2', 'x', 'sqrt(x)'])
r.set_xscale('log')
r.set_yscale('log')

l = fig.add_subplot(223)
l.grid('on')
l.set_ylim(0, 100)
l.plot(X, 100 / sqrt(X), label='100/sqrt(x)', color='r')
l.plot(X, 100 / X, label='100/x', color='g')
l.plot(X, 100 / (X * X), label='100/x**2', color='b')
l.legend()

r = fig.add_subplot(224)
r.grid('on')
r.set_ylim(1, 100)
r.plot(X, 100 / sqrt(X), label='100/sqrt(x)', color='r')
r.plot(X, 100 / X, label='100/x', color='g')
r.plot(X, 100 / (X * X), label='100/x**2', color='b')
r.legend()
r.set_xscale('log')
r.set_yscale('log')