#!/usr/bin/env python
# coding: utf-8
"""Plot word frequency against frequency rank for a text file.

Exported from a Jupyter notebook (the ``# In[n]:`` markers are the original
cell boundaries).  The script reads ``WheelOfTime/EyeOfTheWorld.txt``,
lower-cases it, strips punctuation, counts how often each word occurs, and
draws three scatter plots: raw frequency vs. rank, the same on log-log axes,
and the log-log plot with a ``1/rank`` reference curve scaled to the most
frequent word.
"""

# In[1]:

# Requires an IPython/Jupyter session: ``get_ipython`` is injected by IPython.
get_ipython().run_line_magic('matplotlib', 'inline')

import math
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np


# In[2]:

# A context manager closes the file even if read() raises.
# NOTE(review): the encoding is left at the platform default, as in the
# original; pass ``encoding=...`` once the file's encoding is confirmed.
with open("WheelOfTime/EyeOfTheWorld.txt") as fopen:
    text = fopen.read().lower().replace("\n", " ")

# Drop every character that is neither a word character nor whitespace.
text = re.sub(r'[^\w\s]', '', text)


# In[3]:

# Word -> number of occurrences.  Counter is a dict subclass, so ``d`` can
# still be used exactly like the plain dict it replaces.
d = Counter(text.split())


# In[4]:

# (word, count) tuples, most frequent first.  most_common() is a stable
# descending sort on the count, so ties keep first-seen order, the same
# ordering as the hand-written list.sort(key=..., reverse=True).
data = d.most_common()


# In[5]:

# Notebook cell output: the ten most frequent words (no effect as a script).
data[:10]


# In[6]:

xs = range(1, len(data) + 1)
freqs = np.array([count for _, count in data])

# New figure per plot so the plots do not pile onto the same axes when the
# file is run top to bottom instead of cell by cell.
plt.figure()
plt.scatter(xs, freqs, s=2)
plt.title("Word Frequency by Rank")
plt.xlabel("Rank")
plt.ylabel("Frequency")


# In[7]:

# Compute the logarithms once and reuse them for both log-log plots.
ranks = np.arange(1, len(data) + 1)
log_ranks = np.log(ranks)
log_freqs = np.log(freqs)

plt.figure()
plt.scatter(log_ranks, log_freqs, s=2)
plt.title("Word Frequency by Rank")
plt.xlabel("log(Rank)")
plt.ylabel("log(Frequency)")


# In[8]:

plt.figure()
plt.scatter(log_ranks, log_freqs, s=2, label="Words")
# Reference curve: top frequency divided by rank.  Skipped for an empty text,
# where there is no top frequency to scale by.
if len(data) > 0:
    plt.plot(log_ranks, np.log(freqs[0] / ranks), color="g", label="1/x")
plt.title("Word Frequency by Rank")
plt.xlabel("log(Rank)")
plt.ylabel("log(Frequency)")
plt.legend()


# In[9]:

# Notebook cell output: total number of words in the text.
len(text.split())


# In[ ]: