%matplotlib inline
import math
import re
import string
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
# Load the book, lower-case it, and flatten newlines into spaces.
# The `with` block closes the file handle even if reading raises.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used -- confirm the file's encoding and pass `encoding=` if it matters.
with open("WheelOfTime/EyeOfTheWorld.txt") as fopen:
    text = fopen.read().lower().replace("\n", " ")
# Remove every character that is neither a word character nor whitespace,
# so punctuation does not produce distinct tokens ("world." vs "world").
# Apostrophes are removed too, so contractions collapse ("don't" -> "dont").
text = re.sub(r'[^\w\s]', '', text)
# Map each whitespace-separated token of the cleaned text to its number of
# occurrences. Counter is a dict subclass, so `d[w]`, `d.get(w, 0)` and
# iteration over `d` behave as they would with a plain dict of counts.
d = Counter(text.split())
# https://stackoverflow.com/questions/3121979/how-to-sort-list-tuple-of-lists-tuples
# (word, count) pairs ordered from the most to the least frequent word.
# The sort is stable, so words with equal counts keep the order in which
# they appear when iterating over `d`.
data = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
# Show the ten most frequent words.
data[:10]
# Out: [('the', 19672), ('and', 8132), ('to', 7382), ('a', 6807), ('he', 6614), ('of', 6383), ('his', 4617), ('in', 4132), ('was', 3838), ('it', 3519)]
# Ranks are 1-based: the most frequent word gets rank 1.
xs = range(1, len(data) + 1)
# Occurrence counts in rank order (second element of each pair in `data`).
frequencies = [count for _word, count in data]
# Raw frequency against rank, on linear axes.
plt.scatter(xs, frequencies, s=2)
plt.title("Word Frequency by Rank")
plt.xlabel("Rank")
plt.ylabel("Frequency")
# Out: <matplotlib.text.Text at 0x110baec88>
# Same rank/frequency data, with the natural log taken of both coordinates.
log_ranks = [math.log(rank) for rank in xs]
log_frequencies = [math.log(count) for _word, count in data]
plt.scatter(log_ranks, log_frequencies, s=2)
plt.title("Word Frequency by Rank")
plt.xlabel("log(Rank)")
plt.ylabel("log(Frequency)")
# Out: <matplotlib.text.Text at 0x110bd02b0>
# Log-log scatter of the observed word frequencies.
log_ranks = [math.log(rank) for rank in xs]
log_frequencies = [math.log(count) for _word, count in data]
plt.scatter(log_ranks, log_frequencies, s=2, label="Words")
# Reference curve: the top word's count divided by the rank (a 1/x shape
# anchored at rank 1), drawn on the same log-log coordinates.
top_count = data[0][1]
reference = [math.log(top_count / rank) for rank in xs]
plt.plot(log_ranks, reference, color="g", label="1/x")
plt.title("Word Frequency by Rank")
plt.xlabel("log(Rank)")
plt.ylabel("log(Frequency)")
plt.legend()
# Out: <matplotlib.legend.Legend at 0x112976828>
# Total number of whitespace-separated tokens in the cleaned text.
total_words = len(text.split())
total_words
# Out: 312390