In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import string
import math
import re
In [2]:
# Load the whole book as one lower-cased string, with newlines flattened to
# spaces. The `with` block guarantees the handle is closed even if read() raises.
# NOTE(review): no encoding is passed, so the platform default is used --
# confirm the file's encoding and consider passing `encoding=` explicitly.
with open("WheelOfTime/EyeOfTheWorld.txt") as fopen:
    text = fopen.read().lower().replace("\n", " ")

# Drop every character that is neither a word character nor whitespace, so that
# "wheel," and "wheel" are counted as the same token. Apostrophes are removed
# too, which means contractions such as "don't" become "dont".
text = re.sub(r'[^\w\s]', '', text)
In [3]:
# Tally how many times each whitespace-separated token occurs in `text`.
d = {}
for word in text.split():
    if word in d:
        d[word] += 1
    else:
        # First sighting of this token.
        d[word] = 1
In [4]:
# Sorting a list of tuples by one field:
# https://stackoverflow.com/questions/3121979/how-to-sort-list-tuple-of-lists-tuples
#
# Build the (word, count) pairs ordered from most to least frequent. The sort
# is stable, so words with equal counts keep the iteration order of `d`.
data = sorted(d.items(), key=lambda pair: pair[1], reverse=True)
In [5]:
# Peek at the ten highest-count (word, count) pairs; `data` is sorted by
# descending count, so these are the most frequent words.
data[:10]
Out[5]:
[('the', 19672),
 ('and', 8132),
 ('to', 7382),
 ('a', 6807),
 ('he', 6614),
 ('of', 6383),
 ('his', 4617),
 ('in', 4132),
 ('was', 3838),
 ('it', 3519)]
In [6]:
# Ranks are 1-based: rank 1 belongs to the most frequent word because `data`
# is sorted by descending count. `xs` is reused by the later plotting cells.
xs = range(1, len(data) + 1)
counts = [count for word, count in data]

# Draw on the current axes (created on demand) using the Axes methods directly.
ax = plt.gca()
ax.scatter(xs, counts, s=2)

ax.set_title("Word Frequency by Rank")
ax.set_xlabel("Rank")
ax.set_ylabel("Frequency")
Out[6]:
<matplotlib.text.Text at 0x110baec88>
In [7]:
# Same rank/frequency scatter as before, but with the natural log taken on
# both axes.
log_ranks = [math.log(rank) for rank in xs]
log_counts = [math.log(count) for word, count in data]

ax = plt.gca()
ax.scatter(log_ranks, log_counts, s=2)

ax.set_title("Word Frequency by Rank")
ax.set_xlabel("log(Rank)")
ax.set_ylabel("log(Frequency)")
Out[7]:
<matplotlib.text.Text at 0x110bd02b0>
In [8]:
# Log-log scatter of the observed counts, overlaid with a reference curve
# count(rank) = top_count / rank, where top_count is the count of the
# rank-1 word (data[0][1]).
log_ranks = [math.log(rank) for rank in xs]
log_counts = [math.log(count) for word, count in data]
# data[0][1] is looked up inside the comprehension so nothing is indexed when
# there are no ranks to iterate over.
log_reference = [math.log(data[0][1] / rank) for rank in xs]

ax = plt.gca()
ax.scatter(log_ranks, log_counts, s=2, label="Words")
ax.plot(log_ranks, log_reference, color="g", label="1/x")

ax.set_title("Word Frequency by Rank")
ax.set_xlabel("log(Rank)")
ax.set_ylabel("log(Frequency)")
ax.legend()
Out[8]:
<matplotlib.legend.Legend at 0x112976828>
In [9]:
# Total number of whitespace-separated tokens in the cleaned text
# (every occurrence counted, not just distinct words).
len(text.split())
Out[9]:
312390
In [ ]: