Relearing Python

In [2]:
wordfile = open("english2.txt")
words = [w.strip() for w in wordfile.readlines()]
wordfile.close()
In [6]:
words[:20]
Out[6]:
['aardvark',
 'aardvarks',
 'aardwolf',
 'aardwolves',
 'ab',
 'aba',
 'abaca',
 'abacas',
 'abaci',
 'aback',
 'abacterial',
 'abacus',
 'abacuses',
 'abaft',
 'abalone',
 'abalones',
 'abandon',
 'abandoned',
 'abandonee',
 'abandoner']

I want to know how many words are in my word list?

  • testing bullets
  • more bullets
In [5]:
len(words)
Out[5]:
98221
In [7]:
count = 0
for w in words:
    if w.endswith("ing"):
        count += 1
In [8]:
count
Out[8]:
6835
In [9]:
def abecedarian(w):
    for i in range(len(w) - 1):
        if w[i] > w[i + 1]:
            return False
    return True
In [10]:
abecedarian("cat")
Out[10]:
False
In [12]:
abecedarian("dog")
Out[12]:
False
In [18]:
abc = [w for w in words if abecedarian(w)]
In [19]:
best = ""
for w in abc:
    if len(w) > len(best):
        best = w
In [20]:
best
Out[20]:
'billowy'

Letter Frequency Counts

In [21]:
d = {}
for w in words:
    for c in w:
        d[c] = d.get(c, 0) + 1
In [22]:
d
Out[22]:
{'a': 69209,
 'b': 16827,
 'c': 37835,
 'd': 30050,
 'e': 99676,
 'f': 10796,
 'g': 23163,
 'h': 19260,
 'i': 82878,
 'j': 1260,
 'k': 6134,
 'l': 51765,
 'm': 25443,
 'n': 61380,
 'o': 57655,
 'p': 26144,
 'q': 1526,
 'r': 64183,
 's': 68267,
 't': 62494,
 'u': 29387,
 'v': 9252,
 'w': 5938,
 'x': 2555,
 'y': 17514,
 'z': 7321}
In [41]:
values = list(d.values())
In [24]:
values.sort(reverse = True)
In [34]:
revd = {}
for k in d:
    revd[d[k]] = k
In [35]:
s = ""
for v in values:
    s += revd[v]
In [36]:
s
Out[36]:
'ardvkswolfebcitungmhyxjzpq'
In [29]:
%matplotlib inline
import matplotlib.pyplot as plt
import string
In [38]:
plt.bar(range(len(d.values())), values)
plt.xticks(range(len(d.values())), list(string.ascii_lowercase))
Out[38]:
([<matplotlib.axis.XTick at 0x10c0b9358>,
  <matplotlib.axis.XTick at 0x10be48518>,
  <matplotlib.axis.XTick at 0x10c05de10>,
  <matplotlib.axis.XTick at 0x10c1f8f60>,
  <matplotlib.axis.XTick at 0x10c1ff978>,
  <matplotlib.axis.XTick at 0x10c207390>,
  <matplotlib.axis.XTick at 0x10c207d68>,
  <matplotlib.axis.XTick at 0x10c20c780>,
  <matplotlib.axis.XTick at 0x10c212198>,
  <matplotlib.axis.XTick at 0x10c212b70>,
  <matplotlib.axis.XTick at 0x10c217588>,
  <matplotlib.axis.XTick at 0x10c217f60>,
  <matplotlib.axis.XTick at 0x10c21e978>,
  <matplotlib.axis.XTick at 0x10c227390>,
  <matplotlib.axis.XTick at 0x10c227d68>,
  <matplotlib.axis.XTick at 0x10c22e780>,
  <matplotlib.axis.XTick at 0x10c235198>,
  <matplotlib.axis.XTick at 0x10c235b70>,
  <matplotlib.axis.XTick at 0x10c239588>,
  <matplotlib.axis.XTick at 0x10c239f60>,
  <matplotlib.axis.XTick at 0x10c240978>,
  <matplotlib.axis.XTick at 0x10c249390>,
  <matplotlib.axis.XTick at 0x10c249d68>,
  <matplotlib.axis.XTick at 0x10c24e780>,
  <matplotlib.axis.XTick at 0x10c255198>,
  <matplotlib.axis.XTick at 0x10c255b70>],
 <a list of 26 Text xticklabel objects>)
In [58]:
v2 = []
for c in string.ascii_lowercase:
    v2.append(d[c] / sum(d.values()))
In [59]:
plt.bar(range(len(d.values())), v2)
plt.xticks(range(len(d.values())), list(string.ascii_lowercase))
Out[59]:
([<matplotlib.axis.XTick at 0x11039de80>,
  <matplotlib.axis.XTick at 0x1103dcfd0>,
  <matplotlib.axis.XTick at 0x1101fd588>,
  <matplotlib.axis.XTick at 0x110531e48>,
  <matplotlib.axis.XTick at 0x110516860>,
  <matplotlib.axis.XTick at 0x110540278>,
  <matplotlib.axis.XTick at 0x110540c50>,
  <matplotlib.axis.XTick at 0x110546668>,
  <matplotlib.axis.XTick at 0x11054e080>,
  <matplotlib.axis.XTick at 0x11054ea58>,
  <matplotlib.axis.XTick at 0x110555470>,
  <matplotlib.axis.XTick at 0x110555e48>,
  <matplotlib.axis.XTick at 0x110558860>,
  <matplotlib.axis.XTick at 0x110561278>,
  <matplotlib.axis.XTick at 0x110561c50>,
  <matplotlib.axis.XTick at 0x110566668>,
  <matplotlib.axis.XTick at 0x11056f080>,
  <matplotlib.axis.XTick at 0x11056fa58>,
  <matplotlib.axis.XTick at 0x110576470>,
  <matplotlib.axis.XTick at 0x110576e48>,
  <matplotlib.axis.XTick at 0x11057b860>,
  <matplotlib.axis.XTick at 0x110582278>,
  <matplotlib.axis.XTick at 0x110582c50>,
  <matplotlib.axis.XTick at 0x110587668>,
  <matplotlib.axis.XTick at 0x110591080>,
  <matplotlib.axis.XTick at 0x110591a58>],
 <a list of 26 Text xticklabel objects>)
In [44]:
eyeopen = open("WheelOfTime/EyeOfTheWorld.txt")
eye = eyeopen.read().lower()
eyeopen.close()
In [45]:
eye[:100]
Out[45]:
'“the eye of the world is the best of its genre.”\n\n—the ottawa citizen\n\n\n\n“a splendid tale of heroic '
In [46]:
import re
eye = re.sub(r'[^\w\s]', '', eye)
In [48]:
eye[:100]
Out[48]:
'the eye of the world is the best of its genre\n\nthe ottawa citizen\n\n\n\na splendid tale of heroic fanta'
In [49]:
d2 = {}
for c in eye:
    d2[c] = d2.get(c, 0) + 1
In [60]:
v3 = []
for c in string.ascii_lowercase:
    v3.append(d2[c] / sum(d2.values()))
In [63]:
plt.bar(range(len(d.values())), v2, alpha=0.5, label="Word List")
plt.bar(range(len(d.values())), v3, alpha=0.3, label="Novel")
plt.xticks(range(len(d.values())), list(string.ascii_lowercase))
plt.title("Frequency of Letters")
plt.ylabel("percent")
plt.legend()
Out[63]:
<matplotlib.legend.Legend at 0x110ca0e10>
In [ ]: