Iterating over keys of a dictionary. (Note that typing
Counter?
in a cell gives a little tutorial, same for any defined function;
Counter??
gives the Python source code for the class definition.)
from collections import Counter
c=Counter(['a']*3+['b']*7+['c']*2)
c
Counter({'b': 7, 'a': 3, 'c': 2})
c.keys()
['a', 'c', 'b']
c.values()
[3, 2, 7]
c.items()
[('a', 3), ('c', 2), ('b', 7)]
for key in c: print key
#works the same as for key in c.keys():
a c b
mylist=c.keys()
for j in range(len(mylist)):
print mylist[j]
a c b
for list_item in mylist:
print list_item
a c b
Some "pythonic" programming style issues:
Instead of:
counts=n_s + n_p
thirty=[]
for i in counts.most_common():
if i[1] == 30:
thirty.append(i[0])
Shorter and ultimately easier to read:
#Counter inherits from dict, so iterating over counts iterates over its keys
thirty = [w for w in counts if counts[w] == 30]
And instead of c-style code:
j=0
while(j<10):
j = j + 1
...
can use:
for j in range(10):
Or for iterating over index:
j=0
while (j<len(mylist)):
if mylist[j] == ... # has some property
j += 1
can use:
for j in range(len(mylist)):
if mylist[j] == ... # has some property
but even better, iterate directly over list items:
for list_item in mylist:
if list_item == ...
(only very rarely is it ever necessary to iterate over indices of lists)
So instead of:
j=0
while(j<30):
if w in ptexts[j]:
...
if w in stexts[j]:
... [same thing]
j += 1
can use:
for txt in ptexts+stexts:
if w in txt: ...
#even more powerful ways of doing cuts on numpy arrays, e.g.
w=randint(0,100,50)
w[w<=46] #elements of w less than or equal to 46
array([22, 38, 15, 38, 2, 4, 14, 28, 46, 5, 22, 9, 24, 22, 12, 15, 46, 3, 28, 19, 33, 10, 20])
A final note regarding dictionaries: if you use a dictionary to accumulate counts, then you can't increment a value that hasn't been defined, e.g.
counts = {}
counts['a'] += 1
will give a KeyError.
There are a few ways around this. For example with a simple test:
if 'a' not in counts: counts['a']=0
counts['a'] += 1
or use predefined dictionaries
from collections import defaultdict,Counter
counts = defaultdict(int)
counts['a'] += 1
the defaultdict()
automatically initializes missing keys using whatever callable you specify; in this case int
makes values default to 0, but the factory could be any callable, e.g. defaultdict(list) defaults to an empty list
, and so on. Or of course a Counter()
also acts like a defaultdict(int)
but with additional methods (like .most_common()
):
counts = Counter()
counts['a'] += 1
Now some quick text analysis examples following directly this resource: https://www.inferentialthinking.com/chapters/01/3/plotting-the-classics.html
from urllib2 import urlopen
%pylab inline
#load to top level namespace, avoiding plt. and np. for convenience
Populating the interactive namespace from numpy and matplotlib
# huckleberry finn
hf_txt=urlopen('http://www.gutenberg.org/files/76/76-0.txt').read().decode('utf-8')
hf_chpts=hf_txt.split('CHAPTER ')[44:]
for chpt in hf_chpts[:10]: print chpt[:60].replace('\n',' ').replace('\r','')
I. YOU don’t know about me without you have read a book b II. WE went tiptoeing along a path amongst the trees back III. WELL, I got a good going-over in the morning from ol IV. WELL, three or four months run along, and it was well V. I had shut the door to. Then I turned around and ther VI. WELL, pretty soon the old man was up and around again VII. “GIT up! What you ‘bout?” I opened my eyes and l VIII. THE sun was up so high when I waked that I judged i IX. I wanted to go and look at a place right about the mi X. AFTER breakfast I wanted to talk about the dead man an
hf_names=('Jim','Tom','Huck')
hf_namecounts={name: [chpt.count(name) for chpt in hf_chpts] for name in hf_names}
#https://www.inferentialthinking.com/chapters/01/3/1/literary-characters.html
xdata=1+arange(len(hf_chpts))
for name in hf_names:
plot (xdata,cumsum(hf_namecounts[name]),label=name)
title('cumulative # times name occurs')
xlabel('Chapter')
legend(loc='upper left')
grid('on')
# little women
lw_txt=urlopen('http://www.gutenberg.org/cache/epub/514/pg514.txt').read().decode('utf-8')
lw_chpts=lw_txt.split('CHAPTER ')[1:]
for chpt in lw_chpts[:10]: print chpt[:60].replace('\n',' ').replace('\r','')
ONE PLAYING PILGRIMS "Christmas won't be Christmas wit TWO A MERRY CHRISTMAS Jo was the first to wake in the THREE THE LAURENCE BOY "Jo! Jo! Where are you?" crie FOUR BURDENS "Oh, dear, how hard it does seem to take FIVE BEING NEIGHBORLY "What in the world are you going SIX BETH FINDS THE PALACE BEAUTIFUL The big house did SEVEN AMY'S VALLEY OF HUMILIATION "That boy is a perfe EIGHT JO MEETS APOLLYON "Girls, where are you going?" NINE MEG GOES TO VANITY FAIR "I do think it was the mo TEN THE P.C. AND P.O. As spring came on, a new set of
lw_names=('Amy','Beth','Jo','Meg','Laurie')
lw_namecounts={name: [chpt.count(name) for chpt in lw_chpts] for name in lw_names}
#https://www.inferentialthinking.com/chapters/01/3/1/literary-characters.html
xdata=1+arange(len(lw_chpts))
for name in lw_names:
plot (xdata,cumsum(lw_namecounts[name]),label=name)
title('cumulative # times name occurs')
xlabel('Chapter')
legend(loc='upper left')
grid('on')
hf_pcount= [(chpt.count('.'),len(chpt)) for chpt in hf_chpts]
lw_pcount = [(chpt.count('.'),len(chpt)) for chpt in lw_chpts]
for num_per,num_chars in hf_pcount[:10]: print num_chars,num_per
print
for num_per,num_chars in lw_pcount[:10]: print num_chars,num_per
7206 66 12353 117 8766 72 7029 84 8443 91 14910 125 13586 127 22935 249 8301 71 7234 70 22408 189 22827 188 21295 231 26165 195 24199 255 15016 140 14851 131 23144 214 34756 337 20150 185
#https://www.inferentialthinking.com/chapters/01/3/2/another-kind-of-character.html
figure(figsize=(6,6))
scatter(*zip(*hf_pcount))
scatter(*zip(*lw_pcount),color='orange')
xlabel('# periods in chpt')
ylabel('# chars in chpt')
xlim(0,500)
legend(['huck finn','little women'],loc='upper left')
xdata=arange(50,500,400)
#plot(xdata,hf_slope*xdata+hf_int,'darkblue')
#plot(xdata,lw_slope*xdata+lw_int,'orange')
grid('on')
from scipy.stats import linregress
hf_slope,hf_int=linregress(*zip(*hf_pcount))[:2]
lw_slope,lw_int=linregress(*zip(*lw_pcount))[:2]
#Now try Shakespeare, see
#http://www.gutenberg.org/ebooks/100
#http://www.gutenberg.org/cache/epub/100/pg100.txt
sh_txt=urlopen('http://www.gutenberg.org/cache/epub/100/pg100.txt').read().decode('utf-8')
sh_txt.count('Romeo')
155
sh_txt.find('1609')
#where the first sonnets start
7596
sh_words=sh_txt[7596:].lower().split()
len(sh_words)
902892
#finally plot the #occurrences of words against their rank as in ps2,
#should again be a -1 power law, i.e., -1 slope in log-log coords
#good fit from about rank 20 to 10,000
# vocab of 59605 distinct words