In this section we'll explore the characteristics of the sample and determine possible features to use in the classifier later.
import book_classification as bc
import shelve
import pandas
import numpy
import matplotlib.pyplot as plt
# Load the stored book collection from the shelf file.  The context manager
# closes the shelf deterministically instead of relying on `del` (and garbage
# collection) to release the underlying database file.
# NOTE: shelve is backed by pickle -- only open shelf files you trust.
with shelve.open("storage_new.db") as myShelf:
    aBookCollection = myShelf['aBookCollection']
Let's look at the word distribution across books.
# Feed the contents of every book in the collection to the feature analyzer.
tokenizer = bc.BasicTokenizer()
aPossibleFeatureAnalyzer = bc.PossibleFeatureAnalyzer.from_documents(
    tokenizer,
    (book.contents for book in aBookCollection),
)

# Tabular view of the analyzer; the 'Count' column is studied on its own below.
aDataFrame = aPossibleFeatureAnalyzer.as_dataframe()
countSeries = aDataFrame['Count']

# Summary statistics, then the skewness and kurtosis of the counts.
print(aDataFrame.describe())
print("Skewness: {}\nKurtosis: {}".format(
    countSeries.skew(),
    countSeries.kurt(),
))
Count Frequency count 161226.000000 1.612260e+05 mean 180.786678 6.202474e-06 std 6970.495073 2.391454e-04 min 1.000000 3.430824e-08 25% 1.000000 3.430824e-08 50% 3.000000 1.029247e-07 75% 15.000000 5.146237e-07 max 2191113.000000 7.517324e-02 Skewness: 230.19566383088508 Kurtosis: 65966.18551912217
It's not a well-shaped distribution: the skewness and kurtosis above are extreme. Here are a box plot and a density plot of the logarithm of the counts:
# Box plot of the natural log of the word counts.
# The default figure size is set through rcParams because matplotlib.pyplot
# has no `figsize` function, and the plot call goes through `plt` so it does
# not depend on pylab-mode names being injected into the namespace.
plt.rcParams['figure.figsize'] = (4, 5)
plt.boxplot(numpy.log(countSeries))
{'boxes': [<matplotlib.lines.Line2D at 0x7f7f27122c50>], 'caps': [<matplotlib.lines.Line2D at 0x7f7f27283d90>, <matplotlib.lines.Line2D at 0x7f7f2711d550>], 'fliers': [<matplotlib.lines.Line2D at 0x7f7f26dbf210>, <matplotlib.lines.Line2D at 0x7f7f26aa1790>], 'medians': [<matplotlib.lines.Line2D at 0x7f7f27278790>], 'whiskers': [<matplotlib.lines.Line2D at 0x7f7f2727a810>, <matplotlib.lines.Line2D at 0x7f7f27282310>]}
# Kernel density estimate of the natural log of the word counts.
# The default figure size is set through rcParams because matplotlib.pyplot
# has no `figsize` function.
plt.rcParams['figure.figsize'] = (6, 4)
numpy.log(countSeries).plot(kind='kde')
<matplotlib.axes.AxesSubplot at 0x7f7f26a01ed0>
We can see that the words at the extremes are either very rare words or stopwords.
# Order the rows by occurrence count to inspect both ends of the table.
# sort_values(by=...) is the supported pandas API; DataFrame.sort(columns=...)
# has been removed.
df = aDataFrame.sort_values(by='Count')
print(df.head(5))  # lowest counts
print(df.tail(5))  # highest counts
print(len(df))     # total number of rows
Count Frequency Word 80612 1 3.430824e-08 unwelcomely 114598 1 3.430824e-08 heterozygous 51599 1 3.430824e-08 myoides 51600 1 3.430824e-08 anglicé 114596 1 3.430824e-08 yasnaya Count Frequency Word 136836 355794 0.012207 his 60894 398451 0.013670 was 46061 472727 0.016218 that 44419 1178084 0.040418 and 50522 2191113 0.075173 the 161226
# Plot the natural log of the word count at the evenly spaced quantile levels
# 0, 1/resolution, ..., (resolution - 1)/resolution.
plt.rcParams['figure.figsize'] = (10, 5)  # pyplot has no `figsize` function
resolution = 100
quantileLevels = [x / resolution for x in range(resolution)]
# All quantiles are computed in one vectorized call; numpy.log is used because
# numpy is imported by this file while `math` is not.
plt.plot(quantileLevels, numpy.log(countSeries.quantile(quantileLevels).values))
[<matplotlib.lines.Line2D at 0x7f7f0ae3e8d0>]
Now we'll remove some of the data.
# Apply two pruning steps in sequence, then repeat the summary statistics on
# what remains.
# NOTE(review): presumably this drops the 20 most frequent words and every word
# seen fewer than 500 times -- confirm against book_classification.
anotherPossibleFeatureAnalyzer = (
    aPossibleFeatureAnalyzer
    .prune_last_words(20)
    .prune_less_occurrences_than(500)
)
anotherDataFrame = anotherPossibleFeatureAnalyzer.as_dataframe()
anotherCountSeries = anotherDataFrame['Count']

print(anotherDataFrame.describe())
print("Skewness: {}\nKurtosis: {}".format(
    anotherCountSeries.skew(),
    anotherCountSeries.kurt(),
))
Count Frequency count 5473.000000 5473.000000 mean 3233.204458 0.000183 std 8165.130393 0.000461 min 500.000000 0.000028 25% 718.000000 0.000041 50% 1178.000000 0.000067 75% 2390.000000 0.000135 max 123844.000000 0.006999 Skewness: 7.597174876456923 Kurtosis: 74.41778635368166
# Order the pruned table by occurrence count to inspect both ends of it.
# sort_values(by=...) is the supported pandas API; DataFrame.sort(columns=...)
# has been removed.
df = anotherDataFrame.sort_values(by='Count')
print(df.head(5))  # lowest remaining counts
print(df.tail(5))  # highest remaining counts
print(len(df))     # total number of rows
Count Frequency Word 834 500 0.000028 designs 2631 500 0.000028 obviously 1622 500 0.000028 holland 2193 500 0.000028 horseback 2224 500 0.000028 withdrawn Count Frequency Word 3340 107906 0.006098 are 5149 117655 0.006649 there 4095 119119 0.006732 one 593 120227 0.006794 said 2913 123844 0.006999 were 5473
# Kernel density estimate of the natural log of the pruned counts.
numpy.log(anotherCountSeries).plot(kind='kde')
<matplotlib.axes.AxesSubplot at 0x7f7f0add4550>
# Histogram of the pruned counts: 100 bins, with the `log` option enabled.
df.Count.hist(bins=100, log=True)
<matplotlib.axes.AxesSubplot at 0x7f7f0ad78a90>
# Box plot of the natural log of the pruned word counts.
# The default figure size is set through rcParams because matplotlib.pyplot
# has no `figsize` function, and the plot call goes through `plt` so it does
# not depend on pylab-mode names being injected into the namespace.
plt.rcParams['figure.figsize'] = (4, 5)
plt.boxplot(numpy.log(df['Count']))
{'boxes': [<matplotlib.lines.Line2D at 0x7f7f0ab8e050>], 'caps': [<matplotlib.lines.Line2D at 0x7f7f0abac210>, <matplotlib.lines.Line2D at 0x7f7f0abac910>], 'fliers': [<matplotlib.lines.Line2D at 0x7f7f0ab8ee50>, <matplotlib.lines.Line2D at 0x7f7f0ab95590>], 'medians': [<matplotlib.lines.Line2D at 0x7f7f0ab8e750>], 'whiskers': [<matplotlib.lines.Line2D at 0x7f7f0ab9e690>, <matplotlib.lines.Line2D at 0x7f7f0ab9ea50>]}
# Plot the natural log of the pruned word count at the quantile levels
# 0, 1/40, ..., 39/40.
prunedQuantileSteps = 40
prunedQuantileLevels = [x / prunedQuantileSteps for x in range(prunedQuantileSteps)]
# All quantiles are computed in one vectorized call; numpy.log is used because
# numpy is imported by this file while `math` is not, and the call goes
# through `plt` rather than a bare pylab-mode `plot`.
plt.plot(
    prunedQuantileLevels,
    numpy.log(anotherCountSeries.quantile(prunedQuantileLevels).values),
)
[<matplotlib.lines.Line2D at 0x7f7f0ab23b50>]
In this section we'll look at word entropies.
# Compute one TokenEntropies object per book: the book's contents are
# tokenized, the tokens are split into parts by the grouper, and the parts are
# handed to TokenEntropies.from_parts.
# NOTE(review): 500 is presumably the number of tokens per part -- confirm
# against bc.BasicGrouper.
tokenizer = bc.BasicTokenizer()
grouper = bc.BasicGrouper(500)
entropies = {
    book: bc.TokenEntropies.from_parts(
        grouper.parts_from(tokenizer.tokens_from(book.contents))
    )
    for book in aBookCollection
}
import functools

# Fold the per-book entropy objects into a single one by repeatedly combining
# the running result with the next book's entropies.
total_entropy = functools.reduce(
    lambda merged, current: merged.combine(current),
    entropies.values(),
)

# One row per word kept by the pruned analyzer: the word, its combined
# entropy, and its count divided by the analyzer's total.
dfEntropies = pandas.DataFrame(
    [
        [word, total_entropy[word], count / anotherPossibleFeatureAnalyzer._total]
        for word, count in anotherPossibleFeatureAnalyzer._counts.items()
    ],
    columns=['Word', 'Entropy', 'Frequency'],
)
# Histogram of the per-word entropies: 30 bins, with the `log` option enabled.
dfEntropies['Entropy'].hist(bins=30, log=True)
<matplotlib.axes.AxesSubplot at 0x7f7f0ab2c790>
# Plot the entropy value at the quantile levels 0, 1/20, ..., 19/20.
entropyQuantileSteps = 20
entropyQuantileLevels = [x / entropyQuantileSteps for x in range(entropyQuantileSteps)]
# All quantiles are computed in one vectorized call, and plotting goes through
# `plt` rather than a bare pylab-mode `plot`.
plt.plot(
    entropyQuantileLevels,
    dfEntropies.Entropy.quantile(entropyQuantileLevels).values,
)
[<matplotlib.lines.Line2D at 0x7f7f0aa19650>]
# Build one row per word kept by the pruned analyzer:
#   Word    -- the word itself
#   Freq    -- its count divided by the analyzer's total
#   Entropy -- its value in the combined entropies
#   Both    -- Freq divided by Entropy
# NOTE(review): if an entropy can be exactly zero and is a plain float, the
# 'Both' division raises ZeroDivisionError -- confirm what TokenEntropies
# returns.
data = pandas.DataFrame(
    [
        [
            word,
            count / anotherPossibleFeatureAnalyzer._total,
            total_entropy[word],
            count / anotherPossibleFeatureAnalyzer._total / total_entropy[word],
        ]
        for word, count in anotherPossibleFeatureAnalyzer._counts.items()
    ],
    columns=['Word', 'Freq', 'Entropy', 'Both'],
)

# sort_values(by=...) is the supported pandas API; DataFrame.sort(columns=...)
# has been removed.
blah = data.sort_values(by='Both')
print(blah.head(20))  # the 20 rows with the smallest Freq/Entropy ratio

# Distribution of the entropies, plotted through `plt` rather than a bare
# pylab-mode `boxplot`.
plt.boxplot(blah.Entropy)
Word Freq Entropy Both 2631 obviously 0.000028 0.562110 0.000050 2224 withdrawn 0.000028 0.562110 0.000050 1272 persisted 0.000028 0.563151 0.000050 12 intently 0.000028 0.564123 0.000050 2193 horseback 0.000028 0.560437 0.000050 2197 pathetic 0.000028 0.560944 0.000050 4614 merchantibility 0.000029 0.567355 0.000051 2629 forlorn 0.000028 0.560729 0.000051 554 protected 0.000028 0.562806 0.000051 4692 crisis 0.000028 0.560382 0.000051 4342 brightly 0.000029 0.564586 0.000051 1709 planned 0.000028 0.561172 0.000051 555 asserted 0.000028 0.561960 0.000051 1826 examining 0.000028 0.558461 0.000051 3949 perpetual 0.000028 0.557920 0.000051 528 deepest 0.000029 0.564272 0.000051 1209 perished 0.000028 0.559317 0.000051 2819 gorgeous 0.000029 0.562434 0.000051 1528 pleaded 0.000029 0.562184 0.000051 4815 boyish 0.000028 0.557615 0.000051
{'boxes': [<matplotlib.lines.Line2D at 0x7f7f0a96f390>], 'caps': [<matplotlib.lines.Line2D at 0x7f7f0a969550>, <matplotlib.lines.Line2D at 0x7f7f0a969c50>], 'fliers': [<matplotlib.lines.Line2D at 0x7f7f0a9731d0>, <matplotlib.lines.Line2D at 0x7f7f0a9738d0>], 'medians': [<matplotlib.lines.Line2D at 0x7f7f0a96fa90>], 'whiskers': [<matplotlib.lines.Line2D at 0x7f7f0a9669d0>, <matplotlib.lines.Line2D at 0x7f7f0a966d90>]}