import math
import shelve

import book_classification as bc
import matplotlib.pyplot as plt
import pandas

# Load the pickled book collection and flatten it into a DataFrame.
myShelf = shelve.open("storage_new.db")
aBookCollection = myShelf['aBookCollection']
aDataFrame = aBookCollection.as_dataframe()
myShelf.close()

# Summary statistics for the first two columns.
aDataFrame.iloc[:, [0, 1]].describe()

# Books per author, sorted, as a bar chart.
aDataFrame.groupby('Author').size().sort_values().plot(kind='bar', figsize=(15, 6))
#aDataFrame.groupby('Author').size().plot(kind='kde', figsize=(6, 5))

# Distribution of books-per-author counts.
aDataFrame.groupby('Author').size().hist()

tokenizer = bc.BasicTokenizer()
aBookAnalysis = bc.BookCollectionAnalysis(aBookCollection, tokenizer)

# Vocabulary size per book, smallest to largest.
aBookAnalysis.vocabulary_size_by_book().set_index('Book').sort_values('Unique words').plot()

# Vocabulary size per author, smallest to largest.
dataframe = aBookAnalysis.vocabulary_size_by_author().set_index('Author').sort_values('Unique words')
dataframe.plot(kind='bar', figsize=(15, 6))

# How many words are shared by n authors / n books (log10 scale).
pandas.Series(aBookAnalysis.shared_words_by_authors()).apply(math.log10).plot(figsize=(6, 4))
pandas.Series(aBookAnalysis.shared_words_by_books()).apply(math.log10).plot(figsize=(8, 4))

# Cumulative versions of the same curves.
pandas.Series(aBookAnalysis.shared_words_by_authors()).cumsum().apply(math.log).plot()
pandas.Series(aBookAnalysis.shared_words_by_books()).cumsum().apply(math.log).plot()

# Per-book vocabulary as a fraction of the total vocabulary.
vocabularySizes = (aBookAnalysis.vocabulary_size_by_book()['Unique words']
                   / len(aBookAnalysis.vocabulary().total()))
vocabularySizes.hist(bins=100, figsize=(10, 5))
#vocabularySizes.plot(kind='kde')
print(vocabularySizes.mean())

# Word frequencies over the whole collection, and per-word entropies
# computed over fixed-size groups of 500 tokens.
frequenciesExtractor = bc.FrequenciesExtractor(tokenizer)
entropiesExtractor = bc.EntropiesExtractor(tokenizer, bc.FixedGrouper(500))
frequencies = bc.CollectionHierarchialFeatures.from_book_collection(aBookCollection, frequenciesExtractor)
entropies = bc.CollectionHierarchialFeatures.from_book_collection(aBookCollection, entropiesExtractor)

# One (log frequency, entropy) pair per vocabulary word, sorted by frequency.
df_input = []
for word in aBookAnalysis.vocabulary().total().keys():
    df_input.append([math.log(frequencies.total()[word]), entropies.total()[word]])
df_input.sort()
entropies_vs_frequencies = pandas.DataFrame(df_input, columns=["Frequencies", "Entropies"])

# Marginal densities of both features.
entropies_vs_frequencies.plot(kind='kde', figsize=(8, 8), subplots=True, sharex=False)
#entropies_vs_frequencies["Entropies"].plot(figsize=(12, 4))

# Entropy of each word, in increasing-frequency order: the full range,
# then two zooms into the high-frequency tail.
l = len(entropies_vs_frequencies["Entropies"])

plt.figure(figsize=(12, 5))
plt.axis([0, l, 0, 1])
plt.scatter(range(l), entropies_vs_frequencies["Entropies"], s=1, alpha=0.05)

plt.figure(figsize=(12, 5))
plt.axis([140000, l, 0, 1])
plt.scatter(range(l), entropies_vs_frequencies["Entropies"], s=1, alpha=0.2)

plt.figure(figsize=(12, 5))
plt.axis([130000, 150000, 0, 1])
plt.scatter(range(l), entropies_vs_frequencies["Entropies"], s=1, alpha=0.2)

# TODO: get a decent density plot of x=freq, y=entr with a log color map
# (see the sketch at the end of this file)
#plt.figure(figsize=(10, 10))
#plt.scatter(entropies_vs_frequencies["Frequencies"], entropies_vs_frequencies["Entropies"])

# Absolute entropy jumps between consecutive words (log-scaled counts).
entropies_vs_frequencies["Entropies"].diff().dropna().abs().hist(log=True)
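
# One possible answer to the TODO above, not part of the original analysis:
# a minimal sketch of a frequency-vs-entropy density plot using matplotlib's
# hist2d with a logarithmic color map. It assumes entropies_vs_frequencies
# was built as above; bin count and figure size are arbitrary choices.
from matplotlib.colors import LogNorm

plt.figure(figsize=(8, 8))
plt.hist2d(entropies_vs_frequencies["Frequencies"],
           entropies_vs_frequencies["Entropies"],
           bins=200, norm=LogNorm())  # LogNorm masks empty bins
plt.xlabel("log frequency")
plt.ylabel("entropy")
plt.colorbar(label="words per bin (log color scale)")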