#!/usr/bin/env python
# coding: utf-8

# In[1]:

import pickle
import nltk

stats = pickle.load(open("/media/storage/dpla-data/pickles/new/newstats.p", "rb"))
common = pickle.load(open("/media/storage/dpla-data/pickles/new/common.p", "rb"))
searcom = pickle.load(open("/media/storage/dpla-data/pickles/new/sear_common.p", "rb"))
searfilt = pickle.load(open("/media/storage/dpla-data/pickles/new/searches_filtered.p", "rb"))

# In[2]:

stats

# In[3]:

import pandas as pd

df = pd.DataFrame(stats)
# One column per DPLA provider; the blank ' ' name is kept as-is from the source data.
df.columns = ['ARTstor', 'Biodiversity Heritage Library', 'Digital Commonwealth',
              'Digital Library of Georgia', 'J. Paul Getty Trust',
              'United States Government Printing Office (GPO)', 'Harvard Library',
              'Internet Archive', 'University of Illinois at Urbana-Champaign',
              'Kentucky Digital Library', 'Minnesota Digital Library', 'Missouri Hub',
              'Mountain West Digital Library', 'National Archives and Records Administration',
              'North Carolina Digital Heritage Center', ' ', 'David Rumsey',
              'Smithsonian Institution', 'South Carolina Digital Library',
              'The Portal to Texas History', 'University of Southern California. Libraries',
              'University of Virginia Library']
df.T
df.T.to_csv("nltk.stats.csv")

# In[8]:

from IPython.display import display

# DataFrame.sort() was removed in pandas 0.20; sort_values() is the current equivalent.
display(pd.melt(df.T.reset_index(), id_vars=['index']).sort_values('index'))

# In[9]:

pd.melt(df.T.reset_index(), id_vars=['index']).sort_values('index').to_csv('nltk.stats.melted.tmp.csv')

# In[4]:

# Modeled on the NLTK book example:
# fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
fd = nltk.FreqDist(token.lower() for token in searfilt)
fd.most_common()

# In[7]:

import pickle
import nltk

vap = pickle.load(open("/media/storage/dpla-data/pickles/virginia.p", "rb"))
# Modeled on the NLTK book example:
# fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
vafd = nltk.FreqDist(token.lower() for token in vap['virginia']['filtered'])
vafd.most_common()

# In[ ]:

# In[ ]:

colls = ["artstor", "biodiv", "rumsey", "commonwealth", "georgia", "harvard",
         "ia", "getty", "kentucky", "minnesota", "missouri", "mwdl",
         "nara", "nocar", "smiths", "socar", "texas", "gpo", "illinois",
         "usc", "virginia", "nocoll"]

import pickle

for c in colls:
    #p = pickle.load(open("/media/storage/dpla-data/pickles/" + c + ".p", "rb"))
    p = pickle.load(open("C:/Users/charper/dpla-temp/pickles/" + c + ".p", "rb"))
    print("\nGathering Stats for " + c)
    stats = p[c]['stats']
    print(stats)
    # Multiply by 100 so the printed figure is actually a percentage,
    # matching the "%" label.
    print("percent unique:")
    print(round(100 * p[c]['stats']['uniq'] / p[c]['stats']['wc'], 5), "%")
    print("filtered percent unique:")
    print(round(100 * p[c]['stats']['funiq'] / p[c]['stats']['fwc'], 5), "%")
    print("*********")

# In[8]:

#type(fd)
#haps = fd.hapaxes()
len(vafd.hapaxes())

# In[9]:

# Collect words longer than ten characters, keeping their frequencies.
longwords = {}
for k, v in vafd.items():
    if len(k) > 10:
        longwords[k] = v

# In[25]:

longwords

# In[ ]:

from nltk.collocations import *

# Note: `filtered` (a dict of filtered token lists keyed by collection) is not
# defined in this notebook; it is assumed to be loaded from an earlier session.
finder = BigramCollocationFinder.from_words(filtered['ia']['filtered'])
bigram_measures = nltk.collocations.BigramAssocMeasures()
scored = finder.score_ngrams(bigram_measures.raw_freq)
sorted(bigram for bigram, score in scored)

# In[ ]:

# This thing here just hangs forevs. I wonder if it's possible to do it outside the notebook?
from nltk.collocations import *

bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
# Note: `filteredgpo` (the filtered GPO token list) is likewise assumed from an
# earlier session.
finder = BigramCollocationFinder.from_words(filteredgpo)
finder.nbest(bigram_measures.pmi, 10)
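# In[ ]:

# A minimal sketch of one way the hang above might be avoided: applying a
# frequency filter before ranking prunes the enormous tail of one-off bigrams
# that PMI otherwise has to score. This is an assumption about the bottleneck,
# not a confirmed diagnosis. sample_tokens is a hypothetical stand-in; swap in
# filteredgpo (or any filtered token list) for a real run.
import nltk
from nltk.collocations import BigramCollocationFinder

sample_tokens = ["digital", "public", "library", "of", "america",
                 "digital", "public", "library", "digital", "library"]

bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(sample_tokens)
finder.apply_freq_filter(2)  # drop bigrams seen fewer than 2 times
print(finder.nbest(bigram_measures.pmi, 10))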
# In[11]:

import pickle
import nltk

colls = ["biodiv", "rumsey", "commonwealth", "georgia", "harvard",
         "ia", "getty", "kentucky", "minnesota", "missouri", "mwdl",
         "nara", "nocar", "smiths", "socar", "texas", "gpo", "illinois",
         "usc", "virginia", "nocoll"]
#colls = ["biodiv"]

# Start from the ARTstor frequency distribution, then fold every other
# collection's FreqDist into fd so it becomes a DPLA-wide distribution,
# while fds keeps each collection's distribution separately.
fd = pickle.load(open("/media/storage/dpla-data/pickles/new/artstor_fd.p", "rb"))
fds = {}
# Store a copy: fd is mutated by update() below, and a bare assignment would
# silently turn fds['artstor'] into the combined distribution.
fds['artstor'] = fd.copy()
print(len(fd))
for coll in colls:
    tmp = pickle.load(open("/media/storage/dpla-data/pickles/new/" + coll + "_fd.p", "rb"))
    print("updating FD with " + coll)
    fds[coll] = tmp
    fd.update(tmp)
    print(len(fd))

# In[12]:

fd.most_common()

# In[13]:

len(fd.hapaxes())

# In[14]:

fd.hapaxes()

# In[15]:

def hasNumbers(inputString):
    """Return True if the string contains at least one digit."""
    return any(char.isdigit() for char in inputString)

# In[16]:

# Keep only the hapaxes that contain no digits.
texthaps = []
for hap in fd.hapaxes():
    if not hasNumbers(hap):
        texthaps.append(hap)
len(texthaps)

# In[17]:

len(fd.hapaxes())

# In[18]:

texthaps

# In[59]:

colltexthaps = {}
# Copy the list before appending: a bare assignment would alias colls, so
# appending 'artstor' would also mutate the original list.
colls2 = list(colls)
colls2.append('artstor')
for coll in colls2:
    colltexthaps[coll] = []
    for hap in fds[coll].hapaxes():
        if not hasNumbers(hap):
            #if hap in texthaps:
            colltexthaps[coll].append(hap)
    print(coll, "|", str(len(colltexthaps[coll])))

# In[ ]:
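# In[ ]:

# A sketch of the intersection test commented out in the cell above: checking
# each collection's number-free hapaxes against the DPLA-wide texthaps list.
# Membership tests against a plain list are O(n) per lookup, which may be why
# that line was abandoned; converting texthaps to a set first makes the same
# check effectively O(1). Assumes fds, texthaps, colls2, and hasNumbers from
# the cells above; the globaltexthaps name is just illustrative.
texthap_set = set(texthaps)

globaltexthaps = {}
for coll in colls2:
    # Keep only number-free hapaxes that are also hapaxes corpus-wide.
    globaltexthaps[coll] = [hap for hap in fds[coll].hapaxes()
                            if not hasNumbers(hap) and hap in texthap_set]
    print(coll, "|", str(len(globaltexthaps[coll])))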