import pickle
import nltk
def _load_pickle(path):
    """Return the object unpickled from *path*, closing the file afterwards.

    pickle can execute arbitrary code while loading, so only point this at
    files that come from a trusted source.
    """
    with open(path, "rb") as handle:
        return pickle.load(handle)


# Per-provider statistics: a dict of dicts keyed by short provider name, each
# holding the counters 'funiq', 'fwc', 'haps', 'lowerhaps', 'uniq' and 'wc'.
stats = _load_pickle("/media/storage/dpla-data/pickles/new/newstats.p")
# NOTE(review): the contents of `common` and `searcom` are not inspected in
# this part of the notebook -- confirm their structure before relying on them.
common = _load_pickle("/media/storage/dpla-data/pickles/new/common.p")
searcom = _load_pickle("/media/storage/dpla-data/pickles/new/sear_common.p")
# Iterable of search tokens; it is lower-cased and counted further down.
searfilt = _load_pickle("/media/storage/dpla-data/pickles/new/searches_filtered.p")
# Bare expression: shows the dict as the notebook cell's output.
stats
{'artstor': {'funiq': 60168, 'fwc': 5025070, 'haps': 29757, 'lowerhaps': 24103, 'uniq': 60293, 'wc': 6972534}, 'biodiv': {'funiq': 94248, 'fwc': 5658739, 'haps': 44804, 'lowerhaps': 38471, 'uniq': 94372, 'wc': 6381376}, 'commonwealth': {'funiq': 204577, 'fwc': 11348522, 'haps': 159095, 'lowerhaps': 154009, 'uniq': 204703, 'wc': 14022356}, 'georgia': {'funiq': 150863, 'fwc': 32656431, 'haps': 89668, 'lowerhaps': 79492, 'uniq': 150990, 'wc': 42031491}, 'getty': {'funiq': 54355, 'fwc': 14251103, 'haps': 11663, 'lowerhaps': 9767, 'uniq': 54474, 'wc': 18732730}, 'gpo': {'funiq': 437646, 'fwc': 21860075, 'haps': 351637, 'lowerhaps': 343619, 'uniq': 437770, 'wc': 26316103}, 'harvard': {'funiq': 35918, 'fwc': 849987, 'haps': 20447, 'lowerhaps': 18025, 'uniq': 36036, 'wc': 968898}, 'ia': {'funiq': 502974, 'fwc': 16996418, 'haps': 394559, 'lowerhaps': 378206, 'uniq': 503101, 'wc': 23288038}, 'illinois': {'funiq': 49018, 'fwc': 1829267, 'haps': 29755, 'lowerhaps': 23849, 'uniq': 49143, 'wc': 2385501}, 'kentucky': {'funiq': 30374, 'fwc': 6800530, 'haps': 14090, 'lowerhaps': 11338, 'uniq': 30498, 'wc': 9405279}, 'minnesota': {'funiq': 43666, 'fwc': 3598870, 'haps': 21112, 'lowerhaps': 17674, 'uniq': 43791, 'wc': 4495075}, 'missouri': {'funiq': 119586, 'fwc': 3542143, 'haps': 90859, 'lowerhaps': 85222, 'uniq': 119713, 'wc': 4256929}, 'mwdl': {'funiq': 793849, 'fwc': 87424176, 'haps': 542772, 'lowerhaps': 504876, 'uniq': 793976, 'wc': 111155337}, 'nara': {'funiq': 1082133, 'fwc': 54355031, 'haps': 990235, 'lowerhaps': 978968, 'uniq': 1082259, 'wc': 65649116}, 'nocar': {'funiq': 258024, 'fwc': 27360155, 'haps': 165815, 'lowerhaps': 157524, 'uniq': 258151, 'wc': 33487819}, 'nocoll': {'funiq': 1785, 'fwc': 4626, 'haps': 1307, 'lowerhaps': 1202, 'uniq': 1867, 'wc': 6192}, 'rumsey': {'funiq': 47343, 'fwc': 8825833, 'haps': 14682, 'lowerhaps': 12520, 'uniq': 47463, 'wc': 11667865}, 'smiths': {'funiq': 432279, 'fwc': 51922316, 'haps': 182337, 'lowerhaps': 157337, 'uniq': 432406, 'wc': 
59927374}, 'socar': {'funiq': 61687, 'fwc': 5809794, 'haps': 31055, 'lowerhaps': 25606, 'uniq': 61813, 'wc': 7138136}, 'texas': {'funiq': 855594, 'fwc': 72699549, 'haps': 245710, 'lowerhaps': 237998, 'uniq': 855720, 'wc': 88574895}, 'usc': {'funiq': 259523, 'fwc': 41542851, 'haps': 108852, 'lowerhaps': 94218, 'uniq': 259650, 'wc': 49296854}, 'virginia': {'funiq': 41374, 'fwc': 1790985, 'haps': 32902, 'lowerhaps': 31738, 'uniq': 41493, 'wc': 2248517}}
import pandas as pd
# Display name for every short provider key found in `stats`.  Renaming by key
# (rather than assigning a positional list to df.columns) keeps each label
# attached to the right provider regardless of how the columns are ordered.
_PROVIDER_LABELS = {
    'artstor': 'ARTstor',
    'biodiv': 'Biodiversity Heritage Library',
    'commonwealth': 'Digital Commonwealth',
    'georgia': 'Digital Library of Georgia',
    'getty': 'J. Paul Getty Trust',
    'gpo': 'United States Government Printing Office (GPO)',
    'harvard': 'Harvard Library',
    'ia': 'Internet Archive',
    'illinois': 'University of Illinois at Urbana-Champaign',
    'kentucky': 'Kentucky Digital Library',
    'minnesota': 'Minnesota Digital Library',
    'missouri': 'Missouri Hub',
    'mwdl': 'Mountain West Digital Library',
    'nara': 'National Archives and Records Administration',
    'nocar': 'North Carolina Digital Heritage Center',
    # Label deliberately kept as a single space, as in the original list.
    # NOTE(review): presumably records without a collection -- confirm.
    'nocoll': ' ',
    'rumsey': 'David Rumsey',
    'smiths': 'Smithsonian Institution',
    'socar': 'South Carolina Digital Library',
    'texas': 'The Portal to Texas History',
    'usc': 'University of Southern California. Libraries',
    'virginia': 'University of Virginia Library',
}

# Fail loudly on an unknown provider key, much as the positional assignment
# raised ValueError when the number of labels did not match.
_unlabelled = sorted(set(stats) - set(_PROVIDER_LABELS))
if _unlabelled:
    raise ValueError(
        "no display name for provider key(s): %s" % ", ".join(_unlabelled)
    )

# One column per provider, one row per statistic.  Columns are sorted by key
# first so the resulting order never depends on dict ordering.
df = pd.DataFrame(stats).sort_index(axis=1).rename(columns=_PROVIDER_LABELS)
# Bare expression: shows the provider-per-row view as the notebook cell output.
df.T
df.T.to_csv("nltk.stats.csv")
from IPython.display import display
# Long-format view: one row per (provider, statistic) pair, grouped by provider.
# DataFrame.sort() was removed in pandas 0.20; sort_values() replaces it.  The
# stable mergesort keeps the statistics in melt order within each provider.
display(
    pd.melt(df.T.reset_index(), id_vars=['index'])
    .sort_values('index', kind='mergesort')
)
index | variable | value | |
---|---|---|---|
15 | funiq | 1785 | |
37 | fwc | 4626 | |
59 | haps | 1307 | |
81 | lowerhaps | 1202 | |
103 | uniq | 1867 | |
125 | wc | 6192 | |
0 | ARTstor | funiq | 60168 |
22 | ARTstor | fwc | 5025070 |
44 | ARTstor | haps | 29757 |
66 | ARTstor | lowerhaps | 24103 |
88 | ARTstor | uniq | 60293 |
110 | ARTstor | wc | 6972534 |
1 | Biodiversity Heritage Library | funiq | 94248 |
23 | Biodiversity Heritage Library | fwc | 5658739 |
45 | Biodiversity Heritage Library | haps | 44804 |
67 | Biodiversity Heritage Library | lowerhaps | 38471 |
89 | Biodiversity Heritage Library | uniq | 94372 |
111 | Biodiversity Heritage Library | wc | 6381376 |
16 | David Rumsey | funiq | 47343 |
38 | David Rumsey | fwc | 8825833 |
60 | David Rumsey | haps | 14682 |
82 | David Rumsey | lowerhaps | 12520 |
104 | David Rumsey | uniq | 47463 |
126 | David Rumsey | wc | 11667865 |
2 | Digital Commonwealth | funiq | 204577 |
24 | Digital Commonwealth | fwc | 11348522 |
46 | Digital Commonwealth | haps | 159095 |
68 | Digital Commonwealth | lowerhaps | 154009 |
90 | Digital Commonwealth | uniq | 204703 |
112 | Digital Commonwealth | wc | 14022356 |
... | ... | ... | ... |
19 | The Portal to Texas History | funiq | 855594 |
41 | The Portal to Texas History | fwc | 72699549 |
63 | The Portal to Texas History | haps | 245710 |
85 | The Portal to Texas History | lowerhaps | 237998 |
107 | The Portal to Texas History | uniq | 855720 |
129 | The Portal to Texas History | wc | 88574895 |
5 | United States Government Printing Office (GPO) | funiq | 437646 |
27 | United States Government Printing Office (GPO) | fwc | 21860075 |
49 | United States Government Printing Office (GPO) | haps | 351637 |
71 | United States Government Printing Office (GPO) | lowerhaps | 343619 |
93 | United States Government Printing Office (GPO) | uniq | 437770 |
115 | United States Government Printing Office (GPO) | wc | 26316103 |
8 | University of Illinois at Urbana-Champaign | funiq | 49018 |
30 | University of Illinois at Urbana-Champaign | fwc | 1829267 |
52 | University of Illinois at Urbana-Champaign | haps | 29755 |
74 | University of Illinois at Urbana-Champaign | lowerhaps | 23849 |
96 | University of Illinois at Urbana-Champaign | uniq | 49143 |
118 | University of Illinois at Urbana-Champaign | wc | 2385501 |
20 | University of Southern California. Libraries | funiq | 259523 |
42 | University of Southern California. Libraries | fwc | 41542851 |
64 | University of Southern California. Libraries | haps | 108852 |
86 | University of Southern California. Libraries | lowerhaps | 94218 |
108 | University of Southern California. Libraries | uniq | 259650 |
130 | University of Southern California. Libraries | wc | 49296854 |
21 | University of Virginia Library | funiq | 41374 |
43 | University of Virginia Library | fwc | 1790985 |
65 | University of Virginia Library | haps | 32902 |
87 | University of Virginia Library | lowerhaps | 31738 |
109 | University of Virginia Library | uniq | 41493 |
131 | University of Virginia Library | wc | 2248517 |
132 rows × 3 columns
# Write the long-format table (one row per provider/statistic pair) to CSV.
# DataFrame.sort() was removed in pandas 0.20; sort_values() replaces it.  The
# stable mergesort keeps the statistics in melt order within each provider.
(
    pd.melt(df.T.reset_index(), id_vars=['index'])
    .sort_values('index', kind='mergesort')
    .to_csv('nltk.stats.melted.tmp.csv')
)
# Count how often each search token occurs, lower-casing every token first so
# that differently-cased spellings are tallied together.
# (Adapted from: nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha()))
fd = nltk.FreqDist()
fd.update(term.lower() for term in searfilt)
# Bare expression: lists every (token, count) pair, most frequent first.
fd.most_common()
[('war', 9316), ('history', 7940), ('new', 7432), ('county', 7030), ('georgia', 6108), ('university', 6024), ('american', 5883), ('library', 5799), ('john', 5015), ('world', 4293), ('women', 4092), ('york', 3972), ('civil', 3853), ('states', 3604), ('united', 3590), ('carolina', 3570), ('de', 3223), ('william', 3148), ('south', 3072), ('art', 2864), ('school', 2734), ('utah', 2726), ('city', 2682), ('north', 2581), ('state', 2563), ('james', 2517), ('public', 2490), ('family', 2335), ('c', 2308), ('boston', 2280), ('george', 2254), ('college', 2159), ('map', 2154), ('atlanta', 2126), ('america', 2103), ('national', 2032), ('thomas', 1979), ('virginia', 1950), ('great', 1883), ('charles', 1825), ('st', 1820), ('texas', 1818), ('ga', 1804), ('washington', 1801), ('california', 1801), ('african', 1779), ('robert', 1727), ('minnesota', 1722), ('rights', 1708), ('genealogy', 1702), ('book', 1680), ('digital', 1634), ('lat', 1617), ('church', 1616), ('black', 1614), ('massachusetts', 1588), ('henry', 1587), ('king', 1584), ('education', 1581), ('j', 1581), ('life', 1577), ('books', 1568), ('d', 1525), ('archives', 1522), ('maps', 1500), ('music', 1472), ('century', 1468), ('west', 1450), ('la', 1434), ('b', 1430), ('e', 1429), ('park', 1420), ('david', 1411), ('railroad', 1370), ('island', 1354), ('ohio', 1343), ('museum', 1335), ('photographs', 1328), ('house', 1314), ('chicago', 1304), ('children', 1289), ('h', 1288), ('social', 1287), ('street', 1254), ('science', 1250), ('san', 1244), ('society', 1214), ('illinois', 1206), ('f', 1205), ('lake', 1187), ('works', 1174), ('w', 1168), ('law', 1162), ('revolution', 1161), ('martin', 1157), ('indian', 1149), ('indians', 1139), ('river', 1139), ('english', 1132), ('americans', 1126), ('libraries', 1100), ('architecture', 1091), ('mary', 1089), ('l', 1088), ('m', 1084), ('ny', 1080), ('historical', 1079), ('paul', 1065), ('kentucky', 1053), ('michigan', 1044), ('records', 1042), ('joseph', 1040), ('fiction', 1040), ('high', 
1031), ('ii', 1020), ('r', 1019), ('center', 1009), ('37', 1007), ('government', 999), ('mass', 989), ('louis', 988), ('china', 983), ('act', 969), ('native', 966), ('996162679728116', 957), ('edward', 955), ('slavery', 941), ('vietnam', 930), ('company', 921), ('smith', 919), ('business', 919), ('lincoln', 916), ('literature', 899), ('death', 899), ('japanese', 897), ('woman', 894), ('language', 882), ('fire', 878), ('white', 876), ('day', 870), ('buildings', 865), ('richard', 860), ('pennsylvania', 853), ('chinese', 851), ('early', 845), ('spanish', 844), ('luther', 840), ('kennedy', 840), ('slave', 831), ('roosevelt', 826), ('ma', 816), ('brown', 812), ('general', 809), ('missouri', 809), ('design', 807), ('florida', 807), ('o', 806), ('frank', 806), ('collection', 805), ('first', 805), ('french', 802), ('child', 801), ('n', 797), ('pa', 796), ('management', 795), ('military', 792), ('old', 790), ('england', 781), ('lewis', 779), ('army', 774), ('southern', 769), ('mountain', 769), ('us', 767), ('los', 764), ('2', 762), ('u', 761), ('jackson', 759), ('labor', 755), ('indiana', 754), ('health', 752), ('bible', 752), ('immigration', 750), ('colorado', 749), ('ca', 747), ('mexico', 746), ('depression', 742), ('battle', 742), ('red', 740), ('union', 736), ('harvard', 730), ('lee', 728), ('baseball', 725), ('samuel', 723), ('man', 722), ('g', 717), ('nc', 716), ('1865', 716), ('east', 716), ('hill', 713), ('air', 710), ('p', 705), ('home', 705), ('image', 704), ('arizona', 702), ('research', 701), ('1', 701), ('alabama', 699), ('maine', 699), ('journal', 698), ('wisconsin', 695), ('technology', 693), ('people', 693), ('franklin', 692), ('tennessee', 689), ('men', 680), ('ancient', 680), ('fort', 679), ('theater', 672), ('little', 668), ('schools', 667), ('food', 665), ('france', 657), ('mn', 653), ('culture', 652), ('pictorial', 647), ('connecticut', 642), ('politics', 641), ('newspapers', 634), ('collections', 633), ('work', 626), ('1918', 625), ('francisco', 625), 
('smithsonian', 624), ('oregon', 622), ('charleston', 621), ('movement', 618), ('medical', 617), ('german', 614), ('travel', 613), ('special', 613), ('johnson', 608), ('germany', 607), ('portrait', 605), ('angeles', 602), ('1920', 602), ('elizabeth', 600), ('peter', 597), ('information', 596), ('gold', 595), ('rock', 595), ('1800', 594), ('scott', 591), ('1861', 588), ('1914', 586), ('computer', 585), ('1945', 585), ('mining', 585), ('harry', 585), ('benjamin', 584), ('suffrage', 581), ('kansas', 575), ('camp', 575), ('india', 574), ('magazine', 573), ('clark', 572), ('v', 570), ('power', 568), ('alexander', 566), ('human', 566), ('service', 565), ('iowa', 563), ('michael', 563), ('photography', 562), ('letter', 562), ('religion', 561), ('medicine', 560), ('building', 557), ('portraits', 555), ('department', 554), ('project', 554), ('van', 549), ('water', 547), ('1939', 546), ('hall', 546), ('trade', 546), ('mississippi', 544), ('valley', 541), ('industry', 541), ('co', 540), ('philadelphia', 539), ('london', 538), ('young', 537), ('arts', 537), ('etc', 537), ('poetry', 535), ('institute', 535), ('horse', 533), ('arthur', 532), ('japan', 531), ('frederick', 531), ('columbia', 530), ('20th', 529), ('political', 526), ('race', 525), ('minneapolis', 524), ('sports', 524), ('maryland', 523), ('jersey', 523), ('space', 522), ('report', 521), ('dr', 521), ('mark', 520), ('students', 520), ('deal', 516), ('administration', 514), ('soldiers', 513), ('jr', 512), ('yale', 512), ('road', 511), ('international', 511), ('jones', 510), ('independence', 509), ('herald', 508), ('1900', 507), ('farm', 507), ('colonial', 506), ('nevada', 504), ('irish', 501), ('jane', 501), ('modern', 500), ('newspaper', 500), ('dog', 500), ('psychology', 498), ('pacific', 498), ('jefferson', 497), ('salt', 496), ('williams', 493), ('va', 493), ('theory', 490), ('letters', 490), ('system', 490), ('fair', 490), ('saint', 489), ('group', 489), ('green', 487), ('land', 483), ('russian', 482), ('free', 
481), ('cherokee', 481), ('abraham', 481), ('time', 479), ('field', 478), ('adams', 475), ('guide', 475), ('africa', 474), ('british', 473), ('sc', 470), ('radio', 470), ('bill', 470), ('pictures', 470), ('beach', 469), ('central', 469), ('domain', 468), ('1940', 467), ('andrew', 464), ('walter', 462), ('louisiana', 460), ('theatre', 459), ('one', 458), ('francis', 453), ('girl', 452), ('hospital', 452), ('springs', 452), ('wilson', 449), ('police', 449), ('customs', 448), ('ford', 448), ('rumsey', 448), ('el', 447), ('stephen', 447), ('christian', 446), ('brooklyn', 445), ('albert', 443), ('jack', 443), ('shakespeare', 443), ('spain', 440), ('workers', 439), ('heritage', 438), ('wright', 437), ('hotel', 436), ('k', 433), ('immigrants', 433), ('engineering', 432), ('wwii', 432), ('media', 432), ('daniel', 431), ('19th', 429), ('gallery', 428), ('study', 428), ('two', 427), ('photos', 427), ('laws', 426), ('march', 425), ('periodicals', 425), ('y', 421), ('postcard', 420), ('philosophy', 419), ('montana', 419), ('squadron', 419), ('dance', 417), ('industrial', 416), ('western', 416), ('salem', 416), ('1950', 415), ('jewish', 415), ('hitler', 415), ('images', 414), ('president', 412), ('tom', 411), ('parks', 411), ('economics', 409), ('mexican', 409), ('gay', 409), ('al', 408), ('football', 408), ('creek', 408), ('gordon', 406), ('edgar', 406), ('howard', 405), ('census', 405), ('allen', 405), ('club', 404), ('detroit', 403), ('domestic', 401), ('construction', 400), ('association', 399), ('reading', 399), ('biography', 398), ('congress', 398), ('age', 397), ('lawrence', 395), ('community', 394), ('forest', 393), ('von', 392), ('sex', 392), ('love', 391), ('bridge', 390), ('description', 389), ('arkansas', 389), ('game', 389), ('declaration', 388), ('des', 388), ('development', 388), ('historic', 387), ('security', 387), ('town', 387), ('soviet', 386), ('may', 386), ('margaret', 386), ('federal', 385), ('11', 385), ('davis', 385), ('greek', 384), ('long', 383), 
('tribune', 383), ('big', 383), ('natural', 380), ('miller', 380), ('control', 379), ('news', 379), ('dakota', 378), ('1930', 373), ('stone', 373), ('hampshire', 372), ('europe', 370), ('newton', 370), ('anne', 370), ('globe', 370), ('paris', 369), ('animal', 368), ('navy', 367), ('fashion', 366), ('britain', 366), ('santa', 365), ('slaves', 364), ('diary', 363), ('alice', 363), ('film', 362), ('empire', 362), ('charlotte', 362), ('avenue', 362), ('dept', 362), ('policy', 362), ('relations', 360), ('constitution', 360), ('catholic', 360), ('negro', 359), ('columbus', 359), ('cat', 359), ('jean', 358), ('ireland', 358), ('il', 358), ('corps', 357), ('garden', 357), ('post', 357), ('photo', 357), ('oklahoma', 356), ('russia', 356), ('russell', 356), ('car', 355), ('physical', 354), ('warren', 354), ('mill', 353), ('analysis', 352), ('party', 352), ('medieval', 351), ('canada', 351), ('sea', 350), ('nebraska', 350), ('freedom', 349), ('idaho', 349), ('learning', 348), ('painting', 348), ('oil', 348), ('agriculture', 348), ('duluth', 347), ('augusta', 347), ('1960', 346), ('poster', 346), ('ship', 345), ('cotton', 344), ('papers', 342), ('conservation', 342), ('aerial', 342), ('video', 341), ('cold', 340), ('times', 340), ('directory', 340), ('archive', 340), ('railroads', 339), ('poe', 339), ('ann', 338), ('bay', 338), ('review', 337), ('houses', 336), ('ky', 335), ('transportation', 335), ('grand', 334), ('strike', 334), ('vermont', 333), ('jim', 333), ('games', 330), ('theodore', 330), ('insignia', 330), ('twain', 330), ('photograph', 330), ('roman', 329), ('advertising', 328), ('train', 327), ('middle', 326), ('fishing', 324), ('trail', 324), ('dead', 324), ('revolutionary', 322), ('religious', 321), ('point', 321), ('story', 320), ('snow', 320), ('wood', 319), ('carter', 319), ('rush', 319), ('le', 319), ('falls', 319), ('italy', 318), ('prohibition', 318), ('fish', 317), ('orleans', 316), ('harbor', 315), ('bird', 314), ('carl', 313), ('uss', 313), ('artstor', 
313), ('walker', 313), ('rome', 312), ('propaganda', 311), ('resources', 311), ('egypt', 311), ('nj', 310), ('jews', 309), ('office', 309), ('anti', 308), ('teaching', 308), ('clothing', 307), ('coal', 306), ('reform', 306), ('canal', 306), ('berlin', 306), ('greece', 306), ('emily', 306), ('hawaii', 305), ('wells', 304), ('italian', 304), ('physics', 304), ('fitzgerald', 302), ('landscape', 302), ('boy', 302), ('division', 302), ('mormon', 300), ('atlas', 300), ('dp', 300), ('bell', 298), ('writing', 298), ('colored', 297), ('douglas', 297), ('nmnh', 296), ('baltimore', 295), ('holocaust', 295), ('female', 295), ('iron', 294), ('square', 294), ('foreign', 294), ('station', 293), ('renaissance', 293), ('cross', 293), ('data', 293), ('urban', 293), ('memorial', 293), ('hemingway', 292), ('da', 292), ('helen', 292), ('1963', 291), ('girls', 291), ('taylor', 291), ('program', 291), ('press', 290), ('sources', 290), ('crime', 289), ('robinson', 289), ('nuclear', 288), ('philip', 288), ('code', 288), ('moon', 287), ('oral', 287), ('marriage', 287), ('chemistry', 286), ('las', 286), ('board', 285), ('rhode', 285), ('x', 285), ('christmas', 284), ('marshall', 284), ('cod', 284), ('alaska', 283), ('mission', 283), ('ernest', 283), ('internment', 283), ('views', 282), ('use', 282), ('protest', 282), ('market', 282), ('brothers', 281), ('jazz', 281), ('murray', 281), ('anthony', 281), ('madison', 281), ('et', 280), ('potter', 280), ('internet', 280), ('dick', 279), ('studies', 279), ('personal', 279), ('dogs', 279), ('queen', 279), ('dress', 279), ('camps', 279), ('walt', 279), ('second', 278), ('1961', 278), ('mills', 278), ('court', 278), ('posters', 278), ('confederate', 277), ('blue', 277), ('republic', 276), ('machine', 276), ('austin', 276), ('1929', 276), ('obituaries', 275), ('del', 274), ('grant', 274), ('glass', 274), ('emblem', 274), ('savannah', 273), ('marketing', 273), ('manual', 272), ('brazil', 272), ('picture', 271), ('years', 271), ('programs', 271), 
('baptist', 271), ('richmond', 271), ('du', 270), ('basketball', 269), ('nelson', 269), ('pearl', 269), ('mount', 269), ('anderson', 269), ('student', 268), ('antonio', 268), ('nursing', 268), ('automobile', 267), ('ct', 267), ('star', 267), ('interior', 266), ('electric', 266), ('christopher', 265), ('cache', 265), ('murder', 264), ('jesus', 264), ('stories', 263), ('tree', 263), ('1920s', 263), ('der', 262), ('model', 262), ('august', 262), ('earth', 262), ('com', 262), ('text', 261), ('korean', 261), ('economic', 260), ('dallas', 260), ('usa', 260), ('alfred', 260), ('ellis', 259), ('communication', 259), ('temple', 258), ('league', 258), ('joe', 258), ('isaac', 258), ('eagle', 257), ('champaign', 257), ('year', 256), ('sarah', 256), ('sound', 256), ('housing', 256), ('witch', 256), ('ww2', 255), ('harriet', 255), ('brain', 255), ('training', 255), ('civilian', 255), ('cleveland', 255), ('force', 255), ('statistics', 255), ('introduction', 255), ('railway', 254), ('nature', 254), ('animals', 254), ('delaware', 254), ('massacre', 253), ('buffalo', 253), ('television', 251), ('peace', 251), ('programming', 251), ('line', 251), ('birds', 251), ('jacob', 250), ('card', 250), ('cities', 250), ('3', 250), ('dickens', 249), ('mathematics', 249), ('1775', 249), ('plant', 248), ('search', 248), ('ray', 247), ('golden', 247), ('dream', 246), ('boys', 246), ('assassination', 246), ('insurance', 245), ('rice', 245), ('harlem', 244), ('ethics', 244), ('hamilton', 244), ('class', 244), ('morgan', 244), ('cuba', 243), ('care', 243), ('ice', 243), ('1917', 243), ('1860', 243), ('bowl', 243), ('mine', 243), ('mines', 242), ('families', 242), ('monroe', 242), ('nh', 242), ('speech', 242), ('duke', 242), ('herbert', 242), ('rose', 242), ('ut', 241), ('geology', 241), ('horses', 241), ('vs', 241), ('douglass', 240), ('1910', 240), ('prince', 240), ('canyon', 240), ('catalog', 239), ('liberty', 239), ('good', 239), ('god', 239), ('reconstruction', 238), ('criticism', 237), ('naval', 
237), ('july', 237), ('test', 237), ('winter', 237), ('night', 237), ('cats', 237), ('movie', 236), ('womens', 236), ('dictionary', 236), ('correspondence', 236), ('wpa', 236), ('ruth', 235), ('morris', 235), ('plan', 235), ('primary', 234), ('self', 234), ('country', 234), ('albany', 234), ('cooper', 234), ('mrs', 234), ('lowell', 233), ('color', 233), ('manuscripts', 233), ('rico', 233), ('thompson', 233), ('finance', 233), ('1975', 233), ('1970', 233), ('http', 233), ('tea', 232), ('manhattan', 232), ('district', 232), ('sam', 232), ('disney', 231), ('pittsburgh', 231), ('plants', 231), ('puerto', 231), ('elementary', 231), ('korea', 231), ('cars', 230), ('era', 230), ('birth', 230), ('churches', 229), ('urbana', 228), ('drug', 228), ('lord', 227), ('patrick', 227), ('lloyd', 226), ('seattle', 226), ('planning', 226), ('houston', 226), ('heart', 226), ('paper', 225), ('energy', 225), ('plantation', 225), ('change', 224), ('systems', 224), ('susan', 224), ('audio', 223), ('fred', 223), ('eugene', 222), ('bob', 222), ('academy', 222), ('light', 222), ('colleges', 222), ('annual', 221), ('record', 220), ('committee', 220), ('haven', 220), ('moore', 219), ('sanborn', 219), ('disease', 219), ('bureau', 219), ('ships', 219), ('eacute', 219), ('bomb', 219), ('sugar', 218), ('racing', 218), ('motion', 218), ('sculpture', 218), ('botany', 218), ('cooking', 217), ('manuals', 217), ('flight', 217), ('cemetery', 216), ('commission', 216), ('making', 216), ('tn', 215), ('wars', 215), ('period', 215), ('kill', 215), ('ocean', 215), ('9', 215), ('chester', 215), ('cultural', 215), ('campbell', 215), ('karl', 214), ('three', 214), ('1968', 214), ('marie', 214), ('olympics', 214), ('dickinson', 213), ('methodist', 213), ('wall', 213), ('soldier', 213), ('money', 213), ('cook', 213), ('org', 213), ('montgomery', 213), ('store', 212), ('lost', 212), ('trials', 212), ('latin', 212), ('amendment', 211), ('holmes', 211), ('drawings', 211), ('costume', 211), ('bank', 211), ('funeral', 
210), ('anatomy', 210), ('survey', 210), ('encyclopedia', 210), ('1890', 209), ('segregation', 209), ('maria', 209), ('wayne', 209), ('baker', 209), ('beverly', 209), ('environmental', 208), ('ralph', 208), ('denver', 208), ('mountains', 208), ('postcards', 208), ('nude', 208), ('atlantic', 208), ('grammar', 208), ('dust', 207), ('mitchell', 207), ('1850', 207), ('1870', 206), ('mo', 206), ('gettysburg', 206), ('show', 206), ('di', 206), ('mental', 206), ('ben', 206), ('poland', 205), ('flowers', 205), ('atomic', 205), ('einstein', 205), ('harold', 205), ('web', 205), ('therapy', 204), ('register', 204), ('september', 204), ('1964', 204), ('truman', 204), ('greenville', 204), ('series', 204), ('graham', 204), ('1980', 204), ('crisis', 204), ('property', 204), ('en', 203), ('tx', 203), ('juan', 203), ('gun', 203), ('evolution', 203), ('www', 202), ('northern', 202), ('1912', 202), ('4', 201), ('newport', 201), ('joyce', 201), ('printing', 201), ('summer', 201), ('prison', 201), ('tobacco', 201), ('7', 200), ('simon', 200), ('reno', 199), ('clinton', 199), ('ward', 199), ('services', 199), ...]
import pickle
import nltk
# Load the Virginia pickle through a context manager so the file handle is
# closed as soon as the data has been read.
# NOTE: pickle can execute arbitrary code on load -- trusted files only.
with open("/media/storage/dpla-data/pickles/virginia.p", "rb") as _fh:
    vap = pickle.load(_fh)
# Count how often each item of vap['virginia']['filtered'] occurs, lower-casing
# every item first so that differently-cased spellings are tallied together.
# (Adapted from: nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha()))
vafd = nltk.FreqDist()
vafd.update(term.lower() for term in vap['virginia']['filtered'])
# Bare expression: lists every (token, count) pair, most frequent first.
vafd.most_common()
[('virginia', 166709), ('university', 107608), ('library', 64289), ('charlottesville', 54329), ('lib', 40564), ('image', 35858), ('va', 35396), ('holsinger', 31644), ('studio', 31459), ('collection', 30225), ('states', 29421), ('uva', 29228), ('collections', 26683), ('special', 23382), ('negatives', 22543), ('visual', 22459), ('history', 22162), ('ca', 21622), ('material', 21482), ('use', 17953), ('united', 17390), ('information', 15949), ('go', 15910), ('please', 15897), ('search', 15895), ('edu', 15894), ('terms', 15894), ('http', 15894), ('regions', 12101), ('w', 12006), ('name', 11945), ('davis', 11606), ('photographs', 11564), ('jackson', 11421), ('1825', 11044), ('present', 10993), ('online', 10977), ('1890', 10538), ('1938', 10459), ('photographic', 10386), ('index', 10378), ('plus', 10377), ('volume', 10376), ('1930', 9763), ('1866', 9685), ('rufus', 9632), ('glass', 9520), ('african', 9456), ('portraits', 9377), ('visitors', 8915), ('may', 8908), ('must', 8871), ('without', 8871), ('reproduced', 8864), ('rector', 8864), ('permission', 8863), ('additional', 8863), ('credited', 8863), ('plate', 8485), ('white', 7954), ('good', 7663), ('black', 7652), ('condition', 7646), ('works', 6794), ('group', 6386), ('people', 6351), ('built', 6346), ('single', 6266), ('american', 6206), ('1882', 5640), ('1947', 5617), ('5x7', 5607), ('photography', 5528), ('restrictions', 5518), ('accessing', 5518), ('county', 5107), ('music', 5090), ('south', 4872), ('function', 4839), ('8x10', 4259), ('u', 4232), ('1915', 3833), ('text', 3699), ('ethnic', 3422), ('school', 3413), ('americans', 3335), ('record', 3332), ('digital', 3319), ('dpla', 3317), ('print', 2968), ('piano', 2898), ('towns', 2790), ('cities', 2790), ('schools', 2741), ('c', 2713), ('notated', 2549), ('contact', 2338), ('n', 2143), ('children', 2103), ('unknown', 2090), ('new', 2067), ('conditions', 2014), ('english', 1922), ('buildings', 1877), ('1917', 1818), ('date', 1794), ('03', 1768), ('york', 1738), 
('untitled', 1689), ('07', 1628), ('09', 1616), ('11', 1609), ('prints', 1600), ('04', 1572), ('songs', 1549), ('j', 1526), ('12', 1508), ('05', 1497), ('1916', 1486), ('01', 1475), ('1918', 1474), ('10', 1454), ('08', 1441), ('1914', 1438), ('mrs', 1426), ('06', 1386), ('popular', 1335), ('architecture', 1329), ('miss', 1269), ('elements', 1259), ('h', 1238), ('02', 1193), ('co', 1169), ('e', 1161), ('st', 1160), ('students', 1129), ('richard', 1110), ('spaces', 1107), ('1912', 1077), ('gender', 1040), ('education', 1036), ('emulsion', 1032), ('john', 1009), ('content', 1006), ('linguistic', 1004), ('rotunda', 965), ('anderson', 954), ('1940', 942), ('f', 929), ('ralph', 910), ('training', 906), ('sciences', 890), ('social', 880), ('film', 877), ('unidentified', 864), ('b', 860), ('boston', 858), ('g', 853), ('carolina', 845), ('d', 837), ('1913', 824), ('m', 823), ('l', 817), ('trees', 816), ('overall', 800), ('william', 789), ('hall', 788), ('institutional', 778), ('africa', 768), ('babies', 764), ('philadelphia', 743), ('subject', 733), ('1919', 732), ('r', 728), ('open', 709), ('charles', 701), ('football', 697), ('college', 681), ('site', 672), ('men', 671), ('portrait', 670), ('army', 648), ('george', 642), ('houses', 640), ('colleges', 638), ('sports', 635), ('damage', 634), ('faculty', 632), ('industrial', 618), ('hand', 616), ('tex', 614), ('de', 612), ('war', 610), ('world', 608), ('house', 608), ('institute', 566), ('domestic', 561), ('18', 543), ('alderman', 538), ('players', 533), ('building', 531), ('soldiers', 525), ('landscape', 524), ('age', 519), ('uniforms', 519), ('shrubs', 516), ('mr', 506), ('rooms', 506), ('fraternity', 503), ('right', 499), ('furniture', 488), ('form', 485), ('land', 479), ('silver', 477), ('16', 476), ('1920', 476), ('photo', 474), ('gelatin', 470), ('interior', 467), ('thomas', 466), ('21', 465), ('left', 462), ('north', 461), ('washington', 453), ('colored', 452), ('instrumental', 452), ('hats', 451), ('p', 445), 
('lawn', 441), ('james', 437), ('type', 433), ('types', 430), ('east', 429), ('24', 420), ('henry', 417), ('la', 417), ('dr', 410), ('ga', 407), ('women', 401), ('26', 401), ('14', 397), ('engravings', 394), ('middle', 394), ('17', 390), ('along', 389), ('continents', 388), ('20', 386), ('high', 382), ('architectural', 378), ('15', 376), ('13', 376), ('summer', 376), ('1921', 374), ('texas', 373), ('elizabeth', 372), ('state', 371), ('27', 370), ('old', 370), ('vehicles', 367), ('events', 367), ('bottom', 366), ('west', 366), ('city', 365), ('cartographic', 365), ('y', 363), ('albemarle', 361), ('25', 356), ('22', 356), ('19', 355), ('family', 355), ('materials', 353), ('automobiles', 352), ('o', 351), ('context', 350), ('animals', 349), ('28', 347), ('1924', 345), ('corner', 344), ('side', 343), ('smith', 341), ('29', 340), ('robert', 340), ('30', 337), ('baltimore', 337), ('two', 334), ('11x14', 332), ('costume', 331), ('agricultural', 330), ('ala', 326), ('home', 325), ('horses', 324), ('landforms', 322), ('church', 321), ('voice', 319), ('countries', 318), ('gardens', 318), ('occupation', 315), ('jr', 313), ('french', 312), ('union', 310), ('23', 309), ('academy', 307), ('teachers', 307), ('1974', 307), ('ark', 306), ('ditson', 298), ('evening', 297), ('1891', 297), ('vocal', 296), ('landscapes', 295), ('view', 292), ('company', 291), ('mississippi', 291), ('negro', 285), ('edward', 284), ('oliver', 280), ('components', 280), ('girls', 279), ('families', 278), ('gowns', 278), ('top', 275), ('map', 274), ('1925', 272), ('construction', 267), ('baseball', 267), ('saint', 266), ('visible', 266), ('views', 264), ('caroline', 263), ('universities', 263), ('tenn', 263), ('color', 263), ('schirmer', 263), ('activity', 258), ('sq', 258), ('missing', 257), ('chestnut', 257), ('general', 257), ('georgia', 257), ('structural', 255), ('joseph', 251), ('relationship', 250), ('4', 249), ('bettis', 249), ('water', 249), ('mary', 245), ('france', 244), ('taylor', 242), 
('edgefield', 241), ('trenton', 239), ('normal', 238), ('roads', 238), ('field', 237), ('1895', 236), ('class', 235), ('street', 233), ('penn', 232), ('arranged', 232), ('dwellings', 231), ('alabama', 231), ('walker', 230), ('preachers', 228), ('1972', 226), ('broadway', 226), ('story', 225), ('1977', 224), ('photogravures', 223), ('31', 221), ('lee', 221), ('railroads', 220), ('pageants', 219), ('hench', 218), ('laughlin', 217), ('cumberland', 217), ('atcheson', 217), ('1922', 216), ('1928', 215), ('operas', 210), ('brothers', 209), ('session', 208), ('clothes', 207), ('costumes', 207), ('1923', 206), ('arkansas', 203), ('room', 203), ('louis', 202), ('couples', 202), ('henrico', 201), ('jefferson', 200), ('frank', 199), ('specific', 198), ('waltzes', 197), ('farms', 196), ('europe', 196), ('districts', 196), ('snow', 196), ('voices', 195), ('flowers', 195), ('paul', 192), ('richmond', 190), ('equipment', 189), ('boxing', 188), ('farm', 187), ('near', 187), ('alexander', 187), ('railroad', 187), ('arthur', 187), ('1970', 186), ('fla', 186), ('finals', 185), ('parish', 184), ('paris', 183), ('choruses', 182), ('pavilion', 181), ('sir', 179), ('edwin', 179), ('hill', 179), ('pa', 179), ('president', 179), ('ruth', 177), ('monticello', 175), ('fields', 175), ('mixed', 174), ('14x17', 173), ('delta', 171), ('warner', 171), ('upper', 170), ('track', 170), ('tennessee', 169), ('team', 169), ('horse', 169), ('cabell', 168), ('1966', 167), ('porches', 166), ('1969', 166), ('1896', 166), ('1889', 165), ('one', 165), ('king', 161), ('hampton', 161), ('exhibits', 160), ('party', 160), ('club', 160), ('island', 159), ('helena', 159), ('central', 159), ('mass', 158), ('cultural', 158), ('worn', 158), ('phi', 157), ('hospital', 156), ('1858', 155), ('mammals', 154), ('35', 154), ('sacred', 154), ('composition', 152), ('physical', 152), ('aerial', 152), ('gloucester', 151), ('sweet', 151), ('railway', 151), ('center', 150), ('chicago', 150), ('boys', 150), ('1885', 150), 
('1968', 150), ('stores', 150), ('engineering', 149), ('1819', 148), ('ohio', 148), ('plants', 148), ('medical', 146), ('love', 146), ('conference', 145), ('1870', 144), ('sigma', 144), ('business', 144), ('secular', 142), ('dance', 142), ('day', 141), ('personal', 141), ('man', 141), ('parts', 141), ('1911', 140), ('main', 140), ('1937', 140), ('chesterfield', 139), ('1875', 139), ('1976', 139), ('1929', 139), ('louisiana', 138), ('1830', 138), ('weddings', 138), ('exhibitions', 136), ('florida', 136), ('brown', 134), ('public', 134), ('scott', 134), ('techniques', 133), ('range', 133), ('mountains', 133), ('pageant', 133), ('processes', 133), ('head', 132), ('1900', 132), ('1863', 131), ('1872', 131), ('scenes', 131), ('saalfield', 131), ('southern', 130), ('rural', 130), ('beaufort', 130), ('bodies', 129), ('albert', 129), ('deteriorating', 129), ('1971', 129), ('cincinnati', 128), ('basketball', 128), ('allen', 128), ('activities', 128), ('point', 127), ('africans', 127), ('helen', 127), ('1905', 126), ('accessories', 126), ('religion', 125), ('md', 125), ('vocational', 124), ('1813', 124), ('london', 124), ('retail', 124), ('1975', 124), ('1950', 124), ('brides', 124), ('slight', 123), ('graded', 123), ('1927', 123), ('score', 122), ('1850', 121), ('england', 121), ('medium', 121), ('1935', 121), ('green', 120), ('wood', 120), ('willig', 120), ('superintendent', 120), ('1926', 119), ('concepts', 118), ('dormitory', 118), ('grounds', 118), ('1839', 118), ('low', 118), ('dogs', 117), ('student', 116), ('memorial', 116), ('entirely', 116), ('francis', 116), ('mountain', 116), ('1809', 116), ('clark', 115), ('clergy', 114), ('martin', 114), ('none', 114), ('kenya', 114), ('1856', 113), ('1876', 113), ('gordon', 113), ('randolph', 113), ('1892', 113), ('1964', 113), ('webb', 112), ('edgar', 112), ('2', 112), ('arr', 112), ('1859', 111), ('1848', 111), ('wilson', 111), ('broken', 111), ('orleans', 110), ('daughters', 110), ('1910', 110), ('duets', 109), ('du', 108), 
('chairs', 108), ('1826', 108), ('fourth', 108), ('statues', 108), ('1901', 108), ('harry', 107), ('1967', 107), ('camp', 106), ('k', 105), ('exhibit', 105), ('walter', 105), ('chi', 105), ('statue', 105), ('nottoway', 104), ('fences', 104), ('coats', 103), ('kappa', 103), ('road', 103), ('briar', 103), ('1963', 103), ('harrison', 103), ('harris', 102), ('1909', 102), ('1874', 102), ('marion', 102), ('steps', 102), ('bowling', 102), ('edge', 102), ('franz', 101), ('1984', 100), ('v', 100), ('1936', 100), ('dormitories', 100), ('food', 99), ('walls', 99), ('1941', 99), ('1827', 99), ('1600', 99), ('1880', 99), ('1852', 98), ('veils', 98), ('natural', 98), ('1797', 97), ('residence', 97), ('1833', 97), ('1865', 97), ('chapel', 97), ('garden', 97), ('fur', 97), ('von', 96), ('station', 96), ('1828', 96), ('joining', 96), ('areas', 96), ('graduation', 95), ('additive', 95), ('1400', 95), ('1861', 94), ('garments', 94), ('samuel', 94), ('toys', 94), ('frederick', 94), ('office', 93), ('quartets', 92), ('1867', 92), ('geographic', 92), ('organ', 92), ('frogmore', 92), ('1939', 92), ('carter', 91), ('1980', 91), ('containers', 91), ('1877', 91), ('1871', 91), ('sea', 91), ('work', 91), ('national', 91), ('brick', 91), ('morris', 90), ('jones', 90), ('sonatas', 90), ('peters', 90), ('campbell', 90), ('academic', 90), ('chipped', 90), ('son', 89), ('dillard', 89), ('1869', 89), ('1986', 89), ('bridge', 89), ('marches', 89), ('books', 89), ('madison', 89), ('1965', 89), ('across', 89), ('beta', 88), ('1979', 88), ('showing', 88), ('makers', 88), ('nurses', 88), ('monroe', 88), ('park', 87), ('1860', 87), ('river', 87), ('david', 87), ('alfred', 87), ('277', 87), ('front', 87), ('mount', 87), ('okla', 86), ('liberia', 86), ('1987', 86), ('montgomery', 86), ('marshall', 85), ('board', 85), ('1899', 85), ('1945', 84), ('alumni', 84), ('ceremonies', 84), ('albumen', 84), ('meeting', 84), ('corinne', 84), ('1832', 84), ('des', 83), ('stingray', 83), ('1837', 83), ('cleveland', 
83), ('lewis', 82), ('1857', 82), ('churches', 82), ('red', 82), ('little', 82), ('ave', 82), ('carl', 82), ('sumter', 82), ('middlesex', 82), ('1883', 82), ('1847', 82), ('miller', 82), ('1888', 81), ('1907', 81), ('lawns', 81), ('obscuring', 81), ('yards', 81), ('jean', 81), ('laboratory', 81), ('1849', 81), ('consumer', 80), ('1862', 80), ('le', 80), ('et', 80), ('art', 80), ('court', 80), ('woman', 79), ('machinery', 79), ('hands', 79), ('groups', 79), ('1897', 79), ('oh', 78), ('bowles', 78), ('artifacts', 78), ('halifax', 78), ('1864', 77), ('williams', 77), ('ky', 77), ('department', 77), ('pond', 76), ('1973', 76), ('stations', 76), ('1829', 76), ('1886', 76), ('process', 76), ('1308', 76), ('1893', 76), ('rosenwald', 75), ('german', 75), ('military', 75), ('night', 75), ('minor', 75), ('tables', 74), ('anne', 74), ('player', 74), ('1810', 73), ('serpentine', 73), ('country', 73), ('small', 73), ('fair', 73), ('society', 73), ('frame', 72), ('1978', 72), ('peter', 72), ('dining', 72), ('1835', 72), ('watercraft', 72), ('prince', 72), ('conferences', 71), ('1855', 71), ('wm', 71), ('christian', 71), ('johnson', 71), ('lawrenceville', 71), ('1842', 71), ('1868', 71), ('oak', 71), ('game', 71), ('large', 71), ('stadiums', 71), ('van', 71), ('navy', 71), ('1846', 71), ('1840', 71), ('professor', 71), ('islands', 70), ('lyon', 70), ('rock', 70), ('forest', 70), ('life', 70), ('stephen', 69), ('1985', 69), ('location', 69), ('ii', 69), ('queen', 69), ('fruit', 69), ('russell', 69), ('berry', 69), ('shelby', 69), ('scene', 68), ('41', 68), ('drawn', 68), ('1884', 68), ('frederic', 68), ('1887', 68), ('warren', 68), ('clubs', 68), ('held', 68), ('ridge', 68), ('visit', 67), ('norfolk', 67), ('gillingham', 67), ('1908', 67), ('1903', 67), ('1873', 67), ('adams', 67), ('government', 67), ('1807', 67), ('apples', 66), ('rogers', 66), ('mostly', 66), ('accelerator', 66), ('1981', 66), ('poe', 66), ('parties', 66), ('1841', 66), ('grove', 66), ('brunswick', 65), 
('stone', 65), ('1949', 65), ('1982', 65), ('law', 65), ('1962', 65), ('july', 65), ('young', 65), ('athletes', 65), ('wives', 65), ('plant', 65), ('association', 65), ('1808', 65), ('windows', 65), ('printed', 65), ('blue', 65), ('michael', 64), ('crowds', 64), ('societies', 64), ('trip', 64), ('chesapeake', 64), ('1934', 64), ('1844', 64), ('carriages', 64), ('clarke', 64), ('1812', 64), ('c1866', 64), ('trio', 63), ('continuing', 63), ('oklahoma', 63), ('america', 63), ('theodore', 63), ('1818', 62), ('sons', 62), ('horsemanship', 62), ('published', 62), ('fayette', 62), ('shannon', 62), ('since', 62), ('maria', 62), ('cookery', 62), ('manassas', 62), ('canada', 62), ('1823', 62), ('log', 62), ('rivers', 62), ('drawing', 62), ('tuskegee', 62), ('schmidt', 61), ('settlements', 61), ('mark', 61), ('tape', 61), ('1831', 61), ('1836', 61), ('funeral', 61), ('bible', 61), ('boxer', 60), ('affairs', 60), ('charlotte', 60), ('451', 60), ('service', 60), ('complexes', 60), ('weber', 60), ('hancock', 60), ('valley', 60), ('1898', 60), ('canning', 60), ('science', 59), ('98', 59), ('c1889', 59), ('tyler', 59), ('1786', 59), ('1983', 59), ('underwood', 59), ('lower', 59), ('orchards', 59), ('moore', 59), ('1851', 58), ('benjamin', 58), ('shakespeare', 58), ('duke', 58), ('1988', 58), ('herbaceous', 58), ('birds', 58), ('1820', 58), ('trade', 58), ('sewing', 58), ('first', 58), ('roofs', 58), ('area', 58), ('alpha', 58), ('1853', 57), ('rouge', 57), ('woodberry', 57), ('biology', 57), ('classrooms', 57), ('teacher', 57), ('canned', 57), ('figures', 57), ('painting', 57), ('1834', 57), ('maryland', 57), ('cracked', 57), ('edwards', 57), ('howard', 57), ('crb', 56), ('1817', 56), ('picture', 56), ('travis', 56), ('evans', 56), ('1933', 56), ('1932', 56), ('half', 56), ('1894', 56), ('1843', 56), ('cross', 56), ('ash', 56), ('par', 56), ('earl', 55), ('epsilon', 55), ('1854', 55), ('baton', 55), ('fire', 55), ('theta', 55), ('guadalupe', 55), ('living', 55), ('robinson', 55), 
('slightly', 55), ('1806', 55), ('clay', 55), ('dean', 55), ('show', 55), ('approximately', 54), ('greene', 54), ('relief', 54), ('graduate', 54), ('1779', 54), ('swain', 54), ('mayo', 54), ('williamsburg', 54), ('transparencies', 54), ('c1882', 53), ('1953', 53), ('c1883', 53), ('x', 53), ('real', 53), ('violins', 53), ('come', 53), ('hereford', 53), ('great', 53), ('5', 53), ('environments', 53), ('extending', 53), ('lake', 53), ('years', 53), ('gymnasium', 53), ('cabins', 53), ('thornton', 52), ('1815', 52), ('louisville', 52), ('1811', 52), ('daniel', 52), ('c1887', 52), ...]
# Collections to report on; each one has a pickle file named "<collection>.p".
colls = [
    "artstor", "biodiv", "rumsey", "commonwealth", "georgia", "harvard",
    "ia", "getty", "kentucky", "minnesota", "missouri", "mwdl",
    "nara", "nocar", "smiths", "socar", "texas", "gpo", "illinois",
    "usc", "virginia", "nocoll",
]
import pickle

# Print the stored statistics of every collection, followed by the share of
# unique words in the raw ('uniq' / 'wc') and filtered ('funiq' / 'fwc') counts.
for c in colls:
    #p = pickle.load( open( "/media/storage/dpla-data/pickles/"+c+".p", "rb" ) )
    # Use a context manager so the file handle is closed after each load.
    with open("C:/Users/charper/dpla-temp/pickles/"+c+".p", "rb") as pickle_file:
        p = pickle.load(pickle_file)
    print("\nGathering Stats for " + c)
    # NOTE(review): this rebinds the module-level name `stats` on every
    # iteration; kept as in the original in case later cells rely on it.
    stats = p[c]['stats']
    print(stats)
    print("percent unique:")
    # The ratio is scaled by 100 so the number really is the percentage that
    # the trailing "%" announces (previously the bare ratio was printed).
    print(round(100 * stats['uniq'] / stats['wc'], 3), "%")
    print("filtered percent unique:")
    print(round(100 * stats['funiq'] / stats['fwc'], 3), "%")
    print("*********")
Gathering Stats for artstor {'fwc': 6518566, 'wc': 8466030, 'funiq': 60635, 'uniq': 60760} percent unique: 0.00718 % filtered percent unique: 0.0093 % ********* Gathering Stats for biodiv {'fwc': 7638579, 'wc': 8361216, 'funiq': 94631, 'uniq': 94755} percent unique: 0.01133 % filtered percent unique: 0.01239 % ********* Gathering Stats for rumsey {'fwc': 12562369, 'wc': 15404401, 'funiq': 47643, 'uniq': 47763} percent unique: 0.0031 % filtered percent unique: 0.00379 % ********* Gathering Stats for commonwealth
# Disabled experiments kept from an earlier run:
#type(fd)
#haps = fd.hapaxes()
# Display how many items vafd.hapaxes() returns. `vafd` is defined outside
# this section -- presumably an nltk FreqDist, whose hapaxes() lists the
# samples that occur exactly once (TODO confirm).
len(vafd.hapaxes())
31738
# Keep only the vafd entries whose key is longer than 10 characters,
# mapping each such key to its original value, then display the result.
longwords = {word: count for word, count in vafd.items() if len(word) > 10}
longwords
{'Accelerator': 65, 'Accessories': 120, 'Accomodates': 2, 'Accompanying': 1, 'Adjustments': 1, 'Administration': 2, 'Administrative': 13, 'Adminstration': 1, 'Aduertissement': 1, 'Advertisement': 2, 'Advertising': 5, 'Aerodynamics': 2, 'Aeronautical': 5, 'Aeronautics': 37, 'Agricultural': 303, 'Agriculture': 13, 'Alterations': 1, 'Amabassador': 1, 'Amazonenvitt': 1, 'Amphitheater': 2, 'Amphitheaters': 9, 'Amphitheatre': 43, 'Ampitheatre': 4, 'Amstelodami': 1, 'Anniversaries': 5, 'Anniversary': 22, 'Announcement': 1, 'Antiquities': 2, 'Apologetics': 1, 'Apparizione': 1, 'Appointment': 1, 'Archdeacons': 1, 'Architectural': 313, 'Architecture': 1012, 'Aristomenes': 1, 'Arrangements': 1, 'Assassination': 2, 'Association': 64, 'Associations': 7, 'Astronomical': 1, 'Auditoriums': 4, 'Aufforderung': 4, 'Automobiles': 351, 'Automoblies': 1, 'Badarzewska': 3, 'Balustrades': 5, 'Barboursville': 18, 'Barcarolles': 2, 'Barhamsville': 12, 'Bartholomew': 6, 'Battlements': 1, 'Baylorsville': 5, 'Beaujoyeulx': 1, 'Beckenstein': 3, 'Beuckenstein': 2, 'Bibliographical': 1, 'Bibliopolas': 1, 'Biscaccianti': 1, 'Bishopville': 2, 'Bjørnstjerne': 1, 'Blackboards': 6, 'Blacksmiths': 1, 'Blankenship': 2, 'Blennerhassett': 1, 'Blessington': 2, 'Bloomington': 1, 'Blumentritt': 1, 'Bodybuilders': 41, 'Bodybuilding': 4, 'Bonaventure': 2, 'Bonnycastle': 1, 'Bookbinders': 2, 'Bookbinding': 2, 'Bookshelves': 2, 'Brandeville': 1, 'Brandstetter': 3, 'Breckinridge': 2, 'Bridegrooms': 24, 'Bridgeforth': 1, 'Bridgewater': 1, 'Brockenbraugh': 1, 'Brownsville': 23, 'Brueschweiler': 1, 'Buentivolio': 1, 'Businessmen': 1, 'Butterflies': 1, 'COLLECTIONS': 1, 'Cadwallader': 1, 'Calcografia': 1, 'Calculating': 1, 'Cannonballs': 1, 'Canzonettas': 1, 'Cartersville': 39, 'Cartographic': 2, 'Castiglione': 4, 'Catholiques': 1, 'Cazouillement': 1, 'Celebration': 27, 'Celebrations': 3, 'Certificate': 2, 'Certificates': 2, 'Chamberlain': 9, 'Chamberlayne': 20, 'Chancellors': 1, 'Chandeliers': 2, 'Charakterstu': 1, 
'Charlemagne': 2, 'Charlestown': 1, 'Charlottesviile': 1, 'Charlottesville': 54329, 'Chartseller': 1, 'Chatterbrick': 1, 'Chesterfield': 139, 'Christening': 3, 'Christianity': 2, 'Christiansburg': 11, 'Christopher': 14, 'Churchyards': 1, 'Clarksville': 2, 'Classification': 1, 'Clatterbrick': 1, 'Clinchfield': 2, 'Clotheslines': 1, 'Cobblestone': 1, 'Collections': 26682, 'Combination': 1, 'Commencement': 24, 'Commencements': 14, 'Commentaires': 1, 'Commisioner': 1, 'Commissioned': 2, 'Commissioner': 1, 'Commissioners': 3, 'Commisssioners': 1, 'Comparative': 1, 'Competition': 1, 'Confederacy': 1, 'Confederate': 52, 'Conferences': 71, 'Confidentially': 1, 'Confirmation': 1, 'Confirmations': 1, 'Congregation': 2, 'Congressional': 2, 'Conservatories': 1, 'Considerations': 1, 'Consolation': 2, 'Consolidated': 14, 'Consolidating': 1, 'Constantine': 2, 'Constitution': 2, 'Constitutional': 1, 'Construction': 114, 'Continental': 8, 'Continuation': 3, 'Contreblasons': 1, 'Convenience': 1, 'Conversation': 1, 'Convocation': 3, 'Cooperation': 8, 'Cooperative': 20, 'Coopersmith': 1, 'Coordinated': 1, 'Cornachione': 1, 'Coronations': 2, 'Corporation': 11, 'Corporations': 1, 'Corporative': 1, 'Correctional': 1, 'Corrections': 1, 'Correspondence': 1, 'Corrugating': 5, 'Cosmographie': 1, 'Cosstaphney': 1, 'Courlaender': 1, 'Courthouses': 32, 'Cowperthwait': 1, 'Critchenberger': 2, 'Critenbarger': 2, 'Crutchfield': 2, 'Dardensburg': 1, 'Declaration': 36, 'Declaratory': 2, 'Decorations': 2, 'Dedications': 1, 'Demonstation': 1, 'Demonstration': 32, 'Demonstrations': 16, 'Demonstrator': 1, 'Demonstrators': 1, 'Departments': 1, 'Description': 15, 'Destruction': 1, 'Development': 12, 'Diepenbeeck': 1, 'Discoloration': 3, 'Dispensations': 2, 'Disposition': 1, 'Distinguished': 10, 'Distributed': 2, 'Divertissement': 1, 'Documentary': 3, 'Dodelinette': 1, 'Dormitories': 100, 'Dressmakers': 2, 'Dressmaking': 1, 'Dunfermline': 1, 'Educational': 15, 'Electricity': 2, 'Elefterious': 1, 
'Emancipation': 2, 'Embroidering': 1, 'Encarnacion': 1, 'Enchantment': 1, 'Encyclopaedia': 2, 'Engineering': 144, 'Entablatures': 1, 'Entertainers': 2, 'Entertainment': 46, 'Environments': 44, 'Equestrians': 3, 'Ergenbright': 2, 'Establishment': 3, 'Estudiantina': 1, 'Eternamente': 1, 'Euangelistes': 1, 'Eucharistie': 1, 'Evangelical': 2, 'Exhibitions': 135, 'Experimental': 25, 'Fairgrounds': 3, 'Fantasiestu': 3, 'Faschingsschwank': 1, 'Fayerweather': 28, 'Fayetteville': 19, 'Fenchtenberger': 1, 'Fieuberlake': 1, 'Fiskerjenten': 1, 'Fitzpatrick': 4, 'Fleetstreet': 8, 'Folkelivsbilleder': 1, 'Fontainebleau': 2, 'Footbridges': 4, 'Foraarstoner': 1, 'Fortification': 6, 'Fortifications': 1, 'Foundations': 8, 'Fountainebleau': 1, 'Fraternities': 39, 'Fredericksburg': 23, 'Freemasonry': 8, 'Freiligrath': 1, 'Friedenwald': 1, 'Furnishings': 18, 'Gainesville': 1, 'Gainsborough': 1, 'Gallicanism': 1, 'Gendarmerie': 1, 'Generations': 2, 'Geographical': 2, 'Giambattista': 1, 'Gildersleeve': 16, 'Gillenwater': 3, 'Goldschmetterlinge': 1, 'Goldschmidt': 1, 'Gordonsville': 1, 'Granddaughter': 1, 'Grandfather': 3, 'Grandfathers': 2, 'Grandmother': 4, 'Grandmothers': 1, 'Grandparents': 1, 'Grandsaigne': 1, 'Grandstands': 14, 'Graniteville': 1, 'Greeleyville': 6, 'Greenhouses': 2, 'Greensville': 3, 'Grigorʹevich': 1, 'Grillparzer': 3, 'Groundbreakings': 4, 'Hairdressing': 4, 'Hallettsville': 4, 'Hammarstrand': 2, 'Hanfstaengl': 1, 'Harpsichord': 26, 'Harrisonburg': 8, 'Hattiesburg': 11, 'Headquarters': 3, 'Hecatongraphie': 1, 'Hendrickson': 2, 'Heppenheimer': 2, 'Herrodsburg': 1, 'Herzallerliebsten': 2, 'Hetherington': 2, 'Hippocrates': 2, 'Hirondelles': 1, 'Historiarum': 1, 'Hodgkinsonne': 1, 'Hollingsworth': 2, 'Homesickness': 1, 'Honeysuckle': 1, 'Horsemanship': 62, 'Hortenstein': 1, 'Hortensteine': 1, 'Horticultural': 3, 'Horticulture': 7, 'Humoresques': 2, 'Hydrographer': 1, 'Hydrographical': 2, 'Identifying': 18, 'Illustrated': 2, 'Illustrations': 2, 'Improvement': 11, 
'Improvisations': 1, 'Inaguration': 1, 'Inauguration': 17, 'Incorporated': 2, 'Independance': 1, 'Independence': 32, 'Independent': 6, 'Indianapolis': 4, 'Industrialization': 1, 'Infirmaries': 3, 'Information': 44, 'Infrastructural': 9, 'Infrastructure': 1, 'Inheritance': 1, 'Inscriptions': 1, 'Institution': 1, 'Institutional': 778, 'Institutions': 1, 'Instruction': 1, 'Instrumental': 1, 'Insurrection': 1, 'Integration': 1, 'Intelligence': 1, 'Interallied': 1, 'Intermediate': 1, 'International': 2, 'Internationally': 2, 'Interscholastic': 1, 'Intersection': 1, 'Intracoastal': 1, 'Introduktion': 1, 'Investigation': 1, 'Jacksonville': 17, 'JeanesTeachers': 1, 'Jeffersonville': 1, 'Katzenstein': 1, 'Kiinderscenen': 1, 'Kindergarten': 4, 'Kinderscenen': 1, 'Kirkpatrick': 2, 'Knickerbockers': 1, 'Kompositionen': 1, 'Laboratories': 13, 'Lamentations': 1, 'Landscaping': 1, 'Lawrenceville': 71, 'Legislative': 2, 'Legislators': 1, 'Legislature': 2, 'Lescribleur': 1, 'Letterheads': 2, 'Liebestrank': 1, 'Liebestraum': 1, 'Liebeswonne': 1, 'Lindencrone': 1, 'Lindenmeyer': 1, 'Lindpaintner': 3, 'Litchtenberg': 1, 'Lithographed': 3, 'Lithographic': 2, 'Lithographing': 2, 'Lithographs': 22, 'Lithography': 3, 'Livingstone': 4, 'Locomotives': 21, 'Mademoiselle': 1, 'Madisonville': 2, 'Mandolinata': 1, 'Mantelpiece': 2, 'Manufactures': 1, 'Manufacturing': 3, 'Manuscripts': 1, 'Marguerites': 1, 'Marigliotta': 2, 'Marketplaces': 10, 'Marlborough': 2, 'Marseillaise': 1, 'Masquerades': 2, 'Massachusetts': 12, 'Mathematics': 1, 'Maximillian': 7, 'McMinnville': 2, 'Mecklenburg': 1, 'Mediterranei': 2, 'Medringhaus': 2, 'Meistersinger': 2, 'Membranophones': 3, 'Mendelssohn': 29, 'Menosprecio': 1, 'Meriwtether': 1, 'Metalworking': 2, 'Middelburgum': 1, 'Middleditch': 2, 'Miscellaneous': 3, 'Missionaries': 3, 'Mississippi': 291, 'Mollenhauer': 2, 'Mondenschein': 1, 'Monseigneur': 2, 'Montecastle': 2, 'Montmorency': 1, 'Morningside': 6, 'Morrissette': 1, 'Mountaineer': 1, 'Multiplication': 1, 
'Musikalische': 1, 'Napolitaine': 1, 'Naturalization': 1, 'Nebuchadnezzar': 1, 'Neighborhood': 2, 'Netherlands': 13, 'Newfoundland': 1, 'Newspictures': 3, 'Nightengale': 1, 'Nightingale': 3, 'Nonprescription': 1, 'Nonprojected': 3, 'Northampton': 24, 'Northumberland': 1, 'Northwestern': 1, 'Nouuellement': 3, 'Nullification': 1, 'Observatories': 10, 'Observatory': 17, 'Occidentales': 1, 'Occidentalioribus': 1, 'Offertories': 5, 'Opernthemas': 1, 'Oppugnation': 1, 'Organisations': 1, 'Organizations': 4, 'Orientation': 9, 'Orthography': 4, 'Outbuilding': 1, 'Outbuildings': 19, 'Outerbridge': 9, 'Outstanding': 1, 'Overexposed': 2, 'Oxfordshire': 21, 'Pagenstecker': 1, 'Pantagrueline': 2, 'Paraphrases': 1, 'Participation': 1, 'Pedestrians': 3, 'Penitential': 2, 'Pennsylvania': 35, 'Pennyslvania': 1, 'Pensilvania': 1, 'Performances': 12, 'Periodicals': 13, 'Peterborough': 1, 'Philadelphia': 743, 'Phildelphia': 1, 'Phillibrown': 16, 'Photocopies': 5, 'Photographed': 1, 'Photographers': 40, 'Photographic': 10, 'Photographs': 5524, 'Photography': 5521, 'Photogravure': 1, 'Photogravures': 223, 'Piccolomini': 1, 'Plaetsnyder': 1, 'Plantations': 4, 'Playgrounds': 17, 'Polytechnical': 2, 'Poniatowski': 1, 'Poplarville': 8, 'Posselwhite': 3, 'Preliminary': 3, 'Presbyteriain': 1, 'Presbyterian': 3, 'Presbyterians': 1, 'Presentation': 3, 'Presentazione': 1, 'Presidential': 16, 'Printseller': 9, 'Printsellers': 1, 'Privateering': 1, 'Proceedings': 1, 'Processions': 28, 'Prohibition': 4, 'Propagation': 1, 'Proprietors': 1, 'Protectionism': 1, 'Protestantism': 1, 'Protestation': 1, 'Psychotropic': 1, 'Publications': 5, 'Punchinello': 1, 'Punctuation': 1, 'Quesenberry': 1, 'RailroadCompany': 1, 'Rappahannock': 7, 'Ravenscroft': 1, 'Recollections': 1, 'Reconciliation': 1, 'Reconstruction': 3, 'Recreational': 10, 'Reflections': 1, 'Reformation': 1, 'Reformatories': 4, 'Reformatory': 8, 'Refreshments': 1, 'Refridgerators': 1, 'Refrigerator': 1, 'Refrigerators': 3, 'Registration': 19, 
'Regulations': 1, 'Rehabilitation': 2, 'Remembrance': 1, 'Reminiscences': 2, 'Remonstrance': 4, 'Rentiesville': 2, 'Representations': 3, 'Residential': 1, 'Restaurants': 14, 'Restoration': 5, 'Resurrection': 2, 'Revolutionary': 2, 'Rheinberger': 3, 'Ritournelle': 1, 'Rittenhouse': 3, 'Riverfronts': 2, 'Rockefeller': 2, 'Rondolettos': 1, 'Rorzwaukaski': 2, 'Rosenbecker': 1, 'Rudersdorff': 2, 'Saltonstall': 2, 'Sandersville': 24, 'Satterfield': 1, 'Satterthwaite': 3, 'Saunnaituis': 2, 'Scarborough': 1, 'Scharfenberg': 13, 'Schleiffarth': 1, 'Schlepegrell': 1, 'Schlesinger': 1, 'Schnatterly': 1, 'Schoolbuildings': 1, 'Schoolhouse': 15, 'Schottisches': 17, 'Schottishce': 1, 'Schwalbenbotschaft': 1, 'Schwanengesang': 10, 'Searchlights': 1, 'Sebastianum': 1, 'Segregation': 18, 'Seneviratne': 1, 'Sesquicentennial': 7, 'Settlements': 45, 'Seventeenth': 4, 'Shackelford': 12, 'Shackleford': 2, 'Shaftesbury': 2, 'Shakespeare': 58, 'Shakespearean': 2, 'Shepherdstown': 3, 'Sherrington': 2, 'Shipbuilding': 1, 'Shoalwalter': 1, 'Silhouettes': 11, 'Smokestacks': 1, 'Somersville': 2, 'Sommernachtstraum': 2, 'Southampton': 10, 'Southwestern': 3, 'Spielmannslieder': 1, 'Spotslyvania': 1, 'Spotsylvania': 9, 'Spottsylvania': 1, 'Springfield': 6, 'Stadtkirche': 1, 'Stalactites': 3, 'Stallknecht': 2, 'Stepmothers': 1, 'Stereoscope': 1, 'Stereoscopic': 2, 'Stockbridge': 2, 'Storefronts': 3, 'Storytellers': 1, 'Streetscapes': 3, 'Stringfellow': 8, 'Stringfield': 1, 'Superintendent': 120, 'Superintendents': 12, 'Supervising': 2, 'Supervisors': 1, 'Supterintendent': 1, 'Surrounding': 2, 'Switzerland': 13, 'Tablecloths': 2, 'Tallahassee': 13, 'Tanzmomente': 1, 'Tanzskizzen': 1, 'Tappahannock': 1, 'Tarantellas': 3, 'Tchaikovsky': 2, 'Terraqueous': 1, 'Territorial': 2, 'Teschemacher': 1, 'Testimonial': 1, 'Thanksgiving': 1, 'Theological': 21, 'Therapentics': 1, 'Therapeutics': 2, 'Theyendanagea': 1, 'Thomasville': 1, 'Thunderbolt': 5, 'Tillinghast': 1, 'Timmonsville': 1, 'Topographic': 1, 
'Topographical': 2, 'Transfusion': 1, 'Transilvania': 1, 'Translating': 1, 'Transparencies': 23, 'Transportation': 35, 'Transvestism': 28, 'Triosonatas': 1, 'Typographos': 1, 'Ueberwasser': 1, 'Unaccompanied': 10, 'Undentified': 1, 'Undergraduate': 8, 'Undergraduates': 8, 'Underground': 12, 'Unidentfied': 1, 'Unidentifed': 9, 'Unidentified': 718, 'Universitat': 1, 'Universities': 262, 'Universitut': 1, 'Universtity': 1, 'Unsatisfactory': 2, 'Unterhaltung': 1, 'Variationen': 1, 'Velimirovic': 2, 'Versification': 3, 'Villefranche': 1, 'Violoncello': 2, 'VirginiaFraternity': 1, 'Waccowochie': 1, 'Wagenknight': 3, 'Wallerstein': 3, 'Washingtons': 1, 'Watercolors': 3, 'Waterfronts': 3, 'Watermelons': 30, 'Weatherford': 2, 'Weissenborn': 1, 'Wertenbacker': 1, 'Wertenbaker': 12, 'Westerville': 1, 'Westmoreland': 4, 'Wheelbarrows': 5, 'Wheelchairs': 1, 'Wheelwright': 1, 'Wheelwrights': 1, 'Whitechurch': 1, 'Wiggleworth': 2, 'Williamsburg': 54, 'Williamsport': 2, 'Winsborough': 1, 'Winterhalter': 1, 'Wollenhaupt': 8, 'Woodworking': 13, 'Worthington': 4, 'Wrigglesworth': 2, 'accelerator': 1, 'accessories': 6, 'accommodating': 2, 'accommodations': 8, 'administration': 27, 'administrators': 2, 'adolescence': 1, 'advertisement': 2, 'advertising': 2, 'agricultural': 27, 'agriculture': 5, 'anniversary': 7, 'appassionata': 1, 'application': 1, 'appreciation': 1, 'approximately': 54, 'archeuesque': 2, 'architectural': 65, 'architecture': 317, 'artistiques': 1, 'association': 1, 'associations': 3, 'authorities': 2, 'automobiles': 1, 'automoblile': 1, 'battlefield': 1, 'beautifying': 3, 'benediction': 1, 'bottomlands': 1, 'caertmaecker': 1, 'capriccioso': 3, 'cartographic': 363, 'celebrating': 1, 'celebration': 1, 'celebrations': 9, 'chrestienne': 1, 'clarissimum': 1, 'combination': 1, 'communications': 1, 'composition': 152, 'comprehension': 1, 'confirmirte': 1, 'congregation': 1, 'connections': 6, 'consequences': 1, 'conservatory': 1, 'consolation': 1, 'consolidated': 3, 
'consolidation': 1, 'constructed': 1, 'construction': 153, 'convocation': 2, 'cornerstone': 1, 'cultivation': 1, 'damoyselles': 1, 'declaration': 3, 'declination': 1, 'dedications': 1, 'deliberately': 1, 'demonstrates': 1, 'demonstration': 19, 'demonstrator': 1, 'demonstrators': 1, 'departments': 1, 'deportemens': 1, 'description': 4, 'destruction': 1, 'detereriorating': 5, 'deterioraing': 1, 'deteriorate': 1, 'deteriorating': 129, 'deterioration': 2, 'development': 21, 'developments': 1, 'dignitaries': 1, 'dilapidated': 1, 'disciplines': 10, 'discoloration': 22, 'discolorations': 1, 'discoveries': 4, 'disposition': 1, 'distinguished': 1, 'diversified': 7, 'educational': 1, 'eliminating': 1, 'engineering': 5, 'enseignemens': 1, 'enterprises': 3, 'entertainers': 4, 'entertainment': 2, 'environment': 1, 'environments': 9, 'eradicating': 1, 'exhibitions': 1, 'exploration': 3, 'expositions': 3, 'facinoribus': 1, 'furnishings': 2, 'generations': 1, 'geographischen': 1, 'governments': 1, 'granddaughter': 1, 'groundbreaking': 20, 'handwriting': 2, 'harpsichord': 3, 'headquarters': 1, 'hecatodistichon': 2, 'hlingsglaube': 1, 'horizontally': 2, 'hydrographica': 2, 'identifiable': 6, 'identifying': 1, 'illustrissime': 1, 'importables': 1, 'improvement': 9, 'improvements': 6, 'inauguration': 1, 'independant': 1, 'information': 15905, 'inhabitants': 1, 'inscription': 1, 'installation': 1, 'institution': 2, 'institutions': 2, 'instruction': 2, 'instructives': 1, 'instrumental': 451, 'instruments': 25, 'insurrections': 1, 'intellectual': 1, 'interdisciplinary': 2, 'interesting': 1, 'interjacentiumq': 1, 'interpretation': 1, 'interpreting': 1, 'introductio': 1, 'irregularly': 1, 'irresesitible': 1, 'irresistable': 1, 'lVniuersite': 1, 'laboratories': 1, 'laboratoryatue': 1, 'legislation': 2, 'legislature': 1, 'linstitution': 1, 'lithographer': 1, 'lithographs': 1, 'locomotives': 1, 'magnificence': 1, 'magnificque': 2, 'manufacture': 1, 'manufacturer': 1, 'manufacturing': 1, 
'mathematics': 3, 'membranophones': 1, 'mercimoniis': 1, 'merveilleux': 1, 'mousquetaire': 1, 'napolitaine': 1, 'naturalization': 1, 'neighborhood': 3, 'nightingale': 1, 'nouuellement': 1, 'nstlerleben': 1, 'observations': 1, 'obstruction': 5, 'opportunity': 1, 'organization': 2, 'orientalioribus': 1, 'orientation': 2, 'oscilloscope': 1, 'overagainst': 1, 'overlooking': 1, 'participate': 1, 'particularly': 1, 'performances': 1, 'periodicals': 1, 'photographic': 10376, 'photographs': 6040, 'photography': 7, 'picturesque': 1, 'plantations': 2, 'practically': 1, 'predominantly': 1, 'preparation': 2, 'presentation': 5, 'principales': 1, 'printmaking': 1, 'productions': 32, 'prognostication': 2, 'progressives': 1, 'promptuaire': 1, 'promulgation': 1, 'prononciation': 1, 'provinciarum': 3, 'publication': 1, 'quarterback': 5, 'recognizable': 1, 'reconstruction': 7, 'recreations': 1, 'refreshments': 1, 'regulations': 1, 'relationship': 250, 'remembering': 1, 'representatives': 2, 'reproduction': 1, 'reproductions': 48, 'resiouissance': 2, 'respectibus': 1, 'resplendent': 1, 'restrictions': 5518, 'resurrection': 1, 'reverendissime': 1, 'sacrificateur': 1, 'sanctissima': 1, 'satisfactory': 3, 'schoolbuildings': 29, 'schottische': 3, 'sentimentale': 1, 'septemtrionale': 1, 'septentrionale': 1, 'settlements': 16, 'shareholder': 1, 'shopkeepers': 1, 'siciliennes': 1, 'sioujssances': 2, 'slenderness': 1, 'spectabilis': 1, 'spirituelles': 1, 'stalagmites': 3, 'stockholders': 1, 'strengthened': 1, 'subdivision': 1, 'superintendence': 1, 'supervising': 2, 'supervisors': 1, 'supplementary': 1, 'surrounding': 5, 'sweethearts': 2, 'temperaments': 1, 'temptations': 1, 'territories': 3, 'thanksgiving': 1, 'theological': 1, 'topographical': 1, 'traditional': 1, 'transgressions': 2, 'transparencies': 31, 'transportation': 3, 'transported': 2, 'transporting': 4, 'treschrestien': 5, 'tresexcellente': 1, 'tresillustre': 1, 'trespuissant': 1, 'triangulation': 1, 'triumphante': 3, 
'ultracentrifuge': 2, 'unclassified': 1, 'undentified': 3, 'underground': 1, 'undernourished': 1, 'unidentifed': 4, 'unidentifiable': 1, 'unidentified': 146, 'universities': 1, 'unprotected': 1, 'unrecognizable': 1, 'ventilating': 1, 'vingthuictieme': 1, 'violoncello': 30, 'violoncellos': 1, 'waterfronts': 2, 'watermelons': 7, 'weatherboarded': 1, 'whitewashing': 1}
from nltk.collocations import *
# Score every bigram found in filtered['ia']['filtered'] with the raw_freq
# measure, then display just the bigrams (scores dropped) in sorted order.
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(filtered['ia']['filtered'])
scored = finder.score_ngrams(bigram_measures.raw_freq)
sorted([pair for pair, _freq in scored])
# This step just hangs forever. I wonder if it's possible to run it outside the notebook?
from nltk.collocations import *
# Build a bigram finder over the filteredgpo tokens and display the ten
# bigrams that rank highest under the PMI measure.
finder = BigramCollocationFinder.from_words(filteredgpo)
# trigram_measures is created here but not used by this cell.
trigram_measures = nltk.collocations.TrigramAssocMeasures()
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder.nbest(score_fn=bigram_measures.pmi, n=10)
import pickle
import nltk
# Every collection except artstor, whose distribution seeds the combined one.
colls = [
    "biodiv", "rumsey", "commonwealth", "georgia", "harvard",
    "ia", "getty", "kentucky", "minnesota", "missouri", "mwdl",
    "nara", "nocar", "smiths", "socar", "texas", "gpo", "illinois",
    "usc", "virginia", "nocoll",
]
#colls = ["biodiv"]

# fds maps collection name -> that collection's own distribution;
# fd accumulates all of them into a single combined distribution.
# The pickles are presumably nltk FreqDist (Counter-like) objects -- TODO confirm.
fds = {}
with open("/media/storage/dpla-data/pickles/new/artstor_fd.p", "rb") as fd_file:
    fds['artstor'] = pickle.load(fd_file)
# Merge into a copy: updating the very object stored under fds['artstor']
# would silently turn the artstor entry into the combined totals.
fd = fds['artstor'].copy()
print(len(fd))
for coll in colls:
    # Context manager so each pickle file is closed as soon as it is loaded.
    with open("/media/storage/dpla-data/pickles/new/"+coll+"_fd.p", "rb") as fd_file:
        tmp = pickle.load(fd_file)
    print("updating FD with " + coll)
    fds[coll] = tmp
    fd.update(tmp)
    # Number of distinct keys in the combined distribution so far.
    print(len(fd))
60168 updating FD with biodiv 131704 updating FD with rumsey 153719 updating FD with commonwealth 323186 updating FD with georgia 428524 updating FD with harvard 444294 updating FD with ia 853180 updating FD with getty 874037 updating FD with kentucky 879550 updating FD with minnesota 892810 updating FD with missouri 965647 updating FD with mwdl 1589542 updating FD with nara 2556336 updating FD with nocar 2689928 updating FD with smiths 2948721 updating FD with socar 2960280 updating FD with texas 3709827 updating FD with gpo 4056106 updating FD with illinois 4066429 updating FD with usc 4167642 updating FD with virginia 4193316 updating FD with nocoll 4193393
# Display every (key, count) pair of fd, highest count first.
fd.most_common()
[('library', 6117895), ('university', 4474393), ('digital', 4393278), ('county', 4101874), ('archives', 3912296), ('image', 3856146), ('http', 3622052), ('utah', 3508437), ('texas', 3088382), ('states', 3034554), ('united', 2980122), ('records', 2940508), ('collection', 2933041), ('edu', 2869869), ('text', 2743094), ('u', 2735080), ('1', 2651930), ('national', 2627569), ('use', 2369332), ('c', 2323676), ('state', 2222489), ('office', 2198024), ('libraries', 2112146), ('north', 2089362), ('carolina', 2021982), ('georgia', 2013630), ('ark', 1940101), ('california', 1810572), ('images', 1793602), ('unt', 1793154), ('history', 1785922), ('public', 1720702), ('department', 1687369), ('english', 1663863), ('southern', 1559202), ('death', 1533958), ('d', 1516996), ('copyright', 1463445), ('67531', 1454858), ('photographs', 1451551), ('texashistory', 1432924), ('information', 1398449), ('washington', 1308460), ('x', 1220153), ('smithsonian', 1199436), ('available', 1197530), ('research', 1195215), ('center', 1163841), ('government', 1150791), ('institution', 1147249), ('collections', 1140774), ('content', 1116803), ('newspapers', 1106143), ('west', 1094535), ('ga', 1093339), ('ut', 1090170), ('administration', 1068841), ('college', 1065094), ('los', 1059295), ('angeles', 1047333), ('photograph', 1045279), ('ca', 1025436), ('local', 1007516), ('census', 1003194), ('terms', 996437), ('service', 988342), ('portal', 967273), ('american', 961283), ('mountain', 960732), ('may', 940055), ('made', 934848), ('statistics', 914170), ('special', 886509), ('defense', 868309), ('disk', 866632), ('ed', 862066), ('p', 856817), ('vital', 853095), ('agency', 852556), ('2', 845176), ('5', 841400), ('archive', 839173), ('gordon', 826933), ('usc', 782346), ('museum', 775978), ('see', 773608), ('historical', 772560), ('dept', 764440), ('j', 761928), ('n', 757319), ('magnetic', 747803), ('title', 747283), ('tex', 734878), ('america', 722533), ('photographic', 721011), ('part', 719264), 
('including', 718627), ('1905', 715122), ('3', 713008), ('unrestricted', 706048), ('contact', 677717), ('w', 676065), ('md', 668947), ('study', 664102), ('park', 663796), ('nmnh', 649509), ('maps', 647804), ('b', 643911), ('cgi', 641422), ('10', 638652), ('south', 635438), ('secretary', 634762), ('1958', 631306), ('4', 625355), ('newspaper', 620590), ('please', 616244), ('map', 613643), ('01', 606953), ('include', 605423), ('1939', 605294), ('permission', 602815), ('new', 602509), ('anthropology', 597459), ('project', 587240), ('private', 587021), ('created', 583921), ('org', 582788), ('2008', 579928), ('8', 579199), ('lib', 571698), ('certificates', 567808), ('rights', 566483), ('certificate', 560875), ('health', 560430), ('contents', 559924), ('e', 557717), ('heritage', 550587), ('cm', 549198), ('atlanta', 547042), ('teaching', 544659), ('botany', 542307), ('7', 541722), ('full', 537197), ('general', 523510), ('sound', 516121), ('holding', 515365), ('video', 507502), ('11', 504369), ('society', 501581), ('12', 498668), ('calhoun', 494892), ('gpo', 492130), ('2007', 489677), ('17', 484438), ('recordings', 484166), ('mission', 482853), ('publicly', 481311), ('partners', 480262), ('thumbnail', 477850), ('itemurl', 477639), ('thumbnailurl', 477639), ('advertising', 475246), ('art', 473332), ('identifier', 473008), ('pictures', 472344), ('domain', 471126), ('military', 469395), ('l', 467158), ('h', 466419), ('assigned', 465535), ('specol', 465295), ('still', 463899), ('material', 462158), ('memorial', 461098), ('city', 455185), ('us', 444041), ('boston', 440872), ('war', 438395), ('includes', 433659), ('resource', 432396), ('number', 432306), ('213', 415973), ('news', 411263), ('arizona', 409384), ('internet', 408349), ('21', 406888), ('law', 406216), ('m', 401417), ('de', 400958), ('bureau', 392174), ('print', 385810), ('john', 385392), ('laws', 385150), ('22', 381806), ('www', 379402), ('white', 378232), ('business', 378114), ('90089', 377826), ('20', 376706), 
('13', 376208), ('massachusetts', 375639), ('files', 374845), ('daily', 372969), ('09', 372792), ('january', 371526), ('1921', 368779), ('item', 368361), ('wpa', 368349), ('03', 366716), ('century', 365267), ('doheny', 364968), ('drive', 364385), ('0189', 363865), ('household', 363696), ('commercial', 363215), ('international', 361430), ('02', 361132), ('division', 359732), ('kentucky', 358883), ('15', 357764), ('school', 356600), ('inc', 355061), ('printing', 353855), ('gov', 352039), ('16', 348978), ('hard', 348575), ('lake', 344570), ('6', 343146), ('documents', 339507), ('06', 335715), ('07', 335134), ('14', 334743), ('file', 334434), ('page', 332226), ('05', 331733), ('9', 329820), ('places', 328713), ('04', 328619), ('institute', 328372), ('1982', 325214), ('1994', 325061), ('f', 324959), ('air', 324400), ('oclc', 322550), ('purposes', 322339), ('vol', 321707), ('protection', 321439), ('getty', 320777), ('bin', 320732), ('works', 320621), ('hutzel', 315352), ('died', 315245), ('house', 314953), ('forces', 314721), ('g', 314565), ('r', 314401), ('visual', 312926), ('18', 311659), ('black', 310081), ('24', 309774), ('agreement', 308813), ('23', 307921), ('one', 307763), ('2006', 304967), ('company', 303990), ('retained', 303438), ('services', 302549), ('education', 301797), ('intellectual', 301731), ('photo', 301328), ('hill', 301178), ('30', 300117), ('35', 299784), ('cont', 297790), ('reserve', 297783), ('2014', 297528), ('cards', 295644), ('pages', 294533), ('st', 293662), ('exceptions', 293184), ('25', 292434), ('view', 290866), ('housing', 289670), ('finance', 287345), ('improve', 285861), ('combined', 283872), ('proof', 283816), ('access', 283798), ('san', 282709), ('district', 282093), ('development', 281951), ('08', 281868), ('fill', 280758), ('street', 280641), ('certification', 279874), ('1960', 279644), ('gaps', 278142), ('management', 277529), ('familysearch', 277402), ('keypath', 277397), ('indexesresults', 277397), ('runwhat', 277397), 
('idxfiles', 277397), ('v', 277391), ('william', 275976), ('2011', 275061), ('economics', 271972), ('geography', 271143), ('online', 268800), ('forest', 268767), ('2013', 267752), ('materials', 266942), ('communications', 266869), ('book', 266353), ('negative', 264987), ('19', 264908), ('salt', 264653), ('plants', 264340), ('28', 263042), ('dpla', 262957), ('color', 262938), ('along', 262712), ('environmental', 259654), ('jones', 259337), ('1940', 258904), ('obituaries', 258478), ('2000', 256307), ('mm', 254211), ('type', 254072), ('virginia', 253372), ('italy', 252068), ('index', 251253), ('athens', 250838), ('program', 249468), ('document', 249230), ('must', 248981), ('2003', 246499), ('minnesota', 246439), ('abilene', 245618), ('26', 244854), ('29', 243883), ('register', 242169), ('young', 241482), ('early', 239921), ('fax', 239667), ('educational', 239320), ('chapel', 238765), ('co', 238304), ('publications', 238114), ('system', 236973), ('periodicals', 235930), ('plantae', 235889), ('george', 234770), ('publication', 233713), ('association', 233414), ('2001', 232500), ('york', 232489), ('html', 230373), ('max', 230254), ('two', 229834), ('historic', 229645), ('phone', 228783), ('ocolc', 226412), ('world', 225206), ('board', 225179), ('david', 225159), ('zoology', 224905), ('0', 224735), ('digitized', 224072), ('27', 223035), ('leslie', 222591), ('prints', 222392), ('journalism', 221812), ('survey', 221666), ('events', 221545), ('required', 220720), ('published', 220199), ('pictorial', 219600), ('o', 219307), ('report', 218246), ('federal', 217026), ('officials', 216327), ('activities', 215985), ('site', 212735), ('lccn', 212473), ('people', 212459), ('2005', 212395), ('negatives', 212081), ('non', 211659), ('1900', 210990), ('life', 210606), ('1990', 210331), ('usa', 210289), ('multimedia', 209791), ('740', 209446), ('reserved', 209199), ('2010', 208823), ('microfilm', 208575), ('also', 207242), ('flowering', 206963), ('subject', 206775), ('1920', 206671), 
('contentdm', 206566), ('dallas', 206113), ('ferns', 204716), ('areas', 203520), ('821', 203110), ('relating', 202918), ('2343', 201969), ('2366', 201736), ('1950', 200446), ('atlas', 199019), ('medical', 198449), ('31', 197786), ('cultural', 197434), ('section', 197143), ('without', 196840), ('surveys', 196775), ('portraits', 196242), ('louis', 196128), ('nc', 195737), ('series', 195587), ('ill', 195430), ('church', 195243), ('jan', 194458), ('architecture', 193697), ('paper', 192058), ('james', 191054), ('portrait', 190915), ('net', 188361), ('physical', 187254), ('aug', 187237), ('first', 187147), ('river', 186862), ('code', 186740), ('agriculture', 186244), ('electronic', 186066), ('attorney', 185874), ('building', 184663), ('block', 184394), ('courtesy', 184024), ('commerce', 184016), ('1860', 182868), ('animalia', 182830), ('handle', 182051), ('resources', 181443), ('relation', 181137), ('protected', 180416), ('civil', 180409), ('design', 180214), ('women', 179205), ('dicotyledonae', 179177), ('nevada', 179151), ('interior', 179110), ('social', 178571), ('enumeration', 178186), ('willard', 176096), ('weekly', 175513), ('family', 175324), ('buildings', 175169), ('space', 175042), ('paul', 174826), ('water', 174802), ('box', 174733), ('2004', 174469), ('men', 174309), ('card', 173753), ('hdl', 173582), ('work', 172733), ('region', 172700), ('personal', 172563), ('accession', 171192), ('marriott', 171158), ('edward', 170423), ('photography', 169921), ('ethnology', 168295), ('86', 167746), ('help', 166729), ('1947', 166686), ('rumsey', 166593), ('provided', 166261), ('invertebrate', 165889), ('charles', 165838), ('10020', 165827), ('jun', 165621), ('descriptions', 165543), ('indians', 165270), ('regulations', 165133), ('oct', 164313), ('105', 163809), ('2002', 162990), ('fort', 162660), ('language', 162111), ('brigham', 161673), ('student', 161631), ('group', 161578), ('scholarship', 161302), ('missouri', 160794), ('repository', 160549), ('ink', 159649), 
('granted', 159447), ('employee', 159238), ('1945', 158703), ('go', 157813), ('right', 157809), ('photographer', 157783), ('press', 157356), ('1910', 156652), ('basel', 156514), ('written', 156014), ('governed', 155957), ('received', 155705), ('htm', 155475), ('pursuant', 154863), ('jul', 154733), ('volumes', 154694), ('clipping', 154343), ('left', 154335), ('unknown', 154220), ('thomas', 154133), ('field', 153549), ('2009', 153404), ('glass', 153129), ('sep', 153080), ('medieval', 152980), ('idaho', 152927), ('1930', 152870), ('risk', 152495), ('1911', 152192), ('arte', 151898), ('entomology', 151555), ('etc', 150509), ('2012', 150425), ('antonio', 150363), ('id', 149367), ('used', 149299), ('distribution', 149213), ('indian', 149011), ('public_domain_copyright_notice', 148603), ('pdf', 148440), ('dwellings', 148387), ('foto', 147761), ('minore', 147644), ('jpg', 146458), ('34', 146391), ('customs', 146179), ('freely', 146062), ('gallery', 145645), ('la', 145560), ('brothers', 144934), ('hosted', 144340), ('area', 143776), ('solely', 142953), ('front', 142895), ('students', 141998), ('papers', 141136), ('date', 140889), ('byu', 140345), ('displayed', 140175), ('mexico', 140084), ('cornell', 140057), ('committee', 140028), ('subsequent', 139334), ('german', 139105), ('1865', 138987), ('1980', 138962), ('property', 138813), ('biodiversity', 138775), ('bhl', 138362), ('four', 137459), ('118', 136952), ('natural', 136814), ('control', 136766), ('commonwealth', 136381), ('pencil', 136271), ('east', 136139), ('urban', 135941), ('henry', 135170), ('accordance', 134266), ('reproduction', 133619), ('landscape', 133579), ('well', 133358), ('el', 132876), ('june', 132790), ('children', 132419), ('views', 132410), ('1861', 132273), ('lee', 131946), ('pacific', 131651), ('unit', 131097), ('mass', 130764), ('creative', 130420), ('known', 130333), ('collected', 129987), ('address', 129929), ('extensive', 129875), ('mar', 129691), ('born', 129618), ('record', 129343), ('held', 
129204), ('operations', 128698), ('african', 128237), ('presented', 128205), ('searched', 127683), ('electronically', 127637), ('owning', 127478), ('kdl', 127243), ('kyvl', 127243), ('illinois', 126520), ('trust', 126373), ('apr', 125606), ('year', 125582), ('sheet', 125238), ('form', 125225), ('y', 125130), ('restrictions', 125105), ('brown', 125012), ('robert', 124580), ('1931', 124444), ('open', 124106), ('army', 124086), ('tewksbury', 123514), ('1961', 122806), ('inches', 122774), ('1944', 122669), ('1901', 122437), ('geological', 122048), ('americans', 121953), ('unc', 121856), ('silver', 121508), ('data', 121190), ('man', 121139), ('identification', 121112), ('greensboro', 121093), ('letter', 121034), ('original', 120867), ('congress', 120865), ('force', 120828), ('economic', 120746), ('web', 120539), ('33', 120428), ('1988', 120197), ('version', 120127), ('july', 120049), ('smith', 119910), ('wilson', 118842), ('jackson', 118782), ('postcards', 118525), ('feb', 118024), ('branch', 117825), ('yale', 117823), ('charleston', 117478), ('marine', 117433), ('march', 117428), ('japan', 117362), ('breckenridge', 117142), ('president', 116931), ('journal', 116872), ('austin', 116517), ('1897', 116517), ('20th', 116413), ('geographic', 116068), ('high', 115940), ('underwood', 115422), ('1929', 115352), ('construction', 115301), ('affairs', 114797), ('dc', 114710), ('cdm', 114165), ('forms', 114104), ('houston', 114065), ('1968', 113695), ('photographers', 113481), ('galveston', 113434), ('reproduced', 113272), ('current', 112959), ('day', 112493), ('000', 112018), ('mrs', 111773), ('studio', 111396), ('1977', 111183), ('savannah', 110938), ('commission', 110714), ('columbia', 110364), ('topographic', 110106), ('arts', 110082), ('annual', 110025), ('near', 109753), ('1913', 109285), ('headquarters', 108867), ('1898', 108827), ('1870', 108769), ('scale', 108352), ('1884', 107901), ('director', 107781), ('hall', 107747), ('arthropoda', 107737), ('dec', 107661), ('1974', 
107275), ('viewed', 106833), ('1970', 106778), ('impa', 106393), ('joseph', 105617), ('publishing', 105220), ('1960s', 105116), ('assignment', 105086), ('commons', 105012), ('campus', 104760), ('johnson', 104500), ('personnel', 104442), ('regents', 104304), ('gelatin', 104134), ('case', 103803), ('consortium', 103687), ('navy', 103680), ('cite', 103655), ('1880', 103470), ('paso', 103317), ('almshouse', 103020), ('montana', 102957), ('attribution', 102937), ('agricultural', 102773), ('senate', 102581), ('road', 102546), ('printed', 102276), ('french', 102025), ('cooper', 101934), ('1890', 101527), ('postcard', 101328), ('anderson', 101323), ('film', 101230), ('harold', 101015), ('central', 100930), ('18th', 100896), ('screen', 100874), ('sculpture', 100776), ('station', 100669), ('1967', 100337), ('wpacards', 100170), ('m38843', 99924), ('home', 99757), ('community', 99321), ('unidentified', 99280), ('1976', 99236), ('nov', 99147), ('objects', 99025), ('legislature', 99007), ('beyond', 98849), ('des', 98797), ('april', 98551), ('nature', 98401), ('bombus', 98357), ('funds', 98269), ('orange', 98133), ('clark', 97991), ('showing', 97933), ('aircraft', 97735), ('col', 97424), ('fair', 97124), ('classified', 97042), ('september', 97042), ('london', 96900), ('1914', 96875), ('october', 96659), ('given', 96657), ('three', 96618), ('administrative', 96053), ('monument', 95935), ('et', 95815), ('taylor', 95739), ('supported', 95486), ('old', 95416), ('paris', 95191), ('technology', 94990), ('uses', 94581), ('back', 94566), ('valley', 94549), ('photos', 94251), ('post', 94063), ('1909', 93737), ('years', 93596), ('saint', 93247), ('1912', 93169), ('province', 93153), ('administrator', 93103), ('mary', 93021), ('western', 92921), ('32', 92583), ('documentation', 92467), ('div', 92395), ('mr', 91921), ('obituary', 91676), ('via', 91507), ('transportation', 91502), ('monuments', 91381), ('science', 91042), ('town', 90978), ('africa', 90791), ('mission21', 90786), ('complex', 
90711), ('references', 90398), ('documenting', 89892), ('administered', 89517), ('industry', 89006), ('1904', 88889), ('1985', 88667), ('governor', 88513), ('shows', 88507), ('sa', 88390), ('1917', 88367), ('1938', 88319), ('1934', 88316), ('temple', 88292), ('plant', 88071), ('homeplace', 87990), ('music', 87664), ('lsta', 87615), ('island', 87472), ('com', 87179), ('february', 86945), ('1895', 86620), ('woman', 86612), ('uintah', 86384), ('corps', 86376), ('1936', 86243), ('requests', 86169), ('php', 86149), ('herald', 86109), ('architectural', 86035), ('staff', 86024), ('1949', 85957), ('china', 85920), ('renaissance', 85865), ('1886', 85680), ('november', 85606), ('1918', 85453), ('fire', 85333), ('dr', 85104), ('august', 84869), ('native', 84753), ('arthur', 84674), ('legal', 84631), ('medicine', 84585), ('country', 84526), ('1975', 84336), ('asia', 84283), ('1935', 84264), ('land', 84007), ('sc', 83995), ('schools', 83983), ('1907', 83978), ('1951', 83959), ('lowell', 83688), ('assistant', 83635), ('42', 83578), ('intake', 83383), ('1956', 83372), ('specimen', 83361), ('time', 83317), ('description', 83261), ('small', 83247), ('1989', 83077), ('result', 83032), ('notes', 82788), ('reno', 82539), ('hewitt', 82525), ('va', 82463), ('tichnor', 82428), ('photographed', 82332), ('user', 82259), ('1942', 82008), ('obtained', 81803), ('list', 81773), ('baroque', 81668), ('side', 81633), ('appalachian', 81544), ('ii', 81526), ('organized', 81510), ('1933', 81503), ('programs', 81467), ('licenses', 81248), ('shipler', 81143), ('ocean', 80955), ('restricted', 80953), ('1987', 80923), ('frank', 80794), ('hospital', 80752), ('bibliographical', 80726), ('sites', 80710), ('1941', 80699), ('creativecommons', 80600), ('k', 80502), ('confederate', 80171), ('1915', 80024), ('power', 79771), ('walter', 79710), ('december', 79684), ('1850', 79519), ('individuals', 79174), ('prior', 79050), ('1943', 79023), ('eight', 79006), ('red', 78747), ('based', 78642), ('1908', 78581), 
('gift', 78580), ('class', 78575), ('gothic', 78574), ('archaeology', 78568), ('richard', 78498), ('issued', 78486), ('session', 78459), ('members', 78346), ('romanesque', 78254), ('geologic', 78225), ('referred', 78141), ('nail', 78137), ('copy', 78007), ('examiner', 77946), ('1919', 77801), ('sciences', 77626), ('1972', 77583), ('1906', 77495), ('labor', 77399), ('large', 77087), ('gri', 77016), ('1962', 76933), ('politics', 76858), ('express', 76795), ('jr', 76613), ('security', 76405), ('scholar', 76358), ('base', 76119), ('standing', 76087), ('worth', 75775), ('opinions', 75772), ('islands', 75771), ('1903', 75761), ('union', 75761), ('nasa', 75756), ('1923', 75604), ('great', 75569), ('free', 75110), ('brenham', 75033), ('40', 74960), ('canyon', 74877), ('anthropological', 74734), ('1899', 74646), ('creek', 74628), ('romans', 74564), ('1928', 74549), ('otherwise', 74448), ('france', 74395), ('thorough', 74298), ('background', 74195), ('forests', 74118), ('harvard', 74020), ('1969', 73985), ('artstor', 73982), ('etruscans', 73875), ('1922', 73858), ('aerial', 73811), ('1916', 73791), ('support', 73740), ('1937', 73734), ('cat340573', 73646), ('place', 73577), ('1924', 73514), ('fulton', 73489), ('insects', 73347), ('1946', 73079), ('manuscript', 72832), ('uncg', 72805), ('training', 72779), ('mail', 72687), ('1963', 72645), ('thursday', 72533), ('second', 72314), ('1955', 72212), ('bastrop', 71902), ('avenue', 71864), ('1997', 71741), ('ariz', 71521), ('azlibrary', 71327), ('1965', 71258), ('application', 71055), ('azmemory', 70966), ('info', 70901), ('correspondence', 70645), ('1952', 70592), ('order', 70491), ('practices', 70254), ('1954', 70162), ('1953', 70149), ('palmer', 69983), ('uss', 69834), ('1883', 69703), ('format', 69689), ('many', 69679), ('harris', 69591), ...]
# How many hapaxes fd.hapaxes() reports.
# NOTE(review): assuming fd is an nltk.FreqDist, hapaxes() returns the tokens
# that occur exactly once -- confirm.
len(fd.hapaxes())
2773182
# Inspect the hapaxes themselves.
fd.hapaxes()
['contiiinecl', 'greise', 'genealogicalreco1888spof', 'noveluber', 'afgestooken', 'ocm15045154', '6455965', 'idx208420048060', '6406649', 'dnsc8706589', 'dnsc8706587', 'dnsc8706586', 'dnsc8706585', 'firstprincipleso00parl_0', 'firstprincipleso00parl_1', '2442383', '1318714', '04921800', 'wintkagasspan', 'choegun', 'd9e953e37b27cb8029cb2e5ca4ee690f', '6406647', '346373', 'ocm57308713', '4a329a29362c8693d61df65f222b60b4', '346376', 'dfst8304359', 'dnsc9001680', '04404800', 'dfst9106751', '652700225', 'meeler', '225251', 'underbank', 'ocm49779203', 'ufers', 'riilcil', 'qurxtity', '284358', '284359', '45882508', 'ficantly', '284352', '284353', '284351', '284356', '284357', '284355', '51758736', 'dnsc9001682', '06878700', 'c944f95bb4c850d0326c9bf7b6e35839', '412_dsp_waste2energy_0051', '469492', 'racticul', 'ahc071015002a', '2155521', 'wageindex', '7afeffc8f2e92619106', '2155522', '6698004', 'diarioenquesepro00cigo', '6698005', '0160734517', 'bankroll', 'ocm41962946', '236485020', '6698000', '870466468', '6698001', 'dfsd0202493', 'ahc1702149001', 'dfsd0202490', 'fiy', 'peble', 'colvtalner', '39999064270257', '428077796', 'dnsc8706588', 'veterinaryhomopa00hurn', '6698009', 'idx208420020051', 'idx208420020050', 'idx208420020053', 'idx208420020052', 'idx208420020055', 'idx208420020054', 'idx208420020057', 'idx208420020056', 'idx208420020059', 'idx208420020058', 'manipulaton', 'todod', '40_cfd_os_2004_1201_141_404', '40_cfd_os_2004_1201_141_405', '40_cfd_os_2004_1201_141_406', '40_cfd_os_2004_1201_141_400', '40_cfd_os_2004_1201_141_401', '40_cfd_os_2004_1201_141_402', '40_cfd_os_2004_1201_141_403', 'dnsn9010679', 'dnsn9010678', '40_cfd_os_2004_1201_141_408', '40_cfd_os_2004_1201_141_409', '6601455', 'highwagejobsinco00unit', 'fi4', '6601453', '6601452', 'whichemploy', 'fi9', '6601459', '501938444', '123955633', '43960370', 'onelincolnstreet00hmma', '723375', 'néill', 'phalluses', 'bustum', '237047684', '3a8f16a675b09218885', 'dfsd0405578', 'idx208420269484', '538677', 
'538675', '538674', '538673', '538672', '538671', '538670', 'embacy', 'llte1111', '538679', '538678', '6443891', '6443890', '6443893', '6443892', '6443895', '6443894', '6443897', '6443896', 'embach', '6443898', 'podali', 'dfst8600002', 'dfst8600003', 'footwashing', 'dfst8600001', 'dfst8600004', 'dnsc8703203', 'dnsc8703202', '0803728565', 'dnsc8703200', 'dnsc8703207', 'dnsc8703206', 'dnsc8703205', 'dnsc8703204', 'lettertodearaunt00west146', 'dnsc8703208', '5486051', 'idx208420165554', 'idx208420165555', 'idx208420165556', '67820610r', 'idx208420165550', 'idx208420165551', 'idx208420165552', 'idx208420165553', 'idx208420165558', 'idx208420165559', '39999066745066', '57368602', '0160708788', 'utarid', '728242073', '550828', 'putorious', 'utarin', 'hr096p2', 'berlingebrüder', 'ocm58431435', 'scrviot', 'dc10blk_c48061_o01', 'archonnov1923111dumm', 'agr65000117', '611902418', 'ranchlands', 'dfsd0505601', 'mw0398', 'mw0394', 'mw0395', 'mw0396', 'mw0397', 'mw0390', 'mw0392', 'mw0393', 'priiliitive', 'organisada', 'idx208420040645', 'chattenango', 'ocm57199627', 'annualreportnati19932nati', '755_131_015_01', '755_131_015_02', 'tobacco_nrp23e00', 'dayrl', '45199902', 'idx208420226025', 'lixuna', 'fineart', 'programlist00mass', 'b10828126', '03ad2af9e5e8a56ca6f69b3cc435e914', 'enfóquese', 'nomenclatoriszoo00agas', '766588', 'sermones04wycl', '7262462', '7262463', '7262460', '7262461', '7262466', '7262467', '7262464', '7262465', '7262468', '7262469', 'nxtilnd', 'ocn227183726', 'congenite', 'idx208420148768', 'idx208420148769', 'idx208420148766', 'idx208420148767', 'idx208420148764', 'idx208420148765', 'idx208420148762', 'idx208420148763', 'idx208420148760', 'idx208420148761', '826128635', 'claymoore', 'alleviations', 'contributiontohi01ridl', '12071628', 'magazijnvantuins15laar', 'investigationofcnyc0708unit', '454th', 'dnsc9108094', 'underband', 'achc172', 'compléte', 'annualreporttown1962huds', '0062_0053', '0062_0052', '0062_0051', '0062_0050', '0062_0057', '0062_0056', 
'0062_0055', '0062_0054', '0062_0059', '082833', '082836', 'in8st', '803985224', 'replytoharnackon00cremuoft', '200640', '200641', '200643', '200646', '200647', 'annualreport1908mass', 'ntdrmiyp', '39350564', '335648', 'bb3be5146b92278a48870308d6d82935', '885665870', 'r8026', 'archivfrnaturg7301berl', '57449076', 'dnsc9108090', 'ocm46705900', 'p0031_23621_0001', '6400344', '6400345', 'annualreportsoft1996stod', '6400347', '6400340', 'snrrey', '6400342', 'perlavenutainrom00pizz', '6400348', '6400349', 'bostoncollegemagsp1994bos', '676967', 'ahc099276002a', 'ocm50762106', 'ocm32263636', 'brucharzt', '19348145014000000000', '978984', '978987', '978982', '003753477', 'twk27811', 'vnrl', 'cbc48325_h02', 'vnrk', 'dfst9100038', 'gemonteerd', 'usparticipationi1993unit', 'ds01017', 'photomisc061', '000902464', '52913537', 'mueewscontains', '6473418', '6473419', '6473412', '6473413', '6473410', '6473411', '6473416', '6473417', '6473414', '6473415', '5865218', '5865219', '498_001', '34623874', '5865210', '5865211', '5865212', '5865213', '5865214', '5865215', '5865216', '438065950', '759866782', '427921282', '003_may', 'büderverzeichnis', '06_10_017847', '06_10_017846', '06_10_017845', '06_10_017844', '06_10_017843', '06_10_017842', '06_10_017841', '06_10_017840', 'rainmaking', '06_10_017849', '08_06_014939', '08_06_014938', '08_06_014937', '08_06_014936', '08_06_014935', '08_06_014934', '08_06_014933', '08_06_014932', '08_06_014931', '08_06_014930', 'ocm26865627', '00828702', '00828701', 'n24⁰', 'nouvellebiograph41hoef', 'alrso', 'annualreportofto2008unse', 'flnctnating', 'muskelbewegung', '14759262', 'ocm14847373', 'ecologicos', 'vegas_055', 'nfant', 'fws0bsusfi83140201usfi', 'eann', 'idx208420105650', 'idx208420105653', 'idx208420105652', 'idx208420105655', 'idx208420105657', 'idx208420105656', 'idx208420105659', 'idx208420105658', 'eana', '123915919', 'herthel', '20canyon', 'philipshandyatla00bart', 'pl10blk_c48185_000', '6890229', 'issliiiiiipo', 'tendenzdrama', 
'translationsrepr00univ', 'swalllpy', '292504', 'm2g46', '292506', '292507', '93590', 'tchats', 'idx208420043302', 'idx208420043303', 'idx208420043300', 'idx208420043301', 'idx208420043306', 'idx208420043307', 'idx208420043304', 'idx208420043305', 'idx208420043308', 'idx208420043309', 'physeodesmos', 'ptbw_149', 'ptbw_148', '6514768', '6514769', 'ef9ce3ebeb45153cb2f8f310afa603e4', '6514764', '6514765', '6514766', '6514767', '6514760', '6514762', '6514763', '6371356', '6371357', '6371354', '6371355', '6371352', '6371353', '6371350', '6371351', 'ailsofi', 'prsity', '6371358', '6371359', 't35b', 't35c', '794278362', 'desso', '800957', 'capecodnationals222unit', 'boyd1905', '706503356', 'p0001_1136_18648_verso', '2007_8_1163', '234072517', '234072515', '39999065430165', 'pl10blk_c48185_004', 'historyofmoderns02ferg', 'dm0308', 'dm0309', 'dm0306', 'dm0305', 'dm0302', 'dm0303', 'dm0300', 'dm0301', 'reportofboardofm00mass_25', 'inoiiejr', 'reportofboardofm00mass_24', '180764899', '19348025650100000000', 'pedicellate', '098889', '851083009', '66_2_1_001', 'dasc8610641', 'dasc8610640', 'reportofboardofm00mass_23', 'кампании', 'ocm54515622', 'reportofboardofm00mass_22', '006275927', '46670027', '8c5cf2d331b244e18b0fd1adb787e260', 'dmsd0742448', '81983ac', 'samaritanchronic00josh', 'laukaan', 'universityofnort326univ', '42448916456601000000', 'totoramba', 'condev2848_overview', '522141', 'americanmedicalt2186unse', 'investigationofconc12unit', 'rightbrained', 'koekkoek', 'meramecgreenway00nati', '5228565', 'immmlmm', 'chonsia', 'rightlateralized', 'dfst8411099', 'dfst8411098', 'agr64000414', '19348209700200000000', 'nerinea', '800022', '81640821', 'idx208420177306', 'idx208420177307', 'idx208420177304', 'idx208420177305', 'idx208420177302', 'idx208420177303', 'idx208420177300', 'idx208420177301', 'archivfrmikros911berl', 'ocm44621113', 'iilacle', 'idx208420177309', 'marbledecoration00blag', '60694786', '318910995', '800023', 'ms_1604_mitigationhouse_2415', 
'health_emphasis_program', 'idx208420258488', 'idx208420272199', 'idx208420272198', '800020', 'idx208420272195', 'idx208420272194', 'idx208420272197', 'idx208420272196', 'idx208420272191', 'idx208420272190', 'idx208420272193', 'idx208420272192', '00002466_tn_0001', 'newsbooks', '69952104', 'idx208420241588', 'idx208420241589', 'z203', 'idx208420241584', 'idx208420241586', 'idx208420241587', 'idx208420241580', 'idx208420241581', 'idx208420241582', 'idx208420241583', 'lltry', '6682510', '6682513', '6682515', '6682514', '6682517', 'acidlty', '6682519', '6682518', 'ahc167002032a', 'machery', 'researchpaper22nort', 'bibliapauperumco00unwi', '05000us48177m', 'soang', '206543', 'lettertodeardebo00chap', 'catanaei', '167656', '39999065612291', 'dnsn9804200', 'dfsd0202499', '06159200', 'dfsd0202498', '54490606', '31232081', 'universityofnort224univ', '800024', '35581049', '40152221', '00007852_tn_0001', 'studentsguidetod00nettuoft', 'fih', 'dfsd0202492', 'dentair', 'fik', '38596296', '97643431', '00002319_tn_0001', 'dfsd0202497', '19571027', '39999057055798', 'dfsd0202496', 'researchpapers81inte', 'cereti', 'kalokoe', 'dfsd0202495', '261710', 'dscn1922', 'cerete', '05067100', '13729521', 'censusofbusiness1952unit', 'cbs4893370_001', 'vorcl', 'instiu', 'evaluaciónes', 'ohtaiu', '65820290r', '19348199005000000000', '1049118', '48999328', '006355217', 'grifflin', 'ocm47248522', 'exobiologyineart00ames', 'armymedicalmuseumcollectioncatalogueofpathologicaldrawingsofmedical', '225867405', '6003189', '6003188', '6003187', '6003186', '39999063171092', '6003184', '6003183', '6003182', '6003181', '6003180', 'dnsn8604075', 'dnsn8604074', 'reportofdanishbi10dans', 'dnsn8604076', 'dnsn8604071', 'dnsn8604070', 'dnst8208141', 'athénée', 'dnst8208144', 'ptbw_209', '147279', 'shadscaje', 'abrahamvs', '6361628', 'marign', '6460578', '147271', '147270', '6361621', '147272', '6361627', '6361626', '6361625', '6361624', 'illspi', 'oceanographicobs1971moyn', 'ptbw_206', 'b20386424', 
'idx208420027629', 'idx208420027628', 'idx208420027626', 'idx208420027625', 'idx208420027624', 'idx208420027623', 'idx208420027622', 'idx208420027621', 'ncreased', 'poorpotterofyork01bark', '6460570', 'lq9ic', '2011506740', '2011506741', '2011506746', '6460573', 'lsouth', 'charruaud', '729885960', 'moanavilla', '755_029_012_01', 'sumner138_2_1_120c1', 'suiuciuc', 'idx208420243402', '6460576', 'fillallcial', 'lytleton', 'portrteeiniger00riga', 'idx208420207973', '002590', 'catalogueofstate1863stat', '277_neuve214999', 'idx208420017259', 'idx208420017258', '6401770', '6401776', '6401777', '6401774', 'dnsn9010670', 'idx208420017251', 'idx208420017250', 'idx208420017252', 'idx208420017255', 'idx208420017254', 'idx208420017257', 'idx208420017256', '40_cfd_os_2004_1201_141_407', 'dnsn9010675', 'preinstallation', '002597', 'twk61259', 'twk61254', 'twk61255', 'twk61256', 'twk61257', 'twk61250', 'twk61251', 'twk61252', 'twk61253', 'americanfarmer2425balt', 'pendleys', 'coustet', '009917303', 'blapey', '754123', 'oubrerie', 'fs200112', '36595169', 'fs200117', '6601454', 'fs200119', 'nazvy', '6601457', '810216859', 'ocm32849495', '6601456', '6601451', '943296', '6601450', '94025092', 'idx208420243404', 'histoirenaturell00less', 'buonconsiglio', '7f97e8ebc56f3b036fbaba41f511ef14', '50141339', '14200136', '42148375002000000000', 'laueblose', '3f⁸', '6601458', '5623098', '5623099', '5623092', '5623093', '5623090', '5623091', '5623096', 'i1r11', '5623094', '5623095', 'eiiiaient', 'paternitiy', '49214378', '06801433', 'annualreportcolu1944colu', 'idx208420243409', '1039100', 'pocketalmanackfo1807amer', 'guilfordcollegi519021903', '001326612', '855883', 'lehrlings', 'internationalesa04inte', 'dnsc9401209', '41382262', '04_1of2_dsc_2693', '04372800', 'doi_2770', '06538300', '62_5_9_001', '173219654', 'uspharmacopoeiai00slsnuoft', '42448918750060000000', 'roffler', 'hydrabiologie', '05236200', 'nouvellehyginede00tave', 'yonaknoka', '708252503', 'ocm50037038', '0160754798', 
'stakelin', '5fgeneral', '217821', '217822', '217823', '217824', '217825', '315850782', 'zeitschriftfrele16elek', 'umn23361b', 'cyclopaediaofpra11ziemuoft', '201416', '38994315', 'isetan', 'hotisse', 'cinique', '828288556', 'bogasse', '73272738', '39999065840397', 'kisima', 'whitemartinsgeni00whituoft', '719450300', '320456955', '5823242', '10760193', '9780160683268', 't45nr23e', '6663688', '6663689', '6663684', '6663685', '6663686', '6663687', '6663680', '6663681', '6663682', '6663683', '5613176', 'dnsd0311938', '753739604', 'bulletindelasoci311892soci', '449254', '2568052r', 'idx208420073510', 'dnst9101238', 'idx208420073511', 'dnst9101239', 'idx208420073512', 'dnst9101236', 'idx208420073513', '5613170', 'dnsd0311932', '6428023', 'dnst9101234', 'condev7577e', 'dnst9101235', 'idx208420073516', 'catalogueofficiel00expo', 'dnsd0311937', 'idx208420073517', 'annualreportofto1993unse', 'dnsd0311936', 'hightley', 'dnst9101230', '03229a', 'dnst9101231', '51277834', '6412729', 'ʹemile', 'staqe', 'wadowice', 'motilitydisturbance', '279240', '279242', '279244', '279247', 'idx208420045494', 'idx208420045495', 'idx208420045496', 'idx208420045497', 'idx208420045490', 'idx208420045491', 'idx208420045492', 'idx208420045493', 'cnnnor', 'idx208420045498', 'idx208420045499', 'iilpgs', 'va2259', 'gainestsown', '695055989', '464629053', '38136000196400', '85833201', '6472240', '6472241', '6472242', '6472243', '6472244', '6472245', '6472246', '6472247', '6472248', '6472249', '6600241', 'dx18780838', '00006612', '2412962', '00006615', '098231', 'nordhang', '713205230', '6600246', '01758900', '779978829', 'overcored', '00006616', 'paperspecs', '244008156', 'photosynthesis00spoe', 'dfsd0509278', '2065802', '7308832', 'lnrvnl', '1891mc', '88699', 'b1162353', 'lienor', 'annualreportfort1941bedf', 'eppsit', '52472843', '6523569', '85119', '8869c', '85116', '85117', '85114', '85115', 'd9f2d0667a199d15e0cc709874d89bb6', '765950', '2261177', 'lnnlity', 'siybgaalcteu', '109007', 'shmagin', 
'147974229', 'heredero', '230937142', '827199896', 'idx208420149314', 'idx208420149315', 'idx208420149316', 'idx208420149317', 'idx208420149310', 'musiclicensingsm00unit', 'idx208420149312', 'idx208420149313', '254209', 'cueur', 'idx208420149318', 'idx208420149319', '233697515', '7312413', '7312412', '7312411', 'fficiency', '7312417', '7312416', '7312415', '7312419', 'annualreporttown1960cent', 'caaperating', '291157', '04966000', 'histoireabrg00joll', 'psyamericanjourn33ameruoft', 'präparator', '61232861', '116782', 'problemofevilinp00full', 'dnsc8703201', 'idx208420042778', 'idx208420042779', 'a946', 'a940', 'a941', 'a942', 'a943', 'idx208420042770', 'idx208420042771', 'idx208420042772', 'idx208420042773', 'idx208420042774', 'perjudicados', 'idx208420042776', 'idx208420042777', 'establishplanned00bost', '294656', '294654', '294652', '294653', '294650', '294651', 'ascaláfidos', '06065200', 'livernois', '294658', '294659', 'henryviiienglish01gasq', 'va2253', '0160765609', 'economy4', ...]
def hasNumbers(inputString):
    """Return True if *inputString* contains at least one digit character.

    Digits are detected with ``str.isdigit``, so non-ASCII digits (for
    example superscripts) count as well. An empty string yields False.
    """
    return any(char.isdigit() for char in inputString)
# Keep only the hapaxes of `fd` that contain no digit characters, which
# drops identifier-like tokens such as 'idx208420073510' or '217821'.
# Order of fd.hapaxes() is preserved.
# (assumes `fd` is an nltk FreqDist built earlier in the session - confirm)
texthaps = [hap for hap in fd.hapaxes() if not hasNumbers(hap)]
len(texthaps)
392117
len(fd.hapaxes())
2773182
texthaps
['contiiinecl', 'greise', 'noveluber', 'afgestooken', 'wintkagasspan', 'choegun', 'meeler', 'underbank', 'ufers', 'riilcil', 'qurxtity', 'ficantly', 'racticul', 'wageindex', 'bankroll', 'fiy', 'peble', 'colvtalner', 'manipulaton', 'todod', 'whichemploy', 'néill', 'phalluses', 'bustum', 'embacy', 'embach', 'podali', 'footwashing', 'utarid', 'putorious', 'utarin', 'berlingebrüder', 'scrviot', 'ranchlands', 'priiliitive', 'organisada', 'chattenango', 'dayrl', 'lixuna', 'fineart', 'enfóquese', 'nxtilnd', 'congenite', 'claymoore', 'alleviations', 'underband', 'compléte', 'ntdrmiyp', 'snrrey', 'brucharzt', 'vnrl', 'vnrk', 'gemonteerd', 'mueewscontains', 'büderverzeichnis', 'rainmaking', 'alrso', 'flnctnating', 'muskelbewegung', 'ecologicos', 'nfant', 'eann', 'eana', 'herthel', 'issliiiiiipo', 'tendenzdrama', 'swalllpy', 'tchats', 'physeodesmos', 'ailsofi', 'prsity', 'desso', 'inoiiejr', 'pedicellate', 'кампании', 'laukaan', 'totoramba', 'rightbrained', 'koekkoek', 'immmlmm', 'chonsia', 'rightlateralized', 'nerinea', 'iilacle', 'health_emphasis_program', 'newsbooks', 'lltry', 'acidlty', 'machery', 'soang', 'catanaei', 'fih', 'dentair', 'fik', 'cereti', 'kalokoe', 'cerete', 'vorcl', 'instiu', 'evaluaciónes', 'ohtaiu', 'grifflin', 'armymedicalmuseumcollectioncatalogueofpathologicaldrawingsofmedical', 'athénée', 'shadscaje', 'abrahamvs', 'marign', 'illspi', 'ncreased', 'lsouth', 'charruaud', 'moanavilla', 'suiuciuc', 'fillallcial', 'lytleton', 'preinstallation', 'pendleys', 'coustet', 'blapey', 'oubrerie', 'nazvy', 'buonconsiglio', 'laueblose', 'eiiiaient', 'paternitiy', 'lehrlings', 'roffler', 'hydrabiologie', 'yonaknoka', 'stakelin', 'isetan', 'hotisse', 'cinique', 'bogasse', 'kisima', 'hightley', 'ʹemile', 'staqe', 'wadowice', 'motilitydisturbance', 'cnnnor', 'iilpgs', 'gainestsown', 'nordhang', 'overcored', 'paperspecs', 'lnrvnl', 'lienor', 'eppsit', 'lnnlity', 'siybgaalcteu', 'shmagin', 'heredero', 'cueur', 'fficiency', 'caaperating', 'präparator', 'perjudicados', 
'ascaláfidos', 'livernois', 'yamasukera', 'saurole', 'antsiraben', 'ahimatitsy', 'goelawa', 'lusembe', 'agrcelnent', 'maurain', 'andana', 'bucassen', 'hcving', 'invigorators', 'natalbai', 'clvc', 'caarriecl', 'efugee', 'wahasha', 'clisenscs', 'schilcrat', 'ruvin', 'calmcit', 'eazey', 'hydrophobicities', 'llcigl', 'sincan', 'cesana', 'missionassisant', 'yecorato', 'sellare', 'tibetische', 'presidentswashington', 'tehefoucte', 'ethnomedicine', 'tasmanien', 'saimdang', 'linbs', 'harlaire', 'ethiop', 'bluehill', 'kafri', 'tekau', 'hurney', 'tioton', 'tekam', 'portaient', 'ssistetla', 'ghottingen', 'circuito', 'kaiawase', 'faucettralphebiography', 'definciency', 'belolawek', 'longnor', 'pilosebaceous', 'stoillach', 'msun', 'nlcclical', 'throi', 'kaululaau', 'throa', 'throc', 'антоновские', 'illatter', 'bollwyller', 'winkelmannischen', 'drycell', 'treff', 'yull', 'cialized', 'dahcotah', 'komarovʺ', 'rossolimo', 'thousaild', 'lognette', 'galgesberg', 'umbeluzi', 'prorluct', 'stephanius', 'castellamonte', 'lewice', 'wojtosik', 'abdeldaim', 'haitt', 'haita', 'schnée', 'gathr', 'piranema', 'lucrene', 'godov', 'hyten', 'jadot', 'brsorc', 'eingaklebt', 'swotting', 'kistenmacher', 'tichig', 'lliliversity', 'chimneystacks', 'dandolo', 'shevistskikh', 'gentisate', 'semiofficiul', 'mearsheimer', 'jounrnal', 'waliace', 'wahrhafftige', 'rechardson', 'turnikom', 'niounttlins', 'baramadagascar', 'ressainance', 'janorschke', 'аграрной', 'editnr', 'upholstory', 'rcjuis', 'mormoopidae', 'dipsogenic', 'ŭihoe', 'salomonsen', 'crames', 'apalaches', 'mcotea', 'sagacitate', 'fringy', 'seminaristinnenkurs', 'encouragers', 'alî', 'agmiller', 'morzon', 'insitituto', 'regentibus', 'strandmark', 'ladhoff', 'иностранным', 'créance', 'branty', 'nncienl', 'pietruszo', 'reous', 'pietrusza', 'requring', 'eigl', 'eigo', 'sinamary', 'dashino', 'dorpsoverste', 'eevvaannggeelliizziinngg', 'lloyte', 'wwod', 'ontiberos', 'cqulvnlent', 'espero', 'photogrammety', 'ulst', 'multifire', 'cuicado', 'adamsone', 
'dcvclqpwnt', 'aspian', 'adamsons', 'converty', 'travleing', 'dendale', 'cveli', 'ashurnasirpal', 'lsbilitgo', 'abgerungen', 'tacting', 'reeneng', 'nucleotidases', 'alz', 'diaconia', 'langnaw', 'phorbia', 'schwst', 'mediatii', 'einschlieslich', 'losleben', 'withsubsequent', 'boxman', 'gewonnener', 'hensry', 'bruti', 'gefährligkeiten', 'rhayader', 'albenseiten', 'reloplnent', 'approsimately', 'vlttu', 'louzon', 'gloghini', 'reasest', 'diigital', 'chúng', 'junkets', 'glenrosa', 'harmount', 'abarms', 'wawuna', 'hogards', 'delinavit', 'ē', 'solvccl', 'lumutbalai', 'sigananda', 'martargis', 'iaensc', 'friedrichstafen', 'elyah', 'lililyo', 'hungerpest', 'shaijples', 'estioin', 'akomatsu', 'latil', 'unfought', 'kasavubu', 'rited', 'disfrutara', 'piedmonth', 'insepector', 'latiu', 'wanzhou', 'вапко', 'claii', 'tomologyw', 'claie', 'thatsächliche', 'braunschw', 'tlcleterious', 'comwesseafron', 'cliscoveri', 'cliscoverg', 'cliscoverd', 'onyefulu', 'eggsquisite', 'binjouin', 'vatundamu', 'lugee', 'merpeople', 'fcedings', 'jaywalkers', 'dllrille', 'durres', 'durret', 'ungen', 'ungeh', 'suerfu', 'ungef', 'fischtrockenplatz', 'amaudruz', 'atact', 'podepsa', 'citlzens', 'hoëvell', 'feteioa', 'jsteiv', 'pánico', 'sosikrates', 'collectivized', 'gülden', 'ordinavit', 'asxi', 'intimiano', 'ctzel', 'boutot', 'murza', 'celebrees', 'halacsy', 'obsèques', 'availnl', 'arcandi', 'cidio', 'consecta', 'soopahya', 'houtton', 'yungon', 'breissgauischen', 'suhe', 'suho', 'suhs', 'sanitio', 'infrmatin', 'romatipografia', 'xianwu孔宪武', 'gijsbertus', 'propolae', 'gastrophod', 'medisons', 'accão', 'probates', 'vergier', 'головину', 'tjlu', 'syncopalis', 'tjll', 'tjli', 'tenthouses', 'heldenbuch', 'nonunions', 'beets_______________________________________________', 'sitllation', 'muckerman', 'illvasioli', 'spaski', 'dascalescu', 'belgariad', 'pothicary', 'oouurrccuullttuurree', 'плеть', 'fogva', 'noening', 'uncentralized', 'zuydt', 'ambitiously', 'cllallges', 'ivifbboobrrrniiiggiinngg', 'mgney', 
'babungokind', 'iinetl', 'bioisosteres', 'pressurevolume', 'isurumunija', 'solfeggio', 'illformed', 'merdin', 'dannal', 'dannam', 'cakenge', 'merhandise', 'dannat', 'vniuersis', 'magnética', 'caxcan', 'cjrried', 'cessing', 'oeurs', 'investmerrts', 'götzenfesten', 'lnental', 'mdclxxxxix', 'essercitato', 'munduruku', 'atomgruppen', 'женской', 'copiè', 'benicht', 'sairsc', 'genvinam', 'thehvside', 'rachitogenesis', 'aclminislration', 'koepf', 'kift', 'detenninecl', 'horloger', 'naudaei', 'magaro', 'limbowe', 'abegawa', 'mahemiah', 'bakevellia', '牧草大田轮作制的理论与技术', 'canaletti', 'conrltr', 'failittg', 'patert', 'ganisnl', 'optitrack', 'stationsgeschwister', 'sonderbahre', 'reestab', 'tranvik', 'ericksonhurt', 'ayudarán', 'superintendint', 'cfveral', 'bisignano', 'lagrassa', 'hwadlefy', 'prrat', 'dissapproval', 'platenses', 'gorean', 'oolanahhee', 'snni', 'yuj', 'riječi', 'friedensengel', 'shoort', 'pyonchan', 'oncepts', 'flatcl', 'typper', 'sarcamento', 'scecam', 'elllenton', 'grufferman', 'nerby', 'sabeundi', 'hulsobus', 'nicklos', 'hallengeso', 'thâeophile', 'hypother', 'angiolini', 'cuthrie', 'sorgfältigste', 'gretehelskov', 'latabll', 'petroiacomo', 'cltlily', 'prnited', 'bejewelled', 'charmber', 'streynsham', 'mangaoang', 'marantearum', 'inocultztio', 'bescherme', 'surrouncled', 'ohrloff', 'luggs', 'kršćanskom', 'lugga', 'irlcilils', 'heavenbeijingming', 'frommarch', 'wasenberg', 'augustln', 'shodows', 'venkatanarasimha', 'mavjen', 'tenox', 'eiran', 'vnccint', 'tabyshalieva', 'millwr', 'prevendado', 'millwn', 'wärmflaschen', 'lillypad', 'pharisiens', 'trsiiive', 'accwnul', 'belcano', 'nitwi', 'menaham', 'meditsinskii', 'vigileo', 'kressner', 'efforcaient', 'fulanis', 'sorgotten', 'faciendo', 'sanshui', 'facienda', 'zeën', 'shacleford', 'parguasa', 'jtiaidia', 'videofile', 'kuchni', 'tanceac', 'bonifont', 'unatineg', 'consvls', 'sutek', 'norresundby', 'originalgrösse', 'specialistsâ', 'pflycbolojfy', 'lopardi', 'mjesta', 'yokevich', 'strafeships', 'mahachan', 
'plistodon', 'ficticsoosi', 'psnlts', 'weegar', 'dobrego', 'iderable', 'cotapino', 'oversllpply', 'octaethylporphyrinatomanganese', 'iderably', 'signiiicnnt', 'robbards', 'leimentoll', 'lamroena', 'noncompact', 'ryckère', 'primaveral', 'gnieznienskiego', 'tamikas', 'grenony', 'bactcria', 'gastroli', 'ocotal', 'riconoscenza', 'monoarticular', 'bulletinsummerquappa', 'crotzer', 'utilus', 'talija', 'fereeda', 'kleinster', 'kleinstes', 'promnix', 'rusciano', 'chrisant', 'misshappen', 'oinicn', 'связанных', 'harrewyn', 'phlegm', 'skupljena', 'epidemiologi', 'imags', 'berliz', 'sheelat', 'descobertos', 'unreviewed', 'caesareansectionatfullterm', 'justitiam', 'rairden', 'strakville', 'vavenby', 'unmt', 'tuscapampa', 'morlund', 'hrougll', 'cluestionable', 'kupecký', 'vebred', 'guican', 'lastboote', 'nitrogenn', 'mewaygo', 'nitrogenj', 'womenßs', 'turistjcgob', 'explicatifs', 'cuyubini', 'carwrecks', 'korlet', 'nalyo', 'parasitarias', 'auristes', 'cawte', 'mitlli', 'clairinghnigh', 'gierish', 'estanco', 'ugel', 'missionsalmosen', 'uger', 'ussie', 'choquita', 'factaor', 'intermecliate', 'văn', 'tempelterrasse', 'multisynaptic', 'palmerly', 'malakhovskago', 'cheother', 'tremellen', 'diputndo', 'uvum', 'garwicz', 'tsournos', 'licntion', 'bauant', 'виссарионович', 'powdilfhorn', 'murphay', 'geroll', 'feyken', 'fromrichard', 'eastabrooks', 'jugali', 'mégaptère', 'utlis', 'xsoiirces', 'mgct', 'abulencia', 'marindin', 'blumenga', 'brethaur', 'guitele', 'dégré', 'potjokonkong', 'erysipela', 'variodermite', 'depiciences', 'shadkhen', 'catechetische', 'owyhigh', 'moutliecl', 'scherschligt', 'soudé', 'valbracht', 'historienschreiber', 'zamparelli', 'foulc', 'lecram', 'poespa', 'hospsital', 'veteriizary', 'ezekeli', 'unqualifiecl', 'eloya', 'succer', 'reginarum', 'succee', 'bildenback', 'chahogum', 'generalate', 'ordensmönchen', 'karioglu', 'proverbia', 'élisabeth', 'doncqu', 'publisherl', 'bameileke', 'fannel', 'alligence', 'prevellts', 'aassembly', 'enroller', 'metiokochda', 
'trumall', 'killbourn', 'tailaferro', 'enforcerneiit', 'linggau', 'baranovski', 'xenpang', 'etymologicas', 'ahoo', 'officemax', 'thrach', 'fiskerjenten', 'düsseldorff', 'fosfatado', 'nechl', 'necho', 'ne_bras_ka', 'cosp', 'steaminj', 'mlekopitaiushchi', 'kodachromes', 'tyrolis', 'laminoplasty', 'mcgegor', 'lakeoff', 'asthenospheric', 'robatzek', 'privileres', 'yurugi', 'gmj', 'gmn', 'gmb', 'uppon', 'cuddapan', 'terrerium', 'ninnigret', 'mutilados', 'strandtmann', 'wäscherei', 'hilalcement', 'собраны', 'graspers', 'gesetzgebungen', 'benandri', 'masonthe', 'rusticity', 'sencoes', 'maineman', 'enemecio', 'foldwer', 'brimsby', 'convience', 'triunfalel', 'marshmellow', 'mtji', 'ebraeorvm', 'stwrt', 'leptandrin', 'samilkameenensis', 'zaander', 'unple', 'rrelelv', 'melothian', 'rorich', 'deprllcls', 'cwky', 'awesomely', 'accommodationist', 'dilena', 'silkrree', 'intrabead', 'armymedicalmuseumcollectionlogbooksprovisionalpathologicalseries', 'padoukholz', 'lienholders', 'beethom', 'organian', 'legary', 'rofewors', 'roseneid', 'subpena', 'nwhieh', 'mcmadaw', 'iyula', 'chengwen许成文', 'dhamala', 'нагорный', 'fulvigula', 'erntetanz', 'passmores', 'gallicarum', 'yfls', 'coalcompany', 'romatic', 'zobo', 'yfli', 'dadle', 'bsatroop', 'rcalixar', 'purumi', 'aescorting', 'audiovisuelles', 'brinkac', 'teritoriul', 'illdefinite', 'finanze', 'teasppon', 'novopen', 'ramasseurs', 'amirdara', 'mulanax', 'shulzes', 'sleying', 'mclliocls', 'qugmentation', 'berggeschichten', 'imint', 'stlein', 'distiehlis', 'resoarch', 'albigenses', 'rajasingha', 'ungoya', 'inkunabel', 'stactites', 'contillually', 'evenutally', 'deformidad', 'le__', 'fullchearings', 'gardnevi', 'obliquinity', 'charduar', 'stiiclic', 'engelska', 'jakins', 'huach', 'engelskt', 'descoeudres', 'barcal', 'seltlo', 'generalhead', 'wasdetermined', 'stratoii', 'asentaran', 'nationagl', 'raypublican', 'bindernagel', 'rudal', 'venkatachalapathy', 'angioletti', 'tamtamspeler', 'glstn', 'gansarski', 'remijius', 'ilune', 'conontoxin', 
'lreling', 'alfrlfr', 'morrili', 'snouted', 'incurabli', 'deparptment', 'deviacion', 'glockebaum', 'connitlons', 'potö', 'forestell', 'glayd', 'gevaerlijcke', 'glays', 'meresman', 'januam', 'other_so_', 'ongeo', 'irmscher', 'clitioiis', 'rinserrano', 'hylenaea', 'yonathan', 'pinlc', 'outwith', 'iinnssppiriartiaotniiostns', 'arisoptera', 'ensnare', 'schallbildung', 'ranexa', 'olsenm', 'duchesnoy', 'bowlingtown', 'laumsa', 'remport', 'serting', 'estandarizadas', 'ivac', 'depraedation', 'tiongco', 'shahnovich', 'chausses', 'emisije', 'peyritsch', 'kaake', 'affligeante', 'brukes', 'bruker', 'ontstanding', 'mpraéso', ...]
# Map each collection name to the list of its digit-free hapaxes and print
# one "name | count" line per collection.
colltexthaps = {}
# Copy instead of aliasing: with `colls2 = colls`, the append below would
# also add 'artstor' to `colls` itself, and again on every re-run.
colls2 = list(colls)
colls2.append('artstor')
# NOTE(review): the recorded run prints 392117 for 'artstor', the same value
# as len(texthaps) computed from `fd` - confirm that fds['artstor'] really
# holds the per-collection distribution.
for coll in colls2:
    colltexthaps[coll] = [
        hap for hap in fds[coll].hapaxes() if not hasNumbers(hap)
    ]
    print(coll, "|", str(len(colltexthaps[coll])))
biodiv | 36351 rumsey | 11564 commonwealth | 17291 georgia | 19225 harvard | 10822 ia | 82685 getty | 8896 kentucky | 10656 minnesota | 14554 missouri | 17175 mwdl | 161243 nara | 42642 nocar | 30677 smiths | 96110 socar | 18369 texas | 23045 gpo | 27582 illinois | 19754 usc | 70684 virginia | 6878 nocoll | 995 artstor | 392117