In [1]:
import pickle
import nltk

stats = pickle.load( open( "/media/storage/dpla-data/pickles/new/newstats.p", "rb" ) )
common = pickle.load( open( "/media/storage/dpla-data/pickles/new/common.p", "rb" ) )
searcom = pickle.load( open( "/media/storage/dpla-data/pickles/new/sear_common.p", "rb" ) )
searfilt = pickle.load(open( "/media/storage/dpla-data/pickles/new/searches_filtered.p", "rb" ) ) 
In [2]:
stats
Out[2]:
{'artstor': {'funiq': 60168,
  'fwc': 5025070,
  'haps': 29757,
  'lowerhaps': 24103,
  'uniq': 60293,
  'wc': 6972534},
 'biodiv': {'funiq': 94248,
  'fwc': 5658739,
  'haps': 44804,
  'lowerhaps': 38471,
  'uniq': 94372,
  'wc': 6381376},
 'commonwealth': {'funiq': 204577,
  'fwc': 11348522,
  'haps': 159095,
  'lowerhaps': 154009,
  'uniq': 204703,
  'wc': 14022356},
 'georgia': {'funiq': 150863,
  'fwc': 32656431,
  'haps': 89668,
  'lowerhaps': 79492,
  'uniq': 150990,
  'wc': 42031491},
 'getty': {'funiq': 54355,
  'fwc': 14251103,
  'haps': 11663,
  'lowerhaps': 9767,
  'uniq': 54474,
  'wc': 18732730},
 'gpo': {'funiq': 437646,
  'fwc': 21860075,
  'haps': 351637,
  'lowerhaps': 343619,
  'uniq': 437770,
  'wc': 26316103},
 'harvard': {'funiq': 35918,
  'fwc': 849987,
  'haps': 20447,
  'lowerhaps': 18025,
  'uniq': 36036,
  'wc': 968898},
 'ia': {'funiq': 502974,
  'fwc': 16996418,
  'haps': 394559,
  'lowerhaps': 378206,
  'uniq': 503101,
  'wc': 23288038},
 'illinois': {'funiq': 49018,
  'fwc': 1829267,
  'haps': 29755,
  'lowerhaps': 23849,
  'uniq': 49143,
  'wc': 2385501},
 'kentucky': {'funiq': 30374,
  'fwc': 6800530,
  'haps': 14090,
  'lowerhaps': 11338,
  'uniq': 30498,
  'wc': 9405279},
 'minnesota': {'funiq': 43666,
  'fwc': 3598870,
  'haps': 21112,
  'lowerhaps': 17674,
  'uniq': 43791,
  'wc': 4495075},
 'missouri': {'funiq': 119586,
  'fwc': 3542143,
  'haps': 90859,
  'lowerhaps': 85222,
  'uniq': 119713,
  'wc': 4256929},
 'mwdl': {'funiq': 793849,
  'fwc': 87424176,
  'haps': 542772,
  'lowerhaps': 504876,
  'uniq': 793976,
  'wc': 111155337},
 'nara': {'funiq': 1082133,
  'fwc': 54355031,
  'haps': 990235,
  'lowerhaps': 978968,
  'uniq': 1082259,
  'wc': 65649116},
 'nocar': {'funiq': 258024,
  'fwc': 27360155,
  'haps': 165815,
  'lowerhaps': 157524,
  'uniq': 258151,
  'wc': 33487819},
 'nocoll': {'funiq': 1785,
  'fwc': 4626,
  'haps': 1307,
  'lowerhaps': 1202,
  'uniq': 1867,
  'wc': 6192},
 'rumsey': {'funiq': 47343,
  'fwc': 8825833,
  'haps': 14682,
  'lowerhaps': 12520,
  'uniq': 47463,
  'wc': 11667865},
 'smiths': {'funiq': 432279,
  'fwc': 51922316,
  'haps': 182337,
  'lowerhaps': 157337,
  'uniq': 432406,
  'wc': 59927374},
 'socar': {'funiq': 61687,
  'fwc': 5809794,
  'haps': 31055,
  'lowerhaps': 25606,
  'uniq': 61813,
  'wc': 7138136},
 'texas': {'funiq': 855594,
  'fwc': 72699549,
  'haps': 245710,
  'lowerhaps': 237998,
  'uniq': 855720,
  'wc': 88574895},
 'usc': {'funiq': 259523,
  'fwc': 41542851,
  'haps': 108852,
  'lowerhaps': 94218,
  'uniq': 259650,
  'wc': 49296854},
 'virginia': {'funiq': 41374,
  'fwc': 1790985,
  'haps': 32902,
  'lowerhaps': 31738,
  'uniq': 41493,
  'wc': 2248517}}
In [3]:
import pandas as pd
df = pd.DataFrame(stats)
df.columns = ['ARTstor', 'Biodiversity Heritage Library', 'Digital Commonwealth', 'Digital Library of Georgia',
              'J. Paul Getty Trust', 'United States Government Printing Office (GPO)', 'Harvard Library',
              'Internet Archive', 'University of Illinois at Urbana-Champaign', 'Kentucky Digital Library',
              'Minnesota Digital Library', 'Missouri Hub', 'Mountain West Digital Library',
              'National Archives and Records Administration', 'North Carolina Digital Heritage Center',
              ' ', 'David Rumsey', 'Smithsonian Institution', 'South Carolina Digital Library', 
              'The Portal to Texas History', 'University of Southern California. Libraries',
              'University of Virginia Library']
df.T
df.T.to_csv("nltk.stats.csv")
In [8]:
from IPython.display import display
display(pd.melt(df.T.reset_index(), id_vars=['index']).sort('index'))
index variable value
15 funiq 1785
37 fwc 4626
59 haps 1307
81 lowerhaps 1202
103 uniq 1867
125 wc 6192
0 ARTstor funiq 60168
22 ARTstor fwc 5025070
44 ARTstor haps 29757
66 ARTstor lowerhaps 24103
88 ARTstor uniq 60293
110 ARTstor wc 6972534
1 Biodiversity Heritage Library funiq 94248
23 Biodiversity Heritage Library fwc 5658739
45 Biodiversity Heritage Library haps 44804
67 Biodiversity Heritage Library lowerhaps 38471
89 Biodiversity Heritage Library uniq 94372
111 Biodiversity Heritage Library wc 6381376
16 David Rumsey funiq 47343
38 David Rumsey fwc 8825833
60 David Rumsey haps 14682
82 David Rumsey lowerhaps 12520
104 David Rumsey uniq 47463
126 David Rumsey wc 11667865
2 Digital Commonwealth funiq 204577
24 Digital Commonwealth fwc 11348522
46 Digital Commonwealth haps 159095
68 Digital Commonwealth lowerhaps 154009
90 Digital Commonwealth uniq 204703
112 Digital Commonwealth wc 14022356
... ... ... ...
19 The Portal to Texas History funiq 855594
41 The Portal to Texas History fwc 72699549
63 The Portal to Texas History haps 245710
85 The Portal to Texas History lowerhaps 237998
107 The Portal to Texas History uniq 855720
129 The Portal to Texas History wc 88574895
5 United States Government Printing Office (GPO) funiq 437646
27 United States Government Printing Office (GPO) fwc 21860075
49 United States Government Printing Office (GPO) haps 351637
71 United States Government Printing Office (GPO) lowerhaps 343619
93 United States Government Printing Office (GPO) uniq 437770
115 United States Government Printing Office (GPO) wc 26316103
8 University of Illinois at Urbana-Champaign funiq 49018
30 University of Illinois at Urbana-Champaign fwc 1829267
52 University of Illinois at Urbana-Champaign haps 29755
74 University of Illinois at Urbana-Champaign lowerhaps 23849
96 University of Illinois at Urbana-Champaign uniq 49143
118 University of Illinois at Urbana-Champaign wc 2385501
20 University of Southern California. Libraries funiq 259523
42 University of Southern California. Libraries fwc 41542851
64 University of Southern California. Libraries haps 108852
86 University of Southern California. Libraries lowerhaps 94218
108 University of Southern California. Libraries uniq 259650
130 University of Southern California. Libraries wc 49296854
21 University of Virginia Library funiq 41374
43 University of Virginia Library fwc 1790985
65 University of Virginia Library haps 32902
87 University of Virginia Library lowerhaps 31738
109 University of Virginia Library uniq 41493
131 University of Virginia Library wc 2248517

132 rows × 3 columns

In [9]:
pd.melt(df.T.reset_index(), id_vars=['index']).sort('index').to_csv('nltk.stats.melted.tmp.csv')
In [4]:
#>>> fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
fd = nltk.FreqDist(token.lower() for token in searfilt)
fd.most_common()
Out[4]:
[('war', 9316),
 ('history', 7940),
 ('new', 7432),
 ('county', 7030),
 ('georgia', 6108),
 ('university', 6024),
 ('american', 5883),
 ('library', 5799),
 ('john', 5015),
 ('world', 4293),
 ('women', 4092),
 ('york', 3972),
 ('civil', 3853),
 ('states', 3604),
 ('united', 3590),
 ('carolina', 3570),
 ('de', 3223),
 ('william', 3148),
 ('south', 3072),
 ('art', 2864),
 ('school', 2734),
 ('utah', 2726),
 ('city', 2682),
 ('north', 2581),
 ('state', 2563),
 ('james', 2517),
 ('public', 2490),
 ('family', 2335),
 ('c', 2308),
 ('boston', 2280),
 ('george', 2254),
 ('college', 2159),
 ('map', 2154),
 ('atlanta', 2126),
 ('america', 2103),
 ('national', 2032),
 ('thomas', 1979),
 ('virginia', 1950),
 ('great', 1883),
 ('charles', 1825),
 ('st', 1820),
 ('texas', 1818),
 ('ga', 1804),
 ('washington', 1801),
 ('california', 1801),
 ('african', 1779),
 ('robert', 1727),
 ('minnesota', 1722),
 ('rights', 1708),
 ('genealogy', 1702),
 ('book', 1680),
 ('digital', 1634),
 ('lat', 1617),
 ('church', 1616),
 ('black', 1614),
 ('massachusetts', 1588),
 ('henry', 1587),
 ('king', 1584),
 ('education', 1581),
 ('j', 1581),
 ('life', 1577),
 ('books', 1568),
 ('d', 1525),
 ('archives', 1522),
 ('maps', 1500),
 ('music', 1472),
 ('century', 1468),
 ('west', 1450),
 ('la', 1434),
 ('b', 1430),
 ('e', 1429),
 ('park', 1420),
 ('david', 1411),
 ('railroad', 1370),
 ('island', 1354),
 ('ohio', 1343),
 ('museum', 1335),
 ('photographs', 1328),
 ('house', 1314),
 ('chicago', 1304),
 ('children', 1289),
 ('h', 1288),
 ('social', 1287),
 ('street', 1254),
 ('science', 1250),
 ('san', 1244),
 ('society', 1214),
 ('illinois', 1206),
 ('f', 1205),
 ('lake', 1187),
 ('works', 1174),
 ('w', 1168),
 ('law', 1162),
 ('revolution', 1161),
 ('martin', 1157),
 ('indian', 1149),
 ('indians', 1139),
 ('river', 1139),
 ('english', 1132),
 ('americans', 1126),
 ('libraries', 1100),
 ('architecture', 1091),
 ('mary', 1089),
 ('l', 1088),
 ('m', 1084),
 ('ny', 1080),
 ('historical', 1079),
 ('paul', 1065),
 ('kentucky', 1053),
 ('michigan', 1044),
 ('records', 1042),
 ('joseph', 1040),
 ('fiction', 1040),
 ('high', 1031),
 ('ii', 1020),
 ('r', 1019),
 ('center', 1009),
 ('37', 1007),
 ('government', 999),
 ('mass', 989),
 ('louis', 988),
 ('china', 983),
 ('act', 969),
 ('native', 966),
 ('996162679728116', 957),
 ('edward', 955),
 ('slavery', 941),
 ('vietnam', 930),
 ('company', 921),
 ('smith', 919),
 ('business', 919),
 ('lincoln', 916),
 ('literature', 899),
 ('death', 899),
 ('japanese', 897),
 ('woman', 894),
 ('language', 882),
 ('fire', 878),
 ('white', 876),
 ('day', 870),
 ('buildings', 865),
 ('richard', 860),
 ('pennsylvania', 853),
 ('chinese', 851),
 ('early', 845),
 ('spanish', 844),
 ('luther', 840),
 ('kennedy', 840),
 ('slave', 831),
 ('roosevelt', 826),
 ('ma', 816),
 ('brown', 812),
 ('general', 809),
 ('missouri', 809),
 ('design', 807),
 ('florida', 807),
 ('o', 806),
 ('frank', 806),
 ('collection', 805),
 ('first', 805),
 ('french', 802),
 ('child', 801),
 ('n', 797),
 ('pa', 796),
 ('management', 795),
 ('military', 792),
 ('old', 790),
 ('england', 781),
 ('lewis', 779),
 ('army', 774),
 ('southern', 769),
 ('mountain', 769),
 ('us', 767),
 ('los', 764),
 ('2', 762),
 ('u', 761),
 ('jackson', 759),
 ('labor', 755),
 ('indiana', 754),
 ('health', 752),
 ('bible', 752),
 ('immigration', 750),
 ('colorado', 749),
 ('ca', 747),
 ('mexico', 746),
 ('depression', 742),
 ('battle', 742),
 ('red', 740),
 ('union', 736),
 ('harvard', 730),
 ('lee', 728),
 ('baseball', 725),
 ('samuel', 723),
 ('man', 722),
 ('g', 717),
 ('nc', 716),
 ('1865', 716),
 ('east', 716),
 ('hill', 713),
 ('air', 710),
 ('p', 705),
 ('home', 705),
 ('image', 704),
 ('arizona', 702),
 ('research', 701),
 ('1', 701),
 ('alabama', 699),
 ('maine', 699),
 ('journal', 698),
 ('wisconsin', 695),
 ('technology', 693),
 ('people', 693),
 ('franklin', 692),
 ('tennessee', 689),
 ('men', 680),
 ('ancient', 680),
 ('fort', 679),
 ('theater', 672),
 ('little', 668),
 ('schools', 667),
 ('food', 665),
 ('france', 657),
 ('mn', 653),
 ('culture', 652),
 ('pictorial', 647),
 ('connecticut', 642),
 ('politics', 641),
 ('newspapers', 634),
 ('collections', 633),
 ('work', 626),
 ('1918', 625),
 ('francisco', 625),
 ('smithsonian', 624),
 ('oregon', 622),
 ('charleston', 621),
 ('movement', 618),
 ('medical', 617),
 ('german', 614),
 ('travel', 613),
 ('special', 613),
 ('johnson', 608),
 ('germany', 607),
 ('portrait', 605),
 ('angeles', 602),
 ('1920', 602),
 ('elizabeth', 600),
 ('peter', 597),
 ('information', 596),
 ('gold', 595),
 ('rock', 595),
 ('1800', 594),
 ('scott', 591),
 ('1861', 588),
 ('1914', 586),
 ('computer', 585),
 ('1945', 585),
 ('mining', 585),
 ('harry', 585),
 ('benjamin', 584),
 ('suffrage', 581),
 ('kansas', 575),
 ('camp', 575),
 ('india', 574),
 ('magazine', 573),
 ('clark', 572),
 ('v', 570),
 ('power', 568),
 ('alexander', 566),
 ('human', 566),
 ('service', 565),
 ('iowa', 563),
 ('michael', 563),
 ('photography', 562),
 ('letter', 562),
 ('religion', 561),
 ('medicine', 560),
 ('building', 557),
 ('portraits', 555),
 ('department', 554),
 ('project', 554),
 ('van', 549),
 ('water', 547),
 ('1939', 546),
 ('hall', 546),
 ('trade', 546),
 ('mississippi', 544),
 ('valley', 541),
 ('industry', 541),
 ('co', 540),
 ('philadelphia', 539),
 ('london', 538),
 ('young', 537),
 ('arts', 537),
 ('etc', 537),
 ('poetry', 535),
 ('institute', 535),
 ('horse', 533),
 ('arthur', 532),
 ('japan', 531),
 ('frederick', 531),
 ('columbia', 530),
 ('20th', 529),
 ('political', 526),
 ('race', 525),
 ('minneapolis', 524),
 ('sports', 524),
 ('maryland', 523),
 ('jersey', 523),
 ('space', 522),
 ('report', 521),
 ('dr', 521),
 ('mark', 520),
 ('students', 520),
 ('deal', 516),
 ('administration', 514),
 ('soldiers', 513),
 ('jr', 512),
 ('yale', 512),
 ('road', 511),
 ('international', 511),
 ('jones', 510),
 ('independence', 509),
 ('herald', 508),
 ('1900', 507),
 ('farm', 507),
 ('colonial', 506),
 ('nevada', 504),
 ('irish', 501),
 ('jane', 501),
 ('modern', 500),
 ('newspaper', 500),
 ('dog', 500),
 ('psychology', 498),
 ('pacific', 498),
 ('jefferson', 497),
 ('salt', 496),
 ('williams', 493),
 ('va', 493),
 ('theory', 490),
 ('letters', 490),
 ('system', 490),
 ('fair', 490),
 ('saint', 489),
 ('group', 489),
 ('green', 487),
 ('land', 483),
 ('russian', 482),
 ('free', 481),
 ('cherokee', 481),
 ('abraham', 481),
 ('time', 479),
 ('field', 478),
 ('adams', 475),
 ('guide', 475),
 ('africa', 474),
 ('british', 473),
 ('sc', 470),
 ('radio', 470),
 ('bill', 470),
 ('pictures', 470),
 ('beach', 469),
 ('central', 469),
 ('domain', 468),
 ('1940', 467),
 ('andrew', 464),
 ('walter', 462),
 ('louisiana', 460),
 ('theatre', 459),
 ('one', 458),
 ('francis', 453),
 ('girl', 452),
 ('hospital', 452),
 ('springs', 452),
 ('wilson', 449),
 ('police', 449),
 ('customs', 448),
 ('ford', 448),
 ('rumsey', 448),
 ('el', 447),
 ('stephen', 447),
 ('christian', 446),
 ('brooklyn', 445),
 ('albert', 443),
 ('jack', 443),
 ('shakespeare', 443),
 ('spain', 440),
 ('workers', 439),
 ('heritage', 438),
 ('wright', 437),
 ('hotel', 436),
 ('k', 433),
 ('immigrants', 433),
 ('engineering', 432),
 ('wwii', 432),
 ('media', 432),
 ('daniel', 431),
 ('19th', 429),
 ('gallery', 428),
 ('study', 428),
 ('two', 427),
 ('photos', 427),
 ('laws', 426),
 ('march', 425),
 ('periodicals', 425),
 ('y', 421),
 ('postcard', 420),
 ('philosophy', 419),
 ('montana', 419),
 ('squadron', 419),
 ('dance', 417),
 ('industrial', 416),
 ('western', 416),
 ('salem', 416),
 ('1950', 415),
 ('jewish', 415),
 ('hitler', 415),
 ('images', 414),
 ('president', 412),
 ('tom', 411),
 ('parks', 411),
 ('economics', 409),
 ('mexican', 409),
 ('gay', 409),
 ('al', 408),
 ('football', 408),
 ('creek', 408),
 ('gordon', 406),
 ('edgar', 406),
 ('howard', 405),
 ('census', 405),
 ('allen', 405),
 ('club', 404),
 ('detroit', 403),
 ('domestic', 401),
 ('construction', 400),
 ('association', 399),
 ('reading', 399),
 ('biography', 398),
 ('congress', 398),
 ('age', 397),
 ('lawrence', 395),
 ('community', 394),
 ('forest', 393),
 ('von', 392),
 ('sex', 392),
 ('love', 391),
 ('bridge', 390),
 ('description', 389),
 ('arkansas', 389),
 ('game', 389),
 ('declaration', 388),
 ('des', 388),
 ('development', 388),
 ('historic', 387),
 ('security', 387),
 ('town', 387),
 ('soviet', 386),
 ('may', 386),
 ('margaret', 386),
 ('federal', 385),
 ('11', 385),
 ('davis', 385),
 ('greek', 384),
 ('long', 383),
 ('tribune', 383),
 ('big', 383),
 ('natural', 380),
 ('miller', 380),
 ('control', 379),
 ('news', 379),
 ('dakota', 378),
 ('1930', 373),
 ('stone', 373),
 ('hampshire', 372),
 ('europe', 370),
 ('newton', 370),
 ('anne', 370),
 ('globe', 370),
 ('paris', 369),
 ('animal', 368),
 ('navy', 367),
 ('fashion', 366),
 ('britain', 366),
 ('santa', 365),
 ('slaves', 364),
 ('diary', 363),
 ('alice', 363),
 ('film', 362),
 ('empire', 362),
 ('charlotte', 362),
 ('avenue', 362),
 ('dept', 362),
 ('policy', 362),
 ('relations', 360),
 ('constitution', 360),
 ('catholic', 360),
 ('negro', 359),
 ('columbus', 359),
 ('cat', 359),
 ('jean', 358),
 ('ireland', 358),
 ('il', 358),
 ('corps', 357),
 ('garden', 357),
 ('post', 357),
 ('photo', 357),
 ('oklahoma', 356),
 ('russia', 356),
 ('russell', 356),
 ('car', 355),
 ('physical', 354),
 ('warren', 354),
 ('mill', 353),
 ('analysis', 352),
 ('party', 352),
 ('medieval', 351),
 ('canada', 351),
 ('sea', 350),
 ('nebraska', 350),
 ('freedom', 349),
 ('idaho', 349),
 ('learning', 348),
 ('painting', 348),
 ('oil', 348),
 ('agriculture', 348),
 ('duluth', 347),
 ('augusta', 347),
 ('1960', 346),
 ('poster', 346),
 ('ship', 345),
 ('cotton', 344),
 ('papers', 342),
 ('conservation', 342),
 ('aerial', 342),
 ('video', 341),
 ('cold', 340),
 ('times', 340),
 ('directory', 340),
 ('archive', 340),
 ('railroads', 339),
 ('poe', 339),
 ('ann', 338),
 ('bay', 338),
 ('review', 337),
 ('houses', 336),
 ('ky', 335),
 ('transportation', 335),
 ('grand', 334),
 ('strike', 334),
 ('vermont', 333),
 ('jim', 333),
 ('games', 330),
 ('theodore', 330),
 ('insignia', 330),
 ('twain', 330),
 ('photograph', 330),
 ('roman', 329),
 ('advertising', 328),
 ('train', 327),
 ('middle', 326),
 ('fishing', 324),
 ('trail', 324),
 ('dead', 324),
 ('revolutionary', 322),
 ('religious', 321),
 ('point', 321),
 ('story', 320),
 ('snow', 320),
 ('wood', 319),
 ('carter', 319),
 ('rush', 319),
 ('le', 319),
 ('falls', 319),
 ('italy', 318),
 ('prohibition', 318),
 ('fish', 317),
 ('orleans', 316),
 ('harbor', 315),
 ('bird', 314),
 ('carl', 313),
 ('uss', 313),
 ('artstor', 313),
 ('walker', 313),
 ('rome', 312),
 ('propaganda', 311),
 ('resources', 311),
 ('egypt', 311),
 ('nj', 310),
 ('jews', 309),
 ('office', 309),
 ('anti', 308),
 ('teaching', 308),
 ('clothing', 307),
 ('coal', 306),
 ('reform', 306),
 ('canal', 306),
 ('berlin', 306),
 ('greece', 306),
 ('emily', 306),
 ('hawaii', 305),
 ('wells', 304),
 ('italian', 304),
 ('physics', 304),
 ('fitzgerald', 302),
 ('landscape', 302),
 ('boy', 302),
 ('division', 302),
 ('mormon', 300),
 ('atlas', 300),
 ('dp', 300),
 ('bell', 298),
 ('writing', 298),
 ('colored', 297),
 ('douglas', 297),
 ('nmnh', 296),
 ('baltimore', 295),
 ('holocaust', 295),
 ('female', 295),
 ('iron', 294),
 ('square', 294),
 ('foreign', 294),
 ('station', 293),
 ('renaissance', 293),
 ('cross', 293),
 ('data', 293),
 ('urban', 293),
 ('memorial', 293),
 ('hemingway', 292),
 ('da', 292),
 ('helen', 292),
 ('1963', 291),
 ('girls', 291),
 ('taylor', 291),
 ('program', 291),
 ('press', 290),
 ('sources', 290),
 ('crime', 289),
 ('robinson', 289),
 ('nuclear', 288),
 ('philip', 288),
 ('code', 288),
 ('moon', 287),
 ('oral', 287),
 ('marriage', 287),
 ('chemistry', 286),
 ('las', 286),
 ('board', 285),
 ('rhode', 285),
 ('x', 285),
 ('christmas', 284),
 ('marshall', 284),
 ('cod', 284),
 ('alaska', 283),
 ('mission', 283),
 ('ernest', 283),
 ('internment', 283),
 ('views', 282),
 ('use', 282),
 ('protest', 282),
 ('market', 282),
 ('brothers', 281),
 ('jazz', 281),
 ('murray', 281),
 ('anthony', 281),
 ('madison', 281),
 ('et', 280),
 ('potter', 280),
 ('internet', 280),
 ('dick', 279),
 ('studies', 279),
 ('personal', 279),
 ('dogs', 279),
 ('queen', 279),
 ('dress', 279),
 ('camps', 279),
 ('walt', 279),
 ('second', 278),
 ('1961', 278),
 ('mills', 278),
 ('court', 278),
 ('posters', 278),
 ('confederate', 277),
 ('blue', 277),
 ('republic', 276),
 ('machine', 276),
 ('austin', 276),
 ('1929', 276),
 ('obituaries', 275),
 ('del', 274),
 ('grant', 274),
 ('glass', 274),
 ('emblem', 274),
 ('savannah', 273),
 ('marketing', 273),
 ('manual', 272),
 ('brazil', 272),
 ('picture', 271),
 ('years', 271),
 ('programs', 271),
 ('baptist', 271),
 ('richmond', 271),
 ('du', 270),
 ('basketball', 269),
 ('nelson', 269),
 ('pearl', 269),
 ('mount', 269),
 ('anderson', 269),
 ('student', 268),
 ('antonio', 268),
 ('nursing', 268),
 ('automobile', 267),
 ('ct', 267),
 ('star', 267),
 ('interior', 266),
 ('electric', 266),
 ('christopher', 265),
 ('cache', 265),
 ('murder', 264),
 ('jesus', 264),
 ('stories', 263),
 ('tree', 263),
 ('1920s', 263),
 ('der', 262),
 ('model', 262),
 ('august', 262),
 ('earth', 262),
 ('com', 262),
 ('text', 261),
 ('korean', 261),
 ('economic', 260),
 ('dallas', 260),
 ('usa', 260),
 ('alfred', 260),
 ('ellis', 259),
 ('communication', 259),
 ('temple', 258),
 ('league', 258),
 ('joe', 258),
 ('isaac', 258),
 ('eagle', 257),
 ('champaign', 257),
 ('year', 256),
 ('sarah', 256),
 ('sound', 256),
 ('housing', 256),
 ('witch', 256),
 ('ww2', 255),
 ('harriet', 255),
 ('brain', 255),
 ('training', 255),
 ('civilian', 255),
 ('cleveland', 255),
 ('force', 255),
 ('statistics', 255),
 ('introduction', 255),
 ('railway', 254),
 ('nature', 254),
 ('animals', 254),
 ('delaware', 254),
 ('massacre', 253),
 ('buffalo', 253),
 ('television', 251),
 ('peace', 251),
 ('programming', 251),
 ('line', 251),
 ('birds', 251),
 ('jacob', 250),
 ('card', 250),
 ('cities', 250),
 ('3', 250),
 ('dickens', 249),
 ('mathematics', 249),
 ('1775', 249),
 ('plant', 248),
 ('search', 248),
 ('ray', 247),
 ('golden', 247),
 ('dream', 246),
 ('boys', 246),
 ('assassination', 246),
 ('insurance', 245),
 ('rice', 245),
 ('harlem', 244),
 ('ethics', 244),
 ('hamilton', 244),
 ('class', 244),
 ('morgan', 244),
 ('cuba', 243),
 ('care', 243),
 ('ice', 243),
 ('1917', 243),
 ('1860', 243),
 ('bowl', 243),
 ('mine', 243),
 ('mines', 242),
 ('families', 242),
 ('monroe', 242),
 ('nh', 242),
 ('speech', 242),
 ('duke', 242),
 ('herbert', 242),
 ('rose', 242),
 ('ut', 241),
 ('geology', 241),
 ('horses', 241),
 ('vs', 241),
 ('douglass', 240),
 ('1910', 240),
 ('prince', 240),
 ('canyon', 240),
 ('catalog', 239),
 ('liberty', 239),
 ('good', 239),
 ('god', 239),
 ('reconstruction', 238),
 ('criticism', 237),
 ('naval', 237),
 ('july', 237),
 ('test', 237),
 ('winter', 237),
 ('night', 237),
 ('cats', 237),
 ('movie', 236),
 ('womens', 236),
 ('dictionary', 236),
 ('correspondence', 236),
 ('wpa', 236),
 ('ruth', 235),
 ('morris', 235),
 ('plan', 235),
 ('primary', 234),
 ('self', 234),
 ('country', 234),
 ('albany', 234),
 ('cooper', 234),
 ('mrs', 234),
 ('lowell', 233),
 ('color', 233),
 ('manuscripts', 233),
 ('rico', 233),
 ('thompson', 233),
 ('finance', 233),
 ('1975', 233),
 ('1970', 233),
 ('http', 233),
 ('tea', 232),
 ('manhattan', 232),
 ('district', 232),
 ('sam', 232),
 ('disney', 231),
 ('pittsburgh', 231),
 ('plants', 231),
 ('puerto', 231),
 ('elementary', 231),
 ('korea', 231),
 ('cars', 230),
 ('era', 230),
 ('birth', 230),
 ('churches', 229),
 ('urbana', 228),
 ('drug', 228),
 ('lord', 227),
 ('patrick', 227),
 ('lloyd', 226),
 ('seattle', 226),
 ('planning', 226),
 ('houston', 226),
 ('heart', 226),
 ('paper', 225),
 ('energy', 225),
 ('plantation', 225),
 ('change', 224),
 ('systems', 224),
 ('susan', 224),
 ('audio', 223),
 ('fred', 223),
 ('eugene', 222),
 ('bob', 222),
 ('academy', 222),
 ('light', 222),
 ('colleges', 222),
 ('annual', 221),
 ('record', 220),
 ('committee', 220),
 ('haven', 220),
 ('moore', 219),
 ('sanborn', 219),
 ('disease', 219),
 ('bureau', 219),
 ('ships', 219),
 ('eacute', 219),
 ('bomb', 219),
 ('sugar', 218),
 ('racing', 218),
 ('motion', 218),
 ('sculpture', 218),
 ('botany', 218),
 ('cooking', 217),
 ('manuals', 217),
 ('flight', 217),
 ('cemetery', 216),
 ('commission', 216),
 ('making', 216),
 ('tn', 215),
 ('wars', 215),
 ('period', 215),
 ('kill', 215),
 ('ocean', 215),
 ('9', 215),
 ('chester', 215),
 ('cultural', 215),
 ('campbell', 215),
 ('karl', 214),
 ('three', 214),
 ('1968', 214),
 ('marie', 214),
 ('olympics', 214),
 ('dickinson', 213),
 ('methodist', 213),
 ('wall', 213),
 ('soldier', 213),
 ('money', 213),
 ('cook', 213),
 ('org', 213),
 ('montgomery', 213),
 ('store', 212),
 ('lost', 212),
 ('trials', 212),
 ('latin', 212),
 ('amendment', 211),
 ('holmes', 211),
 ('drawings', 211),
 ('costume', 211),
 ('bank', 211),
 ('funeral', 210),
 ('anatomy', 210),
 ('survey', 210),
 ('encyclopedia', 210),
 ('1890', 209),
 ('segregation', 209),
 ('maria', 209),
 ('wayne', 209),
 ('baker', 209),
 ('beverly', 209),
 ('environmental', 208),
 ('ralph', 208),
 ('denver', 208),
 ('mountains', 208),
 ('postcards', 208),
 ('nude', 208),
 ('atlantic', 208),
 ('grammar', 208),
 ('dust', 207),
 ('mitchell', 207),
 ('1850', 207),
 ('1870', 206),
 ('mo', 206),
 ('gettysburg', 206),
 ('show', 206),
 ('di', 206),
 ('mental', 206),
 ('ben', 206),
 ('poland', 205),
 ('flowers', 205),
 ('atomic', 205),
 ('einstein', 205),
 ('harold', 205),
 ('web', 205),
 ('therapy', 204),
 ('register', 204),
 ('september', 204),
 ('1964', 204),
 ('truman', 204),
 ('greenville', 204),
 ('series', 204),
 ('graham', 204),
 ('1980', 204),
 ('crisis', 204),
 ('property', 204),
 ('en', 203),
 ('tx', 203),
 ('juan', 203),
 ('gun', 203),
 ('evolution', 203),
 ('www', 202),
 ('northern', 202),
 ('1912', 202),
 ('4', 201),
 ('newport', 201),
 ('joyce', 201),
 ('printing', 201),
 ('summer', 201),
 ('prison', 201),
 ('tobacco', 201),
 ('7', 200),
 ('simon', 200),
 ('reno', 199),
 ('clinton', 199),
 ('ward', 199),
 ('services', 199),
 ...]
In [7]:
import pickle
import nltk

vap = pickle.load( open( "/media/storage/dpla-data/pickles/virginia.p", "rb" ) )

### >>> fdist = nltk.FreqDist(ch.lower() for ch in raw if ch.isalpha())
vafd = nltk.FreqDist(token.lower() for token in vap['virginia']['filtered'])
vafd.most_common()
Out[7]:
[('virginia', 166709),
 ('university', 107608),
 ('library', 64289),
 ('charlottesville', 54329),
 ('lib', 40564),
 ('image', 35858),
 ('va', 35396),
 ('holsinger', 31644),
 ('studio', 31459),
 ('collection', 30225),
 ('states', 29421),
 ('uva', 29228),
 ('collections', 26683),
 ('special', 23382),
 ('negatives', 22543),
 ('visual', 22459),
 ('history', 22162),
 ('ca', 21622),
 ('material', 21482),
 ('use', 17953),
 ('united', 17390),
 ('information', 15949),
 ('go', 15910),
 ('please', 15897),
 ('search', 15895),
 ('edu', 15894),
 ('terms', 15894),
 ('http', 15894),
 ('regions', 12101),
 ('w', 12006),
 ('name', 11945),
 ('davis', 11606),
 ('photographs', 11564),
 ('jackson', 11421),
 ('1825', 11044),
 ('present', 10993),
 ('online', 10977),
 ('1890', 10538),
 ('1938', 10459),
 ('photographic', 10386),
 ('index', 10378),
 ('plus', 10377),
 ('volume', 10376),
 ('1930', 9763),
 ('1866', 9685),
 ('rufus', 9632),
 ('glass', 9520),
 ('african', 9456),
 ('portraits', 9377),
 ('visitors', 8915),
 ('may', 8908),
 ('must', 8871),
 ('without', 8871),
 ('reproduced', 8864),
 ('rector', 8864),
 ('permission', 8863),
 ('additional', 8863),
 ('credited', 8863),
 ('plate', 8485),
 ('white', 7954),
 ('good', 7663),
 ('black', 7652),
 ('condition', 7646),
 ('works', 6794),
 ('group', 6386),
 ('people', 6351),
 ('built', 6346),
 ('single', 6266),
 ('american', 6206),
 ('1882', 5640),
 ('1947', 5617),
 ('5x7', 5607),
 ('photography', 5528),
 ('restrictions', 5518),
 ('accessing', 5518),
 ('county', 5107),
 ('music', 5090),
 ('south', 4872),
 ('function', 4839),
 ('8x10', 4259),
 ('u', 4232),
 ('1915', 3833),
 ('text', 3699),
 ('ethnic', 3422),
 ('school', 3413),
 ('americans', 3335),
 ('record', 3332),
 ('digital', 3319),
 ('dpla', 3317),
 ('print', 2968),
 ('piano', 2898),
 ('towns', 2790),
 ('cities', 2790),
 ('schools', 2741),
 ('c', 2713),
 ('notated', 2549),
 ('contact', 2338),
 ('n', 2143),
 ('children', 2103),
 ('unknown', 2090),
 ('new', 2067),
 ('conditions', 2014),
 ('english', 1922),
 ('buildings', 1877),
 ('1917', 1818),
 ('date', 1794),
 ('03', 1768),
 ('york', 1738),
 ('untitled', 1689),
 ('07', 1628),
 ('09', 1616),
 ('11', 1609),
 ('prints', 1600),
 ('04', 1572),
 ('songs', 1549),
 ('j', 1526),
 ('12', 1508),
 ('05', 1497),
 ('1916', 1486),
 ('01', 1475),
 ('1918', 1474),
 ('10', 1454),
 ('08', 1441),
 ('1914', 1438),
 ('mrs', 1426),
 ('06', 1386),
 ('popular', 1335),
 ('architecture', 1329),
 ('miss', 1269),
 ('elements', 1259),
 ('h', 1238),
 ('02', 1193),
 ('co', 1169),
 ('e', 1161),
 ('st', 1160),
 ('students', 1129),
 ('richard', 1110),
 ('spaces', 1107),
 ('1912', 1077),
 ('gender', 1040),
 ('education', 1036),
 ('emulsion', 1032),
 ('john', 1009),
 ('content', 1006),
 ('linguistic', 1004),
 ('rotunda', 965),
 ('anderson', 954),
 ('1940', 942),
 ('f', 929),
 ('ralph', 910),
 ('training', 906),
 ('sciences', 890),
 ('social', 880),
 ('film', 877),
 ('unidentified', 864),
 ('b', 860),
 ('boston', 858),
 ('g', 853),
 ('carolina', 845),
 ('d', 837),
 ('1913', 824),
 ('m', 823),
 ('l', 817),
 ('trees', 816),
 ('overall', 800),
 ('william', 789),
 ('hall', 788),
 ('institutional', 778),
 ('africa', 768),
 ('babies', 764),
 ('philadelphia', 743),
 ('subject', 733),
 ('1919', 732),
 ('r', 728),
 ('open', 709),
 ('charles', 701),
 ('football', 697),
 ('college', 681),
 ('site', 672),
 ('men', 671),
 ('portrait', 670),
 ('army', 648),
 ('george', 642),
 ('houses', 640),
 ('colleges', 638),
 ('sports', 635),
 ('damage', 634),
 ('faculty', 632),
 ('industrial', 618),
 ('hand', 616),
 ('tex', 614),
 ('de', 612),
 ('war', 610),
 ('world', 608),
 ('house', 608),
 ('institute', 566),
 ('domestic', 561),
 ('18', 543),
 ('alderman', 538),
 ('players', 533),
 ('building', 531),
 ('soldiers', 525),
 ('landscape', 524),
 ('age', 519),
 ('uniforms', 519),
 ('shrubs', 516),
 ('mr', 506),
 ('rooms', 506),
 ('fraternity', 503),
 ('right', 499),
 ('furniture', 488),
 ('form', 485),
 ('land', 479),
 ('silver', 477),
 ('16', 476),
 ('1920', 476),
 ('photo', 474),
 ('gelatin', 470),
 ('interior', 467),
 ('thomas', 466),
 ('21', 465),
 ('left', 462),
 ('north', 461),
 ('washington', 453),
 ('colored', 452),
 ('instrumental', 452),
 ('hats', 451),
 ('p', 445),
 ('lawn', 441),
 ('james', 437),
 ('type', 433),
 ('types', 430),
 ('east', 429),
 ('24', 420),
 ('henry', 417),
 ('la', 417),
 ('dr', 410),
 ('ga', 407),
 ('women', 401),
 ('26', 401),
 ('14', 397),
 ('engravings', 394),
 ('middle', 394),
 ('17', 390),
 ('along', 389),
 ('continents', 388),
 ('20', 386),
 ('high', 382),
 ('architectural', 378),
 ('15', 376),
 ('13', 376),
 ('summer', 376),
 ('1921', 374),
 ('texas', 373),
 ('elizabeth', 372),
 ('state', 371),
 ('27', 370),
 ('old', 370),
 ('vehicles', 367),
 ('events', 367),
 ('bottom', 366),
 ('west', 366),
 ('city', 365),
 ('cartographic', 365),
 ('y', 363),
 ('albemarle', 361),
 ('25', 356),
 ('22', 356),
 ('19', 355),
 ('family', 355),
 ('materials', 353),
 ('automobiles', 352),
 ('o', 351),
 ('context', 350),
 ('animals', 349),
 ('28', 347),
 ('1924', 345),
 ('corner', 344),
 ('side', 343),
 ('smith', 341),
 ('29', 340),
 ('robert', 340),
 ('30', 337),
 ('baltimore', 337),
 ('two', 334),
 ('11x14', 332),
 ('costume', 331),
 ('agricultural', 330),
 ('ala', 326),
 ('home', 325),
 ('horses', 324),
 ('landforms', 322),
 ('church', 321),
 ('voice', 319),
 ('countries', 318),
 ('gardens', 318),
 ('occupation', 315),
 ('jr', 313),
 ('french', 312),
 ('union', 310),
 ('23', 309),
 ('academy', 307),
 ('teachers', 307),
 ('1974', 307),
 ('ark', 306),
 ('ditson', 298),
 ('evening', 297),
 ('1891', 297),
 ('vocal', 296),
 ('landscapes', 295),
 ('view', 292),
 ('company', 291),
 ('mississippi', 291),
 ('negro', 285),
 ('edward', 284),
 ('oliver', 280),
 ('components', 280),
 ('girls', 279),
 ('families', 278),
 ('gowns', 278),
 ('top', 275),
 ('map', 274),
 ('1925', 272),
 ('construction', 267),
 ('baseball', 267),
 ('saint', 266),
 ('visible', 266),
 ('views', 264),
 ('caroline', 263),
 ('universities', 263),
 ('tenn', 263),
 ('color', 263),
 ('schirmer', 263),
 ('activity', 258),
 ('sq', 258),
 ('missing', 257),
 ('chestnut', 257),
 ('general', 257),
 ('georgia', 257),
 ('structural', 255),
 ('joseph', 251),
 ('relationship', 250),
 ('4', 249),
 ('bettis', 249),
 ('water', 249),
 ('mary', 245),
 ('france', 244),
 ('taylor', 242),
 ('edgefield', 241),
 ('trenton', 239),
 ('normal', 238),
 ('roads', 238),
 ('field', 237),
 ('1895', 236),
 ('class', 235),
 ('street', 233),
 ('penn', 232),
 ('arranged', 232),
 ('dwellings', 231),
 ('alabama', 231),
 ('walker', 230),
 ('preachers', 228),
 ('1972', 226),
 ('broadway', 226),
 ('story', 225),
 ('1977', 224),
 ('photogravures', 223),
 ('31', 221),
 ('lee', 221),
 ('railroads', 220),
 ('pageants', 219),
 ('hench', 218),
 ('laughlin', 217),
 ('cumberland', 217),
 ('atcheson', 217),
 ('1922', 216),
 ('1928', 215),
 ('operas', 210),
 ('brothers', 209),
 ('session', 208),
 ('clothes', 207),
 ('costumes', 207),
 ('1923', 206),
 ('arkansas', 203),
 ('room', 203),
 ('louis', 202),
 ('couples', 202),
 ('henrico', 201),
 ('jefferson', 200),
 ('frank', 199),
 ('specific', 198),
 ('waltzes', 197),
 ('farms', 196),
 ('europe', 196),
 ('districts', 196),
 ('snow', 196),
 ('voices', 195),
 ('flowers', 195),
 ('paul', 192),
 ('richmond', 190),
 ('equipment', 189),
 ('boxing', 188),
 ('farm', 187),
 ('near', 187),
 ('alexander', 187),
 ('railroad', 187),
 ('arthur', 187),
 ('1970', 186),
 ('fla', 186),
 ('finals', 185),
 ('parish', 184),
 ('paris', 183),
 ('choruses', 182),
 ('pavilion', 181),
 ('sir', 179),
 ('edwin', 179),
 ('hill', 179),
 ('pa', 179),
 ('president', 179),
 ('ruth', 177),
 ('monticello', 175),
 ('fields', 175),
 ('mixed', 174),
 ('14x17', 173),
 ('delta', 171),
 ('warner', 171),
 ('upper', 170),
 ('track', 170),
 ('tennessee', 169),
 ('team', 169),
 ('horse', 169),
 ('cabell', 168),
 ('1966', 167),
 ('porches', 166),
 ('1969', 166),
 ('1896', 166),
 ('1889', 165),
 ('one', 165),
 ('king', 161),
 ('hampton', 161),
 ('exhibits', 160),
 ('party', 160),
 ('club', 160),
 ('island', 159),
 ('helena', 159),
 ('central', 159),
 ('mass', 158),
 ('cultural', 158),
 ('worn', 158),
 ('phi', 157),
 ('hospital', 156),
 ('1858', 155),
 ('mammals', 154),
 ('35', 154),
 ('sacred', 154),
 ('composition', 152),
 ('physical', 152),
 ('aerial', 152),
 ('gloucester', 151),
 ('sweet', 151),
 ('railway', 151),
 ('center', 150),
 ('chicago', 150),
 ('boys', 150),
 ('1885', 150),
 ('1968', 150),
 ('stores', 150),
 ('engineering', 149),
 ('1819', 148),
 ('ohio', 148),
 ('plants', 148),
 ('medical', 146),
 ('love', 146),
 ('conference', 145),
 ('1870', 144),
 ('sigma', 144),
 ('business', 144),
 ('secular', 142),
 ('dance', 142),
 ('day', 141),
 ('personal', 141),
 ('man', 141),
 ('parts', 141),
 ('1911', 140),
 ('main', 140),
 ('1937', 140),
 ('chesterfield', 139),
 ('1875', 139),
 ('1976', 139),
 ('1929', 139),
 ('louisiana', 138),
 ('1830', 138),
 ('weddings', 138),
 ('exhibitions', 136),
 ('florida', 136),
 ('brown', 134),
 ('public', 134),
 ('scott', 134),
 ('techniques', 133),
 ('range', 133),
 ('mountains', 133),
 ('pageant', 133),
 ('processes', 133),
 ('head', 132),
 ('1900', 132),
 ('1863', 131),
 ('1872', 131),
 ('scenes', 131),
 ('saalfield', 131),
 ('southern', 130),
 ('rural', 130),
 ('beaufort', 130),
 ('bodies', 129),
 ('albert', 129),
 ('deteriorating', 129),
 ('1971', 129),
 ('cincinnati', 128),
 ('basketball', 128),
 ('allen', 128),
 ('activities', 128),
 ('point', 127),
 ('africans', 127),
 ('helen', 127),
 ('1905', 126),
 ('accessories', 126),
 ('religion', 125),
 ('md', 125),
 ('vocational', 124),
 ('1813', 124),
 ('london', 124),
 ('retail', 124),
 ('1975', 124),
 ('1950', 124),
 ('brides', 124),
 ('slight', 123),
 ('graded', 123),
 ('1927', 123),
 ('score', 122),
 ('1850', 121),
 ('england', 121),
 ('medium', 121),
 ('1935', 121),
 ('green', 120),
 ('wood', 120),
 ('willig', 120),
 ('superintendent', 120),
 ('1926', 119),
 ('concepts', 118),
 ('dormitory', 118),
 ('grounds', 118),
 ('1839', 118),
 ('low', 118),
 ('dogs', 117),
 ('student', 116),
 ('memorial', 116),
 ('entirely', 116),
 ('francis', 116),
 ('mountain', 116),
 ('1809', 116),
 ('clark', 115),
 ('clergy', 114),
 ('martin', 114),
 ('none', 114),
 ('kenya', 114),
 ('1856', 113),
 ('1876', 113),
 ('gordon', 113),
 ('randolph', 113),
 ('1892', 113),
 ('1964', 113),
 ('webb', 112),
 ('edgar', 112),
 ('2', 112),
 ('arr', 112),
 ('1859', 111),
 ('1848', 111),
 ('wilson', 111),
 ('broken', 111),
 ('orleans', 110),
 ('daughters', 110),
 ('1910', 110),
 ('duets', 109),
 ('du', 108),
 ('chairs', 108),
 ('1826', 108),
 ('fourth', 108),
 ('statues', 108),
 ('1901', 108),
 ('harry', 107),
 ('1967', 107),
 ('camp', 106),
 ('k', 105),
 ('exhibit', 105),
 ('walter', 105),
 ('chi', 105),
 ('statue', 105),
 ('nottoway', 104),
 ('fences', 104),
 ('coats', 103),
 ('kappa', 103),
 ('road', 103),
 ('briar', 103),
 ('1963', 103),
 ('harrison', 103),
 ('harris', 102),
 ('1909', 102),
 ('1874', 102),
 ('marion', 102),
 ('steps', 102),
 ('bowling', 102),
 ('edge', 102),
 ('franz', 101),
 ('1984', 100),
 ('v', 100),
 ('1936', 100),
 ('dormitories', 100),
 ('food', 99),
 ('walls', 99),
 ('1941', 99),
 ('1827', 99),
 ('1600', 99),
 ('1880', 99),
 ('1852', 98),
 ('veils', 98),
 ('natural', 98),
 ('1797', 97),
 ('residence', 97),
 ('1833', 97),
 ('1865', 97),
 ('chapel', 97),
 ('garden', 97),
 ('fur', 97),
 ('von', 96),
 ('station', 96),
 ('1828', 96),
 ('joining', 96),
 ('areas', 96),
 ('graduation', 95),
 ('additive', 95),
 ('1400', 95),
 ('1861', 94),
 ('garments', 94),
 ('samuel', 94),
 ('toys', 94),
 ('frederick', 94),
 ('office', 93),
 ('quartets', 92),
 ('1867', 92),
 ('geographic', 92),
 ('organ', 92),
 ('frogmore', 92),
 ('1939', 92),
 ('carter', 91),
 ('1980', 91),
 ('containers', 91),
 ('1877', 91),
 ('1871', 91),
 ('sea', 91),
 ('work', 91),
 ('national', 91),
 ('brick', 91),
 ('morris', 90),
 ('jones', 90),
 ('sonatas', 90),
 ('peters', 90),
 ('campbell', 90),
 ('academic', 90),
 ('chipped', 90),
 ('son', 89),
 ('dillard', 89),
 ('1869', 89),
 ('1986', 89),
 ('bridge', 89),
 ('marches', 89),
 ('books', 89),
 ('madison', 89),
 ('1965', 89),
 ('across', 89),
 ('beta', 88),
 ('1979', 88),
 ('showing', 88),
 ('makers', 88),
 ('nurses', 88),
 ('monroe', 88),
 ('park', 87),
 ('1860', 87),
 ('river', 87),
 ('david', 87),
 ('alfred', 87),
 ('277', 87),
 ('front', 87),
 ('mount', 87),
 ('okla', 86),
 ('liberia', 86),
 ('1987', 86),
 ('montgomery', 86),
 ('marshall', 85),
 ('board', 85),
 ('1899', 85),
 ('1945', 84),
 ('alumni', 84),
 ('ceremonies', 84),
 ('albumen', 84),
 ('meeting', 84),
 ('corinne', 84),
 ('1832', 84),
 ('des', 83),
 ('stingray', 83),
 ('1837', 83),
 ('cleveland', 83),
 ('lewis', 82),
 ('1857', 82),
 ('churches', 82),
 ('red', 82),
 ('little', 82),
 ('ave', 82),
 ('carl', 82),
 ('sumter', 82),
 ('middlesex', 82),
 ('1883', 82),
 ('1847', 82),
 ('miller', 82),
 ('1888', 81),
 ('1907', 81),
 ('lawns', 81),
 ('obscuring', 81),
 ('yards', 81),
 ('jean', 81),
 ('laboratory', 81),
 ('1849', 81),
 ('consumer', 80),
 ('1862', 80),
 ('le', 80),
 ('et', 80),
 ('art', 80),
 ('court', 80),
 ('woman', 79),
 ('machinery', 79),
 ('hands', 79),
 ('groups', 79),
 ('1897', 79),
 ('oh', 78),
 ('bowles', 78),
 ('artifacts', 78),
 ('halifax', 78),
 ('1864', 77),
 ('williams', 77),
 ('ky', 77),
 ('department', 77),
 ('pond', 76),
 ('1973', 76),
 ('stations', 76),
 ('1829', 76),
 ('1886', 76),
 ('process', 76),
 ('1308', 76),
 ('1893', 76),
 ('rosenwald', 75),
 ('german', 75),
 ('military', 75),
 ('night', 75),
 ('minor', 75),
 ('tables', 74),
 ('anne', 74),
 ('player', 74),
 ('1810', 73),
 ('serpentine', 73),
 ('country', 73),
 ('small', 73),
 ('fair', 73),
 ('society', 73),
 ('frame', 72),
 ('1978', 72),
 ('peter', 72),
 ('dining', 72),
 ('1835', 72),
 ('watercraft', 72),
 ('prince', 72),
 ('conferences', 71),
 ('1855', 71),
 ('wm', 71),
 ('christian', 71),
 ('johnson', 71),
 ('lawrenceville', 71),
 ('1842', 71),
 ('1868', 71),
 ('oak', 71),
 ('game', 71),
 ('large', 71),
 ('stadiums', 71),
 ('van', 71),
 ('navy', 71),
 ('1846', 71),
 ('1840', 71),
 ('professor', 71),
 ('islands', 70),
 ('lyon', 70),
 ('rock', 70),
 ('forest', 70),
 ('life', 70),
 ('stephen', 69),
 ('1985', 69),
 ('location', 69),
 ('ii', 69),
 ('queen', 69),
 ('fruit', 69),
 ('russell', 69),
 ('berry', 69),
 ('shelby', 69),
 ('scene', 68),
 ('41', 68),
 ('drawn', 68),
 ('1884', 68),
 ('frederic', 68),
 ('1887', 68),
 ('warren', 68),
 ('clubs', 68),
 ('held', 68),
 ('ridge', 68),
 ('visit', 67),
 ('norfolk', 67),
 ('gillingham', 67),
 ('1908', 67),
 ('1903', 67),
 ('1873', 67),
 ('adams', 67),
 ('government', 67),
 ('1807', 67),
 ('apples', 66),
 ('rogers', 66),
 ('mostly', 66),
 ('accelerator', 66),
 ('1981', 66),
 ('poe', 66),
 ('parties', 66),
 ('1841', 66),
 ('grove', 66),
 ('brunswick', 65),
 ('stone', 65),
 ('1949', 65),
 ('1982', 65),
 ('law', 65),
 ('1962', 65),
 ('july', 65),
 ('young', 65),
 ('athletes', 65),
 ('wives', 65),
 ('plant', 65),
 ('association', 65),
 ('1808', 65),
 ('windows', 65),
 ('printed', 65),
 ('blue', 65),
 ('michael', 64),
 ('crowds', 64),
 ('societies', 64),
 ('trip', 64),
 ('chesapeake', 64),
 ('1934', 64),
 ('1844', 64),
 ('carriages', 64),
 ('clarke', 64),
 ('1812', 64),
 ('c1866', 64),
 ('trio', 63),
 ('continuing', 63),
 ('oklahoma', 63),
 ('america', 63),
 ('theodore', 63),
 ('1818', 62),
 ('sons', 62),
 ('horsemanship', 62),
 ('published', 62),
 ('fayette', 62),
 ('shannon', 62),
 ('since', 62),
 ('maria', 62),
 ('cookery', 62),
 ('manassas', 62),
 ('canada', 62),
 ('1823', 62),
 ('log', 62),
 ('rivers', 62),
 ('drawing', 62),
 ('tuskegee', 62),
 ('schmidt', 61),
 ('settlements', 61),
 ('mark', 61),
 ('tape', 61),
 ('1831', 61),
 ('1836', 61),
 ('funeral', 61),
 ('bible', 61),
 ('boxer', 60),
 ('affairs', 60),
 ('charlotte', 60),
 ('451', 60),
 ('service', 60),
 ('complexes', 60),
 ('weber', 60),
 ('hancock', 60),
 ('valley', 60),
 ('1898', 60),
 ('canning', 60),
 ('science', 59),
 ('98', 59),
 ('c1889', 59),
 ('tyler', 59),
 ('1786', 59),
 ('1983', 59),
 ('underwood', 59),
 ('lower', 59),
 ('orchards', 59),
 ('moore', 59),
 ('1851', 58),
 ('benjamin', 58),
 ('shakespeare', 58),
 ('duke', 58),
 ('1988', 58),
 ('herbaceous', 58),
 ('birds', 58),
 ('1820', 58),
 ('trade', 58),
 ('sewing', 58),
 ('first', 58),
 ('roofs', 58),
 ('area', 58),
 ('alpha', 58),
 ('1853', 57),
 ('rouge', 57),
 ('woodberry', 57),
 ('biology', 57),
 ('classrooms', 57),
 ('teacher', 57),
 ('canned', 57),
 ('figures', 57),
 ('painting', 57),
 ('1834', 57),
 ('maryland', 57),
 ('cracked', 57),
 ('edwards', 57),
 ('howard', 57),
 ('crb', 56),
 ('1817', 56),
 ('picture', 56),
 ('travis', 56),
 ('evans', 56),
 ('1933', 56),
 ('1932', 56),
 ('half', 56),
 ('1894', 56),
 ('1843', 56),
 ('cross', 56),
 ('ash', 56),
 ('par', 56),
 ('earl', 55),
 ('epsilon', 55),
 ('1854', 55),
 ('baton', 55),
 ('fire', 55),
 ('theta', 55),
 ('guadalupe', 55),
 ('living', 55),
 ('robinson', 55),
 ('slightly', 55),
 ('1806', 55),
 ('clay', 55),
 ('dean', 55),
 ('show', 55),
 ('approximately', 54),
 ('greene', 54),
 ('relief', 54),
 ('graduate', 54),
 ('1779', 54),
 ('swain', 54),
 ('mayo', 54),
 ('williamsburg', 54),
 ('transparencies', 54),
 ('c1882', 53),
 ('1953', 53),
 ('c1883', 53),
 ('x', 53),
 ('real', 53),
 ('violins', 53),
 ('come', 53),
 ('hereford', 53),
 ('great', 53),
 ('5', 53),
 ('environments', 53),
 ('extending', 53),
 ('lake', 53),
 ('years', 53),
 ('gymnasium', 53),
 ('cabins', 53),
 ('thornton', 52),
 ('1815', 52),
 ('louisville', 52),
 ('1811', 52),
 ('daniel', 52),
 ('c1887', 52),
 ...]
In [ ]:
colls = ["artstor","biodiv","rumsey","commonwealth","georgia","harvard",
        "ia","getty","kentucky","minnesota","missouri","mwdl",
        "nara","nocar","smiths","socar","texas","gpo","illinois","usc","virginia","nocoll"]

import pickle

for c in colls:
    #p = pickle.load( open( "/media/storage/dpla-data/pickles/"+c+".p", "rb" ) )
    p = pickle.load( open( "C:/Users/charper/dpla-temp/pickles/"+c+".p", "rb" ) )
    print("\nGathering Stats for " + c)
    stats = p[c]['stats']
    print(stats)
    print("percent unique:")
    print(round((p[c]['stats']['uniq'] / p[c]['stats']['wc']),5), "%") 
    print("filtered percent unique:")
    print(round((p[c]['stats']['funiq'] / p[c]['stats']['fwc']),5), "%")
    print("*********")
Gathering Stats for artstor
{'fwc': 6518566, 'wc': 8466030, 'funiq': 60635, 'uniq': 60760}
percent unique:
0.00718 %
filtered percent unique:
0.0093 %
*********

Gathering Stats for biodiv
{'fwc': 7638579, 'wc': 8361216, 'funiq': 94631, 'uniq': 94755}
percent unique:
0.01133 %
filtered percent unique:
0.01239 %
*********

Gathering Stats for rumsey
{'fwc': 12562369, 'wc': 15404401, 'funiq': 47643, 'uniq': 47763}
percent unique:
0.0031 %
filtered percent unique:
0.00379 %
*********

Gathering Stats for commonwealth
In [8]:
#type(fd)
#haps = fd.hapaxes()
len(vafd.hapaxes())
Out[8]:
31738
In [9]:
longwords = {}
for k,v in vafd.items():
    if len(k) > 10: longwords[k] = v
In [25]:
longwords
Out[25]:
{'Accelerator': 65,
 'Accessories': 120,
 'Accomodates': 2,
 'Accompanying': 1,
 'Adjustments': 1,
 'Administration': 2,
 'Administrative': 13,
 'Adminstration': 1,
 'Aduertissement': 1,
 'Advertisement': 2,
 'Advertising': 5,
 'Aerodynamics': 2,
 'Aeronautical': 5,
 'Aeronautics': 37,
 'Agricultural': 303,
 'Agriculture': 13,
 'Alterations': 1,
 'Amabassador': 1,
 'Amazonenvitt': 1,
 'Amphitheater': 2,
 'Amphitheaters': 9,
 'Amphitheatre': 43,
 'Ampitheatre': 4,
 'Amstelodami': 1,
 'Anniversaries': 5,
 'Anniversary': 22,
 'Announcement': 1,
 'Antiquities': 2,
 'Apologetics': 1,
 'Apparizione': 1,
 'Appointment': 1,
 'Archdeacons': 1,
 'Architectural': 313,
 'Architecture': 1012,
 'Aristomenes': 1,
 'Arrangements': 1,
 'Assassination': 2,
 'Association': 64,
 'Associations': 7,
 'Astronomical': 1,
 'Auditoriums': 4,
 'Aufforderung': 4,
 'Automobiles': 351,
 'Automoblies': 1,
 'Badarzewska': 3,
 'Balustrades': 5,
 'Barboursville': 18,
 'Barcarolles': 2,
 'Barhamsville': 12,
 'Bartholomew': 6,
 'Battlements': 1,
 'Baylorsville': 5,
 'Beaujoyeulx': 1,
 'Beckenstein': 3,
 'Beuckenstein': 2,
 'Bibliographical': 1,
 'Bibliopolas': 1,
 'Biscaccianti': 1,
 'Bishopville': 2,
 'Bjørnstjerne': 1,
 'Blackboards': 6,
 'Blacksmiths': 1,
 'Blankenship': 2,
 'Blennerhassett': 1,
 'Blessington': 2,
 'Bloomington': 1,
 'Blumentritt': 1,
 'Bodybuilders': 41,
 'Bodybuilding': 4,
 'Bonaventure': 2,
 'Bonnycastle': 1,
 'Bookbinders': 2,
 'Bookbinding': 2,
 'Bookshelves': 2,
 'Brandeville': 1,
 'Brandstetter': 3,
 'Breckinridge': 2,
 'Bridegrooms': 24,
 'Bridgeforth': 1,
 'Bridgewater': 1,
 'Brockenbraugh': 1,
 'Brownsville': 23,
 'Brueschweiler': 1,
 'Buentivolio': 1,
 'Businessmen': 1,
 'Butterflies': 1,
 'COLLECTIONS': 1,
 'Cadwallader': 1,
 'Calcografia': 1,
 'Calculating': 1,
 'Cannonballs': 1,
 'Canzonettas': 1,
 'Cartersville': 39,
 'Cartographic': 2,
 'Castiglione': 4,
 'Catholiques': 1,
 'Cazouillement': 1,
 'Celebration': 27,
 'Celebrations': 3,
 'Certificate': 2,
 'Certificates': 2,
 'Chamberlain': 9,
 'Chamberlayne': 20,
 'Chancellors': 1,
 'Chandeliers': 2,
 'Charakterstu': 1,
 'Charlemagne': 2,
 'Charlestown': 1,
 'Charlottesviile': 1,
 'Charlottesville': 54329,
 'Chartseller': 1,
 'Chatterbrick': 1,
 'Chesterfield': 139,
 'Christening': 3,
 'Christianity': 2,
 'Christiansburg': 11,
 'Christopher': 14,
 'Churchyards': 1,
 'Clarksville': 2,
 'Classification': 1,
 'Clatterbrick': 1,
 'Clinchfield': 2,
 'Clotheslines': 1,
 'Cobblestone': 1,
 'Collections': 26682,
 'Combination': 1,
 'Commencement': 24,
 'Commencements': 14,
 'Commentaires': 1,
 'Commisioner': 1,
 'Commissioned': 2,
 'Commissioner': 1,
 'Commissioners': 3,
 'Commisssioners': 1,
 'Comparative': 1,
 'Competition': 1,
 'Confederacy': 1,
 'Confederate': 52,
 'Conferences': 71,
 'Confidentially': 1,
 'Confirmation': 1,
 'Confirmations': 1,
 'Congregation': 2,
 'Congressional': 2,
 'Conservatories': 1,
 'Considerations': 1,
 'Consolation': 2,
 'Consolidated': 14,
 'Consolidating': 1,
 'Constantine': 2,
 'Constitution': 2,
 'Constitutional': 1,
 'Construction': 114,
 'Continental': 8,
 'Continuation': 3,
 'Contreblasons': 1,
 'Convenience': 1,
 'Conversation': 1,
 'Convocation': 3,
 'Cooperation': 8,
 'Cooperative': 20,
 'Coopersmith': 1,
 'Coordinated': 1,
 'Cornachione': 1,
 'Coronations': 2,
 'Corporation': 11,
 'Corporations': 1,
 'Corporative': 1,
 'Correctional': 1,
 'Corrections': 1,
 'Correspondence': 1,
 'Corrugating': 5,
 'Cosmographie': 1,
 'Cosstaphney': 1,
 'Courlaender': 1,
 'Courthouses': 32,
 'Cowperthwait': 1,
 'Critchenberger': 2,
 'Critenbarger': 2,
 'Crutchfield': 2,
 'Dardensburg': 1,
 'Declaration': 36,
 'Declaratory': 2,
 'Decorations': 2,
 'Dedications': 1,
 'Demonstation': 1,
 'Demonstration': 32,
 'Demonstrations': 16,
 'Demonstrator': 1,
 'Demonstrators': 1,
 'Departments': 1,
 'Description': 15,
 'Destruction': 1,
 'Development': 12,
 'Diepenbeeck': 1,
 'Discoloration': 3,
 'Dispensations': 2,
 'Disposition': 1,
 'Distinguished': 10,
 'Distributed': 2,
 'Divertissement': 1,
 'Documentary': 3,
 'Dodelinette': 1,
 'Dormitories': 100,
 'Dressmakers': 2,
 'Dressmaking': 1,
 'Dunfermline': 1,
 'Educational': 15,
 'Electricity': 2,
 'Elefterious': 1,
 'Emancipation': 2,
 'Embroidering': 1,
 'Encarnacion': 1,
 'Enchantment': 1,
 'Encyclopaedia': 2,
 'Engineering': 144,
 'Entablatures': 1,
 'Entertainers': 2,
 'Entertainment': 46,
 'Environments': 44,
 'Equestrians': 3,
 'Ergenbright': 2,
 'Establishment': 3,
 'Estudiantina': 1,
 'Eternamente': 1,
 'Euangelistes': 1,
 'Eucharistie': 1,
 'Evangelical': 2,
 'Exhibitions': 135,
 'Experimental': 25,
 'Fairgrounds': 3,
 'Fantasiestu': 3,
 'Faschingsschwank': 1,
 'Fayerweather': 28,
 'Fayetteville': 19,
 'Fenchtenberger': 1,
 'Fieuberlake': 1,
 'Fiskerjenten': 1,
 'Fitzpatrick': 4,
 'Fleetstreet': 8,
 'Folkelivsbilleder': 1,
 'Fontainebleau': 2,
 'Footbridges': 4,
 'Foraarstoner': 1,
 'Fortification': 6,
 'Fortifications': 1,
 'Foundations': 8,
 'Fountainebleau': 1,
 'Fraternities': 39,
 'Fredericksburg': 23,
 'Freemasonry': 8,
 'Freiligrath': 1,
 'Friedenwald': 1,
 'Furnishings': 18,
 'Gainesville': 1,
 'Gainsborough': 1,
 'Gallicanism': 1,
 'Gendarmerie': 1,
 'Generations': 2,
 'Geographical': 2,
 'Giambattista': 1,
 'Gildersleeve': 16,
 'Gillenwater': 3,
 'Goldschmetterlinge': 1,
 'Goldschmidt': 1,
 'Gordonsville': 1,
 'Granddaughter': 1,
 'Grandfather': 3,
 'Grandfathers': 2,
 'Grandmother': 4,
 'Grandmothers': 1,
 'Grandparents': 1,
 'Grandsaigne': 1,
 'Grandstands': 14,
 'Graniteville': 1,
 'Greeleyville': 6,
 'Greenhouses': 2,
 'Greensville': 3,
 'Grigorʹevich': 1,
 'Grillparzer': 3,
 'Groundbreakings': 4,
 'Hairdressing': 4,
 'Hallettsville': 4,
 'Hammarstrand': 2,
 'Hanfstaengl': 1,
 'Harpsichord': 26,
 'Harrisonburg': 8,
 'Hattiesburg': 11,
 'Headquarters': 3,
 'Hecatongraphie': 1,
 'Hendrickson': 2,
 'Heppenheimer': 2,
 'Herrodsburg': 1,
 'Herzallerliebsten': 2,
 'Hetherington': 2,
 'Hippocrates': 2,
 'Hirondelles': 1,
 'Historiarum': 1,
 'Hodgkinsonne': 1,
 'Hollingsworth': 2,
 'Homesickness': 1,
 'Honeysuckle': 1,
 'Horsemanship': 62,
 'Hortenstein': 1,
 'Hortensteine': 1,
 'Horticultural': 3,
 'Horticulture': 7,
 'Humoresques': 2,
 'Hydrographer': 1,
 'Hydrographical': 2,
 'Identifying': 18,
 'Illustrated': 2,
 'Illustrations': 2,
 'Improvement': 11,
 'Improvisations': 1,
 'Inaguration': 1,
 'Inauguration': 17,
 'Incorporated': 2,
 'Independance': 1,
 'Independence': 32,
 'Independent': 6,
 'Indianapolis': 4,
 'Industrialization': 1,
 'Infirmaries': 3,
 'Information': 44,
 'Infrastructural': 9,
 'Infrastructure': 1,
 'Inheritance': 1,
 'Inscriptions': 1,
 'Institution': 1,
 'Institutional': 778,
 'Institutions': 1,
 'Instruction': 1,
 'Instrumental': 1,
 'Insurrection': 1,
 'Integration': 1,
 'Intelligence': 1,
 'Interallied': 1,
 'Intermediate': 1,
 'International': 2,
 'Internationally': 2,
 'Interscholastic': 1,
 'Intersection': 1,
 'Intracoastal': 1,
 'Introduktion': 1,
 'Investigation': 1,
 'Jacksonville': 17,
 'JeanesTeachers': 1,
 'Jeffersonville': 1,
 'Katzenstein': 1,
 'Kiinderscenen': 1,
 'Kindergarten': 4,
 'Kinderscenen': 1,
 'Kirkpatrick': 2,
 'Knickerbockers': 1,
 'Kompositionen': 1,
 'Laboratories': 13,
 'Lamentations': 1,
 'Landscaping': 1,
 'Lawrenceville': 71,
 'Legislative': 2,
 'Legislators': 1,
 'Legislature': 2,
 'Lescribleur': 1,
 'Letterheads': 2,
 'Liebestrank': 1,
 'Liebestraum': 1,
 'Liebeswonne': 1,
 'Lindencrone': 1,
 'Lindenmeyer': 1,
 'Lindpaintner': 3,
 'Litchtenberg': 1,
 'Lithographed': 3,
 'Lithographic': 2,
 'Lithographing': 2,
 'Lithographs': 22,
 'Lithography': 3,
 'Livingstone': 4,
 'Locomotives': 21,
 'Mademoiselle': 1,
 'Madisonville': 2,
 'Mandolinata': 1,
 'Mantelpiece': 2,
 'Manufactures': 1,
 'Manufacturing': 3,
 'Manuscripts': 1,
 'Marguerites': 1,
 'Marigliotta': 2,
 'Marketplaces': 10,
 'Marlborough': 2,
 'Marseillaise': 1,
 'Masquerades': 2,
 'Massachusetts': 12,
 'Mathematics': 1,
 'Maximillian': 7,
 'McMinnville': 2,
 'Mecklenburg': 1,
 'Mediterranei': 2,
 'Medringhaus': 2,
 'Meistersinger': 2,
 'Membranophones': 3,
 'Mendelssohn': 29,
 'Menosprecio': 1,
 'Meriwtether': 1,
 'Metalworking': 2,
 'Middelburgum': 1,
 'Middleditch': 2,
 'Miscellaneous': 3,
 'Missionaries': 3,
 'Mississippi': 291,
 'Mollenhauer': 2,
 'Mondenschein': 1,
 'Monseigneur': 2,
 'Montecastle': 2,
 'Montmorency': 1,
 'Morningside': 6,
 'Morrissette': 1,
 'Mountaineer': 1,
 'Multiplication': 1,
 'Musikalische': 1,
 'Napolitaine': 1,
 'Naturalization': 1,
 'Nebuchadnezzar': 1,
 'Neighborhood': 2,
 'Netherlands': 13,
 'Newfoundland': 1,
 'Newspictures': 3,
 'Nightengale': 1,
 'Nightingale': 3,
 'Nonprescription': 1,
 'Nonprojected': 3,
 'Northampton': 24,
 'Northumberland': 1,
 'Northwestern': 1,
 'Nouuellement': 3,
 'Nullification': 1,
 'Observatories': 10,
 'Observatory': 17,
 'Occidentales': 1,
 'Occidentalioribus': 1,
 'Offertories': 5,
 'Opernthemas': 1,
 'Oppugnation': 1,
 'Organisations': 1,
 'Organizations': 4,
 'Orientation': 9,
 'Orthography': 4,
 'Outbuilding': 1,
 'Outbuildings': 19,
 'Outerbridge': 9,
 'Outstanding': 1,
 'Overexposed': 2,
 'Oxfordshire': 21,
 'Pagenstecker': 1,
 'Pantagrueline': 2,
 'Paraphrases': 1,
 'Participation': 1,
 'Pedestrians': 3,
 'Penitential': 2,
 'Pennsylvania': 35,
 'Pennyslvania': 1,
 'Pensilvania': 1,
 'Performances': 12,
 'Periodicals': 13,
 'Peterborough': 1,
 'Philadelphia': 743,
 'Phildelphia': 1,
 'Phillibrown': 16,
 'Photocopies': 5,
 'Photographed': 1,
 'Photographers': 40,
 'Photographic': 10,
 'Photographs': 5524,
 'Photography': 5521,
 'Photogravure': 1,
 'Photogravures': 223,
 'Piccolomini': 1,
 'Plaetsnyder': 1,
 'Plantations': 4,
 'Playgrounds': 17,
 'Polytechnical': 2,
 'Poniatowski': 1,
 'Poplarville': 8,
 'Posselwhite': 3,
 'Preliminary': 3,
 'Presbyteriain': 1,
 'Presbyterian': 3,
 'Presbyterians': 1,
 'Presentation': 3,
 'Presentazione': 1,
 'Presidential': 16,
 'Printseller': 9,
 'Printsellers': 1,
 'Privateering': 1,
 'Proceedings': 1,
 'Processions': 28,
 'Prohibition': 4,
 'Propagation': 1,
 'Proprietors': 1,
 'Protectionism': 1,
 'Protestantism': 1,
 'Protestation': 1,
 'Psychotropic': 1,
 'Publications': 5,
 'Punchinello': 1,
 'Punctuation': 1,
 'Quesenberry': 1,
 'RailroadCompany': 1,
 'Rappahannock': 7,
 'Ravenscroft': 1,
 'Recollections': 1,
 'Reconciliation': 1,
 'Reconstruction': 3,
 'Recreational': 10,
 'Reflections': 1,
 'Reformation': 1,
 'Reformatories': 4,
 'Reformatory': 8,
 'Refreshments': 1,
 'Refridgerators': 1,
 'Refrigerator': 1,
 'Refrigerators': 3,
 'Registration': 19,
 'Regulations': 1,
 'Rehabilitation': 2,
 'Remembrance': 1,
 'Reminiscences': 2,
 'Remonstrance': 4,
 'Rentiesville': 2,
 'Representations': 3,
 'Residential': 1,
 'Restaurants': 14,
 'Restoration': 5,
 'Resurrection': 2,
 'Revolutionary': 2,
 'Rheinberger': 3,
 'Ritournelle': 1,
 'Rittenhouse': 3,
 'Riverfronts': 2,
 'Rockefeller': 2,
 'Rondolettos': 1,
 'Rorzwaukaski': 2,
 'Rosenbecker': 1,
 'Rudersdorff': 2,
 'Saltonstall': 2,
 'Sandersville': 24,
 'Satterfield': 1,
 'Satterthwaite': 3,
 'Saunnaituis': 2,
 'Scarborough': 1,
 'Scharfenberg': 13,
 'Schleiffarth': 1,
 'Schlepegrell': 1,
 'Schlesinger': 1,
 'Schnatterly': 1,
 'Schoolbuildings': 1,
 'Schoolhouse': 15,
 'Schottisches': 17,
 'Schottishce': 1,
 'Schwalbenbotschaft': 1,
 'Schwanengesang': 10,
 'Searchlights': 1,
 'Sebastianum': 1,
 'Segregation': 18,
 'Seneviratne': 1,
 'Sesquicentennial': 7,
 'Settlements': 45,
 'Seventeenth': 4,
 'Shackelford': 12,
 'Shackleford': 2,
 'Shaftesbury': 2,
 'Shakespeare': 58,
 'Shakespearean': 2,
 'Shepherdstown': 3,
 'Sherrington': 2,
 'Shipbuilding': 1,
 'Shoalwalter': 1,
 'Silhouettes': 11,
 'Smokestacks': 1,
 'Somersville': 2,
 'Sommernachtstraum': 2,
 'Southampton': 10,
 'Southwestern': 3,
 'Spielmannslieder': 1,
 'Spotslyvania': 1,
 'Spotsylvania': 9,
 'Spottsylvania': 1,
 'Springfield': 6,
 'Stadtkirche': 1,
 'Stalactites': 3,
 'Stallknecht': 2,
 'Stepmothers': 1,
 'Stereoscope': 1,
 'Stereoscopic': 2,
 'Stockbridge': 2,
 'Storefronts': 3,
 'Storytellers': 1,
 'Streetscapes': 3,
 'Stringfellow': 8,
 'Stringfield': 1,
 'Superintendent': 120,
 'Superintendents': 12,
 'Supervising': 2,
 'Supervisors': 1,
 'Supterintendent': 1,
 'Surrounding': 2,
 'Switzerland': 13,
 'Tablecloths': 2,
 'Tallahassee': 13,
 'Tanzmomente': 1,
 'Tanzskizzen': 1,
 'Tappahannock': 1,
 'Tarantellas': 3,
 'Tchaikovsky': 2,
 'Terraqueous': 1,
 'Territorial': 2,
 'Teschemacher': 1,
 'Testimonial': 1,
 'Thanksgiving': 1,
 'Theological': 21,
 'Therapentics': 1,
 'Therapeutics': 2,
 'Theyendanagea': 1,
 'Thomasville': 1,
 'Thunderbolt': 5,
 'Tillinghast': 1,
 'Timmonsville': 1,
 'Topographic': 1,
 'Topographical': 2,
 'Transfusion': 1,
 'Transilvania': 1,
 'Translating': 1,
 'Transparencies': 23,
 'Transportation': 35,
 'Transvestism': 28,
 'Triosonatas': 1,
 'Typographos': 1,
 'Ueberwasser': 1,
 'Unaccompanied': 10,
 'Undentified': 1,
 'Undergraduate': 8,
 'Undergraduates': 8,
 'Underground': 12,
 'Unidentfied': 1,
 'Unidentifed': 9,
 'Unidentified': 718,
 'Universitat': 1,
 'Universities': 262,
 'Universitut': 1,
 'Universtity': 1,
 'Unsatisfactory': 2,
 'Unterhaltung': 1,
 'Variationen': 1,
 'Velimirovic': 2,
 'Versification': 3,
 'Villefranche': 1,
 'Violoncello': 2,
 'VirginiaFraternity': 1,
 'Waccowochie': 1,
 'Wagenknight': 3,
 'Wallerstein': 3,
 'Washingtons': 1,
 'Watercolors': 3,
 'Waterfronts': 3,
 'Watermelons': 30,
 'Weatherford': 2,
 'Weissenborn': 1,
 'Wertenbacker': 1,
 'Wertenbaker': 12,
 'Westerville': 1,
 'Westmoreland': 4,
 'Wheelbarrows': 5,
 'Wheelchairs': 1,
 'Wheelwright': 1,
 'Wheelwrights': 1,
 'Whitechurch': 1,
 'Wiggleworth': 2,
 'Williamsburg': 54,
 'Williamsport': 2,
 'Winsborough': 1,
 'Winterhalter': 1,
 'Wollenhaupt': 8,
 'Woodworking': 13,
 'Worthington': 4,
 'Wrigglesworth': 2,
 'accelerator': 1,
 'accessories': 6,
 'accommodating': 2,
 'accommodations': 8,
 'administration': 27,
 'administrators': 2,
 'adolescence': 1,
 'advertisement': 2,
 'advertising': 2,
 'agricultural': 27,
 'agriculture': 5,
 'anniversary': 7,
 'appassionata': 1,
 'application': 1,
 'appreciation': 1,
 'approximately': 54,
 'archeuesque': 2,
 'architectural': 65,
 'architecture': 317,
 'artistiques': 1,
 'association': 1,
 'associations': 3,
 'authorities': 2,
 'automobiles': 1,
 'automoblile': 1,
 'battlefield': 1,
 'beautifying': 3,
 'benediction': 1,
 'bottomlands': 1,
 'caertmaecker': 1,
 'capriccioso': 3,
 'cartographic': 363,
 'celebrating': 1,
 'celebration': 1,
 'celebrations': 9,
 'chrestienne': 1,
 'clarissimum': 1,
 'combination': 1,
 'communications': 1,
 'composition': 152,
 'comprehension': 1,
 'confirmirte': 1,
 'congregation': 1,
 'connections': 6,
 'consequences': 1,
 'conservatory': 1,
 'consolation': 1,
 'consolidated': 3,
 'consolidation': 1,
 'constructed': 1,
 'construction': 153,
 'convocation': 2,
 'cornerstone': 1,
 'cultivation': 1,
 'damoyselles': 1,
 'declaration': 3,
 'declination': 1,
 'dedications': 1,
 'deliberately': 1,
 'demonstrates': 1,
 'demonstration': 19,
 'demonstrator': 1,
 'demonstrators': 1,
 'departments': 1,
 'deportemens': 1,
 'description': 4,
 'destruction': 1,
 'detereriorating': 5,
 'deterioraing': 1,
 'deteriorate': 1,
 'deteriorating': 129,
 'deterioration': 2,
 'development': 21,
 'developments': 1,
 'dignitaries': 1,
 'dilapidated': 1,
 'disciplines': 10,
 'discoloration': 22,
 'discolorations': 1,
 'discoveries': 4,
 'disposition': 1,
 'distinguished': 1,
 'diversified': 7,
 'educational': 1,
 'eliminating': 1,
 'engineering': 5,
 'enseignemens': 1,
 'enterprises': 3,
 'entertainers': 4,
 'entertainment': 2,
 'environment': 1,
 'environments': 9,
 'eradicating': 1,
 'exhibitions': 1,
 'exploration': 3,
 'expositions': 3,
 'facinoribus': 1,
 'furnishings': 2,
 'generations': 1,
 'geographischen': 1,
 'governments': 1,
 'granddaughter': 1,
 'groundbreaking': 20,
 'handwriting': 2,
 'harpsichord': 3,
 'headquarters': 1,
 'hecatodistichon': 2,
 'hlingsglaube': 1,
 'horizontally': 2,
 'hydrographica': 2,
 'identifiable': 6,
 'identifying': 1,
 'illustrissime': 1,
 'importables': 1,
 'improvement': 9,
 'improvements': 6,
 'inauguration': 1,
 'independant': 1,
 'information': 15905,
 'inhabitants': 1,
 'inscription': 1,
 'installation': 1,
 'institution': 2,
 'institutions': 2,
 'instruction': 2,
 'instructives': 1,
 'instrumental': 451,
 'instruments': 25,
 'insurrections': 1,
 'intellectual': 1,
 'interdisciplinary': 2,
 'interesting': 1,
 'interjacentiumq': 1,
 'interpretation': 1,
 'interpreting': 1,
 'introductio': 1,
 'irregularly': 1,
 'irresesitible': 1,
 'irresistable': 1,
 'lVniuersite': 1,
 'laboratories': 1,
 'laboratoryatue': 1,
 'legislation': 2,
 'legislature': 1,
 'linstitution': 1,
 'lithographer': 1,
 'lithographs': 1,
 'locomotives': 1,
 'magnificence': 1,
 'magnificque': 2,
 'manufacture': 1,
 'manufacturer': 1,
 'manufacturing': 1,
 'mathematics': 3,
 'membranophones': 1,
 'mercimoniis': 1,
 'merveilleux': 1,
 'mousquetaire': 1,
 'napolitaine': 1,
 'naturalization': 1,
 'neighborhood': 3,
 'nightingale': 1,
 'nouuellement': 1,
 'nstlerleben': 1,
 'observations': 1,
 'obstruction': 5,
 'opportunity': 1,
 'organization': 2,
 'orientalioribus': 1,
 'orientation': 2,
 'oscilloscope': 1,
 'overagainst': 1,
 'overlooking': 1,
 'participate': 1,
 'particularly': 1,
 'performances': 1,
 'periodicals': 1,
 'photographic': 10376,
 'photographs': 6040,
 'photography': 7,
 'picturesque': 1,
 'plantations': 2,
 'practically': 1,
 'predominantly': 1,
 'preparation': 2,
 'presentation': 5,
 'principales': 1,
 'printmaking': 1,
 'productions': 32,
 'prognostication': 2,
 'progressives': 1,
 'promptuaire': 1,
 'promulgation': 1,
 'prononciation': 1,
 'provinciarum': 3,
 'publication': 1,
 'quarterback': 5,
 'recognizable': 1,
 'reconstruction': 7,
 'recreations': 1,
 'refreshments': 1,
 'regulations': 1,
 'relationship': 250,
 'remembering': 1,
 'representatives': 2,
 'reproduction': 1,
 'reproductions': 48,
 'resiouissance': 2,
 'respectibus': 1,
 'resplendent': 1,
 'restrictions': 5518,
 'resurrection': 1,
 'reverendissime': 1,
 'sacrificateur': 1,
 'sanctissima': 1,
 'satisfactory': 3,
 'schoolbuildings': 29,
 'schottische': 3,
 'sentimentale': 1,
 'septemtrionale': 1,
 'septentrionale': 1,
 'settlements': 16,
 'shareholder': 1,
 'shopkeepers': 1,
 'siciliennes': 1,
 'sioujssances': 2,
 'slenderness': 1,
 'spectabilis': 1,
 'spirituelles': 1,
 'stalagmites': 3,
 'stockholders': 1,
 'strengthened': 1,
 'subdivision': 1,
 'superintendence': 1,
 'supervising': 2,
 'supervisors': 1,
 'supplementary': 1,
 'surrounding': 5,
 'sweethearts': 2,
 'temperaments': 1,
 'temptations': 1,
 'territories': 3,
 'thanksgiving': 1,
 'theological': 1,
 'topographical': 1,
 'traditional': 1,
 'transgressions': 2,
 'transparencies': 31,
 'transportation': 3,
 'transported': 2,
 'transporting': 4,
 'treschrestien': 5,
 'tresexcellente': 1,
 'tresillustre': 1,
 'trespuissant': 1,
 'triangulation': 1,
 'triumphante': 3,
 'ultracentrifuge': 2,
 'unclassified': 1,
 'undentified': 3,
 'underground': 1,
 'undernourished': 1,
 'unidentifed': 4,
 'unidentifiable': 1,
 'unidentified': 146,
 'universities': 1,
 'unprotected': 1,
 'unrecognizable': 1,
 'ventilating': 1,
 'vingthuictieme': 1,
 'violoncello': 30,
 'violoncellos': 1,
 'waterfronts': 2,
 'watermelons': 7,
 'weatherboarded': 1,
 'whitewashing': 1}
In [ ]:
from nltk.collocations import *
finder = BigramCollocationFinder.from_words(filtered['ia']['filtered'])
bigram_measures = nltk.collocations.BigramAssocMeasures()
scored = finder.score_ngrams(bigram_measures.raw_freq)
sorted(bigram for bigram, score in scored)  
In [ ]:
#This thing here just hangs forevs. I wonder if it's possible to do it without the notebook?
from nltk.collocations import *
bigram_measures = nltk.collocations.BigramAssocMeasures()
trigram_measures = nltk.collocations.TrigramAssocMeasures()
finder = BigramCollocationFinder.from_words(filteredgpo)
finder.nbest(bigram_measures.pmi, 10)
In [11]:
import pickle
import nltk

colls = ["biodiv","rumsey","commonwealth","georgia","harvard",
        "ia","getty","kentucky","minnesota","missouri","mwdl",
        "nara","nocar","smiths","socar","texas","gpo","illinois",
         "usc","virginia","nocoll"]

#colls = ["biodiv"]

fd = pickle.load( open( "/media/storage/dpla-data/pickles/new/artstor_fd.p", "rb" ) )
fds = {}
fds['artstor'] = fd
print(len(fd))
for coll in colls:
    tmp = pickle.load( open( "/media/storage/dpla-data/pickles/new/"+coll+"_fd.p", "rb" ) )
    print("updating FD with " + coll)
    fds[coll] = tmp
    fd.update(tmp)
    print(len(fd))
60168
updating FD with biodiv
131704
updating FD with rumsey
153719
updating FD with commonwealth
323186
updating FD with georgia
428524
updating FD with harvard
444294
updating FD with ia
853180
updating FD with getty
874037
updating FD with kentucky
879550
updating FD with minnesota
892810
updating FD with missouri
965647
updating FD with mwdl
1589542
updating FD with nara
2556336
updating FD with nocar
2689928
updating FD with smiths
2948721
updating FD with socar
2960280
updating FD with texas
3709827
updating FD with gpo
4056106
updating FD with illinois
4066429
updating FD with usc
4167642
updating FD with virginia
4193316
updating FD with nocoll
4193393
In [12]:
fd.most_common()
Out[12]:
[('library', 6117895),
 ('university', 4474393),
 ('digital', 4393278),
 ('county', 4101874),
 ('archives', 3912296),
 ('image', 3856146),
 ('http', 3622052),
 ('utah', 3508437),
 ('texas', 3088382),
 ('states', 3034554),
 ('united', 2980122),
 ('records', 2940508),
 ('collection', 2933041),
 ('edu', 2869869),
 ('text', 2743094),
 ('u', 2735080),
 ('1', 2651930),
 ('national', 2627569),
 ('use', 2369332),
 ('c', 2323676),
 ('state', 2222489),
 ('office', 2198024),
 ('libraries', 2112146),
 ('north', 2089362),
 ('carolina', 2021982),
 ('georgia', 2013630),
 ('ark', 1940101),
 ('california', 1810572),
 ('images', 1793602),
 ('unt', 1793154),
 ('history', 1785922),
 ('public', 1720702),
 ('department', 1687369),
 ('english', 1663863),
 ('southern', 1559202),
 ('death', 1533958),
 ('d', 1516996),
 ('copyright', 1463445),
 ('67531', 1454858),
 ('photographs', 1451551),
 ('texashistory', 1432924),
 ('information', 1398449),
 ('washington', 1308460),
 ('x', 1220153),
 ('smithsonian', 1199436),
 ('available', 1197530),
 ('research', 1195215),
 ('center', 1163841),
 ('government', 1150791),
 ('institution', 1147249),
 ('collections', 1140774),
 ('content', 1116803),
 ('newspapers', 1106143),
 ('west', 1094535),
 ('ga', 1093339),
 ('ut', 1090170),
 ('administration', 1068841),
 ('college', 1065094),
 ('los', 1059295),
 ('angeles', 1047333),
 ('photograph', 1045279),
 ('ca', 1025436),
 ('local', 1007516),
 ('census', 1003194),
 ('terms', 996437),
 ('service', 988342),
 ('portal', 967273),
 ('american', 961283),
 ('mountain', 960732),
 ('may', 940055),
 ('made', 934848),
 ('statistics', 914170),
 ('special', 886509),
 ('defense', 868309),
 ('disk', 866632),
 ('ed', 862066),
 ('p', 856817),
 ('vital', 853095),
 ('agency', 852556),
 ('2', 845176),
 ('5', 841400),
 ('archive', 839173),
 ('gordon', 826933),
 ('usc', 782346),
 ('museum', 775978),
 ('see', 773608),
 ('historical', 772560),
 ('dept', 764440),
 ('j', 761928),
 ('n', 757319),
 ('magnetic', 747803),
 ('title', 747283),
 ('tex', 734878),
 ('america', 722533),
 ('photographic', 721011),
 ('part', 719264),
 ('including', 718627),
 ('1905', 715122),
 ('3', 713008),
 ('unrestricted', 706048),
 ('contact', 677717),
 ('w', 676065),
 ('md', 668947),
 ('study', 664102),
 ('park', 663796),
 ('nmnh', 649509),
 ('maps', 647804),
 ('b', 643911),
 ('cgi', 641422),
 ('10', 638652),
 ('south', 635438),
 ('secretary', 634762),
 ('1958', 631306),
 ('4', 625355),
 ('newspaper', 620590),
 ('please', 616244),
 ('map', 613643),
 ('01', 606953),
 ('include', 605423),
 ('1939', 605294),
 ('permission', 602815),
 ('new', 602509),
 ('anthropology', 597459),
 ('project', 587240),
 ('private', 587021),
 ('created', 583921),
 ('org', 582788),
 ('2008', 579928),
 ('8', 579199),
 ('lib', 571698),
 ('certificates', 567808),
 ('rights', 566483),
 ('certificate', 560875),
 ('health', 560430),
 ('contents', 559924),
 ('e', 557717),
 ('heritage', 550587),
 ('cm', 549198),
 ('atlanta', 547042),
 ('teaching', 544659),
 ('botany', 542307),
 ('7', 541722),
 ('full', 537197),
 ('general', 523510),
 ('sound', 516121),
 ('holding', 515365),
 ('video', 507502),
 ('11', 504369),
 ('society', 501581),
 ('12', 498668),
 ('calhoun', 494892),
 ('gpo', 492130),
 ('2007', 489677),
 ('17', 484438),
 ('recordings', 484166),
 ('mission', 482853),
 ('publicly', 481311),
 ('partners', 480262),
 ('thumbnail', 477850),
 ('itemurl', 477639),
 ('thumbnailurl', 477639),
 ('advertising', 475246),
 ('art', 473332),
 ('identifier', 473008),
 ('pictures', 472344),
 ('domain', 471126),
 ('military', 469395),
 ('l', 467158),
 ('h', 466419),
 ('assigned', 465535),
 ('specol', 465295),
 ('still', 463899),
 ('material', 462158),
 ('memorial', 461098),
 ('city', 455185),
 ('us', 444041),
 ('boston', 440872),
 ('war', 438395),
 ('includes', 433659),
 ('resource', 432396),
 ('number', 432306),
 ('213', 415973),
 ('news', 411263),
 ('arizona', 409384),
 ('internet', 408349),
 ('21', 406888),
 ('law', 406216),
 ('m', 401417),
 ('de', 400958),
 ('bureau', 392174),
 ('print', 385810),
 ('john', 385392),
 ('laws', 385150),
 ('22', 381806),
 ('www', 379402),
 ('white', 378232),
 ('business', 378114),
 ('90089', 377826),
 ('20', 376706),
 ('13', 376208),
 ('massachusetts', 375639),
 ('files', 374845),
 ('daily', 372969),
 ('09', 372792),
 ('january', 371526),
 ('1921', 368779),
 ('item', 368361),
 ('wpa', 368349),
 ('03', 366716),
 ('century', 365267),
 ('doheny', 364968),
 ('drive', 364385),
 ('0189', 363865),
 ('household', 363696),
 ('commercial', 363215),
 ('international', 361430),
 ('02', 361132),
 ('division', 359732),
 ('kentucky', 358883),
 ('15', 357764),
 ('school', 356600),
 ('inc', 355061),
 ('printing', 353855),
 ('gov', 352039),
 ('16', 348978),
 ('hard', 348575),
 ('lake', 344570),
 ('6', 343146),
 ('documents', 339507),
 ('06', 335715),
 ('07', 335134),
 ('14', 334743),
 ('file', 334434),
 ('page', 332226),
 ('05', 331733),
 ('9', 329820),
 ('places', 328713),
 ('04', 328619),
 ('institute', 328372),
 ('1982', 325214),
 ('1994', 325061),
 ('f', 324959),
 ('air', 324400),
 ('oclc', 322550),
 ('purposes', 322339),
 ('vol', 321707),
 ('protection', 321439),
 ('getty', 320777),
 ('bin', 320732),
 ('works', 320621),
 ('hutzel', 315352),
 ('died', 315245),
 ('house', 314953),
 ('forces', 314721),
 ('g', 314565),
 ('r', 314401),
 ('visual', 312926),
 ('18', 311659),
 ('black', 310081),
 ('24', 309774),
 ('agreement', 308813),
 ('23', 307921),
 ('one', 307763),
 ('2006', 304967),
 ('company', 303990),
 ('retained', 303438),
 ('services', 302549),
 ('education', 301797),
 ('intellectual', 301731),
 ('photo', 301328),
 ('hill', 301178),
 ('30', 300117),
 ('35', 299784),
 ('cont', 297790),
 ('reserve', 297783),
 ('2014', 297528),
 ('cards', 295644),
 ('pages', 294533),
 ('st', 293662),
 ('exceptions', 293184),
 ('25', 292434),
 ('view', 290866),
 ('housing', 289670),
 ('finance', 287345),
 ('improve', 285861),
 ('combined', 283872),
 ('proof', 283816),
 ('access', 283798),
 ('san', 282709),
 ('district', 282093),
 ('development', 281951),
 ('08', 281868),
 ('fill', 280758),
 ('street', 280641),
 ('certification', 279874),
 ('1960', 279644),
 ('gaps', 278142),
 ('management', 277529),
 ('familysearch', 277402),
 ('keypath', 277397),
 ('indexesresults', 277397),
 ('runwhat', 277397),
 ('idxfiles', 277397),
 ('v', 277391),
 ('william', 275976),
 ('2011', 275061),
 ('economics', 271972),
 ('geography', 271143),
 ('online', 268800),
 ('forest', 268767),
 ('2013', 267752),
 ('materials', 266942),
 ('communications', 266869),
 ('book', 266353),
 ('negative', 264987),
 ('19', 264908),
 ('salt', 264653),
 ('plants', 264340),
 ('28', 263042),
 ('dpla', 262957),
 ('color', 262938),
 ('along', 262712),
 ('environmental', 259654),
 ('jones', 259337),
 ('1940', 258904),
 ('obituaries', 258478),
 ('2000', 256307),
 ('mm', 254211),
 ('type', 254072),
 ('virginia', 253372),
 ('italy', 252068),
 ('index', 251253),
 ('athens', 250838),
 ('program', 249468),
 ('document', 249230),
 ('must', 248981),
 ('2003', 246499),
 ('minnesota', 246439),
 ('abilene', 245618),
 ('26', 244854),
 ('29', 243883),
 ('register', 242169),
 ('young', 241482),
 ('early', 239921),
 ('fax', 239667),
 ('educational', 239320),
 ('chapel', 238765),
 ('co', 238304),
 ('publications', 238114),
 ('system', 236973),
 ('periodicals', 235930),
 ('plantae', 235889),
 ('george', 234770),
 ('publication', 233713),
 ('association', 233414),
 ('2001', 232500),
 ('york', 232489),
 ('html', 230373),
 ('max', 230254),
 ('two', 229834),
 ('historic', 229645),
 ('phone', 228783),
 ('ocolc', 226412),
 ('world', 225206),
 ('board', 225179),
 ('david', 225159),
 ('zoology', 224905),
 ('0', 224735),
 ('digitized', 224072),
 ('27', 223035),
 ('leslie', 222591),
 ('prints', 222392),
 ('journalism', 221812),
 ('survey', 221666),
 ('events', 221545),
 ('required', 220720),
 ('published', 220199),
 ('pictorial', 219600),
 ('o', 219307),
 ('report', 218246),
 ('federal', 217026),
 ('officials', 216327),
 ('activities', 215985),
 ('site', 212735),
 ('lccn', 212473),
 ('people', 212459),
 ('2005', 212395),
 ('negatives', 212081),
 ('non', 211659),
 ('1900', 210990),
 ('life', 210606),
 ('1990', 210331),
 ('usa', 210289),
 ('multimedia', 209791),
 ('740', 209446),
 ('reserved', 209199),
 ('2010', 208823),
 ('microfilm', 208575),
 ('also', 207242),
 ('flowering', 206963),
 ('subject', 206775),
 ('1920', 206671),
 ('contentdm', 206566),
 ('dallas', 206113),
 ('ferns', 204716),
 ('areas', 203520),
 ('821', 203110),
 ('relating', 202918),
 ('2343', 201969),
 ('2366', 201736),
 ('1950', 200446),
 ('atlas', 199019),
 ('medical', 198449),
 ('31', 197786),
 ('cultural', 197434),
 ('section', 197143),
 ('without', 196840),
 ('surveys', 196775),
 ('portraits', 196242),
 ('louis', 196128),
 ('nc', 195737),
 ('series', 195587),
 ('ill', 195430),
 ('church', 195243),
 ('jan', 194458),
 ('architecture', 193697),
 ('paper', 192058),
 ('james', 191054),
 ('portrait', 190915),
 ('net', 188361),
 ('physical', 187254),
 ('aug', 187237),
 ('first', 187147),
 ('river', 186862),
 ('code', 186740),
 ('agriculture', 186244),
 ('electronic', 186066),
 ('attorney', 185874),
 ('building', 184663),
 ('block', 184394),
 ('courtesy', 184024),
 ('commerce', 184016),
 ('1860', 182868),
 ('animalia', 182830),
 ('handle', 182051),
 ('resources', 181443),
 ('relation', 181137),
 ('protected', 180416),
 ('civil', 180409),
 ('design', 180214),
 ('women', 179205),
 ('dicotyledonae', 179177),
 ('nevada', 179151),
 ('interior', 179110),
 ('social', 178571),
 ('enumeration', 178186),
 ('willard', 176096),
 ('weekly', 175513),
 ('family', 175324),
 ('buildings', 175169),
 ('space', 175042),
 ('paul', 174826),
 ('water', 174802),
 ('box', 174733),
 ('2004', 174469),
 ('men', 174309),
 ('card', 173753),
 ('hdl', 173582),
 ('work', 172733),
 ('region', 172700),
 ('personal', 172563),
 ('accession', 171192),
 ('marriott', 171158),
 ('edward', 170423),
 ('photography', 169921),
 ('ethnology', 168295),
 ('86', 167746),
 ('help', 166729),
 ('1947', 166686),
 ('rumsey', 166593),
 ('provided', 166261),
 ('invertebrate', 165889),
 ('charles', 165838),
 ('10020', 165827),
 ('jun', 165621),
 ('descriptions', 165543),
 ('indians', 165270),
 ('regulations', 165133),
 ('oct', 164313),
 ('105', 163809),
 ('2002', 162990),
 ('fort', 162660),
 ('language', 162111),
 ('brigham', 161673),
 ('student', 161631),
 ('group', 161578),
 ('scholarship', 161302),
 ('missouri', 160794),
 ('repository', 160549),
 ('ink', 159649),
 ('granted', 159447),
 ('employee', 159238),
 ('1945', 158703),
 ('go', 157813),
 ('right', 157809),
 ('photographer', 157783),
 ('press', 157356),
 ('1910', 156652),
 ('basel', 156514),
 ('written', 156014),
 ('governed', 155957),
 ('received', 155705),
 ('htm', 155475),
 ('pursuant', 154863),
 ('jul', 154733),
 ('volumes', 154694),
 ('clipping', 154343),
 ('left', 154335),
 ('unknown', 154220),
 ('thomas', 154133),
 ('field', 153549),
 ('2009', 153404),
 ('glass', 153129),
 ('sep', 153080),
 ('medieval', 152980),
 ('idaho', 152927),
 ('1930', 152870),
 ('risk', 152495),
 ('1911', 152192),
 ('arte', 151898),
 ('entomology', 151555),
 ('etc', 150509),
 ('2012', 150425),
 ('antonio', 150363),
 ('id', 149367),
 ('used', 149299),
 ('distribution', 149213),
 ('indian', 149011),
 ('public_domain_copyright_notice', 148603),
 ('pdf', 148440),
 ('dwellings', 148387),
 ('foto', 147761),
 ('minore', 147644),
 ('jpg', 146458),
 ('34', 146391),
 ('customs', 146179),
 ('freely', 146062),
 ('gallery', 145645),
 ('la', 145560),
 ('brothers', 144934),
 ('hosted', 144340),
 ('area', 143776),
 ('solely', 142953),
 ('front', 142895),
 ('students', 141998),
 ('papers', 141136),
 ('date', 140889),
 ('byu', 140345),
 ('displayed', 140175),
 ('mexico', 140084),
 ('cornell', 140057),
 ('committee', 140028),
 ('subsequent', 139334),
 ('german', 139105),
 ('1865', 138987),
 ('1980', 138962),
 ('property', 138813),
 ('biodiversity', 138775),
 ('bhl', 138362),
 ('four', 137459),
 ('118', 136952),
 ('natural', 136814),
 ('control', 136766),
 ('commonwealth', 136381),
 ('pencil', 136271),
 ('east', 136139),
 ('urban', 135941),
 ('henry', 135170),
 ('accordance', 134266),
 ('reproduction', 133619),
 ('landscape', 133579),
 ('well', 133358),
 ('el', 132876),
 ('june', 132790),
 ('children', 132419),
 ('views', 132410),
 ('1861', 132273),
 ('lee', 131946),
 ('pacific', 131651),
 ('unit', 131097),
 ('mass', 130764),
 ('creative', 130420),
 ('known', 130333),
 ('collected', 129987),
 ('address', 129929),
 ('extensive', 129875),
 ('mar', 129691),
 ('born', 129618),
 ('record', 129343),
 ('held', 129204),
 ('operations', 128698),
 ('african', 128237),
 ('presented', 128205),
 ('searched', 127683),
 ('electronically', 127637),
 ('owning', 127478),
 ('kdl', 127243),
 ('kyvl', 127243),
 ('illinois', 126520),
 ('trust', 126373),
 ('apr', 125606),
 ('year', 125582),
 ('sheet', 125238),
 ('form', 125225),
 ('y', 125130),
 ('restrictions', 125105),
 ('brown', 125012),
 ('robert', 124580),
 ('1931', 124444),
 ('open', 124106),
 ('army', 124086),
 ('tewksbury', 123514),
 ('1961', 122806),
 ('inches', 122774),
 ('1944', 122669),
 ('1901', 122437),
 ('geological', 122048),
 ('americans', 121953),
 ('unc', 121856),
 ('silver', 121508),
 ('data', 121190),
 ('man', 121139),
 ('identification', 121112),
 ('greensboro', 121093),
 ('letter', 121034),
 ('original', 120867),
 ('congress', 120865),
 ('force', 120828),
 ('economic', 120746),
 ('web', 120539),
 ('33', 120428),
 ('1988', 120197),
 ('version', 120127),
 ('july', 120049),
 ('smith', 119910),
 ('wilson', 118842),
 ('jackson', 118782),
 ('postcards', 118525),
 ('feb', 118024),
 ('branch', 117825),
 ('yale', 117823),
 ('charleston', 117478),
 ('marine', 117433),
 ('march', 117428),
 ('japan', 117362),
 ('breckenridge', 117142),
 ('president', 116931),
 ('journal', 116872),
 ('austin', 116517),
 ('1897', 116517),
 ('20th', 116413),
 ('geographic', 116068),
 ('high', 115940),
 ('underwood', 115422),
 ('1929', 115352),
 ('construction', 115301),
 ('affairs', 114797),
 ('dc', 114710),
 ('cdm', 114165),
 ('forms', 114104),
 ('houston', 114065),
 ('1968', 113695),
 ('photographers', 113481),
 ('galveston', 113434),
 ('reproduced', 113272),
 ('current', 112959),
 ('day', 112493),
 ('000', 112018),
 ('mrs', 111773),
 ('studio', 111396),
 ('1977', 111183),
 ('savannah', 110938),
 ('commission', 110714),
 ('columbia', 110364),
 ('topographic', 110106),
 ('arts', 110082),
 ('annual', 110025),
 ('near', 109753),
 ('1913', 109285),
 ('headquarters', 108867),
 ('1898', 108827),
 ('1870', 108769),
 ('scale', 108352),
 ('1884', 107901),
 ('director', 107781),
 ('hall', 107747),
 ('arthropoda', 107737),
 ('dec', 107661),
 ('1974', 107275),
 ('viewed', 106833),
 ('1970', 106778),
 ('impa', 106393),
 ('joseph', 105617),
 ('publishing', 105220),
 ('1960s', 105116),
 ('assignment', 105086),
 ('commons', 105012),
 ('campus', 104760),
 ('johnson', 104500),
 ('personnel', 104442),
 ('regents', 104304),
 ('gelatin', 104134),
 ('case', 103803),
 ('consortium', 103687),
 ('navy', 103680),
 ('cite', 103655),
 ('1880', 103470),
 ('paso', 103317),
 ('almshouse', 103020),
 ('montana', 102957),
 ('attribution', 102937),
 ('agricultural', 102773),
 ('senate', 102581),
 ('road', 102546),
 ('printed', 102276),
 ('french', 102025),
 ('cooper', 101934),
 ('1890', 101527),
 ('postcard', 101328),
 ('anderson', 101323),
 ('film', 101230),
 ('harold', 101015),
 ('central', 100930),
 ('18th', 100896),
 ('screen', 100874),
 ('sculpture', 100776),
 ('station', 100669),
 ('1967', 100337),
 ('wpacards', 100170),
 ('m38843', 99924),
 ('home', 99757),
 ('community', 99321),
 ('unidentified', 99280),
 ('1976', 99236),
 ('nov', 99147),
 ('objects', 99025),
 ('legislature', 99007),
 ('beyond', 98849),
 ('des', 98797),
 ('april', 98551),
 ('nature', 98401),
 ('bombus', 98357),
 ('funds', 98269),
 ('orange', 98133),
 ('clark', 97991),
 ('showing', 97933),
 ('aircraft', 97735),
 ('col', 97424),
 ('fair', 97124),
 ('classified', 97042),
 ('september', 97042),
 ('london', 96900),
 ('1914', 96875),
 ('october', 96659),
 ('given', 96657),
 ('three', 96618),
 ('administrative', 96053),
 ('monument', 95935),
 ('et', 95815),
 ('taylor', 95739),
 ('supported', 95486),
 ('old', 95416),
 ('paris', 95191),
 ('technology', 94990),
 ('uses', 94581),
 ('back', 94566),
 ('valley', 94549),
 ('photos', 94251),
 ('post', 94063),
 ('1909', 93737),
 ('years', 93596),
 ('saint', 93247),
 ('1912', 93169),
 ('province', 93153),
 ('administrator', 93103),
 ('mary', 93021),
 ('western', 92921),
 ('32', 92583),
 ('documentation', 92467),
 ('div', 92395),
 ('mr', 91921),
 ('obituary', 91676),
 ('via', 91507),
 ('transportation', 91502),
 ('monuments', 91381),
 ('science', 91042),
 ('town', 90978),
 ('africa', 90791),
 ('mission21', 90786),
 ('complex', 90711),
 ('references', 90398),
 ('documenting', 89892),
 ('administered', 89517),
 ('industry', 89006),
 ('1904', 88889),
 ('1985', 88667),
 ('governor', 88513),
 ('shows', 88507),
 ('sa', 88390),
 ('1917', 88367),
 ('1938', 88319),
 ('1934', 88316),
 ('temple', 88292),
 ('plant', 88071),
 ('homeplace', 87990),
 ('music', 87664),
 ('lsta', 87615),
 ('island', 87472),
 ('com', 87179),
 ('february', 86945),
 ('1895', 86620),
 ('woman', 86612),
 ('uintah', 86384),
 ('corps', 86376),
 ('1936', 86243),
 ('requests', 86169),
 ('php', 86149),
 ('herald', 86109),
 ('architectural', 86035),
 ('staff', 86024),
 ('1949', 85957),
 ('china', 85920),
 ('renaissance', 85865),
 ('1886', 85680),
 ('november', 85606),
 ('1918', 85453),
 ('fire', 85333),
 ('dr', 85104),
 ('august', 84869),
 ('native', 84753),
 ('arthur', 84674),
 ('legal', 84631),
 ('medicine', 84585),
 ('country', 84526),
 ('1975', 84336),
 ('asia', 84283),
 ('1935', 84264),
 ('land', 84007),
 ('sc', 83995),
 ('schools', 83983),
 ('1907', 83978),
 ('1951', 83959),
 ('lowell', 83688),
 ('assistant', 83635),
 ('42', 83578),
 ('intake', 83383),
 ('1956', 83372),
 ('specimen', 83361),
 ('time', 83317),
 ('description', 83261),
 ('small', 83247),
 ('1989', 83077),
 ('result', 83032),
 ('notes', 82788),
 ('reno', 82539),
 ('hewitt', 82525),
 ('va', 82463),
 ('tichnor', 82428),
 ('photographed', 82332),
 ('user', 82259),
 ('1942', 82008),
 ('obtained', 81803),
 ('list', 81773),
 ('baroque', 81668),
 ('side', 81633),
 ('appalachian', 81544),
 ('ii', 81526),
 ('organized', 81510),
 ('1933', 81503),
 ('programs', 81467),
 ('licenses', 81248),
 ('shipler', 81143),
 ('ocean', 80955),
 ('restricted', 80953),
 ('1987', 80923),
 ('frank', 80794),
 ('hospital', 80752),
 ('bibliographical', 80726),
 ('sites', 80710),
 ('1941', 80699),
 ('creativecommons', 80600),
 ('k', 80502),
 ('confederate', 80171),
 ('1915', 80024),
 ('power', 79771),
 ('walter', 79710),
 ('december', 79684),
 ('1850', 79519),
 ('individuals', 79174),
 ('prior', 79050),
 ('1943', 79023),
 ('eight', 79006),
 ('red', 78747),
 ('based', 78642),
 ('1908', 78581),
 ('gift', 78580),
 ('class', 78575),
 ('gothic', 78574),
 ('archaeology', 78568),
 ('richard', 78498),
 ('issued', 78486),
 ('session', 78459),
 ('members', 78346),
 ('romanesque', 78254),
 ('geologic', 78225),
 ('referred', 78141),
 ('nail', 78137),
 ('copy', 78007),
 ('examiner', 77946),
 ('1919', 77801),
 ('sciences', 77626),
 ('1972', 77583),
 ('1906', 77495),
 ('labor', 77399),
 ('large', 77087),
 ('gri', 77016),
 ('1962', 76933),
 ('politics', 76858),
 ('express', 76795),
 ('jr', 76613),
 ('security', 76405),
 ('scholar', 76358),
 ('base', 76119),
 ('standing', 76087),
 ('worth', 75775),
 ('opinions', 75772),
 ('islands', 75771),
 ('1903', 75761),
 ('union', 75761),
 ('nasa', 75756),
 ('1923', 75604),
 ('great', 75569),
 ('free', 75110),
 ('brenham', 75033),
 ('40', 74960),
 ('canyon', 74877),
 ('anthropological', 74734),
 ('1899', 74646),
 ('creek', 74628),
 ('romans', 74564),
 ('1928', 74549),
 ('otherwise', 74448),
 ('france', 74395),
 ('thorough', 74298),
 ('background', 74195),
 ('forests', 74118),
 ('harvard', 74020),
 ('1969', 73985),
 ('artstor', 73982),
 ('etruscans', 73875),
 ('1922', 73858),
 ('aerial', 73811),
 ('1916', 73791),
 ('support', 73740),
 ('1937', 73734),
 ('cat340573', 73646),
 ('place', 73577),
 ('1924', 73514),
 ('fulton', 73489),
 ('insects', 73347),
 ('1946', 73079),
 ('manuscript', 72832),
 ('uncg', 72805),
 ('training', 72779),
 ('mail', 72687),
 ('1963', 72645),
 ('thursday', 72533),
 ('second', 72314),
 ('1955', 72212),
 ('bastrop', 71902),
 ('avenue', 71864),
 ('1997', 71741),
 ('ariz', 71521),
 ('azlibrary', 71327),
 ('1965', 71258),
 ('application', 71055),
 ('azmemory', 70966),
 ('info', 70901),
 ('correspondence', 70645),
 ('1952', 70592),
 ('order', 70491),
 ('practices', 70254),
 ('1954', 70162),
 ('1953', 70149),
 ('palmer', 69983),
 ('uss', 69834),
 ('1883', 69703),
 ('format', 69689),
 ('many', 69679),
 ('harris', 69591),
 ...]
In [13]:
len(fd.hapaxes())
Out[13]:
2773182
In [14]:
fd.hapaxes()
Out[14]:
['contiiinecl',
 'greise',
 'genealogicalreco1888spof',
 'noveluber',
 'afgestooken',
 'ocm15045154',
 '6455965',
 'idx208420048060',
 '6406649',
 'dnsc8706589',
 'dnsc8706587',
 'dnsc8706586',
 'dnsc8706585',
 'firstprincipleso00parl_0',
 'firstprincipleso00parl_1',
 '2442383',
 '1318714',
 '04921800',
 'wintkagasspan',
 'choegun',
 'd9e953e37b27cb8029cb2e5ca4ee690f',
 '6406647',
 '346373',
 'ocm57308713',
 '4a329a29362c8693d61df65f222b60b4',
 '346376',
 'dfst8304359',
 'dnsc9001680',
 '04404800',
 'dfst9106751',
 '652700225',
 'meeler',
 '225251',
 'underbank',
 'ocm49779203',
 'ufers',
 'riilcil',
 'qurxtity',
 '284358',
 '284359',
 '45882508',
 'ficantly',
 '284352',
 '284353',
 '284351',
 '284356',
 '284357',
 '284355',
 '51758736',
 'dnsc9001682',
 '06878700',
 'c944f95bb4c850d0326c9bf7b6e35839',
 '412_dsp_waste2energy_0051',
 '469492',
 'racticul',
 'ahc071015002a',
 '2155521',
 'wageindex',
 '7afeffc8f2e92619106',
 '2155522',
 '6698004',
 'diarioenquesepro00cigo',
 '6698005',
 '0160734517',
 'bankroll',
 'ocm41962946',
 '236485020',
 '6698000',
 '870466468',
 '6698001',
 'dfsd0202493',
 'ahc1702149001',
 'dfsd0202490',
 'fiy',
 'peble',
 'colvtalner',
 '39999064270257',
 '428077796',
 'dnsc8706588',
 'veterinaryhomopa00hurn',
 '6698009',
 'idx208420020051',
 'idx208420020050',
 'idx208420020053',
 'idx208420020052',
 'idx208420020055',
 'idx208420020054',
 'idx208420020057',
 'idx208420020056',
 'idx208420020059',
 'idx208420020058',
 'manipulaton',
 'todod',
 '40_cfd_os_2004_1201_141_404',
 '40_cfd_os_2004_1201_141_405',
 '40_cfd_os_2004_1201_141_406',
 '40_cfd_os_2004_1201_141_400',
 '40_cfd_os_2004_1201_141_401',
 '40_cfd_os_2004_1201_141_402',
 '40_cfd_os_2004_1201_141_403',
 'dnsn9010679',
 'dnsn9010678',
 '40_cfd_os_2004_1201_141_408',
 '40_cfd_os_2004_1201_141_409',
 '6601455',
 'highwagejobsinco00unit',
 'fi4',
 '6601453',
 '6601452',
 'whichemploy',
 'fi9',
 '6601459',
 '501938444',
 '123955633',
 '43960370',
 'onelincolnstreet00hmma',
 '723375',
 'néill',
 'phalluses',
 'bustum',
 '237047684',
 '3a8f16a675b09218885',
 'dfsd0405578',
 'idx208420269484',
 '538677',
 '538675',
 '538674',
 '538673',
 '538672',
 '538671',
 '538670',
 'embacy',
 'llte1111',
 '538679',
 '538678',
 '6443891',
 '6443890',
 '6443893',
 '6443892',
 '6443895',
 '6443894',
 '6443897',
 '6443896',
 'embach',
 '6443898',
 'podali',
 'dfst8600002',
 'dfst8600003',
 'footwashing',
 'dfst8600001',
 'dfst8600004',
 'dnsc8703203',
 'dnsc8703202',
 '0803728565',
 'dnsc8703200',
 'dnsc8703207',
 'dnsc8703206',
 'dnsc8703205',
 'dnsc8703204',
 'lettertodearaunt00west146',
 'dnsc8703208',
 '5486051',
 'idx208420165554',
 'idx208420165555',
 'idx208420165556',
 '67820610r',
 'idx208420165550',
 'idx208420165551',
 'idx208420165552',
 'idx208420165553',
 'idx208420165558',
 'idx208420165559',
 '39999066745066',
 '57368602',
 '0160708788',
 'utarid',
 '728242073',
 '550828',
 'putorious',
 'utarin',
 'hr096p2',
 'berlingebrüder',
 'ocm58431435',
 'scrviot',
 'dc10blk_c48061_o01',
 'archonnov1923111dumm',
 'agr65000117',
 '611902418',
 'ranchlands',
 'dfsd0505601',
 'mw0398',
 'mw0394',
 'mw0395',
 'mw0396',
 'mw0397',
 'mw0390',
 'mw0392',
 'mw0393',
 'priiliitive',
 'organisada',
 'idx208420040645',
 'chattenango',
 'ocm57199627',
 'annualreportnati19932nati',
 '755_131_015_01',
 '755_131_015_02',
 'tobacco_nrp23e00',
 'dayrl',
 '45199902',
 'idx208420226025',
 'lixuna',
 'fineart',
 'programlist00mass',
 'b10828126',
 '03ad2af9e5e8a56ca6f69b3cc435e914',
 'enfóquese',
 'nomenclatoriszoo00agas',
 '766588',
 'sermones04wycl',
 '7262462',
 '7262463',
 '7262460',
 '7262461',
 '7262466',
 '7262467',
 '7262464',
 '7262465',
 '7262468',
 '7262469',
 'nxtilnd',
 'ocn227183726',
 'congenite',
 'idx208420148768',
 'idx208420148769',
 'idx208420148766',
 'idx208420148767',
 'idx208420148764',
 'idx208420148765',
 'idx208420148762',
 'idx208420148763',
 'idx208420148760',
 'idx208420148761',
 '826128635',
 'claymoore',
 'alleviations',
 'contributiontohi01ridl',
 '12071628',
 'magazijnvantuins15laar',
 'investigationofcnyc0708unit',
 '454th',
 'dnsc9108094',
 'underband',
 'achc172',
 'compléte',
 'annualreporttown1962huds',
 '0062_0053',
 '0062_0052',
 '0062_0051',
 '0062_0050',
 '0062_0057',
 '0062_0056',
 '0062_0055',
 '0062_0054',
 '0062_0059',
 '082833',
 '082836',
 'in8st',
 '803985224',
 'replytoharnackon00cremuoft',
 '200640',
 '200641',
 '200643',
 '200646',
 '200647',
 'annualreport1908mass',
 'ntdrmiyp',
 '39350564',
 '335648',
 'bb3be5146b92278a48870308d6d82935',
 '885665870',
 'r8026',
 'archivfrnaturg7301berl',
 '57449076',
 'dnsc9108090',
 'ocm46705900',
 'p0031_23621_0001',
 '6400344',
 '6400345',
 'annualreportsoft1996stod',
 '6400347',
 '6400340',
 'snrrey',
 '6400342',
 'perlavenutainrom00pizz',
 '6400348',
 '6400349',
 'bostoncollegemagsp1994bos',
 '676967',
 'ahc099276002a',
 'ocm50762106',
 'ocm32263636',
 'brucharzt',
 '19348145014000000000',
 '978984',
 '978987',
 '978982',
 '003753477',
 'twk27811',
 'vnrl',
 'cbc48325_h02',
 'vnrk',
 'dfst9100038',
 'gemonteerd',
 'usparticipationi1993unit',
 'ds01017',
 'photomisc061',
 '000902464',
 '52913537',
 'mueewscontains',
 '6473418',
 '6473419',
 '6473412',
 '6473413',
 '6473410',
 '6473411',
 '6473416',
 '6473417',
 '6473414',
 '6473415',
 '5865218',
 '5865219',
 '498_001',
 '34623874',
 '5865210',
 '5865211',
 '5865212',
 '5865213',
 '5865214',
 '5865215',
 '5865216',
 '438065950',
 '759866782',
 '427921282',
 '003_may',
 'büderverzeichnis',
 '06_10_017847',
 '06_10_017846',
 '06_10_017845',
 '06_10_017844',
 '06_10_017843',
 '06_10_017842',
 '06_10_017841',
 '06_10_017840',
 'rainmaking',
 '06_10_017849',
 '08_06_014939',
 '08_06_014938',
 '08_06_014937',
 '08_06_014936',
 '08_06_014935',
 '08_06_014934',
 '08_06_014933',
 '08_06_014932',
 '08_06_014931',
 '08_06_014930',
 'ocm26865627',
 '00828702',
 '00828701',
 'n24⁰',
 'nouvellebiograph41hoef',
 'alrso',
 'annualreportofto2008unse',
 'flnctnating',
 'muskelbewegung',
 '14759262',
 'ocm14847373',
 'ecologicos',
 'vegas_055',
 'nfant',
 'fws0bsusfi83140201usfi',
 'eann',
 'idx208420105650',
 'idx208420105653',
 'idx208420105652',
 'idx208420105655',
 'idx208420105657',
 'idx208420105656',
 'idx208420105659',
 'idx208420105658',
 'eana',
 '123915919',
 'herthel',
 '20canyon',
 'philipshandyatla00bart',
 'pl10blk_c48185_000',
 '6890229',
 'issliiiiiipo',
 'tendenzdrama',
 'translationsrepr00univ',
 'swalllpy',
 '292504',
 'm2g46',
 '292506',
 '292507',
 '93590',
 'tchats',
 'idx208420043302',
 'idx208420043303',
 'idx208420043300',
 'idx208420043301',
 'idx208420043306',
 'idx208420043307',
 'idx208420043304',
 'idx208420043305',
 'idx208420043308',
 'idx208420043309',
 'physeodesmos',
 'ptbw_149',
 'ptbw_148',
 '6514768',
 '6514769',
 'ef9ce3ebeb45153cb2f8f310afa603e4',
 '6514764',
 '6514765',
 '6514766',
 '6514767',
 '6514760',
 '6514762',
 '6514763',
 '6371356',
 '6371357',
 '6371354',
 '6371355',
 '6371352',
 '6371353',
 '6371350',
 '6371351',
 'ailsofi',
 'prsity',
 '6371358',
 '6371359',
 't35b',
 't35c',
 '794278362',
 'desso',
 '800957',
 'capecodnationals222unit',
 'boyd1905',
 '706503356',
 'p0001_1136_18648_verso',
 '2007_8_1163',
 '234072517',
 '234072515',
 '39999065430165',
 'pl10blk_c48185_004',
 'historyofmoderns02ferg',
 'dm0308',
 'dm0309',
 'dm0306',
 'dm0305',
 'dm0302',
 'dm0303',
 'dm0300',
 'dm0301',
 'reportofboardofm00mass_25',
 'inoiiejr',
 'reportofboardofm00mass_24',
 '180764899',
 '19348025650100000000',
 'pedicellate',
 '098889',
 '851083009',
 '66_2_1_001',
 'dasc8610641',
 'dasc8610640',
 'reportofboardofm00mass_23',
 'кампании',
 'ocm54515622',
 'reportofboardofm00mass_22',
 '006275927',
 '46670027',
 '8c5cf2d331b244e18b0fd1adb787e260',
 'dmsd0742448',
 '81983ac',
 'samaritanchronic00josh',
 'laukaan',
 'universityofnort326univ',
 '42448916456601000000',
 'totoramba',
 'condev2848_overview',
 '522141',
 'americanmedicalt2186unse',
 'investigationofconc12unit',
 'rightbrained',
 'koekkoek',
 'meramecgreenway00nati',
 '5228565',
 'immmlmm',
 'chonsia',
 'rightlateralized',
 'dfst8411099',
 'dfst8411098',
 'agr64000414',
 '19348209700200000000',
 'nerinea',
 '800022',
 '81640821',
 'idx208420177306',
 'idx208420177307',
 'idx208420177304',
 'idx208420177305',
 'idx208420177302',
 'idx208420177303',
 'idx208420177300',
 'idx208420177301',
 'archivfrmikros911berl',
 'ocm44621113',
 'iilacle',
 'idx208420177309',
 'marbledecoration00blag',
 '60694786',
 '318910995',
 '800023',
 'ms_1604_mitigationhouse_2415',
 'health_emphasis_program',
 'idx208420258488',
 'idx208420272199',
 'idx208420272198',
 '800020',
 'idx208420272195',
 'idx208420272194',
 'idx208420272197',
 'idx208420272196',
 'idx208420272191',
 'idx208420272190',
 'idx208420272193',
 'idx208420272192',
 '00002466_tn_0001',
 'newsbooks',
 '69952104',
 'idx208420241588',
 'idx208420241589',
 'z203',
 'idx208420241584',
 'idx208420241586',
 'idx208420241587',
 'idx208420241580',
 'idx208420241581',
 'idx208420241582',
 'idx208420241583',
 'lltry',
 '6682510',
 '6682513',
 '6682515',
 '6682514',
 '6682517',
 'acidlty',
 '6682519',
 '6682518',
 'ahc167002032a',
 'machery',
 'researchpaper22nort',
 'bibliapauperumco00unwi',
 '05000us48177m',
 'soang',
 '206543',
 'lettertodeardebo00chap',
 'catanaei',
 '167656',
 '39999065612291',
 'dnsn9804200',
 'dfsd0202499',
 '06159200',
 'dfsd0202498',
 '54490606',
 '31232081',
 'universityofnort224univ',
 '800024',
 '35581049',
 '40152221',
 '00007852_tn_0001',
 'studentsguidetod00nettuoft',
 'fih',
 'dfsd0202492',
 'dentair',
 'fik',
 '38596296',
 '97643431',
 '00002319_tn_0001',
 'dfsd0202497',
 '19571027',
 '39999057055798',
 'dfsd0202496',
 'researchpapers81inte',
 'cereti',
 'kalokoe',
 'dfsd0202495',
 '261710',
 'dscn1922',
 'cerete',
 '05067100',
 '13729521',
 'censusofbusiness1952unit',
 'cbs4893370_001',
 'vorcl',
 'instiu',
 'evaluaciónes',
 'ohtaiu',
 '65820290r',
 '19348199005000000000',
 '1049118',
 '48999328',
 '006355217',
 'grifflin',
 'ocm47248522',
 'exobiologyineart00ames',
 'armymedicalmuseumcollectioncatalogueofpathologicaldrawingsofmedical',
 '225867405',
 '6003189',
 '6003188',
 '6003187',
 '6003186',
 '39999063171092',
 '6003184',
 '6003183',
 '6003182',
 '6003181',
 '6003180',
 'dnsn8604075',
 'dnsn8604074',
 'reportofdanishbi10dans',
 'dnsn8604076',
 'dnsn8604071',
 'dnsn8604070',
 'dnst8208141',
 'athénée',
 'dnst8208144',
 'ptbw_209',
 '147279',
 'shadscaje',
 'abrahamvs',
 '6361628',
 'marign',
 '6460578',
 '147271',
 '147270',
 '6361621',
 '147272',
 '6361627',
 '6361626',
 '6361625',
 '6361624',
 'illspi',
 'oceanographicobs1971moyn',
 'ptbw_206',
 'b20386424',
 'idx208420027629',
 'idx208420027628',
 'idx208420027626',
 'idx208420027625',
 'idx208420027624',
 'idx208420027623',
 'idx208420027622',
 'idx208420027621',
 'ncreased',
 'poorpotterofyork01bark',
 '6460570',
 'lq9ic',
 '2011506740',
 '2011506741',
 '2011506746',
 '6460573',
 'lsouth',
 'charruaud',
 '729885960',
 'moanavilla',
 '755_029_012_01',
 'sumner138_2_1_120c1',
 'suiuciuc',
 'idx208420243402',
 '6460576',
 'fillallcial',
 'lytleton',
 'portrteeiniger00riga',
 'idx208420207973',
 '002590',
 'catalogueofstate1863stat',
 '277_neuve214999',
 'idx208420017259',
 'idx208420017258',
 '6401770',
 '6401776',
 '6401777',
 '6401774',
 'dnsn9010670',
 'idx208420017251',
 'idx208420017250',
 'idx208420017252',
 'idx208420017255',
 'idx208420017254',
 'idx208420017257',
 'idx208420017256',
 '40_cfd_os_2004_1201_141_407',
 'dnsn9010675',
 'preinstallation',
 '002597',
 'twk61259',
 'twk61254',
 'twk61255',
 'twk61256',
 'twk61257',
 'twk61250',
 'twk61251',
 'twk61252',
 'twk61253',
 'americanfarmer2425balt',
 'pendleys',
 'coustet',
 '009917303',
 'blapey',
 '754123',
 'oubrerie',
 'fs200112',
 '36595169',
 'fs200117',
 '6601454',
 'fs200119',
 'nazvy',
 '6601457',
 '810216859',
 'ocm32849495',
 '6601456',
 '6601451',
 '943296',
 '6601450',
 '94025092',
 'idx208420243404',
 'histoirenaturell00less',
 'buonconsiglio',
 '7f97e8ebc56f3b036fbaba41f511ef14',
 '50141339',
 '14200136',
 '42148375002000000000',
 'laueblose',
 '3f⁸',
 '6601458',
 '5623098',
 '5623099',
 '5623092',
 '5623093',
 '5623090',
 '5623091',
 '5623096',
 'i1r11',
 '5623094',
 '5623095',
 'eiiiaient',
 'paternitiy',
 '49214378',
 '06801433',
 'annualreportcolu1944colu',
 'idx208420243409',
 '1039100',
 'pocketalmanackfo1807amer',
 'guilfordcollegi519021903',
 '001326612',
 '855883',
 'lehrlings',
 'internationalesa04inte',
 'dnsc9401209',
 '41382262',
 '04_1of2_dsc_2693',
 '04372800',
 'doi_2770',
 '06538300',
 '62_5_9_001',
 '173219654',
 'uspharmacopoeiai00slsnuoft',
 '42448918750060000000',
 'roffler',
 'hydrabiologie',
 '05236200',
 'nouvellehyginede00tave',
 'yonaknoka',
 '708252503',
 'ocm50037038',
 '0160754798',
 'stakelin',
 '5fgeneral',
 '217821',
 '217822',
 '217823',
 '217824',
 '217825',
 '315850782',
 'zeitschriftfrele16elek',
 'umn23361b',
 'cyclopaediaofpra11ziemuoft',
 '201416',
 '38994315',
 'isetan',
 'hotisse',
 'cinique',
 '828288556',
 'bogasse',
 '73272738',
 '39999065840397',
 'kisima',
 'whitemartinsgeni00whituoft',
 '719450300',
 '320456955',
 '5823242',
 '10760193',
 '9780160683268',
 't45nr23e',
 '6663688',
 '6663689',
 '6663684',
 '6663685',
 '6663686',
 '6663687',
 '6663680',
 '6663681',
 '6663682',
 '6663683',
 '5613176',
 'dnsd0311938',
 '753739604',
 'bulletindelasoci311892soci',
 '449254',
 '2568052r',
 'idx208420073510',
 'dnst9101238',
 'idx208420073511',
 'dnst9101239',
 'idx208420073512',
 'dnst9101236',
 'idx208420073513',
 '5613170',
 'dnsd0311932',
 '6428023',
 'dnst9101234',
 'condev7577e',
 'dnst9101235',
 'idx208420073516',
 'catalogueofficiel00expo',
 'dnsd0311937',
 'idx208420073517',
 'annualreportofto1993unse',
 'dnsd0311936',
 'hightley',
 'dnst9101230',
 '03229a',
 'dnst9101231',
 '51277834',
 '6412729',
 'ʹemile',
 'staqe',
 'wadowice',
 'motilitydisturbance',
 '279240',
 '279242',
 '279244',
 '279247',
 'idx208420045494',
 'idx208420045495',
 'idx208420045496',
 'idx208420045497',
 'idx208420045490',
 'idx208420045491',
 'idx208420045492',
 'idx208420045493',
 'cnnnor',
 'idx208420045498',
 'idx208420045499',
 'iilpgs',
 'va2259',
 'gainestsown',
 '695055989',
 '464629053',
 '38136000196400',
 '85833201',
 '6472240',
 '6472241',
 '6472242',
 '6472243',
 '6472244',
 '6472245',
 '6472246',
 '6472247',
 '6472248',
 '6472249',
 '6600241',
 'dx18780838',
 '00006612',
 '2412962',
 '00006615',
 '098231',
 'nordhang',
 '713205230',
 '6600246',
 '01758900',
 '779978829',
 'overcored',
 '00006616',
 'paperspecs',
 '244008156',
 'photosynthesis00spoe',
 'dfsd0509278',
 '2065802',
 '7308832',
 'lnrvnl',
 '1891mc',
 '88699',
 'b1162353',
 'lienor',
 'annualreportfort1941bedf',
 'eppsit',
 '52472843',
 '6523569',
 '85119',
 '8869c',
 '85116',
 '85117',
 '85114',
 '85115',
 'd9f2d0667a199d15e0cc709874d89bb6',
 '765950',
 '2261177',
 'lnnlity',
 'siybgaalcteu',
 '109007',
 'shmagin',
 '147974229',
 'heredero',
 '230937142',
 '827199896',
 'idx208420149314',
 'idx208420149315',
 'idx208420149316',
 'idx208420149317',
 'idx208420149310',
 'musiclicensingsm00unit',
 'idx208420149312',
 'idx208420149313',
 '254209',
 'cueur',
 'idx208420149318',
 'idx208420149319',
 '233697515',
 '7312413',
 '7312412',
 '7312411',
 'fficiency',
 '7312417',
 '7312416',
 '7312415',
 '7312419',
 'annualreporttown1960cent',
 'caaperating',
 '291157',
 '04966000',
 'histoireabrg00joll',
 'psyamericanjourn33ameruoft',
 'präparator',
 '61232861',
 '116782',
 'problemofevilinp00full',
 'dnsc8703201',
 'idx208420042778',
 'idx208420042779',
 'a946',
 'a940',
 'a941',
 'a942',
 'a943',
 'idx208420042770',
 'idx208420042771',
 'idx208420042772',
 'idx208420042773',
 'idx208420042774',
 'perjudicados',
 'idx208420042776',
 'idx208420042777',
 'establishplanned00bost',
 '294656',
 '294654',
 '294652',
 '294653',
 '294650',
 '294651',
 'ascaláfidos',
 '06065200',
 'livernois',
 '294658',
 '294659',
 'henryviiienglish01gasq',
 'va2253',
 '0160765609',
 'economy4',
 ...]
In [15]:
def hasNumbers(inputString):
    return any(char.isdigit() for char in inputString)
In [16]:
texthaps = []
for hap in fd.hapaxes():
    if not hasNumbers(hap):
        texthaps.append(hap)
len(texthaps)
Out[16]:
392117
In [17]:
len(fd.hapaxes())
Out[17]:
2773182
In [18]:
texthaps
Out[18]:
['contiiinecl',
 'greise',
 'noveluber',
 'afgestooken',
 'wintkagasspan',
 'choegun',
 'meeler',
 'underbank',
 'ufers',
 'riilcil',
 'qurxtity',
 'ficantly',
 'racticul',
 'wageindex',
 'bankroll',
 'fiy',
 'peble',
 'colvtalner',
 'manipulaton',
 'todod',
 'whichemploy',
 'néill',
 'phalluses',
 'bustum',
 'embacy',
 'embach',
 'podali',
 'footwashing',
 'utarid',
 'putorious',
 'utarin',
 'berlingebrüder',
 'scrviot',
 'ranchlands',
 'priiliitive',
 'organisada',
 'chattenango',
 'dayrl',
 'lixuna',
 'fineart',
 'enfóquese',
 'nxtilnd',
 'congenite',
 'claymoore',
 'alleviations',
 'underband',
 'compléte',
 'ntdrmiyp',
 'snrrey',
 'brucharzt',
 'vnrl',
 'vnrk',
 'gemonteerd',
 'mueewscontains',
 'büderverzeichnis',
 'rainmaking',
 'alrso',
 'flnctnating',
 'muskelbewegung',
 'ecologicos',
 'nfant',
 'eann',
 'eana',
 'herthel',
 'issliiiiiipo',
 'tendenzdrama',
 'swalllpy',
 'tchats',
 'physeodesmos',
 'ailsofi',
 'prsity',
 'desso',
 'inoiiejr',
 'pedicellate',
 'кампании',
 'laukaan',
 'totoramba',
 'rightbrained',
 'koekkoek',
 'immmlmm',
 'chonsia',
 'rightlateralized',
 'nerinea',
 'iilacle',
 'health_emphasis_program',
 'newsbooks',
 'lltry',
 'acidlty',
 'machery',
 'soang',
 'catanaei',
 'fih',
 'dentair',
 'fik',
 'cereti',
 'kalokoe',
 'cerete',
 'vorcl',
 'instiu',
 'evaluaciónes',
 'ohtaiu',
 'grifflin',
 'armymedicalmuseumcollectioncatalogueofpathologicaldrawingsofmedical',
 'athénée',
 'shadscaje',
 'abrahamvs',
 'marign',
 'illspi',
 'ncreased',
 'lsouth',
 'charruaud',
 'moanavilla',
 'suiuciuc',
 'fillallcial',
 'lytleton',
 'preinstallation',
 'pendleys',
 'coustet',
 'blapey',
 'oubrerie',
 'nazvy',
 'buonconsiglio',
 'laueblose',
 'eiiiaient',
 'paternitiy',
 'lehrlings',
 'roffler',
 'hydrabiologie',
 'yonaknoka',
 'stakelin',
 'isetan',
 'hotisse',
 'cinique',
 'bogasse',
 'kisima',
 'hightley',
 'ʹemile',
 'staqe',
 'wadowice',
 'motilitydisturbance',
 'cnnnor',
 'iilpgs',
 'gainestsown',
 'nordhang',
 'overcored',
 'paperspecs',
 'lnrvnl',
 'lienor',
 'eppsit',
 'lnnlity',
 'siybgaalcteu',
 'shmagin',
 'heredero',
 'cueur',
 'fficiency',
 'caaperating',
 'präparator',
 'perjudicados',
 'ascaláfidos',
 'livernois',
 'yamasukera',
 'saurole',
 'antsiraben',
 'ahimatitsy',
 'goelawa',
 'lusembe',
 'agrcelnent',
 'maurain',
 'andana',
 'bucassen',
 'hcving',
 'invigorators',
 'natalbai',
 'clvc',
 'caarriecl',
 'efugee',
 'wahasha',
 'clisenscs',
 'schilcrat',
 'ruvin',
 'calmcit',
 'eazey',
 'hydrophobicities',
 'llcigl',
 'sincan',
 'cesana',
 'missionassisant',
 'yecorato',
 'sellare',
 'tibetische',
 'presidentswashington',
 'tehefoucte',
 'ethnomedicine',
 'tasmanien',
 'saimdang',
 'linbs',
 'harlaire',
 'ethiop',
 'bluehill',
 'kafri',
 'tekau',
 'hurney',
 'tioton',
 'tekam',
 'portaient',
 'ssistetla',
 'ghottingen',
 'circuito',
 'kaiawase',
 'faucettralphebiography',
 'definciency',
 'belolawek',
 'longnor',
 'pilosebaceous',
 'stoillach',
 'msun',
 'nlcclical',
 'throi',
 'kaululaau',
 'throa',
 'throc',
 'антоновские',
 'illatter',
 'bollwyller',
 'winkelmannischen',
 'drycell',
 'treff',
 'yull',
 'cialized',
 'dahcotah',
 'komarovʺ',
 'rossolimo',
 'thousaild',
 'lognette',
 'galgesberg',
 'umbeluzi',
 'prorluct',
 'stephanius',
 'castellamonte',
 'lewice',
 'wojtosik',
 'abdeldaim',
 'haitt',
 'haita',
 'schnée',
 'gathr',
 'piranema',
 'lucrene',
 'godov',
 'hyten',
 'jadot',
 'brsorc',
 'eingaklebt',
 'swotting',
 'kistenmacher',
 'tichig',
 'lliliversity',
 'chimneystacks',
 'dandolo',
 'shevistskikh',
 'gentisate',
 'semiofficiul',
 'mearsheimer',
 'jounrnal',
 'waliace',
 'wahrhafftige',
 'rechardson',
 'turnikom',
 'niounttlins',
 'baramadagascar',
 'ressainance',
 'janorschke',
 'аграрной',
 'editnr',
 'upholstory',
 'rcjuis',
 'mormoopidae',
 'dipsogenic',
 'ŭihoe',
 'salomonsen',
 'crames',
 'apalaches',
 'mcotea',
 'sagacitate',
 'fringy',
 'seminaristinnenkurs',
 'encouragers',
 'alî',
 'agmiller',
 'morzon',
 'insitituto',
 'regentibus',
 'strandmark',
 'ladhoff',
 'иностранным',
 'créance',
 'branty',
 'nncienl',
 'pietruszo',
 'reous',
 'pietrusza',
 'requring',
 'eigl',
 'eigo',
 'sinamary',
 'dashino',
 'dorpsoverste',
 'eevvaannggeelliizziinngg',
 'lloyte',
 'wwod',
 'ontiberos',
 'cqulvnlent',
 'espero',
 'photogrammety',
 'ulst',
 'multifire',
 'cuicado',
 'adamsone',
 'dcvclqpwnt',
 'aspian',
 'adamsons',
 'converty',
 'travleing',
 'dendale',
 'cveli',
 'ashurnasirpal',
 'lsbilitgo',
 'abgerungen',
 'tacting',
 'reeneng',
 'nucleotidases',
 'alz',
 'diaconia',
 'langnaw',
 'phorbia',
 'schwst',
 'mediatii',
 'einschlieslich',
 'losleben',
 'withsubsequent',
 'boxman',
 'gewonnener',
 'hensry',
 'bruti',
 'gefährligkeiten',
 'rhayader',
 'albenseiten',
 'reloplnent',
 'approsimately',
 'vlttu',
 'louzon',
 'gloghini',
 'reasest',
 'diigital',
 'chúng',
 'junkets',
 'glenrosa',
 'harmount',
 'abarms',
 'wawuna',
 'hogards',
 'delinavit',
 'ē',
 'solvccl',
 'lumutbalai',
 'sigananda',
 'martargis',
 'iaensc',
 'friedrichstafen',
 'elyah',
 'lililyo',
 'hungerpest',
 'shaijples',
 'estioin',
 'akomatsu',
 'latil',
 'unfought',
 'kasavubu',
 'rited',
 'disfrutara',
 'piedmonth',
 'insepector',
 'latiu',
 'wanzhou',
 'вапко',
 'claii',
 'tomologyw',
 'claie',
 'thatsächliche',
 'braunschw',
 'tlcleterious',
 'comwesseafron',
 'cliscoveri',
 'cliscoverg',
 'cliscoverd',
 'onyefulu',
 'eggsquisite',
 'binjouin',
 'vatundamu',
 'lugee',
 'merpeople',
 'fcedings',
 'jaywalkers',
 'dllrille',
 'durres',
 'durret',
 'ungen',
 'ungeh',
 'suerfu',
 'ungef',
 'fischtrockenplatz',
 'amaudruz',
 'atact',
 'podepsa',
 'citlzens',
 'hoëvell',
 'feteioa',
 'jsteiv',
 'pánico',
 'sosikrates',
 'collectivized',
 'gülden',
 'ordinavit',
 'asxi',
 'intimiano',
 'ctzel',
 'boutot',
 'murza',
 'celebrees',
 'halacsy',
 'obsèques',
 'availnl',
 'arcandi',
 'cidio',
 'consecta',
 'soopahya',
 'houtton',
 'yungon',
 'breissgauischen',
 'suhe',
 'suho',
 'suhs',
 'sanitio',
 'infrmatin',
 'romatipografia',
 'xianwu孔宪武',
 'gijsbertus',
 'propolae',
 'gastrophod',
 'medisons',
 'accão',
 'probates',
 'vergier',
 'головину',
 'tjlu',
 'syncopalis',
 'tjll',
 'tjli',
 'tenthouses',
 'heldenbuch',
 'nonunions',
 'beets_______________________________________________',
 'sitllation',
 'muckerman',
 'illvasioli',
 'spaski',
 'dascalescu',
 'belgariad',
 'pothicary',
 'oouurrccuullttuurree',
 'плеть',
 'fogva',
 'noening',
 'uncentralized',
 'zuydt',
 'ambitiously',
 'cllallges',
 'ivifbboobrrrniiiggiinngg',
 'mgney',
 'babungokind',
 'iinetl',
 'bioisosteres',
 'pressurevolume',
 'isurumunija',
 'solfeggio',
 'illformed',
 'merdin',
 'dannal',
 'dannam',
 'cakenge',
 'merhandise',
 'dannat',
 'vniuersis',
 'magnética',
 'caxcan',
 'cjrried',
 'cessing',
 'oeurs',
 'investmerrts',
 'götzenfesten',
 'lnental',
 'mdclxxxxix',
 'essercitato',
 'munduruku',
 'atomgruppen',
 'женской',
 'copiè',
 'benicht',
 'sairsc',
 'genvinam',
 'thehvside',
 'rachitogenesis',
 'aclminislration',
 'koepf',
 'kift',
 'detenninecl',
 'horloger',
 'naudaei',
 'magaro',
 'limbowe',
 'abegawa',
 'mahemiah',
 'bakevellia',
 '牧草大田轮作制的理论与技术',
 'canaletti',
 'conrltr',
 'failittg',
 'patert',
 'ganisnl',
 'optitrack',
 'stationsgeschwister',
 'sonderbahre',
 'reestab',
 'tranvik',
 'ericksonhurt',
 'ayudarán',
 'superintendint',
 'cfveral',
 'bisignano',
 'lagrassa',
 'hwadlefy',
 'prrat',
 'dissapproval',
 'platenses',
 'gorean',
 'oolanahhee',
 'snni',
 'yuj',
 'riječi',
 'friedensengel',
 'shoort',
 'pyonchan',
 'oncepts',
 'flatcl',
 'typper',
 'sarcamento',
 'scecam',
 'elllenton',
 'grufferman',
 'nerby',
 'sabeundi',
 'hulsobus',
 'nicklos',
 'hallengeso',
 'thâeophile',
 'hypother',
 'angiolini',
 'cuthrie',
 'sorgfältigste',
 'gretehelskov',
 'latabll',
 'petroiacomo',
 'cltlily',
 'prnited',
 'bejewelled',
 'charmber',
 'streynsham',
 'mangaoang',
 'marantearum',
 'inocultztio',
 'bescherme',
 'surrouncled',
 'ohrloff',
 'luggs',
 'kršćanskom',
 'lugga',
 'irlcilils',
 'heavenbeijingming',
 'frommarch',
 'wasenberg',
 'augustln',
 'shodows',
 'venkatanarasimha',
 'mavjen',
 'tenox',
 'eiran',
 'vnccint',
 'tabyshalieva',
 'millwr',
 'prevendado',
 'millwn',
 'wärmflaschen',
 'lillypad',
 'pharisiens',
 'trsiiive',
 'accwnul',
 'belcano',
 'nitwi',
 'menaham',
 'meditsinskii',
 'vigileo',
 'kressner',
 'efforcaient',
 'fulanis',
 'sorgotten',
 'faciendo',
 'sanshui',
 'facienda',
 'zeën',
 'shacleford',
 'parguasa',
 'jtiaidia',
 'videofile',
 'kuchni',
 'tanceac',
 'bonifont',
 'unatineg',
 'consvls',
 'sutek',
 'norresundby',
 'originalgrösse',
 'specialistsâ',
 'pflycbolojfy',
 'lopardi',
 'mjesta',
 'yokevich',
 'strafeships',
 'mahachan',
 'plistodon',
 'ficticsoosi',
 'psnlts',
 'weegar',
 'dobrego',
 'iderable',
 'cotapino',
 'oversllpply',
 'octaethylporphyrinatomanganese',
 'iderably',
 'signiiicnnt',
 'robbards',
 'leimentoll',
 'lamroena',
 'noncompact',
 'ryckère',
 'primaveral',
 'gnieznienskiego',
 'tamikas',
 'grenony',
 'bactcria',
 'gastroli',
 'ocotal',
 'riconoscenza',
 'monoarticular',
 'bulletinsummerquappa',
 'crotzer',
 'utilus',
 'talija',
 'fereeda',
 'kleinster',
 'kleinstes',
 'promnix',
 'rusciano',
 'chrisant',
 'misshappen',
 'oinicn',
 'связанных',
 'harrewyn',
 'phlegm',
 'skupljena',
 'epidemiologi',
 'imags',
 'berliz',
 'sheelat',
 'descobertos',
 'unreviewed',
 'caesareansectionatfullterm',
 'justitiam',
 'rairden',
 'strakville',
 'vavenby',
 'unmt',
 'tuscapampa',
 'morlund',
 'hrougll',
 'cluestionable',
 'kupecký',
 'vebred',
 'guican',
 'lastboote',
 'nitrogenn',
 'mewaygo',
 'nitrogenj',
 'womenßs',
 'turistjcgob',
 'explicatifs',
 'cuyubini',
 'carwrecks',
 'korlet',
 'nalyo',
 'parasitarias',
 'auristes',
 'cawte',
 'mitlli',
 'clairinghnigh',
 'gierish',
 'estanco',
 'ugel',
 'missionsalmosen',
 'uger',
 'ussie',
 'choquita',
 'factaor',
 'intermecliate',
 'văn',
 'tempelterrasse',
 'multisynaptic',
 'palmerly',
 'malakhovskago',
 'cheother',
 'tremellen',
 'diputndo',
 'uvum',
 'garwicz',
 'tsournos',
 'licntion',
 'bauant',
 'виссарионович',
 'powdilfhorn',
 'murphay',
 'geroll',
 'feyken',
 'fromrichard',
 'eastabrooks',
 'jugali',
 'mégaptère',
 'utlis',
 'xsoiirces',
 'mgct',
 'abulencia',
 'marindin',
 'blumenga',
 'brethaur',
 'guitele',
 'dégré',
 'potjokonkong',
 'erysipela',
 'variodermite',
 'depiciences',
 'shadkhen',
 'catechetische',
 'owyhigh',
 'moutliecl',
 'scherschligt',
 'soudé',
 'valbracht',
 'historienschreiber',
 'zamparelli',
 'foulc',
 'lecram',
 'poespa',
 'hospsital',
 'veteriizary',
 'ezekeli',
 'unqualifiecl',
 'eloya',
 'succer',
 'reginarum',
 'succee',
 'bildenback',
 'chahogum',
 'generalate',
 'ordensmönchen',
 'karioglu',
 'proverbia',
 'élisabeth',
 'doncqu',
 'publisherl',
 'bameileke',
 'fannel',
 'alligence',
 'prevellts',
 'aassembly',
 'enroller',
 'metiokochda',
 'trumall',
 'killbourn',
 'tailaferro',
 'enforcerneiit',
 'linggau',
 'baranovski',
 'xenpang',
 'etymologicas',
 'ahoo',
 'officemax',
 'thrach',
 'fiskerjenten',
 'düsseldorff',
 'fosfatado',
 'nechl',
 'necho',
 'ne_bras_ka',
 'cosp',
 'steaminj',
 'mlekopitaiushchi',
 'kodachromes',
 'tyrolis',
 'laminoplasty',
 'mcgegor',
 'lakeoff',
 'asthenospheric',
 'robatzek',
 'privileres',
 'yurugi',
 'gmj',
 'gmn',
 'gmb',
 'uppon',
 'cuddapan',
 'terrerium',
 'ninnigret',
 'mutilados',
 'strandtmann',
 'wäscherei',
 'hilalcement',
 'собраны',
 'graspers',
 'gesetzgebungen',
 'benandri',
 'masonthe',
 'rusticity',
 'sencoes',
 'maineman',
 'enemecio',
 'foldwer',
 'brimsby',
 'convience',
 'triunfalel',
 'marshmellow',
 'mtji',
 'ebraeorvm',
 'stwrt',
 'leptandrin',
 'samilkameenensis',
 'zaander',
 'unple',
 'rrelelv',
 'melothian',
 'rorich',
 'deprllcls',
 'cwky',
 'awesomely',
 'accommodationist',
 'dilena',
 'silkrree',
 'intrabead',
 'armymedicalmuseumcollectionlogbooksprovisionalpathologicalseries',
 'padoukholz',
 'lienholders',
 'beethom',
 'organian',
 'legary',
 'rofewors',
 'roseneid',
 'subpena',
 'nwhieh',
 'mcmadaw',
 'iyula',
 'chengwen许成文',
 'dhamala',
 'нагорный',
 'fulvigula',
 'erntetanz',
 'passmores',
 'gallicarum',
 'yfls',
 'coalcompany',
 'romatic',
 'zobo',
 'yfli',
 'dadle',
 'bsatroop',
 'rcalixar',
 'purumi',
 'aescorting',
 'audiovisuelles',
 'brinkac',
 'teritoriul',
 'illdefinite',
 'finanze',
 'teasppon',
 'novopen',
 'ramasseurs',
 'amirdara',
 'mulanax',
 'shulzes',
 'sleying',
 'mclliocls',
 'qugmentation',
 'berggeschichten',
 'imint',
 'stlein',
 'distiehlis',
 'resoarch',
 'albigenses',
 'rajasingha',
 'ungoya',
 'inkunabel',
 'stactites',
 'contillually',
 'evenutally',
 'deformidad',
 'le__',
 'fullchearings',
 'gardnevi',
 'obliquinity',
 'charduar',
 'stiiclic',
 'engelska',
 'jakins',
 'huach',
 'engelskt',
 'descoeudres',
 'barcal',
 'seltlo',
 'generalhead',
 'wasdetermined',
 'stratoii',
 'asentaran',
 'nationagl',
 'raypublican',
 'bindernagel',
 'rudal',
 'venkatachalapathy',
 'angioletti',
 'tamtamspeler',
 'glstn',
 'gansarski',
 'remijius',
 'ilune',
 'conontoxin',
 'lreling',
 'alfrlfr',
 'morrili',
 'snouted',
 'incurabli',
 'deparptment',
 'deviacion',
 'glockebaum',
 'connitlons',
 'potö',
 'forestell',
 'glayd',
 'gevaerlijcke',
 'glays',
 'meresman',
 'januam',
 'other_so_',
 'ongeo',
 'irmscher',
 'clitioiis',
 'rinserrano',
 'hylenaea',
 'yonathan',
 'pinlc',
 'outwith',
 'iinnssppiriartiaotniiostns',
 'arisoptera',
 'ensnare',
 'schallbildung',
 'ranexa',
 'olsenm',
 'duchesnoy',
 'bowlingtown',
 'laumsa',
 'remport',
 'serting',
 'estandarizadas',
 'ivac',
 'depraedation',
 'tiongco',
 'shahnovich',
 'chausses',
 'emisije',
 'peyritsch',
 'kaake',
 'affligeante',
 'brukes',
 'bruker',
 'ontstanding',
 'mpraéso',
 ...]
In [59]:
colltexthaps = {}
colls2 = colls
colls2.append('artstor')

for coll in colls2:
    colltexthaps[coll] = []
    for hap in fds[coll].hapaxes():
        if not hasNumbers(hap):
        #if hap in texthaps:
            colltexthaps[coll].append(hap)
    print(coll, "|", str(len(colltexthaps[coll])))
biodiv | 36351
rumsey | 11564
commonwealth | 17291
georgia | 19225
harvard | 10822
ia | 82685
getty | 8896
kentucky | 10656
minnesota | 14554
missouri | 17175
mwdl | 161243
nara | 42642
nocar | 30677
smiths | 96110
socar | 18369
texas | 23045
gpo | 27582
illinois | 19754
usc | 70684
virginia | 6878
nocoll | 995
artstor | 392117