#!/usr/bin/env python
# coding: utf-8

# # Review and Herald Pilot Sample
#
# This notebook documents the process by which I created a pilot sample of the
# Review and Herald. It is a 10% sample of the available Review and Herald
# corpus released on
# [the Adventists Archives](http://documents.adventistarchives.org/Periodicals/Forms/AllItems.aspx?RootFolder=%2fPeriodicals%2fRH&FolderCTID=0x012000DDAC5B94CFBD234AB142FC5C311C732700042C85EA7C1C1A4DB8D75C62A7517A6E)
# as of Spring 2014.

# ## Load local libraries

# The first function reads the file names from a given directory. The second
# function parses the standard file names from the SDA periodicals (created by
# the SDA archives) into a standard CSV format.

import csv
import math
import os
import random
import re
import shutil
import sys
import uuid


# In[3]:

# %load '../lib/list-directory.py'
def get_corpus_list(directory):
    """Return the names of the entries in ``directory``.

    Thin wrapper around ``os.listdir``; the names are returned in whatever
    order ``os.listdir`` reports them (no sorting is applied here).
    """
    return os.listdir(directory)


# In[6]:

# %load '../lib/get_corpus_data_SDA.py'
def get_corpus_data_SDA(directory):
    """Build one metadata row for every PDF found in ``directory``.

    Each file name is split into its leading run of ASCII letters (the
    periodical prefix) and its runs of digits. The first digit run is sliced
    as ``YYYYMMDD``; the second and third digit runs are taken as the volume
    and the issue.

    Args:
        directory: Folder to scan. Only entries whose name ends in ``pdf``
            are considered.

    Returns:
        A list of lists, one per PDF, laid out as
        ``[_id, prefix, year, month, day, volume, issue, path, url]`` where
        every element is a string. ``path`` is ``directory`` joined with the
        file name and ``url`` points at the file on
        documents.adventistarchives.org.

    Raises:
        IndexError: If a PDF name contains fewer than three runs of digits.
    """
    listing = []
    for _id in os.listdir(directory):
        if not _id.endswith('pdf'):
            continue

        # The pattern can match the empty string, so this never returns None.
        prefix = re.match(r'[a-zA-Z]*', _id).group(0)
        numbers = re.findall(r'\d+', _id)

        date = numbers[0]
        year = date[0:4]
        month = date[4:6]
        day = date[6:8]
        volume = numbers[1]
        issue = numbers[2]

        # Derive the path from the directory that was actually scanned rather
        # than from a fixed location, so the function works for any folder.
        path = os.path.join(directory, _id)
        url = "http://documents.adventistarchives.org/Periodicals/" + prefix + "/" + _id

        listing.append([_id, prefix, year, month, day, volume, issue, path, url])
    return listing


# Everything below is the notebook's pipeline. It is guarded so that importing
# the two helpers above does not touch the filesystem; running the file as a
# script executes the cells in order.
if __name__ == "__main__":

    # In[8]:

    RHList = get_corpus_data_SDA('/Users/jeriwieringa/Dissertation/text/corpus-RH/all-pdf/')

    # ## Add Identifiers
    #
    # I experimented with two methods of identifying the different periodicals
    # in the list. The first adds a UUID to each issue in the list of
    # periodicals.

    # In[9]: (import uuid -- now at the top of the file)

    # In[10]:

    for each in RHList:
        each.append(str(uuid.uuid4()))

    # In[11]:

    print(RHList[1])

    # This second method adds a count to the array (called in the resulting
    # CSVs `num_id`) for all of the issues in the list of files. This was
    # useful for quickly seeing how the random library sampled the total
    # corpus.

    # In[12]:

    # num_id starts at 1, so it is always one ahead of the list position.
    for n, each in enumerate(RHList, start=1):
        each.append(n)

    # In[13]:

    print(RHList[1])

    # ## Print Corpus + Identifiers
    #
    # Next, for record keeping, I printed the list of all of the periodicals,
    # with their identifiers, to a CSV file.

    # In[59]:

    headers = ['_id', 'prefix', 'year', 'month', 'day', 'volume', 'issue', 'path', 'url', 'UUID', 'num_id']

    # In[58], In[60], In[61]:

    # The context manager guarantees the rows are flushed and the handle is
    # closed; newline='' lets the csv module control line endings itself.
    with open('20150904-2-corpus-list-RH-sample.csv', 'w', newline='') as fout:
        writer = csv.writer(fout, delimiter=',', quotechar='"')
        writer.writerow(headers)
        writer.writerows(RHList)

    # ## Create the Sample
    #
    # Next, I created a second CSV file to hold the list of sample files.

    # In[57]:

    # NOTE: this cell was a bare open('20150904-corpus-list-RH-sample.csv').
    # It opened the file read-only, never used or closed the handle, and
    # raised FileNotFoundError when the file was absent, so the call is not
    # repeated here. The sample CSV is written at the end of the script.

    # Then, I imported math to calculate the size of the sample, given the
    # total number of periodicals in the corpus.

    # In[62]:

    sampleSize = math.floor(len(RHList)/10)
    print(sampleSize)

    # Following advice from
    # [stackoverflow](http://stackoverflow.com/questions/6482889/get-random-sample-from-list-while-maintaining-ordering-of-items),
    # I used `random.sample` and passed in the full corpus and the sample size
    # calculated above. `random` pulled 382 numbers out of an available field
    # of 3822 (the size of the whole corpus), sorted them, and then added the
    # periodical that corresponds with that list position to a new list. Note
    # that, while the number from the random generator matches the list
    # position, the associated ID in the CSV is one ahead.

    # In[63]:

    # http://stackoverflow.com/questions/6482889/get-random-sample-from-list-while-maintaining-ordering-of-items
    sampled_positions = sorted(random.sample(range(len(RHList)), sampleSize))
    rand_smpl = [RHList[i] for i in sampled_positions]

    # And to confirm that everything worked as anticipated, I printed both one
    # random list item and the length of the sample. Both pointed to a
    # successful sampling.

    # In[64]:

    print(rand_smpl[3])

    # In[65]:

    # A bare expression only displays inside a notebook; print it explicitly.
    print(len(rand_smpl))

    # ## Retrieve the Sample

    # The final step was to use the resulting list of periodicals to separate
    # out the sample corpus. First, the file path was isolated out of the
    # information on each sample member.

    # In[66]:

    # Column 7 of every row is the `path` field (see `headers`).
    fileList = [each[7] for each in rand_smpl]

    # Then, using `shutil`, each file in the sample was copied to a separate
    # directory.

    # In[67]: (import shutil -- now at the top of the file)

    # In[68]:

    for each in fileList:
        # Paths that do not point at an existing file are skipped silently.
        if os.path.isfile(each):
            shutil.copy(each, '../../data-sources/corpus-RH/sample-pdf/')

    # ## Save Sample Data to CSV
    #
    # And finally, for record keeping, the information for the sample was
    # saved into a CSV file.

    # In[69], In[70], In[71], In[72]:

    with open('20150904-sample-list.csv', 'w', newline='') as sampleListOut:
        writer2 = csv.writer(sampleListOut, delimiter=',', quotechar='"')
        writer2.writerow(headers)
        writer2.writerows(rand_smpl)