import urllib.request
federalistURL = "http://www.gutenberg.org/cache/epub/1404/pg1404.txt"
federalistString = urllib.request.urlopen( federalistURL ).read().decode()

import os
directory = "data"
if not os.path.exists( directory ):
    os.makedirs( directory )

fic = open( "data/federalist.txt", "w" )
fic.write( federalistString )
fic.close()

fic = open( "data/federalist.txt", "r" )
federalistString = fic.read()
fic.close()
print( federalistString[ :200 ] )

# Strip the file header and footer
startIndex = federalistString.find( "FEDERALIST No." )
endIndex = federalistString.find( "End of the Project Gutenberg EBook of The Federalist Papers" )
federalistStringNoHeaderFooter = federalistString[ startIndex : endIndex ]

# Divide into 85 separate files
papersList = federalistStringNoHeaderFooter.split( "FEDERALIST No.", 85 )

# Since split() removes the separator, let's restore it to each paper by hand in case we end up
# using it sometime.
papersList = [ "FEDERALIST No." + paper for paper in papersList ]

# And now, save the files. Remember that the first entry in papersList is a dummy that we need to
# ignore, hence the slice in the for loop.
currentPaper = 1
for paper in papersList[ 1: ]:
    currentPaperFileName = "data/federalist_{0}.txt".format( currentPaper )
    fic = open( currentPaperFileName, "w" )
    fic.write( papersList[ currentPaper ] )
    fic.close()
    currentPaper += 1

# A function that concatenates a list of text files into a single string
def read_files_into_string( fileList ):
    theString = ""
    for eachFile in fileList:
        fic = open( "data/federalist_{0}.txt".format( eachFile ), "r" )
        theString += fic.read()
        fic.close()
    return theString

# Define the lists of papers in the sub-corpora
madisonPapersList = [ 10, 14, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 62, 63 ]
hamiltonPapersList = [ 1, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 21, 22, 23, 24, 25, 26, 27, 28, 29,
                       30, 31, 32, 33, 34, 35, 36, 59, 60, 61, 65, 66, 67, 68, 69, 70, 71, 72, 73,
                       74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85 ]
jayPapersList = [ 2, 3, 4, 5 ]
disputedPapersList = [ 18, 19, 20, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 ]
testCaseList = [ 64 ]

# Make a dictionary out of the sub-corpora
federalistByAuthor = dict()
federalistByAuthor[ "Madison" ] = read_files_into_string( madisonPapersList )
federalistByAuthor[ "Hamilton" ] = read_files_into_string( hamiltonPapersList )
federalistByAuthor[ "Jay" ] = read_files_into_string( jayPapersList )
federalistByAuthor[ "Disputed" ] = read_files_into_string( disputedPapersList )
federalistByAuthor[ "TestCase" ] = read_files_into_string( testCaseList )

# Setup procedure
import nltk
%matplotlib inline
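# Note: nltk.word_tokenize, the stopword list, the WordNet lemmatizer and nltk.pos_tag used
# below rely on NLTK data packages that may not be installed yet. A one-time setup sketch;
# the exact package names can vary slightly between NLTK versions:
nltk.download( "punkt" )
nltk.download( "stopwords" )
nltk.download( "wordnet" )
nltk.download( "averaged_perceptron_tagger" )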
# Tokenize the sub-corpora. We tokenize Jay's texts right away, even though we don't consider
# them at this point, because they'll be useful later on.
federalistByAuthorTokens = dict()
federalistByAuthorLengthDistributions = dict()
for subcorpus in [ "Hamilton", "Madison", "Disputed", "Jay" ]:
    tokens = nltk.word_tokenize( federalistByAuthor[ subcorpus ] )

    # Filter out punctuation
    federalistByAuthorTokens[ subcorpus ] = [ token.lower() for token in tokens \
                                              if any( c.isalpha() for c in token ) ]

    # Get a distribution of token lengths
    tokenLengths = [ len( token ) for token in federalistByAuthorTokens[ subcorpus ] ]
    federalistByAuthorLengthDistributions[ subcorpus ] = nltk.FreqDist( tokenLengths )
    federalistByAuthorLengthDistributions[ subcorpus ].plot( 15, title = subcorpus )

federalistByAuthorTokenDistributions = dict()
for subcorpus in [ "Hamilton", "Madison", "Disputed" ]:
    federalistByAuthorTokenDistributions[ subcorpus ] = nltk.FreqDist( federalistByAuthorTokens[ subcorpus ] )
    print( "Favourite words for", subcorpus, ":", \
           federalistByAuthorTokenDistributions[ subcorpus ].most_common( 10 ), "\n" )

federalistByAuthorText = dict()
for subcorpus in [ "Hamilton", "Madison", "Disputed" ]:
    federalistByAuthorText[ subcorpus ] = nltk.Text( federalistByAuthorTokens[ subcorpus ] )
    print( "Favourite bigrams for", subcorpus, ":\n" )
    federalistByAuthorText[ subcorpus ].collocations( 20 )
    print( "\n" )

federalistByAuthorTrigrams = dict()
for subcorpus in [ "Hamilton", "Madison", "Disputed" ]:
    federalistByAuthorTrigrams[ subcorpus ] = \
        list( nltk.ngrams( federalistByAuthorTokens[ subcorpus ], 3 ) )
    print( "Favourite trigrams for", subcorpus, ":\n" )
    trigramDist = nltk.FreqDist( federalistByAuthorTrigrams[ subcorpus ] )
    print( trigramDist.most_common( 10 ), "\n\n" )

# Create the data structures
federalistByAuthorContentTokens = dict()
federalistByAuthorContentFreqDist = dict()

# Setup for filtering and lemmatizing
stopwords = nltk.corpus.stopwords.words( "english" )
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

# Build lists of content-word lemmas and plot their distributions
for subcorpus in [ "Hamilton", "Madison", "Disputed" ]:
    federalistByAuthorContentTokens[ subcorpus ] = [ wnl.lemmatize( token ) \
        for token in federalistByAuthorTokens[ subcorpus ] \
        if token not in stopwords ]
    federalistByAuthorContentFreqDist[ subcorpus ] = \
        nltk.FreqDist( federalistByAuthorContentTokens[ subcorpus ] )
    federalistByAuthorContentFreqDist[ subcorpus ].plot( 20, title = subcorpus )
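# A quick illustrative check of the two preprocessing steps above (the sample tokens are made
# up for demonstration, not drawn from the corpus): the punctuation filter keeps any token that
# contains at least one letter, and the lemmatizer collapses inflected forms to a base form.
sampleTokens = [ "states", ",", "powers", "1787", "self-government" ]
print( [ token for token in sampleTokens if any( c.isalpha() for c in token ) ] )
print( [ wnl.lemmatize( token ) for token in [ "states", "powers", "governments" ] ] )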
# How many of the 50 most frequent words in the disputed papers are also among the top 50 in
# Hamilton's own? And in Madison's?
hamiltonTop50 = [ word for ( word, freq ) \
                  in federalistByAuthorContentFreqDist[ "Hamilton" ].most_common( 50 ) ]
madisonTop50 = [ word for ( word, freq ) \
                 in federalistByAuthorContentFreqDist[ "Madison" ].most_common( 50 ) ]
disputedTop50 = [ word for ( word, freq ) \
                  in federalistByAuthorContentFreqDist[ "Disputed" ].most_common( 50 ) ]

hamiltonHowMany = len( [ word for word in hamiltonTop50 if word in disputedTop50 ] )
madisonHowMany = len( [ word for word in madisonTop50 if word in disputedTop50 ] )
print( "Of Hamilton's top 50 words, {0} also appear among the disputed papers' top 50.".format( hamiltonHowMany ) )
print( "Of Madison's top 50 words, {0} also appear among the disputed papers' top 50.".format( madisonHowMany ) )

# A little helper function that calculates the distance between the positions of words in list1
# and the positions of the same words in list2; if a word isn't in list2 at all, assign a
# large distance
def calc_distances_between_lists( list1, list2 ):
    dist = 0
    for word in list1:
        if word in list2:
            dist += abs( list1.index( word ) - list2.index( word ) )
        else:
            dist += 50  # If the words don't match, they are far, far away
    return dist

print( "Hamilton's distance score:", calc_distances_between_lists( hamiltonTop50, disputedTop50 ) )
print( "Madison's distance score:", calc_distances_between_lists( madisonTop50, disputedTop50 ) )
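# A tiny illustrative check of the helper with made-up word lists (not from the corpus):
# "government" moves by 1 position, "state" is missing from the second list (+50), and "power"
# moves by 2 positions, for a total distance of 53.
print( calc_distances_between_lists( [ "government", "state", "power" ],
                                      [ "power", "government", "union" ] ) )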
for candidate in [ "Hamilton", "Madison" ]:

    # First, build a joint corpus and identify the most frequent words in it.
    # We keep the stopwords, since they are commonly used in authorship attribution studies.
    jointCorpus = federalistByAuthorTokens[ candidate ] + federalistByAuthorTokens[ "Disputed" ]
    jointFreqDist = nltk.FreqDist( jointCorpus )
    mostCommonInJointCorpus = list( jointFreqDist.most_common( 500 ) )

    # What proportion of the joint corpus is made up of the candidate corpus' tokens?
    candidateShareInJointCorpus = len( federalistByAuthorTokens[ candidate ] ) / len( jointCorpus )

    # Now, for each of these 500 words, compare the number of times it is actually observed in
    # the candidate and disputed corpora to what would be expected if both corpora were random
    # samples from the same distribution.
    chisquared = 0
    for word, jointCount in mostCommonInJointCorpus:

        # How often do we really see it?
        candidateCount = federalistByAuthorTokens[ candidate ].count( word )
        disputedCount = federalistByAuthorTokens[ "Disputed" ].count( word )

        # How often should we see it?
        expCandidateCount = jointCount * candidateShareInJointCorpus
        expDisputedCount = jointCount * ( 1 - candidateShareInJointCorpus )

        # Add the word's contribution to the chi-squared statistic
        chisquared += ( candidateCount - expCandidateCount ) * \
                      ( candidateCount - expCandidateCount ) / expCandidateCount
        chisquared += ( disputedCount - expDisputedCount ) * \
                      ( disputedCount - expDisputedCount ) / expDisputedCount

    print( "The chi-squared statistic for candidate", candidate, "is", chisquared )
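# A minimal worked example of a single word's contribution to the statistic (the numbers and the
# example* names are illustrative only, not taken from the corpus): suppose a word occurs 30
# times in the candidate corpus and 10 times in the disputed corpus (joint count 40), and the
# candidate corpus makes up 60% of the joint corpus.
exampleJointCount = 40
exampleCandidateShare = 0.6
exampleObserved = ( 30, 10 )
exampleExpected = ( exampleJointCount * exampleCandidateShare,
                    exampleJointCount * ( 1 - exampleCandidateShare ) )
exampleContribution = sum( ( observed - expected ) ** 2 / expected
                           for observed, expected in zip( exampleObserved, exampleExpected ) )
print( exampleContribution )  # (30-24)^2/24 + (10-16)^2/16 = 3.75; smaller totals mean closer corpora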
candidateList = [ "Hamilton", "Madison", "Jay", "Disputed" ]
federalistByAuthorPOS = dict()
for candidate in candidateList:
    federalistByAuthorPOS[ candidate ] = nltk.pos_tag( federalistByAuthorTokens[ candidate ] )
    print( federalistByAuthorPOS[ candidate ][ :10 ] )

# Combine into a single corpus
wholeCorpusPOS = []
for candidate in candidateList:
    wholeCorpusPOS += federalistByAuthorPOS[ candidate ]

# Get a frequency distribution
wholeCorpusPOSFreqsTop30 = list( nltk.FreqDist( wholeCorpusPOS ).most_common( 30 ) )
wholeCorpusPOSFreqsTop30[ :10 ]

featuresList = [ wordpospair for ( wordpospair, freq ) in wholeCorpusPOSFreqsTop30 ]
featuresList[ :10 ]

# The main data structure
featureFrequencies = dict()

for candidate in candidateList:

    # A dictionary for each candidate's features
    featureFrequencies[ candidate ] = dict()

    # A helper value containing the number of (token, pos) pairs in the subcorpus
    overall = len( federalistByAuthorPOS[ candidate ] )

    # Calculate each feature's presence in the subcorpus
    for feature in featuresList:
        presence = federalistByAuthorPOS[ candidate ].count( feature )
        featureFrequencies[ candidate ][ feature ] = presence / overall

import math

# The data structure into which we will be storing the "corpus standard" statistics
corpusFeatures = dict()

# For each feature...
for feature in featuresList:

    # Create a sub-dictionary that will contain the feature's mean and standard deviation
    corpusFeatures[ feature ] = dict()

    # Calculate the mean of the frequencies expressed in the subcorpora
    featureAverage = 0
    for candidate in candidateList:
        featureAverage += featureFrequencies[ candidate ][ feature ]
    featureAverage /= len( candidateList )
    corpusFeatures[ feature ][ "Mean" ] = featureAverage

    # Calculate the standard deviation using the basic formula for a sample
    featureStdDev = 0
    for candidate in candidateList:
        diff = featureFrequencies[ candidate ][ feature ] - corpusFeatures[ feature ][ "Mean" ]
        featureStdDev += ( diff * diff )
    featureStdDev /= ( len( candidateList ) - 1 )
    featureStdDev = math.sqrt( featureStdDev )
    corpusFeatures[ feature ][ "StdDev" ] = featureStdDev

featureZScores = dict()
for candidate in candidateList:
    featureZScores[ candidate ] = dict()
    for feature in featuresList:

        # Z-score definition: (value - mean) / stddev
        # We use intermediate variables to make the code easier to read
        featureVal = featureFrequencies[ candidate ][ feature ]
        featureMean = corpusFeatures[ feature ][ "Mean" ]
        featureStdDev = corpusFeatures[ feature ][ "StdDev" ]
        featureZScores[ candidate ][ feature ] = ( featureVal - featureMean ) / featureStdDev

# Tokenize the test case
testCaseTokens = nltk.word_tokenize( federalistByAuthor[ "TestCase" ] )

# Filter out punctuation
testCaseTokens = [ token.lower() for token in testCaseTokens \
                   if any( c.isalpha() for c in token ) ]

# Tag the test case for parts of speech
testCaseTokensPOS = nltk.pos_tag( testCaseTokens )

# Calculate the test case's features
overall = len( testCaseTokensPOS )
testCaseFeatureFrequencies = dict()
for feature in featuresList:
    presence = testCaseTokensPOS.count( feature )
    testCaseFeatureFrequencies[ feature ] = presence / overall

# Calculate the test case's feature z-scores
testCaseZScores = dict()
for feature in featuresList:
    featureVal = testCaseFeatureFrequencies[ feature ]
    featureMean = corpusFeatures[ feature ][ "Mean" ]
    featureStdDev = corpusFeatures[ feature ][ "StdDev" ]
    testCaseZScores[ feature ] = ( featureVal - featureMean ) / featureStdDev
    print( "Test case z-score for feature", feature, "is", testCaseZScores[ feature ] )

for candidate in candidateList:
    delta = 0
    for feature in featuresList:
        delta += math.fabs( testCaseZScores[ feature ] - featureZScores[ candidate ][ feature ] )
    delta /= len( featuresList )
    print( "Delta score for candidate", candidate, "is", delta )
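# A convenience sketch, not part of the procedure above: collecting the same Delta scores in a
# dictionary makes it easy to read off the stylistically closest candidate, since the smallest
# Delta indicates the closest match. The deltaScores name is introduced here for illustration.
deltaScores = dict()
for candidate in candidateList:
    deltaScores[ candidate ] = sum( math.fabs( testCaseZScores[ feature ] -
                                               featureZScores[ candidate ][ feature ] )
                                    for feature in featuresList ) / len( featuresList )
print( "Smallest Delta, i.e. the closest stylistic match:", min( deltaScores, key = deltaScores.get ) )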