import urllib.request
federalistURL = "http://www.gutenberg.org/cache/epub/1404/pg1404.txt"
federalistString = urllib.request.urlopen( federalistURL ).read().decode()

import os
directory = "data"
if not os.path.exists( directory ):
    os.makedirs( directory )

fic = open( "data/federalist.txt", "w" )
fic.write( federalistString )
fic.close()

fic = open( "data/federalist.txt", "r" )
federalistString = fic.read()
fic.close()
print( federalistString[ :200 ] )

# Strip the file header and footer
startIndex = federalistString.find( "FEDERALIST No." )
endIndex = federalistString.find( "End of the Project Gutenberg EBook of The Federalist Papers" )
federalistStringNoHeaderFooter = federalistString[ startIndex : endIndex ]

# Divide into 85 separate files
papersList = federalistStringNoHeaderFooter.split( "FEDERALIST No.", 85 )

# Since split() removes the separator, let's restore it to each paper by hand in case we end up
# using it sometime.
papersList = [ "FEDERALIST No." + paper for paper in papersList ]

# And now, save the files. Remember that the first entry in papersList is a dummy that we need to
# ignore, hence the slice in the for loop.
currentPaper = 1
for paper in papersList[ 1: ]:
    currentPaperFileName = "data/federalist_{0}.txt".format( currentPaper )
    fic = open( currentPaperFileName, "w" )
    fic.write( papersList[ currentPaper ] )
    fic.close()
    currentPaper += 1

# A function that concatenates a list of text files into a single string
def read_files_into_string( fileList ):
    theString = ""
    for eachFile in fileList:
        fic = open( "data/federalist_{0}.txt".format( eachFile ), "r" )
        theString += fic.read()
        fic.close()
    return theString

# Define the lists of papers in the sub-corpora
madisonPapersList = [ 10, 14, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 62, 63 ]
hamiltonPapersList = [ 1, 6, 7, 8, 9, 11, 12, 13, 15, 16, 17, 21, 22, 23, 24, 25, 26, 27, 28, 29,
                       30, 31, 32, 33, 34, 35, 36, 59, 60, 61, 65, 66, 67, 68, 69, 70, 71, 72, 73,
                       74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85 ]
jayPapersList = [ 2, 3, 4, 5 ]
disputedPapersList = [ 18, 19, 20, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58 ]
testCaseList = [ 64 ]

# Make a dictionary out of the sub-corpora
federalistByAuthor = dict()
federalistByAuthor[ "Madison" ] = read_files_into_string( madisonPapersList )
federalistByAuthor[ "Hamilton" ] = read_files_into_string( hamiltonPapersList )
federalistByAuthor[ "Jay" ] = read_files_into_string( jayPapersList )
federalistByAuthor[ "Disputed" ] = read_files_into_string( disputedPapersList )
federalistByAuthor[ "TestCase" ] = read_files_into_string( testCaseList )

# Setup procedure
import nltk
%matplotlib inline
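# Note: nltk.word_tokenize, the stopword list, the WordNet lemmatizer and nltk.pos_tag used
# below rely on NLTK data packages that may not be installed yet. A one-time setup sketch;
# the exact package names can vary slightly between NLTK versions:
nltk.download( "punkt" )
nltk.download( "stopwords" )
nltk.download( "wordnet" )
nltk.download( "averaged_perceptron_tagger" )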
# Tokenize the sub-corpora. We tokenize Jay's texts right away, even though we don't consider
# them at this point, because they'll be useful later on.
federalistByAuthorTokens = dict()
federalistByAuthorLengthDistributions = dict()
for subcorpus in [ "Hamilton", "Madison", "Disputed", "Jay" ]:
    tokens = nltk.word_tokenize( federalistByAuthor[ subcorpus ] )

    # Filter out punctuation
    federalistByAuthorTokens[ subcorpus ] = [ token.lower() for token in tokens \
                                              if any( c.isalpha() for c in token ) ]

    # Get a distribution of token lengths
    tokenLengths = [ len( token ) for token in federalistByAuthorTokens[ subcorpus ] ]
    federalistByAuthorLengthDistributions[ subcorpus ] = nltk.FreqDist( tokenLengths )
    federalistByAuthorLengthDistributions[ subcorpus ].plot( 15, title = subcorpus )

federalistByAuthorTokenDistributions = dict()
for subcorpus in [ "Hamilton", "Madison", "Disputed" ]:
    federalistByAuthorTokenDistributions[ subcorpus ] = nltk.FreqDist( federalistByAuthorTokens[ subcorpus ] )
    print( "Favourite words for", subcorpus, ":", \
           federalistByAuthorTokenDistributions[ subcorpus ].most_common( 10 ), "\n" )

federalistByAuthorText = dict()
for subcorpus in [ "Hamilton", "Madison", "Disputed" ]:
    federalistByAuthorText[ subcorpus ] = nltk.Text( federalistByAuthorTokens[ subcorpus ] )
    print( "Favourite bigrams for", subcorpus, ":\n" )
    federalistByAuthorText[ subcorpus ].collocations( 20 )
    print( "\n" )

federalistByAuthorTrigrams = dict()
for subcorpus in [ "Hamilton", "Madison", "Disputed" ]:
    federalistByAuthorTrigrams[ subcorpus ] = \
        list( nltk.ngrams( federalistByAuthorTokens[ subcorpus ], 3 ) )
    print( "Favourite trigrams for", subcorpus, ":\n" )
    trigramDist = nltk.FreqDist( federalistByAuthorTrigrams[ subcorpus ] )
    print( trigramDist.most_common( 10 ), "\n\n" )

# Create the data structures
federalistByAuthorContentTokens = dict()
federalistByAuthorContentFreqDist = dict()

# Setup for filtering and lemmatizing
stopwords = nltk.corpus.stopwords.words( "english" )
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()

# Build lists of content-word lemmas and plot their distributions
for subcorpus in [ "Hamilton", "Madison", "Disputed" ]:
    federalistByAuthorContentTokens[ subcorpus ] = [ wnl.lemmatize( token ) \
        for token in federalistByAuthorTokens[ subcorpus ] \
        if token not in stopwords ]
    federalistByAuthorContentFreqDist[ subcorpus ] = \
        nltk.FreqDist( federalistByAuthorContentTokens[ subcorpus ] )
    federalistByAuthorContentFreqDist[ subcorpus ].plot( 20, title = subcorpus )
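# A quick illustrative check of the two preprocessing steps above (the sample tokens are made
# up for demonstration, not drawn from the corpus): the punctuation filter keeps any token that
# contains at least one letter, and the lemmatizer collapses inflected forms to a base form.
sampleTokens = [ "states", ",", "powers", "1787", "self-government" ]
print( [ token for token in sampleTokens if any( c.isalpha() for c in token ) ] )
print( [ wnl.lemmatize( token ) for token in [ "states", "powers", "governments" ] ] )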
# How many of the 50 most frequent words in the disputed papers are also among the top 50 in
# Hamilton's own? And in Madison's?
hamiltonTop50 = [ word for ( word, freq ) \
                  in federalistByAuthorContentFreqDist[ "Hamilton" ].most_common( 50 ) ]
madisonTop50 = [ word for ( word, freq ) \
                 in federalistByAuthorContentFreqDist[ "Madison" ].most_common( 50 ) ]
disputedTop50 = [ word for ( word, freq ) \
                  in federalistByAuthorContentFreqDist[ "Disputed" ].most_common( 50 ) ]

hamiltonHowMany = len( [ word for word in hamiltonTop50 if word in disputedTop50 ] )
madisonHowMany = len( [ word for word in madisonTop50 if word in disputedTop50 ] )
print( "Of Hamilton's top 50 words, {0} also appear among the disputed papers' top 50.".format( hamiltonHowMany ) )
print( "Of Madison's top 50 words, {0} also appear among the disputed papers' top 50.".format( madisonHowMany ) )

# A little helper function that calculates the distance between the positions of words in list1
# and the positions of the same words in list2; if a word isn't in list2 at all, assign a
# large distance
def calc_distances_between_lists( list1, list2 ):
    dist = 0
    for word in list1:
        if word in list2:
            dist += abs( list1.index( word ) - list2.index( word ) )
        else:
            dist += 50  # If the words don't match, they are far, far away
    return dist

print( "Hamilton's distance score:", calc_distances_between_lists( hamiltonTop50, disputedTop50 ) )
print( "Madison's distance score:", calc_distances_between_lists( madisonTop50, disputedTop50 ) )
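# A tiny illustrative check of the helper with made-up word lists (not from the corpus):
# "government" moves by 1 position, "state" is missing from the second list (+50), and "power"
# moves by 2 positions, for a total distance of 53.
print( calc_distances_between_lists( [ "government", "state", "power" ],
                                      [ "power", "government", "union" ] ) )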
for candidate in [ "Hamilton", "Madison" ]:

    # First, build a joint corpus and identify the most frequent words in it.
    # We keep the stopwords, since they are commonly used in authorship attribution studies.
    jointCorpus = federalistByAuthorTokens[ candidate ] + federalistByAuthorTokens[ "Disputed" ]
    jointFreqDist = nltk.FreqDist( jointCorpus )
    mostCommonInJointCorpus = list( jointFreqDist.most_common( 500 ) )

    # What proportion of the joint corpus is made up of the candidate corpus' tokens?
    candidateShareInJointCorpus = len( federalistByAuthorTokens[ candidate ] ) / len( jointCorpus )

    # Now, for each of these 500 words, compare the number of times it is actually observed in
    # the candidate and disputed corpora to what would be expected if both corpora were random
    # samples from the same distribution.
    chisquared = 0
    for word, jointCount in mostCommonInJointCorpus:

        # How often do we really see it?
        candidateCount = federalistByAuthorTokens[ candidate ].count( word )
        disputedCount = federalistByAuthorTokens[ "Disputed" ].count( word )

        # How often should we see it?
        expCandidateCount = jointCount * candidateShareInJointCorpus
        expDisputedCount = jointCount * ( 1 - candidateShareInJointCorpus )

        # Add the word's contribution to the chi-squared statistic
        chisquared += ( candidateCount - expCandidateCount ) * \
                      ( candidateCount - expCandidateCount ) / expCandidateCount
        chisquared += ( disputedCount - expDisputedCount ) * \
                      ( disputedCount - expDisputedCount ) / expDisputedCount

    print( "The chi-squared statistic for candidate", candidate, "is", chisquared )
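# A minimal worked example of a single word's contribution to the statistic (the numbers and the
# example* names are illustrative only, not taken from the corpus): suppose a word occurs 30
# times in the candidate corpus and 10 times in the disputed corpus (joint count 40), and the
# candidate corpus makes up 60% of the joint corpus.
exampleJointCount = 40
exampleCandidateShare = 0.6
exampleObserved = ( 30, 10 )
exampleExpected = ( exampleJointCount * exampleCandidateShare,
                    exampleJointCount * ( 1 - exampleCandidateShare ) )
exampleContribution = sum( ( observed - expected ) ** 2 / expected
                           for observed, expected in zip( exampleObserved, exampleExpected ) )
print( exampleContribution )  # (30-24)^2/24 + (10-16)^2/16 = 3.75; smaller totals mean closer corpora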
candidateList = [ "Hamilton", "Madison", "Jay", "Disputed" ]
federalistByAuthorPOS = dict()
for candidate in candidateList:
    federalistByAuthorPOS[ candidate ] = nltk.pos_tag( federalistByAuthorTokens[ candidate ] )
    print( federalistByAuthorPOS[ candidate ][ :10 ] )

# Combine into a single corpus
wholeCorpusPOS = []
for candidate in candidateList:
    wholeCorpusPOS += federalistByAuthorPOS[ candidate ]

# Get a frequency distribution
wholeCorpusPOSFreqsTop30 = list( nltk.FreqDist( wholeCorpusPOS ).most_common( 30 ) )
wholeCorpusPOSFreqsTop30[ :10 ]

featuresList = [ wordpospair for ( wordpospair, freq ) in wholeCorpusPOSFreqsTop30 ]
featuresList[ :10 ]

# The main data structure
featureFrequencies = dict()

for candidate in candidateList:

    # A dictionary for each candidate's features
    featureFrequencies[ candidate ] = dict()

    # A helper value containing the number of (token, pos) pairs in the subcorpus
    overall = len( federalistByAuthorPOS[ candidate ] )

    # Calculate each feature's presence in the subcorpus
    for feature in featuresList:
        presence = federalistByAuthorPOS[ candidate ].count( feature )
        featureFrequencies[ candidate ][ feature ] = presence / overall

import math

# The data structure into which we will be storing the "corpus standard" statistics
corpusFeatures = dict()

# For each feature...
for feature in featuresList:

    # Create a sub-dictionary that will contain the feature's mean and standard deviation
    corpusFeatures[ feature ] = dict()

    # Calculate the mean of the frequencies expressed in the subcorpora
    featureAverage = 0
    for candidate in candidateList:
        featureAverage += featureFrequencies[ candidate ][ feature ]
    featureAverage /= len( candidateList )
    corpusFeatures[ feature ][ "Mean" ] = featureAverage

    # Calculate the standard deviation using the basic formula for a sample
    featureStdDev = 0
    for candidate in candidateList:
        diff = featureFrequencies[ candidate ][ feature ] - corpusFeatures[ feature ][ "Mean" ]
        featureStdDev += ( diff * diff )
    featureStdDev /= ( len( candidateList ) - 1 )
    featureStdDev = math.sqrt( featureStdDev )
    corpusFeatures[ feature ][ "StdDev" ] = featureStdDev

featureZScores = dict()
for candidate in candidateList:
    featureZScores[ candidate ] = dict()
    for feature in featuresList:

        # Z-score definition: (value - mean) / stddev
        # We use intermediate variables to make the code easier to read
        featureVal = featureFrequencies[ candidate ][ feature ]
        featureMean = corpusFeatures[ feature ][ "Mean" ]
        featureStdDev = corpusFeatures[ feature ][ "StdDev" ]
        featureZScores[ candidate ][ feature ] = ( featureVal - featureMean ) / featureStdDev

# Tokenize the test case
testCaseTokens = nltk.word_tokenize( federalistByAuthor[ "TestCase" ] )

# Filter out punctuation
testCaseTokens = [ token.lower() for token in testCaseTokens \
                   if any( c.isalpha() for c in token ) ]

# Tag the test case for parts of speech
testCaseTokensPOS = nltk.pos_tag( testCaseTokens )

# Calculate the test case's features
overall = len( testCaseTokensPOS )
testCaseFeatureFrequencies = dict()
for feature in featuresList:
    presence = testCaseTokensPOS.count( feature )
    testCaseFeatureFrequencies[ feature ] = presence / overall

# Calculate the test case's feature z-scores
testCaseZScores = dict()
for feature in featuresList:
    featureVal = testCaseFeatureFrequencies[ feature ]
    featureMean = corpusFeatures[ feature ][ "Mean" ]
    featureStdDev = corpusFeatures[ feature ][ "StdDev" ]
    testCaseZScores[ feature ] = ( featureVal - featureMean ) / featureStdDev
    print( "Test case z-score for feature", feature, "is", testCaseZScores[ feature ] )

for candidate in candidateList:
    delta = 0
    for feature in featuresList:
        delta += math.fabs( testCaseZScores[ feature ] - featureZScores[ candidate ][ feature ] )
    delta /= len( featuresList )
    print( "Delta score for candidate", candidate, "is", delta )
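# A convenience sketch, not part of the procedure above: collecting the same Delta scores in a
# dictionary makes it easy to read off the stylistically closest candidate, since the smallest
# Delta indicates the closest match. The deltaScores name is introduced here for illustration.
deltaScores = dict()
for candidate in candidateList:
    deltaScores[ candidate ] = sum( math.fabs( testCaseZScores[ feature ] -
                                               featureZScores[ candidate ][ feature ] )
                                    for feature in featuresList ) / len( featuresList )
print( "Smallest Delta, i.e. the closest stylistic match:", min( deltaScores, key = deltaScores.get ) )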