import pandas as pd
import json
import os
import urllib
import urllib2
import numpy as np
import unicodedata
from myalchemy import MyAlchemy
#Since we only get a limited number of Alchemy API calls, we avoid calling
#Alchemy on duplicate posts shared between subreddits. We'll merge these back in later.
#df = pd.read_csv('Data/full.csv', encoding='utf-8')
#print "Original size of data set is", len(df)
#df = df.drop_duplicates('id') # Keep only unique post ids so we don't waste Alchemy calls
#print "Size of data set with only unique posts is", len(df)
#subs = list(df['subreddit'].unique())
#dflen = len(df)
#df['alchemy'] = ['null']*dflen
If you are re-running the program, comment out the cell block above and simply use the cell block below instead.
#df = pd.read_csv('Data/uniqueentries.csv', encoding='utf-8')
file_dir = "Data/combinedcomments/"
path, dirs, files = next(os.walk(file_dir))
csvfiles = [file_dir + i for i in files if i.endswith('.csv')] #Builds the list of comment .csv files
csvfiles.sort()
#csvfiles
def check_null(x):
    '''Return True if x is a usable value such as a string,
    False if it is NaN or otherwise numeric.'''
    try:
        np.isnan(x)       # succeeds for numeric values, including NaN
        return False
    except TypeError:     # strings and other non-numeric types raise TypeError
        return True
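As a quick illustrative check of the helper (inspection lines only, in the spirit of the commented-out prints elsewhere in this notebook):
#print check_null(np.nan)        # False: NaN comment rows get filtered out
#print check_null(u"a comment")  # True: real comment strings are kept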
def alchemy_comments(df, start, apikey, csvfiles, end=None):
    '''
    Runs the Alchemy keyword analysis on the comment files for the posts at
    df.index[start:end] and stores the results in df['alchemy'].
    '''
    if end is None:   # evaluated at call time; a len(df) default would be fixed at definition time
        end = len(df)
    p = MyAlchemy(apikey)
    dfids = list(df.index)
    for i in range(start, end):
        subrequest = df['subreddit'][dfids[i]]
        commentfile = ''
        for comb in csvfiles:
            if subrequest in comb:
                commentfile = comb
        commentdf = pd.read_csv(commentfile, encoding='utf-8')
        commentdf = commentdf.drop('type', 1)
        commentdf = commentdf.drop_duplicates()
        commentdf = commentdf[commentdf['comment'].apply(check_null)]
        commentdf['comment'] = commentdf['comment'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
        commentdf = commentdf[commentdf['post'] == df['id'][dfids[i]]]
        comments = list(commentdf['comment'])
        # If we want to add the title to the alchemy call
        comments.append(df['title'][dfids[i]])
        # If we want to add the self text to the alchemy call
        if check_null(df['selftext'][dfids[i]]):
            comments.append(df['selftext'][dfids[i]])
        # Both joining the comments and sending alchemy calls can be problematic
        try:
            comments = ' '.join(comments)
            if len(comments) > 8000:   # keep the document under Alchemy's size limit
                comments = comments[:8000]
        except Exception:
            print "Comment join error", comments
        # I'm not sure what Alchemy does once you reach the call cap, so you
        # may want to validate the response before storing it here.
        try:
            df['alchemy'][dfids[i]] = p.run_method(comments, 'keywords', {'keywordExtractMode':'strict'})
        except Exception:
            print "Alchemy error", df['id'][dfids[i]]
    return df
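Here is a minimal sketch of the response check mentioned above. It assumes run_method returns AlchemyAPI's usual payload, either parsed JSON (a dict with a 'status' field of 'OK' or 'ERROR') or the raw JSON text; alchemy_ok is a hypothetical helper, not part of MyAlchemy, so adjust it to whatever run_method actually returns.
def alchemy_ok(response):
    '''Sketch: guess whether an Alchemy response is usable (assumption:
    standard AlchemyAPI 'status' field convention).'''
    if response is None:
        return False
    if isinstance(response, dict):         # parsed JSON response
        return response.get('status') == 'OK'
    if isinstance(response, basestring):   # raw JSON text (Python 2)
        return u'ERROR' not in response
    return True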
This section calls the Alchemy keyword extractor for each post in the unique data set.
#Alchemy keys
apikey1 = "dcac82649daaa2627ee783b25779cfaed4af0067" #Jay's key
apikey2 = "e945cef59338f9e8e7bc962badde170e623fb7e5" #Basti's key
apikey3 = "cb736ca44e57cd6764b70ec86886f4fce8f6a68d" #Serguei's key
Each Alchemy key only works for 1,000 calls, so this part must be split among multiple people over many days.
#df = alchemy_comments(df, 25000, apikey2, csvfiles, end=25992)
#df.to_csv('Data/uniqueentries.csv', index=False, encoding='utf-8')
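One way to split the work (a sketch only; the index ranges and key assignments below are illustrative, not the batches we actually ran):
#batches = [(0, 1000, apikey1), (1000, 2000, apikey2), (2000, 3000, apikey3)]
#for s, e, key in batches:
#    df = alchemy_comments(df, s, key, csvfiles, end=e)
#    df.to_csv('Data/uniqueentries.csv', index=False, encoding='utf-8') #Checkpoint after each batch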
Now, we need to merge the unique data set into the full data set.
fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')
fulldf['alchemy'] = ['null']*len(fulldf)
for i in fulldf.index:
    fid = fulldf['id'][i]
    # Grab the scalar value, not a one-element Series
    alc = df[df['id'] == fid]['alchemy'].values[0]
    fulldf['alchemy'][i] = alc
fulldf.to_csv('Data/full.csv', index=False, encoding='utf-8')
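For reference, the same merge can be done in a single vectorized pass (a sketch, equivalent to the loop above since df has one row per id):
#fulldf = fulldf.drop('alchemy', 1).merge(df[['id', 'alchemy']], on='id', how='left')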
#Other Alchemy methods that MyAlchemy supports, kept here for reference:
#print p.run_method(comments, 'concepts')
#print p.run_method(comments, 'keywords', {'keywordExtractMode':'strict'})
#print p.run_method(comments, 'category')
#print p.run_method(comments, 'sentiment')
#print p.run_method(comments, 'entities')
#print p.run_method(reddit_base, 'urlkeywords')