import pandas as pd
import json
import os
import urllib
import urllib2
import numpy as np
import unicodedata
from myalchemy import MyAlchemy
#Since we only get a limited number of Alchemy API calls, we avoid calling
#Alchemy on duplicate posts shared between subreddits. We'll merge these back in later.
#df = pd.read_csv('Data/full.csv', encoding='utf-8')
#print "Original size of data set is", len(df)
#df = df.drop_duplicates('id') # Keep only unique post ids so we don't waste Alchemy calls
#print "Size of data set with only unique posts is", len(df)
#subs = list(df['subreddit'].unique())
#dflen = len(df)
#df['alchemy'] = ['null']*dflen
If you are re-running the program, comment out the cell block above and simply use the cell block below instead.
#df = pd.read_csv('Data/uniqueentries.csv', encoding='utf-8')
file_dir = "Data/combinedcomments/"
path, dirs, files = next(os.walk(file_dir))
csvfiles = [file_dir + i for i in files if i.endswith('.csv')] #Builds the list of comment .csv files
csvfiles.sort()
#csvfiles
def check_null(x):
    '''Return True if x is a usable value such as a string,
    False if it is NaN or otherwise numeric.'''
    try:
        np.isnan(x)       # succeeds for numeric values, including NaN
        return False
    except TypeError:     # strings and other non-numeric types raise TypeError
        return True
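As a quick illustrative check of the helper (inspection lines only, in the spirit of the commented-out prints elsewhere in this notebook):
#print check_null(np.nan)        # False: NaN comment rows get filtered out
#print check_null(u"a comment")  # True: real comment strings are kept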
def alchemy_comments(df, start, apikey, csvfiles, end=None):
    '''
    Runs the Alchemy keyword analysis on the comment files for the posts at
    df.index[start:end] and stores the results in df['alchemy'].
    '''
    if end is None:   # evaluated at call time; a len(df) default would be fixed at definition time
        end = len(df)
    p = MyAlchemy(apikey)
    dfids = list(df.index)
    for i in range(start, end):
        subrequest = df['subreddit'][dfids[i]]
        commentfile = ''
        for comb in csvfiles:
            if subrequest in comb:
                commentfile = comb
        commentdf = pd.read_csv(commentfile, encoding='utf-8')
        commentdf = commentdf.drop('type', 1)
        commentdf = commentdf.drop_duplicates()
        commentdf = commentdf[commentdf['comment'].apply(check_null)]
        commentdf['comment'] = commentdf['comment'].apply(lambda x: unicodedata.normalize('NFKD', x).encode('ascii', 'ignore'))
        commentdf = commentdf[commentdf['post'] == df['id'][dfids[i]]]
        comments = list(commentdf['comment'])
        # If we want to add the title to the alchemy call
        comments.append(df['title'][dfids[i]])
        # If we want to add the self text to the alchemy call
        if check_null(df['selftext'][dfids[i]]):
            comments.append(df['selftext'][dfids[i]])
        # Both joining the comments and sending alchemy calls can be problematic
        try:
            comments = ' '.join(comments)
            if len(comments) > 8000:   # keep the document under Alchemy's size limit
                comments = comments[:8000]
        except Exception:
            print "Comment join error", comments
        # I'm not sure what Alchemy does once you reach the call cap, so you
        # may want to validate the response before storing it here.
        try:
            df['alchemy'][dfids[i]] = p.run_method(comments, 'keywords', {'keywordExtractMode':'strict'})
        except Exception:
            print "Alchemy error", df['id'][dfids[i]]
    return df
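Here is a minimal sketch of the response check mentioned above. It assumes run_method returns AlchemyAPI's usual payload, either parsed JSON (a dict with a 'status' field of 'OK' or 'ERROR') or the raw JSON text; alchemy_ok is a hypothetical helper, not part of MyAlchemy, so adjust it to whatever run_method actually returns.
def alchemy_ok(response):
    '''Sketch: guess whether an Alchemy response is usable (assumption:
    standard AlchemyAPI 'status' field convention).'''
    if response is None:
        return False
    if isinstance(response, dict):         # parsed JSON response
        return response.get('status') == 'OK'
    if isinstance(response, basestring):   # raw JSON text (Python 2)
        return u'ERROR' not in response
    return True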
This section calls the Alchemy keyword extractor for each post in the unique data set.
#Alchemy keys
apikey1 = "dcac82649daaa2627ee783b25779cfaed4af0067" #Jay's key
apikey2 = "e945cef59338f9e8e7bc962badde170e623fb7e5" #Basti's key
apikey3 = "cb736ca44e57cd6764b70ec86886f4fce8f6a68d" #Serguei's key
Each Alchemy key only works for 1,000 calls, so this part must be split among multiple people over many days.
#df = alchemy_comments(df, 25000, apikey2, csvfiles, end=25992)
#df.to_csv('Data/uniqueentries.csv', index=False, encoding='utf-8')
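One way to split the work (a sketch only; the index ranges and key assignments below are illustrative, not the batches we actually ran):
#batches = [(0, 1000, apikey1), (1000, 2000, apikey2), (2000, 3000, apikey3)]
#for s, e, key in batches:
#    df = alchemy_comments(df, s, key, csvfiles, end=e)
#    df.to_csv('Data/uniqueentries.csv', index=False, encoding='utf-8') #Checkpoint after each batch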
Now, we need to merge the unique data set into the full data set.
fulldf = pd.read_csv('Data/full.csv', encoding='utf-8')
fulldf['alchemy'] = ['null']*len(fulldf)
for i in fulldf.index:
    fid = fulldf['id'][i]
    # Grab the scalar value, not a one-element Series
    alc = df[df['id'] == fid]['alchemy'].values[0]
    fulldf['alchemy'][i] = alc
fulldf.to_csv('Data/full.csv', index=False, encoding='utf-8')
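For reference, the same merge can be done in a single vectorized pass (a sketch, equivalent to the loop above since df has one row per id):
#fulldf = fulldf.drop('alchemy', 1).merge(df[['id', 'alchemy']], on='id', how='left')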
#Other Alchemy methods that MyAlchemy supports, kept here for reference:
#print p.run_method(comments, 'concepts')
#print p.run_method(comments, 'keywords', {'keywordExtractMode':'strict'})
#print p.run_method(comments, 'category')
#print p.run_method(comments, 'sentiment')
#print p.run_method(comments, 'entities')
#print p.run_method(reddit_base, 'urlkeywords')