#load pandas and the parallel client
import pandas as pd
import json
import time
from IPython.parallel import Client
from httplib import HTTPConnection

cred, api_key, ep, url = None, None, None, None
#path = '/Users/gk/Desktop/data_sci'
path = '/home/gkandlikar'

def load_essentials():
    #load credentials, api key, url, end points, etc
    global cred, api_key, url, ep, path
    cred = pd.read_csv(''.join((path, '/credentials/credentials.csv')))
    url = 'api.crunchbase.com'
    ep = ['/v/1/companies.js?', '/v/1/company/']
    with open(''.join((path, '/crunchbase/key')), 'r') as f:
        api_key = f.read().strip()
    return
def gen_s3_obj(credentials, bucket=''):
    from boto.s3.connection import S3Connection
    s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])
    #default to None so callers that pass no bucket still get a valid 2-tuple back
    s3bucket = None
    if bucket != '':
        s3bucket = s3conn.get_bucket(bucket)
    return s3conn, s3bucket
def fetch_and_file_comp_db(key=False):
    if not key:
        #initiate connection with crunchbase
        cb_conn = HTTPConnection(url)
        #make a query for the file, read the response
        query = ''.join((ep[0], 'api_key=', api_key))
        cb_conn.request("GET", query)
        data = cb_conn.getresponse().read()
        #create a key on S3 and set its contents
        s3conn, s3bucket = gen_s3_obj(cred, bucket='kandlikards')
        compfile = s3bucket.new_key('crunchbase/companies.json')
        compfile.set_contents_from_string(data)
        s3conn.close()
        return json.loads(data, strict=False)
    else:
        return json.loads(key.get_contents_as_string(), strict=False)
def fetch_company(company):
    #establish connections
    cb_conn = HTTPConnection(url)
    s3conn, s3bucket = gen_s3_obj(cred, bucket='kandlikards')
    keyname = ''.join(('crunchbase/', company['permalink'], '.json'))
    key = s3bucket.get_key(keyname)
    if (not key) or ('Developer Over Qps' in key.get_contents_as_string()):
        #retrieve file from crunchbase
        #build query: '/v/1/company/company-name.js?api_key=<API_KEY>'
        query = ''.join((ep[1], company['permalink'], '.js?api_key=', api_key))
        #initiate and read connection
        cb_conn.request("GET", query)
        data = cb_conn.getresponse().read()
        #back off and retry while crunchbase says we're over the rate limit
        while 'Developer Over Qps' in data:
            time.sleep(5)
            cb_conn.request("GET", query)
            data = cb_conn.getresponse().read()
        key = s3bucket.new_key(keyname)
        key.set_contents_from_string(data)
    else:
        data = key.get_contents_as_string()
    try:
        json_data = json.loads(data, strict=False)
    except:
        json_data = {'error': 'Load Error during json.loads()'}
    if 'error' in json_data.keys():
        retval = None
    else:
        try:
            retval = (company['permalink'], company['name'], company['category_code'],
                      json_data['overview'], json_data['description'], json_data['tag_list'],
                      None, None)
        except:
            retval = (company['permalink'], company['name'], company['category_code'],
                      None, None, None, 'error', json_data)
    #close connections after we're done!
    cb_conn.close()
    s3conn.close()
    #sleep for roughly 1/6th of a second to stay under the rate limit
    time.sleep(0.16)
    return retval
#get the essentials out of the way
load_essentials()
#load client
client = Client()
directview = client.direct_view()
directview.push(dict(cred=cred, api_key=api_key, ep=ep, url=url, gen_s3_obj=gen_s3_obj))
#make sure the necessary modules are imported across the engines
with directview.sync_imports():
    import time, json
    from boto.s3.connection import S3Connection
    from httplib import HTTPConnection
#initiate connection with S3 and get the bucket
s3conn, s3bucket = gen_s3_obj(cred, bucket='kandlikards')
#check if there is a json file of companies in S3;
#either way, retrieve a json document with the companies
companies_key = 'crunchbase/companies.json'
if not s3bucket.get_key(companies_key):
    companies = fetch_and_file_comp_db()
else:
    companies = fetch_and_file_comp_db(s3bucket.get_key(companies_key))
results = directview.map_async(fetch_company, companies[:5000])
s3conn.close()
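#results is an AsyncMapResult; a minimal sketch of draining it into a
#DataFrame once the run finishes. The column labels are my own shorthand
#for the tuple fetch_company returns, not field names from crunchbase:
rows = [r for r in results.get() if r is not None]
df = pd.DataFrame(rows, columns=['permalink', 'name', 'category_code',
                                 'overview', 'description', 'tag_list',
                                 'err', 'err_data'])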
#connect to companies endpoint
#get information about every single company and file it into companies.db
import json
import httplib
import pprint as pp
import sqlite3
import codecs

API_KEY = open('/Users/gk/Desktop/github/keys/cb', 'r').read().strip()
OUTPUTDIR = '/Users/gk/Desktop/data_sci/crunchbase/data'
URL = 'api.crunchbase.com'
COMPANIES_EL = '/v/1/companies.js?'
COMPANY_EI = '/v/1/company/'
EXCEPTION_FILE = ''.join((OUTPUTDIR, '/exceptions.txt'))

def initiate_connection(url):
    return httplib.HTTPConnection(url)

def generate_description_string(company_detail):
    #join whichever of the three text fields are present
    return '\n'.join(filter(None, [company_detail.get(key) for key in ['overview', 'description', 'tag_list']]))

def get_company_overview(cb_conn, permalink):
    global COMPANY_EI, API_KEY, EXCEPTION_FILE
    cb_conn.request("GET", ''.join((COMPANY_EI, permalink, '.js?api_key=', API_KEY)))
    return generate_description_string(json.loads(cb_conn.getresponse().read()))

def db_vals(conn, company):
    #normalize empty strings to None so they land as NULL in sqlite
    code = lambda x: x if x != '' else None
    name = company['name']
    permalink = company['permalink']
    cat = code(company['category_code'])
    descr = get_company_overview(conn, permalink)
    return permalink, name, cat, descr

def send_companies_query(conn, query, sql_connection):
    global EXCEPTION_FILE
    count = 0
    conn.request("GET", query)
    #the companies endpoint occasionally returns concatenated json arrays,
    #so splice '][' into ',' before parsing
    val = conn.getresponse().read().replace('][', ',')
    response = json.loads(val)
    for company in response:
        try:
            sql_connection.execute('''INSERT INTO companies VALUES (?, ?, ?, ?)''', db_vals(conn, company))
            count += 1
            if count % 100 == 0:
                sql_connection.commit()
                print "Count: %d" % count
        except:
            with codecs.open(EXCEPTION_FILE, 'a') as f:
                f.write('\n')
                f.write('=======ERROR=======\n')
                f.write(str(company))
    sql_connection.commit()

def setup_companies_table():
    global OUTPUTDIR
    sql_connection = sqlite3.connect(''.join((OUTPUTDIR, "/companies.db")))
    sql_connection.execute('''DROP TABLE IF EXISTS companies''')
    sql_connection.execute('''CREATE TABLE companies (permalink text primary key, name text, cat text, descr text)''')
    sql_connection.commit()
    return sql_connection

def companies():
    global URL, COMPANIES_EL, API_KEY
    cb_connection = initiate_connection(URL)
    sql_connection = setup_companies_table()
    query = "%sapi_key=%s" % (COMPANIES_EL, API_KEY)
    send_companies_query(cb_connection, query, sql_connection)
    cb_connection.close()
    sql_connection.close()

if __name__ == '__main__':
    companies()
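#after a run, a quick sanity check on companies.db -- a minimal sketch,
#assuming the same OUTPUTDIR as above (the query is illustrative only):
check_conn = sqlite3.connect(''.join((OUTPUTDIR, "/companies.db")))
print "rows: %d" % check_conn.execute('SELECT COUNT(*) FROM companies').fetchone()[0]
check_conn.close()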
'''Write to files'''
#connect to companies endpoint
#get information about every single company and file it into per-company json files
import json
import httplib
import sqlite3
import codecs
import sys

PATH = '/Users/gk/Desktop/data_sci/crunchbase'
API_KEY = open(''.join((PATH, '/keys/cb')), 'r').read().strip()
OUTPUTDIR = ''.join((PATH, '/data'))
URL = 'api.crunchbase.com'
COMPANIES_EL = '/v/1/companies.js?'
COMPANY_EI = '/v/1/company/'
EXCEPTION_FILE = ''.join((OUTPUTDIR, '/exceptions.txt'))

def initiate_connection(url):
    return httplib.HTTPConnection(url)

def fetch_company_file(cb_conn, company):
    global COMPANY_EI, API_KEY, EXCEPTION_FILE, OUTPUTDIR
    query = ''.join((COMPANY_EI, company['permalink'], '.js?api_key=', API_KEY))
    company_file = ''.join((OUTPUTDIR, '/', company['permalink'], '.json'))
    cb_conn.request("GET", query)
    data = cb_conn.getresponse().read()
    #'a+' creates the file if it doesn't exist; only write if it's still empty
    with open(company_file, 'a+') as f:
        f.seek(0)
        if len(f.read()) < 1:
            f.write(data)

def fetch_companies(cb_conn, companies_file):
    global EXCEPTION_FILE
    #count.txt tracks how far the previous run got, so the loop resumes there
    with open(''.join((PATH, '/data/count.txt'))) as count_file:
        count = int(count_file.read().strip())
    with open(companies_file) as f:
        companies = json.loads(f.read(), strict=False)
    for company in companies[count:]:
        if count > 200:
            break
        try:
            fetch_company_file(cb_conn, company)
        except:
            extype, value = sys.exc_info()[:2]
            with open(EXCEPTION_FILE, 'a') as g:
                g.write('\n=======ERROR: FETCH_COMPANIES=======')
                g.write('\nTYPE: %s \t VALUE: %s' % (extype, value))
                g.write('\n' + str(company))
        count += 1
    with open(''.join((PATH, '/data/count.txt')), 'w') as count_file:
        count_file.write(str(count))

def fetch_companies_file(cb_conn):
    global COMPANIES_EL, API_KEY, OUTPUTDIR, EXCEPTION_FILE
    json_file = ''.join((OUTPUTDIR, '/companies.json'))
    #'a+' creates the file if it doesn't exist; only fetch if it's still empty
    with open(json_file, 'a+') as f:
        f.seek(0)
        if len(f.read()) < 1:
            query = ''.join((COMPANIES_EL, "api_key=", API_KEY))
            cb_conn.request("GET", query)
            f.write(cb_conn.getresponse().read().replace('][', ','))
    return json_file

def companies():
    global URL
    cb_connection = initiate_connection(URL)
    companies_file = fetch_companies_file(cb_connection)
    fetch_companies(cb_connection, companies_file)
    cb_connection.close()

if __name__ == '__main__':
    companies()
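#before the first run, count.txt must exist and hold an integer, since
#fetch_companies reads it unconditionally; a minimal one-time initialization
#sketch (run once, not on every invocation):
with open(''.join((PATH, '/data/count.txt')), 'w') as f:
    f.write('0')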
import IPython, json
import pandas as pd
from IPython.parallel import Client

def get_cb_apikey(path):
    retval = ''
    with open(''.join((path, '/crunchbase/key')), 'r') as f:
        retval = f.read().strip()
    return retval

def get_s3_bucket(credentials):
    from boto.s3.connection import S3Connection
    s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])
    s3bucket = s3conn.get_bucket('kandlikards')
    return s3conn, s3bucket

def get_cb_conn():
    from httplib import HTTPConnection
    return HTTPConnection('api.crunchbase.com')

def fetch_comp_json(s3bucket):
    import json
    #load the companies file as a json object and return it
    companies_key = s3bucket.get_key('crunchbase/companies.json')
    return json.loads(companies_key.get_contents_as_string(), strict=False)

#local paths
path = '/Users/gk/Desktop/data_sci'
#path = '/home/gkandlikar'
#crunchbase stuff
apikey = get_cb_apikey(path)
url = 'api.crunchbase.com'
ep = '/v/1/company/'
#S3 stuff
#credentials = pd.read_csv(''.join((PATH,'/credentials.csv')))
cred = pd.read_csv(''.join((path, '/credentials/credentials.csv')))
s3conn, s3bucket = get_s3_bucket(cred)
#get the json object that holds references to all the companies
#companies = fetch_comp_json(s3bucket)
existing_companies = s3bucket.list('crunchbase/data')
#cluster stuff
c = Client()
dview = c.direct_view()
#make it non-blocking so it frees us up to do something else with this notebook
dview.block = False
#spread out the companies among the nodes in the cluster
dview.scatter('existing_companies', [company for company in existing_companies], dist='r')
#pass in the values which are going to be used to fetch each file.
#Notice that I am not using %%px, so my function is only being defined
#locally. This is why I have to pass functions like get_s3_bucket and get_cb_conn.
#My other option was to pass them in as arguments to the fetch_company_files
#function, but I thought this was a better way of going about it.
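#fetch_company_files is pushed below, but its definition doesn't appear in
#this dump. What follows is a minimal sketch of what it plausibly does,
#reconstructed from the surrounding cell (each engine walks its scattered
#slice of existing_companies, re-fetches any file that hit the rate limit,
#and returns a count) -- an assumption, not the original code:
def fetch_company_files():
    import time
    cb_conn = get_cb_conn()
    s3conn, s3bucket = get_s3_bucket(cred)
    fetched = 0
    #existing_companies is the slice of S3 keys scattered to this engine
    for key in existing_companies:
        data = key.get_contents_as_string()
        if 'Developer Over Qps' in data:
            #rebuild the query from the key name: crunchbase/data/<permalink>.json
            permalink = key.name.split('/')[-1].replace('.json', '')
            cb_conn.request("GET", ''.join((ep, permalink, '.js?api_key=', apikey)))
            key.set_contents_from_string(cb_conn.getresponse().read())
        fetched += 1
        time.sleep(0.16)
    cb_conn.close()
    s3conn.close()
    return fetched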
dview.push(dict(apikey=apikey, url=url, ep=ep, cred=cred, get_cb_conn=get_cb_conn, get_s3_bucket=get_s3_bucket, fetch_company_files=fetch_company_files))
try:
    #run on every engine, keep the per-engine counts, then pull them back
    %px result = fetch_company_files()
    result = dview.pull('result', block=True)
    val = reduce(lambda x, y: x + y, result)
    print ('Retrieved %d companies' % val)
except:
    print 'fail'
#close yo connections before you wreck yo connections
s3conn.close()