#load pandas and the parallel client
import pandas as pd
import json
import time
from IPython.parallel import Client
from httplib import HTTPConnection

cred, api_key, ep, url = None, None, None, None
#path = '/Users/gk/Desktop/data_sci'
path = '/home/gkandlikar'

def load_essentials():
    #load credentials, api key, url, end points, etc
    global cred, api_key, url, ep, path
    cred = pd.read_csv(''.join((path, '/credentials/credentials.csv')))
    url = 'api.crunchbase.com'
    ep = ['/v/1/companies.js?', '/v/1/company/']
    with open(''.join((path, '/crunchbase/key')), 'r') as f:
        api_key = f.read().strip()
    return
def gen_s3_obj(credentials, bucket=''):
    from boto.s3.connection import S3Connection
    s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])
    #default to None so callers that pass no bucket still get a valid 2-tuple back
    s3bucket = None
    if bucket != '':
        s3bucket = s3conn.get_bucket(bucket)
    return s3conn, s3bucket
def fetch_and_file_comp_db(key=False):
    if not key:
        #initiate connection with crunchbase
        cb_conn = HTTPConnection(url)
        #make a query for the file, read the response
        query = ''.join((ep[0], 'api_key=', api_key))
        cb_conn.request("GET", query)
        data = cb_conn.getresponse().read()
        #create a key on S3 and set its contents
        s3conn, s3bucket = gen_s3_obj(cred, bucket='kandlikards')
        compfile = s3bucket.new_key('crunchbase/companies.json')
        compfile.set_contents_from_string(data)
        s3conn.close()
        return json.loads(data, strict=False)
    else:
        return json.loads(key.get_contents_as_string(), strict=False)
def fetch_company(company):
    #establish connections
    cb_conn = HTTPConnection(url)
    s3conn, s3bucket = gen_s3_obj(cred, bucket='kandlikards')
    keyname = ''.join(('crunchbase/', company['permalink'], '.json'))
    key = s3bucket.get_key(keyname)
    if (not key) or ('Developer Over Qps' in key.get_contents_as_string()):
        #retrieve file from crunchbase
        #build query: '/v/1/company/company-name.js?api_key=<API_KEY>'
        query = ''.join((ep[1], company['permalink'], '.js?api_key=', api_key))
        #initiate and read connection
        cb_conn.request("GET", query)
        data = cb_conn.getresponse().read()
        #back off and retry while crunchbase says we're over the rate limit
        while 'Developer Over Qps' in data:
            time.sleep(5)
            cb_conn.request("GET", query)
            data = cb_conn.getresponse().read()
        key = s3bucket.new_key(keyname)
        key.set_contents_from_string(data)
    else:
        data = key.get_contents_as_string()
    try:
        json_data = json.loads(data, strict=False)
    except:
        json_data = {'error': 'Load Error during json.loads()'}
    if 'error' in json_data.keys():
        retval = None
    else:
        try:
            retval = (company['permalink'], company['name'], company['category_code'],
                      json_data['overview'], json_data['description'], json_data['tag_list'],
                      None, None)
        except:
            retval = (company['permalink'], company['name'], company['category_code'],
                      None, None, None, 'error', json_data)
    #close connections after we're done!
    cb_conn.close()
    s3conn.close()
    #sleep for roughly 1/6th of a second to stay under the rate limit
    time.sleep(0.16)
    return retval
#get the essentials out of the way
load_essentials()
#load client
client = Client()
directview = client.direct_view()
directview.push(dict(cred=cred, api_key=api_key, ep=ep, url=url, gen_s3_obj=gen_s3_obj))
#make sure the necessary modules are imported across the engines
with directview.sync_imports():
    import time, json
    from boto.s3.connection import S3Connection
    from httplib import HTTPConnection
#initiate connection with S3 and get the bucket
s3conn, s3bucket = gen_s3_obj(cred, bucket='kandlikards')
#check if there is a json file of companies in S3;
#either way, retrieve a json document with the companies
companies_key = 'crunchbase/companies.json'
if not s3bucket.get_key(companies_key):
    companies = fetch_and_file_comp_db()
else:
    companies = fetch_and_file_comp_db(s3bucket.get_key(companies_key))
results = directview.map_async(fetch_company, companies[:5000])
s3conn.close()
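#results is an AsyncMapResult; a minimal sketch of draining it into a
#DataFrame once the run finishes. The column labels are my own shorthand
#for the tuple fetch_company returns, not field names from crunchbase:
rows = [r for r in results.get() if r is not None]
df = pd.DataFrame(rows, columns=['permalink', 'name', 'category_code',
                                 'overview', 'description', 'tag_list',
                                 'err', 'err_data'])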
#connect to companies endpoint
#get information about every single company and file it into companies.db
import json
import httplib
import pprint as pp
import sqlite3
import codecs

API_KEY = open('/Users/gk/Desktop/github/keys/cb', 'r').read().strip()
OUTPUTDIR = '/Users/gk/Desktop/data_sci/crunchbase/data'
URL = 'api.crunchbase.com'
COMPANIES_EL = '/v/1/companies.js?'
COMPANY_EI = '/v/1/company/'
EXCEPTION_FILE = ''.join((OUTPUTDIR, '/exceptions.txt'))

def initiate_connection(url):
    return httplib.HTTPConnection(url)

def generate_description_string(company_detail):
    #join whichever of the three text fields are present
    return '\n'.join(filter(None, [company_detail.get(key) for key in ['overview', 'description', 'tag_list']]))

def get_company_overview(cb_conn, permalink):
    global COMPANY_EI, API_KEY, EXCEPTION_FILE
    cb_conn.request("GET", ''.join((COMPANY_EI, permalink, '.js?api_key=', API_KEY)))
    return generate_description_string(json.loads(cb_conn.getresponse().read()))

def db_vals(conn, company):
    #normalize empty strings to None so they land as NULL in sqlite
    code = lambda x: x if x != '' else None
    name = company['name']
    permalink = company['permalink']
    cat = code(company['category_code'])
    descr = get_company_overview(conn, permalink)
    return permalink, name, cat, descr

def send_companies_query(conn, query, sql_connection):
    global EXCEPTION_FILE
    count = 0
    conn.request("GET", query)
    #the companies endpoint occasionally returns concatenated json arrays,
    #so splice '][' into ',' before parsing
    val = conn.getresponse().read().replace('][', ',')
    response = json.loads(val)
    for company in response:
        try:
            sql_connection.execute('''INSERT INTO companies VALUES (?, ?, ?, ?)''', db_vals(conn, company))
            count += 1
            if count % 100 == 0:
                sql_connection.commit()
                print "Count: %d" % count
        except:
            with codecs.open(EXCEPTION_FILE, 'a') as f:
                f.write('\n')
                f.write('=======ERROR=======\n')
                f.write(str(company))
    sql_connection.commit()

def setup_companies_table():
    global OUTPUTDIR
    sql_connection = sqlite3.connect(''.join((OUTPUTDIR, "/companies.db")))
    sql_connection.execute('''DROP TABLE IF EXISTS companies''')
    sql_connection.execute('''CREATE TABLE companies (permalink text primary key, name text, cat text, descr text)''')
    sql_connection.commit()
    return sql_connection

def companies():
    global URL, COMPANIES_EL, API_KEY
    cb_connection = initiate_connection(URL)
    sql_connection = setup_companies_table()
    query = "%sapi_key=%s" % (COMPANIES_EL, API_KEY)
    send_companies_query(cb_connection, query, sql_connection)
    cb_connection.close()
    sql_connection.close()

if __name__ == '__main__':
    companies()
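#after a run, a quick sanity check on companies.db -- a minimal sketch,
#assuming the same OUTPUTDIR as above (the query is illustrative only):
check_conn = sqlite3.connect(''.join((OUTPUTDIR, "/companies.db")))
print "rows: %d" % check_conn.execute('SELECT COUNT(*) FROM companies').fetchone()[0]
check_conn.close()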
'''Write to files'''
#connect to companies endpoint
#get information about every single company and file it into per-company json files
import json
import httplib
import sqlite3
import codecs
import sys

PATH = '/Users/gk/Desktop/data_sci/crunchbase'
API_KEY = open(''.join((PATH, '/keys/cb')), 'r').read().strip()
OUTPUTDIR = ''.join((PATH, '/data'))
URL = 'api.crunchbase.com'
COMPANIES_EL = '/v/1/companies.js?'
COMPANY_EI = '/v/1/company/'
EXCEPTION_FILE = ''.join((OUTPUTDIR, '/exceptions.txt'))

def initiate_connection(url):
    return httplib.HTTPConnection(url)

def fetch_company_file(cb_conn, company):
    global COMPANY_EI, API_KEY, EXCEPTION_FILE, OUTPUTDIR
    query = ''.join((COMPANY_EI, company['permalink'], '.js?api_key=', API_KEY))
    company_file = ''.join((OUTPUTDIR, '/', company['permalink'], '.json'))
    cb_conn.request("GET", query)
    data = cb_conn.getresponse().read()
    #'a+' creates the file if it doesn't exist; only write if it's still empty
    with open(company_file, 'a+') as f:
        f.seek(0)
        if len(f.read()) < 1:
            f.write(data)

def fetch_companies(cb_conn, companies_file):
    global EXCEPTION_FILE
    #count.txt tracks how far the previous run got, so the loop resumes there
    with open(''.join((PATH, '/data/count.txt'))) as count_file:
        count = int(count_file.read().strip())
    with open(companies_file) as f:
        companies = json.loads(f.read(), strict=False)
    for company in companies[count:]:
        if count > 200:
            break
        try:
            fetch_company_file(cb_conn, company)
        except:
            extype, value = sys.exc_info()[:2]
            with open(EXCEPTION_FILE, 'a') as g:
                g.write('\n=======ERROR: FETCH_COMPANIES=======')
                g.write('\nTYPE: %s \t VALUE: %s' % (extype, value))
                g.write('\n' + str(company))
        count += 1
    with open(''.join((PATH, '/data/count.txt')), 'w') as count_file:
        count_file.write(str(count))

def fetch_companies_file(cb_conn):
    global COMPANIES_EL, API_KEY, OUTPUTDIR, EXCEPTION_FILE
    json_file = ''.join((OUTPUTDIR, '/companies.json'))
    #'a+' creates the file if it doesn't exist; only fetch if it's still empty
    with open(json_file, 'a+') as f:
        f.seek(0)
        if len(f.read()) < 1:
            query = ''.join((COMPANIES_EL, "api_key=", API_KEY))
            cb_conn.request("GET", query)
            f.write(cb_conn.getresponse().read().replace('][', ','))
    return json_file

def companies():
    global URL
    cb_connection = initiate_connection(URL)
    companies_file = fetch_companies_file(cb_connection)
    fetch_companies(cb_connection, companies_file)
    cb_connection.close()

if __name__ == '__main__':
    companies()
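#before the first run, count.txt must exist and hold an integer, since
#fetch_companies reads it unconditionally; a minimal one-time initialization
#sketch (run once, not on every invocation):
with open(''.join((PATH, '/data/count.txt')), 'w') as f:
    f.write('0')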
import IPython, json
import pandas as pd
from IPython.parallel import Client

def get_cb_apikey(path):
    retval = ''
    with open(''.join((path, '/crunchbase/key')), 'r') as f:
        retval = f.read().strip()
    return retval

def get_s3_bucket(credentials):
    from boto.s3.connection import S3Connection
    s3conn = S3Connection(credentials['Access Key Id'][0], credentials['Secret Access Key'][0])
    s3bucket = s3conn.get_bucket('kandlikards')
    return s3conn, s3bucket

def get_cb_conn():
    from httplib import HTTPConnection
    return HTTPConnection('api.crunchbase.com')

def fetch_comp_json(s3bucket):
    import json
    #load the companies file as a json object and return it
    companies_key = s3bucket.get_key('crunchbase/companies.json')
    return json.loads(companies_key.get_contents_as_string(), strict=False)

#local paths
path = '/Users/gk/Desktop/data_sci'
#path = '/home/gkandlikar'
#crunchbase stuff
apikey = get_cb_apikey(path)
url = 'api.crunchbase.com'
ep = '/v/1/company/'
#S3 stuff
#credentials = pd.read_csv(''.join((PATH,'/credentials.csv')))
cred = pd.read_csv(''.join((path, '/credentials/credentials.csv')))
s3conn, s3bucket = get_s3_bucket(cred)
#get the json object that holds references to all the companies
#companies = fetch_comp_json(s3bucket)
existing_companies = s3bucket.list('crunchbase/data')
#cluster stuff
c = Client()
dview = c.direct_view()
#make it non-blocking so it frees us up to do something else with this notebook
dview.block = False
#spread out the companies among the nodes in the cluster
dview.scatter('existing_companies', [company for company in existing_companies], dist='r')
#pass in the values which are going to be used to fetch each file.
#Notice that I am not using %%px, so my function is only being defined
#locally. This is why I have to pass functions like get_s3_bucket and get_cb_conn.
#My other option was to pass them in as arguments to the fetch_company_files
#function, but I thought this was a better way of going about it.
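#fetch_company_files is pushed below, but its definition doesn't appear in
#this dump. What follows is a minimal sketch of what it plausibly does,
#reconstructed from the surrounding cell (each engine walks its scattered
#slice of existing_companies, re-fetches any file that hit the rate limit,
#and returns a count) -- an assumption, not the original code:
def fetch_company_files():
    import time
    cb_conn = get_cb_conn()
    s3conn, s3bucket = get_s3_bucket(cred)
    fetched = 0
    #existing_companies is the slice of S3 keys scattered to this engine
    for key in existing_companies:
        data = key.get_contents_as_string()
        if 'Developer Over Qps' in data:
            #rebuild the query from the key name: crunchbase/data/<permalink>.json
            permalink = key.name.split('/')[-1].replace('.json', '')
            cb_conn.request("GET", ''.join((ep, permalink, '.js?api_key=', apikey)))
            key.set_contents_from_string(cb_conn.getresponse().read())
        fetched += 1
        time.sleep(0.16)
    cb_conn.close()
    s3conn.close()
    return fetched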
dview.push(dict(apikey=apikey, url=url, ep=ep, cred=cred, get_cb_conn=get_cb_conn, get_s3_bucket=get_s3_bucket, fetch_company_files=fetch_company_files))
try:
    #run on every engine, keep the per-engine counts, then pull them back
    %px result = fetch_company_files()
    result = dview.pull('result', block=True)
    val = reduce(lambda x, y: x + y, result)
    print ('Retrieved %d companies' % val)
except:
    print 'fail'
#close yo connections before you wreck yo connections
s3conn.close()