Analyzing VC funding using the Crunchbase API

by André Ríos (datablast.wordpress.com)

Starting MongoDB service over Ubuntu 12.04.4 LTS

sudo mongod --dbpath /var/lib/mongodb --journal

In [170]:
import matplotlib
import json
import requests
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import pymongo
import networkx as nx
import matplotlib.pyplot as plt
from IPython.display import display
import collections
from IPython.display import HTML

Connecting MongoDB collection called 'cities'

  • c_barcelona(companies)

  • f_barcelona(funding)

  • i_barcelona(investors)

In [171]:
# Connect to the local MongoDB instance and bind the three collections
# (companies / funding rounds / investors) of the 'cities' database.
# NOTE(review): pymongo.Connection is the legacy client class; it was
# removed in pymongo 3 in favour of MongoClient -- confirm the installed
# pymongo version before upgrading.
conn = pymongo.Connection("localhost", 27017)
db = conn.cities
c_barcelona = db.c_barcelona
f_barcelona = db.f_barcelona
i_barcelona = db.i_barcelona

Companies in Barcelona: Create collection c_barcelona

Search for companies in Barcelona and save them to the database

location_uuid for Barcelona: eead2c0cb178ad334e6d6c813c955e99

Other Cities:

London: aad17950576b8c448fd4f44dbeb59220

Paris: 67af74be0bfa027795c13cb5853c8bcc

Rome: c5a012b1f2f9a3b5c66aee6be778eda2

Berlin: e61662e5c5c2c4d92a31d0e6d6453375

# Fetch the full list of Barcelona companies from the Crunchbase v2 API
# and collect their paths ('organization/<permalink>') into `lc`.
# NOTE: replace '< enter your key >' with a real Crunchbase user key.
base_url = ('http://api.crunchbase.com/v/2/organizations'
            '?organization_types=company'
            '&location_uuids=eead2c0cb178ad334e6d6c813c955e99'
            '&user_key=< enter your key >&page=')

cs = requests.get(base_url + '1').json()
# BUG FIX: the original stored the page count in `np`, shadowing the
# numpy alias created by `import numpy as np` at the top of the notebook.
n_pages = cs['data']['paging']['number_of_pages']

lc = []  # organization paths for every company in the city
for page in range(1, n_pages + 1):
    cs = requests.get(base_url + str(page)).json()
    for item in cs['data']['items']:
        lc.append(item['path'])

List of organizations in the city, as API paths: organization/<organization permalink>

# Build one detail-lookup URL per organization path collected above.
url_1 = 'http://api.crunchbase.com/v/2/'
url_list = []
for path in lc:
    # BUG FIX: the original appended to `rl_list`, a name that is never
    # defined anywhere (NameError); the intended target is `url_list`.
    url_list.append(url_1 + str(path) + '?user_key=< your key >')

Save json file in MongoDB: Companies in Barcelona

# Download each organization document and store it in MongoDB
# (cities.c_barcelona). Network I/O: one request per company.
# (Indentation restored -- the exported text had lost the loop body.)
for url in url_list:
    rs = requests.get(url).json()
    c_barcelona.insert(rs)

Erase some elements with errors.

# Drop the document for a company the API could not resolve
# (Crunchbase returned response=False with a 500 'Node ... not found').
db.c_barcelona.remove({'data': {u'response': False, u'error': {u'message': u'Node 4035881 not found', u'code': 500}}})

In [172]:
# Build df1 (uuid, permalink, primary_role) -- one row per stored company.
# PERF FIX: the original called c_barcelona.find()[i] three times per row,
# opening a fresh cursor (re-skipping i documents) for every field access,
# i.e. O(n^2) round trips. One pass over a single cursor is equivalent.
n = c_barcelona.count()  # kept: later cells reuse the company count
records = []
for doc in c_barcelona.find():
    records.append({'uuid': doc['data']['uuid'],
                    'name': doc['data']['properties']['permalink'],
                    'role': doc['data']['properties']['primary_role']})
df1 = DataFrame(records, columns=['uuid', 'name', 'role'])
df1 = df1.drop_duplicates()
In [119]:
df1.head()
Out[119]:
uuid name role
0 d0647cecacb39a23bc4fd3fad90ecb4e strands company
1 a931b34d31f31d171f3d359875fc02c2 gsi-commerce company
2 31af5340f371d4f618156ca86a3a28fd akamon-entertainment company
3 476cdc24e4c314c49b9297032674e2da veeva company
4 78c70050042ba916b8121ab481207ea0 3scale company
In [96]:
# Export the first seven (name, role) company rows as a small HTML table.
role_table = HTML(df1[['name', 'role']][0:7].to_html(index=False))
with open('role_company.html', 'w') as out:
    out.write(role_table.data)
In [173]:
# Build df2 (uuid, permalink, category) -- one row per company/category pair.
# PERF FIX: a single cursor pass replaces the original's repeated
# c_barcelona.find()[i] lookups (a new cursor per field access, O(n^2)).
n = c_barcelona.count()  # kept: later cells reuse the company count
records = []
for doc in c_barcelona.find():
    rels = doc['data']['relationships']
    if 'categories' not in rels:
        continue  # company without category tags
    for item in rels['categories']['items']:
        records.append({'uuid': doc['data']['uuid'],
                        'name': doc['data']['properties']['permalink'],
                        'category': item['name']})
df2 = DataFrame(records, columns=['uuid', 'name', 'category'])
df2 = df2.drop_duplicates()
df2 = df2.dropna(thresh=1)  # drop fully-empty rows, if any
In [121]:
df2.head(n=10)
Out[121]:
uuid name category
0 d0647cecacb39a23bc4fd3fad90ecb4e strands Artificial Intelligence
1 d0647cecacb39a23bc4fd3fad90ecb4e strands Reviews and Recommendations
2 d0647cecacb39a23bc4fd3fad90ecb4e strands Software
3 d0647cecacb39a23bc4fd3fad90ecb4e strands Personal Finance
4 d0647cecacb39a23bc4fd3fad90ecb4e strands E-Commerce
5 d0647cecacb39a23bc4fd3fad90ecb4e strands Personalization
6 a931b34d31f31d171f3d359875fc02c2 gsi-commerce Enterprise Software
7 31af5340f371d4f618156ca86a3a28fd akamon-entertainment Games
8 476cdc24e4c314c49b9297032674e2da veeva Content
9 476cdc24e4c314c49b9297032674e2da veeva Software
In [78]:
# Write the first seven (name, category) rows to an HTML file.
cat_table = HTML(df2[['name', 'category']][0:7].to_html(index=False))
with open('cat_company.html', 'w') as out:
    out.write(cat_table.data)
In [174]:
# Build df3 -- one row per (company, funding-round path) pair.
# PERF FIX: one cursor pass instead of repeated find()[i] lookups (O(n^2)).
records = []
for doc in c_barcelona.find():
    rels = doc['data']['relationships']
    if 'funding_rounds' not in rels:
        continue  # company with no recorded rounds
    for item in rels['funding_rounds']['items']:
        records.append({'uuid': doc['data']['uuid'],
                        'name': doc['data']['properties']['permalink'],
                        'path': item['path']})
df3 = DataFrame(records, columns=['uuid', 'name', 'path'])
# Funding-round uuid = path with its 'funding-round/' prefix stripped.
df3['uuid_fund'] = df3['path'].map(lambda x: str(x).replace("funding-round/", ""))
df3 = df3.drop_duplicates()
In [123]:
df3.head()
Out[123]:
uuid name path uuid_fund
0 d0647cecacb39a23bc4fd3fad90ecb4e strands funding-round/eb5cbd96fd6c6176199f993a589a4da2 eb5cbd96fd6c6176199f993a589a4da2
1 d0647cecacb39a23bc4fd3fad90ecb4e strands funding-round/9f1a9a6a5326712b265aa70e78a06aa9 9f1a9a6a5326712b265aa70e78a06aa9
2 d0647cecacb39a23bc4fd3fad90ecb4e strands funding-round/eef833bbeabcfeb4c866074eda879500 eef833bbeabcfeb4c866074eda879500
3 31af5340f371d4f618156ca86a3a28fd akamon-entertainment funding-round/639a184018b1b35cad644c3d11e086c2 639a184018b1b35cad644c3d11e086c2
4 31af5340f371d4f618156ca86a3a28fd akamon-entertainment funding-round/32705565ab649c7d969fd42f1d63e3d2 32705565ab649c7d969fd42f1d63e3d2

Funding links: Create collection f_barcelona

# Collect every funding-round path referenced by the stored companies.
# (Indentation restored; single cursor pass instead of find()[i] lookups.)
n = c_barcelona.count()
pl = []
for doc in c_barcelona.find():
    rels = doc['data']['relationships']
    if 'funding_rounds' in rels:
        for item in rels['funding_rounds']['items']:
            pl.append(item['path'])

# Fetch each funding round and store it in cities.f_barcelona.
url_1 = 'http://api.crunchbase.com/v/2/'
url_list = []
for path in pl:
    url_list.append(url_1 + str(path) + '?user_key=< your key>')

for url in url_list:
    try:
        rs = requests.get(url).json()
        # BUG FIX: insert() sat outside the try in the original, so a
        # failed request re-inserted the previous round's document (or
        # raised NameError on the very first iteration).
        f_barcelona.insert(rs)
    except:
        # best-effort: skip rounds the API cannot serve
        print('Error')

Erase some elements with errors.

# Drop funding-round documents where the API returned a 500 error payload
# instead of an actual round.
db.f_barcelona.remove({'data': {u'response': False, u'error': {u'message': u'wrong argument type nil (expected String)', u'code': 500}}})

In [175]:
# Build df4: one row per (funding round, investor), with the round's money
# split evenly across its investors.
# PERF FIX: one cursor pass replaces the original's repeated
# f_barcelona.find()[i] lookups; a stray debug expression (`i`) inside the
# try block was also removed.
records = []
for doc in f_barcelona.find():
    rels = doc['data']['relationships']
    if 'investments' not in rels:
        continue
    props = doc['data']['properties']
    items = rels['investments']['items']
    n_inv = len(items)
    uuid_fund = doc['data']['uuid']
    funding_type = props['funding_type']
    # money_raised may be absent; treat it as 0 so the share is defined
    if 'money_raised' in props:
        money = props['money_raised']
    else:
        money = 0
    currency = props['money_raised_currency_code']
    year = props['announced_on_year']
    for item in items:
        try:
            records.append({'uuid_fund': uuid_fund, 'type': funding_type,
                            'money': money / n_inv,  # even split per investor
                            'currency': currency, 'year': year,
                            'path_inv': item['investor']['path']})
        except:
            # e.g. money_raised stored as None (division fails) or the
            # investment item carries no investor path
            print('Companies with errors')

# Total funding in each round is divided by the number of the investors.
df4 = DataFrame(records,
                columns=['uuid_fund', 'type', 'money', 'currency', 'year',
                         'path_inv'])
# Investor permalink = path without its 'organization/' or 'person/' prefix.
df4['name'] = df4['path_inv'].map(lambda x: str(x).replace("organization/",""))
df4['name'] = df4['name'].map(lambda x: str(x).replace("person/",""))
df4 = df4.drop_duplicates()
 
Companies with errors
Companies with errors
Companies with errors
Companies with errors
Companies with errors
Companies with errors
Companies with errors
Companies with errors

Investor Links: Create collection i_barcelona

# Collect every investor path referenced by the stored funding rounds.
# (Indentation restored; single cursor pass instead of find()[i] lookups.)
li = []
nf = f_barcelona.count()
for doc in f_barcelona.find():
    rels = doc['data']['relationships']
    if 'investments' in rels:
        for item in rels['investments']['items']:
            try:
                li.append(item['investor']['path'])
            except:
                # some investment items carry no investor record
                print('Error')

Remove duplicate elements

# De-duplicate the investor paths.
# NOTE(review): `a` is never read again -- the next section rebuilds
# list(set(li)) itself; presumably this was meant to feed that loop.
a=set(li)

Searching investor uuids

# Fetch each unique investor profile and store it in cities.i_barcelona.
url_1 = 'http://api.crunchbase.com/v/2/'
# BUG FIX: the original loop read from an undefined name `b` (NameError);
# the intended sequence is the de-duplicated investor path list.
b = list(set(li))
url_list = []
for path in b:
    url_list.append(url_1 + str(path) + '?user_key=< your key>')

for url in url_list:
    try:
        rs = requests.get(url).json()
        # BUG FIX: insert() moved inside the try so a failed request does
        # not re-insert the previous investor (or NameError first time).
        i_barcelona.insert(rs)
    except:
        print('Error')

In [125]:
df4.head()
Out[125]:
uuid_fund type money currency year path_inv name
0 421e422d65b93b54d59ba19a465292d9 venture 1250000 EUR 2006 organization/active-venture-partners active-venture-partners
1 421e422d65b93b54d59ba19a465292d9 venture 1250000 EUR 2006 organization/adara-venture-partners adara-venture-partners
2 7a17150c3f14d66358956ac66fe2ea80 angel 73333 EUR 2012 organization/cabiedes-partners cabiedes-partners
3 7a17150c3f14d66358956ac66fe2ea80 angel 73333 EUR 2012 person/francois-derbaix francois-derbaix
4 7a17150c3f14d66358956ac66fe2ea80 angel 73333 EUR 2012 person/inaki-berenguer inaki-berenguer
In [176]:
# Build df5 (uuid, permalink, role='investor') -- one row per investor.
# PERF FIX: one cursor pass instead of i_barcelona.find()[i] per field.
ni = i_barcelona.count()  # kept for parity with the original cell
records = []
for doc in i_barcelona.find():
    records.append({'uuid': doc['data']['uuid'],
                    'name': doc['data']['properties']['permalink'],
                    'role': 'investor'})
df5 = DataFrame(records, columns=['uuid', 'name', 'role'])
df5 = df5.drop_duplicates()
 
In [127]:
df5.head()
Out[127]:
uuid name role
0 3eed4fa46c75fb697f5e17db8092b73b viewpoint-capital-partners investor
1 56e8dbb001cdbdc3db01d38b3f8ce7d4 bullnet-gestion investor
2 025c7a60c56a47b0ac783b178661ddb7 pond-ventures investor
3 e02e9ffcc387327b2339a3f721dbd058 bbva investor
4 af37951e36a3ecaa003526db8f967832 costanoa-venture-capital investor
In [81]:
# Write the first seven (name, role) investor rows to an HTML file.
inv_table = HTML(df5[['name', 'role']][0:7].to_html(index=False))
with open('role_investor.html', 'w') as out:
    out.write(inv_table.data)

Creating relationship table between companies and investors

In [177]:
# Join funding rounds (df3) to investments (df4) on the round uuid to get
# one (company, investor, money, ...) row per relationship.
company_rounds = df3[['name', 'uuid_fund']]
investments = df4[['name', 'uuid_fund', 'type', 'money', 'currency', 'year']]
merged = company_rounds.merge(investments, on='uuid_fund')
# merge suffixes the clashing 'name' columns: name_x = company (left),
# name_y = investor (right)
dfc = merged.rename(columns={'name_x': 'company', 'name_y': 'investor'})
dfc = dfc[['company', 'investor', 'type', 'money', 'currency', 'year']]
In [129]:
dfc.head()
Out[129]:
company investor type money currency year
0 strands bbva venture 24000000 USD 2007
1 strands antonio-asensio venture 25000000 USD 2007
2 strands debaeque venture 3000000 USD 2006
3 strands sequel venture 3000000 USD 2006
4 akamon-entertainment bonsai-capital venture 700000 EUR 2013
Convert currency to EUR
In [178]:
from money import Money  # third-party 'money' package

# Convert USD amounts to EUR at a fixed 0.79 rate, then wrap as Money.
# BUG FIX: the original used chained indexing
# (dfc.money[dfc.currency == 'USD'] = ...) which raises
# SettingWithCopyWarning and may silently fail to write; .loc is the
# supported assignment form.
usd = dfc['currency'] == 'USD'
dfc.loc[usd, 'money'] = dfc.loc[usd, 'money'] * 0.79
dfc['currency'] = dfc['currency'].replace('USD', 'EUR')
dfc['money'] = dfc['money'].map(lambda x: Money(x, 'EUR'))
In [131]:
dfc.head()
Out[131]:
company investor type money currency year
0 strands bbva venture EUR 18,960,000.00 EUR 2007
1 strands antonio-asensio venture EUR 19,750,000.00 EUR 2007
2 strands debaeque venture EUR 2,370,000.00 EUR 2006
3 strands sequel venture EUR 2,370,000.00 EUR 2006
4 akamon-entertainment bonsai-capital venture EUR 700,000.00 EUR 2013
In [132]:
# Write the first eight company-investor relationship rows to HTML.
rel_cols = ['company', 'investor', 'type', 'money', 'year']
rel_table = HTML(dfc[rel_cols][0:8].to_html(index=False))
with open('relation.html', 'w') as out:
    out.write(rel_table.data)

Analysis

Top 10 companies receiving VC funding:

In [382]:
# Total funding received per company, largest first.
h1=dfc[['company','money']].groupby('company').sum().reset_index()
# NOTE(review): DataFrame.sort() is the pre-0.17 pandas API (removed in
# 0.20); newer pandas requires sort_values().
h2=h1.sort(['money'],ascending=False)
In [134]:
h2.head(n=10)
Out[134]:
company money
95 privalia EUR 395,668,000.00
30 desigual EUR 285,000,000.00
105 scytl EUR 89,427,998.42
117 strands EUR 43,450,000.00
46 groupalia EUR 35,899,996.00
112 social-point EUR 34,845,998.42
8 arcplan-information-services-ag EUR 26,907,400.00
82 ntr-global EUR 26,860,000.00
43 gigle-semiconductor EUR 24,489,998.42
116 stat-diagnostica EUR 24,100,000.00
In [83]:
a = HTML(h2[0:9].to_html(index=False))
my_file = open('top10_company.html', 'w')
my_file.write(a.data)
my_file.close()

Top 10 investors

In [180]:
# Total amount invested per investor, largest first.
g1=dfc[['investor','money']].groupby('investor').sum().reset_index()
# NOTE(review): sort() -> sort_values() on pandas >= 0.17.
g2=g1.sort(['money'],ascending=False)
In [136]:
g2.head(n=10)
Out[136]:
investor money
62 eurazeo EUR 285,000,000.00
31 cabiedes-partners EUR 203,989,256.00
85 index-ventures EUR 66,547,928.00
71 general-atlantic EUR 66,547,928.00
180 vulcan-capital EUR 31,600,000.00
88 insight-venture-partners EUR 29,022,928.00
161 sofina EUR 25,000,000.00
81 highland-capital-partners EUR 24,371,500.00
126 nauta-capital EUR 23,557,527.14
175 venture-investors EUR 22,100,000.00
In [84]:
a = HTML(g2[0:9].to_html(index=False))
my_file = open('top10_investor.html', 'w')
my_file.write(a.data)
my_file.close()

Year vs Investment

In [137]:
# Total funding per year, largest first.
l1=dfc[['year','money']].groupby('year').sum().reset_index()
l2=l1.sort(['money'],ascending=False)
In [138]:
l2.head()
Out[138]:
year money
9 2014 4.057950e+08
6 2011 3.520405e+08
5 2010 1.015254e+08
8 2013 9.381874e+07
2 2007 7.674140e+07
In [320]:
l2.shape
Out[320]:
(10, 2)
In [85]:
a = HTML(l2[0:9].to_html(index=False))
my_file = open('top10_year.html', 'w')
my_file.write(a.data)
my_file.close()
In [321]:
l2.to_csv('fund_year.csv', sep=',', encoding='utf-8',index=False)

Categories

In [139]:
# Count how many rows (company/category pairs) carry each category label.
co=collections.Counter(df2['category'])
In [140]:
# Companies per category, most common first (a company may appear in
# several categories).
# NOTE(review): Counter.items() returns a list on Python 2; on Python 3
# wrap it in list() before building the DataFrame.
co1 = DataFrame(co.items(), columns=['Category', 'Number of Companies'])
co2=co1.sort(['Number of Companies'],ascending=False)
In [141]:
co2.head(n=10)
# Obs: each company can belong to more than one category.
Out[141]:
Category Number of Companies
183 E-Commerce 93
234 Software 90
87 Mobile 86
152 Games 59
270 Internet 45
138 Enterprise Software 45
210 Curated Web 44
124 Apps 41
44 Technology 41
153 Advertising 30
In [410]:
co2[co2.Category =='Startups']
Out[410]:
Category Number of Companies
323 Startups 15
In [408]:
a = HTML(co2[0:9].to_html(index=False))
my_file = open('top10_category.html', 'w')
my_file.write(a.data)
my_file.close()

Graph Analysis

Relationships (Company - Investor) as Undirected Graph

In [144]:
x1=list(dfc['company'])
x2=list(dfc['investor'])
y=list(dfc[['company','investor']].itertuples(index=False))
G=nx.Graph()
G.add_edges_from(y)
pos=nx.spring_layout(G)
nx.draw_networkx_nodes(G,pos,nodelist=x1,node_color='b')
nx.draw_networkx_nodes(G,pos,nodelist=x2,node_color='r')
nx.draw_networkx_edges(G,pos)
nx.draw(G)
# View connected components into Graph
In [25]:
h=sorted(nx.connected_components(G),key=len,reverse=True)
In [26]:
H=G.subgraph(h[1])
In [27]:
nx.draw(H, with_labels=True)
In [28]:
H=G.subgraph(h[2])
In [29]:
nx.draw(H,with_labels=True)

Relationships (Company - Investor) as Directed Graph

In [423]:
x1=list(dfc['company'])
x2=list(dfc['investor'])
y=list(dfc[['investor','company']].itertuples(index=False))
DG=nx.DiGraph()
DG.add_edges_from(y)
pos=nx.spring_layout(DG)
nx.draw_networkx_nodes(DG,pos,nodelist=x1,node_size=50,node_color= 'g')
nx.draw_networkx_nodes(DG,pos,nodelist=x2,node_size=80, node_color= 'b')
nx.draw_networkx_edges(DG,pos)
plt.axis('off')
plt.legend(["Company", "Investor"])
plt.savefig("direct.png",dpi=1000)
#nx.draw(G)
In [158]:
# Components are computed on the undirected graph G (connected components
# are not defined for digraphs) and the second-largest one is rendered
# with the directed edges of DG.
h=sorted(nx.connected_components(G),key=len,reverse=True)
#connected components are only defined over undirected graphs
H=DG.subgraph(h[1])
nx.draw(H,with_labels=True)
plt.axis('off')
plt.savefig("direct_case.png",dpi=1000)
In [91]:
#The out-degree centrality for a node v is the fraction of nodes its outgoing edges are connected to.
co3=nx.out_degree_centrality(DG)
co4=DataFrame(co3.items(), columns=['investor','Degree Centrality'])
co5=co4.sort(['Degree Centrality'],ascending=False)
In [92]:
co5.head(n=10)
Out[92]:
investor Degree Centrality
130 caixa-capital 0.057402
78 cabiedes-partners 0.042296
183 wayra 0.039275
311 inveready-technology-investment-group 0.033233
179 nauta-capital 0.027190
190 highgrowth 0.027190
35 active-venture-partners 0.021148
16 kibo-ventures 0.018127
235 debaeque 0.015106
158 la-caixa 0.012085
In [93]:
a = HTML(co5[0:9].to_html(index=False))
my_file = open('d_central_inv.html', 'w')
my_file.write(a.data)
my_file.close()
In [192]:
#The in-degree centrality for a node v is the fraction of nodes its incoming edges are connected to.
co3=nx.in_degree_centrality(DG)
co4=DataFrame(co3.items(), columns=['company','Degree_Centrality'])
co5=co4.sort(['Degree_Centrality'],ascending=False)
In [193]:
co5.head(n=10)
Out[193]:
company Degree_Centrality
93 geenapp-internet 0.030211
160 era-biotech 0.027190
63 groupalia 0.024169
178 scytl 0.021148
28 tiendeo 0.021148
218 nubelo 0.018127
116 privalia 0.018127
270 infantium 0.018127
51 uvinum 0.018127
43 bidaway-com 0.015106
In [220]:
# Scatter: company in-degree centrality vs total funds raised.
a=co5[co5.Degree_Centrality != 0]
# join with per-company funding totals (h2: company, money)
b=a.merge(h2, on='company')
x=list(b['Degree_Centrality'])
y=list(b['money'].astype(float))
plt.scatter(x,y)
plt.xlabel('Degree Centrality')
plt.ylabel('Total Fundraised (EUR)')
plt.title(r'Degree Centrality vs Total Fundraised')
plt.show()
In [95]:
# Write the top company in-degree-centrality rows to an HTML file.
cent_table = HTML(co5[0:9].to_html(index=False))
with open('d_central_comp.html', 'w') as out:
    out.write(cent_table.data)

Save DataFrames

In [185]:
# Persist the relationship table plus node-role and category lookups as CSV.
dfc.to_csv('barcelona_link.csv', sep=',', encoding='utf-8',index=False)

# companies (df1) and investors (df5) stacked into one role table
dfd=df1.append(df5)
dfd=dfd[['name', 'role']]
dfd.to_csv('barcelona_role.csv', sep=',', encoding='utf-8',index=False)

dfe=df2[['name','category']]
dfe.to_csv('barcelona_cat.csv', sep=',', encoding='utf-8',index=False)
In [223]:
b.money=b.money.astype(float)
b.head()
Out[223]:
company Degree_Centrality money
0 geenapp-internet 0.030211 103000.00
1 era-biotech 0.027190 10469198.42
2 groupalia 0.024169 35899996.00
3 scytl 0.021148 89427998.42
4 tiendeo 0.021148 834995.00
In [224]:
b.to_csv('deg_central.csv', sep=',', encoding='utf-8',index=False)
In [265]:
# Min-max normalize money to [0, 1] and build weighted edge tuples
# (investor -> company, weight = normalized money share).
dfc['money'] = dfc['money'].astype(float)
dfc['money'] = (dfc['money'] - dfc['money'].min()) / (dfc['money'].max() - dfc['money'].min())
# NOTE: y is built BEFORE the zero-nudge below, exactly as in the
# original, so zero-weight edges remain zero in the edge list.
y = list(dfc[['investor', 'company', 'money']].itertuples(index=False))
x1 = list(dfc['company'])
x2 = list(dfc['investor'])
# BUG FIX: chained indexing (dfc.money[dfc.money == 0] = ...) replaced
# with .loc -- the chained form raises SettingWithCopyWarning and may not
# write through. Zero weights are nudged to a tiny positive value.
dfc.loc[dfc['money'] == 0, 'money'] = 0.00000000000001
In [255]:
# Weighted directed graph from the (investor, company, weight) tuples.
DG1=nx.DiGraph()
DG1.add_weighted_edges_from(y)
pos=nx.spring_layout(DG1)
# companies: small green nodes; investors: larger blue nodes
nx.draw_networkx_nodes(DG1,pos,nodelist=x1,node_size=50,node_color= 'g')
nx.draw_networkx_nodes(DG1,pos,nodelist=x2,node_size=80, node_color= 'b')
nx.draw_networkx_edges(DG1,pos)
Out[255]:
<matplotlib.collections.LineCollection at 0xad2c36c>
In [309]:
pr=nx.pagerank(DG1)
In [310]:
# Keep PageRank scores only for companies: remove every node that appears
# solely as an investor (present in x2 but never in x1).
for investor_node in set(x2):
    if investor_node not in x1:
        del pr[investor_node]
In [315]:
# Scatter: PageRank (weighted digraph) vs total funds raised per company.
co6=DataFrame(pr.items(), columns=['company','pagerank'])
b=co6.merge(h2, on='company')
x=list(b['pagerank'])
y=list(b['money'].astype(float))
plt.scatter(x,y)
plt.xlabel('PageRank')
plt.ylabel('Total Fundraised (EUR)')
plt.title(r'PageRank vs Total Fundraised')
plt.show()
In [317]:
b.money=b.money.astype(float)
Out[317]:
company pagerank money
0 akamon-entertainment 0.006464 2800000
1 ebdsoft 0.002360 3594500
2 compring 0.005577 300000
3 bettymovil 0.002279 1580000
4 tiendeo 0.009005 834995
In [318]:
b.to_csv('pagerank.csv', sep=',', encoding='utf-8',index=False)

Category Score

In [383]:
# Restrict the category table to companies present in the funding ranking.
dcat=df2[['name','category']]
m=list(h2.company)
In [384]:
# Keep only category rows for companies that actually raised funding.
# FIX: the original overwrote dcat['name'] with the sentinel string
# 'false' and then filtered it out -- a chained assignment on a sliced
# frame (SettingWithCopyWarning). A boolean isin filter selects exactly
# the same rows without mutating dcat.
dc = dcat[dcat['name'].isin(m)]
In [385]:
# Min-max normalize total funding to [0, 1].
# NOTE(review): this mutates h2['money'] in place, so any later use of h2
# sees normalized scores, not EUR amounts -- confirm no cell below (or
# re-run out of order) expects the raw totals.
h2['money']=h2['money'].astype(float)
h2['money'] = (h2['money'] - h2['money'].min()) / (h2['money'].max() - h2['money'].min())
In [387]:
# company -> normalized funding score lookup table
rt=dict(zip(h2.company, h2.money))
In [388]:
# Attach each company's normalized score to its category rows.
# NOTE(review): assigning a new column on the dc slice may raise
# SettingWithCopyWarning; .loc would be the safe form.
dc['score']=dc['name'].apply(lambda x: rt[x])
In [389]:
dc.head(20)
Out[389]:
name category score
0 strands Artificial Intelligence 0.109814
1 strands Reviews and Recommendations 0.109814
2 strands Software 0.109814
3 strands Personal Finance 0.109814
4 strands E-Commerce 0.109814
5 strands Personalization 0.109814
7 akamon-entertainment Games 0.007077
8 veeva Content 0.007986
9 veeva Software 0.007986
10 veeva CRM 0.007986
11 veeva Life Sciences 0.007986
12 veeva Enterprise Software 0.007986
13 3scale Payments 0.009983
14 3scale Software 0.009983
15 3scale Infrastructure 0.009983
16 3scale Developer APIs 0.009983
17 3scale Services 0.009983
18 3scale Enterprise Software 0.009983
19 trovit Employment 0.003255
20 trovit Cars 0.003255
In [390]:
# Category score = sum of normalized funding over all companies tagged
# with that category, highest first.
dc1=dc[['category','score']]
dl1=dc1[['category','score']].groupby('category').sum().reset_index()
dl2=dl1.sort(['score'],ascending=False)
In [393]:
dl2.head(20)
Out[393]:
category score
61 Fashion 1.721666
48 E-Commerce 1.152117
124 Retail 1.095642
41 Design 0.720301
115 Printing 0.720301
145 Software 0.581629
50 Electronics 0.226776
86 Law Enforcement 0.226018
18 Biotechnology 0.191800
55 Enterprise Software 0.175419
125 Reviews and Recommendations 0.120252
9 Artificial Intelligence 0.109981
66 Games 0.109883
110 Personalization 0.109814
109 Personal Finance 0.109814
140 Social Media 0.101359
96 Mobile 0.097313
114 Price Comparison 0.090733
59 Facebook Applications 0.088069
139 Social Games 0.088069
In [397]:
dl2['pct']=dl2['score']*100/(dl2.score.sum())
dl2.head()
Out[397]:
category score pct
61 Fashion 1.721666 18.711779
48 E-Commerce 1.152117 12.521690
124 Retail 1.095642 11.907894
41 Design 0.720301 7.828530
115 Printing 0.720301 7.828530
In [398]:
dl2.to_csv('cat_score.csv', sep=',', encoding='utf-8',index=False)