Author: Marius Millea (cosmicmar.com)

A notebook for computing various pretty plots with statistics about a BOINC project.

If you use this notebook, please consider giving reference to the original post.


In [1]:
#a number of non-standard libraries here which you may have to install
import MySQLdb as mdb
from pandas import DataFrame
import seaborn as sns
sns.set_context("notebook", font_scale=1.5)
from prettyplotlib import *
import cPickle as pickle
from mpl_toolkits.basemap import Basemap
from datetime import datetime, timedelta
import time
import os, os.path as osp
import time
from multiprocessing.dummy import Pool
import subprocess
In [2]:
#for some reason this results in prettier SVGs
def savesvg(f,**kwargs):
    savefig('%s.pdf'%f,**kwargs)
    os.system('pdf2svg %s.pdf %s.svg && rm %s.pdf'%((f,)*3))
In [3]:
con = mdb.connect('localhost', 'root', '', 'cosmohome');
In [4]:
cur=con.cursor()
curd=con.cursor(mdb.cursors.DictCursor)
In [5]:
cur.execute("SELECT MAX(received_time) FROM result"); lastres=cur.fetchall()
In [6]:
print "Number of active hosts:"
cur.execute("DROP TABLE IF EXISTS active_hosts")
cur.execute("CREATE TABLE active_hosts "
            "SELECT * FROM host h "
            "WHERE (SELECT MIN(%i-received_time) FROM result WHERE hostid=h.id)<%i"%(lastres[0][0],30*24*60*60))
Number of active hosts:
Out[6]:
5715L
In [7]:
print "Number of active users:"
cur.execute('SELECT userid FROM active_hosts'); len(set(cur.fetchall()))
Number of active users:
Out[7]:
4385
In [8]:
gb4=4*1024*1024*1024
cur.execute("SELECT COUNT(1),SUM(total_credit) FROM active_hosts WHERE m_nbytes<%i"%gb4); n32=cur.fetchone()
cur.execute("SELECT COUNT(1),SUM(total_credit) FROM active_hosts WHERE m_nbytes>%i"%gb4); n64=cur.fetchone()

dh=0.25
norm=n64[0]/n64[1]
bar(arange(2)-dh,[n64[0],n32[0]],width=dh,label='Number of Computers')
bar(arange(2),array([n64[1],n32[1]])*norm,width=dh,color='SeaGreen',label='Number of Credits (Arbitr. Units)')
xticks([0,1])
gca().set_xticklabels(['64bit','32bit'])
xlim(-1+dh,2-dh)
ylim(0,4400)
legend(loc='upper right')
ylabel('Number of Computers')

savesvg('plots/32vs64')
In [9]:
cur.execute("SELECT os_name FROM active_hosts;")
allos=[x[0] for x in set(cur.fetchall())]
nos=[cur.execute("SELECT null FROM active_hosts WHERE os_name='%s';"%o) for o in allos]
def ncred(os):
    cur=con.cursor()
    cur.execute("SELECT total_credit FROM active_hosts WHERE os_name='%s';"%os)
    return sum(cur.fetchall())

credos=[ncred(o) for o in allos]
In [10]:
fig=figure()
fig.set_size_inches(6,6)
ax=gca()

n=8
dat=DataFrame(zip(allos,nos,credos)).sort(columns=1,ascending=True)
dat[0][dat[0]=='']='Unspecified'
dat.iloc[-n-1,:]=['All Others']+list(dat.iloc[:-n-1,1:].sum())
dat=dat[-n-1:]

dh=0.35
norm=1./dat[2].max()*dat[1].max()
barh(range(n+1),dat[1],height=dh,label='Number of Computers')
barh(arange(n+1)-dh,dat[2]*norm,height=dh,color='SeaGreen',label='Number of Credits (Arbitr. Units)')

ax.set_yticks(range(n+1))
ax.set_yticklabels(list(dat[0]));

legend(loc='lower right')

xlabel('Number of Computers')

savesvg('plots/hosts',bbox_inches='tight')
In [11]:
fig=figure()
fig.set_size_inches(2*6,6/1.6)
fig.subplots_adjust(wspace=0.3)

subplot(121)
cur.execute('SELECT n_bwdown FROM active_hosts h WHERE n_bwdown>0;')
hist(array(cur.fetchall())[:,0]/1024,range=(0,1000),bins=50);
xlabel('Download [kB/s]')
ylabel('Number of Computers')
ylim(0,300)

subplot(122)
cur.execute('SELECT n_bwup FROM active_hosts h WHERE n_bwup>0;')
hist(array(cur.fetchall())[:,0]/1024,range=(0,100),bins=50);
xlabel('Upload [kB/s]')
ylabel('Number of Computers')
ylim(0,300)

savesvg('plots/transfer',bbox_inches='tight')
In [12]:
cur.execute('SELECT p_ncpus FROM active_hosts;')
ncpus=array(cur.fetchall())[:,0]
hist(ncpus,bins=array(arange(17))+0.5);
xlabel('Number of CPUs')
ylabel('Number of Computers')

savesvg('plots/cpus')
In [13]:
cur.execute("SELECT avg_turnaround FROM active_hosts WHERE avg_turnaround>0;"); 
hist(array(cur.fetchall())[:,0]/60/60/24,range=(0,2e6/60/60/24),bins=50);
xlabel('Average Turnaround [days]')
ylabel('Number of Computers')
xlim(0,14)

savesvg('plots/turnaround')
In [14]:
if osp.exists('locs'): #cache the locations in this file after the first run
    locs=pickle.load(open("locs"))
else:
    cur.execute('SELECT external_ip_addr FROM active_hosts;')
    ips=array(cur.fetchall())[:,0]
    
    def getip(ip,ntry=4):
        try:
            dat = (ip,subprocess.check_output(['curl','--connect-timeout','3','http://www.telize.com/geoip/%s'%ip]))
            print ip
            return dat
        except:
            if ntry==0: return 
            else: getip(ip,ntry=ntry-1)
            
    pool=Pool(50)
    locs=pool.map(getip,set(ips))
    pickle.dump(locs,open('locs','w'))
    
lonlats=[l for l in [(lambda x: (x['longitude'],x['latitude']))(eval(l[1].strip())) for l in locs if l] if l!=(0,0)]
In [15]:
m = Basemap(projection='moll',lon_0=0,resolution='c')
m.drawcoastlines()
land='#152A59'; water='#C7B8B1'
m.fillcontinents(color=land,lake_color=water)
m.drawparallels(np.arange(-90.,120.,30.))
m.drawmeridians(np.arange(0.,420.,60.))
m.drawmapboundary(fill_color=water)
gcf().set_size_inches(15,15)
plot(*vectorize(m)(*array(lonlats).T),c='r',marker='.',ls='')

title('Active [email protected] Computers as of July 2015',y=1.03)
savefig('plots/world.svg',bbox_inches='tight')