%pylab inline import apachelog, sys fformat = r'%V %h %l %u %t \"%r\" %>s %b \"%i\" \"%{User-Agent}i\" %T' p = apachelog.parser(fformat) sample_string = 'koldunov.net 85.26.235.202 - - [16/Mar/2013:00:19:43 +0400] "GET /?p=364 HTTP/1.0" 200 65237 "http://koldunov.net/?p=364" "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11" 0' data = p.parse(sample_string) data log = open('access_log_for_pandas').readlines() log_list = [] for line in log: try: data = p.parse(line) except: sys.stderr.write("Unable to parse %s" % line) data['%t'] = data['%t'][1:12]+' '+data['%t'][13:21]+' '+data['%t'][22:27] log_list.append(data) import pandas as pd import numpy as np from pandas import Series, DataFrame, Panel df = DataFrame(log_list) df[0:2] del df['%T']; del df['%V']; del df['%i']; del df['%l']; del df['%u']; del df['%{User-Agent}i'] df = df.rename(columns={'%>s': 'Status', '%b':'b', '%h':'IP', '%r':'Request', '%t': 'Time'}) df.head() df.index = pd.to_datetime(df.pop('Time')) df['Status'] = df['Status'].astype('int') df['b'][93] def dash2nan(x): if x == '-': x = np.nan else: x = float(x)/1048576. return x df['b'] = df['b'].apply(dash2nan) df['b'].plot() df_s = df['b'].resample('5t', how='count') df_s.plot() df_b = df['b'].resample('10t', how=['count','sum']) df_b['count'].plot( color='r') legend() df_b['sum'].plot(secondary_y=True) df_b.corr() df_b['2013-03-16 6:00':'2013-03-16 10:00']['sum'].plot() df[df['b']>20] cc = df[df['b']<20] cc.b.hist(bins=10) cc = df[df['b']<0.3] cc.b.hist(bins=100) cc = df[(df['b']>0.2)&(df['b']<0.25)] cc.b.hist(bins=100) cc = df[(df['b']>0.220)&(df['b']<0.224)] cc.head() t_span = '2H' df_404 = df['Status'][df['Status'] == 404].resample(t_span, how='count') df_403 = df['Status'][df['Status'] == 403].resample(t_span, how='count') df_301 = df['Status'][df['Status'] == 301].resample(t_span, how='count') df_304 = df['Status'][df['Status'] == 304].resample(t_span, how='count') df_200 = df['Status'][df['Status'] == 200].resample(t_span, how='count') status_df = DataFrame({'Not Found':df_404, 'Forbidden':df_403, 'Moved Permanently':df_301, 'Not Modified':df_304, 'OK':df_200,}) status_df.head() status_df.plot(figsize=(10, 3)) status_df[['Not Found','Forbidden','Moved Permanently','Not Modified']].plot(kind='barh', stacked=True, figsize=(10, 7)) grouped_status = df.groupby('Status') grouped_status.head(2) grouped_status.size().plot(kind='bar') t_span = '30t' grouped_status.get_group(301)['Status'].resample(t_span, how='count').plot(color='g', label='301') legend() grouped_status.get_group(200)['Status'].resample(t_span, how='count').plot(color='b', secondary_y=True, label='200') ips = df.groupby('IP').size() ips.sort() ips[-10:].plot(kind='barh') ips_fd = DataFrame({'Number of requests':ips[-10:]}) ips_fd = ips_fd.sort(columns='Number of requests', ascending=False) ips_fd ips_status = df.groupby(['IP', 'Status']).size() ips_status.sort() ips_status[-20:].plot(kind='barh') import pygeoip gi = pygeoip.GeoIP('./GeoLiteCity.dat', pygeoip.MEMORY_CACHE) ipcon = gi.record_by_addr('64.233.161.99') ipcon ipcon = [] for iip in ips.index: rres = gi.record_by_addr(iip) # rres['ip'] = iip rres['Number'] = ips[iip] #delete some fields we don't need del rres['area_code'] del rres['dma_code'] del rres['metro_code'] del rres['postal_code'] del rres['region_name'] del rres['time_zone'] del rres['country_code'] ipcon.append(rres) reg = DataFrame(ipcon, index = ips.index) reg.head() country = reg.groupby('country_code3') ff = country.Number.agg('sum').copy() ff.sort( ) ff[-10:].plot(kind='barh') city = reg.groupby('city') ff = city.Number.agg('sum').copy() ff.sort( ) ff[-20:].plot(kind='barh', figsize=(5,8)) from mpl_toolkits.basemap import Basemap import matplotlib.cm as cm m = Basemap(projection='robin',lon_0=0,resolution='c') x, y = m(reg['longitude'],reg['latitude']) figure(figsize=(15,15)) m.drawcoastlines(linewidth=0.25) m.drawcountries(linewidth=0.25) m.fillcontinents(color='coral',lake_color='aqua') m.drawmapboundary(fill_color='white') m.drawmeridians(np.arange(0,360,30)) m.drawparallels(np.arange(-90,90,30)) m.scatter(x,y,s=reg['Number']*3,c=reg['Number']/5,marker='o',zorder=4, cmap=cm.Paired,alpha=0.5) m = Basemap(projection='cyl',llcrnrlat=35,urcrnrlat=72,\ llcrnrlon=-10,urcrnrlon=50,resolution='l') x, y = m(reg['longitude'],reg['latitude']) figure(figsize=(15,15)) m.drawcoastlines(linewidth=0.25) m.drawcountries(linewidth=0.25) m.fillcontinents(color='white',lake_color='aqua') m.drawmapboundary(fill_color='aqua') m.drawmeridians(np.arange(0,360,30)) m.drawparallels(np.arange(-90,90,30)) m.scatter(x,y,s=reg['Number']*30,c=reg['Number'],marker='o',zorder=4, cmap=cm.gist_ncar ,alpha=0.3) def gmet(x): x = x.split() return x[0] df['Method'] = df.Request.apply(gmet) met = df.groupby(['Method', 'IP']).size() met.head post = met['POST'].copy() post.sort() post[-10:].plot(kind='barh') post[-5:] df[df.IP =='77.222.40.65'][0:5] df[df.IP =='160.75.185.7'][0:10] df[df.IP =='213.248.47.229'][0:10]