In [1]:
#use regular expression to parse log entry
import re
#http://docs.python.org/2/library/re.html
#awkward historical log format:
log_entry='proxy4.utsa.edu.au 151.217.6.9 - -|- [11/Apr/2013:23:57:14 -0400] [Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0|-=151.217.60.103|0|http://arxiv.org/|proxy4.utas.edu.au.1364191674910933] "GET /find/all/1/all:+2013arXiv13011419D/0/1/0/all/0/1 HTTP/1.0" 200 10737'
#re.match anchors at the start of the string and captures one group per (...):
#  three space-separated tokens, two tokens joined by '|', a [bracketed] timestamp,
#  a second [bracketed] section of '|'- and '='-separated fields, the quoted request line,
#  then the status code and the byte count
mm = re.match(r"(\S+) (\S+) (\S+) (\S+?)\|(\S+) \[(.*?)\] \[(.*)\|(.*?)=(.*?)\|(\d+)\|(.*)\|(.*?)\] \"(.*)\" (\d+) (\S+)",log_entry)
#re.match returns None when the line does not fit the pattern; say so explicitly
#instead of failing later with an AttributeError on mm.groups()
if mm is None:
    raise ValueError("log entry does not match the expected format: %r" % log_entry)
#one key per capture group, in pattern order
keys=['host','ip','logname','tapiruid','tapirsid','datetime','ua','xfrom','xfor','delay','referer','cookie','request','status','bytes']
#mm.groups() is the tuple of captured substrings
entry=dict(zip(keys,mm.groups()))
#single-argument print(...) produces the same text under Python 2 and Python 3
for k in keys: print(k+': '+entry[k])
host: proxy4.utsa.edu.au
ip: 151.217.6.9
logname: -
tapiruid: -
tapirsid: -
datetime: 11/Apr/2013:23:57:14 -0400
ua: Mozilla/5.0 (Windows NT 6.1; rv:19.0) Gecko/20100101 Firefox/19.0
xfrom: -
xfor: 151.217.60.103
delay: 0
referer: http://arxiv.org/
cookie: proxy4.utas.edu.au.1364191674910933
request: GET /find/all/1/all:+2013arXiv13011419D/0/1/0/all/0/1 HTTP/1.0
status: 200
bytes: 10737
In [2]:
#next need to parse the datetime
import time
#http://docs.python.org/2/library/time.html
In [3]:
def timestr_utc(time_string):
  """Convert a log timestamp such as '11/Apr/2013:23:57:14 -0400' to Unix seconds.

  time_string: 'DD/Mon/YYYY:HH:MM:SS ZONE', where ZONE is either a numeric
    offset (+HHMM / -HHMM) or a timezone name that time.strptime's %Z accepts.
  Returns the integer number of seconds since the Unix epoch, or None (after
  printing "bad time ...") when the string cannot be parsed.
  """
  #local import so the function does not depend on an import made in another cell
  import calendar
  try:
     stamp,_,zone=time_string.rpartition(' ')
     if len(zone)==5 and zone[0] in '+-' and zone[1:].isdigit():
        #numeric offset: read the clock fields as if they were UTC, then remove the
        #offset; the result does not depend on the timezone of the machine running this
        offset=(int(zone[1:3])*60+int(zone[3:5]))*60
        if zone[0]=='-':
           offset=-offset
        return calendar.timegm(time.strptime(stamp,'%d/%b/%Y:%H:%M:%S'))-offset
     #zone given by name (e.g. the output of time.strftime with %Z):
     #time.mktime interprets the parsed fields as local time
     return int(time.mktime(time.strptime(time_string,'%d/%b/%Y:%H:%M:%S %Z')))
  except ValueError:
     print("bad time %s" % time_string)
     return(None)
In [4]:
#current time
print 'current time is',time.time()
time_string=time.strftime('%d/%b/%Y:%H:%M:%S %Z')
print 'current time string is',time_string
utc_time = timestr_utc(time_string)
print 'and converts back to',utc_time,'seconds'
print "in years that's roughly",utc_time/(60*60*24*365.25),'years'
current time is 1366481747.77
current time string is 20/Apr/2013:14:15:47 EDT
and converts back to 1366481747 seconds
in years that's roughly 43.3011935952 years
In [5]:
#recall that on 31 Dec 1969 at 7pm eastern time the ball at Times Square descended
#with great fanfare and announced to the world "0 Unix time"
for ts in ["31/Dec/1969:18:59:59 -0500","31/Dec/1969:19:00:00 -0500","31/Dec/1969:19:00:01 -0500"]:
    print ts,'converts to',timestr_utc(ts)
31/Dec/1969:18:59:59 -0500 converts to -1
31/Dec/1969:19:00:00 -0500 converts to 0
31/Dec/1969:19:00:01 -0500 converts to 1
In [20]:
#http://imgs.xkcd.com/comics/bug.png
from contextlib import closing
from urllib2 import urlopen
from IPython.display import Image
#closing() guarantees the HTTP response is closed once the bytes have been read,
#even if read() raises; the original left the response open
with closing(urlopen('http://imgs.xkcd.com/comics/bug.png')) as response:
    bug = response.read()
#last expression of the cell, so the notebook renders the downloaded image bytes
Image(bug)
Out[20]:
In [6]:
#now check what happens when the clocks were turned back at 2a.m., and 1:30a.m. occurred twice:
time_string1 = "04/Nov/2012:01:30:13 -0400"
time_string2 = "04/Nov/2012:01:30:13 -0500"
utc1=timestr_utc(time_string1)
utc2=timestr_utc(time_string2)
print time_string1,'converts to',utc1
print time_string2,'converts to',utc2
print utc1,'-',utc2,'=',utc2-utc1,'seconds difference, and convert back to:'
#then see how they're translated back
print time.strftime('%d/%b/%Y:%H:%M:%S %Z',time.localtime(utc1))
print time.strftime('%d/%b/%Y:%H:%M:%S %Z',time.localtime(utc2))
04/Nov/2012:01:30:13 -0400 converts to 1352007013
04/Nov/2012:01:30:13 -0500 converts to 1352010613
1352007013 - 1352010613 = 3600 seconds difference, and convert back to:
04/Nov/2012:01:30:13 EDT
04/Nov/2012:01:30:13 EST
In [7]:
#some examples from http://networkx.github.io/documentation/latest/examples/
import networkx as nx
In [8]:
G=nx.Graph()
G.add_node("spam")
G.add_edge(1,2)
print 'nodes:',G.nodes()
print 'edges:',G.edges()
nodes: [1, 2, 'spam']
edges: [(1, 2)]
In [9]:
#http://networkx.github.io/documentation/latest/examples/drawing/house_with_colors.html
G=nx.house_graph()
# explicitly set positions: nodes 0-3 on the corners of the unit square, node 4 centred above it
corners=[(0,0),(1,0),(0,1),(1,1),(0.5,2.0)]
pos=dict(enumerate(corners))
# node 4 at size 2000 in the default color, the four corner nodes larger and blue
nx.draw_networkx_nodes(G,pos,nodelist=[4],node_size=2000)
nx.draw_networkx_nodes(G,pos,nodelist=[0,1,2,3],node_size=3000,node_color='b')
nx.draw_networkx_edges(G,pos,width=6,alpha=0.5)
axis('off')
# trailing None keeps the cell from echoing the return value of the last call
None
In [10]:
#http://networkx.github.io/documentation/latest/examples/drawing/ego_graph.html
###just draw friends network of highest degree node in preferential attachment network
from operator import itemgetter
# Create a BA model graph
n=1000
m=2
G=nx.generators.barabasi_albert_graph(n,m)
# find node with largest degree: order the (node, degree) pairs by degree and take the last one
node_and_degree=G.degree()
by_degree=sorted(node_and_degree.items(),key=itemgetter(1))
largest_hub,degree=by_degree[-1]
# Create ego graph of main hub
hub_ego=nx.ego_graph(G,largest_hub)
# Draw graph
figure(figsize=(4,4))
pos=nx.spring_layout(hub_ego)
nx.draw(hub_ego,pos,with_labels=False,node_size=50,node_color='b')
# Draw ego as large and red
nx.draw_networkx_nodes(hub_ego,pos,nodelist=[largest_hub],node_size=300,node_color='r')
Out[10]:
<matplotlib.collections.PathCollection at 0x10824da10>
In [11]:
#http://networkx.github.io/documentation/latest/examples/drawing/random_geometric_graph.html
#random geometric graph
#
G=nx.random_geometric_graph(200,0.125)
# position is stored as node attribute data for random_geometric_graph
pos=nx.get_node_attributes(G,'pos')

# find node near center (0.5,0.5): remember the node with the smallest
# squared distance to the center seen so far
dmin=1
ncenter=0
for n,(x,y) in pos.items():
    d=(x-0.5)**2+(y-0.5)**2
    if d>=dmin:
        continue
    ncenter,dmin=n,d

# color by path length from node near center
p=nx.single_source_shortest_path_length(G,ncenter)

figure(figsize=(8,8))
nx.draw_networkx_edges(G,pos,nodelist=[ncenter],alpha=0.4)
nx.draw_networkx_nodes(G,pos,
                       nodelist=p.keys(),
                       node_color=p.values(),
                       node_size=80,
                       cmap=plt.cm.Reds_r)

# leave a small margin around the unit square the positions live in
xlim(-0.05,1.05)
ylim(-0.05,1.05)
axis('off')
None
In [22]:
#http://networkx.github.io/documentation/latest/examples/graph/degree_sequence.html
#Random graph from given degree sequence.

z=[5,3,3,3,3,2,2,2,1,1,1]
print nx.is_valid_degree_sequence(z)

print("Configuration model")
G=nx.configuration_model(z)  # configuration model
degree_sequence=list(nx.degree(G).values()) # degree sequence
print("Degree sequence %s" % degree_sequence)
print("Degree histogram")
hist={}
for d in degree_sequence:
    if d in hist:
        hist[d]+=1
    else:
        hist[d]=1
print("degree #nodes")
for d in hist:
    print('%d %d' % (d,hist[d]))
True
Configuration model
Degree sequence [5, 3, 3, 3, 3, 2, 2, 2, 1, 1, 1]
Degree histogram
degree #nodes
1 3
2 3
3 4
5 1
In [13]:
#http://networkx.github.io/documentation/latest/examples/graph/erdos_renyi.html
#Create a G{n,m} random graph with n nodes and m edges and report some properties.
#sys is imported here because this cell writes to sys.stdout and no cell shown in this notebook imports it
import sys

n=10 # 10 nodes
m=20 # 20 edges

G=nx.gnm_random_graph(n,m)

# some properties: one line per node with its degree and clustering coefficient
print("node degree clustering")
for v in nx.nodes(G):
    print('%s %d %f' % (v,nx.degree(G,v),nx.clustering(G,v)))

# print the adjacency list to terminal
nx.write_adjlist(G,sys.stdout)
node degree clustering
0 2 0.000000
1 2 0.000000
2 4 0.500000
3 5 0.500000
4 5 0.500000
5 5 0.200000
6 4 0.666667
7 5 0.300000
8 5 0.200000
9 3 0.000000
# gnm_random_graph(10,20)
0 8 5 
1 9 7 
2 8 4 6 7 
3 8 4 5 6 7 
4 8 5 6 
5 9 7 
6 7 
7 
8 9 
9 
In [14]:
#http://networkx.github.io/documentation/latest/examples/graph/karate_club.html
#Zachary's Karate Club graph
#Data file from:
#http://vlado.fmf.uni-lj.si/pub/networks/data/Ucinet/UciData.htm

G=nx.karate_club_graph()
# draw the whole graph with default layout and labels, axes hidden
figure(figsize=(8,8))
nx.draw_networkx(G)
axis('off')
# then list every node with its degree
print("Node Degree")
for v in G:
    print('%s %s' % (v,G.degree(v)))
Node Degree
0 16
1 9
2 10
3 6
4 3
5 4
6 4
7 4
8 5
9 2
10 3
11 1
12 2
13 5
14 2
15 2
16 2
17 2
18 2
19 3
20 2
21 2
22 2
23 5
24 3
25 3
26 2
27 4
28 3
29 4
30 4
31 6
32 12
33 17
In [21]:
#http://networkx.github.io/documentation/latest/examples/drawing/giant_component.html
#illustrates sudden appearance of giant connected component in a binomial random graph.
#math is imported here because p_conn below calls math.log and no cell shown in this notebook imports it
import math

layout=nx.graphviz_layout
n=150  # 150 nodes
# p value at which giant component (of size log(n) nodes) is expected
p_giant=1.0/(n-1)
# p value at which graph is expected to become completely connected
p_conn=math.log(n)/float(n)

# the following range of p values should be close to the threshold
pvals=[0.003, 0.006, 0.008, 0.015]

figure(figsize=(8,8))
region=220 # for pylab 2x2 subplot layout
subplots_adjust(left=0,right=1,bottom=0,top=0.95,wspace=0.01,hspace=0.01)
for p in pvals:
    G=nx.binomial_graph(n,p)
    pos=layout(G)
    # 221, 222, 223, 224: the four panels of the 2x2 grid, one per p value
    region+=1
    subplot(region)
    title("p = %6.3f"%(p))
    nx.draw(G,pos, with_labels=False, node_size=10)
    # identify largest connected component
    # (code takes the first subgraph returned as the largest - TODO confirm ordering for the installed networkx)
    Gcc=nx.connected_component_subgraphs(G)
    G0=Gcc[0]
    nx.draw_networkx_edges(G0,pos, with_labels=False,
                           edge_color='r', width=6.0)
    # show other connected components
    for Gi in Gcc[1:]:
        if len(Gi)>1:
            nx.draw_networkx_edges(Gi,pos,with_labels=False,
                                   edge_color='r',alpha=0.3,width=5.0)
In [12]:
#http://networkx.github.io/documentation/latest/examples/graph/napoleon_russian_campaign.html
#Minard's data from Napoleon's 1812-1813  Russian Campaign.
#http://www.math.yorku.ca/SCS/Gallery/minard/minard.txt

import string
def minard_graph():
  """Build the embedded Minard campaign data as graphs plus a city lookup.

  Returns (graphs, cities):
    graphs - list of three nx.Graph objects, one per data block. Rows are
      numbered from 0 within a block; each graph carries two ad-hoc dict
      attributes, .pos {row: (x, y)} and .pop {row: size}, and every row
      after the first is linked to the previous row by an edge whose
      attribute dict is {flag: group} (flag is the row's A/R column,
      group its last column).
    cities - dict mapping city name to its (x, y) location.
  """
  data1="""\
24.0,54.9,340000,A,1
24.5,55.0,340000,A,1
25.5,54.5,340000,A,1
26.0,54.7,320000,A,1
27.0,54.8,300000,A,1
28.0,54.9,280000,A,1
28.5,55.0,240000,A,1
29.0,55.1,210000,A,1
30.0,55.2,180000,A,1
30.3,55.3,175000,A,1
32.0,54.8,145000,A,1
33.2,54.9,140000,A,1
34.4,55.5,127100,A,1
35.5,55.4,100000,A,1
36.0,55.5,100000,A,1
37.6,55.8,100000,A,1
37.7,55.7,100000,R,1
37.5,55.7,98000,R,1
37.0,55.0,97000,R,1
36.8,55.0,96000,R,1
35.4,55.3,87000,R,1
34.3,55.2,55000,R,1
33.3,54.8,37000,R,1
32.0,54.6,24000,R,1
30.4,54.4,20000,R,1
29.2,54.3,20000,R,1
28.5,54.2,20000,R,1
28.3,54.3,20000,R,1
27.5,54.5,20000,R,1
26.8,54.3,12000,R,1
26.4,54.4,14000,R,1
25.0,54.4,8000,R,1
24.4,54.4,4000,R,1
24.2,54.4,4000,R,1
24.1,54.4,4000,R,1"""
  data2="""\
24.0,55.1,60000,A,2
24.5,55.2,60000,A,2
25.5,54.7,60000,A,2
26.6,55.7,40000,A,2
27.4,55.6,33000,A,2
28.7,55.5,33000,R,2
29.2,54.2,30000,R,2
28.5,54.1,30000,R,2
28.3,54.2,28000,R,2"""
  data3="""\
24.0,55.2,22000,A,3
24.5,55.3,22000,A,3
24.6,55.8,6000,A,3
24.6,55.8,6000,R,3
24.2,54.4,6000,R,3
24.1,54.4,6000,R,3"""
  cities="""\
24.0,55.0,Kowno
25.3,54.7,Wilna
26.4,54.4,Smorgoni
26.8,54.3,Moiodexno
27.7,55.2,Gloubokoe
27.6,53.9,Minsk
28.5,54.3,Studienska
28.7,55.5,Polotzk
29.2,54.4,Bobr
30.2,55.3,Witebsk
30.4,54.5,Orscha
30.4,53.9,Mohilow
32.0,54.8,Smolensk
33.2,54.9,Dorogobouge
34.3,55.2,Wixma
34.4,55.5,Chjat
36.0,55.5,Mojaisk
37.6,55.8,Moscou
36.6,55.3,Tarantino
36.5,55.0,Malo-Jarosewii"""
  # city name -> (x, y)
  city_xy={}
  for row in cities.split('\n'):
    cx,cy,name=row.split(',')
    city_xy[name]=(float(cx),float(cy))

  graphs=[]

  for block in [data1,data2,data3]:
    G=nx.Graph()
    G.pos={} # location
    G.pop={} # size
    previous=None
    for index,row in enumerate(block.split('\n')):
      px,py,size,flag,group=row.split(',')
      G.pos[index]=(float(px),float(py))
      G.pop[index]=int(size)
      # chain each row to the one before it; the first row has nothing to link to
      if previous is not None:
        # third positional argument is the edge attribute dict
        # (networkx 1.x style call - verify before upgrading networkx)
        G.add_edge(index,previous,{flag:int(group)})
      previous=index
    graphs.append(G)

  return graphs,city_xy


g,city=minard_graph()

figure(1,figsize=(11,5))

# one color per graph, taken from the front of the list in order
colors=['b','g','r']
for G in g:
    c=colors.pop(0)
    # marker size is the stored size of each node divided by 300
    node_size=[int(G.pop[n]/300.0) for n in G]
    nx.draw_networkx_edges(G,G.pos,width=4,edge_color=c,alpha=0.5)
    nx.draw_networkx_nodes(G,G.pos,node_color=c,node_size=node_size,alpha=0.5)
    # small black dot on top of every node
    nx.draw_networkx_nodes(G,G.pos,node_color='k',node_size=5)

# label each city slightly above its location
for c,(x,y) in city.items():
    text(x,y+0.1,c)
In [ ]: