# numpy, scipy, scipy.stats.stats, collections, random, time
from collections import Counter, defaultdict
from itertools import chain, combinations
from time import localtime
import json
from scipy import std, mean
from scipy.stats.stats import pearsonr
from matplotlib import pyplot as plt
# This lesson requires the following modules
from __future__ import unicode_literals, print_function, division
# And %pylab features
%pylab
# Show plots into browser
%matplotlib inline
"According to latest statistics,
it appears that you eat one chicken per year:
and, if that doesn't fit your budget,
you'll fit into statistic anyway,
because someone will eat two."
C. A. Salustri (1871 – 1950)
#
# Exercise: use this frame for the exercise
#
def ping_rtt(seconds=10):
    """Exercise frame: return a list of ping RTT values.

    Suggested steps:
      - get sample output,
      - find a solution in ipython,
      - test and paste the code here.
    """
    from course import sh
    raise NotImplementedError
# IPython shell escape: try the Linux ping syntax (-c count) first,
# falling back to the Windows one (-n count)
!ping -c3 www.google.it || ping -n3 www.google.it
# %load course/ping_rtt.py
__author__ = 'rpolli'
def ping_rtt():
"""
goal: slicing data
goal: using zip to transpose data
"""
from course import sh
import sys
cmd = "ping -c10 www.google.it"
if 'win' in sys.platform:
cmd = "ping -n10 www.google.it"
ping_output = sh(cmd)
filter_lines = (x.split() for x in ping_output if 'time' in x)
if 'win' in sys.platform:
ping_output = [x[6::2] for x in filter_lines]
else:
ping_output = [x[-4:-1:2] for x in filter_lines]
ttl, rtt = zip(*ping_output)
return [float(x.split("=")[1]) for x in rtt]
# Run the ping (requires network access) and print the RTT samples
rtt = ping_rtt()
print(*rtt)
A distribution $\delta$ shows the frequency of events, e.g. how many people ate $x$ chickens.
# We can generate a distribution with collections.Counter
from collections import Counter

distro = Counter(rtt)
print("Un-bucketed distribution:", distro)
# Splitting data in 1-second buckets (int() truncates the decimals)...
distro = Counter(int(sample) for sample in rtt)
print("One second bucket distribution:", distro)
Exercise: what is the behavior of defaultdict?
# defaultdict can be used to implement more complex behaviors.
from collections import defaultdict

distro = defaultdict(int)
# A single pass works even when rtt is a generator
for sample in rtt:
    distro[sample] += 1
print(distro)
# list.count works too, provided rtt is an in-memory list
distro2 = {sample: rtt.count(sample) for sample in set(rtt)}
print(distro2)
# Exercise: use this cell to benchmark those three approaches while increasing the size of rtt
$\sigma^{2}(X) := \frac{ \sum(x-\bar{x})^{2} }{n} $
$\sigma$ tells if $\delta$ is fair or not, and how much the mean ($\bar{x}$) is representative
`matplotlib.mlab.normpdf` is a smooth function approximating the histogram (note: removed from recent matplotlib releases — use `scipy.stats.norm.pdf` instead).
# The scipy top-level std/mean aliases were deprecated and later removed:
# take them directly from numpy (identical behavior, population std ddof=0)
from numpy import std, mean

fair = [1, 1]  # chickens
unfair = [0, 2]  # chickens
# Same mean!
assert mean(fair) == mean(unfair)
# Use standard deviation to tell the two distributions apart
print("is unfair?", std(fair))  # 0
print("is unfair?", std(unfair))  # 1
# Check your computed values vs the $\sigma$ returned by ping
"""
goal: remember to convert to numeric / float
goal: use scipy
goal: check stdev
"""
# numpy provides std/mean (the scipy aliases were removed); max/min are builtin
from numpy import std, mean

rtt = ping_rtt()
fmt_s = 'stdev: {:0.2}, mean: {}, min: {}, max: {}'
rtt_std, rtt_mean = std(rtt), mean(rtt)
rtt_max, rtt_min = max(rtt), min(rtt)
# BUG FIX: the original passed (rtt_max, rtt_min) in that order, so the
# "min:" label printed the maximum and vice versa. Match the format string.
print(fmt_s.format(rtt_std, rtt_mean, rtt_min, rtt_max))
time_d = { # mail delivered (eg. removed) between
    0: xxx,  # 00:00 - 00:59
    1: xxx,  # 01:00 - 01:59
    ...
}
# The data/maillog file contains postfix log lines: peek at the first 3
# (IPython shell escape: `ret` is the captured command output, line by line)
ret = !cat ../data/maillog
print(*ret[:3], sep='\n')
# Use the following cell to solve the exercise.
#
# Time distribution exercise frame
#
# The solution is stored base64-encoded so you don't read it by accident;
# show_solution (from the course helpers) presumably decodes and displays it
solution = "cmV0ID0gIWdyZXAgcmVtb3ZlZCBkYXRhL21haWxsb2cKaG91cnMgPSBbeFs6Ml0gZm9yIHggaW4g\ncmV0LmZpZWxkcygyKV0K"
from course import show_solution
show_solution(solution)
ret = !grep removed ../data/maillog
#print(ret.fields(2))
hours = [int(x[:2]) for x in ret.fields(2)]
print(hours[:10], "..")
time_d = Counter(hours)
print("The distribution is",time_d.viewitems())
# Exercise: find the 3 and 4 most common values of time_d
# To plot data..
from matplotlib import pyplot as plt

# and set the interactive mode
plt.ion()
# Plotting an histogram... use the explicit pyplot namespace:
# the bare `hist` only exists thanks to the %pylab magic, while the
# rest of the notebook consistently calls plt.hist (see the size cell)
frequency, slots, _ = plt.hist(hours)
plt.title("Hourly distribution")
# .. hist returns the bucket frequencies and edges: pair them up
from collections import OrderedDict

distribution = OrderedDict(zip(slots, frequency))
# Print it nicely with
import json

print(json.dumps(distribution, indent=1))
size_d = { # mail size between
    0: xxx,  # 0 - 10k
    1: xxx,  # 10k - 20k
    ...
}
#
# Exercise frame
#
# Get the sigma_size from maillog
solution = b'CnJldCA9ICFncmVwIHNpemUgZGF0YS9tYWlsbG9nCnNpemVzID0gW2ludCh4WzU6LTFdKSBmb3Ig\neCBpbiByZXQuZmllbGRzKDcpXQoK\n'
from course import show_solution
# show_solution(solution)
def maillog_size():
ret = !grep size ../data/maillog
sizes = [int(x[5:-1]) for x in ret.fields(7)]
return sizes
sizes = maillog_size()
frequency, slots, _ = plt.hist(sizes, bins=5)
# We can use our time_d to simulate the load during the day
from time import localtime

current_hour = localtime().tm_hour
mail_per_minute = time_d[current_hour] / 60  # minutes in hour
print("Estimated frequency is", mail_per_minute)
# And sizes to create a mail load generator
mean_size, sigma_size = mean(sizes), std(sizes)
print(mean_size, sigma_size)
# A mail load generator creating attachments of a given size...
from random import gauss


def mail_size_g():
    """Draw a random attachment size from N(mean_size, sigma_size)."""
    floor = min(sizes)
    size = gauss(mean_size, sigma_size)  # a random number
    # gauss can yield negative values: clamp to the smallest observed size
    return size if size > floor else floor
# How far is our estimation from reality?
# The two ratios should converge towards 1 as the sample grows
for emails in 5000, 10000, 100000:
    # BUG FIX: xrange is Python-2-only; range works on both 2 and 3
    X = [mail_size_g() for _ in range(emails)]
    print(emails, mean(X) / mean_size, std(X) / sigma_size)
# Homework: write a script to simulate the load of your mailserver and tweak your mail_size_g
# Hint: create a set of attachment files: 1k.out, 10k.out, 100k.out, 1000k.out, 5000k.out
# Hint: use mail_size_g() to select one of those files instead of creating a given email
To better simulate mail size, consider using a Cauchy distribution.
See www.ieee802.org/16/tgm/contrib/C80216m-07_122.pdf
# Let's plot the following datasets,
# taken from a 4-hour distribution
mail_sent = [1, 5, 500, 250, 100, 7]
kB_s = [70, 300, 29000, 12500, 450, 500]
# A scatter plot can suggest relations between data
plt.scatter(mail_sent, kB_s)
plt.title("I am a SCATTER plot!")
The Pearson Coefficient $\rho$ is a relation indicator.
# pearsonr should be imported from scipy.stats:
# the scipy.stats.stats module was deprecated and later removed
from scipy.stats import pearsonr

ret = pearsonr(mail_sent, kB_s)
print(ret)
#> (0.9823, 0.0004)
correlation, probability = ret
A *combination* is a way of selecting members from a grouping, such that the order of selection does not matter.
# Given a table with many data series
from course import table
# table maps a series name to its list of samples, eg.
#table = {...
# 'cpu_usr': [10, 23, 55, ..],
# 'byte_in': [2132, 3212, 3942, ..],
# }
print(*table.keys(),sep='\n')
# We can combine all their names with itertools.combinations
# (iterating a dict yields its keys; order of selection doesn't matter)
from itertools import combinations
list(combinations(table,2))[:10]
When looking for interesting data, we can calculate correlation between every combination of data series and concentrate on those over a given threshold
# We can try every combination between data series
# and check if there's some correlation
from itertools import combinations
from scipy.stats import pearsonr  # scipy.stats.stats is deprecated/removed
from course import table

for k1, k2 in combinations(table, 2):
    r_coeff, probability = pearsonr(table[k1], table[k2])
    # I'm *still* not interested in data under this threshold
    if r_coeff < 0.6:
        continue
    print("linear correlation between {:8} and {:8} is {:3}".format(
        k1, k2, r_coeff))
Coloring scatter-plot gives us one more dimension in a 2d plot: time!
We have to mark our ordered data-series with a tick-series. eg.
tick | 0 | 0 | 0 | 1 | 1 | .. | |
---|---|---|---|---|---|---|---|
cpu_usr | 10 | 23 | 55 | 33 | 10 | .. | |
byte_in | 2132 | 3212 | 3942 | 321 | 42 | .. |
Elements with the same tick are in the same bucket.
We will now group our series in 3 buckets of 8-hours. We'll use the itertools.chain
function which
flattens a list of iterators.
from itertools import chain  # magic function to flatten a list of iterators

# Dividing samples in 3 buckets of 8 hours each.
BUCKETS = ["morning", "afternoon", "night"]
samples = len(table['byte_in'])
# For every bucket start hour (0, 8, 16) repeat that hour bucket_size
# times, then flatten everything into one tick per sample
bucket_size = samples // len(BUCKETS)
starts = range(0, 24, 24 // len(BUCKETS))  # rescale all samples on 24 hours
ticks = list(chain.from_iterable([start] * bucket_size for start in starts))
print(ticks)
# Any doubts? Just check the ticks distribution ;)
print(Counter(ticks))
# Exercise: use json.dumps to prettify the distribution.
# Create a formatter associating a label to every color point:
# - we just take into account the tick value, ignoring other values
# - the tick is divided by 8 (the bucket_size) to implement the following mapping
# - [0, 8) -> morning
# - [8, 16) -> afternoon
# - [16, 24) -> night
# BUG FIX: FuncFormatter lives in matplotlib.ticker (pyplot has no
# FuncFormatter attribute)
from matplotlib.ticker import FuncFormatter

formatter = FuncFormatter(
    # ticks arrive as floats, and 24 is among those requested below:
    # cast to int and clamp the index inside BUCKETS
    lambda tick, *ignored_args: BUCKETS[min(int(tick) // 8, len(BUCKETS) - 1)]
)
plt.scatter(table['byte_in'], table['csw'], c=ticks)
plt.colorbar(ticks=range(0, 28, 8), format=formatter)
# arange comes from numpy: the scipy.arange alias was deprecated and removed
from numpy import arange

# the distinct integer parts of [0, 3) sampled every 1/samples -> {0, 1, 2}
set(map(int, arange(0, 3, 1/samples)))
# netfishing_correlation_plot():
# Consider using pandas.plotting.scatter_matrix (pandas.tools.plotting
# is the old, removed location)
from itertools import combinations
from scipy.stats import pearsonr  # scipy.stats.stats is deprecated/removed
from course import table
from matplotlib import pyplot as plt

for (k1, v1), (k2, v2) in combinations(table.items(), 2):
    # v1/v2 are exactly table[k1]/table[k2]: no second lookup needed
    r_coeff, probability = pearsonr(v1, v2)
    # skip weakly correlated pairs
    if r_coeff < 0.6:
        continue
    plt.scatter(v1, v2, c=ticks)
    plt.xlabel(k1); plt.ylabel(k2)
    plt.title("r={:0.3f} p={:0.3f}".format(r_coeff, probability))
    plt.colorbar(ticks=range(0, 28, 8), format=formatter)
    plt.show()
    # Uncomment the following lines to save the plot on disk
    # plt.savefig("colorbar_{}_{}.png".format(k1, k2))
    # plt.close()
from __future__ import print_function, unicode_literals
def ping_rtt(seconds=10):
"""
goal: slicing data
goal: using zip to transpose data
"""
import sys
from course import sh
cmd = "ping -c{seconds} www.google.it"
if 'win' in sys.platform:
cmd = "ping -n{seconds} www.google.it"
ping_output = sh(cmd.format(seconds=seconds))
# Filter out uninteresting lines
ping_output = [x.replace(b"=", b" ") for x in ping_output if b'from' in x]
if 'win' in sys.platform:
ping_output = [x.split()[6::2] for x in ping_output]
else:
ping_output = [x.split()[-4:-1:2] for x in ping_output]
ttl, rtt = zip(*ping_output)
return map(float, rtt)
print(*ping_rtt(3))
# Exercise II solution
# deliveder emails are like the following
#May 14 16:00:04 rpolli postfix/qmgr[122]: 4DC3DA: removed"
ret = !grep removed maillog # get the interesting lines
ts = ret.fields(2) # find the timestamp (3rd column)
hours = [ int(ts) for x in ts ]
time_d = {x: hours.count(x) for x in set(hours)}