from IPython.display import HTML
HTML('<iframe src=http://digitalcorpora.org/corpora/scenarios/nitroba-university-harassment-scenario width=600 height=300></iframe>')
cd pcap/
/home/dloss/Dropbox/genua-ipy-pandas/pcap
%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline]. For more information, type 'help(pylab)'.
import pandas as pd
# Export the frame number and frame length of the packets in the capture as a
# tab-separated table with a header row, redirected into the file "frame.len".
!tshark -n -r nitroba.pcap -T fields -Eheader=y -e frame.number -e frame.len > frame.len
# Load the exported table; the header row supplies the column names.
df=pd.read_table("frame.len")
df
<class 'pandas.core.frame.DataFrame'> Int64Index: 95175 entries, 0 to 95174 Data columns (total 2 columns): frame.number 95175 non-null values frame.len 95175 non-null values dtypes: int64(2)
df["frame.len"].describe()
count 95175.000000 mean 580.748789 std 625.757017 min 42.000000 25% 70.000000 50% 87.000000 75% 1466.000000 max 1466.000000 dtype: float64
# Draw one dot per frame (length against row position); the low alpha keeps
# densely populated regions readable.
df["frame.len"].plot(style=".", alpha=0.2)
# title/ylabel/xlabel are not imported explicitly; they come from the pylab
# namespace set up by "%pylab inline".
title("Frame length")
ylabel("bytes")
xlabel("frame number")
<matplotlib.text.Text at 0x33c9410>
def shark(pcap_file, fields=[], readfilter="", notnull=True):
fields = ["frame.time_epoch"] + fields
fieldspec = " ".join("-e %s" % f for f in fields)
readfilters = fields if notnull else []
if readfilter:
readfilters.append(readfilter)
readspec = "-R '%s'" % " and ".join(f for f in readfilters)
!tshark -r $pcap_file -n -T fields -Eheader=y $readspec $fieldspec > tmp.txt
df = pd.read_table("tmp.txt", index_col = "frame.time_epoch", parse_dates=True, date_parser=datetime.datetime.fromtimestamp)
return df
# Frame lengths again, this time loaded through shark() so that the rows are
# indexed by capture timestamp.
surflen=shark("nitroba.pcap", ["frame.len"])
surflen
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 95175 entries, 2008-07-22 03:51:07.095278 to 2008-07-22 08:13:47.046029 Data columns (total 1 columns): frame.len 95175 non-null values dtypes: int64(1)
surflen.plot()
<matplotlib.axes.AxesSubplot at 0x33d8410>
# Sum the frame lengths inside each one-second bin to get bytes per second.
# NOTE(review): seconds in which no frame was captured come out as NaN rather
# than 0 (see the head() output) — confirm that is what the plot should show.
bytes_per_second=surflen.resample("S", how="sum")
bytes_per_second.head()
frame.len | |
---|---|
frame.time_epoch | |
2008-07-22 03:51:07 | 20729 |
2008-07-22 03:51:08 | 8426 |
2008-07-22 03:51:09 | 13565 |
2008-07-22 03:51:10 | NaN |
2008-07-22 03:51:11 | NaN |
bytes_per_second.plot(title="bytes/s")
<matplotlib.axes.AxesSubplot at 0x34c8d90>
# TCP acknowledgement numbers over time. With shark()'s default notnull=True
# only frames that contain the requested field are kept, so this frame has
# fewer rows than the full capture.
tf=shark("nitroba.pcap", ["tcp.ack"])
tf
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 81451 entries, 2008-07-22 03:51:07.095278 to 2008-07-22 08:13:47.046029 Data columns (total 1 columns): tcp.ack 81451 non-null values dtypes: int64(1)
!head tmp.txt
frame.time_epoch tcp.ack 1216691467.095278000 1 1216691467.103728000 2 1216691467.114897000 1 1216691467.139448000 1352 1216691467.319680000 1352 1216691467.321990000 1215 1216691467.326517000 1215 1216691467.335554000 1353 1216691467.432370000 1
tf.plot()
<matplotlib.axes.AxesSubplot at 0x34da710>
# Lag plot of the acknowledgement numbers: each value is plotted against the
# value that follows it.
from pandas.tools.plotting import lag_plot
lag_plot(tf["tcp.ack"])
<matplotlib.axes.AxesSubplot at 0x4765810>
tf["tcp.ack"].plot()
<matplotlib.axes.AxesSubplot at 0x593fed0>
# SSL handshake lengths, restricted to the frames that carry this field.
ss=shark("nitroba.pcap", ["ssl.handshake.length"])
ss
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 316 entries, 2008-07-22 03:51:07.455854 to 2008-07-22 08:11:35.539888 Data columns (total 1 columns): ssl.handshake.length 316 non-null values dtypes: object(1)
# The column was loaded as strings (dtype object). Split each cell on commas
# and keep only the first piece, converted to an integer.
# NOTE(review): presumably a cell holds several comma-separated values when
# one frame carries more than one handshake message — confirm; all values
# after the first are discarded here.
hlen=ss["ssl.handshake.length"].map(lambda x: int(x.split(",")[0]))
hlen.plot()
<matplotlib.axes.AxesSubplot at 0x476bfd0>
hlen.head()
frame.time_epoch 2008-07-22 03:51:07.455854 70 2008-07-22 03:51:07.456488 1621 2008-07-22 03:51:07.534572 130 2008-07-22 03:51:07.672054 134 2008-07-22 03:51:07.684625 70 Name: ssl.handshake.length, dtype: int64
hlen.describe()
count 316.000000 mean 310.382911 std 739.960691 min 48.000000 25% 70.000000 50% 130.000000 75% 170.000000 max 4598.000000 dtype: float64
# Direct tshark invocation kept as a comment for reference; the shark() call
# below appears to have replaced it.
#!tshark -n -r nitroba.pcap -R "eth.trailer" -T fields -Eheader=y -e frame.number -e frame.time_epoch -e eth.src -e eth.trailer > eth.trailer
# Source MAC address and Ethernet trailer for the frames in which both fields
# are present.
trailer_df = shark("nitroba.pcap", ["eth.src", "eth.trailer"])
trailer_df
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 12851 entries, 2008-07-22 03:51:19.333114 to 2008-07-22 08:13:44.293692 Data columns (total 2 columns): eth.src 12851 non-null values eth.trailer 12851 non-null values dtypes: object(2)
trailer=trailer_df["eth.trailer"]
trailer
frame.time_epoch 2008-07-22 03:51:19.333114 00:00 2008-07-22 03:51:19.334990 00:00:2b:94:a2:40 2008-07-22 03:51:19.335478 00:00 2008-07-22 03:51:19.336488 00:00 2008-07-22 03:51:19.338205 00:00 2008-07-22 03:51:19.340181 00:00 2008-07-22 03:51:19.341599 00:00 2008-07-22 03:51:19.342240 00:00:f4:d3:80:99 2008-07-22 03:51:19.344359 00:00:3b:e4:c6:e3 2008-07-22 03:51:19.344561 00:00:66:2b:d5:e6 2008-07-22 03:51:19.344787 00:00:4e:de:d1:96 2008-07-22 03:51:19.346342 00:00:2a:f6:a1:37 2008-07-22 03:51:19.403621 00:00:00:00:00:00 2008-07-22 03:51:19.404754 00:00:00:00:00:00 2008-07-22 03:51:19.406769 00:00:e3:00:38:23 ... 2008-07-22 08:11:53.150588 00:00:00:00:00:00:00:00:00:00:00:00:00:00:01:f... 2008-07-22 08:11:53.199602 00:00:00:00:00:00:00:00:00:00:0f:5e:fe:0e 2008-07-22 08:11:53.417748 00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:0... 2008-07-22 08:11:53.423389 00:00:00:00:00:00:00:00:00:00:00:00:00:00:b2:0... 2008-07-22 08:11:53.823592 00:00:00:00:00:00:00:00:00:00:00:00:00:00:b2:0... 2008-07-22 08:11:54.225001 00:00:00:00:00:00:00:00:00:00:00:00:00:00:b2:0... 2008-07-22 08:11:54.267376 00:00:00:00:00:00:00:00:00:00:00:00:00:00:4a:f... 2008-07-22 08:11:54.629052 00:00:00:00:00:00:00:00:00:00:00:00:00:00:b2:0... 2008-07-22 08:11:58.175698 00:00:00:00:00:00:00:00:00:00:61:d6:ab:d0 2008-07-22 08:12:13.140025 00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:0... 2008-07-22 08:12:25.399427 00:00:00:00:00:00 2008-07-22 08:12:47.410748 00:00:00:00:00:00:00:00:00:00:00:00:00:00:d7:3... 2008-07-22 08:13:05.469430 00:00:00:00:00:00 2008-07-22 08:13:35.837797 00:00:00:00:00:00 2008-07-22 08:13:44.293692 00:00:00:00:00:00 Name: eth.trailer, Length: 12851, dtype: object
trailer.value_counts()
00:00:00:00:00:00 7989 3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02 913 00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00 606 3b:02:a7:19:00:1d:6b:99:98:6a:88:64:11:00:8f:da:00:42 303 00:00 299 00:00:c0:a8:01:40:00:00:00:00:00:00:00:00:00:1d:d9:2e 259 32:01:67:06:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02 254 2d:66:6f:6f:65:05:79:61:68:6f:6f:03:63:6f:6d:00:00:01 253 04:67:6b:64:63:03:75:61:73:03:61:6f:6c:03:63:6f:6d:00 160 70:03:6d:73:67:05:79:61:68:6f:6f:03:63:6f:6d:00:00:01 151 73:6b:03:6d:61:63:03:63:6f:6d:00:00:01:00:01:00:01:00 146 2d:66:6f:6f:62:05:79:61:68:6f:6f:03:63:6f:6d:00:00:01 101 73:6b:03:6d:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02 66 72:65:76:73:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02 54 00:00:00:00:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02 52 ... 00:00:fa:c9:54:5f 1 00:00:8a:4e:29:ef 1 00:00:d0:09:08:44 1 00:00:50:e4:b9:45 1 00:00:11:48:cf:23 1 00:00:c6:c7:97:13 1 00:00:c7:24:5f:f8 1 00:00:ee:eb:a4:71 1 00:00:ab:87:e7:22 1 00:00:b0:37:d5:71 1 00:00:20:00:cb:15 1 00:00:5a:81:53:d9 1 00:00:b5:81:ee:ed 1 00:00:74:a2:f2:6f 1 00:00:00:00:00:00:00:00:00:00:00:00:00:00:68:12:4f:c7 1 Length: 635, dtype: int64
import binascii
def unhex(s, sep=":"):
    """Decode a string of hex byte pairs joined by ``sep`` into raw bytes.

    ``s`` is the delimited hex text (e.g. "3b:02:a7"); ``sep`` is the
    delimiter between the pairs and defaults to ":".
    """
    hex_pairs = s.split(sep)
    # unhexlify wants one contiguous run of hex digits.
    hex_digits = "".join(hex_pairs)
    return binascii.unhexlify(hex_digits)
# Try unhex() on one of the most frequent trailer values from the counts above.
s=unhex("3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02")
s
';\x02\xa7\x19\xaa\xaa\x03\x00\x80\xc2\x00\x07\x00\x00\x00\x02;\x02'
# Decode every trailer from its colon-separated hex text into a raw string and
# store the result in a new "unhex" column.
trailer_df["unhex"]=trailer_df["eth.trailer"].map(unhex)
trailer_df
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 12851 entries, 2008-07-22 03:51:19.333114 to 2008-07-22 08:13:44.293692 Data columns (total 3 columns): eth.src 12851 non-null values eth.trailer 12851 non-null values unhex 12851 non-null values dtypes: object(3)
def printable(s):
    """Return ``s`` with every non-alphanumeric character replaced by ".".

    Alphanumeric characters are kept as they are, so the result always has
    the same length as ``s``.
    """
    return "".join(ch if ch.isalnum() else "." for ch in s)
printable("\x95asd\x33")
'.asd3'
# Mask the non-alphanumeric characters of every decoded trailer with ".",
# then count how often each masked rendering occurs.
trailer_df["printable"]=trailer_df["unhex"].map(printable)
trailer_df["printable"].value_counts()
...... 8145 .................. 1927 ......k..j.d.....B 303 .. 299 2.g............... 254 .fooe.yahoo.com... 253 .gkdc.uas.aol.com. 160 p.msg.yahoo.com... 151 sk.mac.com........ 148 .foob.yahoo.com... 101 sk.m.............. 66 revs.............. 54 ge.w.............. 45 1.1............... 44 .goo.............. 42 ... ..........Wz...... 1 ..M... 1 ...i.Z 1 ..x... 1 ..N... 1 ..n.oN 1 ....fK 1 ....fk 1 ..Y8.. 1 ..n.FA 1 ...O.r 1 ....Qn 1 ..PK.e 1 ...w.. 1 ..1... 1 Length: 375, dtype: int64
# Write the masked trailer strings to printable.csv, leaving out the
# timestamp index.
trailer_df["printable"].to_csv("printable.csv", index=False)
def ratio_printable(s):
    """Return the fraction of characters in ``s`` that are alphanumeric.

    The result is a float between 0.0 and 1.0. An empty ``s`` yields 0.0
    instead of raising ZeroDivisionError.
    """
    if not s:
        return 0.0
    # Count with float ones so the division below is a true division on
    # Python 2 as well.
    alnum_count = sum(1.0 for c in s if c.isalnum())
    return alnum_count / len(s)
ratio_printable("a\x93sdfs")
0.8333333333333334
# Compute the alphanumeric fraction of every decoded trailer, then keep only
# the rows where more than half of the characters are alphanumeric.
trailer_df["ratio_printable"] = trailer_df["unhex"].map(ratio_printable)
trailer_df[trailer_df["ratio_printable"] > 0.5]
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 727 entries, 2008-07-22 03:51:20.018817 to 2008-07-22 05:40:13.338449 Data columns (total 5 columns): eth.src 727 non-null values eth.trailer 727 non-null values unhex 727 non-null values printable 727 non-null values ratio_printable 727 non-null values dtypes: float64(1), object(4)
# "_" is IPython's name for the result of the previously executed cell, here
# the filtered frame; count its masked trailer strings.
# NOTE(review): this depends on cell execution order — re-running another
# cell in between changes what "_" refers to.
_.printable.value_counts()
.fooe.yahoo.com... 253 .gkdc.uas.aol.com. 160 p.msg.yahoo.com... 151 .foob.yahoo.com... 101 .weather.com...... 31 ge.weather.com.... 26 1.1..HOST.239.255. 1 ..CDWW 1 .foof.yahoo.com... 1 ..3rbo 1 ..BIKM 1 dtype: int64
# First 100 rows of the trailers that are more than half alphanumeric.
trailer_df[trailer_df["ratio_printable"] > 0.5].head(100)
<class 'pandas.core.frame.DataFrame'> DatetimeIndex: 100 entries, 2008-07-22 03:51:20.018817 to 2008-07-22 04:13:25.918602 Data columns (total 5 columns): eth.src 100 non-null values eth.trailer 100 non-null values unhex 100 non-null values printable 100 non-null values ratio_printable 100 non-null values dtypes: float64(1), object(4)