Analyse von Netzwerktraffic

In [1]:
from IPython.display import HTML
HTML('<iframe src=http://digitalcorpora.org/corpora/scenarios/nitroba-university-harassment-scenario width=600 height=300></iframe>')
Out[1]:
In [2]:
cd pcap/
/home/dloss/Dropbox/genua-ipy-pandas/pcap
In [3]:
%pylab inline
Welcome to pylab, a matplotlib-based Python environment [backend: module://IPython.zmq.pylab.backend_inline].
For more information, type 'help(pylab)'.
In [4]:
import pandas as pd

PCAP zu CSV mit tshark

In [5]:
# Export two fields per packet of nitroba.pcap (frame.number and frame.len) with a
# header row (-Eheader=y) and redirect the result into the file "frame.len",
# which the next cell loads with pd.read_table.
!tshark -n -r nitroba.pcap -T fields -Eheader=y -e frame.number -e frame.len > frame.len
In [6]:
df=pd.read_table("frame.len")
df
Out[6]:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 95175 entries, 0 to 95174
Data columns (total 2 columns):
frame.number    95175  non-null values
frame.len       95175  non-null values
dtypes: int64(2)
In [7]:
df["frame.len"].describe()
Out[7]:
count    95175.000000
mean       580.748789
std        625.757017
min         42.000000
25%         70.000000
50%         87.000000
75%       1466.000000
max       1466.000000
dtype: float64
In [8]:
# Scatter the length of every frame against its position in the capture and
# label the plot through the Axes object returned by pandas.
frame_len_ax = df["frame.len"].plot(style=".", alpha=0.2)
frame_len_ax.set_title("Frame length")
frame_len_ax.set_ylabel("bytes")
frame_len_ax.set_xlabel("frame number")
Out[8]:
<matplotlib.text.Text at 0x33c9410>

Komfort-Funktion zum Einlesen von PCAP-Dateien in einen pandas-DataFrame

In [9]:
def shark(pcap_file, fields=[], readfilter="", notnull=True):
    """Extract fields from a capture file with tshark and load them into a DataFrame.

    tshark is run on ``pcap_file`` and its output is written to ``tmp.txt`` in
    the current working directory (overwritten on every call); that file is
    then parsed with pandas.

    Parameters
    ----------
    pcap_file : str
        Path of the capture file handed to ``tshark -r``.
    fields : sequence of str
        tshark field names to export as columns. ``frame.time_epoch`` is always
        exported as well and becomes the index. The argument is never modified.
    readfilter : str
        Optional extra filter expression. It is wrapped in parentheses and
        ANDed with the field-presence conditions (if any).
    notnull : bool
        If true, every exported field name is added to the filter so that only
        packets containing all of the fields are returned.

    Returns
    -------
    pandas.DataFrame
        One row per packet, indexed by ``frame.time_epoch`` converted with
        ``datetime.datetime.fromtimestamp`` (i.e. naive local time).
    """
    # Imported locally so the function does not depend on names that only
    # happen to be in the namespace (e.g. through %pylab).
    import datetime
    import subprocess
    import warnings

    columns = ["frame.time_epoch"] + list(fields)

    # Separate list for the filter terms: `columns` must stay untouched.
    conditions = list(columns) if notnull else []
    if readfilter:
        # Parenthesise so that an "or" inside the user filter cannot bypass
        # the field-presence conditions.
        conditions.append("(%s)" % readfilter)

    # Pass an argument vector instead of a shell string: paths with spaces and
    # filters containing quotes then need no escaping.
    command = ["tshark", "-r", pcap_file, "-n", "-T", "fields", "-Eheader=y"]
    if conditions:
        # Only add -R when there is something to filter on.
        command += ["-R", " and ".join(conditions)]
    for column in columns:
        command += ["-e", column]

    # tmp.txt is kept as an on-disk artefact because other cells inspect it.
    with open("tmp.txt", "wb") as out:
        status = subprocess.call(command, stdout=out)
    if status != 0:
        # tshark may still have produced usable output, so warn instead of raising.
        warnings.warn("tshark exited with status %d; output may be incomplete" % status)

    df = pd.read_table("tmp.txt", index_col="frame.time_epoch")
    # Convert the epoch seconds explicitly; this keeps the local-time semantics
    # of datetime.fromtimestamp without relying on read_table's date_parser.
    df.index = pd.DatetimeIndex(
        [datetime.datetime.fromtimestamp(stamp) for stamp in df.index],
        name="frame.time_epoch",
    )
    return df

Durchsatz

In [10]:
surflen=shark("nitroba.pcap", ["frame.len"])
surflen
Out[10]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 95175 entries, 2008-07-22 03:51:07.095278 to 2008-07-22 08:13:47.046029
Data columns (total 1 columns):
frame.len    95175  non-null values
dtypes: int64(1)
In [11]:
surflen.plot()
Out[11]:
<matplotlib.axes.AxesSubplot at 0x33d8410>
In [12]:
# Add up the frame lengths within each one-second bin ("S") of the time index.
# The NaN rows in the output below are bins without a sum -- presumably seconds
# in which no frame was captured (to be confirmed).
bytes_per_second=surflen.resample("S", how="sum")
bytes_per_second.head()
Out[12]:
frame.len
frame.time_epoch
2008-07-22 03:51:07 20729
2008-07-22 03:51:08 8426
2008-07-22 03:51:09 13565
2008-07-22 03:51:10 NaN
2008-07-22 03:51:11 NaN
In [13]:
bytes_per_second.plot(title="bytes/s")
Out[13]:
<matplotlib.axes.AxesSubplot at 0x34c8d90>

TCP Ack

In [14]:
tf=shark("nitroba.pcap", ["tcp.ack"])
tf
Out[14]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 81451 entries, 2008-07-22 03:51:07.095278 to 2008-07-22 08:13:47.046029
Data columns (total 1 columns):
tcp.ack    81451  non-null values
dtypes: int64(1)
In [15]:
!head tmp.txt
frame.time_epoch	tcp.ack
1216691467.095278000	1
1216691467.103728000	2
1216691467.114897000	1
1216691467.139448000	1352
1216691467.319680000	1352
1216691467.321990000	1215
1216691467.326517000	1215
1216691467.335554000	1353
1216691467.432370000	1
In [16]:
tf.plot()
Out[16]:
<matplotlib.axes.AxesSubplot at 0x34da710>
In [17]:
from pandas.tools.plotting import lag_plot
lag_plot(tf["tcp.ack"])
Out[17]:
<matplotlib.axes.AxesSubplot at 0x4765810>
In [18]:
tf["tcp.ack"].plot()
Out[18]:
<matplotlib.axes.AxesSubplot at 0x593fed0>

SSL Handshake Length

In [19]:
ss=shark("nitroba.pcap", ["ssl.handshake.length"])
ss
Out[19]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 316 entries, 2008-07-22 03:51:07.455854 to 2008-07-22 08:11:35.539888
Data columns (total 1 columns):
ssl.handshake.length    316  non-null values
dtypes: object(1)
In [20]:
# The column is read as strings (dtype object above); keep only the part before
# the first comma of each value and convert it to an integer.
hlen = ss["ssl.handshake.length"].map(
    lambda lengths: int(lengths.partition(",")[0])
)
hlen.plot()
Out[20]:
<matplotlib.axes.AxesSubplot at 0x476bfd0>
In [21]:
hlen.head()
Out[21]:
frame.time_epoch
2008-07-22 03:51:07.455854      70
2008-07-22 03:51:07.456488    1621
2008-07-22 03:51:07.534572     130
2008-07-22 03:51:07.672054     134
2008-07-22 03:51:07.684625      70
Name: ssl.handshake.length, dtype: int64
In [22]:
hlen.describe()
Out[22]:
count     316.000000
mean      310.382911
std       739.960691
min        48.000000
25%        70.000000
50%       130.000000
75%       170.000000
max      4598.000000
dtype: float64

Ethernet Padding

In [23]:
#!tshark -n -r nitroba.pcap -R "eth.trailer" -T fields -Eheader=y -e frame.number -e frame.time_epoch -e eth.src -e eth.trailer > eth.trailer
In [24]:
trailer_df = shark("nitroba.pcap", ["eth.src", "eth.trailer"])
trailer_df
Out[24]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12851 entries, 2008-07-22 03:51:19.333114 to 2008-07-22 08:13:44.293692
Data columns (total 2 columns):
eth.src        12851  non-null values
eth.trailer    12851  non-null values
dtypes: object(2)
In [25]:
trailer=trailer_df["eth.trailer"]
trailer
Out[25]:
frame.time_epoch
2008-07-22 03:51:19.333114                00:00
2008-07-22 03:51:19.334990    00:00:2b:94:a2:40
2008-07-22 03:51:19.335478                00:00
2008-07-22 03:51:19.336488                00:00
2008-07-22 03:51:19.338205                00:00
2008-07-22 03:51:19.340181                00:00
2008-07-22 03:51:19.341599                00:00
2008-07-22 03:51:19.342240    00:00:f4:d3:80:99
2008-07-22 03:51:19.344359    00:00:3b:e4:c6:e3
2008-07-22 03:51:19.344561    00:00:66:2b:d5:e6
2008-07-22 03:51:19.344787    00:00:4e:de:d1:96
2008-07-22 03:51:19.346342    00:00:2a:f6:a1:37
2008-07-22 03:51:19.403621    00:00:00:00:00:00
2008-07-22 03:51:19.404754    00:00:00:00:00:00
2008-07-22 03:51:19.406769    00:00:e3:00:38:23
...
2008-07-22 08:11:53.150588    00:00:00:00:00:00:00:00:00:00:00:00:00:00:01:f...
2008-07-22 08:11:53.199602            00:00:00:00:00:00:00:00:00:00:0f:5e:fe:0e
2008-07-22 08:11:53.417748    00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:0...
2008-07-22 08:11:53.423389    00:00:00:00:00:00:00:00:00:00:00:00:00:00:b2:0...
2008-07-22 08:11:53.823592    00:00:00:00:00:00:00:00:00:00:00:00:00:00:b2:0...
2008-07-22 08:11:54.225001    00:00:00:00:00:00:00:00:00:00:00:00:00:00:b2:0...
2008-07-22 08:11:54.267376    00:00:00:00:00:00:00:00:00:00:00:00:00:00:4a:f...
2008-07-22 08:11:54.629052    00:00:00:00:00:00:00:00:00:00:00:00:00:00:b2:0...
2008-07-22 08:11:58.175698            00:00:00:00:00:00:00:00:00:00:61:d6:ab:d0
2008-07-22 08:12:13.140025    00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:0...
2008-07-22 08:12:25.399427                                    00:00:00:00:00:00
2008-07-22 08:12:47.410748    00:00:00:00:00:00:00:00:00:00:00:00:00:00:d7:3...
2008-07-22 08:13:05.469430                                    00:00:00:00:00:00
2008-07-22 08:13:35.837797                                    00:00:00:00:00:00
2008-07-22 08:13:44.293692                                    00:00:00:00:00:00
Name: eth.trailer, Length: 12851, dtype: object
In [26]:
trailer.value_counts()
Out[26]:
00:00:00:00:00:00                                        7989
3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02     913
00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00:00     606
3b:02:a7:19:00:1d:6b:99:98:6a:88:64:11:00:8f:da:00:42     303
00:00                                                     299
00:00:c0:a8:01:40:00:00:00:00:00:00:00:00:00:1d:d9:2e     259
32:01:67:06:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02     254
2d:66:6f:6f:65:05:79:61:68:6f:6f:03:63:6f:6d:00:00:01     253
04:67:6b:64:63:03:75:61:73:03:61:6f:6c:03:63:6f:6d:00     160
70:03:6d:73:67:05:79:61:68:6f:6f:03:63:6f:6d:00:00:01     151
73:6b:03:6d:61:63:03:63:6f:6d:00:00:01:00:01:00:01:00     146
2d:66:6f:6f:62:05:79:61:68:6f:6f:03:63:6f:6d:00:00:01     101
73:6b:03:6d:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02      66
72:65:76:73:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02      54
00:00:00:00:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02      52
...
00:00:fa:c9:54:5f                                        1
00:00:8a:4e:29:ef                                        1
00:00:d0:09:08:44                                        1
00:00:50:e4:b9:45                                        1
00:00:11:48:cf:23                                        1
00:00:c6:c7:97:13                                        1
00:00:c7:24:5f:f8                                        1
00:00:ee:eb:a4:71                                        1
00:00:ab:87:e7:22                                        1
00:00:b0:37:d5:71                                        1
00:00:20:00:cb:15                                        1
00:00:5a:81:53:d9                                        1
00:00:b5:81:ee:ed                                        1
00:00:74:a2:f2:6f                                        1
00:00:00:00:00:00:00:00:00:00:00:00:00:00:68:12:4f:c7    1
Length: 635, dtype: int64
In [27]:
import binascii

def unhex(s, sep=":"):
    """Decode a separator-delimited hex dump into its raw bytes.

    ``s`` is split on ``sep`` (default ``":"``), the pieces are concatenated
    and the resulting hex digits are converted to binary data.
    """
    hex_digits = "".join(s.split(sep))
    return binascii.a2b_hex(hex_digits)
In [28]:
s=unhex("3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02")
s
Out[28]:
';\x02\xa7\x19\xaa\xaa\x03\x00\x80\xc2\x00\x07\x00\x00\x00\x02;\x02'
In [29]:
trailer_df["unhex"]=trailer_df["eth.trailer"].map(unhex)
trailer_df
Out[29]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 12851 entries, 2008-07-22 03:51:19.333114 to 2008-07-22 08:13:44.293692
Data columns (total 3 columns):
eth.src        12851  non-null values
eth.trailer    12851  non-null values
unhex          12851  non-null values
dtypes: object(3)
In [30]:
def printable(s):
    """Return ``s`` with everything but ASCII letters and digits replaced by ".".

    Works on text strings as well as byte strings. The result is a text
    string of the same length as ``s``.
    """
    chars = []
    for c in s:
        # Iterating a byte string yields ints on Python 3 but one-character
        # strings on Python 2; normalise both to an integer code.
        code = c if isinstance(c, int) else ord(c)
        # Only ASCII counts: outside that range isalnum() depends on the
        # string type / locale and would accept e.g. accented letters.
        ch = chr(code) if code < 128 else ""
        chars.append(ch if ch.isalnum() else ".")
    return "".join(chars)
           
In [31]:
printable("\x95asd\x33")
Out[31]:
'.asd3'
In [32]:
trailer_df["printable"]=trailer_df["unhex"].map(printable)
In [33]:
trailer_df["printable"].value_counts()
Out[33]:
......                8145
..................    1927
......k..j.d.....B     303
..                     299
2.g...............     254
.fooe.yahoo.com...     253
.gkdc.uas.aol.com.     160
p.msg.yahoo.com...     151
sk.mac.com........     148
.foob.yahoo.com...     101
sk.m..............      66
revs..............      54
ge.w..............      45
1.1...............      44
.goo..............      42
...
..........Wz......    1
..M...                1
...i.Z                1
..x...                1
..N...                1
..n.oN                1
....fK                1
....fk                1
..Y8..                1
..n.FA                1
...O.r                1
....Qn                1
..PK.e                1
...w..                1
..1...                1
Length: 375, dtype: int64
In [34]:
trailer_df["printable"].to_csv("printable.csv", index=False)
In [35]:
def ratio_printable(s):
    """Return the fraction of ``s`` that consists of ASCII letters or digits.

    Works on text strings as well as byte strings. An empty input yields 0.0
    instead of raising ZeroDivisionError.
    """
    total = len(s)
    if total == 0:
        return 0.0
    alnum_count = 0
    for c in s:
        # Byte strings iterate as ints on Python 3 and as one-character
        # strings on Python 2; normalise both to an integer code.
        code = c if isinstance(c, int) else ord(c)
        # Restrict the test to ASCII so the result does not depend on the
        # string type or locale.
        if code < 128 and chr(code).isalnum():
            alnum_count += 1
    # float() keeps true division on Python 2 as well.
    return float(alnum_count) / total
         
In [36]:
ratio_printable("a\x93sdfs")
Out[36]:
0.8333333333333334
In [37]:
trailer_df["ratio_printable"] = trailer_df["unhex"].map(ratio_printable)
In [38]:
trailer_df[trailer_df["ratio_printable"] > 0.5]
Out[38]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 727 entries, 2008-07-22 03:51:20.018817 to 2008-07-22 05:40:13.338449
Data columns (total 5 columns):
eth.src            727  non-null values
eth.trailer        727  non-null values
unhex              727  non-null values
printable          727  non-null values
ratio_printable    727  non-null values
dtypes: float64(1), object(4)
In [39]:
# Select the mostly-alphanumeric trailers explicitly rather than through
# IPython's `_` (the previous cell's output), so the cell gives the same result
# regardless of which cell was executed last.
trailer_df[trailer_df["ratio_printable"] > 0.5]["printable"].value_counts()
Out[39]:
.fooe.yahoo.com...    253
.gkdc.uas.aol.com.    160
p.msg.yahoo.com...    151
.foob.yahoo.com...    101
.weather.com......     31
ge.weather.com....     26
1.1..HOST.239.255.      1
..CDWW                  1
.foof.yahoo.com...      1
..3rbo                  1
..BIKM                  1
dtype: int64
In [40]:
trailer_df[trailer_df["ratio_printable"] > 0.5].head(100)
Out[40]:
<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 100 entries, 2008-07-22 03:51:20.018817 to 2008-07-22 04:13:25.918602
Data columns (total 5 columns):
eth.src            100  non-null values
eth.trailer        100  non-null values
unhex              100  non-null values
printable          100  non-null values
ratio_printable    100  non-null values
dtypes: float64(1), object(4)
In [40]: