!mkdir -p pcap
cd pcap/

# Download the capture. Just use curl:
# !curl -o nitroba.pcap http://digitalcorpora.org/corp/nps/packets/2008-nitroba/nitroba.pcap
# Or use pure Python:
# import urllib
# urllib.urlretrieve("http://digitalcorpora.org/corp/nps/packets/2008-nitroba/nitroba.pcap", "nitroba.pcap")

ls -l nitroba.pcap

%pylab inline
import pandas as pd

# Quick first look: extract frame numbers and lengths with tshark,
# then load the tab-separated output into a DataFrame.
!tshark -n -r nitroba.pcap -T fields -Eheader=y -e frame.number -e frame.len > frame.len

df = pd.read_table("frame.len")
df

df["frame.len"].describe()

figsize(10, 6)
df["frame.len"].plot(style=".", alpha=0.2)
title("Frame length")
ylabel("bytes")
xlabel("frame number")

import subprocess
import datetime
import pandas as pd

def read_pcap(filename, fields=[], display_filter="",
              timeseries=False, strict=False):
    """Read PCAP file into a Pandas DataFrame object.
    Uses the tshark command-line tool from Wireshark.

    filename:       Name or full path of the PCAP file to read
    fields:         List of fields to include as columns
    display_filter: Additional filter to restrict frames
    strict:         Only include frames that contain all given fields
                    (default: False)
    timeseries:     Create a DatetimeIndex from frame.time_epoch
                    (default: False)

    Syntax for fields and display_filter is specified in Wireshark's
    Display Filter Reference: http://www.wireshark.org/docs/dfref/
    """
    if timeseries:
        fields = ["frame.time_epoch"] + fields
    fieldspec = " ".join("-e %s" % f for f in fields)

    display_filters = fields if strict else []
    if display_filter:
        display_filters.append(display_filter)
    filterspec = "-R '%s'" % " and ".join(f for f in display_filters)

    options = "-r %s -n -T fields -Eheader=y" % filename
    cmd = "tshark %s %s %s" % (options, filterspec, fieldspec)
    proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
    if timeseries:
        df = pd.read_table(proc.stdout,
                           index_col="frame.time_epoch",
                           parse_dates=True,
                           date_parser=datetime.datetime.fromtimestamp)
    else:
        df = pd.read_table(proc.stdout)
    return df

# Frame lengths as a time series, then resampled to bytes per second.
framelen = read_pcap("nitroba.pcap", ["frame.len"], timeseries=True)
framelen

bytes_per_second = framelen.resample("S", how="sum")
bytes_per_second.head()

bytes_per_second.plot(title="bytes/s")

# Which TCP stream carries the most bytes?
tf = read_pcap("nitroba.pcap", ["tcp.stream", "frame.len"], "tcp",
               timeseries=True, strict=True)
tf
tf.head()

per_stream = tf.groupby("tcp.stream")
per_stream

bytes_per_stream = per_stream.sum()
bytes_per_stream.head()

bytes_per_stream.plot()

bytes_per_stream.max()
bytes_per_stream.idxmax()
bytes_per_stream.ix[88]

# Ethernet trailers: padding added to short frames; some drivers fill it
# with leftover buffer contents, which can leak data from the sender.
trailer_df = read_pcap("nitroba.pcap", ["eth.src", "eth.trailer"], timeseries=True)
trailer_df

trailer = trailer_df["eth.trailer"]
trailer
trailer.value_counts()

import binascii

def unhex(s, sep=":"):
    """Decode a colon-separated hex string (as printed by tshark) to raw bytes."""
    return binascii.unhexlify("".join(s.split(sep)))

s = unhex("3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02")
s

# Keep only frames that actually have a trailer and decode it.
padding = trailer_df.dropna()
padding["unhex"] = padding["eth.trailer"].map(unhex)

def printable(s):
    """Replace non-alphanumeric characters with '.' for readability."""
    chars = []
    for c in s:
        if c.isalnum():
            chars.append(c)
        else:
            chars.append(".")
    return "".join(chars)

printable("\x95asd\x33")

padding["printable"] = padding["unhex"].map(printable)
padding["printable"].value_counts()

def ratio_printable(s):
    """Fraction of characters in s that are alphanumeric."""
    printable = sum(1.0 for c in s if c.isalnum())
    return printable / len(s)

ratio_printable("a\x93sdfs")

padding["ratio_printable"] = padding["unhex"].map(ratio_printable)

# Trailers that are mostly printable text, and the MAC addresses that sent them.
# (_ refers to the previous cell's output, i.e. the filtered DataFrame.)
padding[padding["ratio_printable"] > 0.5]
_.printable.value_counts()
padding[padding["ratio_printable"] > 0.5]['eth.src'].drop_duplicates()
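
# A portability note on read_pcap, offered as an assumption rather than a
# tested fix: the helper builds its filter with -R, which matched the tshark
# versions current when this was written. Newer Wireshark releases expect -Y
# for a single-pass display filter (and reserve -R for two-pass mode via -2),
# so if read_pcap errors out, the underlying extraction can be tried by hand:

# Same fields as the tf = read_pcap(...) call above, with -Y carrying only the
# "tcp" display filter (the strict per-field filtering is dropped here).
# Assumes a tshark new enough to support -Y.
!tshark -n -r nitroba.pcap -T fields -Eheader=y -Y "tcp" -e frame.time_epoch -e tcp.stream -e frame.len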
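
# A minimal, self-contained sketch of the resample/groupby steps above, using
# synthetic frames instead of the Nitroba capture. Column names mirror the
# tshark fields; timestamps and sizes are made up for illustration, and the
# resampling uses the newer pandas spelling .resample("1s").sum() rather than
# .resample("S", how="sum").
import numpy as np
import pandas as pd

times = pd.date_range("2008-07-21 18:00:00", periods=1000, freq="100ms")  # arbitrary start time
frames = pd.DataFrame({
    "tcp.stream": np.random.randint(0, 5, size=1000),    # fake stream ids
    "frame.len": np.random.randint(60, 1500, size=1000), # fake frame sizes
}, index=times)

bytes_per_second = frames["frame.len"].resample("1s").sum()          # traffic volume over time
bytes_per_stream = frames.groupby("tcp.stream")["frame.len"].sum()   # volume per TCP stream
print(bytes_per_second.head())
print(bytes_per_stream.idxmax())                                     # heaviest stream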
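
# The trailer-decoding helpers above assume Python 2, where bytes and str are
# the same type. A rough Python 3 equivalent, shown as a sketch: iterating
# bytes yields ints, and bytes([c]).isalnum() keeps the ASCII-only check that
# str.isalnum() performed on Python 2 byte strings. The sample trailer is the
# one used earlier in this notebook.
import binascii

def unhex3(s, sep=":"):
    # "3b:02:..." -> b';\x02...'
    return binascii.unhexlify("".join(s.split(sep)))

def printable3(b):
    return "".join(chr(c) if bytes([c]).isalnum() else "." for c in b)

def ratio_printable3(b):
    return sum(1.0 for c in b if bytes([c]).isalnum()) / len(b)

sample = unhex3("3b:02:a7:19:aa:aa:03:00:80:c2:00:07:00:00:00:02:3b:02")
print(printable3(sample))
print(ratio_printable3(sample))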