In [1]:
import pandas as pd
import numpy as np
%matplotlib inline
import seaborn as sb
In [2]:
df = pd.read_csv("ip_date_sample.csv", delimiter="|")
df.head()
Out[2]:
ip timestamp
0 xyz.321 1519811397
1 xyz.321 1519811396
2 xyz.213 1519811394
3 xyz.123 1519811392
4 xyz.321 1519811352
In [3]:
df.groupby(['ip']).size()
Out[3]:
ip
xyz.123    27
xyz.213    10
xyz.312    24
xyz.321    29
dtype: int64
In [4]:
# group by ip and timestamp and get a count of the grouping
df.groupby(['ip', 'timestamp']).size()
Out[4]:
ip       timestamp 
xyz.123  1519809013    4
         1519809033    3
         1519809110    2
         1519809174    2
         1519809209    5
         1519810847    2
         1519810867    2
         1519811352    1
         1519811392    2
         1519811394    2
         1519811397    2
xyz.213  1519809033    1
         1519809112    1
         1519810847    1
         1519810979    1
         1519811352    1
         1519811392    1
         1519811394    2
         1519811396    2
xyz.312  1519809013    1
         1519809110    3
         1519809112    1
         1519809174    3
         1519809209    1
         1519810867    1
         1519810925    4
         1519810979    2
         1519811352    1
         1519811392    1
         1519811394    1
         1519811396    3
         1519811397    2
xyz.321  1519809013    1
         1519809033    2
         1519809110    1
         1519809112    4
         1519809174    1
         1519810847    3
         1519810867    3
         1519810925    2
         1519810979    3
         1519811352    3
         1519811392    2
         1519811394    1
         1519811396    1
         1519811397    2
dtype: int64
In [5]:
# same as above, but with labeling info on the count column
counts_df = df.groupby(['ip', 'timestamp']).size().reset_index(name='count')
counts_df
Out[5]:
ip timestamp count
0 xyz.123 1519809013 4
1 xyz.123 1519809033 3
2 xyz.123 1519809110 2
3 xyz.123 1519809174 2
4 xyz.123 1519809209 5
5 xyz.123 1519810847 2
6 xyz.123 1519810867 2
7 xyz.123 1519811352 1
8 xyz.123 1519811392 2
9 xyz.123 1519811394 2
10 xyz.123 1519811397 2
11 xyz.213 1519809033 1
12 xyz.213 1519809112 1
13 xyz.213 1519810847 1
14 xyz.213 1519810979 1
15 xyz.213 1519811352 1
16 xyz.213 1519811392 1
17 xyz.213 1519811394 2
18 xyz.213 1519811396 2
19 xyz.312 1519809013 1
20 xyz.312 1519809110 3
21 xyz.312 1519809112 1
22 xyz.312 1519809174 3
23 xyz.312 1519809209 1
24 xyz.312 1519810867 1
25 xyz.312 1519810925 4
26 xyz.312 1519810979 2
27 xyz.312 1519811352 1
28 xyz.312 1519811392 1
29 xyz.312 1519811394 1
30 xyz.312 1519811396 3
31 xyz.312 1519811397 2
32 xyz.321 1519809013 1
33 xyz.321 1519809033 2
34 xyz.321 1519809110 1
35 xyz.321 1519809112 4
36 xyz.321 1519809174 1
37 xyz.321 1519810847 3
38 xyz.321 1519810867 3
39 xyz.321 1519810925 2
40 xyz.321 1519810979 3
41 xyz.321 1519811352 3
42 xyz.321 1519811392 2
43 xyz.321 1519811394 1
44 xyz.321 1519811396 1
45 xyz.321 1519811397 2
In [6]:
sb.set()
sb.lmplot(x="timestamp", y="count", data=counts_df, fit_reg=False, size = 7)
Out[6]:
<seaborn.axisgrid.FacetGrid at 0x7f312c2d4a58>
In [7]:
sb.tsplot(counts_df, time="timestamp", unit="ip", condition="ip", value="count")
/usr/local/anaconda3/lib/python3.6/site-packages/seaborn/timeseries.py:183: UserWarning: The tsplot function is deprecated and will be removed or replaced (in a substantially altered version) in a future release.
  warnings.warn(msg, UserWarning)
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x7f312c2d4128>
In [27]:
# I HATE using the matplotlib based plotting, HATE IT. Here's the same plot using VegaLite

from vega3 import VegaLite

VegaLite({
  "width": 600,
  "height": 400,
  "$schema": "https://vega.github.io/schema/vega-lite/v2.json",
  "description": "IP Address Connections per Sec",
  "title": "IP Address Connections per Sec",
  "mark": "line",
  "encoding": {
    "x": {"field": "timestamp", "type": "ordinal"},
    "y": {"field": "count", "type": "quantitative"},
    "color": {"field": "ip", "type": "nominal"}
  }
}, counts_df)