#!/usr/bin/env python
# coding: utf-8

# Table of Contents

#
# # msticpy - Event Timeline
#
# This notebook demonstrates the use of the timeline displays built using the
# [Bokeh library](https://bokeh.pydata.org).
#
# You must have msticpy installed:
# ```
# %pip install --upgrade msticpy
# ```
#
# There are two display types:
# - Discrete event series - this plots multiple series of events as discrete glyphs
# - Event value series - this plots a scalar value of the events using glyphs,
#   bars or traditional line graph (or some combination).

# In[1]:


# Imports
import sys
import warnings

from msticpy.common.utility import check_py_version

# The version check is deliberately run before the remaining imports,
# in the same order as the original cell.
MIN_REQ_PYTHON = (3, 6)
check_py_version(MIN_REQ_PYTHON)

import ipywidgets as widgets
import pandas as pd

# Widen the pandas display limits used when DataFrames are shown.
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 50)
pd.set_option("display.max_colwidth", 100)

from msticpy.vis.timeline import display_timeline

WIDGET_DEFAULTS = {
    "layout": widgets.Layout(width="95%"),
    "style": {"description_width": "initial"},
}


# # Discrete Event Timelines
#
# ## Plotting a simple timeline
#
# `display_timeline`
#
#

# In[2]:


# `infer_datetime_format` is intentionally not passed: pandas 2.0 deprecated it
# and made strict format inference the default for `parse_dates`.
processes_on_host = pd.read_csv(
    "data/processes_on_host.csv",
    parse_dates=["TimeGenerated"],
    index_col=0,
)

# At a minimum we need to pass a dataframe with timestamp column
# (defaults to TimeGenerated)
display_timeline(processes_on_host)


# The Bokeh graph is interactive and has the following features:
# - Tooltip display for each event marker as you hover over it
# - Toolbar with the following tools (most are toggles enabling or disabling the tool):
#    - Panning
#    - Select zoom
#    - Mouse wheel zoom
#    - Reset to default view
#    - Save image to PNG
#    - Hover tool
#
# Additionally an interactive timeline navigation bar is displayed below the main
# graph. You can change the timespan shown on the main graph by dragging or
# resizing the selected area on this navigation bar.
#
# **Note**:
# - the tooltips work on the Windows process data shown above because of a legacy
#   fallback built into the code.
# Usually you need to specify the `source_columns` parameter explicitly to have
# the hover tooltips populated correctly.

# ## More Advanced Timelines
# `display_timeline` also takes a number of optional parameters that give you more
# flexibility to show multiple data series and change the way the graph appears.
#
# The majority of these parameters are optional so don't be too overwhelmed by them.

# In[3]:


help(display_timeline)


# ### Grouping Series From a Single DataFrame
#
#

# In[4]:


# Split the events into one plotted series per Account value and show
# the legend to the left of the chart.
display_timeline(
    processes_on_host,
    group_by="Account",
    source_columns=["NewProcessName", "ParentProcessName"],
    legend="left",
);


# We can use the group_by parameter to specify a column on which to split
# individually plotted series.
#
# Specifying a legend, we can see the value of each series group. The legend is
# interactive - click on a series name to hide/show the data. The legend can be
# placed inside of the chart (`legend="inline"`) or to the left or right.
#
# Alternatively we can enable the yaxis - although this is not guaranteed to show
# all values of the groups.
#
# **Note**:
# - the tooltips work on the Windows process data shown above because of a legacy
#   fallback built into the code. Usually you need to specify the `source_columns`
#   parameter explicitly to have the hover tooltips populated correctly.
# - the trailing semicolon just stops Jupyter showing the return value from the
#   function. It isn't mandatory

# In[5]:


# Same grouping as the previous cell, but with the legend switched off and
# the y axis and horizontal grid lines enabled instead.
display_timeline(
    processes_on_host,
    group_by="Account",
    source_columns=["NewProcessName", "ParentProcessName"],
    legend="none",
    yaxis=True,
    ygrid=True,
);


# # Plotting directly from a DataFrame
#
# We've implemented the timeline plotting functions
# as pandas accessors so you can plot directly from the DataFrame
# using `mp_plot.timeline()`.
#
# All of the parameters used in the standalone function are available
# in the pandas accessor functions.
# In[6]:


# `infer_datetime_format` is intentionally not passed: pandas 2.0 deprecated it
# and made strict format inference the default for `parse_dates`.
host_logons = pd.read_csv(
    "data/host_logons.csv",
    parse_dates=["TimeGenerated"],
    index_col=0,
)

# First chart: one series per account.
host_logons.mp_plot.timeline(
    title="Logons by Account name",
    group_by="Account",
    source_columns=["Account", "TargetLogonId", "LogonType"],
    legend="left",
    height=200,
)

# Second chart: one series per logon type, without the range selector.
host_logons.mp_plot.timeline(
    title="Logons by logon type",
    group_by="LogonType",
    source_columns=["Account", "TargetLogonId", "LogonType"],
    legend="left",
    height=200,
    range_tool=False,
    ygrid=True,
);


# # Displaying Reference lines
#
# You can annotate your timeline with one or more reference markers.
# These can be supplied as timestamped events in a DataFrame or a list
# of datetime/label pairs.
#
# To use a DataFrame, pass this as the `ref_events` parameter:
#
# - You can specify the column to use as a label with the `ref_col` parameter
# - If the time_column is not the same name as the time column in the main
#   DataFrame, specify this as `ref_time_col`
#
# To use a list of times, use the `ref_times` parameter. This should be a list
# of tuples of
#
# - datetime
# - label (string)
#
# E.g. `ref_times=[(date1, "item1"), (date2, "item2")...]`
#
# You can use either `ref_events` or `ref_times` with a single row or list entry.

# In[7]:


# Use three randomly sampled process events as the reference markers.
alerts = processes_on_host.sample(3)
display_timeline(
    host_logons,
    title="Processes with marker",
    group_by="Account",
    source_columns=["Account", "TargetLogonId", "LogonType"],
    ref_events=alerts,
    ref_col="SubjectUserName",
    legend="left",
    ygrid=True,
);


# For a single reference point you can also use `alert`, `ref_event`
# or `ref_time` although these are now deprecated in
# favor of `ref_events` and `ref_times`.
#
# Use `ref_event` (note: this is different from `ref_events`)
# NOTE(review): the cell below passes the single event as `alert=`, not
# `ref_event=` - confirm which parameter this example is meant to demonstrate.

# In[8]:


# `.sample().iloc[0]` picks one random row and returns it as a single record.
fake_alert = processes_on_host.sample().iloc[0]
display_timeline(
    host_logons,
    title="Processes with marker",
    group_by="LogonType",
    source_columns=["Account", "TargetLogonId", "LogonType"],
    alert=fake_alert,
    legend="left",
);


# # Plotting series from different data sets
# When you want to plot data sets with different schema on the same plot it is
# difficult to put them in a single DataFrame.
# To do this we need to assemble the different data sets into a dictionary and
# pass that to the `display_timeline`
#
# The dictionary has this format:
#
# Key: str
#     Name of data set to be displayed in legend
#
# Value: dict, the value holds the settings for each data series:
#
#     data: pd.DataFrame
#         Data to plot
#     time_column: str, optional
#         Name of the timestamp column
#         (defaults to `time_column` function parameter)
#     source_columns: list[str], optional
#         List of source columns to use in tooltips
#         (defaults to `source_columns` function parameter)
#     color: str, optional
#         Color of datapoints for this data
#         (defaults to autogenerating colors)
#

# In[9]:


# One entry per data set; the key is the name shown in the legend.
procs_and_logons = {
    "Processes": {
        "data": processes_on_host,
        "source_columns": ["NewProcessName", "Account"],
    },
    "Logons": {
        "data": host_logons,
        "source_columns": ["Account", "TargetLogonId", "LogonType"],
    },
}

display_timeline(
    data=procs_and_logons,
    title="Logons and Processes",
    legend="left",
    yaxis=False,
);


# # Plotting Series with Scalar Values
# Often you may want to see a scalar value plotted with the series.
#
# The first example below uses the pandas `mp_plot.timeline_values()` accessor
# to plot network flow data using the total flows recorded between
# a pair of IP addresses.
#
# You can also import and use `display_timeline_values` from
# `msticpy.vis.timeline_values`.
# This is shown in later examples
#
# Note that the majority of parameters are the same as `display_timeline` but
# include a mandatory `value_column` parameter which indicates which value
# you want to plot on the y (vertical) axis.
# (this can also be specified as `y`)

# In[10]:


from msticpy.vis.timeline import display_timeline_values

# `infer_datetime_format` is intentionally not passed: pandas 2.0 deprecated it
# and made strict format inference the default for `parse_dates`.
az_net_flows_df = pd.read_csv(
    "data/az_net_flows.csv",
    parse_dates=["TimeGenerated", "FlowStartTime", "FlowEndTime"],
    index_col=0,
)

# Plot TotalAllowedFlows on the y axis, one series per L7 protocol.
az_net_flows_df.mp_plot.timeline_values(
    group_by="L7Protocol",
    source_columns=[
        "FlowType",
        "AllExtIPs",
        "L7Protocol",
        "FlowDirection",
        "TotalAllowedFlows",
    ],
    time_column="FlowStartTime",
    value_column="TotalAllowedFlows",
    legend="right",
    height=500,
);


# By default the plot uses vertical bars to show the values but you can use any
# combination of vbar, circle and line, using the `kind` parameter. You specify
# the plot types as a list of strings (all lowercase).
#
# **Notes**
# - including "circle" in the plot kinds makes it easier to see the hover value
# - the line plot can be a bit misleading since it will plot lines between
#   adjacent data points of the same series implying that there is a gradual
#   change in the value being plotted - even though there may be no data between
#   the times of these adjacent points. For this reason using vbar is often a
#   more accurate view.
# In[11]:


# Keep the returned plot object in `flow_plot` instead of discarding it.
flow_plot = display_timeline_values(
    data=az_net_flows_df,
    group_by="L7Protocol",
    source_columns=[
        "FlowType",
        "AllExtIPs",
        "L7Protocol",
        "FlowDirection",
        "TotalAllowedFlows",
    ],
    time_column="FlowStartTime",
    value_column="TotalAllowedFlows",
    legend="right",
    height=500,
    kind=["vbar", "circle"],
);


# In[12]:


# Same "http" subset plotted twice so the two `kind` settings can be compared.
display_timeline_values(
    data=az_net_flows_df[az_net_flows_df["L7Protocol"] == "http"],
    group_by="L7Protocol",
    title="Line plot can be misleading",
    source_columns=[
        "FlowType",
        "AllExtIPs",
        "L7Protocol",
        "FlowDirection",
        "TotalAllowedFlows",
    ],
    time_column="FlowStartTime",
    value_column="TotalAllowedFlows",
    legend="right",
    height=300,
    kind=["line", "circle"],
    range_tool=False,
)
display_timeline_values(
    data=az_net_flows_df[az_net_flows_df["L7Protocol"] == "http"],
    group_by="L7Protocol",
    title="Vbar and circle show zero gaps in data",
    source_columns=[
        "FlowType",
        "AllExtIPs",
        "L7Protocol",
        "FlowDirection",
        "TotalAllowedFlows",
    ],
    time_column="FlowStartTime",
    value_column="TotalAllowedFlows",
    legend="right",
    height=300,
    kind=["vbar", "circle"],
    range_tool=False,
);


# ## Documentation for display_timeline_values
#
#

# In[13]:


help(display_timeline_values)


# # Timeline Durations
#
# Sometimes it's useful to be able to group data and see the start and ending
# activity over a period. The timeline durations plot gives you
# that option. It creates bands for the start and ending duration of
# each group, as well as the locations of the individual events.
#
# Note, that unlike other timeline controls you *must* specify a
# `group_by` parameter. This defines the way that the data is grouped
# before calculating the start and end of the events within that group.
# `group_by` can be a single column or a list of columns.
#
# Durations are shown using boxes with individual events
# superimposed (as diamonds).
#

# In[14]:


from msticpy.vis.timeline_duration import display_timeline_duration

# Group logons by account; three randomly sampled logon rows act as
# reference markers labelled with their TargetUserName.
display_timeline_duration(
    host_logons,
    group_by="Account",
    ref_events=host_logons.sample(3),
    ref_col="TargetUserName",
);


# In[15]:


# `group_by` given as a list of columns, via the pandas accessor.
az_net_flows_df.mp_plot.timeline_duration(
    group_by=["SrcIP", "DestIP", "L7Protocol"]
)


# # Exporting Plots as PNGs
# To use bokeh.io image export functions you need selenium, phantomjs and pillow
# installed:
#
# `conda install -c bokeh selenium phantomjs pillow`
#
# or
#
# `pip install selenium pillow`
# `npm install -g phantomjs-prebuilt`
#
# For phantomjs see https://phantomjs.org/download.html.
#
# Once the prerequisites are installed you can create a plot and save the return
# value to a variable.
# Then export the plot using `export_png` function.
# ```python
# from bokeh.io import export_png
# from IPython.display import Image, Markdown, display
#
# # Create a plot
# flow_plot = display_timeline_values(
#     data=az_net_flows_df,
#     group_by="L7Protocol",
#     source_columns=[
#         "FlowType",
#         "AllExtIPs",
#         "L7Protocol",
#         "FlowDirection",
#         "TotalAllowedFlows",
#     ],
#     time_column="FlowStartTime",
#     y="TotalAllowedFlows",
#     legend="right",
#     height=500,
#     kind=["vbar", "circle"],
# );
#
# # Export
# file_name = "plot.png"
# export_png(flow_plot, filename=file_name)
#
# # Read it and show it
# display(Markdown(f"## Here is our saved plot: {file_name}"))
# Image(filename=file_name)
# ```