#!/usr/bin/env python
# coding: utf-8

# ![border.png](attachment:070aeced-e5f5-4534-85e5-688b0031c657.png)

# # MSTICpy at Blackhat Arsenal 2020

# ## Setup
#
# MSTICpy includes a feature called [nbinit](https://msticpy.readthedocs.io/en/latest/msticpy.nbtools.html?highlight=nbinit#module-msticpy.init.nbinit) that handles installing and importing modules into a notebook environment. It was developed to keep the first cell of a notebook clear and compact, rather than presenting users with a very large block of setup code at the top of the notebook.
# By passing the notebook namespace to `init_notebook()`, this function handles the job of installing and importing core MSTICpy packages, along with any others a notebook might need.
#
# You must have msticpy installed to run this notebook:
# ```
# %pip install --upgrade msticpy[timeseries,splunk,azsentinel]
# ```
# MSTICpy versions > 0.8.5
#
# The notebook also uses MSTIC Notebooklets:
# ```
# %pip install --upgrade msticnb
# ```

# In[1]:

from msticpy.nbtools import nbinit

# Each entry is a "module, attribute" pair - the attribute is imported from the module.
extra_imports = [
    "msticpy.vis.timeseries, display_timeseries_anomolies",
    "msticpy.analysis.timeseries, timeseries_anomalies_stl",
    "datetime, datetime",
    "msticpy.vis.nbdisplay, draw_alert_entity_graph",
    "msticpy.context.ip_utils, convert_to_ip_entities",
    "msticpy.vis.ti_browser, browse_results",
    "IPython.display, Image",
    "msticpy.context.ip_utils, get_whois_info",
    "msticpy.context.ip_utils, get_ip_type",
]

nbinit.init_notebook(
    namespace=globals(),
    additional_packages=["pyvis"],
    extra_imports=extra_imports,
)

from msticpy.context import TILookup

ti = TILookup()

# ## Data Acquisition - Splunk
# The starting point for many notebooks is ingesting the data to be analyzed or investigated. MSTICpy has a number of [query providers](https://msticpy.readthedocs.io/en/latest/data_acquisition/DataProviders.html) that allow users to query and return data from a range of sources. Below we use the Splunk query provider to return data from our Splunk instance.
# > Note: uses the *Splunk* API via the *splunk-sdk* Python package - the MSTICpy Splunk provider is in beta.
#
# Data is returned in a Pandas [DataFrame](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html) for easy manipulation and to provide a common interface to the other features in MSTICpy.
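# Because every provider returns a plain DataFrame, standard pandas idioms work on any
# result, whatever the source. A minimal sketch (the frame below is an illustrative
# stand-in for a query result, not real data):

# In[ ]:

import pandas as pd

demo_df = pd.DataFrame(
    {
        "TimeGenerated": pd.date_range("2020-08-01", periods=3, freq="H"),
        "TotalBytesSent": [1024, 4096, 512],
    }
)
demo_df.describe()                          # summary statistics for a numeric column
demo_df[demo_df["TotalBytesSent"] > 1000]   # boolean-mask filtering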

# Here we get a summary of our network traffic over the time period we are interested in.

# In[ ]:

splunk_host = widgets.Text(description='Splunk Host:')
splunk_user = widgets.Text(description='Splunk User:')
splunk_pwd = widgets.Password(description='Splunk Pwd:')
display(splunk_host)
display(splunk_user)
display(splunk_pwd)

# In[7]:

# Initialize a Splunk provider and connect to our Splunk instance.
splunk_prov = QueryProvider("Splunk")
splunk_prov.connect(host=splunk_host.value, username=splunk_user.value, password=splunk_pwd.value)

# In[8]:

# Define a Splunk query and run it.
splunk_query = "search host=network_sum index=blackhat earliest=0 | table TimeGenerated, TotalBytesSent"
stldemo = splunk_prov.exec_query(splunk_query)
stldemo['TimeGenerated'] = pd.to_datetime(stldemo['TimeGenerated'])
stldemo.set_index('TimeGenerated', inplace=True)
stldemo.sort_index(inplace=True)
stldemo.head()

# ## Timeseries Analysis of detected Anomalies
# Once we have queried the data, there are a number of analysis features in MSTICpy to help understand it and identify potential security incidents.
#
# To hunt effectively in a dataset, analysts need to focus on specific events of interest. Below we use MSTICpy's [time series analysis](https://msticpy.readthedocs.io/en/latest/msticpy.analysis.html?highlight=timeseries#module-msticpy.analysis.timeseries) machine learning capabilities to identify anomalies in our network traffic for further investigation.
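# Under the hood this uses STL (Seasonal-Trend decomposition using Loess), which splits
# the series into seasonal, trend, and residual components; points whose residual
# deviates strongly from the norm are flagged as anomalies. A minimal sketch of the
# underlying decomposition using *statsmodels* directly (msticpy wraps this for us -
# `period=24` is an assumption of hourly data with daily seasonality):

# In[ ]:

from statsmodels.tsa.seasonal import STL

stl_result = STL(stldemo["TotalBytesSent"], period=24).fit()
residual = stl_result.resid  # large deviations here are anomaly candidates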
# As well as computing anomalies, we visualize the data so that we can more easily see where the anomalies occur.
#
# > Note: Visualization powered by *Bokeh*

# In[9]:

# Conduct our timeseries analysis
output = timeseries_anomalies_stl(stldemo)
# Visualize the timeseries and any anomalies
display_timeseries_anomolies(data=output, y='TotalBytesSent')

# In[10]:

# Identify when the anomalies occur so that we can use this time range to scope the next stage of our investigation.
start = output[output['anomalies']==1]['TimeGenerated'].min()
end = output[output['anomalies']==1]['TimeGenerated'].max() + pd.to_timedelta(1, unit='h')
# md and md_warn are MSTICpy features that provide simple, clean output in notebook cells
md(f"Anomalous session start time: {start} - end time: {end}")

# ## Enrich and Pivot on IP Addresses
# With the time series analysis identifying several events of interest, we need additional context to conduct an effective security investigation. MSTICpy has a range of features to enrich key data types and provide that context, depending on the entities involved.
# To get these entities we again query Splunk for the IP addresses associated with the anomalous traffic.

# In[11]:

splunk_query = "search host=network_raw index=blackhat earliest=0 | table TimeGenerated, Action, SourceIP, DestinationIP, TotalBytesSent"
net_data = splunk_prov.exec_query(splunk_query)

# We need to identify which network endpoints are associated with the anomalies
net_data['TotalBytesSent'] = net_data['TotalBytesSent'].astype(int)
grouped_df = net_data.groupby(['SourceIP', 'DestinationIP'])
noisy_hosts = grouped_df['TotalBytesSent'].agg(np.sum).sort_values(ascending=False)
md("Top talkers during anomalous session: ", 'bold')
display(noisy_hosts[:5])
source_ip, dest_ip = noisy_hosts.index[0][0:2]

# ### Network data enrichment
# MSTICpy can help analysts investigate an IP address, using open source information such as [passive DNS](https://msticpy.readthedocs.io/en/latest/data_acquisition/TIProviders.html?highlight=passive%20dns#use-to-do-a-passive-dns-lookup) data, [IP geolocation](https://msticpy.readthedocs.io/en/latest/data_acquisition/GeoIPLookups.html) and [threat intelligence](https://msticpy.readthedocs.io/en/latest/data_acquisition/TIProviders.html) feeds to provide valuable context.
#
# > Note: the Whois module uses *ipwhois*; the TI module uses services from *OTX*, *VirusTotal*, *XForce*, *AzureSentinel*, and *OpenPageRank*

# In[12]:

# Get and display WhoIs data
md(f"Target IP: {dest_ip}", 'bold')
md(f"{dest_ip} is a {get_ip_type(dest_ip)} IP address")
whois_info = get_whois_info(dest_ip)
md("Whois Registrar Info:", styles=["bold"])
md(f"ASN Owner: {whois_info[0]}")
md(f"ASN Address: {whois_info[1]['nets'][0]['address']}")

# Get Passive DNS results
result = ti.lookup_ioc(observable=dest_ip, ioc_type="ipv4", ioc_query_type="passivedns", providers=["XForce"])
md(f"Passive DNS records for {dest_ip}:", styles=["bold"])
for res in ti.result_to_df(result)['RawResult'][0]['Passive']['records']:
    print(res['value'], " - ", res['last'])

# Look up the IP address in threat intel feeds
resp = ti.lookup_ioc(observable=dest_ip)
md(f"Threat Intel results for {dest_ip}:", styles=["bold"])
ti.result_to_df(resp)

# As well as returning the raw data from these enrichment sources, MSTICpy has features to visualize that data and make it more accessible. Below we get the [IP geolocation](https://msticpy.readthedocs.io/en/latest/data_acquisition/GeoIPLookups.html) and use the [Folium Map](https://msticpy.readthedocs.io/en/latest/visualization/FoliumMap.html) feature to plot the IP address location on an interactive map.
# > Note: uses the Python *Folium* package, which is a wrapper around *Leafletjs*

# In[2]:

# Plot IP geolocation on a map
folium_map = FoliumMap(zoom_start=4)
md("Location of remote IP", styles=["bold"])
folium_map.add_ip_cluster(ip_entities=convert_to_ip_entities(dest_ip), color="red")
folium_map.center_map()
folium_map

# ## Host Lookup
# Once we have some context on our remote IP address, we can pivot our investigation to look at the local host that has been communicating with it.

# ### Data Acquisition from Azure Sentinel
# As well as *Splunk*, MSTICpy has a query provider for *Azure Sentinel*. For the next phase of our investigation we use this query provider to acquire data.

# With the Splunk connection, we provided connection details directly to the query provider when calling `.connect()`. We can also store details in a [msticpy configuration file](https://msticpy.readthedocs.io/en/latest/getting_started/msticpyconfig.html) (`msticpyconfig.yaml`) and pass them to the query provider programmatically. Here we use the [Workspace Config](https://msticpy.readthedocs.io/en/latest/msticpy.common.html?highlight=WorkspaceConfig#msticpy.common.wsconfig.WorkspaceConfig) feature to read this configuration and retrieve the items we need to authenticate to Azure Sentinel.
#
# > **Note**: the authentication flow for Azure Sentinel is different from Splunk and uses the OAuth 2.0 device code flow.

# In[15]:

# Initialize and connect to Azure Sentinel using details from our config file.
qry_prov = QueryProvider('LogAnalytics')
wkspace = WorkspaceConfig()
qry_prov.connect(wkspace.code_connect_str)
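# `WorkspaceConfig` reads the workspace details from `msticpyconfig.yaml`. For
# illustration, the relevant section of that file looks something like this
# (the IDs below are placeholders, not real values):
# ```yaml
# AzureSentinel:
#   Workspaces:
#     Default:
#       WorkspaceId: "11111111-1111-1111-1111-111111111111"
#       TenantId: "22222222-2222-2222-2222-222222222222"
# ```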
# Once connected, we can query Azure Sentinel in a similar way to Splunk - by providing a text query string. We substitute the `source_ip` value we obtained in the previous section, *Enrich and Pivot on IP Addresses*.

# In[16]:

# Query Azure Sentinel to get host details.
query = f"Heartbeat | where ComputerIP == '{source_ip}'"
host = qry_prov.exec_query(query)
host_name = host['Computer'].iloc[0]
md(f"Host to investigate: {host_name}")

# ## Investigate Host
# Now that we have identified our host, we want to perform some standard analysis to get a summary of it. Rather than coding these steps individually every time we create a notebook that investigates hosts, we have grouped several MSTICpy features and investigation steps into a single function we call a notebooklet. Calling this notebooklet lets us run analysis that would take hundreds of lines of code if written directly in the notebook.

# ### Use a notebooklet

# In[18]:

# Initialize our notebooklets
import msticnb as nb
from msticnb.common import TimeSpan

nb.init()
tspan = TimeSpan(start=start, end=end)
# Select our notebooklet
nblet = nb.nblts.azsent.host.HostSummary()
# Run our notebooklet
out = nblet.run(value=host_name, timespan=tspan)
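# The notebooklet returns a result object whose attributes hold the individual analysis
# outputs. A hedged sketch - the attribute names below come from the HostSummary
# documentation and may differ between msticnb versions, hence the hasattr guards:

# In[ ]:

if hasattr(out, "host_entity"):
    display(out.host_entity)      # the host entity assembled by the notebooklet
if hasattr(out, "related_alerts"):
    display(out.related_alerts)   # alerts related to the host in the time span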
# ### Azure Data
# As well as query providers to get data from SIEM solutions such as Azure Sentinel, MSTICpy can also acquire data from other sources, such as the Azure APIs. Below we use [these features](https://msticpy.readthedocs.io/en/latest/data_acquisition/AzureData.html) to collect information and metrics on our host. We then use the MSTICpy interactive [timeline visualization](https://msticpy.readthedocs.io/en/latest/visualization/EventTimeline.html) to display this data.

# In[19]:

from msticpy.data.azure_data import AzureData

# Initialize and connect to Azure
az = AzureData()
az.connect()
# Get details on our subscription and virtual machines
sub_id = az.get_subscriptions().iloc[0]['Subscription ID']
resources = az.get_resources(sub_id)
display(resources[resources['name'] == "BlackHatDemoHost"])
res_id = resources[resources['name'] == "BlackHatDemoHost"].iloc[0]['resource_id']

# In[20]:

# Get details on our target resource
az.get_resource_details(resource_id=res_id, sub_id=sub_id)

# In[23]:

# Get metrics from the Azure virtual machine.
mets = az.get_metrics(
    metrics="Percentage CPU,Disk Read Bytes,Disk Write Bytes",
    resource_id=res_id,
    sub_id=sub_id,
    sample_time="hour",
    start_time=10,
)
disk_read_data = mets['Disk Read Bytes']
disk_read_data['Type'] = 'Disk Read'
disk_write_data = mets['Disk Write Bytes']
disk_write_data['Type'] = "Disk Write"
disk_data = pd.concat([disk_read_data, disk_write_data])

# Visualize those metrics
nbdisplay.display_timeline_values(
    data=mets['Percentage CPU'],
    title="Host CPU Usage",
    time_column='Time',
    y='Data',
    height=400,
    source_columns=['Time', 'Data'],
    kind='line',
    range_tool=False,
)
nbdisplay.display_timeline_values(
    data=disk_data,
    title="Host Disk Usage",
    time_column='Time',
    y='Data',
    height=400,
    source_columns=['Time', 'Data'],
    kind='line',
    group_by='Type',
    range_tool=False,
)

# ### Host Alerts
# One thing we want to investigate in more detail is any security alerts associated with the host. Security alerts contain complex, detailed data that is hard to read in a regular Pandas DataFrame. To make it easier, MSTICpy provides an interactive widget that lets you pick alerts from a list and see the details in an expanded output format.
#
# > Note: Previously, when getting data from a query provider, we supplied the query as a string. The MSTICpy data providers can also create and store parameterized queries (in YAML files), and MSTICpy comes with a set of [pre-built queries](https://msticpy.readthedocs.io/en/latest/data_acquisition/DataQueries.html) for many common scenarios. Below we use one of these to get a list of alerts related to the host we are investigating.

# In[24]:

related_alerts = qry_prov.SecurityAlert.list_related_alerts(start=start, end=end, host_name=host_name)
display(related_alerts)

# In[25]:

related_alerts['CompromisedEntity'] = related_alerts['Computer']

def disp_full_alert(alert):
    global related_alert
    related_alert = SecurityAlert(alert)
    return nbdisplay.format_alert(related_alert, show_entities=True)

rel_alert_select = nbwidgets.SelectAlert(alerts=related_alerts, action=disp_full_alert)
rel_alert_select.display()

# ### Draw graph of alerts
# Graphs provide a great way to understand the relationships between items. Because alerts are often associated with multiple entities, being able to view a graph of alerts and their entities helps analysts identify important connections. MSTICpy provides a [feature](https://msticpy.readthedocs.io/en/latest/msticpy.data.html?highlight=create_alert_graph#msticpy.nbtools.security_alert_graph.create_alert_graph) for building and plotting alert graphs.
# In[26]:

alert = SecurityAlert(rel_alert_select.selected_alert)
grph = create_alert_graph(alert)
full_grph = add_related_alerts(related_alerts, grph)
draw_alert_entity_graph(full_grph, width=15)

# ### Logon Sessions
# Logon events are key to understanding any host-based activity. We previously used MSTICpy's [timeline feature](https://msticpy.readthedocs.io/en/latest/visualization/EventTimeline.html) to display value-based data, such as our Azure virtual machine metrics, as well as discrete data, such as alerts. We can also use it to display multiple types of discrete data on the same timeline. This is particularly useful for Windows logon events, where we can plot different logon types (interactive, network, etc.) as separate horizontal series.

# In[27]:

# Acquire data using a built-in query
host_logons = qry_prov.WindowsSecurity.list_host_logons(start=start, end=end, host_name=host_name)

# Display timeline
tooltip_cols = ["TimeGenerated", "Account", "LogonType"]
nbdisplay.display_timeline(
    data=host_logons,
    alert=rel_alert_select.selected_alert,
    title="Host Logons",
    source_columns=tooltip_cols,
    group_by="LogonType",
    height=200,
)

# When presented with a large number of events, such as we have here, it's useful to cluster them into a more manageable number of groups. MSTICpy contains [clustering features](https://msticpy.readthedocs.io/en/latest/msticpy.sectools.html?highlight=cluster_events#msticpy.sectools.eventcluster.dbcluster_events) that can be used against a number of data types. Once clustering is complete, we use another [widget](https://msticpy.readthedocs.io/en/latest/msticpy.nbtools.html?highlight=SelectItem#msticpy.nbwidgets.SelectItem) to let the user select the cluster to focus on.

# In[28]:

from msticpy.sectools.eventcluster import dbcluster_events, add_process_features, _string_score

logon_features = host_logons.copy()
logon_features["AccountNum"] = host_logons.apply(lambda x: _string_score(x.Account), axis=1)
logon_features["TargetUserNum"] = host_logons.apply(lambda x: _string_score(x.TargetUserName), axis=1)
logon_features["LogonHour"] = host_logons.apply(lambda x: x.TimeGenerated.hour, axis=1)

# run clustering
(clus_logons, _, _) = dbcluster_events(
    data=logon_features,
    time_column="TimeGenerated",
    cluster_columns=["AccountNum", "LogonType", "TargetUserNum"],
    max_cluster_distance=0.0001,
)

dist_logons = clus_logons.sort_values("TimeGenerated")[
    ["TargetUserName", "TimeGenerated", "LastEventTime", "LogonType", "ClusterSize"]
]
dist_logons = dist_logons.apply(
    lambda x: (
        f"{x.TargetUserName}: "
        f"(logontype {x.LogonType}) "
        f"timerange: {x.TimeGenerated} - {x.LastEventTime} "
        f"count: {x.ClusterSize}"
    ),
    axis=1,
)
dist_logons = {v: k for k, v in dist_logons.to_dict().items()}

def show_logon(idx):
    return nbdisplay.format_logon(pd.DataFrame(clus_logons.loc[idx]).T)

logon_wgt = nbwidgets.SelectItem(
    description="Select logon cluster to examine",
    item_dict=dist_logons,
    action=show_logon,
    height="200px",
    width="100%",
    auto_display=True,
)

# In[29]:

# We can reset our timeframe based on the selected cluster.
start = clus_logons.loc[logon_wgt.value]['FirstEventTime']
end = clus_logons.loc[logon_wgt.value]['LastEventTime']

# ### Process Tree
# When investigating a host, it is valuable to see the processes executed on it and the relationships between them. We can use MSTICpy's *ProcessTree* functionality to build and visualize [process trees](https://msticpy.readthedocs.io/en/latest/visualization/ProcessTree.html) from both Linux and Windows hosts.
#
# > Due to the volume of data potentially involved when looking at process events, it's important to focus on a specific time frame. We use the MSTICpy [widget for selecting a time range](https://msticpy.readthedocs.io/en/latest/visualization/NotebookWidgets.html?highlight=QueryTime#querytime).

# In[30]:

timescope = nbwidgets.QueryTime(
    units="hours",
    origin_time=start,
    max_before=12,
    max_after=24,
    before=0,
    after=3,
    auto_display=True,
)

# In[31]:

proc_data = qry_prov.WindowsSecurity.list_host_processes(
    start=timescope.start, end=timescope.end, host_name=host_name
)
p_tree = ptree.build_process_tree(proc_data, show_progress=True)
root_proc_sel = nbwidgets.SelectItem(
    description="Select root process to investigate process tree",
    item_list=ptree.get_roots(p_tree)['NewProcessName'].to_list(),
    height="200px",
    width="100%",
    auto_display=True,
)

# In[33]:

# Build tree from selected root
proc_tree = ptree.get_descendents(
    p_tree,
    ptree.get_roots(p_tree)[ptree.get_roots(p_tree)['NewProcessName'] == root_proc_sel.value].iloc[0],
)
# Visualize the tree
process_tree = nbdisplay.plot_process_tree(data=proc_tree, legend_col="SubjectUserName", show_table=True)

# Looking at the processes above, we can see that some of the command-line arguments appear to be Base64 encoded - a common technique attackers use to hide their activity. MSTICpy includes features to identify and [decode Base64-encoded strings](https://msticpy.readthedocs.io/en/latest/data_analysis/Base64Unpack.html) to allow for effective analysis.

# In[34]:

cmd_lines = p_tree.dropna(subset=['CommandLine']).copy()
# Base64-decode strings in our command lines
dec_df = base64.unpack_df(data=cmd_lines, column="CommandLine")
dec_df = dec_df.dropna(subset=['decoded_string'])
dec_df.head()

# ## IoC Extract and Threat Intel Lookup
# With the host process tree above, we have found some activity that appears malicious. However, we'd like to do more validation without manually examining each process. One simple way to do this is to look for key Indicators of Compromise (IoCs) in our data and check them against threat intelligence. We use MSTICpy's `IoCExtract` to [extract known IoC types](https://msticpy.readthedocs.io/en/latest/data_analysis/IoCExtract.html). We can then use the same threat intelligence feature we used earlier on a single IP address to look up multiple IoCs.

# In[37]:

# Extract IoCs from command lines
ioc_ex = IoCExtract()
cmd_iocs = cmd_lines.mp_ioc.extract(columns=['CommandLine'], ioc_types=['ipv4', 'dns'])
b64_iocs = dec_df.mp_ioc.extract(columns=['decoded_string'], ioc_types=['ipv4', 'dns'])
iocs = pd.concat([cmd_iocs, b64_iocs])
iocs = iocs.drop_duplicates(subset=['IoCType', 'Observable']).copy()
iocs.sample(5)

# Similar to the alert viewer widget used earlier, MSTICpy has a viewer for threat intelligence results that makes reviewing the output easier.
# > Note: the full response details from the provider can be seen in the collapsible `Raw Results` section

# In[36]:

# TI Lookups
ti_resp = ti.lookup_iocs(data=iocs, obs_col="Observable")
select_ti = browse_results(ti_resp, severities=['high', 'warning'])
select_ti
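# As an alternative to the interactive browser, we can filter the results frame
# programmatically. A hedged sketch - the column names ('Ioc', 'IocType', 'Provider',
# 'Severity') reflect the TILookup output at the time of writing; inspect
# `ti_resp.columns` if your msticpy version differs:

# In[ ]:

high_sev = ti_resp[ti_resp["Severity"].isin(["high", "warning"])]
display(high_sev[["Ioc", "IocType", "Provider", "Severity"]].drop_duplicates())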
# ### Review domain
# We appear to have identified a malicious domain to go with the IP address identified earlier in our investigation. To complete the investigation, we want to get some context on this domain in the same way we did with the IP address. Again, MSTICpy has a number of tools to help with this, including features to [validate a domain](https://msticpy.readthedocs.io/en/latest/getting_started/PackageSummary.html?highlight=screenshot#domain-utils) and [screenshot a URL](https://msticpy.readthedocs.io/en/latest/getting_started/PackageSummary.html?highlight=screenshot#domain-utils).
#
# > Note: these tools use publicly available services such as *abuse.ch* and *Browshot*

# In[38]:

dom = select_ti.value[0]
dom_val = domain_utils.DomainValidator()
md(f"Is {dom} a valid domain? {dom_val.validate_tld(dom)}")
md(f"Is {dom} resolvable? {dom_val.is_resolvable(dom)}")
md(f"Is the TLS cert used by {dom} in abuse.ch's abuse list? {dom_val.in_abuse_list(dom)[0]}")

# In[39]:

image_data = domain_utils.screenshot("secure-ssl-sec.com")
with open('screenshot.png', 'wb') as f:
    f.write(image_data.content)

display(Image(filename='screenshot.png'))

# # Resources
# ## MSTICpy:
# - msticpy GitHub: https://github.com/Microsoft/msticpy
# - msticpy Docs: https://msticpy.readthedocs.io/en/latest/
# - msticpy Release Blog: https://medium.com/@msticmed
#
# ## MSTICpy maintainers:
# - Ian Hellen [@ianhellen](https://twitter.com/ianhellen)
# - Pete Bryan [@MSSPete](https://twitter.com/MSSPete)
# - Ashwin Patil [@ashwinpatil](https://twitter.com/ashwinpatil)
#
# ## Other useful stuff:
# - Azure Sentinel Notebooks GitHub: https://github.com/Azure/Azure-Sentinel-Notebooks/tree/master
#   (samples with data in the Sample-Notebooks folder)
# - Azure Sentinel Tech Community Blogs: https://aka.ms/AzureSentinelBlog