Notebook: https://aka.ms/Jupyterthon-ws-2-3
License: Creative Commons Attribution-ShareAlike 4.0 International
Q&A - OTR Discord #Jupyterthon #WORKSHOP DAY 2 - ADVANCED PANDAS
import pandas as pd
procs_df = pd.read_csv(
    "../data/process_tree.csv",
    parse_dates=["TimeCreatedUtc", "TimeGenerated"],
    index_col=0,
)
parents = procs_df[["ProcessId", "ParentProcessName"]].drop_duplicates()
procs = (
    procs_df[["NewProcessId", "NewProcessName", "CommandLine", "ProcessId", "TimeCreatedUtc", "SubjectUserSid"]]
    .drop_duplicates()
    .rename(columns={"ProcessId": "ParentProcessId"})
)
users = procs_df[['SubjectUserSid', 'SubjectUserName', 'SubjectDomainName']].drop_duplicates()
print("original", len(procs_df))
print("procs", len(procs))
print("parents", len(parents))
print("users", len(users))
display(procs.head(3))
display(parents)
display(users)
original 117
procs 117
parents 3
users 2
NewProcessId | NewProcessName | CommandLine | ParentProcessId | TimeCreatedUtc | SubjectUserSid | |
---|---|---|---|---|---|---|
0 | 0x1580 | C:\Diagnostics\UserTmp\ftp.exe | .\ftp -s:C:\RECYCLER\xxppyy.exe | 0xbc8 | 2019-01-15 05:15:15.677 | S-1-5-21-996632719-2361334927-4038480536-500 |
1 | 0x16fc | C:\Diagnostics\UserTmp\reg.exe | .\reg not /domain:everything that /sid:shines... | 0xbc8 | 2019-01-15 05:15:16.167 | S-1-5-21-996632719-2361334927-4038480536-500 |
2 | 0x1700 | C:\Diagnostics\UserTmp\cmd.exe | cmd /c "systeminfo && systeminfo" | 0xbc8 | 2019-01-15 05:15:16.277 | S-1-5-21-996632719-2361334927-4038480536-500 |
ProcessId | ParentProcessName | |
---|---|---|
0 | 0xbc8 | C:\Windows\System32\cmd.exe |
115 | 0x440 | C:\Windows\System32\svchost.exe |
116 | 0x1580 | C:\Diagnostics\UserTmp\powershell.exe |
SubjectUserSid | SubjectUserName | SubjectDomainName | |
---|---|---|---|
0 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
115 | S-1-5-18 | MSTICAlertsWin1$ | WORKGROUP |
pd.concat([df1, df2...])
We saw pd.concat used to append rows in Part 1.
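As a refresher, a minimal row-append sketch (df1 and df2 are hypothetical throw-away frames):
import pandas as pd

df1 = pd.DataFrame({"a": [1, 2]})
df2 = pd.DataFrame({"a": [3, 4]})

# axis=0 (the default) stacks rows; ignore_index renumbers them 0..n-1
pd.concat([df1, df2], ignore_index=True)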
# Do some processing on the original DF
dec_logon_id = (
    pd.DataFrame(procs_df.SubjectLogonId.apply(lambda x: int(x, base=16)))
    .rename(columns={"SubjectLogonId": "SubjectLogonId_dec"})
)
dec_logon_id.head(5)
SubjectLogonId_dec | |
---|---|
0 | 16428071 |
1 | 16428071 |
2 | 16428071 |
3 | 16428071 |
4 | 16428071 |
axis="columns"
or axis=1
joins column-wise (horizontally)¶(
pd.concat([procs_df, dec_logon_id], axis="columns")
.head()
.filter(regex=".*Process.*|Sub.*")
)
SubjectUserSid | SubjectUserName | SubjectDomainName | SubjectLogonId | NewProcessId | NewProcessName | ProcessId | ParentProcessName | SubjectLogonId_dec | |
---|---|---|---|---|---|---|---|---|---|
0 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 | 0xfaac27 | 0x1580 | C:\Diagnostics\UserTmp\ftp.exe | 0xbc8 | C:\Windows\System32\cmd.exe | 16428071 |
1 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 | 0xfaac27 | 0x16fc | C:\Diagnostics\UserTmp\reg.exe | 0xbc8 | C:\Windows\System32\cmd.exe | 16428071 |
2 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 | 0xfaac27 | 0x1700 | C:\Diagnostics\UserTmp\cmd.exe | 0xbc8 | C:\Windows\System32\cmd.exe | 16428071 |
3 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 | 0xfaac27 | 0x1728 | C:\Diagnostics\UserTmp\rundll32.exe | 0xbc8 | C:\Windows\System32\cmd.exe | 16428071 |
4 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 | 0xfaac27 | 0x175c | C:\Diagnostics\UserTmp\rundll32.exe | 0xbc8 | C:\Windows\System32\cmd.exe | 16428071 |
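Note that pd.concat aligns on the index, not on position: if the indexes don't match, the result picks up NaNs. A tiny sketch with hypothetical frames:
left = pd.DataFrame({"x": [1, 2]}, index=[0, 1])
right = pd.DataFrame({"y": [10, 20]}, index=[1, 2])

# Only index 1 appears in both frames, so rows 0 and 2 get NaNs
pd.concat([left, right], axis="columns")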
display(procs.head())
display(users)
NewProcessId | NewProcessName | CommandLine | ParentProcessId | TimeCreatedUtc | SubjectUserSid | |
---|---|---|---|---|---|---|
0 | 0x1580 | C:\Diagnostics\UserTmp\ftp.exe | .\ftp -s:C:\RECYCLER\xxppyy.exe | 0xbc8 | 2019-01-15 05:15:15.677 | S-1-5-21-996632719-2361334927-4038480536-500 |
1 | 0x16fc | C:\Diagnostics\UserTmp\reg.exe | .\reg not /domain:everything that /sid:shines... | 0xbc8 | 2019-01-15 05:15:16.167 | S-1-5-21-996632719-2361334927-4038480536-500 |
2 | 0x1700 | C:\Diagnostics\UserTmp\cmd.exe | cmd /c "systeminfo && systeminfo" | 0xbc8 | 2019-01-15 05:15:16.277 | S-1-5-21-996632719-2361334927-4038480536-500 |
3 | 0x1728 | C:\Diagnostics\UserTmp\rundll32.exe | .\rundll32 /C 12345.exe | 0xbc8 | 2019-01-15 05:15:16.340 | S-1-5-21-996632719-2361334927-4038480536-500 |
4 | 0x175c | C:\Diagnostics\UserTmp\rundll32.exe | .\rundll32 /C c:\users\MSTICAdmin\12345.exe | 0xbc8 | 2019-01-15 05:15:16.400 | S-1-5-21-996632719-2361334927-4038480536-500 |
SubjectUserSid | SubjectUserName | SubjectDomainName | |
---|---|---|---|
0 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
115 | S-1-5-18 | MSTICAlertsWin1$ | WORKGROUP |
procs.merge(users, on="SubjectUserSid")
NewProcessId | NewProcessName | CommandLine | ParentProcessId | TimeCreatedUtc | SubjectUserSid | SubjectUserName | SubjectDomainName | |
---|---|---|---|---|---|---|---|---|
0 | 0x1580 | C:\Diagnostics\UserTmp\ftp.exe | .\ftp -s:C:\RECYCLER\xxppyy.exe | 0xbc8 | 2019-01-15 05:15:15.677 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
1 | 0x16fc | C:\Diagnostics\UserTmp\reg.exe | .\reg not /domain:everything that /sid:shines... | 0xbc8 | 2019-01-15 05:15:16.167 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
2 | 0x1700 | C:\Diagnostics\UserTmp\cmd.exe | cmd /c "systeminfo && systeminfo" | 0xbc8 | 2019-01-15 05:15:16.277 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
3 | 0x1728 | C:\Diagnostics\UserTmp\rundll32.exe | .\rundll32 /C 12345.exe | 0xbc8 | 2019-01-15 05:15:16.340 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
4 | 0x175c | C:\Diagnostics\UserTmp\rundll32.exe | .\rundll32 /C c:\users\MSTICAdmin\12345.exe | 0xbc8 | 2019-01-15 05:15:16.400 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
112 | 0x1434 | C:\Diagnostics\UserTmp\rundll32.exe | .\rundll32.exe /C c:\windows\fonts\conhost.exe | 0xbc8 | 2019-01-15 05:15:14.613 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
113 | 0x123c | C:\Diagnostics\UserTmp\regsvr32.exe | .\regsvr32 /u /s c:\windows\fonts\csrss.exe | 0xbc8 | 2019-01-15 05:15:14.693 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
114 | 0x240 | C:\Windows\System32\tasklist.exe | tasklist | 0xbc8 | 2019-01-15 05:15:14.770 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
115 | 0x15a0 | C:\Windows\System32\win32calc.exe | "C:\Windows\System32\win32calc.exe" | 0x1580 | 2019-01-15 05:15:13.053 | S-1-5-21-996632719-2361334927-4038480536-500 | MSTICAdmin | MSTICAlertsWin1 |
116 | 0xbc8 | C:\Windows\System32\cmd.exe | cmd.exe /c c:\Diagnostics\WindowsSimulateDetec... | 0x440 | 2019-01-15 05:15:03.047 | S-1-5-18 | MSTICAlertsWin1$ | WORKGROUP |
117 rows × 8 columns
# users[1:] drops the first user row; the default inner join keeps only matching rows
procs.merge(users[1:], on="SubjectUserSid")
NewProcessId | NewProcessName | CommandLine | ParentProcessId | TimeCreatedUtc | SubjectUserSid | SubjectUserName | SubjectDomainName | |
---|---|---|---|---|---|---|---|---|
0 | 0xbc8 | C:\Windows\System32\cmd.exe | cmd.exe /c c:\Diagnostics\WindowsSimulateDetec... | 0x440 | 2019-01-15 05:15:03.047 | S-1-5-18 | MSTICAlertsWin1$ | WORKGROUP |
# a left join keeps every row of procs; unmatched rows get NaN for the user columns
procs.merge(users[1:], on="SubjectUserSid", how="left")
NewProcessId | NewProcessName | CommandLine | ParentProcessId | TimeCreatedUtc | SubjectUserSid | SubjectUserName | SubjectDomainName | |
---|---|---|---|---|---|---|---|---|
0 | 0x1580 | C:\Diagnostics\UserTmp\ftp.exe | .\ftp -s:C:\RECYCLER\xxppyy.exe | 0xbc8 | 2019-01-15 05:15:15.677 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
1 | 0x16fc | C:\Diagnostics\UserTmp\reg.exe | .\reg not /domain:everything that /sid:shines... | 0xbc8 | 2019-01-15 05:15:16.167 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
2 | 0x1700 | C:\Diagnostics\UserTmp\cmd.exe | cmd /c "systeminfo && systeminfo" | 0xbc8 | 2019-01-15 05:15:16.277 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
3 | 0x1728 | C:\Diagnostics\UserTmp\rundll32.exe | .\rundll32 /C 12345.exe | 0xbc8 | 2019-01-15 05:15:16.340 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
4 | 0x175c | C:\Diagnostics\UserTmp\rundll32.exe | .\rundll32 /C c:\users\MSTICAdmin\12345.exe | 0xbc8 | 2019-01-15 05:15:16.400 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
... | ... | ... | ... | ... | ... | ... | ... | ... |
112 | 0x1434 | C:\Diagnostics\UserTmp\rundll32.exe | .\rundll32.exe /C c:\windows\fonts\conhost.exe | 0xbc8 | 2019-01-15 05:15:14.613 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
113 | 0x123c | C:\Diagnostics\UserTmp\regsvr32.exe | .\regsvr32 /u /s c:\windows\fonts\csrss.exe | 0xbc8 | 2019-01-15 05:15:14.693 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
114 | 0x240 | C:\Windows\System32\tasklist.exe | tasklist | 0xbc8 | 2019-01-15 05:15:14.770 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
115 | 0xbc8 | C:\Windows\System32\cmd.exe | cmd.exe /c c:\Diagnostics\WindowsSimulateDetec... | 0x440 | 2019-01-15 05:15:03.047 | S-1-5-18 | MSTICAlertsWin1$ | WORKGROUP |
116 | 0x15a0 | C:\Windows\System32\win32calc.exe | "C:\Windows\System32\win32calc.exe" | 0x1580 | 2019-01-15 05:15:13.053 | S-1-5-21-996632719-2361334927-4038480536-500 | NaN | NaN |
117 rows × 8 columns
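For reference, the how= parameter selects which keys survive the join (these are the standard pandas join types, shown against the frames above):
procs.merge(users, on="SubjectUserSid", how="inner")  # keys in both (default)
procs.merge(users, on="SubjectUserSid", how="left")   # all rows from procs
procs.merge(users, on="SubjectUserSid", how="right")  # all rows from users
procs.merge(users, on="SubjectUserSid", how="outer")  # union of all keys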
(
    procs.merge(parents, left_on="ParentProcessId", right_on="ProcessId")
    .head()
    .filter(regex=".*Process.*")
)
NewProcessId | NewProcessName | ParentProcessId | ProcessId | ParentProcessName | |
---|---|---|---|---|---|
0 | 0x1580 | C:\Diagnostics\UserTmp\ftp.exe | 0xbc8 | 0xbc8 | C:\Windows\System32\cmd.exe |
1 | 0x16fc | C:\Diagnostics\UserTmp\reg.exe | 0xbc8 | 0xbc8 | C:\Windows\System32\cmd.exe |
2 | 0x1700 | C:\Diagnostics\UserTmp\cmd.exe | 0xbc8 | 0xbc8 | C:\Windows\System32\cmd.exe |
3 | 0x1728 | C:\Diagnostics\UserTmp\rundll32.exe | 0xbc8 | 0xbc8 | C:\Windows\System32\cmd.exe |
4 | 0x175c | C:\Diagnostics\UserTmp\rundll32.exe | 0xbc8 | 0xbc8 | C:\Windows\System32\cmd.exe |
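Because left_on/right_on name different columns, both key columns survive the merge. If you only need one, a sketch:
(
    procs.merge(parents, left_on="ParentProcessId", right_on="ProcessId")
    .drop(columns="ProcessId")  # drop the now-redundant right-hand key
    .head()
)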
net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
# Generate a summary
summary_df = (
    net_df[["RemoteRegion", "TotalAllowedFlows", "L7Protocol"]]
    .groupby("RemoteRegion")
    .agg(
        FlowsSum=pd.NamedAgg("TotalAllowedFlows", "sum"),
        FlowsVar=pd.NamedAgg("TotalAllowedFlows", "var"),
        FlowsStdDev=pd.NamedAgg("TotalAllowedFlows", "std"),
        L7Prots=pd.NamedAgg("L7Protocol", "nunique"),
    )
)
summary_df
FlowsSum | FlowsVar | FlowsStdDev | L7Prots | |
---|---|---|---|---|
RemoteRegion | ||||
814.0 | 57.267027 | 7.567498 | 6 | |
canadacentral | 5103.0 | 29.811223 | 5.459965 | 1 |
centralus | 236.0 | 4.675897 | 2.162382 | 1 |
eastus | 602.0 | 1.646154 | 1.283025 | 3 |
eastus2 | 1502.0 | 4.830914 | 2.197934 | 1 |
northeurope | 82.0 | 0.492438 | 0.701739 | 1 |
southcentralus | 817.0 | 8.882186 | 2.980300 | 1 |
westcentralus | 59.0 | 0.017241 | 0.131306 | 1 |
westus | 38.0 | 0.782609 | 0.884652 | 1 |
westus2 | 7.0 | 0.300000 | 0.547723 | 1 |
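The same named aggregation can be written with (column, aggfunc) tuples instead of pd.NamedAgg - a sketch:
(
    net_df.groupby("RemoteRegion")
    .agg(
        FlowsSum=("TotalAllowedFlows", "sum"),
        L7Prots=("L7Protocol", "nunique"),
    )
)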
df.style.highlight_max(...)
df_style = summary_df.style.highlight_max(color="blue").highlight_min(color="green")
df_style
FlowsSum | FlowsVar | FlowsStdDev | L7Prots | |
---|---|---|---|---|
RemoteRegion | ||||
814.000000 | 57.267027 | 7.567498 | 6 | |
canadacentral | 5103.000000 | 29.811223 | 5.459965 | 1 |
centralus | 236.000000 | 4.675897 | 2.162382 | 1 |
eastus | 602.000000 | 1.646154 | 1.283025 | 3 |
eastus2 | 1502.000000 | 4.830914 | 2.197934 | 1 |
northeurope | 82.000000 | 0.492438 | 0.701739 | 1 |
southcentralus | 817.000000 | 8.882186 | 2.980300 | 1 |
westcentralus | 59.000000 | 0.017241 | 0.131306 | 1 |
westus | 38.000000 | 0.782609 | 0.884652 | 1 |
westus2 | 7.000000 | 0.300000 | 0.547723 | 1 |
df.style.background_gradient(...)
import seaborn as sns
cm = sns.light_palette("blue", as_cmap=True)
summary_df.style.background_gradient(cmap=cm).format("{:.1f}")
FlowsSum | FlowsVar | FlowsStdDev | L7Prots | |
---|---|---|---|---|
RemoteRegion | ||||
814.0 | 57.3 | 7.6 | 6.0 | |
canadacentral | 5103.0 | 29.8 | 5.5 | 1.0 |
centralus | 236.0 | 4.7 | 2.2 | 1.0 |
eastus | 602.0 | 1.6 | 1.3 | 3.0 |
eastus2 | 1502.0 | 4.8 | 2.2 | 1.0 |
northeurope | 82.0 | 0.5 | 0.7 | 1.0 |
southcentralus | 817.0 | 8.9 | 3.0 | 1.0 |
westcentralus | 59.0 | 0.0 | 0.1 | 1.0 |
westus | 38.0 | 0.8 | 0.9 | 1.0 |
westus2 | 7.0 | 0.3 | 0.5 | 1.0 |
df.style.bar(...)
summary_df.style.bar(color="blue").format("{:.2f}")
FlowsSum | FlowsVar | FlowsStdDev | L7Prots | |
---|---|---|---|---|
RemoteRegion | ||||
814.00 | 57.27 | 7.57 | 6.00 | |
canadacentral | 5103.00 | 29.81 | 5.46 | 1.00 |
centralus | 236.00 | 4.68 | 2.16 | 1.00 |
eastus | 602.00 | 1.65 | 1.28 | 3.00 |
eastus2 | 1502.00 | 4.83 | 2.20 | 1.00 |
northeurope | 82.00 | 0.49 | 0.70 | 1.00 |
southcentralus | 817.00 | 8.88 | 2.98 | 1.00 |
westcentralus | 59.00 | 0.02 | 0.13 | 1.00 |
westus | 38.00 | 0.78 | 0.88 | 1.00 |
westus2 | 7.00 | 0.30 | 0.55 | 1.00 |
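Styler methods accept a subset= argument if you only want to style particular columns - a sketch:
summary_df.style.bar(subset=["FlowsSum"], color="blue").format("{:.2f}")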
summary_df.style.set_properties(**{
    "background-color": "black",
    "color": "lawngreen",
    "font-family": "consolas",
}).format("{:.2f}")
FlowsSum | FlowsVar | FlowsStdDev | L7Prots | |
---|---|---|---|---|
RemoteRegion | ||||
814.00 | 57.27 | 7.57 | 6.00 | |
canadacentral | 5103.00 | 29.81 | 5.46 | 1.00 |
centralus | 236.00 | 4.68 | 2.16 | 1.00 |
eastus | 602.00 | 1.65 | 1.28 | 3.00 |
eastus2 | 1502.00 | 4.83 | 2.20 | 1.00 |
northeurope | 82.00 | 0.49 | 0.70 | 1.00 |
southcentralus | 817.00 | 8.88 | 2.98 | 1.00 |
westcentralus | 59.00 | 0.02 | 0.13 | 1.00 |
westus | 38.00 | 0.78 | 0.88 | 1.00 |
westus2 | 7.00 | 0.30 | 0.55 | 1.00 |
pandas primarily uses NaN (the IEEE 754 floating point representation of Not a Number) to represent missing data. Note that NaN is of float type.
Tip: Often you will see TypeError exceptions about not being able to perform an expected operation on a float (when you were expecting the type to be a string or other object type). This is very likely due to NaNs in your data.
NaT is the equivalent of NaN for DateTime data.
You will also sometimes see None (a NoneType object) used for missing data.
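A quick demonstration of both behaviors (sketch):
import numpy as np
import pandas as pd

s = pd.Series([1, 2, None])
print(s.dtype)           # float64 - the None became NaN, forcing a float column
print(np.nan == np.nan)  # False - NaN never compares equal; use isna() instead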
import pandas as pd
net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
len(net_df)
1360
print(f"Null elements in DataFrame: {net_df.isnull().values.sum()} \n\
Rows with null elements: {net_df.shape[0] - net_df.dropna().shape[0]}")
Null elements in DataFrame: 24 Rows with null elements: 8
df.isna()
series.isna()
net_df.isna().any()
TimeGenerated          False
FlowStartTime          False
FlowEndTime            False
FlowIntervalEndTime    False
FlowType               False
ResourceGroup           True
VMName                  True
VMIPAddress            False
PublicIPs               True
SrcIP                  False
DestIP                 False
L4Protocol             False
L7Protocol             False
DestPort               False
FlowDirection          False
AllowedOutFlows        False
AllowedInFlows         False
DeniedInFlows          False
DeniedOutFlows         False
RemoteRegion           False
VMRegion               False
AllExtIPs              False
TotalAllowedFlows      False
dtype: bool
net_df.ResourceGroup.value_counts()
asihuntomsworkspacerg    1352
Name: ResourceGroup, dtype: int64
You can use .isna() on the whole DataFrame or a single column.
net_df[net_df["PublicIPs"].isna()]
TimeGenerated | FlowStartTime | FlowEndTime | FlowIntervalEndTime | FlowType | ResourceGroup | VMName | VMIPAddress | PublicIPs | SrcIP | ... | DestPort | FlowDirection | AllowedOutFlows | AllowedInFlows | DeniedInFlows | DeniedOutFlows | RemoteRegion | VMRegion | AllExtIPs | TotalAllowedFlows | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
326 | 2019-02-13 01:23:47.634 | 2019-02-13 00:55:10 | 2019-02-13 00:57:33 | 2019-02-13 01:00:00 | IntraVNet | None | None | None | 10.0.3.4 | ... | 445.0 | O | 3.0 | 0.0 | 0.0 | 0.0 | eastus | 10.0.3.5 | 3.0 | ||
327 | 2019-02-13 01:23:47.634 | 2019-02-13 00:55:10 | 2019-02-13 00:57:33 | 2019-02-13 01:00:00 | IntraVNet | None | None | None | 10.0.3.4 | ... | 445.0 | I | 0.0 | 3.0 | 0.0 | 0.0 | eastus | 10.0.3.4 | 3.0 | ||
336 | 2019-02-12 22:23:05.399 | 2019-02-12 21:55:43 | 2019-02-12 21:55:43 | 2019-02-12 22:00:00 | IntraVNet | None | None | None | 10.0.3.5 | ... | 22.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | eastus | 10.0.3.4 | 1.0 | ||
345 | 2019-02-12 22:23:05.384 | 2019-02-12 21:54:04 | 2019-02-12 21:55:36 | 2019-02-12 22:00:00 | IntraVNet | None | None | None | 10.0.3.5 | ... | 22.0 | O | 6.0 | 0.0 | 0.0 | 0.0 | eastus | 104.211.30.1 | 6.0 | ||
357 | 2019-02-12 23:23:59.515 | 2019-02-12 22:22:35 | 2019-02-12 22:55:37 | 2019-02-12 23:00:00 | IntraVNet | None | None | None | 10.0.3.5 | ... | 22.0 | O | 12.0 | 0.0 | 0.0 | 0.0 | eastus | 104.211.30.1 | 12.0 | ||
413 | 2019-02-12 18:23:51.853 | 2019-02-12 17:26:19 | 2019-02-12 17:44:09 | 2019-02-12 18:00:00 | IntraVNet | None | None | None | 10.0.3.4 | ... | 445.0 | O | 6.0 | 0.0 | 0.0 | 0.0 | eastus | 10.0.3.5 | 6.0 | ||
414 | 2019-02-12 18:23:51.853 | 2019-02-12 17:26:19 | 2019-02-12 17:44:09 | 2019-02-12 18:00:00 | IntraVNet | None | None | None | 10.0.3.4 | ... | 445.0 | I | 0.0 | 6.0 | 0.0 | 0.0 | eastus | 10.0.3.4 | 6.0 | ||
466 | 2019-02-12 22:23:17.236 | 2019-02-12 21:55:43 | 2019-02-12 21:55:43 | 2019-02-12 22:00:00 | IntraVNet | None | None | None | 10.0.3.5 | ... | 22.0 | I | 0.0 | 1.0 | 0.0 | 0.0 | eastus | 10.0.3.5 | 1.0 |
8 rows × 23 columns
.dropna
df.dropna() # removes all rows with ANY NaNs
df.dropna(axis=1) # removes all columns with ANY NaNs
df.dropna(how="all") # removes all rows that are ALL NaNs
df.dropna(axis=1, how="all") # removes all cols that are ALL NaNs
dropna() also supports inplace=True. Don't do it!!!
len(net_df.dropna())
1352
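dropna() also takes subset= to restrict the NaN check to particular columns - a sketch:
net_df.dropna(subset=["ResourceGroup", "VMName"])  # drop rows missing either column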
df.fillna(replacement) # replace NaNs with 'replacement'
df[column] = df[column].fillna(replacement) # replace NaNs in a single column
net_df2 = net_df.fillna(value="N/A")
net_df2.ResourceGroup.value_counts()
asihuntomsworkspacerg    1352
N/A                         8
Name: ResourceGroup, dtype: int64
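fillna() also accepts a dict to supply per-column replacements - a sketch (the replacement strings are arbitrary):
net_df.fillna({"ResourceGroup": "unknown-rg", "VMName": "unknown-vm"})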
net_df.dtypes
TimeGenerated          datetime64[ns]
FlowStartTime          datetime64[ns]
FlowEndTime            datetime64[ns]
FlowIntervalEndTime    datetime64[ns]
FlowType                       object
ResourceGroup                  object
VMName                         object
VMIPAddress                    object
PublicIPs                      object
SrcIP                          object
DestIP                         object
L4Protocol                     object
L7Protocol                     object
DestPort                      float64
FlowDirection                  object
AllowedOutFlows               float64
AllowedInFlows                float64
DeniedInFlows                 float64
DeniedOutFlows                float64
RemoteRegion                   object
VMRegion                       object
AllExtIPs                      object
TotalAllowedFlows             float64
dtype: object
series.astype(target) # convert type
Target can be a numpy type, a pandas dtype, or a friendly string:
df.astype({col1: type1, col2: type2, ...}) # convert multiple cols
net_df.TotalAllowedFlows = net_df.TotalAllowedFlows.astype('str')
net_df.TotalAllowedFlows.dtypes
dtype('O')
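astype() with a dict converts several columns in one call - a sketch:
net_df.astype({"TotalAllowedFlows": "float64", "FlowType": "category"})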
The pd.to_* converters give you more control over specific conversions (especially for DateTime):
net_df.TotalAllowedFlows = pd.to_numeric(net_df.TotalAllowedFlows)
#pd.to_datetime
#pd.to_timedelta
net_df.TotalAllowedFlows.dtypes
dtype('float64')
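These converters take an errors= parameter; errors="coerce" turns unparsable values into NaN/NaT instead of raising - a sketch with throw-away data:
pd.to_numeric(pd.Series(["1", "2", "oops"]), errors="coerce")             # 1.0, 2.0, NaN
pd.to_datetime(pd.Series(["2019-01-15", "not a date"]), errors="coerce")  # ..., NaT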
net_df.columns
Index(['TimeGenerated', 'FlowStartTime', 'FlowEndTime', 'FlowIntervalEndTime', 'FlowType', 'ResourceGroup', 'VMName', 'VMIPAddress', 'PublicIPs', 'SrcIP', 'DestIP', 'L4Protocol', 'L7Protocol', 'DestPort', 'FlowDirection', 'AllowedOutFlows', 'AllowedInFlows', 'DeniedInFlows', 'DeniedOutFlows', 'RemoteRegion', 'VMRegion', 'AllExtIPs', 'TotalAllowedFlows'], dtype='object')
df.rename(columns={col1: col1_new, col2: col2_new, ...}) # rename columns
net_df.rename(columns={"FlowStartTime": "FlowStartDateTime", "FlowEndTime": "FlowEndDateTime"}).columns
Index(['TimeGenerated', 'FlowStartDateTime', 'FlowEndDateTime', 'FlowIntervalEndTime', 'FlowType', 'ResourceGroup', 'VMName', 'VMIPAddress', 'PublicIPs', 'SrcIP', 'DestIP', 'L4Protocol', 'L7Protocol', 'DestPort', 'FlowDirection', 'AllowedOutFlows', 'AllowedInFlows', 'DeniedInFlows', 'DeniedOutFlows', 'RemoteRegion', 'VMRegion', 'AllExtIPs', 'TotalAllowedFlows'], dtype='object')
df.rename(func, axis='columns')
net_df.rename(str.lower, axis='columns').columns
Index(['timegenerated', 'flowstarttime', 'flowendtime', 'flowintervalendtime', 'flowtype', 'resourcegroup', 'vmname', 'vmipaddress', 'publicips', 'srcip', 'destip', 'l4protocol', 'l7protocol', 'destport', 'flowdirection', 'allowedoutflows', 'allowedinflows', 'deniedinflows', 'deniedoutflows', 'remoteregion', 'vmregion', 'allextips', 'totalallowedflows'], dtype='object')
net_df.columns
Index(['TimeGenerated', 'FlowStartTime', 'FlowEndTime', 'FlowIntervalEndTime', 'FlowType', 'ResourceGroup', 'VMName', 'VMIPAddress', 'PublicIPs', 'SrcIP', 'DestIP', 'L4Protocol', 'L7Protocol', 'DestPort', 'FlowDirection', 'AllowedOutFlows', 'AllowedInFlows', 'DeniedInFlows', 'DeniedOutFlows', 'RemoteRegion', 'VMRegion', 'AllExtIPs', 'TotalAllowedFlows'], dtype='object')
net_df.columns = [
    "timegenerated",
    "flowstarttime",
    "flowendtime",
    "flowintervalendtime",
    "flowtype",
    "resourcegroup",
    "vmname",
    "vmipaddress",
    "publicips",
    "srcip",
    "destip",
    "l4protocol",
    "l7protocol",
    "destport",
    "flowdirection",
    "allowedoutflows",
    "allowedinflows",
    "deniedinflows",
    "deniedoutflows",
    "remoteregion",
    "vmregion",
    "allextips",
    "totalallowedflows",
]
net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.PublicIPs.head(10)
0                                [13.67.143.117]
1                                 [40.77.232.95]
2                   [13.65.107.32, 40.124.45.19]
3                   [13.65.107.32, 40.124.45.19]
4                                  [20.38.98.100]
5                                [13.67.143.117]
6                 [13.71.172.128, 13.71.172.130]
7                 [13.71.172.128, 13.71.172.130]
8     [65.55.44.109, 40.77.228.69, 65.55.44.108]
9     [65.55.44.109, 40.77.228.69, 65.55.44.108]
Name: PublicIPs, dtype: object
net_df.PublicIPs.count()
1352
net_df_ext = net_df.explode("PublicIPs")
net_df_ext.PublicIPs.head(10)
0    13.67.143.117
1     40.77.232.95
2     13.65.107.32
2     40.124.45.19
3     13.65.107.32
3     40.124.45.19
4     20.38.98.100
5    13.67.143.117
6    13.71.172.128
6    13.71.172.130
Name: PublicIPs, dtype: object
len(net_df_ext.PublicIPs.unique())
123
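Note that explode() repeats the original index label for each list element (see indexes 2, 3, and 6 above). Reset it if you need unique row labels - a sketch:
net_df_ext = net_df.explode("PublicIPs").reset_index(drop=True)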
pd.melt() unpivots a DataFrame from wide format to long format.
The melt() function is useful to massage a DataFrame into a format where one or more columns are identifier variables, while all other columns, considered measured variables, are unpivoted to the row axis, leaving just two non-identifier columns, variable and value.
net_df_min = net_df[["FlowType", "AllExtIPs", "TotalAllowedFlows"]]
pd.melt(net_df_min, id_vars=["AllExtIPs"])
AllExtIPs | variable | value | |
---|---|---|---|
0 | 13.67.143.117 | FlowType | AzurePublic |
1 | 40.77.232.95 | FlowType | AzurePublic |
2 | 13.65.107.32 | FlowType | AzurePublic |
3 | 40.124.45.19 | FlowType | AzurePublic |
4 | 20.38.98.100 | FlowType | AzurePublic |
... | ... | ... | ... |
2715 | 13.71.172.130 | TotalAllowedFlows | 23.0 |
2716 | 40.77.232.95 | TotalAllowedFlows | 1.0 |
2717 | 52.168.138.145 | TotalAllowedFlows | 4.0 |
2718 | 23.215.98.90 | TotalAllowedFlows | 2.0 |
2719 | 72.21.81.240 | TotalAllowedFlows | 2.0 |
2720 rows × 3 columns
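melt() also takes value_vars to unpivot only selected columns, plus var_name/value_name to rename the output columns - a sketch:
pd.melt(
    net_df_min,
    id_vars=["AllExtIPs"],
    value_vars=["TotalAllowedFlows"],  # melt just this measure
    var_name="Measure",
    value_name="Value",
)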
net_df_min.head().T
0 | 1 | 2 | 3 | 4 | |
---|---|---|---|---|---|
FlowType | AzurePublic | AzurePublic | AzurePublic | AzurePublic | AzurePublic |
AllExtIPs | 13.67.143.117 | 40.77.232.95 | 13.65.107.32 | 40.124.45.19 | 20.38.98.100 |
TotalAllowedFlows | 1.0 | 1.0 | 4.0 | 4.0 | 1.0 |
net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df_agg = net_df.groupby("AllExtIPs").agg(
    {
        "TotalAllowedFlows": ["mean", "min", "max"],
        "AllowedOutFlows": ["mean", "min", "max"],
        "AllowedInFlows": ["mean", "min", "max"],
    }
)
net_df_agg.head()
TotalAllowedFlows | AllowedOutFlows | AllowedInFlows | |||||||
---|---|---|---|---|---|---|---|---|---|
mean | min | max | mean | min | max | mean | min | max | |
AllExtIPs | |||||||||
10.0.3.4 | 3.333333 | 1.0 | 6.0 | 0.333333 | 0.0 | 1.0 | 3.000000 | 0.0 | 6.0 |
10.0.3.5 | 3.333333 | 1.0 | 6.0 | 3.000000 | 0.0 | 6.0 | 0.333333 | 0.0 | 1.0 |
104.211.30.1 | 9.000000 | 6.0 | 12.0 | 9.000000 | 6.0 | 12.0 | 0.000000 | 0.0 | 0.0 |
104.40.17.153 | 1.750000 | 1.0 | 2.0 | 1.750000 | 1.0 | 2.0 | 0.000000 | 0.0 | 0.0 |
104.43.212.12 | 2.166667 | 1.0 | 4.0 | 2.166667 | 1.0 | 4.0 | 0.000000 | 0.0 | 0.0 |
net_df_agg["TotalAllowedFlows"]["mean"]
AllExtIPs
10.0.3.4          3.333333
10.0.3.5          3.333333
104.211.30.1      9.000000
104.40.17.153     1.750000
104.43.212.12     2.166667
                    ...
90.130.70.73      1.000000
99.84.104.63      7.000000
99.84.106.178    10.000000
99.84.106.27     10.000000
99.84.106.92     10.000000
Name: mean, Length: 125, dtype: float64
idx = pd.IndexSlice
net_df_agg.loc[:, idx[:, "mean"]]
TotalAllowedFlows | AllowedOutFlows | AllowedInFlows | |
---|---|---|---|
mean | mean | mean | |
AllExtIPs | |||
10.0.3.4 | 3.333333 | 0.333333 | 3.000000 |
10.0.3.5 | 3.333333 | 3.000000 | 0.333333 |
104.211.30.1 | 9.000000 | 9.000000 | 0.000000 |
104.40.17.153 | 1.750000 | 1.750000 | 0.000000 |
104.43.212.12 | 2.166667 | 2.166667 | 0.000000 |
... | ... | ... | ... |
90.130.70.73 | 1.000000 | 1.000000 | 0.000000 |
99.84.104.63 | 7.000000 | 7.000000 | 0.000000 |
99.84.106.178 | 10.000000 | 10.000000 | 0.000000 |
99.84.106.27 | 10.000000 | 10.000000 | 0.000000 |
99.84.106.92 | 10.000000 | 10.000000 | 0.000000 |
125 rows × 3 columns
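An equivalent cross-section of the column MultiIndex uses .xs, which also drops the selected level from the columns - a sketch:
net_df_agg.xs("mean", axis=1, level=1)  # the 'mean' column from each group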
net_df_agg_stacked = net_df_agg.stack()
net_df_agg_stacked.head()
TotalAllowedFlows | AllowedOutFlows | AllowedInFlows | ||
---|---|---|---|---|
AllExtIPs | ||||
10.0.3.4 | mean | 3.333333 | 0.333333 | 3.000000 |
min | 1.000000 | 0.000000 | 0.000000 | |
max | 6.000000 | 1.000000 | 6.000000 | |
10.0.3.5 | mean | 3.333333 | 3.000000 | 0.333333 |
min | 1.000000 | 0.000000 | 0.000000 |
net_df_agg_stacked.loc[("10.0.3.4","mean"),"TotalAllowedFlows"]
3.3333333333333335
net_df_agg_stacked.unstack().head()
TotalAllowedFlows | AllowedOutFlows | AllowedInFlows | |||||||
---|---|---|---|---|---|---|---|---|---|
mean | min | max | mean | min | max | mean | min | max | |
AllExtIPs | |||||||||
10.0.3.4 | 3.333333 | 1.0 | 6.0 | 0.333333 | 0.0 | 1.0 | 3.000000 | 0.0 | 6.0 |
10.0.3.5 | 3.333333 | 1.0 | 6.0 | 3.000000 | 0.0 | 6.0 | 0.333333 | 0.0 | 1.0 |
104.211.30.1 | 9.000000 | 6.0 | 12.0 | 9.000000 | 6.0 | 12.0 | 0.000000 | 0.0 | 0.0 |
104.40.17.153 | 1.750000 | 1.0 | 2.0 | 1.750000 | 1.0 | 2.0 | 0.000000 | 0.0 | 0.0 |
104.43.212.12 | 2.166667 | 1.0 | 4.0 | 2.166667 | 1.0 | 4.0 | 0.000000 | 0.0 | 0.0 |
net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()
TimeGenerated | FlowStartTime | FlowEndTime | FlowIntervalEndTime | FlowType | ResourceGroup | VMName | VMIPAddress | PublicIPs | SrcIP | ... | DestPort | FlowDirection | AllowedOutFlows | AllowedInFlows | DeniedInFlows | DeniedOutFlows | RemoteRegion | VMRegion | AllExtIPs | TotalAllowedFlows | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-02-14 13:23:59.512 | 2019-02-14 12:21:58 | 2019-02-14 12:21:58 | 2019-02-14 13:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.67.143.117] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | centralus | eastus | 13.67.143.117 | 1.0 | |
1 | 2019-02-14 13:23:59.512 | 2019-02-14 12:29:02 | 2019-02-14 12:29:02 | 2019-02-14 13:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [40.77.232.95] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | westcentralus | eastus | 40.77.232.95 | 1.0 | |
2 | 2019-02-14 03:26:06.765 | 2019-02-14 02:08:46 | 2019-02-14 02:48:45 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.65.107.32, 40.124.45.19] | ... | 443.0 | O | 4.0 | 0.0 | 0.0 | 0.0 | southcentralus | eastus | 13.65.107.32 | 4.0 | |
3 | 2019-02-14 03:26:06.765 | 2019-02-14 02:08:46 | 2019-02-14 02:48:45 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.65.107.32, 40.124.45.19] | ... | 443.0 | O | 4.0 | 0.0 | 0.0 | 0.0 | southcentralus | eastus | 40.124.45.19 | 4.0 | |
4 | 2019-02-14 03:26:06.828 | 2019-02-14 02:30:56 | 2019-02-14 02:30:56 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [20.38.98.100] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | eastus | eastus | 20.38.98.100 | 1.0 |
5 rows × 23 columns
# Prepare a grouped dataset to pivot - pivot() expects unique index/column pairs
net_df_grouped = net_df.groupby(['FlowIntervalEndTime','FlowType'])['AllExtIPs'].count().reset_index()
net_df_grouped.head()
FlowIntervalEndTime | FlowType | AllExtIPs | |
---|---|---|---|
0 | 2019-02-07 13:00:00 | AzurePublic | 3 |
1 | 2019-02-07 14:00:00 | AzurePublic | 8 |
2 | 2019-02-07 14:00:00 | ExternalPublic | 1 |
3 | 2019-02-07 15:00:00 | AzurePublic | 5 |
4 | 2019-02-07 15:00:00 | ExternalPublic | 3 |
net_df_grouped.pivot(index="FlowIntervalEndTime", columns="FlowType", values="AllExtIPs")
FlowType | AzurePublic | ExternalPublic | IntraVNet |
---|---|---|---|
FlowIntervalEndTime | |||
2019-02-07 13:00:00 | 3.0 | NaN | NaN |
2019-02-07 14:00:00 | 8.0 | 1.0 | NaN |
2019-02-07 15:00:00 | 5.0 | 3.0 | NaN |
2019-02-07 16:00:00 | 7.0 | 1.0 | NaN |
2019-02-07 17:00:00 | 8.0 | NaN | NaN |
... | ... | ... | ... |
2019-02-14 09:00:00 | 8.0 | 1.0 | NaN |
2019-02-14 10:00:00 | 8.0 | NaN | NaN |
2019-02-14 11:00:00 | 7.0 | 2.0 | NaN |
2019-02-14 12:00:00 | 9.0 | NaN | NaN |
2019-02-14 13:00:00 | 2.0 | NaN | NaN |
141 rows × 3 columns
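pivot() raises if an index/column pair occurs more than once, which is why we pre-aggregated with groupby. pivot_table() aggregates duplicates itself, so it can do both steps in one call - a sketch:
net_df.pivot_table(
    index="FlowIntervalEndTime",
    columns="FlowType",
    values="AllExtIPs",
    aggfunc="count",
)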
net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()
TimeGenerated | FlowStartTime | FlowEndTime | FlowIntervalEndTime | FlowType | ResourceGroup | VMName | VMIPAddress | PublicIPs | SrcIP | ... | DestPort | FlowDirection | AllowedOutFlows | AllowedInFlows | DeniedInFlows | DeniedOutFlows | RemoteRegion | VMRegion | AllExtIPs | TotalAllowedFlows | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-02-14 13:23:59.512 | 2019-02-14 12:21:58 | 2019-02-14 12:21:58 | 2019-02-14 13:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.67.143.117] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | centralus | eastus | 13.67.143.117 | 1.0 | |
1 | 2019-02-14 13:23:59.512 | 2019-02-14 12:29:02 | 2019-02-14 12:29:02 | 2019-02-14 13:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [40.77.232.95] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | westcentralus | eastus | 40.77.232.95 | 1.0 | |
2 | 2019-02-14 03:26:06.765 | 2019-02-14 02:08:46 | 2019-02-14 02:48:45 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.65.107.32, 40.124.45.19] | ... | 443.0 | O | 4.0 | 0.0 | 0.0 | 0.0 | southcentralus | eastus | 13.65.107.32 | 4.0 | |
3 | 2019-02-14 03:26:06.765 | 2019-02-14 02:08:46 | 2019-02-14 02:48:45 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.65.107.32, 40.124.45.19] | ... | 443.0 | O | 4.0 | 0.0 | 0.0 | 0.0 | southcentralus | eastus | 40.124.45.19 | 4.0 | |
4 | 2019-02-14 03:26:06.828 | 2019-02-14 02:30:56 | 2019-02-14 02:30:56 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [20.38.98.100] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | eastus | eastus | 20.38.98.100 | 1.0 |
5 rows × 23 columns
dti = pd.to_datetime(net_df['TimeGenerated'])
dti_utc = dti.dt.tz_localize("UTC")
print(dti_utc)
0      2019-02-14 13:23:59.512000+00:00
1      2019-02-14 13:23:59.512000+00:00
2      2019-02-14 03:26:06.765000+00:00
3      2019-02-14 03:26:06.765000+00:00
4      2019-02-14 03:26:06.828000+00:00
                     ...
1355   2019-02-09 03:32:41.967000+00:00
1356   2019-02-09 03:32:51.124000+00:00
1357   2019-02-09 03:32:51.264000+00:00
1358   2019-02-09 03:32:45.608000+00:00
1359   2019-02-09 03:32:45.608000+00:00
Name: TimeGenerated, Length: 1360, dtype: datetime64[ns, UTC]
dti_pst = dti.dt.tz_localize("US/Pacific")
print(dti_pst)
0      2019-02-14 13:23:59.512000-08:00
1      2019-02-14 13:23:59.512000-08:00
2      2019-02-14 03:26:06.765000-08:00
3      2019-02-14 03:26:06.765000-08:00
4      2019-02-14 03:26:06.828000-08:00
                     ...
1355   2019-02-09 03:32:41.967000-08:00
1356   2019-02-09 03:32:51.124000-08:00
1357   2019-02-09 03:32:51.264000-08:00
1358   2019-02-09 03:32:45.608000-08:00
1359   2019-02-09 03:32:45.608000-08:00
Name: TimeGenerated, Length: 1360, dtype: datetime64[ns, US/Pacific]
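tz_localize stamps naive timestamps with a zone and does no arithmetic (note the identical wall-clock times in both outputs above); tz_convert translates an already-localized series into another zone - a sketch:
dti_utc.dt.tz_convert("US/Pacific")  # shifts the wall-clock time by the UTC offset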
Resampling - resample() is a time-based groupby.
net_df.set_index('TimeGenerated').resample('H')['FlowType'].count()
TimeGenerated
2019-02-07 13:00:00    3
2019-02-07 14:00:00    9
2019-02-07 15:00:00    8
2019-02-07 16:00:00    8
2019-02-07 17:00:00    8
                      ..
2019-02-14 09:00:00    9
2019-02-14 10:00:00    8
2019-02-14 11:00:00    9
2019-02-14 12:00:00    9
2019-02-14 13:00:00    2
Freq: H, Name: FlowType, Length: 169, dtype: int64
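Other frequency strings and aggregations work the same way - a sketch:
net_df.set_index("TimeGenerated").resample("4H")["TotalAllowedFlows"].sum()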
net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
net_df.head()
TimeGenerated | FlowStartTime | FlowEndTime | FlowIntervalEndTime | FlowType | ResourceGroup | VMName | VMIPAddress | PublicIPs | SrcIP | ... | DestPort | FlowDirection | AllowedOutFlows | AllowedInFlows | DeniedInFlows | DeniedOutFlows | RemoteRegion | VMRegion | AllExtIPs | TotalAllowedFlows | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2019-02-14 13:23:59.512 | 2019-02-14 12:21:58 | 2019-02-14 12:21:58 | 2019-02-14 13:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.67.143.117] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | centralus | eastus | 13.67.143.117 | 1.0 | |
1 | 2019-02-14 13:23:59.512 | 2019-02-14 12:29:02 | 2019-02-14 12:29:02 | 2019-02-14 13:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [40.77.232.95] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | westcentralus | eastus | 40.77.232.95 | 1.0 | |
2 | 2019-02-14 03:26:06.765 | 2019-02-14 02:08:46 | 2019-02-14 02:48:45 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.65.107.32, 40.124.45.19] | ... | 443.0 | O | 4.0 | 0.0 | 0.0 | 0.0 | southcentralus | eastus | 13.65.107.32 | 4.0 | |
3 | 2019-02-14 03:26:06.765 | 2019-02-14 02:08:46 | 2019-02-14 02:48:45 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [13.65.107.32, 40.124.45.19] | ... | 443.0 | O | 4.0 | 0.0 | 0.0 | 0.0 | southcentralus | eastus | 40.124.45.19 | 4.0 | |
4 | 2019-02-14 03:26:06.828 | 2019-02-14 02:30:56 | 2019-02-14 02:30:56 | 2019-02-14 03:00:00 | AzurePublic | asihuntomsworkspacerg | msticalertswin1 | 10.0.3.5 | [20.38.98.100] | ... | 443.0 | O | 1.0 | 0.0 | 0.0 | 0.0 | eastus | eastus | 20.38.98.100 | 1.0 |
5 rows × 23 columns
net_df[["TimeGenerated","AllExtIPs"]].groupby("AllExtIPs").agg("count")
TimeGenerated | |
---|---|
AllExtIPs | |
10.0.3.4 | 3 |
10.0.3.5 | 3 |
104.211.30.1 | 2 |
104.40.17.153 | 4 |
104.43.212.12 | 12 |
... | ... |
90.130.70.73 | 2 |
99.84.104.63 | 1 |
99.84.106.178 | 1 |
99.84.106.27 | 1 |
99.84.106.92 | 1 |
125 rows × 1 columns
net_df[["RemoteRegion","AllExtIPs"]].groupby("AllExtIPs").agg("count").sort_values(by="RemoteRegion", ascending=False)
RemoteRegion | |
---|---|
AllExtIPs | |
65.55.44.109 | 139 |
13.71.172.130 | 136 |
52.168.138.145 | 117 |
40.124.45.19 | 115 |
13.71.172.128 | 114 |
... | ... |
23.45.181.178 | 1 |
23.45.181.176 | 1 |
23.45.181.160 | 1 |
23.45.180.34 | 1 |
99.84.106.92 | 1 |
125 rows × 1 columns
(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
)
RemoteRegion | |
---|---|
AllExtIPs | |
65.55.44.109 | 139 |
13.71.172.130 | 136 |
52.168.138.145 | 117 |
40.124.45.19 | 115 |
13.71.172.128 | 114 |
... | ... |
23.45.181.178 | 1 |
23.45.181.176 | 1 |
23.45.181.160 | 1 |
23.45.180.34 | 1 |
99.84.106.92 | 1 |
125 rows × 1 columns
(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
    .head(5)
)
RemoteRegion | |
---|---|
AllExtIPs | |
65.55.44.109 | 139 |
13.71.172.130 | 136 |
52.168.138.145 | 117 |
40.124.45.19 | 115 |
13.71.172.128 | 114 |
(
    net_df[["RemoteRegion", "AllExtIPs"]]
    .groupby("AllExtIPs")
    .agg("count")
    .sort_values(by="RemoteRegion", ascending=False)
    .head(5)
    .index
    .to_list()
)
['65.55.44.109', '13.71.172.130', '52.168.138.145', '40.124.45.19', '13.71.172.128']
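value_counts() counts and sorts descending in one call, so the same top-5 list can be written as a sketch:
net_df["AllExtIPs"].value_counts().head(5).index.to_list()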
.pipe
df.pipe(function)
You can call functions to process your data. The function must take a DataFrame as its first parameter and return a DataFrame.
# Define a couple of (not very useful) functions
def drop_duplicates(df, column_name):
    return df.drop_duplicates(subset=column_name)

def fill_missing_values(df):
    df_result = df.copy()
    for col in df_result.columns:
        df_result[col].fillna("N/A", inplace=True)
    return df_result
display(net_df[["TimeGenerated", "ResourceGroup"]][net_df["ResourceGroup"].isna()])
print("rows with NaNs:", net_df.isnull().values.sum())
TimeGenerated | ResourceGroup | |
---|---|---|
326 | 2019-02-13 01:23:47.634 | None |
327 | 2019-02-13 01:23:47.634 | None |
336 | 2019-02-12 22:23:05.399 | None |
345 | 2019-02-12 22:23:05.384 | None |
357 | 2019-02-12 23:23:59.515 | None |
413 | 2019-02-12 18:23:51.853 | None |
414 | 2019-02-12 18:23:51.853 | None |
466 | 2019-02-12 22:23:17.236 | None |
rows with NaNs: 24
net_df.pipe(fill_missing_values).isnull().values.sum()
0
Using the drop_duplicates function
len(net_df)
1360
net_df.pipe(drop_duplicates, "AllExtIPs").shape
(125, 23)
Using both functions
net_df = pd.read_pickle("../data/az_net_comms_df.pkl")
len(net_df)
1360
net_df_cleaned = (
    net_df
    .pipe(drop_duplicates, "AllExtIPs")
    .pipe(fill_missing_values)
)
display(net_df[["TimeGenerated", "ResourceGroup"]][net_df["ResourceGroup"].isna()])
display(net_df_cleaned[["TimeGenerated", "ResourceGroup"]][net_df["ResourceGroup"].isna()])
TimeGenerated | ResourceGroup | |
---|---|---|
326 | 2019-02-13 01:23:47.634 | None |
327 | 2019-02-13 01:23:47.634 | None |
336 | 2019-02-12 22:23:05.399 | None |
345 | 2019-02-12 22:23:05.384 | None |
357 | 2019-02-12 23:23:59.515 | None |
413 | 2019-02-12 18:23:51.853 | None |
414 | 2019-02-12 18:23:51.853 | None |
466 | 2019-02-12 22:23:17.236 | None |
C:\Users\Ian\Anaconda3\envs\condadev\lib\site-packages\ipykernel_launcher.py:2: UserWarning: Boolean Series key will be reindexed to match DataFrame index.
TimeGenerated | ResourceGroup | |
---|---|---|
326 | 2019-02-13 01:23:47.634 | N/A |
327 | 2019-02-13 01:23:47.634 | N/A |
345 | 2019-02-12 22:23:05.384 | N/A |
display(
    # NB: apply's second positional parameter is convert_dtype, not an
    # argument to str.split, so the "." here is silently swallowed and
    # split() runs with no separator (whitespace) - hence the one-element
    # lists below. To split on "." use: apply(str.split, args=(".",))
    net_df.VMIPAddress.apply(str.split, ".").head()
)
display(
    net_df.VMRegion.apply(str.capitalize).head()
)
0    [10.0.3.5]
1    [10.0.3.5]
2    [10.0.3.5]
3    [10.0.3.5]
4    [10.0.3.5]
Name: VMIPAddress, dtype: object
0    Eastus
1    Eastus
2    Eastus
3    Eastus
4    Eastus
Name: VMRegion, dtype: object
# Using a lambda (inline) function
display(
    net_df.VMIPAddress.apply(
        lambda col: "_".join(col.split("."))
    )
    .head()
)
0    10_0_3_5
1    10_0_3_5
2    10_0_3_5
3    10_0_3_5
4    10_0_3_5
Name: VMIPAddress, dtype: object
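For simple string transforms like this, the vectorized .str accessor is usually faster than apply - a sketch:
net_df.VMIPAddress.str.replace(".", "_", regex=False).head()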
import ipaddress

def to_ip(ip_str):
    if ip_str:
        ip = ipaddress.ip_address(ip_str)
        if ip.is_global:
            return "global", ip
        else:
            return "other", ip
    return "Unknown"
display(
    net_df.AllExtIPs.apply(to_ip)
    .head(10)
)
0    (global, 13.67.143.117)
1     (global, 40.77.232.95)
2     (global, 13.65.107.32)
3     (global, 40.124.45.19)
4     (global, 20.38.98.100)
5    (global, 13.67.143.117)
6    (global, 13.71.172.128)
7    (global, 13.71.172.130)
8     (global, 65.55.44.109)
9     (global, 40.77.228.69)
Name: AllExtIPs, dtype: object
.apply and DataFrames
df.apply(function, axis=1) # apply by row
df.apply(function [, axis=0]) # apply by column
display(
    net_df
    .apply(lambda row: row.RemoteRegion.upper(), axis=1)
    .head(5)
)
display(
    net_df
    .apply(lambda row: row.RemoteRegion.capitalize() + ": " + str(hash(row.RemoteRegion)), axis=1)
    .head()
)
0         CENTRALUS
1     WESTCENTRALUS
2    SOUTHCENTRALUS
3    SOUTHCENTRALUS
4            EASTUS
dtype: object
0          Centralus: 3997470178254466550
1      Westcentralus: 2950529182713360191
2     Southcentralus: 2388453837175337402
3     Southcentralus: 2388453837175337402
4            Eastus: -5448835124403651518
dtype: object
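Row-wise apply (axis=1) runs your function once per row, which is slow on large frames; prefer vectorized column operations when a single column is enough - a sketch:
net_df.RemoteRegion.str.upper().head(5)  # same result as the axis=1 version above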
def df_to_ip(row):
    for name in row.index:
        value = row[name]
        try:
            ip = ipaddress.ip_address(value)
            if ip.is_global:
                row[name] = f"IP global: {ip}"
            elif ip.is_private:
                row[name] = f"IP private: {ip}"
            else:
                row[name] = f"IP other: {ip}"
        except ValueError:
            # value is not an IP address - leave it unchanged
            pass
    return row
net_df.apply(df_to_ip, axis=1).filter(regex=".*IP.*")
VMIPAddress | PublicIPs | SrcIP | DestIP | AllExtIPs | |
---|---|---|---|---|---|
0 | IP private: 10.0.3.5 | [13.67.143.117] | IP global: 13.67.143.117 | ||
1 | IP private: 10.0.3.5 | [40.77.232.95] | IP global: 40.77.232.95 | ||
2 | IP private: 10.0.3.5 | [13.65.107.32, 40.124.45.19] | IP global: 13.65.107.32 | ||
3 | IP private: 10.0.3.5 | [13.65.107.32, 40.124.45.19] | IP global: 40.124.45.19 | ||
4 | IP private: 10.0.3.5 | [20.38.98.100] | IP global: 20.38.98.100 | ||
... | ... | ... | ... | ... | ... |
1355 | IP private: 10.0.3.5 | [13.71.172.128, 13.71.172.130] | IP global: 13.71.172.130 | ||
1356 | IP private: 10.0.3.5 | [40.77.232.95] | IP global: 40.77.232.95 | ||
1357 | IP private: 10.0.3.5 | [52.168.138.145] | IP global: 52.168.138.145 | ||
1358 | IP private: 10.0.3.5 | [23.215.98.90, 72.21.81.240] | IP global: 23.215.98.90 | ||
1359 | IP private: 10.0.3.5 | [23.215.98.90, 72.21.81.240] | IP global: 72.21.81.240 |
1360 rows × 5 columns