#!/usr/bin/env python
# coding: utf-8

# In[1]:


import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.express as px
import requests


# In[2]:


def parse_file(path):
    """Parse a timing log (local path or URL) into a DataFrame of per-basket measurements."""
    if path.startswith("http"):
        content = requests.get(path).content.decode()
    else:
        with open(path) as f:
            content = f.read()
    # per-call accumulator so each file is parsed independently
    data = []
    for line in content.splitlines():
        if line.strip() == "CLEAR":
            # ran the loop twice with "CLEAR" in between,
            # so delete the first (cold) set of measurements
            data.clear()
        if line.count(",") != 6:
            continue
        resource, task, branch, basket, start, stop, nevents = line.split(",")
        data.append(dict(
            Resource=resource,
            Task=task.lower(),
            Branch=branch.split()[1],
            Basket=int(basket.split()[1]),
            Start=float(start.split()[1]),
            Finish=float(stop.split()[1]),
            Nevents=int(nevents.split()[1]),
        ))
    df = pd.DataFrame(data)
    df["Start"] = pd.to_datetime(df["Start"], unit="s")
    df["Finish"] = pd.to_datetime(df["Finish"], unit="s")
    return df


# outreach file uses gzip and has only Muon branches (~6)
# nanoaod file uses LZMA and has all branches (~1.5k)
df_outreach_4t = parse_file("http://uaf-7.t2.ucsd.edu/~namin/dump/threading/outreach_timing_4threads.txt")
df_nanoaod_4t = parse_file("http://uaf-7.t2.ucsd.edu/~namin/dump/threading/nanoaod_timing_4threads.txt")
df_outreach_1t = parse_file("http://uaf-7.t2.ucsd.edu/~namin/dump/threading/outreach_timing_1threads.txt")
df_nanoaod_1t = parse_file("http://uaf-7.t2.ucsd.edu/~namin/dump/threading/nanoaod_timing_1threads.txt")


# In[3]:


def plot(df):
    """Show a Gantt chart of the measurements, grouped by Resource, with total walltime in the title."""
    duration = (df["Finish"].max() - df["Start"].min()).total_seconds()
    fig = ff.create_gantt(
        df,
        index_col="Resource",
        show_colorbar=True,
        group_tasks=True,
        title=f"total walltime = {duration:.3f}s",
    )
    fig.show()


# In[4]:


plot(df_outreach_4t)


# In[5]:


plot(df_nanoaod_4t)


# In[6]:


# plot(df_outreach_1t)


# In[7]:


# plot(df_nanoaod_1t)


# In[8]:


# OUTREACH file
# many short baskets at the beginning of the file (which get assigned to thread 1);
# however, the number of events processed by each thread is roughly the same
gb = df_outreach_4t.query("Resource == 'interpretation'").drop_duplicates("Basket").groupby("Task")
print("Baskets per thread")
print(gb.size())
print("Nevents per thread")
gb["Nevents"].sum()


# In[9]:


# NANOAOD file
gb = df_nanoaod_4t.query("Resource == 'interpretation'").drop_duplicates("Basket").groupby("Task")
print("Baskets per thread")
print(gb.size())
print("Nevents per thread")
gb["Nevents"].sum()


# In[ ]: