In [1]:

import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import plotly.express as px
import requests

In [2]:

def parse_file(path):
    """Parse a timing dump into a DataFrame with one row per measurement.

    Parameters
    ----------
    path : str
        Either a URL (anything starting with "http", fetched with
        ``requests``) or a path to a local text file.

    Returns
    -------
    pandas.DataFrame
        Columns ``Resource``, ``Task``, ``Branch``, ``Basket``, ``Start``,
        ``Finish`` and ``Nevents``.  ``Start`` and ``Finish`` are converted
        from seconds to pandas datetimes.

    Raises
    ------
    requests.HTTPError
        If ``path`` is a URL and the server answers with an error status.

    Notes
    -----
    Only lines containing exactly six commas (seven fields) are treated as
    data; everything else is skipped.  For the last five fields the second
    whitespace-separated token is used as the value.  A line consisting of
    ``CLEAR`` discards every row collected so far from this file.

    Rows are collected in a list local to each call, so the result never
    contains rows from a previously parsed file.
    """
    if path.startswith("http"):
        response = requests.get(path, timeout=60)
        # Fail loudly instead of trying to parse an HTTP error page.
        response.raise_for_status()
        content = response.content.decode()
    else:
        with open(path) as handle:
            content = handle.read()

    records = []
    for line in content.splitlines():
        if line.strip() == "CLEAR":
            # ran the loop twice with "CLEAR" in between
            # so delete the first set of cold measurements
            records.clear()
            continue
        if line.count(",") != 6:
            continue
        resource, task, branch, basket, start, stop, nevents = line.split(",")
        records.append(dict(
            Resource=resource,
            Task=task.lower(),
            Branch=branch.split()[1],
            Basket=int(basket.split()[1]),
            Start=float(start.split()[1]),
            Finish=float(stop.split()[1]),
            Nevents=int(nevents.split()[1]),
        ))

    # Explicit columns keep the frame well-formed even when no row matched.
    df = pd.DataFrame(
        records,
        columns=["Resource", "Task", "Branch", "Basket", "Start", "Finish", "Nevents"],
    )
    df["Start"] = pd.to_datetime(df["Start"], unit="s")
    df["Finish"] = pd.to_datetime(df["Finish"], unit="s")
    return df

# Load the four timing dumps, in the order listed below.
#   outreach file: gzip, only Muon branches (~6)
#   nanoaod file:  LZMA, all branches (~1.5k)
# Each file has a "4threads" and a "1threads" dump.
df_outreach_4t, df_nanoaod_4t, df_outreach_1t, df_nanoaod_1t = map(
    parse_file,
    (
        "http://uaf-7.t2.ucsd.edu/~namin/dump/threading/outreach_timing_4threads.txt",
        "http://uaf-7.t2.ucsd.edu/~namin/dump/threading/nanoaod_timing_4threads.txt",
        "http://uaf-7.t2.ucsd.edu/~namin/dump/threading/outreach_timing_1threads.txt",
        "http://uaf-7.t2.ucsd.edu/~namin/dump/threading/nanoaod_timing_1threads.txt",
    ),
)

In [3]:

def plot(df):
    """Show a Gantt chart of the timing rows in ``df``.

    ``df`` needs ``Start`` and ``Finish`` datetime columns plus the columns
    ``ff.create_gantt`` works with (bars are coloured by ``Resource`` and
    tasks are grouped).  The chart title reports the span between the
    earliest ``Start`` and the latest ``Finish`` in seconds.

    The figure is displayed immediately; the value returned is whatever the
    figure's ``show()`` call returns.
    """
    first_start = df["Start"].min()
    last_finish = df["Finish"].max()
    duration = (last_finish - first_start).total_seconds()

    figure = ff.create_gantt(
        df,
        index_col='Resource',
        show_colorbar=True,
        group_tasks=True,
        title=f"total walltime = {duration:.3f}s",
    )
    return figure.show()

In [4]:

# Gantt chart for the outreach file, 4-thread timing dump.
plot(df_outreach_4t)

In [5]:

# Gantt chart for the nanoaod file, 4-thread timing dump.
plot(df_nanoaod_4t)

In [6]:

# plot(df_outreach_1t)

In [7]:

# plot(df_nanoaod_1t)

In [8]:

# OUTREACH file
# Observation: there are many short baskets at the beginning of the file
# (which get assigned to thread 1), yet the number of events processed by
# each thread comes out roughly the same.
gb = (
    df_outreach_4t
    .query("Resource == 'interpretation'")  # interpretation rows only
    .drop_duplicates("Basket")              # one row per Basket value (first kept)
    .groupby("Task")                        # Task holds the thread label
)
print("Baskets per thread")
print(gb.size())
print("Nevents per thread")
gb["Nevents"].sum()

Baskets per thread
Task
 thread 1    466
 thread 2     19
 thread 3     19
 thread 4     19
dtype: int64
Nevents per thread

Out[8]:

Task
 thread 1    14790510
 thread 2    15612205
 thread 3    15612205
 thread 4    15525493
Name: Nevents, dtype: int64

In [9]:

# NANOAOD file
# Per-thread basket counts and event totals from the 4-thread dump.
gb = (
    df_nanoaod_4t
    .query("Resource == 'interpretation'")  # interpretation rows only
    .drop_duplicates("Basket")              # one row per Basket value (first kept)
    .groupby("Task")                        # Task holds the thread label
)
print("Baskets per thread")
print(gb.size())
print("Nevents per thread")
gb["Nevents"].sum()

Baskets per thread
Task
 thread 1    48
 thread 2    50
 thread 3    49
 thread 4    49
dtype: int64
Nevents per thread

Out[9]:

Task
 thread 1    193704
 thread 2    201775
 thread 3    201775
 thread 4    199412
Name: Nevents, dtype: int64

In [ ]: