Convert compressed json to pandas dataframe and save it

  • Go through the compressed file once
  • Pull out all feilds of interest (including, runtime, node_count, walltime_req, etc.)
  • Save the resulting dataframe to jobs_raw.csv file for further processing and analysis

Source of the job data: https://cscdata.nrel.gov/#/datasets/d332818f-ef57-4189-ba1d-beea291886eb

In [12]:
import json
import bz2
import pandas as pd
from datetime import datetime
import sys

show_progress = True
jsonfilename = '/data/peregrine/10k.anon.json.bz2'

def get_runtime(job):
    start = datetime.strptime(job['job']['start_time'].replace('T',' ').replace('Z',''), '%Y-%m-%d %H:%M:%S.%f') 
    end = datetime.strptime(job['job']['end_time'].replace('T',' ').replace('Z',''), '%Y-%m-%d %H:%M:%S.%f') 
    return (end-start).total_seconds()

jobs = []
with bz2.BZ2File(jsonfilename, 'r') as f:
    for idx, line in enumerate(f):
        obj = json.loads(line)
        
        jobs.append({"Order": idx, "NodeCount": obj['job']['nodes_used'], "Runtime": get_runtime(obj),
                    "wallclock_req": obj['job']['wallclock_req'], "wallclock_used": obj['job']['wallclock_used'],
                    "queue": obj['job']['queue'], "app_name": obj['job']['app_name'],
                    "exit_code": obj['job']['exit_code'], "submit_time": obj['job']['submit_time']})
        
        if show_progress:
            if idx % 50 == 0:
                sys.stdout.write("%d..." % idx)
                sys.stdout.flush()
            if idx % (50*10) == 0 and idx > 0:
                print " "

jobs = pd.DataFrame(jobs).set_index("Order")
jobs.to_csv('jobs_raw.csv')
print "Saved %d lines to jobs_raw.csv" % (len(jobs))
0...50...100...150...200...250...300...350...400...450...500... 
550...600...650...700...750...800...850...900...950...1000... 
1050...1100...1150...1200...1250...1300...1350...1400...1450...1500... 
1550...1600...1650...1700...1750...1800...1850...1900...1950...2000... 
2050...2100...2150...2200...2250...2300...2350...2400...2450...2500... 
2550...2600...2650...2700...2750...2800...2850...2900...2950...3000... 
3050...3100...3150...3200...3250...3300...3350...3400...3450...3500... 
3550...3600...3650...3700...3750...3800...3850...3900...3950...4000... 
4050...4100...4150...4200...4250...4300...4350...4400...4450...4500... 
4550...4600...4650...4700...4750...4800...4850...4900...4950...5000... 
5050...5100...5150...5200...5250...5300...5350...5400...5450...5500... 
5550...5600...5650...5700...5750...5800...5850...5900...5950...6000... 
6050...6100...6150...6200...6250...6300...6350...6400...6450...6500... 
6550...6600...6650...6700...6750...6800...6850...6900...6950...7000... 
7050...7100...7150...7200...7250...7300...7350...7400...7450...7500... 
7550...7600...7650...7700...7750...7800...7850...7900...7950...8000... 
8050...8100...8150...8200...8250...8300...8350...8400...8450...8500... 
8550...8600...8650...8700...8750...8800...8850...8900...8950...9000... 
9050...9100...9150...9200...9250...9300...9350...9400...9450...9500... 
9550...9600...9650...9700...9750...9800...9850...9900...9950...Saved 9996 lines to summary.csv

Make plots and print stats information

  • Load data from jobs_raw.csv, no data manipulation
In [90]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

jobs = pd.read_csv('jobs_raw.csv')
print "Total number of jobs: %d" % len(jobs)

fig, ax1 = plt.subplots(1,1, figsize=(8,3))
jobs['NodeCount'].plot.hist(ax=ax1, logy=True)
ax1.set_title("Histogram of Node Counts for Peregrine Jobs")
ax1.legend().set_visible(False)
fig.savefig('jobs_node_count_hist.png', dpi=200)
print jobs.info()

fig, ax1 = plt.subplots(1,1, figsize=(8,3))
# sns.xkcd_rgb["denim blue"] - one of the special colors defined in seaborn; 
# more at: http://seaborn.pydata.org/tutorial/color_palettes.html
ax1.scatter(range(len(jobs)), jobs["Runtime"], c=sns.xkcd_rgb["denim blue"], lw=0)
ax1.set_title("Job Runtimes")
ax1.set_ylabel("Runtime, s")
ax1.set_xlabel("Job Order")
fig.tight_layout()
fig.savefig('jobs_runtime_scatter.png', dpi=200)

fig, ax2 = plt.subplots(1,1, figsize=(8,3))
ax2.scatter(range(len(jobs)), jobs["NodeCount"], c=sns.xkcd_rgb["denim blue"], lw=0)
ax2.set_title("Job Node Counts")
ax2.set_ylabel("# of Nodes")
ax2.set_xlabel("Job Order")
fig.tight_layout()
fig.savefig('jobs_node_count_scatter.png', dpi=200)

nc = [5, 10, 20, 30, 40, 50, 100]
for c in nc:
    print "Number of jobs that require <=%d compute nodes: %d" % (c, len(jobs[jobs['NodeCount'] <= c ]))
nh = range(1,25)
for c in nh:
    print "Number of jobs that are shorter then %d hours: %d" % (c, len(jobs[jobs['Runtime'] <= d*3600 ]))

subset = jobs[ (jobs['Runtime'] <= 24*3600) & (jobs['NodeCount'] <= 20) & (jobs['Runtime'] > 0.001) & (jobs['exit_code'] == 0) ]
print "Number of jobs that are shorter then 24 hours AND use " + \
"<=20 compute nodes (with non-zero runtime and zero exit code): %d" % (len(subset))
Total number of jobs: 9996
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9996 entries, 0 to 9995
Data columns (total 9 columns):
Order             9996 non-null int64
NodeCount         9996 non-null int64
Runtime           9996 non-null float64
app_name          9978 non-null object
exit_code         9996 non-null int64
queue             9996 non-null object
submit_time       9996 non-null object
wallclock_req     9978 non-null object
wallclock_used    9996 non-null object
dtypes: float64(1), int64(3), object(5)
memory usage: 702.9+ KB
None
Number of jobs that require <=5 compute nodes: 9649
Number of jobs that require <=10 compute nodes: 9879
Number of jobs that require <=20 compute nodes: 9943
Number of jobs that require <=30 compute nodes: 9959
Number of jobs that require <=40 compute nodes: 9982
Number of jobs that require <=50 compute nodes: 9986
Number of jobs that require <=100 compute nodes: 9990
Number of jobs that are shorter then 1 hours: 9304
Number of jobs that are shorter then 2 hours: 9304
Number of jobs that are shorter then 3 hours: 9304
Number of jobs that are shorter then 4 hours: 9304
Number of jobs that are shorter then 5 hours: 9304
Number of jobs that are shorter then 6 hours: 9304
Number of jobs that are shorter then 7 hours: 9304
Number of jobs that are shorter then 8 hours: 9304
Number of jobs that are shorter then 9 hours: 9304
Number of jobs that are shorter then 10 hours: 9304
Number of jobs that are shorter then 11 hours: 9304
Number of jobs that are shorter then 12 hours: 9304
Number of jobs that are shorter then 13 hours: 9304
Number of jobs that are shorter then 14 hours: 9304
Number of jobs that are shorter then 15 hours: 9304
Number of jobs that are shorter then 16 hours: 9304
Number of jobs that are shorter then 17 hours: 9304
Number of jobs that are shorter then 18 hours: 9304
Number of jobs that are shorter then 19 hours: 9304
Number of jobs that are shorter then 20 hours: 9304
Number of jobs that are shorter then 21 hours: 9304
Number of jobs that are shorter then 22 hours: 9304
Number of jobs that are shorter then 23 hours: 9304
Number of jobs that are shorter then 24 hours: 9304
Number of jobs that are shorter then 24 hours AND use <=20 compute nodes (with non-zero runtime and zero exit code): 7286

Add derived columns and plot walltime/runtime ratio

  • Add wallclock_req_seconds based on wallclock_req (dictinary)
  • Add wallclock_used_seconds based on wallclock_used (dictinary)
  • Add wallclock_to_runtime_ratio = wallclock_req_seconds / runtime
  • If conversions to seconds can be done (missing data), all these columns get the value of -1 (will be excluded)
  • Jobs with Runtime == 0 are excluded
In [78]:
import ast

jobs = pd.read_csv('jobs_raw.csv')

jobs = jobs[ (jobs['Runtime'] <= 24*3600) & (jobs['NodeCount'] <= 20) & (jobs['Runtime'] > 0.001) & (jobs['exit_code'] == 0) ]

print "Total number of jobs: %d" % len(jobs)

# Exlude jobs with zero runtimes 
jobs = jobs[jobs["Runtime"] > 0.0]

def duration_dict2seconds(dd):
    """
    Convert dictionary where keys are units (seconds, minutes, etc.) 
    and values are number of those units to total seconds
    """
    total = 0.0
    for unit, val in dd:
        if unit == "seconds":
            total += val
        elif unit == "minutes":
            total += 60*val
        elif unit == "hours":
            total += 3600*val
        elif unit == "days":
            total += 24*3600*val
        else:
            print "ERROR: Unsupported unit %s" % unit
            return -1
    # print "Converted: %s -> %f" % (dd, total)
    return total

jobs['wallclock_used_seconds'] = 0.0
jobs['wallclock_req_seconds'] = 0.0
jobs['wallclock_to_runtime_ratio'] = 0.0
for idx, j in jobs.iterrows():
    try:
        wallclock = duration_dict2seconds(ast.literal_eval(j["wallclock_req"]).items())
        jobs.set_value(idx, 'wallclock_req_seconds', wallclock)
        jobs.set_value(idx, 'wallclock_to_runtime_ratio', wallclock/j["Runtime"])
        
        runtime = duration_dict2seconds(ast.literal_eval(j["wallclock_used"]).items())
        jobs.set_value(idx, 'wallclock_used_seconds', runtime)
    except ValueError:
        #print "Skipping job with order: %d" % idx
        jobs.set_value(idx, 'wallclock_req_seconds', -1)
        jobs.set_value(idx, 'wallclock_to_runtime_ratio', -1)
        jobs.set_value(idx, 'wallclock_used_seconds', -1)
        continue

# Exclude jobs for which the conversions above are successful
converted_jobs = jobs[jobs['wallclock_req_seconds'] != -1]

print "Total number of converted jobs: %d" % len(converted_jobs)

fig, ax = plt.subplots(1,1, figsize=(6,80))
converted_jobs['wallclock_to_runtime_ratio'].hist(by=selected_jobs['app_name'], layout=(26,1), ax=ax);
#fig.tight_layout()
fig.savefig('jobs_ratio.png', dpi=200)
Total number of jobs: 7286
Total number of converted jobs: 7276

Exclude jobs for which wallclock_used_seconds differs from runtime (strange anomaly)

In [79]:
print "Number of jobs before: ", len(converted_jobs)

excluded = converted_jobs[converted_jobs["wallclock_used_seconds"] != converted_jobs["Runtime"]]
print "Jobs that will be excluded:"
print excluded

selected_jobs = converted_jobs[converted_jobs["wallclock_used_seconds"] == converted_jobs["Runtime"]]
print "Number of jobs before: ", len(selected_jobs)
Number of jobs before:  7276
Jobs that will be excluded:
      Order  NodeCount  Runtime app_name  exit_code  queue  \
4107   4107          3   4891.0     vasp          0  batch   

                   submit_time wallclock_req  \
4107  2014-10-31T18:47:46.000Z  {u'days': 3}   

                                     wallclock_used  wallclock_used_seconds  \
4107  {u'hours': 2, u'seconds': 31, u'minutes': 21}                  8491.0   

      wallclock_req_seconds  wallclock_to_runtime_ratio  
4107               259200.0                   52.995297  
Number of jobs before:  7275

Look closer at jobs for the vasp application

In [81]:
vasp = selected_jobs[selected_jobs['app_name'] == 'vasp']
print vasp.describe()

fig, ax = plt.subplots()
vasp['wallclock_to_runtime_ratio'].hist(bins=10);
ax.set_yscale('log')
ax.set_title("Histogram of Requested Walltime/Runtime Ratio for vasp Jobs")
ax.set_ylabel("Count")
ax.set_xlabel("Requested Walltime/Runtime Ratio")
fig.tight_layout()
fig.savefig('jobs_vasp_ratio.png', dpi=200)
             Order    NodeCount       Runtime  exit_code  \
count  5330.000000  5330.000000   5330.000000     5330.0   
mean   5054.945591     1.445779   6649.665103        0.0   
std    2869.383678     1.156608  11986.190262        0.0   
min       0.000000     1.000000     33.000000        0.0   
25%    2606.500000     1.000000    421.250000        0.0   
50%    5081.500000     1.000000   1611.000000        0.0   
75%    7510.250000     1.000000   6548.750000        0.0   
max    9994.000000    16.000000  83930.000000        0.0   

       wallclock_used_seconds  wallclock_req_seconds  \
count             5330.000000            5330.000000   
mean              6649.665103           59032.767355   
std              11986.190262           58038.167478   
min                 33.000000              30.000000   
25%                421.250000           14400.000000   
50%               1611.000000           57600.000000   
75%               6548.750000           57600.000000   
max              83930.000000          648000.000000   

       wallclock_to_runtime_ratio  
count                 5330.000000  
mean                    78.559905  
std                    229.369375  
min                      0.015291  
25%                      6.540623  
50%                     22.635579  
75%                     69.230769  
max                   5082.352941  

Get the number of underestimated and overestimated jobs

  • Underestimated: requested walltime is less than the actual runtime
  • Overestimated: requested walltime is greater than the actual runtime
In [91]:
print "Total number of selected jobs: %d" % len(selected_jobs)

underestimated = selected_jobs[(selected_jobs["wallclock_to_runtime_ratio"] <= 1.0) ]
print "Stats for underestimated jobs:"
print underestimated.describe()

fig, ax = plt.subplots()
underestimated['wallclock_to_runtime_ratio'].hist(bins=10);
# ax.set_yscale('log')
ax.set_title("Histogram of Requested Walltime/Runtime Ratio for Underestimated Jobs")
ax.set_ylabel("Count")
ax.set_xlabel("Requested Walltime/Runtime Ratio")
fig.tight_layout()
fig.savefig('jobs_underestimated_ratio.png', dpi=200)

overestimated = selected_jobs[(selected_jobs["wallclock_to_runtime_ratio"] >= 1.0) ]
print "Stats for overestimated jobs:"
print overestimated.describe()

fig, ax = plt.subplots()
overestimated['wallclock_to_runtime_ratio'].hist(bins=10);
# ax.set_yscale('log')
ax.set_title("Histogram of Requested Walltime/Runtime Ratio for Overestimated Jobs")
ax.set_ylabel("Count")
ax.set_xlabel("Requested Walltime/Runtime Ratio")
fig.tight_layout()
fig.savefig('jobs_overestimated_ratio.png', dpi=200)
Total number of selected jobs: 7275
Stats for underestimated jobs:
             Order  NodeCount       Runtime  exit_code  \
count    93.000000  93.000000     93.000000       93.0   
mean   5054.322581   1.247312  10780.397849        0.0   
std    2702.836128   0.928485  18871.728553        0.0   
min     188.000000   1.000000    101.000000        0.0   
25%    2874.000000   1.000000   1229.000000        0.0   
50%    4981.000000   1.000000   1690.000000        0.0   
75%    7214.000000   1.000000   5432.000000        0.0   
max    9660.000000   8.000000  65557.000000        0.0   

       wallclock_used_seconds  wallclock_req_seconds  \
count               93.000000              93.000000   
mean             10780.397849            9707.817204   
std              18871.728553           18882.110594   
min                101.000000              30.000000   
25%               1229.000000             300.000000   
50%               1690.000000             300.000000   
75%               5432.000000            3600.000000   
max              65557.000000           64800.000000   

       wallclock_to_runtime_ratio  
count                   93.000000  
mean                     0.470041  
std                      0.381771  
min                      0.015291  
25%                      0.133749  
50%                      0.358423  
75%                      0.950370  
max                      0.998198  
Stats for overestimated jobs:
             Order    NodeCount       Runtime  exit_code  \
count  7182.000000  7182.000000   7182.000000     7182.0   
mean   5014.694375     1.421888   7071.657895        0.0   
std    2885.787682     1.196807  12694.589286        0.0   
min       0.000000     1.000000     33.000000        0.0   
25%    2521.500000     1.000000    433.250000        0.0   
50%    5024.000000     1.000000   1930.000000        0.0   
75%    7513.750000     1.000000   6841.250000        0.0   
max    9994.000000    16.000000  85945.000000        0.0   

       wallclock_used_seconds  wallclock_req_seconds  \
count             7182.000000            7182.000000   
mean              7071.657895           69807.486912   
std              12694.589286           90778.193333   
min                 33.000000             240.000000   
25%                433.250000           14400.000000   
50%               1930.000000           57600.000000   
75%               6841.250000           57600.000000   
max              85945.000000          864000.000000   

       wallclock_to_runtime_ratio  
count                 7182.000000  
mean                   166.689234  
std                    967.687692  
min                      1.005868  
25%                      5.428679  
50%                     19.673475  
75%                     67.759013  
max                  21600.000000  
In [84]:
app_breakdown = selected_jobs.app_name.value_counts()
print "Breakdown of jobs by application:"
print app_breakdown
fig, ax = plt.subplots()
app_breakdown.plot(kind='pie', ax=ax);
ax.set_title("Breakdown of jobs by application")
fig.tight_layout()
fig.savefig('jobs_app_breakdown.png', dpi=200)

queue_breakdown = selected_jobs.queue.value_counts()
print "Breakdown of jobs by queue:"
print queue_breakdown
fig, ax = plt.subplots()
queue_breakdown.plot(kind='pie', ax=ax);
ax.set_title("Breakdown of jobs by queue")
fig.tight_layout()
fig.savefig('jobs_queue_breakdown.png', dpi=200)
Breakdown of jobs by application:
vasp                5330
python               548
mono                 448
gaussian             341
fast                 113
gromacs               71
amber                 55
wrf                   51
matlab                49
mfix                  47
Unknown               43
ls-dyna               40
fluent                37
charmm                32
gamess                20
OpenFOAM              12
transport              9
WindPlantSolver        9
Quantum_Espresso       8
eplus                  4
nek                    3
s3d                    3
namd                   1
comsol                 1
Name: app_name, dtype: int64
Breakdown of jobs by queue:
batch     6047
short      922
debug      143
bigmem      76
long        44
phi         42
large        1
Name: queue, dtype: int64

Save selected jobs

In [85]:
selected_jobs.to_csv('jobs_selected.csv')
print selected_jobs.describe()
             Order    NodeCount       Runtime  exit_code  \
count  7275.000000  7275.000000   7275.000000     7275.0   
mean   5015.200962     1.419656   7119.068591        0.0   
std    2883.351141     1.193868  12797.274658        0.0   
min       0.000000     1.000000     33.000000        0.0   
25%    2525.000000     1.000000    439.500000        0.0   
50%    5022.000000     1.000000   1921.000000        0.0   
75%    7509.500000     1.000000   6840.500000        0.0   
max    9994.000000    16.000000  85945.000000        0.0   

       wallclock_used_seconds  wallclock_req_seconds  \
count             7275.000000            7275.000000   
mean              7119.068591           69039.202474   
std              12797.274658           90473.311734   
min                 33.000000              30.000000   
25%                439.500000           14400.000000   
50%               1921.000000           57600.000000   
75%               6840.500000           57600.000000   
max              85945.000000          864000.000000   

       wallclock_to_runtime_ratio  
count                 7275.000000  
mean                   164.564370  
std                    961.663054  
min                      0.015291  
25%                      5.174273  
50%                     19.057702  
75%                     66.666667  
max                  21600.000000  

Estimate total compute time

In [92]:
total_node_hours = 0.0
for idx, j in selected_jobs.iterrows():
    total_node_hours += j["Runtime"] * j["NodeCount"]
total_node_hours = total_node_hours/3600.0
print "Total node hours: %f" % total_node_hours

print "Estimated (lower bound) time (in hours) to completion on %d nodes: %f" % (20, total_node_hours/20.0)
Total node hours: 26838.580000
Estimated (lower bound) time (in hours) to completion on 20 nodes: 1341.929000
In [ ]: