#!/usr/bin/env python
# coding: utf-8
# In[1]:

import matplotlib.pyplot as plt
import seaborn
import pandas as pd
import numpy as np
import math
import sh

IOSTAT_COLUMNS = ['r/s', 'w/s', 'kr/s', 'kw/s', 'wait', 'actv',
                  'wsvc_t', 'asvc_t', '%w', '%b', 'device']

TEST_CONFIG = 'max-rate-submit'
DISK_CONFIG = 'hdd'
NJOBS = [1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
NDISKS = [1, 2, 4, 8]
DISKS = ['c1t1d0', 'c1t2d0', 'c1t3d0', 'c2t0d0', 'c2t1d0', 'c2t2d0',
         'c4t0d0', 'c4t1d0', 'c4t2d0', 'c3t0d0', 'c3t1d0']

seaborn.set()
seaborn.set_context('talk')

jq = sh.jq.bake('-M', '-r')


def fio_iops_series(directory):
    # Aggregate write IOPs reported by fio for each thread count
    # (jobs[0] holds the group_reporting totals).
    iops = []
    for njobs in NJOBS:
        data = jq('.jobs[0].write.iops',
                  '{:s}/fio-{:d}-jobs/fio.json'.format(directory, njobs))
        iops.append(float(data.strip()))
    return pd.Series(iops, NJOBS)


def fio_latency_series(directory):
    # Mean write latency (microseconds) reported by fio for each thread count.
    latency = []
    for njobs in NJOBS:
        data = jq('.jobs[0].write.lat.mean',
                  '{:s}/fio-{:d}-jobs/fio.json'.format(directory, njobs))
        latency.append(float(data.strip()))
    return pd.Series(latency, NJOBS)


def iostat_column_series(column, directory, ndisks):
    # For each thread count: average the given iostat column per disk, then
    # average those per-disk means across all disks in the pool.
    jobavgs = []
    for njobs in NJOBS:
        diskavgs = pd.Series(dtype=float)
        for disk in DISKS[0:ndisks]:
            data = pd.read_csv('{:s}/fio-{:d}-jobs/iostat-{:s}.txt'.format(directory, njobs, disk),
                               delim_whitespace=True, header=None,
                               names=IOSTAT_COLUMNS, skiprows=5)
            diskavgs[disk] = data[column].mean()
        jobavgs.append(diskavgs.mean())
    return pd.Series(jobavgs, NJOBS)


def get_pctchange_dataframe(project, master):
    # Percentage change of "project" relative to "master" for each column pair.
    diff = pd.DataFrame()
    for plabel, mlabel in zip(project, master):
        new = project[plabel]
        old = master[mlabel]
        diff[plabel.replace('project - ', '')] = 100 * ((new - old) / old)
    return diff


def plot_iops_dataframe(df):
    df.plot(figsize=(16, 9), style='-o')
    plt.title('fio -- write iops vs. fio threads')
    plt.xlabel('number of fio threads issuing writes')
    plt.ylabel('write iops reported by fio')
    plt.loglog(basey=2)
    plt.xticks(df.index, df.index)
    plt.show()


def plot_latency_dataframe(df):
    df.plot(figsize=(16, 9), style='-o')
    plt.title('fio -- average write latency vs. fio threads')
    plt.xlabel('number of fio threads issuing writes')
    plt.ylabel('average write latency reported by fio (microseconds)')
    plt.loglog(basey=2)
    plt.xticks(df.index, df.index)
    plt.show()


def plot_iostat_column_dataframe(df, column):
    df.plot(figsize=(16, 9), style='-o')
    plt.title('iostat -- {:s} vs. fio threads'.format(column))
    plt.xlabel('number of fio threads issuing writes')
    plt.xscale('log')
    plt.xticks(df.index, df.index)
    plt.show()
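
# The helper functions above expect a directory layout of the form
# `<results>/fio-<N>-jobs/`, containing a `fio.json` file (i.e. `fio` run with
# `--output-format=json`) plus one `iostat-<disk>.txt` file per disk in the pool.
# The notebook itself doesn't record how that data was collected; the following
# is only a rough sketch of how such a layout could be produced. The job file
# name (`zfs-workload.fio`) and output paths are hypothetical.
#
# ```
# import os
# import subprocess
#
# def run_one_config(outdir, njobs, disks, jobfile='zfs-workload.fio'):
#     # Hypothetical sketch: run fio with JSON output for one thread count,
#     # while capturing per-disk "iostat -xn <disk> 1" samples for the
#     # duration of the run.
#     d = '{:s}/fio-{:d}-jobs'.format(outdir, njobs)
#     os.makedirs(d)
#     samplers = []
#     for disk in disks:
#         f = open('{:s}/iostat-{:s}.txt'.format(d, disk), 'w')
#         p = subprocess.Popen(['iostat', '-xn', disk, '1'], stdout=f)
#         samplers.append((p, f))
#     try:
#         subprocess.check_call(['fio', '--output-format=json',
#                                '--output={:s}/fio.json'.format(d),
#                                '--numjobs={:d}'.format(njobs), jobfile])
#     finally:
#         for p, f in samplers:
#             p.terminate()
#             f.close()
# ```
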
# In[2]:

master_latency = pd.DataFrame()
master_iops = pd.DataFrame()
master_busy = pd.DataFrame()
for i in NDISKS:
    directory = 'openzfs-447-perf/{:s}/master/{:d}-{:s}'.format(TEST_CONFIG, i, DISK_CONFIG)
    label = 'master - {:d} {:s}'.format(i, DISK_CONFIG)
    master_latency[label] = fio_latency_series(directory)
    master_iops[label] = fio_iops_series(directory)
    master_busy[label] = iostat_column_series('%b', directory, i)

project_latency = pd.DataFrame()
project_iops = pd.DataFrame()
project_busy = pd.DataFrame()
for i in NDISKS:
    directory = 'openzfs-447-perf/{:s}/project/{:d}-{:s}'.format(TEST_CONFIG, i, DISK_CONFIG)
    label = 'project - {:d} {:s}'.format(i, DISK_CONFIG)
    project_latency[label] = fio_latency_series(directory)
    project_iops[label] = fio_iops_series(directory)
    project_busy[label] = iostat_column_series('%b', directory, i)

pctchange_latency = get_pctchange_dataframe(project_latency, master_latency)
pctchange_iops = get_pctchange_dataframe(project_iops, master_iops)


# # OpenZFS #447 Performance Results - Max Rate Submit on HDDs
#
# ### Workload Details
#
# This workload used `fio` to drive synchronous writes while varying the number of `fio` threads. Each `fio` thread issued writes to a unique file, using sequential file offsets, `pwrite`, `O_SYNC`, a blocksize of `8k`, and a queue depth of 1 (i.e. each thread performing a single write at a time). Additionally, each thread issued its writes as quickly as possible; i.e. immediately after a thread's write completed, it issued the next write. Here's the `fio` configuration used to achieve this:
#
# ```
# [global]
# group_reporting
# clocksource=cpu
# ioengine=psync
# fallocate=none
# blocksize=8k
# runtime=60
# time_based
# iodepth=1
# rw=write
# thread=0
# direct=0
# sync=1
#
# [zfs-workload]
# ```
#
# The command line flag `--numjobs` was used to vary the number of threads for each invocation, ranging from a single thread to 1024 threads.
#
# ### ZFS Pool and Dataset Configuration
#
# The above `fio` workload was run on zpools with varying numbers of direct attached disks; configurations of 1 disk, 2 disks, 4 disks, and 8 disks were used. All configuration options were kept default at the zpool level (i.e. no `-o` options were passed to `zpool create`).
#
# For all tests, a single ZFS dataset was used to store all the `fio` files for all thread counts. The configuration options used for this dataset were the following: `recsize=8k`, `compress=lz4`, `checksum=edonr`, `redundant_metadata=most`. These were all chosen to match the options used by our Delphix Engine, except `recsize`, which was set to avoid the read-modify-write penalty since `fio` was issuing `8k` writes. (A sketch of the corresponding `zpool`/`zfs` commands follows the hardware configuration below.)
#
# ### System Hardware Configuration
#
# - VM running on VMWare ESXi 6.0.0
# - 8 vCPUs
# - 128 GB of RAM
# - Traditional Magnetic Disks
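
# For reference, below is a sketch of how a pool and dataset matching the
# description above could be created. The pool name (`testpool`), dataset name,
# and device list are illustrative; the actual names used for these tests are
# not recorded in this notebook.
#
# ```
# import subprocess
#
# def create_test_pool(disks, pool='testpool', dataset='testpool/fio'):
#     # Hypothetical sketch: default zpool-level options, dataset options
#     # matching those described above.
#     subprocess.check_call(['zpool', 'create', pool] + list(disks))
#     subprocess.check_call(['zfs', 'create',
#                            '-o', 'recsize=8k',
#                            '-o', 'compress=lz4',
#                            '-o', 'checksum=edonr',
#                            '-o', 'redundant_metadata=most',
#                            dataset])
#
# # e.g. the 2-disk configuration, using the DISKS list defined earlier:
# # create_test_pool(DISKS[0:2])
# ```
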
# ## IOPs as reported by `fio` vs. number of `fio` threads
#
# Below are graphs of the write IOPs reported by `fio` (using the `write.iops` metric), which accounts for all `fio` threads in the given run; i.e. it's the aggregate value for all `fio` threads rather than the value of each individual `fio` thread. Additionally, each line corresponds to a different zpool configuration, each configuration having a different number of disks in the pool.

# ### IOPs as reported by `fio` vs. number of `fio` threads - master branch

# In[3]:

plot_iops_dataframe(master_iops)


# In[4]:

master_iops


# ### IOPs as reported by `fio` vs. number of `fio` threads - project branch

# In[5]:

plot_iops_dataframe(project_iops)


# In[6]:

project_iops


# ## % change in write IOPs vs. number of `fio` threads - master vs. project
#
# The following graph shows the percentage change in the IOPs reported by `fio` between the "master" and "project" test runs. A positive value reflects an increase in the IOPs reported by `fio` when comparing the results of the "project" branch to the "master" branch; i.e. positive is better. Additionally, a 100% increase would reflect a doubling of the IOPs, and a 50% decrease would equate to halving the IOPs.

# In[7]:

pctchange_iops.plot(figsize=(16, 9), style='-o')
plt.title('fio -- % change in write iops vs. number of fio threads')
plt.xlabel('number of fio threads issuing writes')
plt.ylabel('% change in write iops reported by fio')
plt.ylim(-50, 150)
plt.xscale('log')
plt.xticks(pctchange_iops.index, pctchange_iops.index)
plt.axhline(0, ls='-.')
plt.show()


# In[8]:

pctchange_iops


# ## average write latency as reported by `fio` vs. number of `fio` threads
#
# Below are graphs of the average write latency (in microseconds) reported by `fio` (using the `write.lat.mean` metric), for all `fio` threads in the test run. Just like the IOPs graphs above, each line represents a different zpool configuration, and there's data for the "master" branch as well as the "project" branch.

# ### average write latency as reported by `fio` vs. number of `fio` threads - master branch

# In[9]:

plot_latency_dataframe(master_latency)


# In[10]:

master_latency


# ### average write latency as reported by `fio` vs. number of `fio` threads - project branch

# In[11]:

plot_latency_dataframe(project_latency)


# In[12]:

project_latency


# ## % change in average write latency vs. number of `fio` threads - master vs. project
#
# The following graph shows the percentage change in the average write latency reported by `fio` between the "master" branch and "project" branch test runs. A positive value reflects an increase in the average write latency reported by `fio` when comparing the "project" branch to the "master" branch. Thus, unlike the IOPs numbers above, a negative value here is better.

# In[13]:

pctchange_latency.plot(figsize=(16, 9), style='-o')
plt.title('fio -- % change in average write latency vs. number of fio threads')
plt.xlabel('number of fio threads issuing writes')
plt.ylabel('% change in average write latency reported by fio')
plt.ylim(-150, 50)
plt.xscale('log')
plt.xticks(pctchange_latency.index, pctchange_latency.index)
plt.axhline(0, ls='-.')
plt.show()


# In[14]:

pctchange_latency


# ## `%b` averaged across all disks in zpool vs. `fio` threads
#
# Below are graphs of the `%b` column from `iostat` for all disks in the zpool.
#
# The values shown were generated using 1 second samples (i.e. `iostat -xn 1`) for each disk in the zpool, for the entire runtime of the test. These samples were first averaged to produce a single `%b` value for each disk in the zpool; then the per-disk values were averaged across all disks in the zpool, yielding a single `%b` value representing all disks in the zpool.
#
# This provides an approximation of how utilized the disks in the zpool were during the runtime of the `fio` workload.
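
# To make that aggregation concrete, here is a tiny, self-contained example
# (with made-up numbers) of the same two-stage averaging performed by
# `iostat_column_series`: the 1 second samples are averaged per disk first,
# and the per-disk means are then averaged across the pool.
#
# ```
# import pandas as pd
#
# # Made-up 1 second %b samples for a hypothetical 2-disk pool.
# samples = {
#     'c1t1d0': pd.Series([80, 90, 100, 90]),
#     'c1t2d0': pd.Series([60, 70, 80, 70]),
# }
#
# # Stage 1: average the samples for each disk individually.
# per_disk = pd.Series({disk: s.mean() for disk, s in samples.items()})  # 90.0, 70.0
#
# # Stage 2: average the per-disk means to get one %b value for the pool.
# pool_busy = per_disk.mean()  # 80.0
# ```
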
# ### `%b` averaged across all disks in zpool vs. `fio` threads - master branch

# In[15]:

plot_iostat_column_dataframe(master_busy, '%b')


# In[16]:

master_busy


# ### `%b` averaged across all disks in zpool vs. `fio` threads - project branch

# In[17]:

plot_iostat_column_dataframe(project_busy, '%b')


# In[18]:

project_busy


# ## on-cpu, system wide, kernel flame graphs
#
# The visualizations below are on-cpu flame graphs of the entire system, using kernel level stacks. Unlike the line graphs above, there isn't a straightforward way to condense all of the test runs into a single flame graph visualization. Thus, instead of showing a unique graph for each configuration, two configurations were chosen in the hope that they provide a representative sample of the whole population. The two chosen configurations are:
#
# - 1 disk zpool, with 1024 `fio` threads
# - 8 disk zpool, with 1024 `fio` threads
#
# Both configurations use the largest number of `fio` threads tested; one has the largest number of disks, and the other has the fewest.

# ### on-cpu, system wide, kernel flame graph - 1 disk - 1024 `fio` threads - master branch
#
# ![image](max-rate-submit-master-1-hdd-fio-1024-jobs-dtrace-profile.svg)

# ### on-cpu, system wide, kernel flame graph - 1 disk - 1024 `fio` threads - project branch
#
# ![image](max-rate-submit-project-1-hdd-fio-1024-jobs-dtrace-profile.svg)

# ### on-cpu, system wide, kernel flame graph - 8 disks - 1024 `fio` threads - master branch
#
# ![image](max-rate-submit-master-8-hdd-fio-1024-jobs-dtrace-profile.svg)

# ### on-cpu, system wide, kernel flame graph - 8 disks - 1024 `fio` threads - project branch
#
# ![image](max-rate-submit-project-8-hdd-fio-1024-jobs-dtrace-profile.svg)
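
# ### generating the flame graphs (illustrative sketch)
#
# The SVG filenames above indicate the flame graphs were built from DTrace
# profile data. The exact commands used are not recorded in this notebook; the
# sketch below shows a typical recipe for an on-cpu, kernel-stack flame graph
# using DTrace and the FlameGraph scripts (`stackcollapse.pl`, `flamegraph.pl`).
# The output file names and the 60 second / 997 Hz sampling parameters are
# assumptions.
#
# ```
# import subprocess
#
# # Sample kernel stacks system-wide at 997 Hz for 60 seconds (run as root).
# dscript = 'profile-997 /arg0/ { @[stack()] = count(); } tick-60s { exit(0); }'
# with open('profile.stacks', 'w') as out:
#     subprocess.check_call(['dtrace', '-x', 'stackframes=100', '-n', dscript],
#                           stdout=out)
#
# # Fold the stacks and render the SVG (FlameGraph scripts assumed in PATH).
# with open('profile.folded', 'w') as folded:
#     subprocess.check_call(['stackcollapse.pl', 'profile.stacks'], stdout=folded)
# with open('flamegraph.svg', 'w') as svg:
#     subprocess.check_call(['flamegraph.pl', 'profile.folded'], stdout=svg)
# ```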