from dask_jobqueue import PBSCluster
import dask
# Point the Dask dashboard link at the JupyterHub proxy so the dashboard
# is reachable from inside the NCAR JupyterHub session.
dask.config.set(
{
'distributed.dashboard.link': 'https://jupyterhub.hpc.ucar.edu/stable/user/{USER}/dav-compute/proxy/{port}/status'
}
)
<dask.config.set at 0x2b7e89a7a520>
# One dask worker per PBS job: 1 core / 1 process / 10 GB each, every job
# requesting a single V100 GPU on Casper.
# NOTE(review): `project`, `extra`, `job_extra`, and `env_extra` are the
# pre-0.8 dask-jobqueue spellings; newer releases rename these to
# `account`, `worker_extra_args`, `job_extra_directives`, and
# `job_script_prologue` — confirm against the installed version.
cluster = PBSCluster(
cores=1,
processes=1,
memory="10GB",
queue="casper",
project="NTDD0005",  # PBS account/project code
walltime="00:30:00",
resource_spec="select=1:ncpus=1:mem=10GB:ngpus=1",  # 1 GPU per PBS job
extra=[
'--resources GPU=1'
], # tag dask-workers: # specify special hardware availability that the scheduler is not aware of
job_extra=['-l gpu_type=v100'],  # pin jobs to V100 nodes
env_extra=['module load cuda/11.0.3'], # ensure cuda is loaded
)
cluster  # display the cluster widget
VBox(children=(HTML(value='<h2>PBSCluster</h2>'), HBox(children=(HTML(value='\n<div>\n <style scoped>\n .d…
print(cluster.job_script())  # inspect the PBS submission script dask will qsub
#!/usr/bin/env bash #PBS -N dask-worker #PBS -q casper #PBS -A NTDD0005 #PBS -l select=1:ncpus=1:mem=10GB:ngpus=1 #PBS -l walltime=00:30:00 #PBS -e /glade/scratch/abanihi/ #PBS -o /glade/scratch/abanihi/ #PBS -l gpu_type=v100 module load cuda/11.0.3 /glade/work/abanihi/opt/miniconda/envs/dask-gpu/bin/python -m distributed.cli.dask_worker tcp://10.12.205.17:41700 --nthreads 1 --memory-limit 9.31GiB --name dummy-name --nanny --death-timeout 60 --local-directory /glade/scratch/abanihi --resources GPU=1 --interface ib0 --protocol tcp://
!qstat -u abanihi
Req'd Req'd Elap Job ID Username Queue Jobname SessID NDS TSK Memory Time S Time --------------- -------- -------- ---------- ------ --- --- ------ ----- - ----- 240546.casper-* abanihi jhublog* STDIN 217212 1 1 4gb 720:0 R 226:0 251037.casper-* abanihi jhublog* STDIN 238092 1 1 4gb 720:0 R 149:3 282298.casper-* abanihi tdd STDIN 10906 1 1 10gb 00:30 R 00:01 282301.casper-* abanihi tdd dask-work* 31679 1 1 10gb 00:30 R 00:00 282302.casper-* abanihi tdd dask-work* -- 1 1 10gb 00:30 R --
cluster.scale(2)  # request 2 workers (2 jobs here, since processes=1)
cluster.scale(jobs=2)  # equivalent request, expressed as a job count
# Hand control to the adaptive scaler (supersedes the manual scale calls).
cluster.adapt(minimum=2, maximum=4)
<distributed.deploy.adaptive.Adaptive at 0x2b7e8985eac0>
cluster.adapt(minimum_jobs=2, maximum_jobs=4)  # same bounds, stated in jobs
<distributed.deploy.adaptive.Adaptive at 0x2b7eacd3f400>
from distributed import Client
client = Client(cluster)  # attach a client so computations run on the PBS workers
def get_nvidia_smi_info():
    """Run ``nvidia-smi`` on the local host and return its output as text."""
    # Imported inside the function so the closure ships cleanly to dask
    # workers via client.run without relying on worker-side module state.
    import subprocess
    raw = subprocess.check_output('nvidia-smi')
    return raw.strip().decode('utf-8')
def nvidia_smi(on='workers'):
    """Print ``nvidia-smi`` output gathered from the cluster.

    Parameters
    ----------
    on : str, optional
        Where to run the query: ``'workers'`` (default) runs it on every
        dask worker; ``'scheduler'`` runs it on the scheduler process.

    Raises
    ------
    ValueError
        If ``on`` is neither ``'workers'`` nor ``'scheduler'``.  The
        original version silently did nothing for such values.
    """
    if on == 'workers':
        # client.run returns {worker_address: result} for every worker.
        x = client.run(get_nvidia_smi_info)
        print(" ***** NVIDIA-SMI info on Workers *****")
        for key, value in x.items():
            print("*" * 80)
            print(key)
            print(value, end="\n\n")
    elif on == 'scheduler':
        print("***** NVIDIA-SMI info on Scheduler *****")
        print(client.run_on_scheduler(get_nvidia_smi_info))
    else:
        raise ValueError(f"on must be 'workers' or 'scheduler', got {on!r}")
nvidia_smi()  # GPUs visible on each worker (idle so far: no processes listed)
***** NVIDIA-SMI info on Workers ***** ******************************************************************************** tcp://10.12.205.38:40174 Wed May 19 22:18:08 2021 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla V100-SXM2... On | 00000000:B2:00.0 Off | 0 | | N/A 31C P0 40W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+ ******************************************************************************** tcp://10.12.205.38:41472 Wed May 19 22:18:08 2021 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla V100-SXM2... 
On | 00000000:1B:00.0 Off | 0 | | N/A 31C P0 40W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+
nvidia_smi(on='scheduler')  # GPU visible from the scheduler node
***** NVIDIA-SMI info on Scheduler ***** Wed May 19 22:18:09 2021 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla V100-SXM2... On | 00000000:B3:00.0 Off | 0 | | N/A 28C P0 40W / 300W | 0MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | No running processes found | +-----------------------------------------------------------------------------+
import cupy
import dask.array as da
# generate chunked dask arrays of many cupy random arrays
rs = da.random.RandomState(RandomState=cupy.random.RandomState) # <-- we specify cupy here
# ~160 GB logical float32 array (200000*200000*4 bytes), chunked so each
# block is small enough to live on a single GPU.
x = rs.normal(10, 1, size=(200000, 200000), chunks=(10000, 4000), dtype=cupy.float32)
x  # display only -- the array is lazy, nothing has been computed yet
|
# Elementwise add, subsample every other row/column, then per-column std.
y = (x + 1)[::2, ::2].std(axis=0)
y = y.persist()  # start computing on the workers; keep results distributed
y  # display -- chunks are being (or have been) materialized in the background
|
%%time
result = y.compute()  # fast: pulls the already-persisted result to the client
CPU times: user 30 ms, sys: 2 ms, total: 32 ms Wall time: 36.7 ms
result  # the computed values, returned to the client process
array([0.9968655 , 0.99782085, 1.0016662 , ..., 1.0015868 , 0.9996461 , 0.9991475 ], dtype=float32)
type(result)  # confirm the result came back as a cupy ndarray, not numpy
cupy._core.core.ndarray
nvidia_smi()  # workers now show GPU memory held by the persisted chunks
***** NVIDIA-SMI info on Workers ***** ******************************************************************************** tcp://10.12.205.38:40174 Wed May 19 22:23:17 2021 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla V100-SXM2... On | 00000000:B2:00.0 Off | 0 | | N/A 32C P0 52W / 300W | 926MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | 0 N/A N/A 31956 C .../envs/dask-gpu/bin/python 923MiB | +-----------------------------------------------------------------------------+ ******************************************************************************** tcp://10.12.205.38:41472 Wed May 19 22:23:17 2021 +-----------------------------------------------------------------------------+ | NVIDIA-SMI 450.51.06 Driver Version: 450.51.06 CUDA Version: 11.0 | |-------------------------------+----------------------+----------------------+ | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | | | | MIG M. | |===============================+======================+======================| | 0 Tesla V100-SXM2... 
On | 00000000:1B:00.0 Off | 0 | | N/A 33C P0 54W / 300W | 926MiB / 32510MiB | 0% Default | | | | N/A | +-------------------------------+----------------------+----------------------+ +-----------------------------------------------------------------------------+ | Processes: | | GPU GI CI PID Type Process name GPU Memory | | ID ID Usage | |=============================================================================| | 0 N/A N/A 31847 C .../envs/dask-gpu/bin/python 923MiB | +-----------------------------------------------------------------------------+