#!/usr/bin/env python
# coding: utf-8

# # Accessing File with GDS

# ## Prerequisite
#
# [NVIDIA® GPUDirect® Storage (GDS)](https://developer.nvidia.com/gpudirect-storage) needs to be installed to use the GDS feature (since CUDA Toolkit 11.4, the GDS client package has been available).
#
# The file access APIs still work without the GDS host (kernel) packages, but you won't see the speed-up.
# Please follow the [release note](https://docs.nvidia.com/gpudirect-storage/release-notes/index.html) or the [installation guide](https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/index.html#abstract) to install GDS on your host system.
#
# - Note: During the GDS prerequisite installation (step 3 of [the installation guide](https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/index.html#install-prereqs)), you need MOFED (Mellanox OpenFabrics Enterprise Distribution) installed. MOFED is available at https://www.mellanox.com/products/infiniband-drivers/linux/mlnx_ofed.
#
# The following examples assume that the files loaded are on an NVMe storage device and that the CuPy and PyTorch packages are installed.
#
# Please execute the following commands to install the dependent libraries.
#
# ```
# !conda install -c pytorch -c conda-forge pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0
# ```
#
# (If executing `import torch; torch.cuda.is_available()` doesn't show `True`, use the `pip` installation method below.)
#
# or
#
# ```
# !pip install cupy-cuda110
# !pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
# ```

# In[1]:


#!conda install -c pytorch -c conda-forge pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0
# or
#!pip install cupy-cuda110
#!pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html


# ## Open File
#
# You can use either the `CuFileDriver` class or the `open()` method in the `cucim.clara.filesystem` package.

# ### Opening/Closing file with CuFileDriver
#
# A file descriptor is needed to create a CuFileDriver instance.
#
# To use GDS, the file needs to be opened with `os.O_DIRECT`. See the [NVIDIA GPUDirect Storage O_DIRECT Requirements Guide](https://docs.nvidia.com/gpudirect-storage/o-direct-guide/index.html).
#
# Please also see [os.open()](https://docs.python.org/3/library/os.html#os.open) for the detailed options available.

# In[2]:


import os

from cucim.clara.filesystem import CuFileDriver

fno = os.open("input/image.tif", os.O_RDONLY | os.O_DIRECT)
fno2 = os.dup(fno)

# Use GDS if it is available for the file.
fd = CuFileDriver(fno, False)
fd.close()
os.close(fno)

# Do not use GDS even when GDS is supported for the file.
fd2 = CuFileDriver(fno2, True)
fd2.close()
os.close(fno2)

help(CuFileDriver.__init__)


# ### Opening file with `open()` method in cucim.clara.filesystem package
#
# The `cucim.clara.filesystem.open()` method accepts three parameters (`file_path`, `flags`, `mode`).
#
# #### file_path
#
# A string for the file path.
#
# #### flags
#
# `flags` can be one of the following flag strings:
#
# - **"r"** : `os.O_RDONLY`
# - **"r+"** : `os.O_RDWR`
# - **"w"** : `os.O_RDWR` | `os.O_CREAT` | `os.O_TRUNC`
# - **"a"** : `os.O_RDWR` | `os.O_CREAT`
#
# In addition to the above flags, the method appends `os.O_CLOEXEC` and `os.O_DIRECT` by default.
#
# The following optional flags can be added to the above strings (see the sketch after the next cell):
#
# - **'p'**: Use POSIX APIs only (first try to open with O_DIRECT). It does not use GDS.
# - **'n'**: Do not add the O_DIRECT flag.
# - **'m'**: Use a memory-mapped file. This flag is supported only for read-only file descriptors.
#
# When **'m'** is used, `PROT_READ` and `MAP_SHARED` are used for the parameters of the [mmap()](https://man7.org/linux/man-pages/man2/mmap.2.html) function.
#
# #### mode
#
# A file mode. The default value is `0o644`.

# In[3]:


import cucim.clara.filesystem as fs

fd = fs.open("input/image.tif", "r")
fs.close(fd)  # same with fd.close()

# Open file without using GDS
fd2 = fs.open("input/image.tif", "rp")
fs.close(fd2)  # same with fd2.close()
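# The optional **'n'** and **'m'** flags can be appended to the base flag strings in the same way as **'p'** above. The following is a minimal sketch (reusing `input/image.tif` from the cells above):
#
# ```python
# import cucim.clara.filesystem as fs
#
# # 'rn': read-only, without the O_DIRECT flag
# fd_nodirect = fs.open("input/image.tif", "rn")
# fs.close(fd_nodirect)
#
# # 'rm': read-only, memory-mapped (mmap() with PROT_READ and MAP_SHARED)
# fd_mmap = fs.open("input/image.tif", "rm")
# fs.close(fd_mmap)
# ```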
# ## Read/Write File
#
# You can use the `pread()`/`pwrite()` methods in either the `CuFileDriver` class or the `cucim.clara.filesystem` package.
#
# Those methods are similar to the POSIX [pread()](https://man7.org/linux/man-pages/man2/pread.2.html) & [pwrite()](https://man7.org/linux/man-pages/man2/pwrite.2.html) methods, which require `buf`, `count`, and `offset` (`file_offset`) parameters.
#
# However, for the user's convenience, an optional `buf_offset` parameter (default value: `0`) is also added to specify an offset into the input/output buffer; a short sketch follows below.
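# For example, reading `count=4` bytes from `file_offset=0` with `buf_offset=2` fills `buf[2:6]` and leaves the rest of the buffer untouched. A minimal sketch (assuming the 10-byte `input.raw` file created in the next cell):
#
# ```python
# import os
#
# import numpy as np
#
# from cucim.clara.filesystem import CuFileDriver
#
# buf = np.zeros(10, dtype=np.uint8)
#
# fno = os.open("input.raw", os.O_RDONLY)
# fd = CuFileDriver(fno)
# # Read 4 bytes from file offset 0 into buf[2:6].
# read_count = fd.pread(buf, 4, 0, 2)
# fd.close()
# os.close(fno)
# ```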
# ### Using CPU memory
#
# Any Python object supporting [\_\_array_interface__](https://numpy.org/doc/stable/reference/arrays.interface.html) (such as `numpy.array` or `numpy.ndarray`) can be used for the `buf` parameter.
# Alternatively, any pointer address (`int` type) can be used for the `buf` parameter.

# In[4]:


import os

import numpy as np
import torch

from cucim.clara.filesystem import CuFileDriver
import cucim.clara.filesystem as fs

# Write a file with size 10 (in bytes)
with open("input.raw", "wb") as input_file:
    input_file.write(bytearray([101, 102, 103, 104, 105, 106, 107, 108, 109, 110]))

# Create an array with size 10 (in bytes)
np_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=np.uint8)
torch_arr = torch.from_numpy(np_arr)  # Note: np_arr shares internal data with torch_arr

# Using CuFileDriver
fno = os.open("input.raw", os.O_RDONLY)
fd = CuFileDriver(fno)
read_count = fd.pread(np_arr, 8, 0, 2)  # read 8 bytes starting from file offset 0 into buffer offset 2
print("{:10} cnt: {} content: {}".format("np_arr", read_count, np_arr))
read_count = fd.pread(np_arr, 10, 0)  # read 10 bytes starting from file offset 0
print("{:10} cnt: {} content: {}".format("np_arr", read_count, np_arr))
read_count = fd.pread(torch_arr.data_ptr(), 10, 3)  # read 10 bytes starting from file offset 3
print("{:10} cnt: {} content: {}".format("torch_arr", read_count, torch_arr))
fd.close()
os.close(fno)

fno = os.open("output.raw", os.O_RDWR | os.O_CREAT | os.O_TRUNC)
fd = CuFileDriver(fno)
write_count = fd.pwrite(np_arr, 10, 5)  # write 10 bytes from np_arr to the file, starting from file offset 5
fd.close()
os.close(fno)
print("{:10} cnt: {} content: {}".format("output.raw", write_count, list(open("output.raw", "rb").read())))
print()

# Using filesystem package
fd = fs.open("output.raw", "r")
read_count = fs.pread(fd, np_arr, 10, 0)  # read 10 bytes starting from file offset 0
print("{:10} cnt: {} content: {}".format("np_arr", read_count, np_arr))
fs.close(fd)  # same with fd.close()

# Using 'with' statement
with fs.open("output.raw", "r") as fd:
    read_count = fd.pread(np_arr, 10, 0)  # read 10 bytes starting from file offset 0
    print("{:10} cnt: {} content: {}".format("np_arr", read_count, np_arr))


# ### Using GPU memory
#
# Any Python object supporting [\_\_cuda_array_interface__](http://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html) (such as `cupy.array`, `cupy.ndarray`, or a PyTorch CUDA tensor) can be used for the `buf` parameter.
# Alternatively, any pointer address (`int` type) can be used for the `buf` parameter.

# In[5]:


import os

import cupy as cp
import torch

from cucim.clara.filesystem import CuFileDriver
import cucim.clara.filesystem as fs

# Write a file with size 10 (in bytes)
with open("input.raw", "wb") as input_file:
    input_file.write(bytearray([101, 102, 103, 104, 105, 106, 107, 108, 109, 110]))

# Create an array with size 10 (in bytes)
cp_arr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.uint8)
cuda0 = torch.device('cuda:0')
torch_arr = torch.zeros(10, dtype=torch.uint8, device=cuda0)

# Using CuFileDriver
fno = os.open("input.raw", os.O_RDONLY | os.O_DIRECT)
fd = CuFileDriver(fno)
read_count = fd.pread(cp_arr, 8, 0, 2)  # read 8 bytes starting from file offset 0 into buffer offset 2
print("{:20} cnt: {} content: {}".format("cp_arr", read_count, cp_arr))
read_count = fd.pread(cp_arr, 10, 0)  # read 10 bytes starting from file offset 0
print("{:20} cnt: {} content: {}".format("cp_arr", read_count, cp_arr))
read_count = fd.pread(torch_arr, 10, 3)  # read 10 bytes starting from file offset 3
print("{:20} cnt: {} content: {}".format("torch_arr", read_count, torch_arr))
fd.close()
os.close(fno)

fno = os.open("output.raw", os.O_RDWR | os.O_CREAT | os.O_TRUNC)
fd = CuFileDriver(fno)
write_count = fd.pwrite(cp_arr, 10, 5)  # write 10 bytes from cp_arr to the file, starting from file offset 5
fd.close()
os.close(fno)
print("{:20} cnt: {} content: {}".format("output.raw", write_count, list(open("output.raw", "rb").read())))
print()

# Using filesystem package
fd = fs.open("output.raw", "r")
read_count = fs.pread(fd, cp_arr, 10, 0)  # read 10 bytes starting from file offset 0
print("{:20} cnt: {} content: {}".format("cp_arr", read_count, cp_arr))
fs.close(fd)  # same with fd.close()

# Using 'with' statement
with fs.open("output.raw", "r") as fd:
    read_count = fd.pread(cp_arr, 10, 0)  # read 10 bytes starting from file offset 0
    print("{:20} cnt: {} content: {}".format("cp_arr", read_count, cp_arr))


# In[6]:


cp_arr.__cuda_array_interface__


# In[7]:


torch_arr.__cuda_array_interface__


# ## Discarding system (page) cache for a file
#
# You can use the `discard_page_cache()` method to discard the system (page) cache for a given file before any performance measurement on that file.
#
# ```python
# import cucim.clara.filesystem as fs
#
# fs.discard_page_cache("input/image.tif")
# # ... file APIs on `input/image.tif`
# ```
#
# Its implementation looks like the following:
#
# ```C++
# bool discard_page_cache(const char* file_path)
# {
#     int fd = ::open(file_path, O_RDONLY);
#     if (fd < 0)
#     {
#         return false;
#     }
#     if (::fdatasync(fd) < 0)
#     {
#         return false;
#     }
#     if (::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED) < 0)
#     {
#         return false;
#     }
#     if (::close(fd) < 0)
#     {
#         return false;
#     }
#     return true;
# }
# ```
#
# It helps measure file access performance accurately, without the effect of the page cache.

# ## Experiments (for a big file such as .mhd)
#
# The experiments were conducted on an Intel(R) Core(TM) i7-7800X CPU @ 3.50GHz with a Samsung SSD 970 PRO (NVMe SSD, 1TB).
#
# Reading 10GB of data:
#
# ```
# second method(posix + cudamemcpy) : 5.031040154863149
# second method(posix+odirect + cudamemcpy) : 4.7419630330987275
# second method(gds) : 4.235773948952556
# ```
#
# Performance gain: 1.19x
#
# Reading 2GB of data:
#
# ```
# second method(posix) : 1.0681836600415409
# second method(posix+odirect) : 0.9496012150775641
# second method(gds) : 0.8406150250229985
# ```
#
# Performance gain: 1.27x
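# A measurement like the above can be set up with the APIs shown in this notebook. The following is a minimal sketch (not the script that produced the numbers above; `input/large.raw` and the chunk size are placeholders, and it assumes `pread()` returns `0` at end of file, following POSIX semantics). The `"rp"` run reads into GPU memory through POSIX APIs, which corresponds to the "posix + cudamemcpy" label above:
#
# ```python
# from time import perf_counter
#
# import cupy as cp
#
# import cucim.clara.filesystem as fs
#
# FILE_PATH = "input/large.raw"  # placeholder: a large file on an NVMe mount
# CHUNK_SIZE = 16 * 1024 * 1024  # placeholder: 16MB per read
#
# buf = cp.zeros(CHUNK_SIZE, dtype=cp.uint8)
#
# for label, flags in [("posix", "rp"), ("gds", "r")]:
#     fs.discard_page_cache(FILE_PATH)  # drop the page cache before each run
#     start = perf_counter()
#     with fs.open(FILE_PATH, flags) as fd:
#         offset = 0
#         while True:
#             read_count = fd.pread(buf, CHUNK_SIZE, offset)
#             if read_count <= 0:  # assumed EOF behavior (POSIX-like)
#                 break
#             offset += read_count
#     print("{:10}: {}".format(label, perf_counter() - start))
# ```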