#!/usr/bin/env python
# coding: utf-8

# # Accessing File with GDS

# ## Prerequisite
#
# [NVIDIA® GPUDirect® Storage (GDS)](https://developer.nvidia.com/gpudirect-storage) needs to be installed to use the GDS feature (since CUDA Toolkit 11.4, the GDS client package has been available).
#
# The file access APIs still work without the GDS host (kernel) packages, but you won't see the speed-up.
# Please follow the [release note](https://docs.nvidia.com/gpudirect-storage/release-notes/index.html) or the [installation guide](https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/index.html#abstract) to install GDS on your host system.
#
# - Note: During the GDS prerequisite installation (step 3 of [the installation guide](https://docs.nvidia.com/gpudirect-storage/troubleshooting-guide/index.html#install-prereqs)), you need MOFED (Mellanox OpenFabrics Enterprise Distribution) installed. MOFED is available at https://www.mellanox.com/products/infiniband-drivers/linux/mlnx_ofed.
#
# The following examples assume that the files loaded are on an NVMe storage device and that the CuPy and PyTorch packages are installed.
#
# Please execute the following commands to install the dependent libraries.
#
# ```
# !conda install -c pytorch -c conda-forge pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0
# ```
#
# (If executing `import torch; torch.cuda.is_available()` doesn't show `True`, use the `pip` installation method below.)
#
# or
#
# ```
# !pip install cupy-cuda110
# !pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
# ```

# In[1]:


#!conda install -c pytorch -c conda-forge pytorch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2 cudatoolkit=11.0
# or
#!pip install cupy-cuda110
#!pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html


# ## Open File
#
# You can use either the `CuFileDriver` class or the `open()` method in the `cucim.clara.filesystem` package.

# ### Opening/Closing file with CuFileDriver
#
# A file descriptor is needed to create a CuFileDriver instance.
#
# To use GDS, the file needs to be opened with `os.O_DIRECT`. See the [NVIDIA GPUDirect Storage O_DIRECT Requirements Guide](https://docs.nvidia.com/gpudirect-storage/o-direct-guide/index.html).
#
# Please also see [os.open()](https://docs.python.org/3/library/os.html#os.open) for the detailed options available.

# In[2]:


import os

from cucim.clara.filesystem import CuFileDriver

fno = os.open("input/image.tif", os.O_RDONLY | os.O_DIRECT)
fno2 = os.dup(fno)

# Use GDS if it is available for the file.
fd = CuFileDriver(fno, False)
fd.close()
os.close(fno)

# Do not use GDS even when GDS is supported for the file.
fd2 = CuFileDriver(fno2, True)
fd2.close()
os.close(fno2)

help(CuFileDriver.__init__)


# ### Opening file with `open()` method in cucim.clara.filesystem package
#
# The `cucim.clara.filesystem.open()` method accepts three parameters (`file_path`, `flags`, `mode`).
#
# #### file_path
#
# A string for the file path.
#
# #### flags
#
# `flags` can be one of the following flag strings:
#
# - **"r"** : `os.O_RDONLY`
# - **"r+"** : `os.O_RDWR`
# - **"w"** : `os.O_RDWR` | `os.O_CREAT` | `os.O_TRUNC`
# - **"a"** : `os.O_RDWR` | `os.O_CREAT`
#
# In addition to the above flags, the method appends `os.O_CLOEXEC` and `os.O_DIRECT` by default.
#
# The following optional flags can be added to the above strings (see the sketch after the next cell):
#
# - **'p'**: Use POSIX APIs only (first try to open with O_DIRECT). It does not use GDS.
# - **'n'**: Do not add the O_DIRECT flag.
# - **'m'**: Use a memory-mapped file. This flag is supported only for read-only file descriptors.
#
# When **'m'** is used, `PROT_READ` and `MAP_SHARED` are used for the parameters of the [mmap()](https://man7.org/linux/man-pages/man2/mmap.2.html) function.
#
# #### mode
#
# A file mode. The default value is `0o644`.

# In[3]:


import cucim.clara.filesystem as fs

fd = fs.open("input/image.tif", "r")
fs.close(fd)  # same with fd.close()

# Open file without using GDS
fd2 = fs.open("input/image.tif", "rp")
fs.close(fd2)  # same with fd2.close()
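# The optional **'n'** and **'m'** flags can be appended to the base flag strings in the same way as **'p'** above. The following is a minimal sketch (reusing `input/image.tif` from the cells above):
#
# ```python
# import cucim.clara.filesystem as fs
#
# # 'rn': read-only, without the O_DIRECT flag
# fd_nodirect = fs.open("input/image.tif", "rn")
# fs.close(fd_nodirect)
#
# # 'rm': read-only, memory-mapped (mmap() with PROT_READ and MAP_SHARED)
# fd_mmap = fs.open("input/image.tif", "rm")
# fs.close(fd_mmap)
# ```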
# ## Read/Write File
#
# You can use the `pread()`/`pwrite()` methods in either the `CuFileDriver` class or the `cucim.clara.filesystem` package.
#
# Those methods are similar to the POSIX [pread()](https://man7.org/linux/man-pages/man2/pread.2.html) & [pwrite()](https://man7.org/linux/man-pages/man2/pwrite.2.html) methods, which require `buf`, `count`, and `offset` (`file_offset`) parameters.
#
# However, for the user's convenience, an optional `buf_offset` parameter (default value: `0`) is also added to specify an offset into the input/output buffer; a short sketch follows below.
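# For example, reading `count=4` bytes from `file_offset=0` with `buf_offset=2` fills `buf[2:6]` and leaves the rest of the buffer untouched. A minimal sketch (assuming the 10-byte `input.raw` file created in the next cell):
#
# ```python
# import os
#
# import numpy as np
#
# from cucim.clara.filesystem import CuFileDriver
#
# buf = np.zeros(10, dtype=np.uint8)
#
# fno = os.open("input.raw", os.O_RDONLY)
# fd = CuFileDriver(fno)
# # Read 4 bytes from file offset 0 into buf[2:6].
# read_count = fd.pread(buf, 4, 0, 2)
# fd.close()
# os.close(fno)
# ```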
# ### Using CPU memory
#
# Any Python object supporting [\_\_array_interface__](https://numpy.org/doc/stable/reference/arrays.interface.html) (such as `numpy.array` or `numpy.ndarray`) can be used for the `buf` parameter.
# Alternatively, any pointer address (`int` type) can be used for the `buf` parameter.

# In[4]:


import os

import numpy as np
import torch

from cucim.clara.filesystem import CuFileDriver
import cucim.clara.filesystem as fs

# Write a file with size 10 (in bytes)
with open("input.raw", "wb") as input_file:
    input_file.write(bytearray([101, 102, 103, 104, 105, 106, 107, 108, 109, 110]))

# Create an array with size 10 (in bytes)
np_arr = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=np.uint8)
torch_arr = torch.from_numpy(np_arr)  # Note: np_arr shares internal data with torch_arr

# Using CuFileDriver
fno = os.open("input.raw", os.O_RDONLY)
fd = CuFileDriver(fno)
read_count = fd.pread(np_arr, 8, 0, 2)  # read 8 bytes starting from file offset 0 into buffer offset 2
print("{:10} cnt: {} content: {}".format("np_arr", read_count, np_arr))
read_count = fd.pread(np_arr, 10, 0)  # read 10 bytes starting from file offset 0
print("{:10} cnt: {} content: {}".format("np_arr", read_count, np_arr))
read_count = fd.pread(torch_arr.data_ptr(), 10, 3)  # read 10 bytes starting from file offset 3
print("{:10} cnt: {} content: {}".format("torch_arr", read_count, torch_arr))
fd.close()
os.close(fno)

fno = os.open("output.raw", os.O_RDWR | os.O_CREAT | os.O_TRUNC)
fd = CuFileDriver(fno)
write_count = fd.pwrite(np_arr, 10, 5)  # write 10 bytes from np_arr to the file, starting from file offset 5
fd.close()
os.close(fno)
print("{:10} cnt: {} content: {}".format("output.raw", write_count, list(open("output.raw", "rb").read())))
print()

# Using filesystem package
fd = fs.open("output.raw", "r")
read_count = fs.pread(fd, np_arr, 10, 0)  # read 10 bytes starting from file offset 0
print("{:10} cnt: {} content: {}".format("np_arr", read_count, np_arr))
fs.close(fd)  # same with fd.close()

# Using 'with' statement
with fs.open("output.raw", "r") as fd:
    read_count = fd.pread(np_arr, 10, 0)  # read 10 bytes starting from file offset 0
    print("{:10} cnt: {} content: {}".format("np_arr", read_count, np_arr))


# ### Using GPU memory
#
# Any Python object supporting [\_\_cuda_array_interface__](http://numba.pydata.org/numba-doc/latest/cuda/cuda_array_interface.html) (such as `cupy.array`, `cupy.ndarray`, or a PyTorch CUDA tensor) can be used for the `buf` parameter.
# Alternatively, any pointer address (`int` type) can be used for the `buf` parameter.

# In[5]:


import os

import cupy as cp
import torch

from cucim.clara.filesystem import CuFileDriver
import cucim.clara.filesystem as fs

# Write a file with size 10 (in bytes)
with open("input.raw", "wb") as input_file:
    input_file.write(bytearray([101, 102, 103, 104, 105, 106, 107, 108, 109, 110]))

# Create an array with size 10 (in bytes)
cp_arr = cp.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=cp.uint8)
cuda0 = torch.device('cuda:0')
torch_arr = torch.zeros(10, dtype=torch.uint8, device=cuda0)

# Using CuFileDriver
fno = os.open("input.raw", os.O_RDONLY | os.O_DIRECT)
fd = CuFileDriver(fno)
read_count = fd.pread(cp_arr, 8, 0, 2)  # read 8 bytes starting from file offset 0 into buffer offset 2
print("{:20} cnt: {} content: {}".format("cp_arr", read_count, cp_arr))
read_count = fd.pread(cp_arr, 10, 0)  # read 10 bytes starting from file offset 0
print("{:20} cnt: {} content: {}".format("cp_arr", read_count, cp_arr))
read_count = fd.pread(torch_arr, 10, 3)  # read 10 bytes starting from file offset 3
print("{:20} cnt: {} content: {}".format("torch_arr", read_count, torch_arr))
fd.close()
os.close(fno)

fno = os.open("output.raw", os.O_RDWR | os.O_CREAT | os.O_TRUNC)
fd = CuFileDriver(fno)
write_count = fd.pwrite(cp_arr, 10, 5)  # write 10 bytes from cp_arr to the file, starting from file offset 5
fd.close()
os.close(fno)
print("{:20} cnt: {} content: {}".format("output.raw", write_count, list(open("output.raw", "rb").read())))
print()

# Using filesystem package
fd = fs.open("output.raw", "r")
read_count = fs.pread(fd, cp_arr, 10, 0)  # read 10 bytes starting from file offset 0
print("{:20} cnt: {} content: {}".format("cp_arr", read_count, cp_arr))
fs.close(fd)  # same with fd.close()

# Using 'with' statement
with fs.open("output.raw", "r") as fd:
    read_count = fd.pread(cp_arr, 10, 0)  # read 10 bytes starting from file offset 0
    print("{:20} cnt: {} content: {}".format("cp_arr", read_count, cp_arr))


# In[6]:


cp_arr.__cuda_array_interface__


# In[7]:


torch_arr.__cuda_array_interface__


# ## Discarding system (page) cache for a file
#
# You can use the `discard_page_cache()` method to discard the system (page) cache for a given file before any performance measurement on that file.
#
# ```python
# import cucim.clara.filesystem as fs
#
# fs.discard_page_cache("input/image.tif")
# # ... file APIs on `input/image.tif`
# ```
#
# Its implementation looks like the following:
#
# ```C++
# bool discard_page_cache(const char* file_path)
# {
#     int fd = ::open(file_path, O_RDONLY);
#     if (fd < 0)
#     {
#         return false;
#     }
#     if (::fdatasync(fd) < 0)
#     {
#         return false;
#     }
#     if (::posix_fadvise(fd, 0, 0, POSIX_FADV_DONTNEED) < 0)
#     {
#         return false;
#     }
#     if (::close(fd) < 0)
#     {
#         return false;
#     }
#     return true;
# }
# ```
#
# It helps measure file access performance accurately, without the effect of the page cache.

# ## Experiments (for a big file such as .mhd)
#
# The experiments were conducted on an Intel(R) Core(TM) i7-7800X CPU @ 3.50GHz with a Samsung SSD 970 PRO (NVMe SSD, 1TB).
#
# Reading 10GB of data:
#
# ```
# second method(posix + cudamemcpy) : 5.031040154863149
# second method(posix+odirect + cudamemcpy) : 4.7419630330987275
# second method(gds) : 4.235773948952556
# ```
#
# Performance gain: 1.19x
#
# Reading 2GB of data:
#
# ```
# second method(posix) : 1.0681836600415409
# second method(posix+odirect) : 0.9496012150775641
# second method(gds) : 0.8406150250229985
# ```
#
# Performance gain: 1.27x
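# A measurement like the above can be set up with the APIs shown in this notebook. The following is a minimal sketch (not the script that produced the numbers above; `input/large.raw` and the chunk size are placeholders, and it assumes `pread()` returns `0` at end of file, following POSIX semantics). The `"rp"` run reads into GPU memory through POSIX APIs, which corresponds to the "posix + cudamemcpy" label above:
#
# ```python
# from time import perf_counter
#
# import cupy as cp
#
# import cucim.clara.filesystem as fs
#
# FILE_PATH = "input/large.raw"  # placeholder: a large file on an NVMe mount
# CHUNK_SIZE = 16 * 1024 * 1024  # placeholder: 16MB per read
#
# buf = cp.zeros(CHUNK_SIZE, dtype=cp.uint8)
#
# for label, flags in [("posix", "rp"), ("gds", "r")]:
#     fs.discard_page_cache(FILE_PATH)  # drop the page cache before each run
#     start = perf_counter()
#     with fs.open(FILE_PATH, flags) as fd:
#         offset = 0
#         while True:
#             read_count = fd.pread(buf, CHUNK_SIZE, offset)
#             if read_count <= 0:  # assumed EOF behavior (POSIX-like)
#                 break
#             offset += read_count
#     print("{:10}: {}".format(label, perf_counter() - start))
# ```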