#!/usr/bin/env python
# coding: utf-8

# # What this notebook is about
#
# Currently, [`importlib`](https://docs.python.org/3/library/importlib.html#module-importlib) provides no easy way to get data from within a package that is not stored as a Python literal in source code. This is a problem for projects that provide a package containing data in a format that is better kept in a file. Hence there is a desire to create an `importlib.resources` module to solve this problem by providing a way to get data through an API that is agnostic to how the package containing the data is stored (e.g., as files on disk or in a zip file).

# # The current solution
#
# The [`pkg_resources`](https://pythonhosted.org/setuptools/pkg_resources.html) module in [`setuptools`](https://pypi.python.org/pypi/setuptools) provides an API that people can use to extract resources from a package. That's basically fancy talk to say there is an API there to read data that comes as part of your package in a separate file. The key reason people use `pkg_resources` is that it works both with packages on the file system and with code stored in a zip file. This makes it a portable way to access e.g. data files no matter which way someone chooses to bundle the package with their code. And thanks to having a defined API, people can add support for alternative ways of storing a package, e.g. in a sqlite database.
#
# The [`ResourceManager`](https://pythonhosted.org/setuptools/pkg_resources.html#resourcemanager-api) API defines what support `pkg_resources` provides. The methods are:
#
# - `resource_exists(package_or_requirement, resource_name)`
# - `resource_stream(package_or_requirement, resource_name)`
# - `resource_string(package_or_requirement, resource_name)`
# - `resource_isdir(package_or_requirement, resource_name)`
# - `resource_listdir(package_or_requirement, resource_name)`
# - `resource_filename(package_or_requirement, resource_name)`
# - `set_extraction_path(path)`
# - `cleanup_resources(force=False)`
# - `get_cache_path(archive_name, names=())`
# - `extraction_error()`
# - `postprocess(tempname, filename)`
#
# As you can see, the API is extensive. There are two direct ways to access data -- `resource_stream()` and `resource_string()` -- as well as one indirect way -- `resource_filename()`. There are also various methods that provide a file system-like interface -- `resource_exists()`, `resource_isdir()`, `resource_listdir()`. The rest of the API is for management of temporary files.

# # Designing a new API
#
# Because the need to read data from files included in a package is a common enough occurrence, there is a desire to provide an implementation in `importlib` in Python's standard library. That will help make sure it evolves along with import itself, keep it maintained, and remove the need for an external dependency for something that is somewhat fundamental but tricky to get right.
#
# ## Overall API shape
#
# The first design question is what the overall shape of the API should be. Because this is all meant to address accessing data files for a package, every function will require knowing what package is being used for the access and the relative path to the data file. There are essentially two possible approaches (sketched below):
#
# 1. `importlib.resources(module).some_func('relative_path')`
# 2. `importlib.resources.some_func(module, 'relative_path')`
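#
# For illustration (this sketch is not from the original notebook), user code under each approach would look roughly like the following, where `sound_effects` is a placeholder package and `read_bytes()` a placeholder function name:
#
# ```python
# # Approach 1: importlib.resources is a callable returning a reader-like object.
# data = importlib.resources(sound_effects).read_bytes('meow.wav')
#
# # Approach 2: importlib.resources is a module whose functions take the package.
# data = importlib.resources.read_bytes(sound_effects, 'meow.wav')
# ```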
#
# In the end the choice is essentially cosmetic. There is no measurable performance difference between the two; it simply comes down to whether an object-based or function-based approach is preferred aesthetically (one could argue caching the result of approach 1 is a perk, but that is extremely minor and probably not going to be a common occurrence). Because of this, a [poll on Google+](https://plus.google.com/u/0/+BrettCannon/posts/gimHMzJntmQ) was held to see what people preferred, and approach 2 won out.
#
# With this choice in mind, any APIs proposed in this notebook will be implemented by *resource readers*. Each module is expected to have a unique instance of a resource reader saved in its `__spec__.loader_state` dictionary under the `resource reader` key. This means it will be the job of the finder to instantiate and store the resource reader in the module's spec. The ABC that will be defined for resource readers will be what people use to access data files. With approach 2 chosen for the API shape, `importlib.resources` would be a module instead of a function and would provide wrappers around the resource reader object.

# In[1]:


import abc
import importlib.util
import pathlib
import types
import typing as t


# importlib.abc
class ResourceReader(metaclass=abc.ABCMeta):

    # Providing primarily for typing at this point.
    # Details will be provided throughout this notebook as issues are discussed.

    def __init__(self, spec):
        self._base = pathlib.Path(spec.origin).parent


# importlib
def resources(module: t.Union[str, types.ModuleType]) -> ResourceReader:
    spec = module.__spec__ if isinstance(module, types.ModuleType) else importlib.util.find_spec(module)
    try:
        return spec.loader_state['resource reader']
    except (KeyError, TypeError) as exc:
        raise ValueError('no resource reader found for ' + repr(module)) from exc


# ### File paths
#
# A special mention of what will be considered valid file paths is necessary. Since all paths are expected to be relative to the location of a module, all paths must be relative. But being relative doesn't mean that arbitrary directory traversal should be allowed either. This means that a path cannot start with `../`, contain `/../`, or end with `/..`. Since the relative paths are anchored to a specific module there isn't a need to be able to traverse upwards; you simply anchor from a module higher up in the package namespace.

# In[2]:


import pathlib
import typing as t


def check_relative_path(path: t.Union[str, pathlib.Path]) -> None:
    """Raise an exception if the path is not valid for use with a resource reader."""
    relative_path = pathlib.Path(path)
    if relative_path.is_absolute():
        raise ValueError('absolute paths such as {!r} are not allowed for resource readers'.format(path))
    posix_path = relative_path.as_posix()
    if posix_path.startswith('../') or '/../' in posix_path or posix_path.endswith('/..'):
        raise ValueError('parent directory traversal such as in {!r} is not allowed for resource readers'.format(path))


# In[3]:


# Let's create a test scenario.
import pathlib

package_directory = pathlib.Path('resource_test')
package_directory.mkdir(exist_ok=True)
init_path = package_directory / '__init__.py'
init_path.touch(exist_ok=True)
resource_file = package_directory / 'resource_file.txt'
with resource_file.open('w') as file:
    file.write('Hello, World!')
print('Resource file exists at', resource_file.resolve())


def _set_reader(module, reader_class):
    module.__spec__.loader_state = {}
    module.__spec__.loader_state['resource reader'] = reader_class(module.__spec__)


# Verify the package is on sys.path.
import resource_test
resource_test.__spec__
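
# As a quick sanity check (this cell is not part of the original notebook and the example paths are made up), `check_relative_path()` silently accepts ordinary relative paths but rejects parent-directory traversal and -- on POSIX systems -- absolute paths:

# In[ ]:


check_relative_path('data/config.json')  # A plain relative path raises nothing.
for bad_path in ('/etc/passwd', '../secret.txt', 'data/../../secret.txt'):
    try:
        check_relative_path(bad_path)
    except ValueError as exc:
        print('rejected:', exc)
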
# ## Directly reading bytes
#
# At minimum, a way to get the data is needed. This can be provided either as a stream or as bytes, depending on needs. A stream makes the most sense if the data is large and should not be read entirely into memory at once. Unfortunately there is no guarantee that the storage back-end for the package easily supports the concept of a stream, which would mean certain back-ends would need to create an `io.BytesIO` instance to provide a streaming object. Reading all bytes has the exact opposite pros and cons: it does require reading all data into memory, but there is no unnecessary streaming object created just to hide implementation details. Because the chances are low that a data file included in a package is so large that arbitrary file seeking is necessary to keep memory pressure down, the initial version of resource readers will only worry about returning bytes directly. Providing an API for returning a stream for a data file can be re-considered based on user demand.

# In[4]:


import pathlib
import typing as t


class ReadingResourceReader(ResourceReader):

    """Provide a way for resource readers to read bytes."""

    # Would be an abstractmethod in the base class.
    # Implementing the file-based approach for demonstration purposes.
    def read_bytes(self, path: t.Union[str, pathlib.Path]) -> bytes:
        # XXX Make a decorator (class or function)?
        # Have subclasses implement another method that this one can delegate to?
        check_relative_path(path)
        full_path = self._base / pathlib.Path(path)
        with full_path.open('rb') as file:
            return file.read()


# Example usage.
import resource_test
_set_reader(resource_test, ReadingResourceReader)
resources(resource_test).read_bytes('resource_file.txt')


# The other way to get data out is with a file path to the data. This comes up in APIs such as OpenSSL's, which take a file path to load certificate files and provide no other way to load the data (arguments over whether this is poor API design are being ignored for this discussion since APIs taking only file paths already exist whether we agree with the design or not). This means that providing an API to get the file path to a data file may be useful.
#
# The problem with this, much like with streams, is that if the storage back-end for the package doesn't have a proper concept of files -- e.g. zip files -- then there is no way to return a file path that some code will most likely end up calling `open()` on. To determine how prevalent the use of this API is, you can quickly do a [GitHub search for `pkg_resources.resource_filename`](https://github.com/search?utf8=%E2%9C%93&q=pkg_resources.resource_filename&type=Code&ref=searchresults) and find out that it is used [more than `pkg_resources.resource_stream()`](https://github.com/search?q=pkg_resources.resource_stream&type=Code&utf8=%E2%9C%93) and [`pkg_resources.resource_string()`](https://github.com/search?utf8=%E2%9C%93&q=pkg_resources.resource_string&type=Code&ref=searchresults). Looking at the results for code using `pkg_resources.resource_filename()`, though, shows that the vast majority of them do not actually require the functionality. Typically the API is used with [`lxml.etree.parse()`](http://lxml.de/api/lxml.etree-module.html#parse), but there is also [`lxml.etree.fromstring()`](http://lxml.de/api/lxml.etree-module.html#fromstring) as an alternative for parsing from a string directly instead of a file. There was one case of someone using the path as the default value for a flag in a CLI, but they could also have easily created their own temporary file while the application was running and used that as the default argument.
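#
# To make that concrete, here is a sketch (not from the original notebook) of how the common `resource_filename()` plus `lxml.etree.parse()` pattern could be rewritten against a bytes-returning reader; `mypkg` and `schema.xml` are placeholder names:
#
# ```python
# # Today, by way of a real file path:
# tree = lxml.etree.parse(pkg_resources.resource_filename('mypkg', 'schema.xml'))
#
# # With a bytes-returning resource reader no file on disk is needed:
# root = lxml.etree.fromstring(resources('mypkg').read_bytes('schema.xml'))
# ```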
#
# But it should also be mentioned that getting a file path back, and cleaning up properly when that path is only *possibly* a temporary file, becomes tricky. The [`tempfile.NamedTemporaryFile` context manager](https://docs.python.org/3/library/tempfile.html#tempfile.NamedTemporaryFile) can delete the file upon closing it or exiting the context manager, but that means you must be able to write to the file and then have it read somewhere else via another call to `open()` *without* closing the file first, and that's not feasible at least on Windows (trying to read from a file that's already open with another call to `open()` raises `PermissionError`). You can turn off the automatic deletion, but that then means you need to know when a temporary file was returned instead of an actual file path in order to do the proper cleanup. One could use the [atexit module](https://docs.python.org/3/library/atexit.html#module-atexit), but that won't guarantee file cleanup in the face of e.g. a signal triggering an exit, while also leaving the temporary file lying around longer than necessary. You could return both the file path and a boolean indicating whether the file path is to a temporary file, but that adds boilerplate to all uses of the API and is still error-prone if one forgets to do the proper cleanup. In the end, probably the only reasonable way to provide this API is through a context manager that returns the path to the file and, if the file is temporary, cleans it up on exit.

# In[5]:


import contextlib
import pathlib
import tempfile
import typing as t


class RealFilePathResourceReader(ReadingResourceReader):

    @contextlib.contextmanager
    def file_path(self, path: t.Union[str, pathlib.Path], *, tempfile_ok=False):
        """Return a context manager for the path to the file containing the specified data."""
        # a.k.a. the Donald Stufft method.
        check_relative_path(path)
        yield self._base / pathlib.Path(path)


# Example usage.
import resource_test
_set_reader(resource_test, RealFilePathResourceReader)
with resources(resource_test).file_path('resource_file.txt') as path:
    print(path)
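
# As an example of the kind of consumer this serves (a sketch that is not in the original notebook), the `cafile` argument to the standard library's `ssl.SSLContext.load_verify_locations()` expects a path on disk, so code could lean on the context manager like this; `ca_bundle.pem` is a made-up resource name:
#
# ```python
# import ssl
#
# context = ssl.create_default_context()
# with resources(resource_test).file_path('ca_bundle.pem') as path:
#     # The context manager hands over a usable path (temporary or not) and
#     # takes care of any cleanup once the block exits.
#     context.load_verify_locations(cafile=str(path))
# ```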
# Providing a file system-like API for discoverability of data would be nice, but it is not a requirement for the API to be useful. Since package authors know what data is included with their package, the need to discover what is in the package diminishes quickly. While people may choose to use this mechanism to try and provide a solution for deploying Python applications, that is not the initial focus of the API.

# # Proposed ABC
#
# Below is the proposed abstract base class for resource readers. It includes a default implementation for `file_path()` which relies on creating a temporary file. It is expected that a file-based resource reader will override it with a concrete implementation as shown previously in this notebook.

# In[6]:


import abc
import contextlib
import functools
from importlib import machinery
import os
import pathlib
import tempfile
import typing as t


# Have not decided if I want to use this or not.
class ResourceReaderMeta(abc.ABCMeta):

    """Experimental metaclass to enforce checking for valid relative paths."""

    def __new__(cls, name, bases, namespace, **kwargs):
        # Wrap the path-taking methods *before* the class object is created so
        # that the wrapped versions are what actually end up on the class.
        for method_name in ('read_bytes', 'file_path'):
            if method_name in namespace:
                namespace[method_name] = cls._check_relative_path(namespace[method_name])
        return super().__new__(cls, name, bases, namespace, **kwargs)

    @staticmethod
    def _check_relative_path(func):
        """Decorator to check that the `path` argument is not illegally structured."""
        @functools.wraps(func)
        def wrapper(self, path, *args, **kwargs):
            """Raise an exception if the path is not valid for use with a resource reader."""
            relative_path = pathlib.Path(path)
            if relative_path.is_absolute():
                raise ValueError('absolute paths such as {!r} are not allowed for resource readers'.format(path))
            posix_path = relative_path.as_posix()
            if posix_path.startswith('../') or '/../' in posix_path or posix_path.endswith('/..'):
                raise ValueError('parent directory traversal such as in {!r} is not allowed for '
                                 'resource readers'.format(path))
            return func(self, path, *args, **kwargs)
        return wrapper


class ResourceReader(metaclass=abc.ABCMeta):

    """Abstract base class defining the resource reader interface."""

    @abc.abstractmethod
    def __init__(self, spec: machinery.ModuleSpec):
        """Create a resource reader for the provided spec.

        The resource reader should NOT worry about storing itself on the spec.

        """

    @abc.abstractmethod
    def read_bytes(self, path: t.Union[str, pathlib.Path]) -> bytes:
        """Return the bytes found at the path relative to this reader."""

    # http://bugs.python.org/issue25609 is tracking adding a context manager type to the typing module.
    @contextlib.contextmanager
    def file_path(self, path: t.Union[str, pathlib.Path], *, tempfile_ok=False):
        """Return a context manager to a temporary file containing the specified data.

        When the context manager exits, the temporary file will be deleted (if it still exists).

        """
        if not tempfile_ok:
            raise FileNotFoundError('resource reader does not support file paths without creating a temporary file')
        check_relative_path(path)
        data = self.read_bytes(path)
        # tempfile.mkstemp() returns an open file descriptor along with the path;
        # close the descriptor since the file is re-opened through pathlib below.
        fd, temp_name = tempfile.mkstemp()
        os.close(fd)
        temp_path = pathlib.Path(temp_name)
        try:
            with temp_path.open('wb') as file:
                file.write(data)
            yield temp_path
        finally:
            try:
                temp_path.unlink()
            except FileNotFoundError:
                pass
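
# As a quick check of the default `file_path()` implementation (this cell and the throwaway `DemoReader` class are not part of the original notebook), the data is round-tripped through a temporary file which is deleted once the context manager exits:

# In[ ]:


class DemoReader(ResourceReader):

    """Minimal file-based reader used only to exercise the default file_path()."""

    def __init__(self, spec):
        self._base = pathlib.Path(spec.origin).parent

    def read_bytes(self, path):
        check_relative_path(path)
        with (self._base / pathlib.Path(path)).open('rb') as file:
            return file.read()


import resource_test
_set_reader(resource_test, DemoReader)
with resources(resource_test).file_path('resource_file.txt', tempfile_ok=True) as temp_path:
    print('temporary copy at', temp_path, 'containing', temp_path.read_bytes())
print('temporary file still exists?', temp_path.exists())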