| """A file interface for handling local and remote data files. | |
| The goal of datasource is to abstract some of the file system operations | |
| when dealing with data files so the researcher doesn't have to know all the | |
| low-level details. Through datasource, a researcher can obtain and use a | |
| file with one function call, regardless of location of the file. | |
| DataSource is meant to augment standard python libraries, not replace them. | |
| It should work seamlessly with standard file IO operations and the os | |
| module. | |
| DataSource files can originate locally or remotely: | |
| - local files : '/home/guido/src/local/data.txt' | |
| - URLs (http, ftp, ...) : 'http://www.scipy.org/not/real/data.txt' | |
| DataSource files can also be compressed or uncompressed. Currently only | |
| gzip, bz2 and xz are supported. | |
| Example:: | |
| >>> # Create a DataSource, use os.curdir (default) for local storage. | |
| >>> from numpy import DataSource | |
| >>> ds = DataSource() | |
| >>> | |
| >>> # Open a remote file. | |
| >>> # DataSource downloads the file, stores it locally in: | |
| >>> # './www.google.com/index.html' | |
| >>> # opens the file and returns a file object. | |
| >>> fp = ds.open('http://www.google.com/') # doctest: +SKIP | |
| >>> | |
| >>> # Use the file as you normally would | |
| >>> fp.read() # doctest: +SKIP | |
| >>> fp.close() # doctest: +SKIP | |
| """ | |

import os
import io

from numpy.core.overrides import set_module


_open = open


def _check_mode(mode, encoding, newline):
    """Check mode and that encoding and newline are compatible.

    Parameters
    ----------
    mode : str
        File open mode.
    encoding : str
        File encoding.
    newline : str
        Newline for text files.

    """
    if "t" in mode:
        if "b" in mode:
            raise ValueError("Invalid mode: %r" % (mode,))
    else:
        if encoding is not None:
            raise ValueError("Argument 'encoding' not supported in binary mode")
        if newline is not None:
            raise ValueError("Argument 'newline' not supported in binary mode")
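

# A minimal, illustrative sketch (not called anywhere in this module) showing
# which argument combinations ``_check_mode`` accepts; the mode strings below
# are arbitrary examples.
def _check_mode_examples():
    # Text mode may carry an encoding and a newline.
    _check_mode('rt', encoding='utf-8', newline='\n')
    # Binary mode must not: both calls below raise ValueError.
    for bad_kwargs in ({'encoding': 'utf-8', 'newline': None},
                       {'encoding': None, 'newline': '\n'}):
        try:
            _check_mode('rb', **bad_kwargs)
        except ValueError:
            pass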


# Using a class instead of a module-level dictionary
# to reduce the initial 'import numpy' overhead by
# deferring the import of lzma, bz2 and gzip until needed

# TODO: .zip support, .tar support?
class _FileOpeners:
    """
    Container for different methods to open (un-)compressed files.

    `_FileOpeners` contains a dictionary that holds one method for each
    supported file format.  Attribute lookup is implemented in such a way
    that an instance of `_FileOpeners` itself can be indexed with the keys
    of that dictionary.  Currently uncompressed files as well as files
    compressed with ``gzip``, ``bz2`` or ``xz`` compression are supported.

    Notes
    -----
    `_file_openers`, an instance of `_FileOpeners`, is made available for
    use in the `_datasource` module.

    Examples
    --------
    >>> import gzip
    >>> np.lib._datasource._file_openers.keys()
    [None, '.bz2', '.gz', '.xz', '.lzma']
    >>> np.lib._datasource._file_openers['.gz'] is gzip.open
    True

    """

    def __init__(self):
        self._loaded = False
        self._file_openers = {None: io.open}

    def _load(self):
        if self._loaded:
            return
        try:
            import bz2
            self._file_openers[".bz2"] = bz2.open
        except ImportError:
            pass
        try:
            import gzip
            self._file_openers[".gz"] = gzip.open
        except ImportError:
            pass
        try:
            import lzma
            self._file_openers[".xz"] = lzma.open
            self._file_openers[".lzma"] = lzma.open
        except (ImportError, AttributeError):
            # There are incompatible backports of lzma that do not have the
            # lzma.open attribute, so catch that as well as ImportError.
            pass
        self._loaded = True

    def keys(self):
        """
        Return the keys of currently supported file openers.

        Parameters
        ----------
        None

        Returns
        -------
        keys : list
            The keys are None for uncompressed files and the file extension
            strings (i.e. ``'.gz'``, ``'.xz'``) for supported compression
            methods.

        """
        self._load()
        return list(self._file_openers.keys())

    def __getitem__(self, key):
        self._load()
        return self._file_openers[key]

_file_openers = _FileOpeners()


def open(path, mode='r', destpath=os.curdir, encoding=None, newline=None):
    """
    Open `path` with `mode` and return the file object.

    If ``path`` is an URL, it will be downloaded, stored in the
    `DataSource` `destpath` directory and opened from there.

    Parameters
    ----------
    path : str
        Local file path or URL to open.
    mode : str, optional
        Mode to open `path`.  Mode 'r' for reading, 'w' for writing, 'a' to
        append.  Available modes depend on the type of object specified by
        path.  Default is 'r'.
    destpath : str, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.
    encoding : {None, str}, optional
        Open text file with given encoding.  The default encoding will be
        what `io.open` uses.
    newline : {None, str}, optional
        Newline to use when reading text file.

    Returns
    -------
    out : file object
        The opened file.

    Notes
    -----
    This is a convenience function that instantiates a `DataSource` and
    returns the file object from ``DataSource.open(path)``.

    """

    ds = DataSource(destpath)
    return ds.open(path, mode, encoding=encoding, newline=newline)
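

# A minimal, illustrative sketch (not called anywhere in this module) of the
# convenience function above.  The directory and file names are arbitrary;
# for a plain local path the file is opened in place, only URLs are copied
# into ``destpath``.
def _open_example():
    import tempfile
    with tempfile.TemporaryDirectory() as tmpdir:
        fname = os.path.join(tmpdir, 'data.txt')
        with _open(fname, 'w') as f:
            f.write('1 2 3\n')
        with open(fname, mode='r', destpath=tmpdir) as fp:
            return fp.read()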


@set_module('numpy')
class DataSource:
| """ | |
| DataSource(destpath='.') | |
| A generic data source file (file, http, ftp, ...). | |
| DataSources can be local files or remote files/URLs. The files may | |
| also be compressed or uncompressed. DataSource hides some of the | |
| low-level details of downloading the file, allowing you to simply pass | |
| in a valid file path (or URL) and obtain a file object. | |
| Parameters | |
| ---------- | |
| destpath : str or None, optional | |
| Path to the directory where the source file gets downloaded to for | |
| use. If `destpath` is None, a temporary directory will be created. | |
| The default path is the current directory. | |
| Notes | |
| ----- | |
| URLs require a scheme string (``http://``) to be used, without it they | |
| will fail:: | |
| >>> repos = np.DataSource() | |
| >>> repos.exists('www.google.com/index.html') | |
| False | |
| >>> repos.exists('http://www.google.com/index.html') | |
| True | |
| Temporary directories are deleted when the DataSource is deleted. | |
| Examples | |
| -------- | |
| :: | |
| >>> ds = np.DataSource('/home/guido') | |
| >>> urlname = 'http://www.google.com/' | |
| >>> gfile = ds.open('http://www.google.com/') | |
| >>> ds.abspath(urlname) | |
| '/home/guido/www.google.com/index.html' | |
| >>> ds = np.DataSource(None) # use with temporary file | |
| >>> ds.open('/home/guido/foobar.txt') | |
| <open file '/home/guido.foobar.txt', mode 'r' at 0x91d4430> | |
| >>> ds.abspath('/home/guido/foobar.txt') | |
| '/tmp/.../home/guido/foobar.txt' | |
| """ | |

    def __init__(self, destpath=os.curdir):
        """Create a DataSource with a local path at destpath."""
        if destpath:
            self._destpath = os.path.abspath(destpath)
            self._istmpdest = False
        else:
            import tempfile  # deferring import to improve startup time
            self._destpath = tempfile.mkdtemp()
            self._istmpdest = True

    def __del__(self):
        # Remove temp directories
        if hasattr(self, '_istmpdest') and self._istmpdest:
            import shutil
            shutil.rmtree(self._destpath)

    def _iszip(self, filename):
        """Test if the filename ends in a supported compression extension
        (and can therefore be handled by `_file_openers`).

        """
        fname, ext = os.path.splitext(filename)
        return ext in _file_openers.keys()

    def _iswritemode(self, mode):
        """Test if the given mode will open a file for writing."""

        # Currently only used to test the bz2 files.
        _writemodes = ("w", "+")
        for c in mode:
            if c in _writemodes:
                return True
        return False

    def _splitzipext(self, filename):
        """Split zip extension from filename and return filename.

        Returns
        -------
        base, zip_ext : {tuple}

        """
        if self._iszip(filename):
            return os.path.splitext(filename)
        else:
            return filename, None

    def _possible_names(self, filename):
        """Return a list containing compressed filename variations."""
        names = [filename]
        if not self._iszip(filename):
            for zipext in _file_openers.keys():
                if zipext:
                    names.append(filename+zipext)
        return names
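
    # Illustrative example, assuming bz2, gzip and lzma are all importable
    # (``ds`` stands for any DataSource instance):
    #
    #     >>> ds._possible_names('data.txt')
    #     ['data.txt', 'data.txt.bz2', 'data.txt.gz', 'data.txt.xz', 'data.txt.lzma']
    #     >>> ds._possible_names('data.txt.gz')    # already compressed
    #     ['data.txt.gz']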

    def _isurl(self, path):
        """Test if path is a net location.  Tests the scheme and netloc."""

        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # BUG : URLs require a scheme string ('http://') to be used.
        #       www.google.com will fail.
        #       Should we prepend the scheme for those that don't have it and
        #       test that also?  Similar to the way we append .gz and test
        #       for compressed versions of files.

        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        return bool(scheme and netloc)
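
    # Illustrative example of why the scheme is required (``ds`` stands for
    # any DataSource instance):
    #
    #     >>> ds._isurl('http://www.google.com/index.html')
    #     True
    #     >>> ds._isurl('www.google.com/index.html')    # no scheme
    #     False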

    def _cache(self, path):
        """Cache the file specified by path.

        Creates a copy of the file in the datasource cache.

        """
        # We import these here because importing them is slow and
        # a significant fraction of numpy's total import time.
        import shutil
        from urllib.request import urlopen
        from urllib.error import URLError

        upath = self.abspath(path)

        # ensure directory exists
        if not os.path.exists(os.path.dirname(upath)):
            os.makedirs(os.path.dirname(upath))

        # TODO: Doesn't handle compressed files!
        if self._isurl(path):
            with urlopen(path) as openedurl:
                with _open(upath, 'wb') as f:
                    shutil.copyfileobj(openedurl, f)
        else:
            shutil.copyfile(path, upath)
        return upath

    def _findfile(self, path):
        """Searches for ``path`` and returns full path if found.

        If path is an URL, _findfile will cache a local copy and return the
        path to the cached file.  If path is a local file, _findfile will
        return a path to that local file.

        The search will include possible compressed versions of the file
        and return the first occurrence found.

        """

        # Build list of possible local file paths
        if not self._isurl(path):
            # Valid local paths
            filelist = self._possible_names(path)
            # Paths in self._destpath
            filelist += self._possible_names(self.abspath(path))
        else:
            # Cached URLs in self._destpath
            filelist = self._possible_names(self.abspath(path))
            # Remote URLs
            filelist = filelist + self._possible_names(path)

        for name in filelist:
            if self.exists(name):
                if self._isurl(name):
                    name = self._cache(name)
                return name
        return None
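
    # Search order, illustrated for a hypothetical DataSource with
    # ``destpath='/tmp/cache'``: a previously cached copy (or a compressed
    # variant of it) is preferred over downloading again.
    #
    #     >>> ds._findfile('http://example.com/data.txt')
    #     # checks, in order:
    #     #   /tmp/cache/example.com/data.txt     (+ .bz2/.gz/.xz/.lzma)
    #     #   http://example.com/data.txt         (+ .bz2/.gz/.xz/.lzma)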

    def abspath(self, path):
        """
        Return absolute path of file in the DataSource directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        Notes
        -----
        The functionality is based on `os.path.abspath`.

        """
        # We do this here to reduce the 'import numpy' initial import time.
        from urllib.parse import urlparse

        # TODO:  This should be more robust.  Handles case where path includes
        #        the destpath, but not other sub-paths.  Failing case:
        #        path = /home/guido/datafile.txt
        #        destpath = /home/alex/
        #        upath = self.abspath(path)
        #        upath == '/home/alex/home/guido/datafile.txt'

        # handle case where path includes self._destpath
        splitpath = path.split(self._destpath, 2)
        if len(splitpath) > 1:
            path = splitpath[1]
        scheme, netloc, upath, uparams, uquery, ufrag = urlparse(path)
        netloc = self._sanitize_relative_path(netloc)
        upath = self._sanitize_relative_path(upath)
        return os.path.join(self._destpath, netloc, upath)

    def _sanitize_relative_path(self, path):
        """Return a sanitised relative path for which
        os.path.abspath(os.path.join(base, path)).startswith(base)
        """
        last = None
        path = os.path.normpath(path)
        while path != last:
            last = path
            # Note: os.path.join treats '/' as os.sep on Windows
            path = path.lstrip(os.sep).lstrip('/')
            path = path.lstrip(os.pardir).lstrip('..')
            drive, path = os.path.splitdrive(path)  # for Windows
        return path
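
    # Illustrative example (on a POSIX system) of how leading separators and
    # parent-directory references are stripped, so the result can never
    # escape ``self._destpath`` (``ds`` stands for any DataSource instance):
    #
    #     >>> ds._sanitize_relative_path('../../etc/passwd')
    #     'etc/passwd'
    #     >>> ds._sanitize_relative_path('/var/log/syslog')
    #     'var/log/syslog'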

    def exists(self, path):
        """
        Test if path exists.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """

        # First test for local path
        if os.path.exists(path):
            return True

        # We import this here because importing urllib is slow and
        # a significant fraction of numpy's total import time.
        from urllib.request import urlopen
        from urllib.error import URLError

        # Test cached url
        upath = self.abspath(path)
        if os.path.exists(upath):
            return True

        # Test remote url
        if self._isurl(path):
            try:
                netfile = urlopen(path)
                netfile.close()
                del netfile
                return True
            except URLError:
                return False
        return False

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object.

        If `path` is an URL, it will be downloaded, stored in the
        `DataSource` directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append.  Available modes depend on the type of object
            specified by `path`.  Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding.  The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """

        # TODO: There is no support for opening a file for writing which
        #       doesn't exist yet (creating a file).  Should there be?

        # TODO: Add a ``subdir`` parameter for specifying the subdirectory
        #       used to store URLs in self._destpath.

        if self._isurl(path) and self._iswritemode(mode):
            raise ValueError("URLs are not writeable")

        # NOTE: _findfile will fail on a new file opened for writing.
        found = self._findfile(path)
        if found:
            _fname, ext = self._splitzipext(found)
            if ext == '.bz2':
                # bz2 does not support '+' modes; drop it.
                mode = mode.replace("+", "")
            return _file_openers[ext](found, mode=mode,
                                      encoding=encoding, newline=newline)
        else:
            raise IOError("%s not found." % path)
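

# A minimal, illustrative sketch (not called anywhere in this module) of the
# DataSource workflow with a purely local file; directory and file names are
# arbitrary.
def _datasource_example():
    import tempfile
    with tempfile.TemporaryDirectory() as tmpdir:
        fname = os.path.join(tmpdir, 'foobar.txt')
        with _open(fname, 'w') as f:
            f.write('hello\n')
        ds = DataSource(tmpdir)
        assert ds.exists(fname)
        # abspath maps the file into the DataSource destination directory.
        assert ds.abspath(fname).startswith(tmpdir)
        with ds.open(fname) as fp:
            return fp.read()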


class Repository(DataSource):
    """
    Repository(baseurl, destpath='.')

    A data repository where multiple DataSources share a base
    URL/directory.

    `Repository` extends `DataSource` by prepending a base URL (or
    directory) to all the files it handles.  Use `Repository` when you will
    be working with multiple files from one base URL.  Initialize
    `Repository` with the base URL, then refer to each file by its filename
    only.

    Parameters
    ----------
    baseurl : str
        Path to the local directory or remote location that contains the
        data files.
    destpath : str or None, optional
        Path to the directory where the source file gets downloaded to for
        use.  If `destpath` is None, a temporary directory will be created.
        The default path is the current directory.

    Examples
    --------
    To analyze all files in the repository, do something like this
    (note: this is not self-contained code)::

        >>> repos = np.lib._datasource.Repository('/home/user/data/dir/')
        >>> for filename in filelist:
        ...     fp = repos.open(filename)
        ...     fp.analyze()
        ...     fp.close()

    Similarly you could use a URL for a repository::

        >>> repos = np.lib._datasource.Repository('http://www.xyz.edu/data')

    """

    def __init__(self, baseurl, destpath=os.curdir):
        """Create a Repository with a shared url or directory of baseurl."""
        DataSource.__init__(self, destpath=destpath)
        self._baseurl = baseurl

    def __del__(self):
        DataSource.__del__(self)

    def _fullpath(self, path):
        """Return complete path for path.  Prepends baseurl if necessary."""
        splitpath = path.split(self._baseurl, 2)
        if len(splitpath) == 1:
            result = os.path.join(self._baseurl, path)
        else:
            result = path    # path contains baseurl already
        return result

    def _findfile(self, path):
        """Extend DataSource method to prepend baseurl to ``path``."""
        return DataSource._findfile(self, self._fullpath(path))

    def abspath(self, path):
        """
        Return absolute path of file in the Repository directory.

        If `path` is an URL, then `abspath` will return either the location
        the file exists locally or the location it would exist when opened
        using the `open` method.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.  This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : str
            Complete path, including the `DataSource` destination directory.

        """
        return DataSource.abspath(self, self._fullpath(path))

    def exists(self, path):
        """
        Test if path exists prepending Repository base URL to path.

        Test if `path` exists as (and in this order):

        - a local file.
        - a remote URL that has been downloaded and stored locally in the
          `DataSource` directory.
        - a remote URL that has not been downloaded, but is valid and
          accessible.

        Parameters
        ----------
        path : str
            Can be a local file or a remote URL.  This may, but does not
            have to, include the `baseurl` with which the `Repository` was
            initialized.

        Returns
        -------
        out : bool
            True if `path` exists.

        Notes
        -----
        When `path` is an URL, `exists` will return True if it's either
        stored locally in the `DataSource` directory, or is a valid remote
        URL.  `DataSource` does not discriminate between the two, the file
        is accessible if it exists in either location.

        """
        return DataSource.exists(self, self._fullpath(path))

    def open(self, path, mode='r', encoding=None, newline=None):
        """
        Open and return file-like object prepending Repository base URL.

        If `path` is an URL, it will be downloaded, stored in the
        DataSource directory and opened from there.

        Parameters
        ----------
        path : str
            Local file path or URL to open.  This may, but does not have to,
            include the `baseurl` with which the `Repository` was
            initialized.
        mode : {'r', 'w', 'a'}, optional
            Mode to open `path`.  Mode 'r' for reading, 'w' for writing,
            'a' to append.  Available modes depend on the type of object
            specified by `path`.  Default is 'r'.
        encoding : {None, str}, optional
            Open text file with given encoding.  The default encoding will be
            what `io.open` uses.
        newline : {None, str}, optional
            Newline to use when reading text file.

        Returns
        -------
        out : file object
            File object.

        """
        return DataSource.open(self, self._fullpath(path), mode,
                               encoding=encoding, newline=newline)

    def listdir(self):
        """
        List files in the source Repository.

        Returns
        -------
        files : list of str
            List of file names (not containing a directory part).

        Notes
        -----
        Does not currently work for remote repositories.

        """
        if self._isurl(self._baseurl):
            raise NotImplementedError(
                  "Directory listing of URLs is not supported yet.")
        else:
            return os.listdir(self._baseurl)
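

# A minimal, illustrative sketch (not called anywhere in this module) of
# Repository with a local base directory; directory and file names are
# arbitrary.
def _repository_example():
    import tempfile
    with tempfile.TemporaryDirectory() as basedir, \
            tempfile.TemporaryDirectory() as destdir:
        with _open(os.path.join(basedir, 'data.txt'), 'w') as f:
            f.write('1 2 3\n')
        repos = Repository(basedir, destpath=destdir)
        # Files are referred to by name only; the base directory (or URL)
        # is prepended automatically.
        assert repos.listdir() == ['data.txt']
        with repos.open('data.txt') as fp:
            return fp.read()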