diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml new file mode 100644 index 0000000..42a6601 --- /dev/null +++ b/.github/workflows/ci.yml @@ -0,0 +1,30 @@ +name: CI + +on: [push, pull_request] + +jobs: + + test: + runs-on: ubuntu-latest + env: + COV_MIN: 50 + + strategy: + matrix: + py_version: ["3.11", "3.12"] + + steps: + - uses: actions/checkout@v5 + - name: Set up Python ${{ matrix.py_version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.py_version }} + + - name: Install dsprofile + run: | + python -m pip install --upgrade pip + python -m pip install -e .[test] + + - name: Test with pytest + working-directory: tests + run: pytest --cov=dsprofile --cov-fail-under=$COV_MIN diff --git a/README.md b/README.md index 4db554f..aaff801 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,11 @@ -# NetCDF Metadata +![ci-workflow](https://github.com/eScienceLab/dsprofile/workflows/CI/badge.svg) -## Overview +# Dataset Profile -A utility to describe the structure of NetCDF4 datasets. +## Overview -Reads a NetCDF4 file and reports the group structure and information -about any dimensions, variables, and attributes that are defined. +A utility to describe the structure of datasets in netCDF, GeoTiff, +and ESRI Shapefile format. ## Installation @@ -19,27 +19,45 @@ The optional test suite may be installed and run with: ```bash $ python -m pip install .[test] -$ pytest --cov=ncmetadata tests +$ pytest --cov=dsprofile tests ``` ## Usage ```bash -usage: ncmetadata [-h] [-o {category,group}] [-e ,,...] [-m] [-d] filename +usage: dsprofile [-h] {netcdf,geotiff,shape} ... 
+ +Describes datasets in a variety of formats + +options: + -h, --help show this help message and exit + +Dataset formats: + {netcdf,geotiff,shape} + netcdf Extracts metadata from netCDF4 files + geotiff Extracts metadata from GeoTIFF files + shape Extracts metadata from ESRI Shape files +``` -Extracts metadata from netCDF4 files +## NetCDF Options + +Reads a netCDF4 file and reports the group structure and information +about any dimensions, variables, and attributes that are defined. + +```bash +usage: dsprofile netcdf [-h] [-o {category,group}] [-e ,,...] [-m] [-d] filename positional arguments: filename options: - -h, --help show this help message and exit - -o {category,group} --order-by {category,group} - (default group) + -h, --help show this help message and exit + -o {category,group}, --order-by {category,group} + (default group) -e ,,..., --exclude-groups ,,... - Exclude each of the named arguments - -m, --omit-metadata Output only netCDF file contents, not file metadata - -d, --omit-digest Do not include a hash digest in file metadata + Exclude each of the named arguments + -m, --omit-metadata Output only netCDF file contents, not file metadata + -d, --omit-digest Do not include a hash digest in file metadata ``` The `--order-by` option allows the resulting output to be arranged in one of two ways: @@ -53,11 +71,52 @@ The `--omit-digest` option prevents calculation of a SHA256 hash for the process This may be desirable for very large files or test workflows to avoid the potentially time-consuming hashing operation. -## Example +### NetCDF Example For example, to report on the contents of the netCDF4 file `test.nc` using the default output options... 
```bash -$ ncmetadata test.nc +$ dsprofile netcdf test.nc +``` + +## GeoTiff Options + +```bash +usage: dsprofile geotiff [-h] [-m] [-d] filename + +positional arguments: + filename + +options: + -h, --help show this help message and exit + -m, --omit-metadata Output only GeoTIFF file contents, not file metadata + -d, --omit-digest Do not include a hash digest in file metadata +``` + +## ESRI Shapefile Options + +```bash +usage: dsprofile shape [-h] [-m] [-d] filename + +positional arguments: + filename + +options: + -h, --help show this help message and exit + -m, --omit-metadata Output only Shape file contents, not file metadata + -d, --omit-digest Do not include a hash digest in file metadata +``` + +A Shapefile may be read by opening any of its components, for example... + +```bash +$ dsprofile shape shapefile.shp +``` +...is equivalent to... + +```bash +$ dsprofile shape shapefile.dbf ``` +Note however that where a hex digest of a hash is included in the output, +this will refer only to the file provided as a command-line argument. 
diff --git a/ncmetadata/__init__.py b/dsprofile/__init__.py similarity index 100% rename from ncmetadata/__init__.py rename to dsprofile/__init__.py diff --git a/dsprofile/lib/__init__.py b/dsprofile/lib/__init__.py new file mode 100644 index 0000000..1b3d8bf --- /dev/null +++ b/dsprofile/lib/__init__.py @@ -0,0 +1,9 @@ +from .reader import ( # noqa: F401 + Reader, + reader_type_map, + make_reader +) + +from .netcdf import NetCDFReader # noqa: F401 +from .tiff import GeoTIFFReader # noqa: F401 +from .shape import ShapefileReader # noqa: F401 diff --git a/dsprofile/lib/netcdf.py b/dsprofile/lib/netcdf.py new file mode 100644 index 0000000..610a137 --- /dev/null +++ b/dsprofile/lib/netcdf.py @@ -0,0 +1,207 @@ +import pathlib +import sys +import weakref + +from collections.abc import Sequence + +from dsprofile.lib.reader import Reader + +import netCDF4 as nc + + +class NetCDFReader(Reader): + """ + Defines a reader type for NetCDF4-based datasets. + Note that the underlying library may include support + for other (and non-HDF based) NetCDF formats, but this assumes + the availability of NetCDF4 primitives. 
+ """ + + format = "netcdf" + + def __init__(self, filename, order_by="group", exclude=None): + self.ds = self.__class__.read_dataset(filename) + self._finalizer = weakref.finalize(self, self.finalize_close, self.ds) + self.order_by = order_by + # Note that the order is significant here + # as str is a Sequence type + if not exclude: + self.exclude_groups = [] + elif isinstance(exclude, str): + self.exclude_groups = [exclude] + elif issubclass(type(exclude), Sequence): + self.exclude_groups = exclude + + @staticmethod + def read_dataset(filename): + """ + Handle OSError, PermissionError, FileNotFoundError neatly + Inform neatly for non-netCDF4 files + Allow all other exceptions to raise unhandled + """ + + try: + ds = nc.Dataset(filename, 'r') + except (OSError, PermissionError, FileNotFoundError) as e: + print(f"{e.strerror} for file '{filename}'", file=sys.stderr) + sys.exit(1) + + if ds.data_model != "NETCDF4": + print(f"File '{filename}' has format '{ds.data_model}', " + f"not 'NETCDF4' as required", file=sys.stderr) + ds.close() + sys.exit(1) + + return ds + + @staticmethod + def finalize_close(ncdf): + """ + An out-of-scope handler (usually) invoked by weakref.finalize + which should ensure that any resources acquired by this + instance are correctly returned before GC. 
+ """ + if isinstance(ncdf, nc.Dataset) and ncdf.isopen(): + ncdf.close() + + def close(self): + if self._finalizer.alive: + self._finalizer() + + def walk_groups_breadth_first(self, ds=None): + if not ds: + ds = self.ds + yield (g for g in ds.groups.values() if g.path not in self.exclude_groups) + for group in ds.groups.values(): + if group.path in self.exclude_groups: + continue + yield from self.walk_groups_breadth_first(group) + + def walk_groups_depth_first(self, ds=None): + if not ds: + ds = self.ds + for group in ds.groups.values(): + if group.path in self.exclude_groups: + continue + yield from self.walk_groups_depth_first(group) + yield (g for g in ds.groups.values() if g.path not in self.exclude_groups) + + walk_func_map = { + "breadth": walk_groups_breadth_first, + "depth": walk_groups_depth_first + } + + def walk_groups(self, ds=None, order="breadth"): + return self.walk_func_map[order](self, ds) + + def gather_by_group(self): + """ + A categorisation of dimensions, variables, and + attributes defined in the Dataset argument, + ordered by the group to which they belong. + """ + dims = self.describe_dimensions() + ncvars = self.describe_variables() + attrs = self.describe_attributes() + by_group = {"/": { + "dimensions": dims["/"], + "variables": ncvars["/"], + "attributes": attrs["/"] + } + } + for groups in self.walk_groups(): + for group in groups: + by_group[group.path] = { + "dimensions": dims[group.path], + "variables": ncvars[group.path], + "attributes": attrs[group.path] + } + + return by_group + + def gather_by_type(self): + """ + A categorisation of dimensions, variables, and + attributes defined in the Dataset argument, + ordered by type. 
+ """ + return { + "dimensions": self.describe_dimensions(), + "variables": self.describe_variables(), + "attributes": self.describe_attributes() + } + + process_func_map = { + "category": gather_by_type, + "group": gather_by_group + } + + def describe_dimensions(self): + dimensions = {} + + dimensions['/'] = {d.name: {"size": d.size} for d in self.ds.dimensions.values()} + for groups in self.walk_groups(): + for group in groups: + dimensions[group.path] = {d.name: {"size": d.size} for d in group.dimensions.values()} + + return dimensions + + def describe_variables(self): + variables = {} + variables['/'] = {v.name: {"dtype": v.dtype.name, + "dimensions": v.dimensions, + "fill_value": str(v.get_fill_value())} + for v in self.ds.variables.values()} + for groups in self.walk_groups(): + for group in groups: + variables[group.path] = {v.name: {"dtype": v.dtype.name, + "dimensions": v.dimensions, + "fill_value": str(v.get_fill_value())} + for v in group.variables.values()} + return variables + + def describe_attributes(self): + attrs = {} + attrs['/'] = {"group": [a for a in self.ds.ncattrs()], + "vars": {v.name: [a for a in v.ncattrs()] for v in self.ds.variables.values()} + } + for groups in self.walk_groups(): + for group in groups: + attrs[group.path] = {"group": [a for a in group.ncattrs()], + "vars": {v.name: [a for a in v.ncattrs()] for v in group.variables.values()} + } + return attrs + + def process(self): + return self.process_func_map[self.order_by](self) + + @classmethod + def build_subparser(cls, sp): + parser = sp.add_parser(cls.format, + help="Extracts metadata from netCDF4 files") + parser.add_argument("filename", type=pathlib.Path) + parser.add_argument("-o", "--order-by", choices=["category", "group"], + default="group", help="(default group)") + parser.add_argument("-e", "--exclude-groups", metavar=",,...", + help="Exclude each of the named arguments") + parser.add_argument("-m", "--omit-metadata", action="store_true", + help="Output only netCDF file 
contents, not file metadata") + parser.add_argument("-d", "--omit-digest", action="store_true", + help="Do not include a hash digest in file metadata") + return parser + + @classmethod + def handle_args(cls, args): + if args.filename.is_dir(): + print(f"A valid file is required not directory '{args.filename}'", + file=sys.stderr) + sys.exit(1) + + exclude = [] + if hasattr(args, "exclude_groups"): + exclude = args.exclude_groups.split(',') if args.exclude_groups else [] + + ctor_args = [args.filename, args.order_by, exclude] + ctor_kwargs = {} + + return ctor_args, ctor_kwargs diff --git a/dsprofile/lib/reader.py b/dsprofile/lib/reader.py new file mode 100644 index 0000000..f381395 --- /dev/null +++ b/dsprofile/lib/reader.py @@ -0,0 +1,81 @@ +from abc import ( + ABC, + abstractmethod +) + + +reader_type_map = {} + + +class Reader(ABC): + """ + An abstract base for all Reader types, each subclass of this implements + reader functionality for a particular dataset type. + Each subclass must define the attribute referred to by + `subclass_type_key` as a class attribute whose value indicates the + type of dataset handled by that concrete type. + """ + + subclass_type_key = "format" + + def __init_subclass__(cls, /, **kwargs): + """ + Derived types are validated to ensure they provide the attr + identified by `subclass_type_key` and then added to the + `reader_type_map` registry. + """ + super().__init_subclass__(**kwargs) + keyattr = __class__.subclass_type_key + reader_type = getattr(cls, keyattr, None) + if not reader_type or not isinstance(reader_type, str): + raise NotImplementedError(f"Reader subclass {cls.__qualname__} " + f"does not define a {keyattr} key") + reader_type_map[reader_type] = cls + + @classmethod + @abstractmethod + def build_subparser(cls, sp): + """ + Receives an argparse subparser argument and is responsible + for adding all type-specific command line arguments. 
+ """ + pass + + @classmethod + @abstractmethod + def handle_args(cls, args) -> tuple[list, dict]: + """ + Translates its argparse argument into the positional + and keyword arguments required to create an instance of this + type. + The returned tuple must consist of two elements: + 1. A list (or other sequence) of positional arguments + 2. A dict with str keys containing keyword arguments + These are subsequently passed to the type's constructor + to create an instance. + """ + pass + + @abstractmethod + def process(self) -> dict: + """ + Processes the dataset and returns a type-specific dict containing + the resulting metadata profile. + """ + pass + + +def make_reader(args): + """ + 1. Receives an argparse argument containing command-line + arguments + 2. Identifies the subtype required using the `command` attr which + must match an entry in the reader_type_map + 3. Translates the argparse arguments into the specific form required + by the constructor for the type identified in (2) above + 4. Returns an instance of that type constructed using these + arguments + """ + cls = reader_type_map[args.command] + ctor_args, ctor_kwargs = cls.handle_args(args) + return cls(*ctor_args, **ctor_kwargs) diff --git a/dsprofile/lib/shape.py b/dsprofile/lib/shape.py new file mode 100644 index 0000000..d54bcf4 --- /dev/null +++ b/dsprofile/lib/shape.py @@ -0,0 +1,94 @@ +import pathlib +import sys +import weakref + +import fiona +from fiona import Collection + +from dsprofile.lib.reader import Reader + + +class ShapefileReader(Reader): + """ + A Reader type for ESRI Shapefile datasets + """ + + format = "shape" + + def __init__(self, filename): + super().__init__() + self.shp = self.__class__.read_dataset(filename) + self._finalizer = weakref.finalize(self, self.finalize_close, self.shp) + + @staticmethod + def finalize_close(shpinst): + if not isinstance(shpinst, Collection): + return + + if not shpinst.closed: + shpinst.close() + + def close(self): + if self._finalizer.alive: + 
self._finalizer() + + @classmethod + def build_subparser(cls, sp): + parser = sp.add_parser(cls.format, + help="Extracts metadata from ESRI Shape files") + parser.add_argument("filename", type=pathlib.Path) + parser.add_argument("-m", "--omit-metadata", action="store_true", + help="Output only Shape file contents, not file metadata") + parser.add_argument("-d", "--omit-digest", action="store_true", + help="Do not include a hash digest in file metadata") + return parser + + @classmethod + def handle_args(cls, args): + if args.filename.is_dir(): + print(f"A valid file is required not directory '{args.filename}'", + file=sys.stderr) + sys.exit(1) + + ctor_args = [args.filename] + ctor_kwargs = {} + + return ctor_args, ctor_kwargs + + def process(self): + output = { + "bounds": self.shp.bounds, + "features": [] + } + + units, factor = self.shp.crs.units_factor + output["units"] = units + output["factor"] = factor + + auth = self.shp.crs.to_authority() + if auth is not None: + if len(auth) == 2: + registry, code = auth + output["crs"] = f"{registry}:{code}" + elif len(auth) == 1: + output["crs"] = str(auth) + + for feat in self.shp: + fdata = { + "type": feat.geometry.type, + "coordinates": len(feat.geometry.coordinates[0]), + "properties": {k: v for k, v in feat.properties.items()} + } + output["features"].append(fdata) + + return output + + @staticmethod + def read_dataset(filename): + try: + shp = fiona.open(filename, 'r') + except fiona.errors.DriverError as e: + print(f"Unable to read '{filename}': {e}", file=sys.stderr) + sys.exit(1) + + return shp diff --git a/dsprofile/lib/tiff.py b/dsprofile/lib/tiff.py new file mode 100644 index 0000000..d955390 --- /dev/null +++ b/dsprofile/lib/tiff.py @@ -0,0 +1,87 @@ +import pathlib +import sys +import weakref + +from dsprofile.lib.reader import Reader + +import rasterio as rio +from rasterio.errors import RasterioIOError + + +class GeoTIFFReader(Reader): + """ + A Reader instance to process GeoTiff format data. 
+ """ + + format = "geotiff" + + def __init__(self, filename): + super().__init__() + try: + self.tif = rio.open(filename, 'r') + except RasterioIOError as e: + print(f"Unable to read dataset '{filename}': {e}", file=sys.stderr) + sys.exit(1) + + self._finalizer = weakref.finalize(self, self.finalize_close, self.tif) + + @staticmethod + def finalize_close(rioinst): + if not isinstance(rioinst, rio.io.DatasetReader): + return + + if not rioinst.closed: + rioinst.close() + + def close(self): + if self._finalizer.alive: + self._finalizer() + + @classmethod + def build_subparser(cls, sp): + parser = sp.add_parser(cls.format, + help="Extracts metadata from GeoTIFF files") + parser.add_argument("filename", type=pathlib.Path) + parser.add_argument("-m", "--omit-metadata", action="store_true", + help="Output only GeoTIFF file contents, not file metadata") + parser.add_argument("-d", "--omit-digest", action="store_true", + help="Do not include a hash digest in file metadata") + return parser + + @classmethod + def handle_args(cls, args): + if args.filename.is_dir(): + print(f"A valid file is required not directory '{args.filename}'", + file=sys.stderr) + sys.exit(1) + + ctor_args = [args.filename] + ctor_kwargs = {} + + return ctor_args, ctor_kwargs + + def process(self): + output = { + "shape": { + "width": self.tif.width, + "height": self.tif.height + }, + "bands": {idx: dtype for idx, dtype in zip(self.tif.indexes, self.tif.dtypes)}, + "bounds": { + "left": self.tif.bounds.left, + "bottom": self.tif.bounds.bottom, + "right": self.tif.bounds.right, + "top": self.tif.bounds.top + }, + "units": self.tif.crs.linear_units, + "lin_step": self.tif.res + } + auth = self.tif.crs.to_authority() + if auth is not None: + if len(auth) == 2: + registry, code = auth + output["crs"] = f"{registry}:{code}" + elif len(auth) == 1: + output["crs"] = str(auth) + + return output diff --git a/dsprofile/main.py b/dsprofile/main.py new file mode 100644 index 0000000..d2d0a7a --- /dev/null +++ 
b/dsprofile/main.py @@ -0,0 +1,58 @@ +import argparse +import json +import sys + + +from dsprofile.lib import ( + reader_type_map, + make_reader +) + +from dsprofile.util import make_file_profile + + +def parse_args(argv): + """ + Build an argparse environment for the package and any defined + Reader subclasses. + Note that each Reader subtype must implement cls.build_subparser + to create any type-specific cli arguments it requires. + """ + parser = argparse.ArgumentParser( + prog="dsprofile", + description="Describes datasets in a variety of formats", + epilog="For more information, see github.com/eScienceLab/dsprofile" + ) + + sp = parser.add_subparsers(title="Dataset formats", + dest="command") + # Delegate per-type subparser to each defined sub-type... + for cls in reader_type_map.values(): + cls.build_subparser(sp) + + if len(argv) <= 1: # no arguments beyond the program name: show help + parser.print_help() + parser.exit(0) + + args = parser.parse_args(argv[1:]) # parse the argv we were given; argv[0] is the program name + return args + + +def handle_args(args): + output = {} + if hasattr(args, "omit_metadata") and not args.omit_metadata: + output["metadata"] = make_file_profile(args) + + inst = make_reader(args) + output["content"] = inst.process() + print(json.dumps(output, indent=2)) + + +def main(): + args = parse_args(sys.argv) + handle_args(args) + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/ncmetadata/util.py b/dsprofile/util.py similarity index 59% rename from ncmetadata/util.py rename to dsprofile/util.py index 84bbd25..84c6e11 100644 --- a/ncmetadata/util.py +++ b/dsprofile/util.py @@ -1,35 +1,25 @@ +""" + General (ie. not type-specific) utility functions used + by the dsprofile framework and readers. 
+""" import datetime import os import sys from importlib.metadata import version -import netCDF4 as nc - -def read_dataset(filename): +def make_file_profile(ctx) -> dict: """ - Handle OSError, PermissionError, FileNotFoundError neatly - Inform neatly for non-netCDF4 files - Allow all other exceptions to raise unhandled + Returns a summary of the file used as a command-line argument + and useful metadata about the execution environment. """ - - try: - ds = nc.Dataset(filename, 'r') - except (OSError, PermissionError, FileNotFoundError) as e: - print(f"{e.strerror} for file '{filename}'", file=sys.stderr) - sys.exit(1) - - if ds.data_model != "NETCDF4": - print(f"File '{filename}' has format '{ds.data_model}', " - f"not 'NETCDF4' as required", file=sys.stderr) - sys.exit(1) - - return ds - - -def make_file_profile(ctx): try: + """ + Note that where "--omit-metadata" is not provided, this operation + detects ENOENT and EPERM files *before* any type-specific + constructor in Reader types. + """ stat = os.stat(ctx.filename) except (OSError, PermissionError, FileNotFoundError) as e: print(f"{e.strerror} for file '{ctx.filename}'", file=sys.stderr) diff --git a/ncmetadata/main.py b/ncmetadata/main.py deleted file mode 100644 index 7bc8fef..0000000 --- a/ncmetadata/main.py +++ /dev/null @@ -1,53 +0,0 @@ -import argparse -import json -import pathlib -import sys - -from ncmetadata.reader import process_file -from ncmetadata.util import make_file_profile - - -def parse_args(argv): - parser = argparse.ArgumentParser( - prog="ncmetadata", - description="Extracts metadata from netCDF4 files", - epilog="TODO: attribution/repo/docs" - ) - - parser.add_argument("filename", type=pathlib.Path) - parser.add_argument("-o", "--order-by", choices=["category", "group"], - default="group", help="(default group)") - parser.add_argument("-e", "--exclude-groups", metavar=",,...", - help="Exclude each of the named arguments") - parser.add_argument("-m", "--omit-metadata", action="store_true", 
- help="Output only netCDF file contents, not file metadata") - parser.add_argument("-d", "--omit-digest", action="store_true", - help="Do not include a hash digest in file metadata") - - return parser.parse_args() - - -def handle_args(args): - if args.filename.is_dir(): - print(f"A valid file is required not directory '{args.filename}'", - file=sys.stderr) - sys.exit(1) - - output = {} - if not args.omit_metadata: - output["metadata"] = make_file_profile(args) - - exclude = args.exclude_groups.split(',') if args.exclude_groups else [] - output["content"] = process_file(args.filename, args.order_by, exclude) - - print(json.dumps(output, indent=2)) - - -def main(): - args = parse_args(sys.argv) - handle_args(args) - sys.exit(0) - - -if __name__ == "__main__": - main() diff --git a/ncmetadata/reader.py b/ncmetadata/reader.py deleted file mode 100644 index 1908f91..0000000 --- a/ncmetadata/reader.py +++ /dev/null @@ -1,110 +0,0 @@ -from ncmetadata.util import ( - read_dataset -) - -exclude_groups = [] - - - -def walk_groups_breadth_first(ds): - yield ds.groups.values() - for group in ds.groups.values(): - yield from walk_groups_breadth_first(group) - - -def walk_groups_depth_first(ds): - for group in ds.groups.values(): - yield from walk_groups_depth_first(group) - yield ds.groups.values() - - -def walk_groups_ordered(ds): - for group in ds.groups.values(): - if group.path in exclude_groups: - continue - yield from walk_groups_ordered(group) - yield ds - - -walk_func_map = { - "breadth": walk_groups_breadth_first, - "depth": walk_groups_depth_first, - "ordered": walk_groups_ordered -} - - -def walk_groups(ds, order="ordered"): - return walk_func_map[order](ds) - - -def describe_dimensions(ds): - dimensions = {} - - for group in walk_groups(ds): - dimensions[group.path] = {d.name: {"size": d.size} for d in group.dimensions.values()} - - return dimensions - - -def describe_variables(ds): - variables = {} - for group in walk_groups(ds): - variables[group.path] = {v.name: 
{"dtype": v.dtype.name, - "dimensions": v.dimensions, - "fill_value": str(v.get_fill_value())} - for v in group.variables.values()} - return variables - - -def describe_attributes(ds): - attrs = {} - for group in walk_groups(ds): - attrs[group.path] = {"group": [a for a in group.ncattrs()], - "vars": {v.name: [a for a in v.ncattrs()] for v in group.variables.values()} - } - - return attrs - - -def gather_by_group(ds): - """ - A categorisation of dimensions, variables, and - attributes defined in the Dataset argument, - ordered by the group to which they belong. - """ - dims = describe_dimensions(ds) - ncvars = describe_variables(ds) - attrs = describe_attributes(ds) - by_group = {} - for group in walk_groups(ds): - by_group[group.path] = { - "dimensions": dims[group.path], - "variables": ncvars[group.path], - "attributes": attrs[group.path] - } - - return by_group - - -def gather_by_type(ds): - """ - A categorisation of dimensions, variables, and - attributes defined in the Dataset argument, - ordered by type. 
- """ - return { - "dimensions": describe_dimensions(ds), - "variables": describe_variables(ds), - "attributes": describe_attributes(ds) - } - -process_func_map = { - "category": gather_by_type, - "group": gather_by_group -} - -def process_file(filename, order_by, exclude): - global exclude_groups - exclude_groups = exclude - ds = read_dataset(filename) - return process_func_map[order_by](ds) diff --git a/setup.py b/setup.py index a47aa0f..670ee49 100644 --- a/setup.py +++ b/setup.py @@ -17,17 +17,21 @@ setup( - name = "ncmetadata", - version = "0.1.0", - packages = find_packages(include=["ncmetadata", "ncmetadata.*"]), + name = "dsprofile", + version = "0.2.0", + packages = find_packages(include=["dsprofile", "dsprofile.*"]), install_requires = [ - "netCDF4" + "setuptools==68.1.2", # earthpy uses pkg_resources + "netCDF4", + "earthpy", # Includes GeoPandas, rasterio + "fiona", # ESRI Shapefile support + "pyproj" # CRS parsing ], extras_require = { "dev": dev_requires, "test": test_requires }, entry_points = { - "console_scripts": ["ncmetadata=ncmetadata.main:main"] + "console_scripts": ["dsprofile=dsprofile.main:main"] } ) diff --git a/tests/data/GeogToWGS84GeoKey5.tif b/tests/data/GeogToWGS84GeoKey5.tif new file mode 100755 index 0000000..e878c60 Binary files /dev/null and b/tests/data/GeogToWGS84GeoKey5.tif differ diff --git a/tests/data/SJER_crop2.dbf b/tests/data/SJER_crop2.dbf new file mode 100755 index 0000000..4e7c869 Binary files /dev/null and b/tests/data/SJER_crop2.dbf differ diff --git a/tests/data/SJER_crop2.prj b/tests/data/SJER_crop2.prj new file mode 100755 index 0000000..02487dd --- /dev/null +++ b/tests/data/SJER_crop2.prj @@ -0,0 +1 @@ 
+PROJCS["WGS_1984_UTM_Zone_11N",GEOGCS["GCS_WGS_1984",DATUM["D_WGS_1984",SPHEROID["WGS_1984",6378137,298.257223563]],PRIMEM["Greenwich",0],UNIT["Degree",0.017453292519943295]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",-117],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["Meter",1]] \ No newline at end of file diff --git a/tests/data/SJER_crop2.qpj b/tests/data/SJER_crop2.qpj new file mode 100755 index 0000000..e1e8714 --- /dev/null +++ b/tests/data/SJER_crop2.qpj @@ -0,0 +1 @@ +PROJCS["WGS 84 / UTM zone 11N",GEOGCS["WGS 84",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0,AUTHORITY["EPSG","8901"]],UNIT["degree",0.0174532925199433,AUTHORITY["EPSG","9122"]],AUTHORITY["EPSG","4326"]],PROJECTION["Transverse_Mercator"],PARAMETER["latitude_of_origin",0],PARAMETER["central_meridian",-117],PARAMETER["scale_factor",0.9996],PARAMETER["false_easting",500000],PARAMETER["false_northing",0],UNIT["metre",1,AUTHORITY["EPSG","9001"]],AXIS["Easting",EAST],AXIS["Northing",NORTH],AUTHORITY["EPSG","32611"]] diff --git a/tests/data/SJER_crop2.shp b/tests/data/SJER_crop2.shp new file mode 100755 index 0000000..7b2952d Binary files /dev/null and b/tests/data/SJER_crop2.shp differ diff --git a/tests/data/SJER_crop2.shx b/tests/data/SJER_crop2.shx new file mode 100755 index 0000000..8ae2d0a Binary files /dev/null and b/tests/data/SJER_crop2.shx differ diff --git a/tests/test_geotiff.py b/tests/test_geotiff.py new file mode 100644 index 0000000..18f0595 --- /dev/null +++ b/tests/test_geotiff.py @@ -0,0 +1,68 @@ +import os + +import pytest +import rasterio as rio + +from dsprofile.lib.tiff import GeoTIFFReader + + +TEST_DATA_PATH = os.getenv("DSPROFILE_TEST_DATA_PATH") + + +@pytest.fixture +def geotiff_test_file(request): + if TEST_DATA_PATH: + test_dir = TEST_DATA_PATH + else: + base_dir = 
os.path.dirname(request.module.__file__) + test_dir = os.path.join(base_dir, "data") + return { + "path": os.path.join(test_dir, "GeogToWGS84GeoKey5.tif"), + "meta": { + "shape": { + "width": 101, + "height": 101 + }, + "bands": {1: "uint8"}, + "bounds": { + "left": 8.999654601821101, + "bottom": 51.9999732301211, + "right": 9.0024601573789, + "top": 52.0027787856789 + } + } + } + + +class TestGeoTIFF: + def test_reader_instance(self, geotiff_test_file): + """ + Can an instance of the GeoTIFFReader be created + with the expected default attributes? + """ + r = GeoTIFFReader(geotiff_test_file["path"]) + assert r.format == GeoTIFFReader.format + + def test_dataset_fileops(self, geotiff_test_file): + """ + Can a GeoTIFF file be opened correctly, and + does the finalizer correctly close the file + when manually invoked? + """ + r = GeoTIFFReader(geotiff_test_file["path"]) + assert isinstance(r.tif, rio.io.DatasetReader) + assert r._finalizer.alive + tref = r.tif + assert not tref.closed # Our tiff file is open... + r.close() # ...the finalizer is invoked... + assert tref.closed # ...so the file must be closed + + def test_read_dataset(self, geotiff_test_file): + """ + Are the expected metadata values correctly + retrieved from the test GeoTIFF file? 
+ """ + r = GeoTIFFReader(geotiff_test_file["path"]) + data = r.process() + for k, v in geotiff_test_file["meta"].items(): + assert data[k] == v diff --git a/tests/test_groups.py b/tests/test_groups.py deleted file mode 100644 index 1b6b249..0000000 --- a/tests/test_groups.py +++ /dev/null @@ -1,59 +0,0 @@ -import os - -import pytest - -from ncmetadata.reader import ( - exclude_groups, - read_dataset, - walk_groups -) - - -TEST_DATA_PATH = os.getenv("NCDF_TEST_DATA_PATH", "tests/data") - -@pytest.fixture -def synthetic_test_file(): - return { - "path": os.path.join(TEST_DATA_PATH, "test.nc"), - "groups": ['/top01/nest_a/nest_a_01', - '/top01/nest_a/nest_a_02', - '/top01/nest_a', - '/top01/nest_b/nest_b_01', - '/top01/nest_b/nest_b_02', - '/top01/nest_b/nest_b_03', - '/top01/nest_b', - '/top01', - '/top02', - '/'] - } - - -class TestGroups: - def test_read_dataset(self, synthetic_test_file): - """ - Can a netCDF4 file be opened correctly? - """ - ds = read_dataset(synthetic_test_file["path"]) - - def test_walk_groups(self, synthetic_test_file): - """ - Are groups correctly identified and appear - in the expected order? - """ - ds = read_dataset(synthetic_test_file["path"]) - groupnames = [group.path for group in walk_groups(ds)] - for idx in range(len(groupnames)): - assert groupnames[idx] == synthetic_test_file["groups"][idx] - - def test_exclude_groups(self, synthetic_test_file): - """ - Are group paths excluded from the search - correctly omitted? 
- """ - ds = read_dataset(synthetic_test_file["path"]) - exclusion = "/top01/nest_b" - exclude_groups.append(exclusion) - groupnames = [group.path for group in walk_groups(ds)] - filtered_groups = [group for group in synthetic_test_file["groups"] if not group.startswith(exclusion)] - for idx in range(len(groupnames)): - assert groupnames[idx] == filtered_groups[idx] diff --git a/tests/test_netcdf.py b/tests/test_netcdf.py new file mode 100644 index 0000000..5107f04 --- /dev/null +++ b/tests/test_netcdf.py @@ -0,0 +1,71 @@ +import os + +import pytest + +from dsprofile.lib.netcdf import NetCDFReader + + +TEST_DATA_PATH = os.getenv("DSPROFILE_TEST_DATA_PATH") + + +@pytest.fixture +def synthetic_test_file(request): + if TEST_DATA_PATH: + test_dir = TEST_DATA_PATH + else: + base_dir = os.path.dirname(request.module.__file__) + test_dir = os.path.join(base_dir, "data") + return { + "path": os.path.join(test_dir, "test.nc"), + "groups": [ '/top01', + '/top02', + '/top01/nest_a', + '/top01/nest_b', + '/top01/nest_a/nest_a_01', + '/top01/nest_a/nest_a_02', + '/top01/nest_b/nest_b_01', + '/top01/nest_b/nest_b_02', + '/top01/nest_b/nest_b_03'] + } + + +class TestNetCDF: + def test_reader_instance(self, synthetic_test_file): + """ + Can an instance of the NetCDFReader be created + and does it have the correct defaults? + """ + r = NetCDFReader(synthetic_test_file["path"]) + assert r.format == NetCDFReader.format + assert r.order_by == "group" + assert r.exclude_groups == [] + + def test_read_dataset(self, synthetic_test_file): + """ + Can a netCDF4 file be opened correctly? + """ + ds = NetCDFReader.read_dataset(synthetic_test_file["path"]) + assert ds.data_model == "NETCDF4" + + def test_walk_groups(self, synthetic_test_file): + """ + Are groups correctly identified and appear + in the expected order? 
+ """ + r = NetCDFReader(synthetic_test_file["path"]) + r.process() + groupnames = [group.path for groups in r.walk_groups(r.ds) for group in groups] + for idx in range(len(groupnames)): + assert groupnames[idx] == synthetic_test_file["groups"][idx] + + def test_exclude_groups(self, synthetic_test_file): + """ + Are group paths excluded from the search + correctly omitted? + """ + exclusion = "/top01/nest_b" + r = NetCDFReader(synthetic_test_file["path"], exclude=exclusion) + groupnames = [group.path for groups in r.walk_groups(r.ds) for group in groups] + filtered_groups = [group for group in synthetic_test_file["groups"] if not group.startswith(exclusion)] + for idx in range(len(groupnames)): + assert groupnames[idx] == filtered_groups[idx] diff --git a/tests/test_setup.py b/tests/test_setup.py index 57298f6..a25cf27 100644 --- a/tests/test_setup.py +++ b/tests/test_setup.py @@ -1,11 +1,10 @@ -import os from packaging import version -import pytest - min_version_map = { - "netCDF4": version.parse("1.7.0") + "netCDF4": version.parse("1.7.0"), + "rasterio": version.parse("1.4.0"), + "fiona": version.parse("0.9.0") } @@ -18,4 +17,3 @@ def test_lib_versions(self): for libname, semver in min_version_map.items(): lib = importlib.import_module(libname) assert version.parse(lib.__version__) >= semver - diff --git a/tests/test_shapefile.py b/tests/test_shapefile.py new file mode 100644 index 0000000..4f9eed6 --- /dev/null +++ b/tests/test_shapefile.py @@ -0,0 +1,71 @@ +import os + +import pytest +from fiona import Collection +import numpy as np + +from dsprofile.lib import ShapefileReader + + +TEST_DATA_PATH = os.getenv("DSPROFILE_TEST_DATA_PATH") + + +@pytest.fixture +def shape_test_file(request): + if TEST_DATA_PATH: + test_dir = TEST_DATA_PATH + else: + base_dir = os.path.dirname(request.module.__file__) + test_dir = os.path.join(base_dir, "data") + return { + "path": os.path.join(test_dir, "SJER_crop2.shp"), + "meta": { + "bounds": [ + 255209.5107915717, + 
4108471.237186788, + 257532.73265945335, + 4110975.960763098 + ], + "features": [ + { + "type": "Polygon", + "coordinates": 7, + "properties": { + "id": 1 + } + } + ], + "units": "metre", + "factor": 1.0, + "crs": "EPSG:32611" + } + } + + +class TestShapefile: + def test_instance(self, shape_test_file): + """ + Can an instance of the ShapefileReader be created + with the expected default attributes? + """ + s = ShapefileReader(shape_test_file["path"]) + assert s.format == ShapefileReader.format + assert s.shp.driver.lower() == "esri shapefile" + + def test_read_dataset(self, shape_test_file): + """ + Are the expected metadata values correctly + retrieved from the test Shape file? + """ + r = ShapefileReader(shape_test_file["path"]) + assert isinstance(r.shp, Collection) + data = r.process() + + for key in shape_test_file["meta"]: + assert key in data + + assert np.allclose(data["bounds"], shape_test_file["meta"]["bounds"]) + assert len(data["features"]) == len(shape_test_file["meta"]["features"]) + direct_attrs = ("units", "factor", "crs") + for attr in direct_attrs: + assert data[attr] == shape_test_file["meta"][attr]