diff --git a/Changelog.rst b/Changelog.rst index f0cf3a3579..14723cf7a4 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -14,6 +14,12 @@ version NEXTVERSION * New keyword parameter to `cf.Field.derivative`: ``ignore_coordinate_units`` (https://github.com/NCAS-CMS/cf-python/issues/807) +* Allow access to netCDF-4 files in S3 object stores + (https://github.com/NCAS-CMS/cf-python/issues/712) +* New class `cf.H5netcdfArray` +* New class `cf.NetCDF4Array` +* New class `cf.CFAH5netcdfArray` +* New class `cf.CFANetCDF4Array` * Fix bug that sometimes puts an incorrect ``radian-1`` or ``radian-2`` in the returned units of the differential operator methods and functions @@ -27,6 +33,12 @@ version NEXTVERSION * Fix bug where `cf.normalize_slice` doesn't correctly handle certain cyclic slices (https://github.com/NCAS-CMS/cf-python/issues/774) +* New dependency: ``h5netcdf>=1.3.0`` +* New dependency: ``h5py>=3.10.0`` +* New dependency: ``s3fs>=2024.2.0`` +* Changed dependency: ``1.11.2.0<=cfdm<1.11.3.0`` +* Changed dependency: ``cfunits>=3.3.7`` + ---- @@ -141,6 +153,8 @@ version 3.16.0 * Changed dependency: ``1.11.0.0<=cfdm<1.11.1.0`` * New dependency: ``scipy>=1.10.0`` +---- + version 3.15.4 -------------- @@ -279,7 +293,7 @@ version 3.14.1 ---- -version 3.14.0 (*first Dask release*) +version 3.14.0 (*first Dask version*) ------------------------------------- **2023-01-31** @@ -314,7 +328,7 @@ version 3.14.0 (*first Dask release*) ---- -version 3.13.1 (*last LAMA release*) +version 3.13.1 (*last LAMA version*) ------------------------------------ **2022-10-17** diff --git a/cf/__init__.py b/cf/__init__.py index fe0ed8c07a..9e630d86ea 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -13,10 +13,17 @@ * read field constructs from netCDF, CDL, PP and UM datasets, +* read field constructs and domain constructs from netCDF, CDL, PP and + UM datasets with a choice of netCDF backends, + +* read files from OPeNDAP servers and S3 object stores, + * create new field constructs in memory, * write and append field constructs to netCDF datasets on disk, +* read, write, and manipulate UGRID mesh topologies, + * read, write, and create coordinates defined by geometry cells, * read netCDF and CDL datasets containing hierarchical groups, @@ -74,8 +81,8 @@ """ __Conventions__ = "CF-1.11" -__date__ = "2024-04-26" -__version__ = "3.16.2" +__date__ = "2024-??-??" +__version__ = "3.17.0" _requires = ( "numpy", @@ -199,8 +206,8 @@ ) # Check the version of cfdm -_minimum_vn = "1.11.1.0" -_maximum_vn = "1.11.2.0" +_minimum_vn = "1.11.2.0" +_maximum_vn = "1.11.3.0" _cfdm_version = Version(cfdm.__version__) if not Version(_minimum_vn) <= _cfdm_version < Version(_maximum_vn): raise RuntimeError( @@ -209,12 +216,6 @@ ) # Check the version of dask -_minimum_vn = "2022.12.1" -if Version(dask.__version__) < Version(_minimum_vn): - raise RuntimeError( - f"Bad dask version: cf requires dask>={_minimum_vn}. 
" - f"Got {dask.__version__} at {dask.__file__}" - ) # Check the version of Python _minimum_vn = "3.8.0" @@ -274,15 +275,19 @@ from .data.array import ( BoundsFromNodesArray, CellConnectivityArray, - CFANetCDFArray, + CFAH5netcdfArray, + CFANetCDF4Array, FullArray, GatheredArray, + H5netcdfArray, NetCDFArray, + NetCDF4Array, PointTopologyArray, RaggedContiguousArray, RaggedIndexedArray, RaggedIndexedContiguousArray, SubsampledArray, + UMArray, ) from .data.fragment import ( diff --git a/cf/cellmethod.py b/cf/cellmethod.py index 3bdfbf3659..dda1e343b0 100644 --- a/cf/cellmethod.py +++ b/cf/cellmethod.py @@ -56,7 +56,7 @@ class CellMethod(cfdm.CellMethod): def __new__(cls, *args, **kwargs): """This must be overridden in subclasses. - .. versionadded:: (cfdm) 3.7.0 + .. versionadded:: 3.7.0 """ instance = super().__new__(cls) diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index b40a7dcd73..db8601627c 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -26,12 +26,15 @@ TiePointIndex, ) from .data import Data + from .data.array import ( BoundsFromNodesArray, CellConnectivityArray, - CFANetCDFArray, + CFAH5netcdfArray, + CFANetCDF4Array, GatheredArray, - NetCDFArray, + H5netcdfArray, + NetCDF4Array, PointTopologyArray, RaggedContiguousArray, RaggedIndexedArray, @@ -112,65 +115,39 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs): parent, construct, axes=axes, copy=copy, **kwargs ) - def initialise_CFANetCDFArray( - self, - filename=None, - address=None, - dtype=None, - mask=True, - units=False, - calendar=False, - instructions=None, - substitutions=None, - term=None, - x=None, - **kwargs, - ): - """Return a `CFANetCDFArray` instance. + def initialise_CFANetCDF4Array(self, **kwargs): + """Return a `CFANetCDF4Array` instance. :Parameters: - filename: `str` - - address: (sequence of) `str` or `int` - - dytpe: `numpy.dtype` - - mask: `bool`, optional + kwargs: optional + Initialisation parameters to pass to the new instance. - units: `str` or `None`, optional + :Returns: - calendar: `str` or `None`, optional + `CFANetCDF4Array` - instructions: `str`, optional + """ + cls = self.get_class("CFANetCDF4Array") + return cls(**kwargs) - substitutions: `dict`, optional + def initialise_CFAH5netcdfArray(self, **kwargs): + """Return a `CFAH5netcdfArray` instance. - term: `str`, optional + .. versionadded:: NEXTVERSION - x: `dict`, optional + :Parameters: kwargs: optional - Ignored. + Initialisation parameters to pass to the new instance. 
:Returns: - `CFANetCDFArray` + `CFAH5netcdfArray` """ - cls = self.get_class("CFANetCDFArray") - return cls( - filename=filename, - address=address, - dtype=dtype, - mask=mask, - units=units, - calendar=calendar, - instructions=instructions, - substitutions=substitutions, - term=term, - x=x, - ) + cls = self.get_class("CFAH5netcdfArray") + return cls(**kwargs) _implementation = CFImplementation( @@ -179,7 +156,8 @@ def initialise_CFANetCDFArray( CellConnectivity=CellConnectivity, CellMeasure=CellMeasure, CellMethod=CellMethod, - CFANetCDFArray=CFANetCDFArray, + CFAH5netcdfArray=CFAH5netcdfArray, + CFANetCDF4Array=CFANetCDF4Array, CoordinateReference=CoordinateReference, DimensionCoordinate=DimensionCoordinate, Domain=Domain, @@ -202,7 +180,8 @@ def initialise_CFANetCDFArray( BoundsFromNodesArray=BoundsFromNodesArray, CellConnectivityArray=CellConnectivityArray, GatheredArray=GatheredArray, - NetCDFArray=NetCDFArray, + H5netcdfArray=H5netcdfArray, + NetCDF4Array=NetCDF4Array, PointTopologyArray=PointTopologyArray, RaggedContiguousArray=RaggedContiguousArray, RaggedIndexedArray=RaggedIndexedArray, @@ -236,7 +215,8 @@ def implementation(): 'CellConnectivityArray': cf.data.array.cellconnectivityarray.CellConnectivityArray, 'CellMeasure': cf.cellmeasure.CellMeasure, 'CellMethod': cf.cellmethod.CellMethod, - 'CFANetCDFArray': cf.data.array.cfanetcdfarray.CFANetCDFArray, + 'CFAH5netcdfArray': cf.data.array.cfah5netcdfarray.CFAH5netcdfArray, + 'CFANetCDF4Array': cf.data.array.cfanetcdf4array.CFANetCDF4Array, 'CoordinateReference': cf.coordinatereference.CoordinateReference, 'DimensionCoordinate': cf.dimensioncoordinate.DimensionCoordinate, 'Domain': cf.domain.Domain, @@ -257,7 +237,8 @@ def implementation(): 'PartNodeCountProperties': cf.partnodecountproperties.PartNodeCountProperties, 'Data': cf.data.data.Data, 'GatheredArray': cf.data.array.gatheredarray.GatheredArray, - 'NetCDFArray': cf.data.array.netcdfarray.NetCDFArray, + 'H5netcdfArray': cf.data.array.h5netcdfarray.H5netcdfArray, + 'NetCDF4Array': cf.data.array.netcdf4array.NetCDF4Array, 'PointTopologyArray': , 'RaggedContiguousArray': cf.data.array.raggedcontiguousarray.RaggedContiguousArray, 'RaggedIndexedArray': cf.data.array.raggedindexedarray.RaggedIndexedArray, diff --git a/cf/constants.py b/cf/constants.py index ea335f63ee..1472bd83d2 100644 --- a/cf/constants.py +++ b/cf/constants.py @@ -63,6 +63,9 @@ "LOG_LEVEL": logging.getLevelName(logging.getLogger().level), "BOUNDS_COMBINATION_MODE": "AND", "CHUNKSIZE": parse_bytes(_CHUNKSIZE), + "active_storage": False, + "active_storage_url": None, + "active_storage_max_requests": 100, } masked = np.ma.masked diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index c21f8916c5..cd2c53766b 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -1,9 +1,12 @@ from .boundsfromnodesarray import BoundsFromNodesArray from .cellconnectivityarray import CellConnectivityArray -from .cfanetcdfarray import CFANetCDFArray +from .cfah5netcdfarray import CFAH5netcdfArray +from .cfanetcdf4array import CFANetCDF4Array from .fullarray import FullArray from .gatheredarray import GatheredArray +from .h5netcdfarray import H5netcdfArray from .netcdfarray import NetCDFArray +from .netcdf4array import NetCDF4Array from .pointtopologyarray import PointTopologyArray from .raggedcontiguousarray import RaggedContiguousArray from .raggedindexedarray import RaggedIndexedArray diff --git a/cf/data/array/cfah5netcdfarray.py b/cf/data/array/cfah5netcdfarray.py new file mode 100644 index 
0000000000..47c58bff06 --- /dev/null +++ b/cf/data/array/cfah5netcdfarray.py @@ -0,0 +1,10 @@ +from .h5netcdfarray import H5netcdfArray +from .mixin import CFAMixin + + +class CFAH5netcdfArray(CFAMixin, H5netcdfArray): + """A CFA-netCDF array accessed with `h5netcdf` + + .. versionadded:: NEXTVERSION + + """ diff --git a/cf/data/array/cfanetcdf4array.py b/cf/data/array/cfanetcdf4array.py new file mode 100644 index 0000000000..b3b6b69d7a --- /dev/null +++ b/cf/data/array/cfanetcdf4array.py @@ -0,0 +1,10 @@ +from .mixin import CFAMixin +from .netcdf4array import NetCDF4Array + + +class CFANetCDF4Array(CFAMixin, NetCDF4Array): + """A CFA-netCDF array accessed with `netCDF4`. + + .. versionadded:: NEXTVERSION + + """ diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py index f6c29858fb..81278c3407 100644 --- a/cf/data/array/fullarray.py +++ b/cf/data/array/fullarray.py @@ -1,11 +1,13 @@ import numpy as np +from ...functions import indices_shape, parse_indices from .abstract import Array +from .mixin import IndexMixin _FULLARRAY_HANDLED_FUNCTIONS = {} -class FullArray(Array): +class FullArray(IndexMixin, Array): """A array filled with a given value. The array may be empty or all missing values. @@ -19,8 +21,7 @@ def __init__( fill_value=None, dtype=None, shape=None, - units=False, - calendar=False, + attributes=None, source=None, copy=True, ): @@ -38,23 +39,22 @@ def __init__( shape: `tuple` The array dimension sizes. - units: `str` or `None`, optional - The units of the netCDF variable. Set to `None` to - indicate that there are no units. If unset then the - units will be set to `None` during the first - `__getitem__` call. + {{init attributes: `dict` or `None`, optional}} - calendar: `str` or `None`, optional - The calendar of the netCDF variable. By default, or if - set to `None`, then the CF default calendar is - assumed, if applicable. If unset then the calendar - will be set to `None` during the first `__getitem__` - call. + .. versionadded:: NEXTVERSION {{init source: optional}} {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + """ super().__init__(source=source, copy=copy) @@ -75,42 +75,14 @@ def __init__( shape = None try: - units = source._get_component("units", False) - except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", None) + attributes = source._get_component("attributes", False) except AttributeError: - calendar = None + attributes = None self._set_component("full_value", fill_value, copy=False) self._set_component("dtype", dtype, copy=False) self._set_component("shape", shape, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) - - def __array__(self, *dtype): - """The numpy array interface. - - .. versionadded:: 3.15.0 - - :Parameters: - - dtype: optional - Typecode or data-type to which the array is cast. - - :Returns: - - `numpy.ndarray` - An independent numpy array of the data. - - """ - array = self[...] - if not dtype: - return array - else: - return array.astype(dtype[0], copy=False) + self._set_component("attributes", attributes, copy=False) def __array_function__(self, func, types, args, kwargs): """The `numpy` `__array_function__` protocol. 
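The fullarray.py changes in this diff replace the separate ``units``/``calendar`` keywords with a single ``attributes`` dictionary (the old keywords are deprecated). A minimal sketch of the new construction pattern, based only on the signature and the ``array`` property shown in this file's hunks — `FullArray` is an internal array class, so this is illustrative rather than documented user-facing usage:

import numpy as np
import cf

# Build a lazily evaluated constant-valued array. CF metadata now
# travels in one 'attributes' dictionary instead of the deprecated
# separate 'units'/'calendar' keywords.
a = cf.FullArray(
    fill_value=273.15,
    dtype=np.dtype(float),
    shape=(2, 3),
    attributes={"units": "K"},
)
print(a.array)  # a 2x3 numpy array filled with 273.15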
@@ -128,54 +100,6 @@ def __array_function__(self, func, types, args, kwargs): return _FULLARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs) - def __getitem__(self, indices): - """x.__getitem__(indices) <==> x[indices] - - Returns a numpy array. - - """ - # If 'apply_indices' is False then we can make a filled numpy - # array that already has the correct shape. Otherwise we need - # to make one with shape 'self.shape' and then apply the - # indices to that. - apply_indices = False - - if indices is Ellipsis: - array_shape = self.shape - else: - array_shape = [] - for i, size in zip(indices, self.shape): - if not isinstance(i, slice): - continue - - start, stop, step = i.indices(size) - a, b = divmod(stop - start, step) - if b: - a += 1 - - array_shape.append(a) - - if len(array_shape) != self.ndim: - apply_indices = True - array_shape = self.shape - - fill_value = self.get_full_value() - if fill_value is np.ma.masked: - array = np.ma.masked_all(array_shape, dtype=self.dtype) - elif fill_value is not None: - array = np.full( - array_shape, fill_value=fill_value, dtype=self.dtype - ) - else: - array = np.empty(array_shape, dtype=self.dtype) - - if apply_indices: - array = self.get_subspace(array, indices) - - self._set_units() - - return array - def __repr__(self): """Called by the `repr` built-in function. @@ -196,6 +120,53 @@ def __str__(self): return f"Filled with {fill_value!r}" + def _get_array(self, index=None): + """Returns the full array. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + if index is None: + shape = self.shape + else: + original_shape = self.original_shape + index = parse_indices(original_shape, index, keepdims=False) + shape = indices_shape(index, original_shape, keepdims=False) + + fill_value = self.get_full_value() + if fill_value is np.ma.masked: + array = np.ma.masked_all(shape, dtype=self.dtype) + elif fill_value is not None: + array = np.full(shape, fill_value=fill_value, dtype=self.dtype) + else: + array = np.empty(shape, dtype=self.dtype) + + return array + + @property + def array(self): + """Return an independent numpy array containing the data. + + .. versionadded:: NEXTVERSION + + :Returns: + + `numpy.ndarray` + An independent numpy array of the data. + """ + return np.asanyarray(self) + @property def dtype(self): """Data-type of the data elements.""" diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py new file mode 100644 index 0000000000..02cd0f1cc5 --- /dev/null +++ b/cf/data/array/h5netcdfarray.py @@ -0,0 +1,81 @@ +import cfdm + +from ...mixin_container import Container +from .locks import netcdf_lock +from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin + + +class H5netcdfArray( + ActiveStorageMixin, + IndexMixin, + FileArrayMixin, + ArrayMixin, + Container, + cfdm.H5netcdfArray, +): + """A netCDF array accessed with `h5netcdf`. + + **Active storage reductions** + + An active storage reduction may be enabled with the `actify` + method. See `cf.data.collapse.Collapse` for details. + + .. versionadded:: NEXTVERSION + + """ + + def __dask_tokenize__(self): + """Return a value fully representative of the object. + + .. versionadded:: NEXTVERSION + + """ + return super().__dask_tokenize__() + (self.get_mask(),) + + @property + def _lock(self): + """Set the lock for use in `dask.array.from_array`. 
+ + Returns a lock object because concurrent reads are not + currently supported by the HDF5 library. The lock object will + be the same for all `NetCDF4Array` and `H5netcdfArray` + instances, regardless of the dataset they access, which means + that access to all netCDF and HDF files coordinates around the + same lock. + + .. versionadded:: NEXTVERSION + + """ + return netcdf_lock + + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + if index is None: + index = self.index() + + # We need to lock because the netCDF file is about to be accessed. + self._lock.acquire() + + # It's cfdm.H5netcdfArray.__getitem__ that we want to + # call here, but we use 'Container' in super because + # that comes immediately before cfdm.H5netcdfArray in + # the method resolution order. + array = super(Container, self).__getitem__(index) + + self._lock.release() + return array diff --git a/cf/data/array/locks.py b/cf/data/array/locks.py new file mode 100644 index 0000000000..5a7b2bd333 --- /dev/null +++ b/cf/data/array/locks.py @@ -0,0 +1,4 @@ +from dask.utils import SerializableLock + +# Global lock for netCDF file access +netcdf_lock = SerializableLock() diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py index a9f7f75cb3..8e5dd7690d 100644 --- a/cf/data/array/mixin/__init__.py +++ b/cf/data/array/mixin/__init__.py @@ -1,3 +1,6 @@ +from .activestoragemixin import ActiveStorageMixin from .arraymixin import ArrayMixin +from .cfamixin import CFAMixin from .compressedarraymixin import CompressedArrayMixin from .filearraymixin import FileArrayMixin +from .indexmixin import IndexMixin diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py new file mode 100644 index 0000000000..024bb05d04 --- /dev/null +++ b/cf/data/array/mixin/activestoragemixin.py @@ -0,0 +1,28 @@ +class ActiveStorageMixin: + """Mixin class for enabling active storage operations. + + .. versionadded:: NEXTVERSION + + """ + + @property + def active_storage(self): + """Whether active storage operations are allowed. + + Currently, active storage operations are allowed unless the + data are numerically packed. + + .. versionadded:: NEXTVERSION + + :Returns: + + `bool` + `True` if active storage operations are allowed, + otherwise `False`. + + """ + attributes = self.get_attributes({}) + if "add_offset" in attributes or "scale_factor" in attributes: + return False + + return True diff --git a/cf/data/array/mixin/arraymixin.py b/cf/data/array/mixin/arraymixin.py index a167abec50..3468253b36 100644 --- a/cf/data/array/mixin/arraymixin.py +++ b/cf/data/array/mixin/arraymixin.py @@ -1,3 +1,5 @@ +import numpy as np + from ....units import Units @@ -16,6 +18,22 @@ def __array_function__(self, func, types, args, kwargs): """ return NotImplemented + @property + def _meta(self): + """Normalise the array to an appropriate Dask meta object. + + The Dask meta can be thought of as a suggestion to Dask. Dask + uses this meta to generate the task graph until it can infer + the actual metadata from the values. It does not force the + output to have the structure or dtype of the specified meta. + + .. versionadded:: NEXTVERSION + + .. 
seealso:: `dask.utils.meta_from_array` + + """ + return np.array((), dtype=self.dtype) + @property def Units(self): """The `cf.Units` object containing the units of the array. @@ -23,4 +41,4 @@ def Units(self): .. versionadded:: 3.14.0 """ - return Units(self.get_units(), self.get_calendar(None)) + return Units(self.get_units(None), self.get_calendar(None)) diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/mixin/cfamixin.py similarity index 62% rename from cf/data/array/cfanetcdfarray.py rename to cf/data/array/mixin/cfamixin.py index 86465309e9..6bcb01a468 100644 --- a/cf/data/array/cfanetcdfarray.py +++ b/cf/data/array/mixin/cfamixin.py @@ -4,36 +4,50 @@ import numpy as np -from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray -from ..utils import chunk_locations, chunk_positions -from .netcdfarray import NetCDFArray +from ...utils import chunk_locations, chunk_positions -# Store fragment array classes. -_FragmentArray = { - "nc": NetCDFFragmentArray, - "um": UMFragmentArray, - "full": FullFragmentArray, -} +class CFAMixin: + """Mixin class for a CFA array. -class CFANetCDFArray(NetCDFArray): - """A CFA aggregated array stored in a netCDF file. - - .. versionadded:: 3.14.0 + .. versionadded:: NEXTVERSION """ + def __new__(cls, *args, **kwargs): + """Store fragment array classes. + + .. versionadded:: NEXTVERSION + + """ + # Import fragment array classes. Do this here (as opposed to + # outside the class) to avoid a circular import. + from ...fragment import ( + FullFragmentArray, + NetCDFFragmentArray, + UMFragmentArray, + ) + + instance = super().__new__(cls) + instance._FragmentArray = { + "nc": NetCDFFragmentArray, + "um": UMFragmentArray, + "full": FullFragmentArray, + } + return instance + def __init__( self, filename=None, address=None, dtype=None, mask=True, - units=False, - calendar=False, + unpack=True, instructions=None, substitutions=None, term=None, + attributes=None, + storage_options=None, source=None, copy=True, x=None, @@ -43,39 +57,34 @@ def __init__( :Parameters: filename: (sequence of) `str`, optional - The name of the CFA-netCDF file containing the - array. If a sequence then it must contain one element. + The name of the CFA file containing the array. If a + sequence then it must contain one element. address: (sequence of) `str`, optional - The name of the CFA-netCDF aggregation variable for the + The name of the CFA aggregation variable for the array. If a sequence then it must contain one element. dtype: `numpy.dtype` The data type of the aggregated data array. May be `None` if the numpy data-type is not known (which can - be the case for netCDF string types, for example). + be the case for some string types, for example). mask: `bool` If True (the default) then mask by convention when reading data from disk. - A netCDF array is masked depending on the values of any of - the netCDF variable attributes ``valid_min``, - ``valid_max``, ``valid_range``, ``_FillValue`` and - ``missing_value``. + A array is masked depending on the values of any of + the variable attributes ``valid_min``, ``valid_max``, + ``valid_range``, ``_FillValue`` and ``missing_value``. - units: `str` or `None`, optional - The units of the aggregated data. Set to `None` to - indicate that there are no units. + {{init unpack: `bool`, optional}} - calendar: `str` or `None`, optional - The calendar of the aggregated data. Set to `None` to - indicate the CF default calendar, if applicable. + .. 
versionadded:: NEXTVERSION instructions: `str`, optional The ``aggregated_data`` attribute value as found on - the CFA netCDF variable. If set then this will be used - to improve the performance of `__dask_tokenize__`. + the CFA variable. If set then this will be used to + improve the performance of `__dask_tokenize__`. substitutions: `dict`, optional A dictionary whose key/value pairs define text @@ -88,23 +97,66 @@ def __init__( term: `str`, optional The name of a non-standard aggregation instruction term from which the array is to be created, instead of - creating the aggregated data in the standard - terms. If set then *address* must be the name of the - term's CFA-netCDF aggregation instruction variable, - which must be defined on the fragment dimensions and - no others. Each value of the aggregation instruction - variable will be broadcast across the shape of the - corresponding fragment. + creating the aggregated data in the standard terms. If + set then *address* must be the name of the term's + aggregation instruction variable, which must be + defined on the fragment dimensions and no others. Each + value of the aggregation instruction variable will be + broadcast across the shape of the corresponding + fragment. *Parameter example:* ``address='cfa_tracking_id', term='tracking_id'`` .. versionadded:: 3.15.0 + storage_options: `dict` or `None`, optional + Key/value pairs to be passed on to the creation of + `s3fs.S3FileSystem` file systems to control the + opening of fragment files in S3 object stores. Ignored + for files not in an S3 object store, i.e. those whose + names do not start with ``s3:``. + + By default, or if `None`, then *storage_options* is + taken as ``{}``. + + If the ``'endpoint_url'`` key is not in + *storage_options* or is not in a dictionary defined by + the ``'client_kwargs`` key (which is always the case + when *storage_options* is `None`), then one will be + automatically inserted for accessing a fragment S3 + file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` + key with value ``'https://store'`` would be created. + + *Parameter example:* + ``{'key: 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` + + .. versionadded:: NEXTVERSION + + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the netCDF variable during + the first `__getitem__` call. + + .. versionaddedd:: NEXTVERSION + {{init source: optional}} {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + """ if source is not None: super().__init__(source=source, copy=copy) @@ -135,90 +187,16 @@ def __init__( term = None elif filename is not None: - aggregated_data = {} - - location = x["location"] - ndim = location.shape[0] - - chunks = [i.compressed().tolist() for i in location] - shape = [sum(c) for c in chunks] - positions = chunk_positions(chunks) - locations = chunk_locations(chunks) - - if term is not None: - # -------------------------------------------------------- - # This fragment contains a constant value, not file - # locations. 
- # -------------------------------------------------------- - term = x[term] - fragment_shape = term.shape - aggregated_data = { - frag_loc: { - "location": loc, - "fill_value": term[frag_loc].item(), - "format": "full", - } - for frag_loc, loc in zip(positions, locations) - } - else: - a = x["address"] - f = x["file"] - fmt = x["format"] - - extra_dimension = f.ndim > ndim - if extra_dimension: - # There is an extra non-fragment dimension - fragment_shape = f.shape[:-1] - else: - fragment_shape = f.shape - - if not a.ndim: - a = np.full(f.shape, a, dtype=a.dtype) - - if not fmt.ndim: - fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype) - - if extra_dimension: - aggregated_data = { - frag_loc: { - "location": loc, - "filename": f[frag_loc].tolist(), - "address": a[frag_loc].tolist(), - "format": fmt[frag_loc].item(), - } - for frag_loc, loc in zip(positions, locations) - } - else: - aggregated_data = { - frag_loc: { - "location": loc, - "filename": (f[frag_loc].item(),), - "address": (a[frag_loc].item(),), - "format": fmt[frag_loc].item(), - } - for frag_loc, loc in zip(positions, locations) - } - - # Apply string substitutions to the fragment filenames - if substitutions: - for value in aggregated_data.values(): - filenames2 = [] - for filename in value["filename"]: - for base, sub in substitutions.items(): - filename = filename.replace(base, sub) - - filenames2.append(filename) - - value["filename"] = filenames2 - + shape, fragment_shape, aggregated_data = self._parse_cfa( + x, term, substitutions + ) super().__init__( filename=filename, address=address, shape=shape, dtype=dtype, mask=mask, - units=units, - calendar=calendar, + attributes=attributes, copy=copy, ) else: @@ -227,8 +205,7 @@ def __init__( address=address, dtype=dtype, mask=mask, - units=units, - calendar=calendar, + attributes=attributes, copy=copy, ) @@ -247,6 +224,134 @@ def __init__( "substitutions", substitutions.copy(), copy=False ) + def _parse_cfa(self, x, term, substitutions): + """Parse the CFA aggregation instructions. + + .. versionadded:: NEXTVERSION + + :Parameters: + + x: `dict` + + term: `str` or `None` + The name of a non-standard aggregation instruction + term from which the array is to be created, instead of + creating the aggregated data in the standard + terms. Each value of the aggregation instruction + variable will be broadcast across the shape of the + corresponding fragment. + + substitutions: `dict` or `None` + A dictionary whose key/value pairs define text + substitutions to be applied to the fragment file + names. Each key must be specified with the ``${...}`` + syntax, for instance ``{'${base}': 'sub'}``. + + :Returns: + + 3-`tuple` + 1. The shape of the aggregated data. + 2. The shape of the array of fragments. + 3. The parsed aggregation instructions. + + """ + aggregated_data = {} + + location = x["location"] + ndim = location.shape[0] + compressed = np.ma.compressed + chunks = [compressed(i).tolist() for i in location] + shape = [sum(c) for c in chunks] + positions = chunk_positions(chunks) + locations = chunk_locations(chunks) + + if term is not None: + # -------------------------------------------------------- + # Each fragment contains a constant value, not file + # locations. 
+ # -------------------------------------------------------- + term = x[term] + fragment_shape = term.shape + aggregated_data = { + frag_loc: { + "location": loc, + "fill_value": term[frag_loc].item(), + "format": "full", + } + for frag_loc, loc in zip(positions, locations) + } + else: + # -------------------------------------------------------- + # Each fragment contains file locations + # -------------------------------------------------------- + a = x["address"] + f = x["file"] + file_fmt = x["format"] + + extra_dimension = f.ndim > ndim + if extra_dimension: + # There is an extra non-fragment dimension + fragment_shape = f.shape[:-1] + else: + fragment_shape = f.shape + + if not a.ndim: + a = (a.item(),) + scalar_address = True + else: + scalar_address = False + + if not file_fmt.ndim: + file_fmt = file_fmt.item() + scalar_fmt = True + else: + scalar_fmt = False + + for frag_loc, location in zip(positions, locations): + if extra_dimension: + filename = compressed(f[frag_loc]).tolist() + if scalar_address: + address = a * len(filename) + else: + address = compressed(a[frag_loc].tolist()) + + if scalar_fmt: + fmt = file_fmt + else: + fmt = compressed(file_fmt[frag_loc]).tolist() + else: + filename = (f[frag_loc].item(),) + if scalar_address: + address = a + else: + address = (a[frag_loc].item(),) + + if scalar_fmt: + fmt = file_fmt + else: + fmt = file_fmt[frag_loc].item() + + aggregated_data[frag_loc] = { + "location": location, + "filename": filename, + "address": address, + "format": fmt, + } + + # Apply string substitutions to the fragment filenames + if substitutions: + for value in aggregated_data.values(): + filenames2 = [] + for filename in value["filename"]: + for base, sub in substitutions.items(): + filename = filename.replace(base, sub) + + filenames2.append(filename) + + value["filename"] = filenames2 + + return shape, fragment_shape, aggregated_data + def __dask_tokenize__(self): """Used by `dask.base.tokenize`. @@ -298,12 +403,14 @@ def get_aggregated_data(self, copy=True): >>> a.get_fragment_shape() (2, 1, 1, 1) >>> a.get_aggregated_data() - {(0, 0, 0, 0): {'file': 'January-June.nc', - 'address': 'temp', + {(0, 0, 0, 0): { + 'file': ('January-June.nc',), + 'address': ('temp',), 'format': 'nc', 'location': [(0, 6), (0, 1), (0, 73), (0, 144)]}, - (1, 0, 0, 0): {'file': 'July-December.nc', - 'address': 'temp', + (1, 0, 0, 0): { + 'file': ('July-December.nc',), + 'address': ('temp',), 'format': 'nc', 'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}} @@ -357,6 +464,33 @@ def get_fragment_shape(self): """ return self._get_component("fragment_shape") + def get_storage_options(self): + """Return `s3fs.S3FileSystem` options for accessing S3 fragment files. + + .. versionadded:: NEXTVERSION + + :Returns: + + `dict` or `None` + The `s3fs.S3FileSystem` options. + + **Examples** + + >>> f.get_storage_options() + {} + + >>> f.get_storage_options() + {'anon': True} + + >>> f.get_storage_options() + {'key: 'scaleway-api-key...', + 'secret': 'scaleway-secretkey...', + 'endpoint_url': 'https://s3.fr-par.scw.cloud', + 'client_kwargs': {'region_name': 'fr-par'}} + + """ + return super().get_storage_options(create_endpoint_url=False) + def get_term(self, default=ValueError()): """The CFA aggregation instruction term for the data, if set. @@ -380,13 +514,18 @@ def get_term(self, default=ValueError()): def subarray_shapes(self, shapes): """Create the subarray shapes. + A fragmented dimension (i.e. 
one spanned by two or more + fragments) will always have a subarray size equal to the + size of each of its fragments, overriding any other size + implied by the *shapes* parameter. + .. versionadded:: 3.14.0 .. seealso:: `subarrays` :Parameters: - shapes: `int`, sequence, `dict` or `str`, optional + shapes: `int`, sequence, `dict` or `str`, optional Define the subarray shapes. Any value accepted by the *chunks* parameter of the @@ -427,7 +566,8 @@ def subarray_shapes(self, shapes): from dask.array.core import normalize_chunks - # Indices of fragmented dimensions + # Positions of fragmented dimensions (i.e. those spanned by + # two or more fragments) f_dims = self.get_fragmented_dimensions() shape = self.shape @@ -440,8 +580,9 @@ def subarray_shapes(self, shapes): zip(self.get_fragment_shape(), self.shape) ): if dim in f_dims: - # This aggregated dimension is spanned by more than - # one fragment. + # This aggregated dimension is spanned by two or more + # fragments => set the chunks to be the same size as + # each fragment. c = [] index = [0] * ndim for j in range(n_fragments): @@ -453,8 +594,8 @@ def subarray_shapes(self, shapes): chunks.append(tuple(c)) else: # This aggregated dimension is spanned by exactly one - # fragment. Store None, for now, in the expectation - # that it will get overwrittten. + # fragment => store `None` for now. This will get + # overwritten from 'shapes'. chunks.append(None) if isinstance(shapes, (str, Number)) or shapes is None: @@ -657,18 +798,19 @@ def to_dask_array(self, chunks="auto"): name = (f"{self.__class__.__name__}-{tokenize(self)}",) dtype = self.dtype - units = self.get_units() + units = self.get_units(None) calendar = self.get_calendar(None) aggregated_data = self.get_aggregated_data(copy=False) # Set the chunk sizes for the dask array chunks = self.subarray_shapes(chunks) - if self.get_mask(): - fragment_arrays = _FragmentArray - else: - fragment_arrays = _FragmentArray.copy() - fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False) + fragment_arrays = self._FragmentArray + if not self.get_mask(): + fragment_arrays = fragment_arrays.copy() + fragment_arrays["nc"] = partial(fragment_arrays["nc"], mask=False) + + storage_options = self.get_storage_options() dsk = {} for ( @@ -691,6 +833,10 @@ def to_dask_array(self, chunks="auto"): f"fragment dataset format: {fragment_format!r}" ) + if storage_options and kwargs["address"] == "nc": + # Pass on any file system options + kwargs["storage_options"] = storage_options + fragment = FragmentArray( dtype=dtype, shape=fragment_shape, @@ -706,7 +852,7 @@ def to_dask_array(self, chunks="auto"): key, f_indices, False, - getattr(fragment, "_lock", False), + False, ) # Return the dask array diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py index 378567a23a..b5b314b9e2 100644 --- a/cf/data/array/mixin/filearraymixin.py +++ b/cf/data/array/mixin/filearraymixin.py @@ -1,8 +1,6 @@ from os import sep from os.path import basename, dirname, join -import numpy as np - from ....functions import _DEPRECATION_ERROR_ATTRIBUTE, abspath @@ -26,20 +24,6 @@ def __dask_tokenize__(self): self.get_addresses(), ) - @property - def _dask_meta(self): - """The metadata for the containing dask array. - - This is the kind of array that will result from slicing the - file array. - - .. versionadded:: 3.14.0 - - .. seealso:: `dask.array.from_array` - - """ - return np.array((), dtype=self.dtype) - @property def filename(self): """The name of the file containing the array. 
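The next new file, indexmixin.py, implements lazy subspacing: successive ``__getitem__`` calls are not applied immediately but are merged, dimension by dimension, into a single orthogonal index that is only used when the data are finally converted to a numpy array. A standalone illustration of that merging, in plain numpy with made-up names (not cf-python API):

import numpy as np

x = np.arange(30).reshape(6, 5)  # stands in for the on-disk variable

# First lazy subspace: x[::2, [1, 2, 4]]
ind0 = [np.arange(6)[::2], np.array([1, 2, 4])]

# Second lazy subspace, applied to the first result: [[True, False, True], :]
ind1 = [np.array([True, False, True]), slice(None)]

# Merge per dimension by indexing the first index with the second
combined = [np.asanyarray(i0)[i1] for i0, i1 in zip(ind0, ind1)]
print(combined)  # [array([0, 4]), array([1, 2, 4])]

# A single read with the merged index is equivalent to applying the
# two subspaces one after the other
assert (x[np.ix_(*combined)] == x[::2, [1, 2, 4]][[True, False, True], :]).all()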
diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py new file mode 100644 index 0000000000..d105ba943a --- /dev/null +++ b/cf/data/array/mixin/indexmixin.py @@ -0,0 +1,364 @@ +from numbers import Integral + +import numpy as np +from dask.array.slicing import normalize_index +from dask.base import is_dask_collection + +from ....functions import indices_shape, parse_indices + + +class IndexMixin: + """Mixin class for lazy indexing of a data array. + + A data for a subspace is retrieved by casting the object as a + `numpy` array. See `__getitem__` for more details. + + **Examples** + + >>> a = cf.{{class}}(...) + >>> a.shape + (6, 5) + >>> print(np.asanyarray(a)) + [[ 0 1 2 3 4]) + [ 5 6 7 8 9] + [10 11 12 13 14] + [15 16 17 18 19] + [20 21 22 23 24] + [25 26 27 28 29]] + >>> a = a[::2, [1, 2, 4]] + >>> a = a[[True, False, True], :] + >>> a.shape + (2, 3) + >>> print(np.asanyarray(a)) + [[ 1, 2, 4], + [21, 22, 24]] + + .. versionadded:: NEXTVERSION + + """ + + def __array__(self, *dtype): + """Convert the `{{class}}` into a `numpy` array. + + .. versionadded:: NEXTVERSION + + :Parameters: + + dtype: optional + Typecode or data-type to which the array is cast. + + :Returns: + + `numpy.ndarray` + An independent `numpy` array of the subspace of the + data defined by the `indices` attribute. + + """ + array = self._get_array() + if dtype: + return array.astype(dtype[0], copy=False) + + return array + + def __getitem__(self, index): + """Returns a subspace of the data as a new `{{class}}`. + + x.__getitem__(indices) <==> x[indices] + + Subspaces created by indexing are lazy and are not applied + until the `{{class}}` object is converted to a `numpy` array, + by which time all lazily-defined subspaces will have been + converted to a single combined index which defines only the + actual elements that need to be retrieved from the original + data. + + The combined index is orthogonal, meaning that the index for + each dimension is to be applied independently, regardless of + how that index was defined. For instance, the indices ``[[0, + 1], [1, 3], 0]`` and ``[:2, 1::2, 0]`` will give identical + results. + + For example, if the original data has shape ``(12, 145, 192)`` + and consecutive subspaces of ``[::2, [1, 3, 4], 96:]`` and + ``[[0, 5], [True, False, True], 0]`` are applied, then only + the elements defined by the combined index``[[0, 10], [1, 4], + 96]`` will be retrieved from the data when `__array__` is + called. + + .. versionadded:: NEXTVERSION + + .. seealso:: `index`, `original_shape`, `__array__`, + `__getitem__` + + :Returns: + + `{{class}}` + The subspaced data. + + """ + shape0 = self.shape + index0 = self.index(conform=False) + original_shape = self.original_shape + + index1 = parse_indices(shape0, index, keepdims=False) + + new = self.copy() + new_indices = [] + new_shape = [] + + i = 0 + for ind0, original_size in zip(index0, original_shape): + if isinstance(ind0, Integral): + # The previous call to __getitem__ resulted in a + # dimension being removed (i.e. 'ind0' is + # integer-valued). Therefore 'index1' must have fewer + # elements than 'index0', so we need to "carry + # forward" the integer-valued index so that it is + # available at evaluation time. + new_indices.append(ind0) + continue + + ind1 = index1[i] + size0 = shape0[i] + i += 1 + + # If this dimension is not subspaced by the new index then + # we don't need to update the old index. 
+ if isinstance(ind1, slice) and ind1 == slice(None): + new_indices.append(ind0) + continue + + # Still here? Then we have to work out the index of the + # full array that is equivalent to applying + # 'ind0' followed by 'ind1'. + if is_dask_collection(ind1): + # Note: This will never occur when this __getitem__ is + # being called from within a Dask graph, because + # any lazy indices will have already been + # computed as part of the whole graph execution; + # i.e. we don't have to worry about a + # compute-within-a-compute situation. (If this + # were not the case then we could add + # `scheduler="synchronous"` to the compute + # call.) + ind1 = ind1.compute() + + if isinstance(ind0, slice): + if isinstance(ind1, slice): + # ind0: slice + # ind1: slice + start, stop, step = ind0.indices(original_size) + start1, stop1, step1 = ind1.indices(size0) + size1, mod1 = divmod(stop1 - start1, step1) + + if mod1 != 0: + size1 += 1 + + start += start1 * step + step *= step1 + stop = start + (size1 - 1) * step + + if step > 0: + stop += 1 + else: + stop -= 1 + + if stop < 0: + stop = None + + new_index = slice(start, stop, step) + else: + # ind0: slice + # ind1: int, or array of int/bool + new_index = np.arange(*ind0.indices(original_size))[ind1] + else: + # ind0: array of int. If we made it to here then it + # can't be anything else. This is + # because we've dealt with ind0 + # being a slice or an int, the + # very first ind0 is always + # slice(None), and a previous ind1 + # that was an array of bool will + # have resulted in this ind0 being + # an array of int. + # + # ind1: anything + new_index = np.asanyarray(ind0)[ind1] + + new_indices.append(new_index) + + new._custom["index"] = tuple(new_indices) + + # Find the shape defined by the new index + new_shape = indices_shape(new_indices, original_shape, keepdims=False) + new._set_component("shape", tuple(new_shape), copy=False) + + return new + + def __repr__(self): + """Called by the `repr` built-in function. + + x.__repr__() <==> repr(x) + + """ + return ( + f"" + ) + + @property + def __asanyarray__(self): + """Whether the array is accessed by conversion to a `numpy` array. + + .. versionadded:: NEXTVERSION + + :Returns: + + `True` + + """ + return True + + def _get_array(self, index=None): + """Returns a subspace of the data as a `numpy` array. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + index: `tuple` or `None`, optional + Provide the indices that define the subspace. If + `None` then the `index` attribute is used. + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + return NotImplementedError( + f"Must implement {self.__class__.__name__}._get_array" + ) + + def index(self, conform=True): + """The index to be applied when converting to a `numpy` array. + + The `shape` is defined by the `index` applied to the + `original_shape`. + + .. versionadded:: NEXTVERSION + + .. seealso:: `shape`, `original_shape` + + :Parameters: + + conform: `bool`, optional + If True, the default, then + + * Convert a decreasing size 1 slice to an increasing + one. + + * Convert, where possible, a sequence of integers to a + slice. + + These transformations are to allow subspacing on data + objects that have restricted indexing functionality, + such as `h5py.Variable` objects. + + If False then these transformations are not done. 
+ + :Returns: + + `tuple` + + **Examples** + + >>> x.shape + (12, 145, 192) + >>> x.index() + (slice(None), slice(None), slice(None)) + >>> x = x[8:7:-1, 10:19:3, [15, 1, 4, 12]] + >>> x = x[[0], [True, False, True], ::-2] + >>> x.shape + (1, 2, 2) + >>> x.index() + (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)) + >>> x.index(conform=False) + (array([8]), array([10, 16]), array([12, 1])) + + """ + ind = self._custom.get("index") + if ind is None: + # No indices have been applied yet, so define indices that + # are equivalent to Ellipsis, and set the original shape. + ind = (slice(None),) * self.ndim + self._custom["index"] = ind + self._custom["original_shape"] = self.shape + return ind + + if not conform: + return ind + + # Still here? Then conform the indices by: + # + # 1) Converting decreasing size 1 slices to increasing + # ones. This helps when the parent class can't cope with + # decreasing slices. + # + # 2) Converting, where possible, sequences of integers to + # slices. This helps when the parent class can't cope with + # indices that are sequences of integers. + ind = list(ind) + for n, (i, size) in enumerate(zip(ind[:], self.original_shape)): + if isinstance(i, slice): + if size == 1: + start, _, step = i.indices(size) + if step and step < 0: + # Decreasing slices are not universally + # accepted (e.g. `h5py` doesn't like them), + # but we can convert them to increasing ones. + ind[n] = slice(start, start + 1) + elif np.iterable(i): + i = normalize_index((i,), (size,))[0] + if i.size == 1: + # Convert a sequence of one integer into a slice + start = i.item() + ind[n] = slice(start, start + 1) + else: + # Convert a sequence of two or more evenly spaced + # integers into a slice. + step = np.unique(np.diff(i)) + if step.size == 1: + start, stop = i[[0, -1]] + if stop >= start: + stop += 1 + elif stop: + stop = -1 + else: + stop = None + + ind[n] = slice(start, stop, step.item()) + + return tuple(ind) + + @property + def original_shape(self): + """The original shape of the data, before any subspacing. + + The `shape` is defined by the result of subspacing the data in + its original shape with the indices given by `index`. + + .. versionadded:: NEXTVERSION + + .. seealso:: `index`, `shape` + + """ + out = self._custom.get("original_shape") + if out is None: + # No subspace has been defined yet + out = self.shape + self._custom["original_shape"] = out + + return out diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py new file mode 100644 index 0000000000..095bf2d3ad --- /dev/null +++ b/cf/data/array/netcdf4array.py @@ -0,0 +1,80 @@ +import cfdm + +from ...mixin_container import Container +from .locks import netcdf_lock +from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin + + +class NetCDF4Array( + ActiveStorageMixin, + IndexMixin, + FileArrayMixin, + ArrayMixin, + Container, + cfdm.NetCDF4Array, +): + """A netCDF array accessed with `netCDF4`. + + **Active storage reductions** + + An active storage reduction may be enabled with the `actify` + method. See `cf.data.collapse.Collapse` for details. + + """ + + def __dask_tokenize__(self): + """Return a value fully representative of the object. + + .. versionadded:: 3.15.0 + + """ + return super().__dask_tokenize__() + (self.get_mask(),) + + @property + def _lock(self): + """Set the lock for use in `dask.array.from_array`. + + Returns a lock object because concurrent reads are not + currently supported by the netCDF and HDF libraries. 
The lock + object will be the same for all `NetCDF4Array` and + `H5netcdfArray` instances, regardless of the dataset they + access, which means that access to all netCDF and HDF files + coordinates around the same lock. + + .. versionadded:: 3.14.0 + + """ + return netcdf_lock + + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + :Returns: + + `numpy.ndarray` + The subspace. + + """ + if index is None: + index = self.index() + + # Note: We need to lock because the netCDF file is about to be + # accessed. + self._lock.acquire() + + # Note: It's cfdm.NetCDFArray.__getitem__ that we want to call + # here, but we use 'Container' in super because that + # comes immediately before cfdm.NetCDFArray in the + # method resolution order. + array = super(Container, self).__getitem__(index) + + self._lock.release() + return array diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py index 6dab56af1a..4642dff910 100644 --- a/cf/data/array/netcdfarray.py +++ b/cf/data/array/netcdfarray.py @@ -1,35 +1,16 @@ -import cfdm -from dask.utils import SerializableLock +class NetCDFArray: + """A netCDF array accessed with `netCDF4`. -from ...mixin_container import Container -from .mixin import ArrayMixin, FileArrayMixin + Deprecated at version NEXTVERSION and is no longer available. Use + `cf.NetCDF4Array` instead. -# Global lock for netCDF file access -_lock = SerializableLock() + """ + def __init__(self, *args, **kwargs): + """**Initialisation**""" + from ...functions import DeprecationError -class NetCDFArray(FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray): - """An array stored in a netCDF file.""" - - def __dask_tokenize__(self): - """Return a value fully representative of the object. - - .. versionadded:: 3.15.0 - - """ - return super().__dask_tokenize__() + (self.get_mask(),) - - @property - def _lock(self): - """Set the lock for use in `dask.array.from_array`. - - Returns a lock object because concurrent reads are not - currently supported by the netCDF-C library. The lock object - will be the same for all `NetCDFArray` instances, regardless - of the dataset they access, which means that access to all - netCDF files coordinates around the same lock. - - .. versionadded:: 3.14.0 - - """ - return _lock + raise DeprecationError( + f"{self.__class__.__name__} was deprecated at version NEXTVERSION " + "and is no longer available. Use cf.NetCDF4Array instead." 
+ ) diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py index a5e24ddb74..510b9c97ee 100644 --- a/cf/data/array/umarray.py +++ b/cf/data/array/umarray.py @@ -1,19 +1,15 @@ import cfdm -import numpy as np from ...constants import _stash2standard_name -from ...functions import ( - _DEPRECATION_ERROR_ATTRIBUTE, - get_subspace, - load_stash2standard_name, - parse_indices, -) +from ...functions import _DEPRECATION_ERROR_ATTRIBUTE, load_stash2standard_name from ...umread_lib.umfile import File, Rec from .abstract import Array -from .mixin import FileArrayMixin +from .mixin import FileArrayMixin, IndexMixin -class UMArray(FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array): +class UMArray( + IndexMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array +): """A sub-array stored in a PP or UM fields file.""" def __init__( @@ -25,8 +21,7 @@ def __init__( fmt=None, word_size=None, byte_ordering=None, - units=False, - calendar=False, + attributes=None, source=None, copy=True, ): @@ -61,16 +56,15 @@ def __init__( byte_ordering: `str`, optional ``'little_endian'`` or ``'big_endian'`` - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. + {{init attributes: `dict` or `None`, optional}} - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set during the first - `__getitem__` call. + During the first `__getitem__` call, any of the + ``_FillValue``, ``add_offset``, ``scale_factor``, + ``units``, and ``calendar`` attributes which haven't + already been set will be inferred from the lookup + header and cached for future use. + + .. versionadded:: NEXTVERSION {{init source: optional}} @@ -83,7 +77,7 @@ def __init__( Deprecated at version 3.14.0. header_offset: `int` - Deprecated at version 3.15.0. use the *address* + Deprecated at version 3.15.0. Use the *address* parameter instead. data_offset: `int`, optional @@ -92,6 +86,14 @@ def __init__( disk_length: `int`, optional Deprecated at version 3.15.0. + units: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + """ super().__init__(source=source, copy=copy) @@ -132,14 +134,9 @@ def __init__( byte_ordering = None try: - units = source._get_component("units", False) + attributes = source._get_component("attributes", None) except AttributeError: - units = False - - try: - calendar = source._get_component("calendar", False) - except AttributeError: - calendar = False + attributes = None if filename is not None: if isinstance(filename, str): @@ -159,8 +156,7 @@ def __init__( self._set_component("shape", shape, copy=False) self._set_component("dtype", dtype, copy=False) - self._set_component("units", units, copy=False) - self._set_component("calendar", calendar, copy=False) + self._set_component("attributes", attributes, copy=False) if fmt is not None: self._set_component("fmt", fmt, copy=False) @@ -174,75 +170,60 @@ def __init__( # By default, close the UM file after data array access self._set_component("close", True, copy=False) - def __getitem__(self, indices): - """Return a subspace of the array. + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + .. 
versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} - x.__getitem__(indices) <==> x[indices] + :Returns: - Returns a subspace of the array as an independent numpy array. + `numpy.ndarray` + The subspace. """ + # Note: No need to lock the UM file - concurrent reads are OK. + + if index is None: + index = self.index() + f, header_offset = self.open() rec = self._get_rec(f, header_offset) int_hdr = rec.int_hdr real_hdr = rec.real_hdr - array = rec.get_data().reshape(self.shape) + array = rec.get_data().reshape(self.original_shape) self.close(f) del f, rec - if indices is not Ellipsis: - indices = parse_indices(array.shape, indices) - array = get_subspace(array, indices) - - # Set the units, if they haven't been set already. - self._set_units(int_hdr) - - LBUSER2 = int_hdr.item(38) - if LBUSER2 == 3: - # Return the numpy array now if it is a boolean array - self._set_component("dtype", np.dtype(bool), copy=False) - return array.astype(bool) - - integer_array = LBUSER2 == 2 - - # ------------------------------------------------------------ - # Convert to a masked array - # ------------------------------------------------------------ - # Set the fill_value from BMDI - fill_value = real_hdr.item(17) - if fill_value != -1.0e30: - # -1.0e30 is the flag for no missing data - if integer_array: - # The fill_value must be of the same type as the data - # values - fill_value = int(fill_value) - - # Mask any missing values - mask = array == fill_value - if mask.any(): - array = np.ma.masked_where(mask, array, copy=False) - - # ------------------------------------------------------------ - # Unpack the array using the scale_factor and add_offset, if - # either is available - # ------------------------------------------------------------ - # Treat BMKS as a scale_factor if it is neither 0 nor 1 - scale_factor = real_hdr.item(18) - if scale_factor != 1.0 and scale_factor != 0.0: - if integer_array: - scale_factor = int(scale_factor) - - array *= scale_factor - - # Treat BDATUM as an add_offset if it is not 0 - add_offset = real_hdr.item(4) - if add_offset != 0.0: - if integer_array: - add_offset = int(add_offset) + # Set the netCDF attributes for the data + attributes = self.get_attributes({}) + self._set_units(int_hdr, attributes) + self._set_FillValue(int_hdr, real_hdr, attributes) + self._set_unpack(int_hdr, real_hdr, attributes) + self._set_component("attributes", attributes, copy=False) + + # Get the data subspace, applying any masking and unpacking + array = cfdm.netcdf_indexer( + array, + mask=True, + unpack=True, + always_masked_array=False, + orthogonal_indexing=True, + attributes=attributes, + copy=False, + ) + array = array[index] - array += add_offset + if int_hdr.item(38) == 3: + # Convert the data to a boolean array + array = array.astype(bool) # Set the data type self._set_component("dtype", array.dtype, copy=False) @@ -289,68 +270,144 @@ def _get_rec(self, f, header_offset): # if r.hdr_offset == header_offset: # return r - def _set_units(self, int_hdr): - """The units and calendar properties. + def _set_FillValue(self, int_hdr, real_hdr, attributes): + """Set the ``_FillValue`` attribute. - These are set from inpection of the integer header, but only - if they have already not been defined, either during {{class}} - instantiation or by a previous call to `_set_units`. + .. versionadded:: NEXTVERSION + + :Parameters: + + int_hdr: `numpy.ndarray` + The integer header of the data. 
+ + real_header: `numpy.ndarray` + The real header of the data. + + attributes: `dict` + The dictionary in which to store the new + attributes. If a new attribute exists then + *attributes* is updated in-place. + + :Returns: + + `None + + """ + if "FillValue" in attributes: + return + + # Set the fill_value from BMDI + _FillValue = real_hdr.item(17) + if _FillValue != -1.0e30: + # -1.0e30 is the flag for no missing data + if int_hdr.item(38) == 2: + # Must have an integer _FillValue for integer data + _FillValue = int(_FillValue) + + attributes["_FillValue"] = _FillValue + + def _set_units(self, int_hdr, attributes): + """Set the ``units`` attribute. .. versionadded:: 3.14.0 + .. versionadded:: NEXTVERSION + :Parameters: int_hdr: `numpy.ndarray` The integer header of the data. + real_header: `numpy.ndarray` + The real header of the data. + + attributes: `dict` + The dictionary in which to store the new + attributes. If a new attribute exists then + *attributes* is updated in-place. + :Returns: - `tuple` - The units and calendar values, either of which may be - `None`. + `None` + + """ + if "units" in attributes: + return + + units = None + if not _stash2standard_name: + load_stash2standard_name() + + submodel = int_hdr.item(44) + stash = int_hdr.item(41) + records = _stash2standard_name.get((submodel, stash)) + if records: + LBSRCE = int_hdr.item(37) + version, source = divmod(LBSRCE, 10000) + if version <= 0: + version = 405.0 + + for ( + long_name, + units0, + valid_from, + valid_to, + standard_name, + cf_info, + condition, + ) in records: + if not self._test_version( + valid_from, valid_to, version + ) or not self._test_condition(condition, int_hdr): + continue + + units = units0 + break + + attributes["units"] = units + + def _set_unpack(self, int_hdr, real_hdr, attributes): + """Set the ``add_offset`` and ``scale_factor`` attributes. + + .. versionadded:: NEXTVERSION + + :Parameters: + + int_hdr: `numpy.ndarray` + The integer header of the data. + + real_header: `numpy.ndarray` + The real header of the data. + + attributes: `dict` + The dictionary in which to store the new + attributes. If any new attributes exist then + *attributes* is updated in-place. 
+ + :Returns: + + `None """ - units = self._get_component("units", False) - if units is False: - units = None - - if not _stash2standard_name: - load_stash2standard_name() - - submodel = int_hdr[44] - stash = int_hdr[41] - records = _stash2standard_name.get((submodel, stash)) - if records: - LBSRCE = int_hdr[37] - version, source = divmod(LBSRCE, 10000) - if version <= 0: - version = 405.0 - - for ( - long_name, - units0, - valid_from, - valid_to, - standard_name, - cf_info, - condition, - ) in records: - if not self._test_version( - valid_from, valid_to, version - ) or not self._test_condition(condition, int_hdr): - continue - - units = units0 - break - - self._set_component("units", units, copy=False) - - calendar = self._get_component("calendar", False) - if calendar is False: - calendar = None - self._set_component("calendar", calendar, copy=False) - - return units, calendar + if "scale_factor" not in attributes: + # Treat BMKS as a scale_factor if it is neither 0 nor 1 + scale_factor = real_hdr.item(18) + if scale_factor != 1.0 and scale_factor != 0.0: + if int_hdr.item(38) == 2: + # Must have an integer scale_factor for integer data + scale_factor = int(scale_factor) + + attributes["scale_factor"] = scale_factor + + if "add_offset" not in attributes: + # Treat BDATUM as an add_offset if it is not 0 + add_offset = real_hdr.item(4) + if add_offset != 0.0: + if int_hdr.item(38) == 2: + # Must have an integer add_offset for integer data + add_offset = int(add_offset) + + attributes["add_offset"] = add_offset def _test_condition(self, condition, int_hdr): """Return `True` if a field satisfies a condition for a STASH @@ -381,14 +438,14 @@ def _test_condition(self, condition, int_hdr): return True if condition == "true_latitude_longitude": - LBCODE = int_hdr[15] + LBCODE = int_hdr.item(15) # LBCODE 1: Unrotated regular lat/long grid # LBCODE 2 = Regular lat/lon grid boxes (grid points are # box centres) if LBCODE in (1, 2): return True elif condition == "rotated_latitude_longitude": - LBCODE = int_hdr[15] + LBCODE = int_hdr.item(15) # LBCODE 101: Rotated regular lat/long grid # LBCODE 102: Rotated regular lat/lon grid boxes (grid # points are box centres) @@ -686,7 +743,7 @@ def open(self): **Examples** >>> f.open() - (, 4) + (, 4) """ return super().open( diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py index 0de12360ea..ec720438f0 100644 --- a/cf/data/collapse/__init__.py +++ b/cf/data/collapse/__init__.py @@ -1 +1,2 @@ from .collapse import Collapse +from .collapse_active import active_reduction_methods diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py index 0220f3e17a..550c14959c 100644 --- a/cf/data/collapse/collapse.py +++ b/cf/data/collapse/collapse.py @@ -11,6 +11,47 @@ class Collapse(metaclass=DocstringRewriteMeta): """Container for functions that collapse dask arrays. + **Active storage reductions** + + A collapse method (such as `max`, `var`, etc.) 
will attempt to + make use of active storage reduction on an individual `dask` chunk + when all of the following conditions are met: + + * `cf.active_storage()` is True; + + * ``cf.active_storage_url()`` returns the URL of a valid active + storage server; + + * the `dask` chunk's data are defined by a netCDF-4 file on disk + (rather than in any other file format, or in memory); + + * it is possible to import the `activestorage.Active` class; + + * the method is one of those specified by + `cf.data.collapse.active_reduction_methods`; + + * the collapse is over all axes; + + * the collapse is unweighted; + + * the data are not numerically packed. + + If any of these conditions are not met then the `dask` chunk will + be collapsed "as usual", i.e. by retrieving the data to memory (if + it is not already there) and using the local client to perform the + collapse calculations. + + .. note:: The performance improvements from using active storage + operations will increase the closer, in a network sense, + the active storage server is to the data storage. If the + active storage server is sufficiently far away from the + data then it could even be faster and require less + energy to do non-active operation of the local client. + + See `cf.data.collapse.collapse_active.actify` and + `cf.data.collapse.collapse_active.active_chunk_function` for + further details. + .. versionadded:: 3.14.0 """ @@ -52,7 +93,7 @@ def max( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -80,7 +121,7 @@ def max( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -91,6 +132,7 @@ def max( from .dask_collapse import cf_max_agg, cf_max_chunk, cf_max_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_max_chunk check_input_dtype(a) @@ -113,7 +155,7 @@ def max_abs( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -141,7 +183,8 @@ def max_abs( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} + :Returns: @@ -163,7 +206,7 @@ def mean( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -193,7 +236,7 @@ def mean( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -204,6 +247,7 @@ def mean( from .dask_collapse import cf_mean_agg, cf_mean_chunk, cf_mean_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_mean_chunk check_input_dtype(a) @@ -228,7 +272,7 @@ def mean_abs( weights=None, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -258,7 +302,7 @@ def mean_abs( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -281,7 +325,7 @@ def mid_range( axis=None, dtype=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -309,7 +353,7 @@ def mid_range( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -324,6 +368,7 @@ def mid_range( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_range_chunk 
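In user code, the switches referred to above live in `cf.functions`. A minimal sketch of turning them on; the URL and file name are hypothetical, and the constants are assumed to follow the usual cf set-and-return calling convention::

    import cf

    cf.active_storage(True)
    cf.active_storage_url("https://activestorage.example.org")  # hypothetical

    # An unweighted collapse over all axes of netCDF-4 data in an S3
    # object store may now have its per-chunk reductions carried out on
    # the active storage server; otherwise it silently falls back to the
    # usual local computation.
    f = cf.read("s3://my-bucket/data.nc")[0]  # hypothetical dataset
    print(f.collapse("mean"))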
check_input_dtype(a, allowed="fi") @@ -346,7 +391,7 @@ def min( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -374,7 +419,7 @@ def min( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -385,6 +430,7 @@ def min( from .dask_collapse import cf_min_agg, cf_min_chunk, cf_min_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_min_chunk check_input_dtype(a) @@ -407,7 +453,7 @@ def min_abs( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -435,7 +481,7 @@ def min_abs( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -456,7 +502,7 @@ def range( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -484,7 +530,7 @@ def range( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -499,6 +545,7 @@ def range( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_range_chunk check_input_dtype(a, allowed="fi") @@ -522,7 +569,7 @@ def rms( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -552,7 +599,8 @@ def rms( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} + :Returns: @@ -563,6 +611,7 @@ def rms( from .dask_collapse import cf_mean_combine, cf_rms_agg, cf_rms_chunk if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_rms_chunk check_input_dtype(a) @@ -586,7 +635,7 @@ def sample_size( a, axis=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -614,7 +663,7 @@ def sample_size( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -629,6 +678,7 @@ def sample_size( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_sample_size_chunk check_input_dtype(a) @@ -652,7 +702,7 @@ def sum( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -682,7 +732,7 @@ def sum( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -693,6 +743,7 @@ def sum( from .dask_collapse import cf_sum_agg, cf_sum_chunk, cf_sum_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_sum_chunk check_input_dtype(a) @@ -720,7 +771,7 @@ def sum_of_weights( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -750,7 +801,7 @@ def sum_of_weights( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -765,6 +816,7 @@ def sum_of_weights( ) if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_sum_of_weights_chunk check_input_dtype(a) @@ -789,7 +841,7 @@ def sum_of_weights2( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, split_every=None, chunk_function=None, ): @@ -819,7 +871,7 @@ def 
sum_of_weights2( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -830,17 +882,18 @@ def sum_of_weights2( from .dask_collapse import ( cf_sum_agg, cf_sum_combine, - cf_sum_of_weights_chunk, + cf_sum_of_weights2_chunk, ) if chunk_function is None: - chunk_function = cf_sum_of_weights_chunk + # Default function for chunk calculations + chunk_function = cf_sum_of_weights2_chunk check_input_dtype(a) dtype = double_precision_dtype(weights, default="i8") return reduction( a, - partial(chunk_function, square=True), + chunk_function, partial(cf_sum_agg, mtol=mtol, original_shape=a.shape), axis=axis, keepdims=keepdims, @@ -864,7 +917,7 @@ def unique(self, a, split_every=None, chunk_function=None): {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} :Returns: @@ -875,6 +928,7 @@ def unique(self, a, split_every=None, chunk_function=None): from .dask_collapse import cf_unique_agg, cf_unique_chunk if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_unique_chunk check_input_dtype(a, "fibUS") @@ -905,7 +959,7 @@ def var( axis=None, weights=None, keepdims=False, - mtol=None, + mtol=1, ddof=None, split_every=None, chunk_function=None, @@ -938,7 +992,12 @@ def var( {{split_every: `int` or `dict`, optional}} - {{chunk_function: callable, optional}} + {{chunk_function: callable or `None`, optional}} + + A callable function must accept a *ddof* keyword + parameter that sets the delta degrees of freedom. See + `cf.data.collapse.dask_collapse.cf_var_chunk` for + details. :Returns: @@ -949,6 +1008,7 @@ def var( from .dask_collapse import cf_var_agg, cf_var_chunk, cf_var_combine if chunk_function is None: + # Default function for chunk calculations chunk_function = cf_var_chunk check_input_dtype(a) diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py new file mode 100644 index 0000000000..db8fc277f6 --- /dev/null +++ b/cf/data/collapse/collapse_active.py @@ -0,0 +1,286 @@ +import datetime +import logging +import time +from functools import wraps +from numbers import Integral + +try: + from activestorage import Active +except ModuleNotFoundError: + pass + +from ...functions import ( + active_storage, + active_storage_max_requests, + active_storage_url, + is_log_level_info, +) + +logger = logging.getLogger(__name__) + +# -------------------------------------------------------------------- +# Specify which reduction methods are possible with active storage +# -------------------------------------------------------------------- +active_reduction_methods = ("max", "mean", "min", "sum") + + +class ActiveStorageError(Exception): + pass + + +def actify(method): + """Decorator for active storage reductions on chunks. + + Intended to decorate the ``cf_*_chunk`` methods in + `cf.data.collapse.dask_collapse`. + + When a ``cf_*_chunk`` method is decorated, its computations will + be attempted in active storage. If that is not possible (due to + configuration settings, limitations on the type of reduction that + can be done in active storage, or the active storage reduction + failed) then the computations will be done locally "as usual". + + .. versionadded:: NEXTVERSION + + .. seealso:: `active_chunk_function` + + :Parameters: + + method: `str` + The name of the reduction method. 
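The decorator is applied to the ``cf_*_chunk`` functions later in this patch (e.g. ``@actify("mean")``). A wrapped chunk function keeps its signature, and its body is only reached when the active storage path is unavailable or fails. A minimal sketch with a hypothetical chunk function::

    import numpy as np
    from cf.data.collapse.collapse_active import actify

    @actify("max")
    def my_max_chunk(x, dtype=None, computing_meta=False, **kwargs):
        # Local fallback, reached only when the active storage reduction
        # is not attempted or does not succeed. (Sketch: ignores any
        # mask, and the axis/keepdims keywords supplied by dask.)
        if computing_meta:
            return x

        return {"N": x.size, "max": np.max(x)}

    # With active storage unavailable this falls back to the local body
    print(my_max_chunk(np.arange(5.0)))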
+ + """ + + def decorator(chunk_function): + @wraps(chunk_function) + def wrapper(*args, **kwargs): + try: + # Try doing an active storage reduction + out = active_chunk_function(method, *args, **kwargs) + except ActiveStorageError as error: + # The active storage reduction failed + logger.warning( + f"{error} => reverting to local computation" + ) # pragma: no cover + else: + if out is not None: + # The active storage reduction succeeded + return out + + # Still here? Then using active storage was not + # appropriate, or else doing the active storage operation + # failed => do a local computation. + return chunk_function(*args, **kwargs) + + return wrapper + + return decorator + + +def active_chunk_function(method, *args, **kwargs): + """Collapse data in a chunk with active storage. + + Called by the `actify` decorator function. + + If an active storage reduction is not appropriate then `None` is + returned. + + If the active storage operation fails then an ActiveStorageError + is raised. + + If the active storage operation is successful then a dictionary of + reduction components, similar to that returned by a ``cf_*_chunk`` + method, is returned. + + .. versionadded:: NEXTVERSION + + .. seealso:: `actify` + + :Parameters: + + method: `str` + The name of the reduction method (e.g. ``'mean'``). + + x: array_like + The data to be collapsed. + + kwargs: optional + Extra keyword arguments that define the reduction. + + :Returns: + + `dict` or `None` + The reduced data in component form, or `None` if an active + storage reduction is not appropriate. + + **Examples** + + >>> d = active_chunk_function('sum', x) + >>> print(d) + {'N': 7008, 'sum': 7006221.66903949} + + Active storage reduction is not (yet) possible for variances: + + >>> d = active_chunk_function('variance', x, weights) + >>> print(d) + None + + """ + x = args[0] + + # Dask reduction machinery + if kwargs.get("computing_meta"): + return x + + # ---------------------------------------------------------------- + # Return None if active storage reduction is not appropriate. + # Inside `actify`, this will trigger a local reduction to be + # carried out instead. + # ---------------------------------------------------------------- + if not active_storage(): + # Active storage is turned off + return + + url = kwargs.get("active_storage_url") + if url is None: + url = active_storage_url().value + if url is None: + # Active storage is not possible when no active storage + # server URL has been provided + return + + if method not in active_reduction_methods: + # Active storage is not (yet) available for this method + return + + if not getattr(x, "active_storage", False): + # The data object 'x' is incompatible with active storage + # operations. E.g. it is a UMArray object, a numpy array, etc. + return + + if len(args) == 2: + # Weights, if present, are always passed in as a positional + # parameter, never as a keyword parameter (see + # `dask.array.reductions.reduction` for details). + weights = args[1] + if weights is not None: + # Active storage is not (yet) allowed for weighted + # reductions + return + + axis = kwargs.get("axis") + if axis is not None: + if isinstance(axis, Integral): + axis = (axis,) + + if len(axis) < x.ndim: + # Active storage is not (yet) allowed for reductions over + # a subset of the axes + return + + # ---------------------------------------------------------------- + # Still here? Set up an Active instance that will carry out the + # active storage operation. 
If the operation fails, for any + # reason, then this will trigger (inside `actify`) a local + # reduction being carried out instead. + # ---------------------------------------------------------------- + filename = x.get_filename() + address = x.get_address() + max_requests = active_storage_max_requests() + active_kwargs = { + "uri": "/".join(filename.split("/")[3:]), + "ncvar": address, + "storage_options": x.get_storage_options(), + "active_storage_url": url, + "storage_type": "s3", + "max_threads": max_requests, + } + # WARNING: The "uri", "storage_options", and "storage_type" keys + # of the `active_kwargs` dictionary are currently + # formatted according to the whims of the `Active` class + # (i.e. the pyfive branch of PyActiveStorage). Future + # versions of `Active` will have a better API, that will + # require improvements to `active_kwargs`. + + index = x.index() + + details = ( + f"{method!r} (file={filename}, address={address}, url={url}, " + f"Dask chunk={index})" + ) + + info = is_log_level_info(logger) + if info: + # Do some detailed logging + start = time.time() + logger.info( + f"STARTED active storage {details}: {datetime.datetime.now()}" + ) # pragma: no cover + + active = Active(**active_kwargs) + active.method = method + active.components = True + + # Instruct the `Active` class to attempt an active storage + # reduction on the remote server + # + # WARNING: The `_version` API of `Active` is likely to change from + # the current version (i.e. the pyfive branch of + # PyActiveStorage) + active._version = 2 + + # ---------------------------------------------------------------- + # Execute the active storage operation by indexing the Active + # instance + # ---------------------------------------------------------------- + try: + d = active[index] + except Exception as error: + # Something went wrong with the active storage operations => + # Raise an ActiveStorageError that will in turn trigger + # (inside `actify`) a local reduction to be carried out + # instead. + raise ActiveStorageError( + f"FAILED in active storage {details} ({error}))" + ) + else: + # Active storage reduction was successful + if info: + # Do some detailed logging + try: + md = active.metric_data + except AttributeError: + logger.info( + f"FINISHED active storage {details}: " + f"{time.time() - start:6.2f}s" + ) # pragma: no cover + else: + logger.info( + f"FINISHED active storage {details}: " + f"dataset chunks: {md['dataset chunks']}, " + f"load nc (s): {md['load nc time']:6.2f}, " + f"indexing (s): {md['indexing time (s)']:6.2f}, " + f"reduction (s): {md['reduction time (s)']:6.2f}, " + f"selection 2 (s): {md['selection 2 time (s)']:6.2f}, " + f"Total: {(time.time() - start):6.2f}s" + ) # pragma: no cover + + # ---------------------------------------------------------------- + # Active storage reduction was a success. Reformat the resulting + # components dictionary 'd' to match the output of the + # corresponding local chunk function (e.g. `cf_mean_chunk`). 
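The ``STARTED``/``FINISHED`` messages above are emitted at the ``INFO`` logging level, so the simplest way to see the active storage timings is to raise the cf log level (sketch)::

    import cf

    cf.log_level("INFO")
    # Subsequent collapses now log, for each dask chunk, whether an
    # active storage reduction was attempted and how long it took.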
+ # ---------------------------------------------------------------- + if method == "max": + # Local chunk function `cf_max_chunk` + d = {"N": d["n"], "max": d["max"]} + elif method == "mean": + # Local chunk function `cf_mean_chunk` + d = {"N": d["n"], "sum": d["sum"], "V1": d["n"], "weighted": False} + elif method == "min": + # Local chunk function `cf_min_chunk` + d = {"N": d["n"], "min": d["min"]} + elif method == "sum": + # Local chunk function `cf_sum_chunk` + d = {"N": d["n"], "sum": d["sum"]} + + return d diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py index 05f83ce077..9476ffcb0b 100644 --- a/cf/data/collapse/dask_collapse.py +++ b/cf/data/collapse/dask_collapse.py @@ -1,7 +1,8 @@ """Reduction functions intended to be passed to be dask. -Most of these functions are expected to be set as *chunk*, *combine* and -*aggregate* parameters of `dask.array.reduction` +Most of these functions are expected to be passed to +`dask.array.reduction` as its *chunk*, *combine* and *aggregate* +parameters. """ @@ -15,6 +16,8 @@ from dask.core import flatten from dask.utils import deepmap +from ..dask_utils import cf_asanyarray +from .collapse_active import actify from .collapse_utils import double_precision_dtype @@ -37,16 +40,15 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape): mtol: number The sample size threshold below which collapsed values are set to missing data. It is defined as a fraction (between - 0 and 1 inclusive) of the contributing input data values. + 0 and 1 inclusive) of the contributing input data + values. A missing value in the output array occurs + whenever more than ``100*mtol%`` of its contributing input + array elements are missing data. - The default of *mtol* is 1, meaning that a missing datum + The default of *mtol* is 1, meaning that a missing value in the output array occurs whenever all of its contributing input array elements are missing data. - For other values, a missing datum in the output array - occurs whenever more than ``100*mtol%`` of its - contributing input array elements are missing data. - Note that for non-zero values of *mtol*, different collapsed elements may have different sample sizes, depending on the distribution of missing data in the input @@ -126,7 +128,8 @@ def sum_weights_chunk( N = cf_sample_size_chunk(x, **kwargs)["N"] return N - elif check_weights: + + if check_weights: w_min = weights.min() if w_min <= 0: raise ValueError( @@ -227,6 +230,7 @@ def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs): # -------------------------------------------------------------------- # mean # -------------------------------------------------------------------- +@actify("mean") def cf_mean_chunk( x, weights=None, @@ -237,8 +241,13 @@ def cf_mean_chunk( ): """Chunk calculations for the mean. - This function is passed to `dask.array.reduction` as its *chunk* - parameter. + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + Weights are interpreted as reliability weights, as opposed to + frequency weights. See + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + for details. .. 
versionadded:: 3.14.0 @@ -267,11 +276,15 @@ def cf_mean_chunk( if computing_meta: return x + x = cf_asanyarray(x) + if weights is not None: + weights = cf_asanyarray(weights) + # N, sum - d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs) + d = cf_sum_chunk(x, weights=weights, dtype=dtype, **kwargs) d["V1"] = sum_weights_chunk( - x, weights, N=d["N"], check_weights=False, **kwargs + x, weights=weights, N=d["N"], check_weights=False, **kwargs ) d["weighted"] = weights is not None @@ -363,6 +376,7 @@ def cf_mean_agg( # -------------------------------------------------------------------- # maximum # -------------------------------------------------------------------- +@actify("max") def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the maximum. @@ -381,12 +395,14 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs): Dictionary with the keys: * N: The sample size. - * max: The maximum of `x``. + * max: The maximum of ``x``. """ if computing_meta: return x + x = cf_asanyarray(x) + return { "max": chunk.max(x, **kwargs), "N": cf_sample_size_chunk(x, **kwargs)["N"], @@ -514,6 +530,7 @@ def cf_mid_range_agg( # -------------------------------------------------------------------- # minimum # -------------------------------------------------------------------- +@actify("min") def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the minimum. @@ -538,6 +555,8 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x + x = cf_asanyarray(x) + return { "min": chunk.min(x, **kwargs), "N": cf_sample_size_chunk(x, **kwargs)["N"], @@ -617,6 +636,7 @@ def cf_min_agg( # -------------------------------------------------------------------- # range # -------------------------------------------------------------------- +@actify("range") def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the range. @@ -636,12 +656,14 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs): * N: The sample size. * min: The minimum of ``x``. - * max: The maximum of ``x`. + * max: The maximum of ``x``. """ if computing_meta: return x + x = cf_asanyarray(x) + # N, max d = cf_max_chunk(x, **kwargs) @@ -727,12 +749,18 @@ def cf_range_agg( # -------------------------------------------------------------------- # root mean square # -------------------------------------------------------------------- +@actify("rms") def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): """Chunk calculations for the root mean square (RMS). This function is passed to `dask.array.reduction` as its *chunk* parameter. + Weights are interpreted as reliability weights, as opposed to + frequency weights. See + https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights + for details. + .. versionadded:: 3.14.0 :Parameters: @@ -751,6 +779,8 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs): if computing_meta: return x + x = cf_asanyarray(x) + return cf_mean_chunk( np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs ) @@ -803,6 +833,7 @@ def cf_rms_agg( # -------------------------------------------------------------------- # sample size # -------------------------------------------------------------------- +@actify("sample_size") def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): """Chunk calculations for the sample size. 
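The *mtol* default of ``1`` (changed from ``None`` throughout `Collapse`) means that a collapsed value is only masked when every contributing element is missing. A quick illustration at the `cf.Data` level, assuming that `cf.Data.mean` forwards *mtol* to these routines::

    import numpy as np
    import cf

    d = cf.Data(
        np.ma.array([1.0, 2.0, 3.0, 4.0], mask=[True, True, True, False])
    )

    print(d.mean())          # mtol=1: only the unmasked 4.0 contributes
    print(d.mean(mtol=0.5))  # masked: 75% of the inputs are missing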
@@ -826,6 +857,8 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs): if computing_meta: return x + x = cf_asanyarray(x) + if np.ma.isMA(x): N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs) else: @@ -913,6 +946,7 @@ def cf_sample_size_agg( # -------------------------------------------------------------------- # sum # -------------------------------------------------------------------- +@actify("sum") def cf_sum_chunk( x, weights=None, @@ -951,7 +985,10 @@ def cf_sum_chunk( if computing_meta: return x + x = cf_asanyarray(x) + if weights is not None: + weights = cf_asanyarray(weights) if check_weights: w_min = weights.min() if w_min <= 0: @@ -1044,8 +1081,9 @@ def cf_sum_agg( # -------------------------------------------------------------------- # sum of weights # -------------------------------------------------------------------- +@actify("sum_of_weights") def cf_sum_of_weights_chunk( - x, weights=None, dtype="f8", computing_meta=False, square=False, **kwargs + x, weights=None, dtype="f8", computing_meta=False, **kwargs ): """Chunk calculations for the sum of the weights. @@ -1054,9 +1092,50 @@ def cf_sum_of_weights_chunk( :Parameters: - square: `bool`, optional - If True then calculate the sum of the squares of the - weights. + See `dask.array.reductions` for details of the other + parameters. + + :Returns: + + `dict` + Dictionary with the keys: + + * N: The sample size. + * sum: The sum of ``weights``. + + """ + if computing_meta: + return x + + x = cf_asanyarray(x) + if weights is not None: + weights = cf_asanyarray(weights) + + # N + d = cf_sample_size_chunk(x, **kwargs) + + d["sum"] = sum_weights_chunk( + x, weights=weights, square=False, N=d["N"], **kwargs + ) + + return d + + +# -------------------------------------------------------------------- +# sum of squares of weights +# -------------------------------------------------------------------- +@actify("sum_of_weights2") +def cf_sum_of_weights2_chunk( + x, weights=None, dtype="f8", computing_meta=False, **kwargs +): + """Chunk calculations for the sum of the squares of the weights. + + This function is passed to `dask.array.reduction` as its *chunk* + parameter. + + .. versionadded:: NEXTVERSION + + :Parameters: See `dask.array.reductions` for details of the other parameters. @@ -1067,18 +1146,21 @@ def cf_sum_of_weights_chunk( Dictionary with the keys: * N: The sample size. - * sum: The sum of ``weights``, or the sum of - ``weights**2`` if *square* is True. + * sum: The sum of the squares of ``weights``. """ if computing_meta: return x + x = cf_asanyarray(x) + if weights is not None: + weights = cf_asanyarray(weights) + # N d = cf_sample_size_chunk(x, **kwargs) d["sum"] = sum_weights_chunk( - x, weights=weights, square=square, N=d["N"], **kwargs + x, weights=weights, square=True, N=d["N"], **kwargs ) return d @@ -1087,6 +1169,7 @@ def cf_sum_of_weights_chunk( # -------------------------------------------------------------------- # unique # -------------------------------------------------------------------- +@actify("unique") def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): """Chunk calculations for the unique values. 
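With the *square* keyword removed, the sum of the weights and the sum of the squared weights now come from two separate chunk functions. Called directly on a small array (a sketch; in practice dask supplies extra keyword arguments such as *axis* and *keepdims*)::

    import numpy as np
    from cf.data.collapse.dask_collapse import (
        cf_sum_of_weights2_chunk,
        cf_sum_of_weights_chunk,
    )

    x = np.array([1.0, 2.0, 3.0])
    w = np.array([1.0, 2.0, 4.0])

    print(cf_sum_of_weights_chunk(x, weights=w)["sum"])   # 1 + 2 + 4 = 7
    print(cf_sum_of_weights2_chunk(x, weights=w)["sum"])  # 1 + 4 + 16 = 21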
@@ -1110,6 +1193,8 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs): if computing_meta: return x + x = cf_asanyarray(x) + return {"unique": np.unique(x)} @@ -1148,18 +1233,37 @@ def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs): # -------------------------------------------------------------------- # variance # -------------------------------------------------------------------- +@actify("var") def cf_var_chunk( x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs ): - """Chunk calculations for the variance. + r"""Chunk calculations for the variance. This function is passed to `dask.array.reduction` as its *chunk* parameter. + For non-overlapping data sets, X_{i}, making up the aggregate data + set X=\bigcup _{i}X_{i}, the unweighted variance \sigma ^{2} is + + \mu &={\frac {1}{\sum _{i}{N_{X_{i}}}}}\left(\sum + _{i}{N_{X_{i}}\mu _{X_{i}}}\right) + + \sigma ^{2}&={\sqrt {{\frac {1}{\sum + _{i}{N_{X_{i}}-ddof}}}\left(\sum _{i}{\left[(N_{X_{i}}-1)\sigma + _{X_{i}}^{2}+N_{X_{i}}\mu _{X_{i}}^{2}\right]}-\left[\sum + _{i}{N_{X_{i}}}\right]\mu _{X}^{2}\right)}} + + where X_{i}\cap X_{j}=\emptyset , \forall i>> a = np.array([[1, 2, 3]]) + >>> print(cf.data.dask_utils.cf_filled(a, -999)) + [[1 2 3]] + >>> a = np.ma.array([[1, 2, 3]], mask=[[True, False, False]]) + >>> print(cf.data.dask_utils.cf_filled(a, -999)) + [[-999 2 3]] + + """ + a = cf_asanyarray(a) + return np.ma.filled(a, fill_value=fill_value) + + +def cf_asanyarray(a): + """Convert to a `numpy` array. + + Only do this if the input *a* has an `__asanyarray__` attribute + with value True. + + .. versionadded:: NEXTVERSION + + :Parameters: + + a: array_like + The array. + + :Returns: + + The array converted to a `numpy` array, or the input array + unchanged if ``a.__asanyarray__`` False. + + """ + if getattr(a, "__asanyarray__", False): + return np.asanyarray(a) + + return a diff --git a/cf/data/data.py b/cf/data/data.py index fd84d14bb4..61abf86bc7 100644 --- a/cf/data/data.py +++ b/cf/data/data.py @@ -13,7 +13,6 @@ import numpy as np from cfdm import is_log_level_info from dask import compute, delayed # noqa: F401 -from dask.array import Array from dask.array.core import normalize_chunks from dask.base import collections_to_dsk, is_dask_collection, tokenize from dask.highlevelgraph import HighLevelGraph @@ -44,11 +43,15 @@ from ..units import Units from .collapse import Collapse from .creation import generate_axis_identifiers, to_dask + from .dask_utils import ( _da_ma_allclose, + cf_asanyarray, cf_contains, cf_dt2rt, + cf_filled, cf_harden_mask, + cf_is_masked, cf_percentile, cf_rt2dt, cf_soften_mask, @@ -370,16 +373,22 @@ def __init__( super().__init__( source=source, _use_array=_use_array and array is not None ) - if _use_array: try: - array = source.to_dask_array() + array = source.to_dask_array(_asanyarray=False) except (AttributeError, TypeError): - pass + try: + array = source.to_dask_array() + except (AttributeError, TypeError): + pass + else: + self._set_dask(array, copy=copy, clear=_NONE) else: - self._set_dask(array, copy=copy, clear=_NONE) + self._set_dask( + array, copy=copy, clear=_NONE, asanyarray=None + ) else: - self._del_dask(None) + self._del_dask(None, clear=_NONE) # Set the mask hardness self.hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK) @@ -427,6 +436,7 @@ def __init__( return # Still here? Then create a dask array and store it. 
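`cf_asanyarray` only materialises objects that opt in via a truthy ``__asanyarray__`` attribute; anything else passes through untouched. A small sketch with a hypothetical lazy stand-in::

    import numpy as np
    from cf.data.dask_utils import cf_asanyarray

    class LazyChunk:
        """Stand-in for a file-backed chunk that wants conversion."""

        __asanyarray__ = True

        def __array__(self, *args, **kwargs):
            return np.arange(3.0)

    print(cf_asanyarray(LazyChunk()))     # converted: [0. 1. 2.]
    print(cf_asanyarray(np.arange(3.0)))  # a numpy array is returned as-is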
+ custom = self._custom # Find out if the input data is compressed by convention try: @@ -440,17 +450,18 @@ def __init__( "for compressed input arrays" ) + # Bring the compressed data into memory without + # decompressing it if to_memory: try: array = array.to_memory() except AttributeError: pass - try: - array.get_filenames() - except AttributeError: - pass - else: + if self._is_abstract_Array_subclass(array): + # Save the input array in case it's useful later. For + # compressed input arrays this will contain extra + # information, such as a count or index variable. self._set_Array(array) # Cast the input data as a dask array @@ -463,7 +474,20 @@ def __init__( # Set whether or not we're sure that the Data instance has a # deterministic name - self._custom["deterministic"] = not is_dask_collection(array) + is_dask = is_dask_collection(array) + custom["deterministic"] = not is_dask + + # Set whether or not to call `np.asanyarray` on chunks to + # convert them to numpy arrays. + if is_dask: + # We don't know what's in the dask array, so we should + # assume that it might need converting to a numpy array. + custom["__asanyarray__"] = True + else: + # Use the array's __asanyarray__ value, if it has one. + custom["__asanyarray__"] = bool( + getattr(array, "__asanyarray__", False) + ) dx = to_dask(array, chunks, **kwargs) @@ -487,7 +511,7 @@ def __init__( self._Units = units # Store the dask array - self._set_dask(dx, clear=_NONE) + self._set_dask(dx, clear=_NONE, asanyarray=None) # Override the data type if dtype is not None: @@ -621,9 +645,13 @@ def __contains__(self, value): # are incompatible return False - value = value.to_dask_array() + # 'cf_contains' has its own calls to 'cf_asanyarray', so + # we can set '_asanyarray=False'. + value = value.to_dask_array(_asanyarray=False) - dx = self.to_dask_array() + # 'cf_contains' has its own calls to 'cf_asanyarray', so we + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind @@ -756,7 +784,9 @@ def __len__(self): TypeError: len() of unsized object """ - dx = self.to_dask_array() + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data len: Performance may be degraded") dx.compute_chunk_sizes() @@ -874,10 +904,10 @@ def __getitem__(self, indices): new = self.roll( axis=tuple(roll.keys()), shift=tuple(roll.values()) ) - dx = new.to_dask_array() + dx = new.to_dask_array(_asanyarray=False) else: - new = self.copy(array=False) - dx = self.to_dask_array() + new = self.copy() + dx = self.to_dask_array(_asanyarray=False) # ------------------------------------------------------------ # Subspace the dask array @@ -923,8 +953,12 @@ def __getitem__(self, indices): # ------------------------------------------------------------ # Set the subspaced dask array + # + # * A subspaced chunk might not result in an array in memory, + # so we set asanyarray=True to ensure that, if required, + # they are converted at compute time. 
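Whether the chunks of a `Data` object still need converting is now recorded on the instance and propagates through operations, following the logic added here. For instance::

    import numpy as np
    import cf

    d = cf.Data(np.arange(6.0).reshape(2, 3))
    print(d.__asanyarray__)  # False: the chunks are already numpy arrays

    e = d[0, :]
    print(e.__asanyarray__)  # True: subspacing defers any conversion
                             # until compute time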
# ------------------------------------------------------------ - new._set_dask(dx) + new._set_dask(dx, asanyarray=True) # ------------------------------------------------------------ # Get the axis identifiers for the subspace @@ -1117,6 +1151,7 @@ def __setitem__(self, indices, value): # Do the assignment self._set_subspace(dx, indices, value) + self._set_dask(dx) # Unroll any axes that were rolled to enable a cyclic # assignment @@ -1136,15 +1171,23 @@ def __setitem__(self, indices, value): self[indices] = reset - # Remove elements made invalid by updating the `dask` array - # in-place - self._clear_after_dask_update(_ALL) - return - # ---------------------------------------------------------------- - # Indexing behaviour attributes - # ---------------------------------------------------------------- + @property + def __asanyarray__(self): + """Whether or not chunks need conversion to `numpy` arrays. + + .. versionadded:: NEXTVERSION + + ..seealso:: `to_dask_array`, `todict`, `_set_dask` + + :Returns: + + `bool` + + """ + return self._custom.get("__asanyarray__", True) + @property def __orthogonal_indexing__(self): """Flag to indicate that orthogonal indexing is supported. @@ -1323,9 +1366,6 @@ def _clear_after_dask_update(self, clear=_ALL): * If ``clear & _CFA`` is non-zero then the CFA write status is set to `False`. - * If ``clear`` is non-zero then the CFA term status is - set to `False`. - By default *clear* is the ``_ALL`` integer-valued constant, which results in all components being removed. @@ -1361,7 +1401,7 @@ def _clear_after_dask_update(self, clear=_ALL): # Set the CFA write status to False self._cfa_del_write() - def _set_dask(self, array, copy=False, clear=_ALL): + def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False): """Set the dask array. .. versionadded:: 3.14.0 @@ -1371,7 +1411,7 @@ def _set_dask(self, array, copy=False, clear=_ALL): :Parameters: - array: `dask.array.Array` + dx: `dask.array.Array` The array to be inserted. copy: `bool`, optional @@ -1384,12 +1424,18 @@ def _set_dask(self, array, copy=False, clear=_ALL): results in all components being removed. See `_clear_after_dask_update` for details. + asanyarray: `None` or `bool`, optional + If `None` then do nothing. Otherwise set the + `__asanyarray__` attribute to *asanyarray*. + + .. versionadded:: NEXTVERSION + :Returns: `None` """ - if array is NotImplemented: + if dx is NotImplemented: logger.warning( "WARNING: NotImplemented has been set in the place of a " "dask array." @@ -1402,16 +1448,20 @@ def _set_dask(self, array, copy=False, clear=_ALL): "suitability (such as data type casting, " "broadcasting, etc.). Note that the exception may be " "difficult to diagnose, as dask will have silently " - "trapped it and returned NotImplemented (for " - "instance, see dask.array.core.elemwise). Print " + "trapped it and returned NotImplemented (see, for " + "instance, dask.array.core.elemwise). Print " "statements in a local copy of dask are possibly the " "way to go if the cause of the error is not obvious." ) if copy: - array = array.copy() + dx = dx.copy() + + custom = self._custom + custom["dask"] = dx + if asanyarray is not None: + custom["__asanyarray__"] = bool(asanyarray) - self._custom["dask"] = array self._clear_after_dask_update(clear) def _del_dask(self, default=ValueError(), clear=_ALL): @@ -1513,6 +1563,20 @@ def _get_cached_elements(self): return cache.copy() + def _is_abstract_Array_subclass(self, array): + """Whether or not an array is a type of Array. 
+ + :Parameters: + + array: + + :Returns: + + `bool` + + """ + return isinstance(array, cfdm.Array) + def _set_cached_elements(self, elements): """Cache selected element values. @@ -1520,7 +1584,7 @@ def _set_cached_elements(self, elements): within its ``custom`` dictionary. .. warning:: Never change ``_custom['cached_elements']`` - in-place. + in-place. .. versionadded:: 3.14.0 @@ -2470,7 +2534,9 @@ def percentile( else: axes = tuple(sorted(d._parse_axes(axes))) - dx = d.to_dask_array() + # 'cf_percentile' has its own call to 'cf_asanyarray', so we + # can set '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) dtype = dx.dtype shape = dx.shape @@ -2535,7 +2601,7 @@ def percentile( name = name[0] graph = HighLevelGraph.from_collections(name, dsk, dependencies=[dx]) - dx = Array(graph, name, chunks=out_chunks, dtype=float) + dx = da.Array(graph, name, chunks=out_chunks, dtype=float) d._set_dask(dx) @@ -2586,11 +2652,9 @@ def persist(self, inplace=False): """ d = _inplace_enabled_define_and_cleanup(self) - dx = self.to_dask_array() dx = dx.persist() d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) - return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -2628,7 +2692,8 @@ def ceil(self, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array() - d._set_dask(da.ceil(dx)) + dx = da.ceil(dx) + d._set_dask(dx) return d def cfa_get_term(self): @@ -2753,7 +2818,8 @@ def compute(self): # noqa: F811 .. versionadded:: 3.14.0 - .. seealso:: `persist`, `array`, `datetime_array` + .. seealso:: `persist`, `array`, `datetime_array`, + `sparse_array` :Returns: @@ -2778,7 +2844,8 @@ def compute(self): # noqa: F811 [0., 0., 0.]]) """ - a = self.to_dask_array().compute() + dx = self.to_dask_array() + a = dx.compute() if np.ma.isMA(a): if self.hardmask: @@ -2966,8 +3033,8 @@ def convolution_filter( dx = d.to_dask_array() - # Cast to float to ensure that NaNs can be stored (as required - # by cf_convolve1d) + # Cast to float to ensure that NaNs can be stored (so + # map_overlap can correctly assign the halos) if dx.dtype != float: dx = dx.astype(float, copy=False) @@ -3153,9 +3220,13 @@ def rechunk( """ d = _inplace_enabled_define_and_cleanup(self) - dx = d.to_dask_array() + dx = d.to_dask_array(_asanyarray=False) dx = dx.rechunk(chunks, threshold, block_size_limit, balance) - d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE) + # Dask rechunking is essentially a wrapper for __getitem__ + # calls on the chunks, which means that we can use the same + # 'asanyarray' and 'clear' keyword values to `_set_dask` as + # are used in `__gettem__`. + d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True) return d @@ -3206,7 +3277,9 @@ def _asdatetime(self, inplace=False): ) if not d._isdatetime(): - dx = d.to_dask_array() + # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we + # can set '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object) d._set_dask(dx) @@ -3261,7 +3334,9 @@ def _asreftime(self, inplace=False): ) if d._isdatetime(): - dx = d.to_dask_array() + # 'cf_dt2rt' has its own call to 'cf_asanyarray', so we + # can set '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float) d._set_dask(dx) @@ -3871,7 +3946,9 @@ def _regrid( f"the shape of the regrid operator: {operator.src_shape}" ) - dx = self.to_dask_array() + # 'regrid' has its own calls to 'cf_asanyarray', so we can set + # '_asanyarray=False'. 
+ dx = self.to_dask_array(_asanyarray=False) # Rechunk so that each chunk contains data in the form # expected by the regrid operator, i.e. the regrid axes all @@ -3939,7 +4016,7 @@ def _regrid( ) # Create a regridding function to apply to each chunk - regrid_func = partial( + cf_regrid_func = partial( regrid, method=method, src_shape=src_shape, @@ -3959,7 +4036,7 @@ def _regrid( # github.com/pangeo-data/pangeo/issues/334#issuecomment-403787663 dx = dx.map_blocks( - regrid_func, + cf_regrid_func, weights_dst_mask=weights_dst_mask, ref_src_mask=src_mask, chunks=regridded_chunks, @@ -4113,8 +4190,11 @@ def concatenate( processed_data.append(data1) copied = not copy # to avoid making two copies in a given case - # Get data as dask arrays and apply concatenation operation - dxs = [d.to_dask_array() for d in processed_data] + # Get data as dask arrays and apply concatenation + # operation. We can set '_asanyarray=False' because at compute + # time the concatenation operation does not need to access the + # actual data. + dxs = [d.to_dask_array(_asanyarray=False) for d in processed_data] dx = da.concatenate(dxs, axis=axis) # Set the CFA write status @@ -4142,8 +4222,18 @@ def concatenate( cfa = _NONE break + # Define the __asanyarray__ status + asanyarray = processed_data[0].__asanyarray__ + for d in processed_data[1:]: + if d.__asanyarray__ != asanyarray: + # If and only if any two input Data objects have + # different __asanyarray__ values, then set + # asanyarray=True on the concatenation. + asanyarray = True + break + # Set the new dask array - data0._set_dask(dx, clear=_ALL ^ cfa) + data0._set_dask(dx, clear=_ALL ^ cfa, asanyarray=asanyarray) # Set appropriate cached elements cached_elements = {} @@ -4770,7 +4860,7 @@ def _axes(self, value): # ---------------------------------------------------------------- @property def chunks(self): - """The chunk sizes for each dimension. + """The `dask` chunk sizes for each dimension. .. versionadded:: 3.14.0 @@ -4787,7 +4877,9 @@ def chunks(self): 6 """ - return self.to_dask_array().chunks + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + return self.to_dask_array(_asanyarray=False).chunks # ---------------------------------------------------------------- # Attributes @@ -4840,10 +4932,12 @@ def Units(self, value): else: dtype = _dtype_float - func = partial(cf_units, from_units=old_units, to_units=value) + cf_func = partial(cf_units, from_units=old_units, to_units=value) - dx = self.to_dask_array() - dx = dx.map_blocks(func, dtype=dtype) + # 'cf_units' has its own call to 'cf_asanyarray', so we can + # set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) + dx = dx.map_blocks(cf_func, dtype=dtype) # Setting equivalent units doesn't affect the CFA write # status. Nor does it invalidate any cached values, but only @@ -4854,7 +4948,7 @@ def Units(self, value): cache = self._get_cached_elements() if cache: self._set_cached_elements( - {index: func(value) for index, value in cache.items()} + {index: cf_func(value) for index, value in cache.items()} ) self._Units = value @@ -4909,16 +5003,17 @@ def dtype(self): [1 2 3] """ - dx = self.to_dask_array() + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. 
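Because the units conversion is applied lazily with ``map_blocks``, while any cached first/last elements are converted eagerly, assigning to `Units` stays cheap. For example::

    import cf

    d = cf.Data([1000.0, 2000.0], units="m")
    d.Units = cf.Units("km")

    print(d.Units)  # km
    print(d.array)  # [1. 2.]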
+ dx = self.to_dask_array(_asanyarray=False) return dx.dtype @dtype.setter def dtype(self, value): - dx = self.to_dask_array() - # Only change the datatype if it's different to that of the # dask array - if dx.dtype != value: + if self.dtype != value: + dx = self.to_dask_array() dx = dx.astype(value) self._set_dask(dx) @@ -5023,18 +5118,15 @@ def is_masked(self): True """ - - def is_masked(a): - out = np.ma.is_masked(a) - return np.array(out).reshape((1,) * a.ndim) - - dx = self.to_dask_array() + # 'cf_is_masked' has its own call to 'cf_asanyarray', so we + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) out_ind = tuple(range(dx.ndim)) dx_ind = out_ind dx = da.blockwise( - is_masked, + cf_is_masked, out_ind, dx, dx_ind, @@ -5071,7 +5163,9 @@ def nbytes(self): 24 """ - dx = self.to_dask_array() + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data nbytes: Performance may be degraded") dx.compute_chunk_sizes() @@ -5105,7 +5199,9 @@ def ndim(self): 0 """ - dx = self.to_dask_array() + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) return dx.ndim @property @@ -5127,7 +5223,9 @@ def npartitions(self): 6 """ - return self.to_dask_array().npartitions + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + return self.to_dask_array(_asanyarray=False).npartitions @property def numblocks(self): @@ -5148,7 +5246,9 @@ def numblocks(self): 6 """ - return self.to_dask_array().numblocks + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + return self.to_dask_array(_asanyarray=False).numblocks @property def shape(self): @@ -5178,7 +5278,9 @@ def shape(self): () """ - dx = self.to_dask_array() + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) if math.isnan(dx.size): logger.debug("Computing data shape: Performance may be degraded") dx.compute_chunk_sizes() @@ -5217,7 +5319,9 @@ def size(self): 1 """ - dx = self.to_dask_array() + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. 
+ dx = self.to_dask_array(_asanyarray=False) size = dx.size if math.isnan(size): logger.debug("Computing data size: Performance may be degraded") @@ -5268,13 +5372,25 @@ def array(self): 2000-12-01 00:00:00 """ - array = self.compute().copy() - if issparse(array): - array = array.toarray() - elif not isinstance(array, np.ndarray): - array = np.asanyarray(array) + a = self.compute().copy() + if issparse(a): + a = a.toarray() + elif not isinstance(a, np.ndarray): + a = np.asanyarray(a) + + if not a.size: + return a + + # Set cached elements + items = [0, -1] + if a.ndim == 2 and a.shape[-1] == 2: + items.extend((1, -2)) + elif a.size == 3: + items.append(1) - return array + self._set_cached_elements({i: a.item(i) for i in items}) + + return a @property def datetime_array(self): @@ -5424,7 +5540,8 @@ def arctan(self, inplace=False): d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array() - d._set_dask(da.arctan(dx)) + dx = da.arctan(dx) + d._set_dask(dx) d.override_units(_units_radians, inplace=True) @@ -5579,7 +5696,8 @@ def arcsinh(self, inplace=False): d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array() - d._set_dask(da.arcsinh(dx)) + dx = da.arcsinh(dx) + d._set_dask(dx) d.override_units(_units_radians, inplace=True) @@ -6399,7 +6517,9 @@ def convert_reference_time( ) d.Units = units0 - dx = d.to_dask_array() + # 'cf_rt2dt' its own call to 'cf_asanyarray', so we can set + # '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) # Convert to the correct date-time objects dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object) @@ -6476,8 +6596,11 @@ def get_deterministic_name(self): raise ValueError() units = self._Units + + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. return tokenize( - self.to_dask_array().name, + self.to_dask_array(_asanyarray=False).name, units.formatted(definition=True, names=True), units._canonical_calendar, ) @@ -6542,7 +6665,10 @@ def get_filenames(self): """ out = set() - for a in self.todict().values(): + + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + for a in self.todict(_asanyarray=False).values(): try: out.update(a.get_filenames()) except AttributeError: @@ -6675,7 +6801,10 @@ def add_file_location(self, location): location = abspath(location).rstrip(sep) updated = False - dsk = self.todict() + + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + dsk = self.todict(_asanyarray=False) for key, a in dsk.items(): try: dsk[key] = a.add_file_location(location) @@ -6688,9 +6817,9 @@ def add_file_location(self, location): updated = True if updated: - dx = self.to_dask_array() + dx = self.to_dask_array(_asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE) + self._set_dask(dx, clear=_NONE, asanyarray=None) return location @@ -7672,7 +7801,8 @@ def cos(self, inplace=False, i=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.cos(dx)) + dx = da.cos(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -8115,7 +8245,9 @@ def unique(self, split_every=None): # in the result. d.soften_mask() - dx = d.to_dask_array() + # The applicable chunk function will have its own call to + # 'cf_asanyarray', so we can set '_asanyarray=False'. 
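As well as returning the result, the `array` property now refreshes the cached element values, which back fast element accessors such as `first_element`. A quick sketch::

    import cf

    d = cf.Data([[1.0, 2.0], [3.0, 4.0]])
    a = d.array               # computes, and caches selected elements

    print(d.first_element())  # 1.0
    print(d.last_element())   # 4.0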
+ dx = d.to_dask_array(_asanyarray=False) dx = Collapse().unique(dx, split_every=split_every) d._set_dask(dx) @@ -8376,7 +8508,6 @@ def equals( # Apply a (dask) logical 'and' to confirm if both the mask and the # data are equal for the pair of masked arrays: result = da.logical_and(data_comparison, mask_comparison) - if not result.compute(): if is_log_level_info(logger): logger.info( @@ -8419,7 +8550,8 @@ def exp(self, inplace=False, i=False): d.Units = _units_1 dx = d.to_dask_array() - d._set_dask(da.exp(dx)) + dx = da.exp(dx) + d._set_dask(dx) return d @@ -8478,6 +8610,13 @@ def insert_dimension(self, position=0, inplace=False): data_axes.insert(position, axis) d._axes = data_axes + # Update the HDF5 chunking strategy + chunksizes = d.nc_hdf5_chunksizes() + if isinstance(chunksizes, tuple): + chunksizes = list(chunksizes) + chunksizes.insert(position, 1) + d.nc_set_hdf5_chunksizes(chunksizes) + return d @_deprecated_kwarg_check("size", version="3.14.0", removed_at="5.0.0") @@ -8854,7 +8993,9 @@ def harden_mask(self): [1 -- 3] """ - dx = self.to_dask_array() + # 'cf_harden_mask' has its own call to 'cf_asanyarray', so we + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) self.hardmask = True @@ -8974,7 +9115,9 @@ def soften_mask(self): [ 1 999 3] """ - dx = self.to_dask_array() + # 'cf_soften_mask' has its own call to 'cf_asanyarray', so we + # can set '_asanyarray=False'. + dx = self.to_dask_array(_asanyarray=False) dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype) self._set_dask(dx, clear=_NONE) self.hardmask = False @@ -9003,7 +9146,9 @@ def file_locations(self): """ out = set() - for key, a in self.todict().items(): + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + for key, a in self.todict(_asanyarray=False).items(): try: out.update(a.file_locations()) except AttributeError: @@ -9061,8 +9206,10 @@ def filled(self, fill_value=None, inplace=False): f"data type {d.dtype.str!r}" ) - dx = d.to_dask_array() - dx = dx.map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype) + # 'cf_filled' has its own call to 'cf_asanyarray', so we can + # set '_asanyarray=False'. + dx = d.to_dask_array(_asanyarray=False) + dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype) d._set_dask(dx) return d @@ -9690,7 +9837,7 @@ def override_calendar(self, calendar, inplace=False, i=False): d._Units = Units(d.Units._units, calendar) return d - def to_dask_array(self, apply_mask_hardness=False): + def to_dask_array(self, apply_mask_hardness=False, _asanyarray=True): """Convert the data to a `dask` array. .. warning:: By default, the mask hardness of the returned @@ -9714,6 +9861,22 @@ def to_dask_array(self, apply_mask_hardness=False): If True then force the mask hardness of the returned array to be that given by the `hardmask` attribute. + _asanyarray: `bool`, optional + If True (the default) and the `__asanyarray__` + attribute is also `True`, then a `cf_asanyarray` + operation is added to the graph of the returned Dask + array. If False then this operation is not added. + + In general, setting *_asanyarray* to False should only + be done if it is known that a) the returned Dask array + is never going to be computed; or b) it is not + necessary to add a `cf_asanyarray` operation in lieu of + its functionality being implemented by a new Dask graph + layer that is going to be created at a later stage. 
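The same pattern of keeping the HDF5 chunking strategy in step with the data shape is applied below to `squeeze` and `transpose`. A sketch of the `insert_dimension` case, assuming that the chunk sizes are returned as a tuple::

    import numpy as np
    import cf

    d = cf.Data(np.zeros((100, 200)))
    d.nc_set_hdf5_chunksizes([50, 100])

    e = d.insert_dimension(0)
    print(e.nc_hdf5_chunksizes())  # (1, 50, 100): a size-1 chunk for the
                                   # new leading axis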
See + `cf.data.dask_utils.cf_asanyarray` for further details. + + .. versionadded:: NEXTVERSION + :Returns: `dask.array.Array` @@ -9736,16 +9899,25 @@ def to_dask_array(self, apply_mask_hardness=False): dask.array """ - if apply_mask_hardness and "dask" in self._custom: + dx = self._custom.get("dask") + if dx is None: + raise ValueError(f"{self.__class__.__name__} object has no data") + + if apply_mask_hardness: if self.hardmask: self.harden_mask() else: self.soften_mask() - try: - return self._custom["dask"] - except KeyError: - raise ValueError(f"{self.__class__.__name__} object has no data") + dx = self._custom["dask"] + # Note: The mask hardness functions have their own calls + # to 'cf_asanyarray', so we don't need to worry about + # setting another one. + elif _asanyarray and self.__asanyarray__: + # Add a new cf_asanyarray layer to the output graph + dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype) + + return dx def datum(self, *index): """Return an element of the data array as a standard Python @@ -10003,7 +10175,10 @@ def del_file_location(self, location): location = abspath(location).rstrip(sep) updated = False - dsk = self.todict() + + # The dask graph is never going to be computed, so we can set + # '_asanyarray=False'. + dsk = self.todict(_asanyarray=False) for key, a in dsk.items(): try: dsk[key] = a.del_file_location(location) @@ -10016,9 +10191,9 @@ def del_file_location(self, location): updated = True if updated: - dx = self.to_dask_array() + dx = self.to_dask_array(_asanyarray=False) dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta) - self._set_dask(dx, clear=_NONE) + self._set_dask(dx, clear=_NONE, asanyarray=None) return location @@ -11184,7 +11359,10 @@ def where( # Missing values could be affected, so make sure that the mask # hardness has been applied. - dx = d.to_dask_array(apply_mask_hardness=True) + # + # 'cf_where' has its own calls to 'cf_asanyarray', so we can + # set '_asanyarray=False'. + dx = d.to_dask_array(apply_mask_hardness=True, _asanyarray=False) units = d.Units @@ -11199,6 +11377,9 @@ def where( condition = type(self).asdata(condition) condition = where_broadcastable(d, condition, "condition") + # 'cf_where' has its own calls to 'cf_asanyarray', so we can + # set '_asanyarray=False'. + condition = condition.to_dask_array(_asanyarray=False) # If x or y is self then change it to None. 
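From the outside, the new keyword controls whether a final conversion layer is appended to the returned graph; when the chunks are already numpy arrays, as in this sketch, the computed values are identical either way::

    import numpy as np
    import cf

    d = cf.Data(np.arange(6.0))[::2]  # subspaced, so __asanyarray__ is True

    dx = d.to_dask_array()                       # includes the cf_asanyarray layer
    dx_raw = d.to_dask_array(_asanyarray=False)  # graph left untouched

    print(dx.compute())      # [0. 2. 4.]
    print(dx_raw.compute())  # also [0. 2. 4.], since these chunks are
                             # plain numpy arrays anyway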
This prevents an # unnecessary copy; and, at compute time, an unncessary numpy @@ -11242,9 +11423,7 @@ def where( x, y = xy # Apply the where operation - dx = da.core.elemwise( - cf_where, dx, da.asanyarray(condition), x, y, d.hardmask - ) + dx = da.core.elemwise(cf_where, dx, condition, x, y, d.hardmask) d._set_dask(dx) # Don't know (yet) if 'x' and 'y' have a deterministic names @@ -11305,7 +11484,8 @@ def sin(self, inplace=False, i=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.sin(dx)) + dx = da.sin(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -11365,7 +11545,8 @@ def sinh(self, inplace=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.sinh(dx)) + dx = da.sinh(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -11423,7 +11604,8 @@ def cosh(self, inplace=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.cosh(dx)) + dx = da.cosh(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -11466,10 +11648,10 @@ def cull_graph(self): ('array-21ea057f160746a3d3f0943bba945460', 0): array([1, 2, 3])} """ - dx = self.to_dask_array() + dx = self.to_dask_array(_asanyarray=False) dsk, _ = cull(dx.dask, dx.__dask_keys__()) dx = da.Array(dsk, name=dx.name, chunks=dx.chunks, dtype=dx.dtype) - self._set_dask(dx, clear=_NONE) + self._set_dask(dx, clear=_NONE, asanyarray=None) @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @_inplace_enabled(default=False) @@ -11526,7 +11708,8 @@ def tanh(self, inplace=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.tanh(dx)) + dx = da.tanh(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) @@ -11673,6 +11856,14 @@ def squeeze(self, axes=None, inplace=False, i=False): # Remove the squeezed axes names d._axes = [axis for i, axis in enumerate(d._axes) if i not in iaxes] + # Update the HDF5 chunking strategy + chunksizes = d.nc_hdf5_chunksizes() + if isinstance(chunksizes, tuple): + chunksizes = [ + size for i, size in enumerate(chunksizes) if i not in iaxes + ] + d.nc_set_hdf5_chunksizes(chunksizes) + return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -11729,27 +11920,45 @@ def tan(self, inplace=False, i=False): d.Units = _units_radians dx = d.to_dask_array() - d._set_dask(da.tan(dx)) + dx = da.tan(dx) + d._set_dask(dx) d.override_units(_units_1, inplace=True) return d - def todict(self, optimize_graph=True): + def todict( + self, optimize_graph=True, apply_mask_hardness=False, _asanyarray=True + ): """Return a dictionary of the dask graph key/value pairs. .. versionadded:: 3.15.0 - .. seealso:: `to_dask_array`, `tolist` + .. seealso:: `to_dask_array` :Parameters: - `optimize_graph`: `bool` + optimize_graph: `bool` If True, the default, then prior to being converted to a dictionary, the graph is optimised to remove unused chunks. Note that optimising the graph can add a considerable performance overhead. + apply_mask_hardness: `bool`, optional + If True then force the mask hardness of the returned + array to be that given by the `hardmask` attribute. + + .. versionadded:: NEXTVERSION + + _asanyarray: `bool`, optional + If True (the default) and the `__asanyarray__` + attribute is also `True`, then a `cf_asanyarray` + operation is added to the dictionary representation of + the Dask graph. If False then this operation is not + added. See `to_dask_array` for details. + + .. 
versionadded:: NEXTVERSION + :Returns: `dict` @@ -11775,7 +11984,9 @@ def todict(self, optimize_graph=True): 0), (slice(0, 1, 1),))} """ - dx = self.to_dask_array() + dx = self.to_dask_array( + apply_mask_hardness=apply_mask_hardness, _asanyarray=_asanyarray + ) if optimize_graph: return collections_to_dsk((dx,), optimize_graph=True) @@ -11897,6 +12108,12 @@ def transpose(self, axes=None, inplace=False, i=False): d._set_dask(dx) + # Update the HDF5 chunking strategy + chunksizes = d.nc_hdf5_chunksizes() + if isinstance(chunksizes, tuple): + chunksizes = [chunksizes[i] for i in axes] + d.nc_set_hdf5_chunksizes(chunksizes) + return d @_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0") @@ -11933,7 +12150,8 @@ def trunc(self, inplace=False, i=False): """ d = _inplace_enabled_define_and_cleanup(self) dx = d.to_dask_array() - d._set_dask(da.trunc(dx)) + dx = da.trunc(dx) + d._set_dask(dx) return d @classmethod diff --git a/cf/data/fragment/__init__.py b/cf/data/fragment/__init__.py index 2ce2dafa60..b7315107d4 100644 --- a/cf/data/fragment/__init__.py +++ b/cf/data/fragment/__init__.py @@ -1,3 +1,5 @@ from .fullfragmentarray import FullFragmentArray +from .h5netcdffragmentarray import H5netcdfFragmentArray from .netcdffragmentarray import NetCDFFragmentArray +from .netcdf4fragmentarray import NetCDF4FragmentArray from .umfragmentarray import UMFragmentArray diff --git a/cf/data/fragment/fullfragmentarray.py b/cf/data/fragment/fullfragmentarray.py index eecb50ef16..e2855b3003 100644 --- a/cf/data/fragment/fullfragmentarray.py +++ b/cf/data/fragment/fullfragmentarray.py @@ -16,8 +16,7 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - units=False, - calendar=False, + attributes=None, source=None, copy=True, ): @@ -41,17 +40,9 @@ def __init__( fragment variable in that the latter may have fewer size 1 dimensions. - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set to `None` during the first - `__getitem__` call. + {{init attributes: `dict` or `None`, optional}} - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set to `None` during - the first `__getitem__` call. + .. versionadded:: NEXTVERSION {{aggregated_units: `str` or `None`, optional}} @@ -61,13 +52,20 @@ def __init__( {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + """ super().__init__( fill_value=fill_value, dtype=dtype, shape=shape, - units=units, - calendar=calendar, + attributes=attributes, source=source, copy=False, ) diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py new file mode 100644 index 0000000000..0b70976c7f --- /dev/null +++ b/cf/data/fragment/h5netcdffragmentarray.py @@ -0,0 +1,97 @@ +from ..array.h5netcdfarray import H5netcdfArray +from .mixin import FragmentArrayMixin + + +class H5netcdfFragmentArray(FragmentArrayMixin, H5netcdfArray): + """A netCDF fragment array accessed with `h5netcdf`. + + .. 
versionadded:: NEXTVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + aggregated_units=False, + aggregated_calendar=False, + attributes=None, + storage_options=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of `str`), optional + The names of the netCDF fragment files containing the + array. + + address: (sequence of `str`), optional + The name of the netCDF variable containing the + fragment array. Required unless *varid* is set. + + dtype: `numpy.dtype`, optional + The data type of the aggregated array. May be `None` + if the numpy data-type is not known (which can be the + case for netCDF string types, for example). This may + differ from the data type of the netCDF fragment + variable. + + shape: `tuple`, optional + The shape of the fragment within the aggregated + array. This may differ from the shape of the netCDF + fragment variable in that the latter may have fewer + size 1 dimensions. + + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the netCDF variable during + the first `__getitem__` call. + + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init storage_options: `dict` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + filename=filename, + address=address, + dtype=dtype, + shape=shape, + mask=True, + attributes=attributes, + storage_options=storage_options, + source=source, + copy=copy, + ) + + if source is not None: + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False + ) diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py index 544befd337..f02c779002 100644 --- a/cf/data/fragment/mixin/fragmentarraymixin.py +++ b/cf/data/fragment/mixin/fragmentarraymixin.py @@ -1,4 +1,4 @@ -from numbers import Integral +from math import prod import numpy as np @@ -12,69 +12,69 @@ class FragmentArrayMixin: """ - def __getitem__(self, indices): - """Returns a subspace of the fragment as a numpy array. + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. - x.__getitem__(indices) <==> x[indices] + .. versionadded:: NEXTVERSION - Indexing is similar to numpy indexing, with the following - differences: + .. seealso:: `__array__`, `index` - * A dimension's index can't be rank-reducing, i.e. it can't - be an integer, a scalar `numpy` array, nor a scalar `dask` - array. + :Parameters: - * When two or more dimension's indices are sequences of - integers then these indices work independently along each - dimension (similar to the way vector subscripts work in - Fortran). + {{index: `tuple` or `None`, optional}} - .. versionadded:: 3.15.0 + It is important that there is a distinct value for each + fragment dimension, which is guaranteed when the + default of the `index` attribute is being used. 
- """ - # TODOACTIVE: modify this for the case when - # super().__getitem__(tuple(indices)) returns a - # dictionary + :Returns: + + `numpy.ndarray` + The subspace. - indices = self._parse_indices(indices) + """ + if index is None: + index = self.index() try: - array = super().__getitem__(tuple(indices)) + array = super()._get_array(index) except ValueError: # A ValueError is expected to be raised when the fragment # variable has fewer than 'self.ndim' dimensions (we know - # this because because 'indices' has 'self.ndim' + # that this is the case because 'index' has 'self.ndim' # elements). - axis = self._size_1_axis(indices) + axis = self._size_1_axis(index) if axis is not None: # There is a unique size 1 index that must correspond # to the missing dimension => Remove it from the # indices, get the fragment array with the new # indices; and then insert the missing size one # dimension. - indices.pop(axis) - array = super().__getitem__(tuple(indices)) + index = list(index) + index.pop(axis) + array = super()._get_array(tuple(index)) array = np.expand_dims(array, axis) else: # There are multiple size 1 indices so we don't know # how many missing dimensions the fragment has, nor # their positions => Get the full fragment array and # then reshape it to the shape of the dask compute - # chunk. - array = super().__getitem__(Ellipsis) - if array.size != self.size: + # chunk; and then apply the index. + array = super()._get_array(Ellipsis) + if array.size > prod(self.original_shape): raise ValueError( f"Can't get CFA fragment data from ({self}) when " "the fragment has two or more missing size 1 " "dimensions, whilst also spanning two or more " - "dask compute chunks." + "Dask compute chunks." "\n\n" "Consider re-creating the data with exactly one " - "dask compute chunk per fragment (e.g. by setting " + "Dask compute chunk per fragment (e.g. by setting " "'chunks=None' as a keyword to cf.read)." ) - array = array.reshape(self.shape) + array = array.reshape(self.original_shape) + array = array[index] array = self._conform_to_aggregated_units(array) return array @@ -116,8 +116,8 @@ def _conform_to_aggregated_units(self, array): if isinstance(array, dict): # 'array' is a dictionary. raise ValueError( - "TODOACTIVE. This error is notification of an " - "unreplaced placeholder for dealing with active " + "TODOACTIVE. Placeholder notification that " + "we can't yet deal with active " "storage reductions on CFA fragments." ) else: @@ -128,87 +128,6 @@ def _conform_to_aggregated_units(self, array): return array - def _parse_indices(self, indices): - """Parse the indices that retrieve the fragment data. - - Ellipses are replaced with the approriate number of `slice` - instances, and rank-reducing indices (such as an integer or - scalar array) are disallowed. - - .. versionadded:: 3.15.0 - - :Parameters: - - indices: `tuple` or `Ellipsis` - The array indices to be parsed. - - :Returns: - - `list` - The parsed indices. 
- - **Examples** - - >>> a.shape - (12, 1, 73, 144) - >>> a._parse_indices([2, 4, 5], Ellipsis, slice(45, 67)) - [[2, 4, 5], slice(0, 1), slice(0, 73), slice(45, 67)] - >>> a._parse_indices([2, 4, 5], [0], slice(None), slice(45, 67)) - [[2, 4, 5], [0], slice(0, 73), slice(45, 67)] - - """ - shape = self.shape - if indices is Ellipsis: - return [slice(0, n) for n in shape] - - indices = list(indices) - - # Check indices - has_ellipsis = False - for i, (index, n) in enumerate(zip(indices, shape)): - if isinstance(index, slice): - if index == slice(None): - indices[i] = slice(0, n) - - continue - - if index is Ellipsis: - has_ellipsis = True - continue - - if isinstance(index, Integral) or not getattr(index, "ndim", True): - # TODOCFA: what about [] or np.array([])? - - # 'index' is an integer or a scalar numpy/dask array - raise ValueError( - f"Can't subspace {self.__class__.__name__} with a " - f"rank-reducing index: {index!r}" - ) - - if has_ellipsis: - # Replace Ellipsis with one or more slices - indices2 = [] - length = len(indices) - n = self.ndim - for index in indices: - if index is Ellipsis: - m = n - length + 1 - indices2.extend([slice(None)] * m) - n -= m - else: - indices2.append(index) - n -= 1 - - length -= 1 - - indices = indices2 - - for i, (index, n) in enumerate(zip(indices, shape)): - if index == slice(None): - indices[i] = slice(0, n) - - return indices - def _size_1_axis(self, indices): """Find the position of a unique size 1 index. @@ -216,6 +135,6 @@ def _size_1_axis(self, indices): .. seealso:: `_parse_indices`, `__getitem__` :Parameters: indices: sequence of index @@ -244,33 +165,11 @@ def _size_1_axis(self, indices): None """ - axis = None - - n_size_1 = 0 # Number of size 1 indices - for i, (index, n) in enumerate(zip(indices, self.shape)): - try: - x = index.indices(n) - if abs(x[1] - x[0]) == 1: - # Index is a size 1 slice - n_size_1 += 1 - axis = i - except AttributeError: - try: - if index.size == 1: - # Index is a size 1 numpy or dask array - n_size_1 += 1 - axis = i - except AttributeError: - if len(index) == 1: - # Index is a size 1 list - n_size_1 += 1 - axis = i - - if n_size_1 > 1: - # There are two or more size 1 indices - axis = None - - return axis + original_shape = self.original_shape + if original_shape.count(1): + return original_shape.index(1) + + return @property def aggregated_Units(self): diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py new file mode 100644 index 0000000000..91f87dc2e8 --- /dev/null +++ b/cf/data/fragment/netcdf4fragmentarray.py @@ -0,0 +1,108 @@ +from ..array.netcdf4array import NetCDF4Array +from .mixin import FragmentArrayMixin + + +class NetCDF4FragmentArray(FragmentArrayMixin, NetCDF4Array): + """A netCDF fragment array accessed with `netCDF4`. + + .. versionadded:: NEXTVERSION + + """ + + def __init__( + self, + filename=None, + address=None, + dtype=None, + shape=None, + aggregated_units=False, + aggregated_calendar=False, + attributes=None, + storage_options=None, + source=None, + copy=True, + ): + """**Initialisation** + + :Parameters: + + filename: (sequence of `str`), optional + The names of the netCDF fragment files containing the + array. + + address: (sequence of `str`), optional + The name of the netCDF variable containing the + fragment array. Required unless *varid* is set. + + dtype: `numpy.dtype`, optional + The data type of the aggregated array.
May be `None` + if the numpy data-type is not known (which can be the + case for netCDF string types, for example). This may + differ from the data type of the netCDF fragment + variable. + + shape: `tuple`, optional + The shape of the fragment within the aggregated + array. This may differ from the shape of the netCDF + fragment variable in that the latter may have fewer + size 1 dimensions. + + units: `str` or `None`, optional + The units of the fragment data. Set to `None` to + indicate that there are no units. If unset then the + units will be set during the first `__getitem__` call. + + calendar: `str` or `None`, optional + The calendar of the fragment data. Set to `None` to + indicate the CF default calendar, if applicable. If + unset then the calendar will be set during the first + `__getitem__` call. + + {{init attributes: `dict` or `None`, optional}} + + If *attributes* is `None`, the default, then the + attributes will be set from the netCDF variable during + the first `__getitem__` call. + + {{aggregated_units: `str` or `None`, optional}} + + {{aggregated_calendar: `str` or `None`, optional}} + + {{init storage_options: `dict` or `None`, optional}} + + {{init source: optional}} + + {{init copy: `bool`, optional}} + + """ + super().__init__( + filename=filename, + address=address, + dtype=dtype, + shape=shape, + mask=True, + attributes=attributes, + storage_options=storage_options, + source=source, + copy=copy, + ) + + if source is not None: + try: + aggregated_units = source._get_component( + "aggregated_units", False + ) + except AttributeError: + aggregated_units = False + + try: + aggregated_calendar = source._get_component( + "aggregated_calendar", False + ) + except AttributeError: + aggregated_calendar = False + + self._set_component("aggregated_units", aggregated_units, copy=False) + self._set_component( + "aggregated_calendar", aggregated_calendar, copy=False + ) diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py index 578412f124..ee34501e94 100644 --- a/cf/data/fragment/netcdffragmentarray.py +++ b/cf/data/fragment/netcdffragmentarray.py @@ -1,11 +1,25 @@ -from ..array.netcdfarray import NetCDFArray +import cfdm + +from ..array.abstract import Array +from ..array.mixin import FileArrayMixin, IndexMixin +from .h5netcdffragmentarray import H5netcdfFragmentArray from .mixin import FragmentArrayMixin +from .netcdf4fragmentarray import NetCDF4FragmentArray + +class NetCDFFragmentArray( + FragmentArrayMixin, + IndexMixin, + cfdm.data.mixin.NetCDFFileMixin, + FileArrayMixin, + cfdm.data.mixin.FileArrayMixin, + Array, +): + """A netCDF fragment array. -class NetCDFFragmentArray(FragmentArrayMixin, NetCDFArray): - """A CFA fragment array stored in a netCDF file. + Access will be with either `netCDF4` or `h5netcdf`. - .. versionadded:: 3.14.0 + .. versionadded:: 3.15.0 """ @@ -17,8 +31,8 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - units=False, - calendar=None, + attributes=None, + storage_options=None, source=None, copy=True, ): @@ -47,39 +61,66 @@ def __init__( fragment variable in that the latter may have fewer size 1 dimensions. - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. + {{init attributes: `dict` or `None`, optional}} - calendar: `str` or `None`, optional - The calendar of the fragment data. 
Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set during the first - `__getitem__` call. + If *attributes* is `None`, the default, then the + attributes will be set from the netCDF variable during + the first `__getitem__` call. + + .. versionadded:: NEXTVERSION {{aggregated_units: `str` or `None`, optional}} {{aggregated_calendar: `str` or `None`, optional}} + {{init storage_options: `dict` or `None`, optional}} + + .. versionadded:: NEXTVERSION + {{init source: optional}} {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + """ super().__init__( - filename=filename, - address=address, - dtype=dtype, - shape=shape, - mask=True, - units=units, - calendar=calendar, source=source, copy=copy, ) if source is not None: + try: + shape = source._get_component("shape", None) + except AttributeError: + shape = None + + try: + filename = source._get_component("filename", None) + except AttributeError: + filename = None + + try: + address = source._get_component("address", None) + except AttributeError: + address = None + + try: + dtype = source._get_component("dtype", None) + except AttributeError: + dtype = None + + try: + attributes = source._get_component("attributes", None) + except AttributeError: + attributes = None + try: aggregated_units = source._get_component( "aggregated_units", False @@ -94,7 +135,105 @@ def __init__( except AttributeError: aggregated_calendar = False + try: + storage_options = source._get_component( + "storage_options", None + ) + except AttributeError: + storage_options = None + + if filename is not None: + if isinstance(filename, str): + filename = (filename,) + else: + filename = tuple(filename) + + self._set_component("filename", filename, copy=False) + + if address is not None: + if isinstance(address, int): + address = (address,) + else: + address = tuple(address) + + self._set_component("address", address, copy=False) + + if storage_options is not None: + self._set_component("storage_options", storage_options, copy=False) + + self._set_component("shape", shape, copy=False) + self._set_component("dtype", dtype, copy=False) + self._set_component("attributes", attributes, copy=False) + self._set_component("mask", True, copy=False) + self._set_component("aggregated_units", aggregated_units, copy=False) self._set_component( "aggregated_calendar", aggregated_calendar, copy=False ) + + # By default, close the file after data array access + self._set_component("close", True, copy=False) + + def _get_array(self, index=None): + """Returns a subspace of the dataset variable. + + The method acts as a factory for either a + `NetCDF4FragmentArray` or a `H5netcdfFragmentArray` class, and + it is the result of calling `!_get_array` on the newly created + instance that is returned. + + `H5netcdfFragmentArray` will only be used if + `NetCDF4FragmentArray` returns a `FileNotFoundError` exception. + + .. versionadded:: NEXTVERSION + + .. seealso:: `__array__`, `index` + + :Parameters: + + {{index: `tuple` or `None`, optional}} + + It is important that there is a distinct value for each + fragment dimension, which is guaranteed when the + default of the `index` attribute is being used. + + :Returns: + + `numpy.ndarray` + The subspace. 
+ + """ + kwargs = { + "dtype": self.dtype, + "shape": self.shape, + "aggregated_units": self.get_aggregated_units(None), + "aggregated_calendar": self.get_aggregated_calendar(None), + "attributes": self.get_attributes(None), + "copy": False, + } + + # Loop round the files, returning as soon as we find one that + # is accessible. + filenames = self.get_filenames() + for filename, address in zip(filenames, self.get_addresses()): + kwargs["filename"] = filename + kwargs["address"] = address + kwargs["storage_options"] = self.get_storage_options( + create_endpoint_url=False + ) + + try: + return NetCDF4FragmentArray(**kwargs)._get_array(index) + except FileNotFoundError: + pass + except Exception: + return H5netcdfFragmentArray(**kwargs)._get_array(index) + + # Still here? + if not filenames: + raise FileNotFoundError("No fragment files") + + if len(filenames) == 1: + raise FileNotFoundError(f"No such fragment file: {filenames[0]}") + + raise FileNotFoundError(f"No such fragment files: {filenames}") diff --git a/cf/data/fragment/umfragmentarray.py b/cf/data/fragment/umfragmentarray.py index a30737f46d..9168225945 100644 --- a/cf/data/fragment/umfragmentarray.py +++ b/cf/data/fragment/umfragmentarray.py @@ -17,8 +17,8 @@ def __init__( shape=None, aggregated_units=False, aggregated_calendar=False, - units=False, - calendar=False, + attributes=None, + storage_options=None, source=None, copy=True, ): @@ -45,33 +45,41 @@ def __init__( fragment variable in that the latter may have fewer size 1 dimensions. - units: `str` or `None`, optional - The units of the fragment data. Set to `None` to - indicate that there are no units. If unset then the - units will be set during the first `__getitem__` call. + {{init attributes: `dict` or `None`, optional}} - calendar: `str` or `None`, optional - The calendar of the fragment data. Set to `None` to - indicate the CF default calendar, if applicable. If - unset then the calendar will be set during the first - `__getitem__` call. + During the first `__getitem__` call, any of the + ``_FillValue``, ``add_offset``, ``scale_factor``, + ``units``, and ``calendar`` attributes which haven't + already been set will be inferred from the lookup + header and cached for future use. + + .. versionadded:: NEXTVERSION {{aggregated_units: `str` or `None`, optional}} {{aggregated_calendar: `str` or `None`, optional}} + {{init storage_options: `dict` or `None`, optional}} + {{init source: optional}} {{init copy: `bool`, optional}} + units: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + + calendar: `str` or `None`, optional + Deprecated at version NEXTVERSION. Use the + *attributes* parameter instead. + """ super().__init__( filename=filename, address=address, dtype=dtype, shape=shape, - units=units, - calendar=calendar, + attributes=attributes, source=source, copy=False, ) diff --git a/cf/data/utils.py b/cf/data/utils.py index c70d0f1c98..c7982fc857 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -780,7 +780,7 @@ def collapse( The function that collapses the underlying `dask` array of *d*. Must have the minimum signature (parameters and default values) ``func(dx, axis=None, keepdims=False, - mtol=None, split_every=None)`` (optionally including + mtol=1, split_every=None)`` (optionally including ``weights=None`` or ``ddof=None``), where ``dx`` is a the dask array contained in *d*. 
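As a concrete illustration of the minimum signature documented above, here is a hedged, self-contained sketch of such a collapse function; it is not one of cf-python's own `Collapse` methods, and for brevity it accepts but ignores *mtol*::

    # Sketch only: a toy collapse function with the documented minimum
    # signature. It wraps dask.array.sum and ignores 'mtol'; a real
    # collapse function would also honour the missing-data threshold.
    import dask.array as da

    def toy_sum(dx, axis=None, keepdims=False, mtol=1, split_every=None):
        """Collapse 'dx' with an unweighted sum."""
        return da.sum(
            dx, axis=axis, keepdims=keepdims, split_every=split_every
        )

    dx = da.arange(24, chunks=6).reshape(4, 6)
    print(toy_sum(dx, axis=1).compute())  # [ 15  51  87 123]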
@@ -829,23 +829,29 @@ def collapse( mtol: number, optional The sample size threshold below which collapsed values are set to missing data. It is defined as a fraction (between - 0 and 1 inclusive) of the contributing input data values. + 0 and 1 inclusive) of the contributing input data + values. A missing value in the output array occurs + whenever more than ``100*mtol%`` of its contributing input + array elements are missing data. - The default of *mtol* is 1, meaning that a missing datum + The default of *mtol* is 1, meaning that a missing value in the output array occurs whenever all of its contributing input array elements are missing data. - For other values, a missing datum in the output array - occurs whenever more than ``100*mtol%`` of its - contributing input array elements are missing data. + Note that for non-zero values of *mtol*, different + collapsed elements may have different sample sizes, + depending on the distribution of missing data in the input + data. ddof: number, optional - The delta degrees of freedom. The number of degrees of - freedom used in the calculation is (N-*ddof*) where N - represents the number of non-missing elements. + The delta degrees of freedom, a non-negative number. The + number of degrees of freedom used in the calculation is + ``N-ddof`` where ``N`` is the number of non-missing + elements. A value of 1 applies Bessel's correction. If the + calculation is weighted then *ddof* can only be 0 or 1. - For collapse functions that do not have a ``ddof`` - parameter, *ddof* must be `None`. + For collapse functions for which delta degrees of freedom + is not applicable (such as `max`), *ddof* must be `None`. split_every: `int` or `dict`, optional Determines the depth of the recursive aggregation. See @@ -872,7 +878,11 @@ def collapse( if ddof is not None: kwargs["ddof"] = ddof - dx = d.to_dask_array() + # The applicable chunk function will have its own call to + # 'cf_asanyarray', so we can set '_asanyarray=False'. Also, setting + # _asanyarray=False will ensure that any active storage operations + # are not compromised. + dx = d.to_dask_array(_asanyarray=False) dx = func(dx, **kwargs) d._set_dask(dx) @@ -985,8 +995,9 @@ def parse_weights(d, weights, axis=None): w = [] shape = d.shape axes = d._axes + Data = type(d) for key, value in weights.items(): - value = type(d).asdata(value) + value = Data.asdata(value) # Make sure axes are in ascending order if key != tuple(sorted(key)): diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py index b54a9c82ed..a11e0129b9 100644 --- a/cf/docstring/docstring.py +++ b/cf/docstring/docstring.py @@ -282,14 +282,12 @@ The sample size threshold below which collapsed values are set to missing data. It is defined as a fraction (between 0 and 1 inclusive) of the contributing input - data values. - - The default of *mtol* is 1, meaning that a missing - datum in the output array occurs whenever all of its + data values. A missing value in the output array + occurs whenever more than ``100*mtol%`` of its contributing input array elements are missing data. - For other values, a missing datum in the output array - occurs whenever more than ``100*mtol%`` of its + The default of *mtol* is 1, meaning that a missing + value in the output array occurs whenever all of its contributing input array elements are missing data. Note that for non-zero values of *mtol*, different @@ -300,10 +298,10 @@ "{{ddof: number}}": """ddof: number The delta degrees of freedom, a non-negative number. 
The number of degrees of freedom used in the - calculation is (N-*ddof*) where N represents the - number of non-missing elements. A value of 1 applies - Bessel's correction. If the calculation is weighted - then *ddof* can only be 0 or 1.""", + calculation is ``N-ddof`` where ``N`` is the number of + non-missing elements. A value of 1 applies Bessel's + correction. If the calculation is weighted then *ddof* + can only be 0 or 1.""", # split_every "{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional Determines the depth of the recursive aggregation. If @@ -324,11 +322,24 @@ value. A default can also be set globally with the ``split_every`` key in `dask.config`. See `dask.array.reduction` for details.""", + # active_storage + "{{active_storage: `bool`, optional}}": """{{active_storage: `bool`, optional}} + If True then attempt to perform the collapse using + active storage reductions. However, if other necessary + conditions are not met (see `Collapse` for details) + then the reduction will be carried out locally, as + usual. When an active storage reduction on a chunk + fails at compute time, the reduction for that chunk is + carried out locally. + + If False, the default, then the reduction will be + carried out locally.""", # Collapse chunk_function - "{{chunk_function: callable, optional}}": """{{chunk_function: callable, optional}} + "{{chunk_function: callable or `None`, optional}}": """{{chunk_function: callable or `None`, optional}} Provides the ``chunk`` parameter to - `dask.array.reduction`. If unset then an approriate - default function will be used.""", + `dask.array.reduction`. If `None`, the default, then + an appropriate default function from + `cf.data.collapse.dask_collapse` will be used.""", # Collapse weights "{{Collapse weights: data_like or `None`, optional}}": """weights: data_like or `None`, optional Weights associated with values of the array. By @@ -621,6 +632,10 @@ "{{to_size: `int`, optional}}": """to_size: `int`, optional Pad the axis after so that the new axis has the given size.""", + # _get_array index + "{{index: `tuple` or `None`, optional}}": """index: `tuple` or `None`, optional + Provide the indices that define the subspace. If `None` + then the `index` attribute is used.""", # subspace config options "{{config: optional}}": """config: optional Configure the subspace by specifying the mode of diff --git a/cf/domain.py b/cf/domain.py index 1fbdb73e1c..8889fdf97a 100644 --- a/cf/domain.py +++ b/cf/domain.py @@ -707,7 +707,7 @@ def identities(self): equals e.g. ``'foo=bar'``. * The netCDF variable name, preceded by ``'ncvar%'``. - .. versionadded:: (cfdm) 1.9.0.0 + .. versionadded:: 3.8.0 .. seealso:: `identity` diff --git a/cf/field.py b/cf/field.py index f6c90d215e..d73e2b7b49 100644 --- a/cf/field.py +++ b/cf/field.py @@ -547,8 +547,6 @@ def __getitem__(self, indices): autocyclic={"no-op": True}, ) - new.set_data(new_data, axes=data_axes, copy=False) - return new def __setitem__(self, indices, value): @@ -5397,18 +5395,20 @@ def collapse( **Collapse weights** - The calculations of means, standard deviations and variances are, - by default, **not weighted**. For weights to be incorporated in - the collapse, the axes to be weighted must be identified with the - *weights* keyword. + .. warning:: By default, the collapse calculations are **not** + weighted. + + For weights to be incorporated in the collapse, + the *weights* keyword must be set. 
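As a brief, hedged illustration of the warning above (using `cf.example_field` to obtain a small sample field, and ``'area: mean'`` purely as an example collapse), the default call is unweighted and weights must be requested explicitly::

    import cf

    f = cf.example_field(0)

    # Unweighted by default
    unweighted = f.collapse("area: mean")

    # Metadata-derived weights, via f.weights(), are used when
    # weights=True is passed
    weighted = f.collapse("area: mean", weights=True)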
Weights are either derived from the field construct's metadata - (such as cell sizes), or may be provided explicitly in the form of - other field constructs containing data of weights values. In - either case, the weights actually used are those derived by the - `weights` method of the field construct with the same weights - keyword value. Collapsed axes that are not identified by the - *weights* keyword are unweighted during the collapse operation. + (such as cell sizes), or may be provided explicitly in the + form of other field constructs containing data of weights + values. In either case, the weights actually used are those + derived by the `weights` method of the field construct with + the same *weights* keyword value. Collapsed axes that are not + identified by the *weights* keyword are unweighted during the + collapse operation. *Example:* Create a weighted time average: @@ -5598,6 +5598,48 @@ def collapse( ... within_years=cf.seasons(), weights=True) + **Active storage collapses** + + When the data being collapsed are stored remotely, the + collapse calculations may be carried out on a server (ideally + one that is close in a network distance sense) to the data, + thereby removing the time and energy costs of transferring the + entire un-collapsed data to the local client. Whether or not + this will occur is determined on a case-by-case basis, and + will only be done if all of the following criteria are met: + + * the collapse method is one of ``'mean'``, ``'maximum'``, + ``'minimum'``, or ``'sum'``; + + * the collapse is over all axes; + + * the collapse is unweighted; + + * ``cf.active_storage()`` is `True`; + + * a URL of the active storage server has been set with + `cf.active_storage_url`; + + * the data values are in netCDF-4 files on disk (rather than + in any other file format, or in memory) and are not + numerically packed; + + * the `!active_storage` attribute of the field's `Data` object + is `True`, indicating that active storage operations may be + possible. In general, it will only be `True` for data that + are in files on disk, are not compressed by convention and + have not had any other operations applied, apart from + subspacing; + + * it is possible to import the external `activestorage.Active` + class. + + The performance improvements from using active storage + operations will increase the closer the active storage server + is to the data storage. If the active storage server is + sufficiently far away from the data then it may be faster to + do a normal, non-active operation. + .. versionadded:: 1.0 .. seealso:: `bin`, `cell_area`, `convolution_filter`, @@ -7058,10 +7100,13 @@ def collapse( "collapse" ) + # Note: It is important that size 1 axes are also passed + # on to the Data collapse, because active storage + # collapses get confused if they're not there. 
data_axes = f.get_data_axes() iaxes = [ data_axes.index(axis) - for axis in collapse_axes + for axis in collapse_axes_all_sizes if axis in data_axes ] @@ -7367,7 +7412,7 @@ def _collapse_grouped( group=None, group_span=None, group_contiguous=False, - mtol=None, + mtol=1, ddof=None, regroup=None, coordinate=None, diff --git a/cf/functions.py b/cf/functions.py index d186774d02..22820bc3db 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -2,11 +2,11 @@ import csv import ctypes.util import importlib +import logging import os import platform import re import sys -import urllib.parse import warnings from collections.abc import Iterable from itertools import product @@ -19,7 +19,7 @@ from os.path import expandvars as _os_path_expandvars from os.path import join as _os_path_join from os.path import relpath as _os_path_relpath -from urllib.parse import urlparse +from urllib.parse import urljoin, urlparse import cfdm import netCDF4 @@ -172,6 +172,9 @@ def configuration( regrid_logging=None, relaxed_identities=None, bounds_combination_mode=None, + active_storage=None, + active_storage_url=None, + active_storage_max_requests=None, of_fraction=None, collapse_parallel_mode=None, free_memory_factor=None, @@ -190,6 +193,9 @@ def configuration( * `regrid_logging` * `relaxed_identities` * `bounds_combination_mode` + * `active_storage` + * `active_storage_url` + * `active_storage_max_requests` These are all constants that apply throughout cf, except for in specific functions only if overridden by the corresponding keyword @@ -208,7 +214,9 @@ def configuration( .. seealso:: `atol`, `rtol`, `tempdir`, `chunksize`, `total_memory`, `log_level`, `regrid_logging`, - `relaxed_identities`, `bounds_combination_mode` + `relaxed_identities`, `bounds_combination_mode`, + `active_storage`, `active_storage_url`, + `active_storage_max_requests` :Parameters: @@ -260,6 +268,24 @@ def configuration( construct identity. The default is to not change the current value. + active_storage: `bool` or `Constant`, optional + The new value (either True to enable active storage + reductions or False to disable them). The default is to + not change the current behaviour. + + .. versionadded:: NEXTVERSION + + active_storage_url: `str` or `None` or `Constant`, optional + The new value (either a new URL string or `None` to remove + the URL). The default is to not change the value. + + .. versionadded:: NEXTVERSION + + active_storage_max_requests: `int` or `Constant`, optional + The new value. The default is to not change the value. + + .. versionadded:: NEXTVERSION + of_fraction: `float` or `Constant`, optional Deprecated at version 3.14.0 and is no longer available. @@ -289,7 +315,10 @@ def configuration( 'relaxed_identities': False, 'log_level': 'WARNING', 'bounds_combination_mode': 'AND', - 'chunksize': 82873466.88000001} + 'chunksize': 82873466.88000001, + 'active_storage': False, + 'active_storage_url': None, + 'active_storage_max_requests': 100} >>> cf.chunksize(7.5e7) # any change to one constant... 
82873466.88000001 >>> cf.configuration()['chunksize'] # ...is reflected in the configuration @@ -303,7 +332,10 @@ def configuration( 'relaxed_identities': False, 'log_level': 'WARNING', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': None, + 'active_storage_max_requests': 100} >>> cf.configuration() # the items set have been updated accordingly {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -312,7 +344,10 @@ def configuration( 'relaxed_identities': False, 'log_level': 'INFO', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': None, + 'active_storage_max_requests': 100} Use as a context manager: @@ -324,7 +359,9 @@ def configuration( 'relaxed_identities': False, 'log_level': 'INFO', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': None} >>> with cf.configuration(atol=9, rtol=10): ... print(cf.configuration()) ... @@ -335,7 +372,10 @@ def configuration( 'relaxed_identities': False, 'log_level': 'INFO', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': None, + 'active_storage_max_requests': 100} >>> print(cf.configuration()) {'rtol': 2.220446049250313e-16, 'atol': 2.220446049250313e-16, @@ -344,7 +384,10 @@ def configuration( 'relaxed_identities': False, 'log_level': 'INFO', 'bounds_combination_mode': 'AND', - 'chunksize': 75000000.0} + 'chunksize': 75000000.0, + 'active_storage': False, + 'active_storage_url': None, + 'active_storage_max_requests': 100} """ if of_fraction is not None: @@ -375,6 +418,9 @@ def configuration( new_regrid_logging=regrid_logging, new_relaxed_identities=relaxed_identities, bounds_combination_mode=bounds_combination_mode, + active_storage=active_storage, + active_storage_url=active_storage_url, + active_storage_max_requests=active_storage_max_requests, ) @@ -424,6 +470,9 @@ def _configuration(_Configuration, **kwargs): "new_regrid_logging": regrid_logging, "new_relaxed_identities": relaxed_identities, "bounds_combination_mode": bounds_combination_mode, + "active_storage": active_storage, + "active_storage_url": active_storage_url, + "active_storage_max_requests": active_storage_max_requests, } old_values = {} @@ -553,11 +602,17 @@ class regrid_logging(ConstantAccess): **Examples** - >>> cf.regrid_logging() + >>> print(cf.regrid_logging()) + False + >>> print(cf.regrid_logging(True)) False - >>> cf.regrid_logging(True) + >>> print(cf.regrid_logging()) + True + >>> with cf.regrid_logging(False): + ... print(cf.regrid_logging()) + ... False - >>> cf.regrid_logging() + >>> print(cf.regrid_logging()) True """ @@ -682,13 +737,19 @@ class relaxed_identities(ConstantAccess): >>> org = cf.relaxed_identities() >>> org False - >>> cf.relaxed_identities(True) + >>> print(cf.relaxed_identities(True)) False - >>> cf.relaxed_identities() + >>> print(cf.relaxed_identities()) + True + >>> print(cf.relaxed_identities(org)) True - >>> cf.relaxed_identities(org) + >>> print(cf.relaxed_identities()) + False + >>> with cf.relaxed_identities(True): + ... print(cf.relaxed_identities()) + ... 
True - >>> cf.relaxed_identities() + >>> print(cf.relaxed_identities()) False """ @@ -805,18 +866,24 @@ class tempdir(ConstantAccess): :Returns: - `str` - The directory prior to the change, or the current - directory if no new value was specified. + `Constant` + The directory name prior to the change, or the name of the + current directory if no new value was specified. **Examples** - >>> cf.tempdir() + >>> print(cf.tempdir()) '/tmp' >>> old = cf.tempdir('/home/me/tmp') - >>> cf.tempdir(old) + >>> print(cf.tempdir(old)) '/home/me/tmp' - >>> cf.tempdir() + >>> print(cf.tempdir()) + '/tmp' + >>> with cf.tempdir('~/NEW_TMP'): + ... print(cf.tempdir()) + ... + /home/me/NEW_TMP + >>> print(cf.tempdir()) '/tmp' """ @@ -1082,11 +1149,6 @@ class bounds_combination_mode(ConstantAccess): OR >>> print(cf.bounds_combination_mode(old)) OR - >>> print(cf.bounds_combination_mode()) - AND - - Use as a context manager: - >>> print(cf.bounds_combination_mode()) AND >>> with cf.bounds_combination_mode('XOR'): @@ -1135,6 +1197,218 @@ def _parse(cls, arg): return arg +class active_storage(ConstantAccess): + """Whether or not to attempt active storage reductions. + + .. versionadded:: NEXTVERSION + + .. seealso:: `active_storage_max_requests`, `active_storage_url`, + `configuration` + + :Parameters: + + arg: `bool` or `Constant`, optional + Provide a value that will apply to all subsequent + operations. + + :Returns: + + `Constant` + The value prior to the change, or the current value if no + new value was specified. + + **Examples** + + >>> cf.active_storage() + False + >>> with cf.active_storage(True): + ... print(cf.active_storage()) + ... + True + >>> cf.active_storage() + False + >>> cf.active_storage(True) + False + >>> cf.active_storage() + True + + """ + + _name = "active_storage" + + def _parse(cls, arg): + """Parse a new constant value. + + .. versionaddedd:: NEXTVERSION + + :Parameters: + + cls: + This class. + + arg: + The given new constant value. + + :Returns: + + A version of the new constant value suitable for + insertion into the `CONSTANTS` dictionary. + + """ + try: + from activestorage import Active # noqa: F401 + except ModuleNotFoundError as error: + if arg: + raise ModuleNotFoundError( + f"Can't enable active storage operations: {error}" + ) + + return bool(arg) + + +class active_storage_url(ConstantAccess): + """The URL location of the active storage reducer. + + .. versionadded:: NEXTVERSION + + .. seealso:: `active_storage`, `active_storage_max_requests`, + `configuration` + + :Parameters: + + arg: `str` or `None` or `Constant`, optional + Provide a value that will apply to all subsequent + operations. + + :Returns: + + `Constant` + The value prior to the change, or the current value if no + new value was specified. + + **Examples** + + >>> print(cf.active_storage_url()) + None + >>> with cf.active_storage_url('http://active/storage/location'): + ... print(cf.active_storage_url()) + ... + 'http://active/storage/location' + >>> print(cf.active_storage_url()) + None + >>> print(cf.active_storage_url('http://other/location')) + None + >>> cf.active_storage_url() + 'http://other/location' + + """ + + _name = "active_storage_url" + + def _parse(cls, arg): + """Parse a new constant value. + + .. versionaddedd:: NEXTVERSION + + :Parameters: + + cls: + This class. + + arg: + The given new constant value. + + :Returns: + + A version of the new constant value suitable for + insertion into the `CONSTANTS` dictionary. 
+ + """ + if arg is None: + return arg + + return str(arg) + + +class active_storage_max_requests(ConstantAccess): + """Concurrent active storage server requests per `dask` chunk. + + This is the maximum number of concurrent requests per `dask` chunk + that are sent to the active storage server by an `Active` + instance. The default is ``100``. The optimum number may be + estimated by :math:`N = N_{max} / min(N_{PE}, N_{chunk})`, where + + * :math:`N_{max}` is the maximum number of requests that the + active storage server can process concurrently before its + performance is degraded; + + * :math:`N_{PE}` the number of available PEs on the local client; + + * :math:`N_{chunk}` the number of `dask` chunks being reduced with + active storage. + + This formula only applies to cases where all `dask` chunks for the + collapse operation are utilising active storage. If some are not + then :math:`N` will likely be underestimated. + + .. versionadded:: NEXTVERSION + + .. seealso:: `active_storage`, `active_storage_url`, + `configuration` + + :Parameters: + + arg: `int` or `Constant`, optional + Provide a value that will apply to all subsequent + operations. + + :Returns: + + `Constant` + The value prior to the change, or the current value if no + new value was specified. + + **Examples** + + >>> print(cf.active_storage_max_requests()) + 100 + >>> with cf.active_storage_max_requests(25): + ... print(cf.active_storage_max_requests()) + ... + 25 + >>> print(cf.active_storage_max_requests()) + 100 + >>> print(cf.active_storage_max_requests(12)) + 100 + >>> cf.active_storage_max_requests() + 12 + + """ + + _name = "active_storage_max_requests" + + def _parse(cls, arg): + """Parse a new constant value. + + .. versionadded:: NEXTVERSION + + :Parameters: + + cls: + This class. + + arg: + The given new constant value. + + :Returns: + + A version of the new constant value suitable for + insertion into the `CONSTANTS` dictionary. + + """ + return int(arg) + + def CF(): """The version of the CF conventions. @@ -1275,6 +1549,27 @@ def total_memory(): return CONSTANTS["TOTAL_MEMORY"] +def is_log_level_info(logger): + """Return True if and only if log level is at least as verbose as INFO. + + .. versionadded:: NEXTVERSION + + .. seealso:: `log_level` + + :Parameters: + + logger: `logging.Logger` + The logger in use. + + :Returns: + + `bool` + Whether or not the log level is at least INFO. + + """ + return logger.parent.level <= logging.INFO + + # -------------------------------------------------------------------- # Aliases (for back-compatibility etc.): # -------------------------------------------------------------------- @@ -2143,61 +2438,6 @@ def normalize_slice(index, size, cyclic=False): return slice(start, stop, step) -def get_subspace(array, indices): - """Return a subspace defined by the given indices of an array. - - Subset the input numpy array with the given indices. Indexing is - similar to that of a numpy array. The differences to numpy array - indexing are: - - 1. An integer index i takes the i-th element but does not reduce - the rank of the output array by one. - - 2. When more than one dimension's slice is a 1-d boolean array or - 1-d sequence of integers then these indices work independently - along each dimension (similar to the way vector subscripts work - in Fortran). - - Indices must contain an index for each dimension of the input array.
- - :Parameters: - - array: `numpy.ndarray` - - indices: `list` - - """ - gg = [i for i, x in enumerate(indices) if not isinstance(x, slice)] - len_gg = len(gg) - - if len_gg < 2: - # ------------------------------------------------------------ - # At most one axis has a list-of-integers index so we can do a - # normal numpy subspace - # ------------------------------------------------------------ - return array[tuple(indices)] - - else: - # ------------------------------------------------------------ - # At least two axes have list-of-integers indices so we can't - # do a normal numpy subspace - # ------------------------------------------------------------ - if np.ma.isMA(array): - take = np.ma.take - else: - take = np.take - - indices = indices[:] - for axis in gg: - array = take(array, indices[axis], axis=axis) - indices[axis] = slice(None) - - if len_gg < len(indices): - array = array[tuple(indices)] - - return array - - _equals = cfdm.Data()._equals @@ -2646,7 +2886,7 @@ def relpath(filename, start=None): 'http://data/archive/file.nc' """ - u = urllib.parse.urlparse(filename) + u = urlparse(filename) if u.scheme != "": return filename @@ -2684,7 +2924,7 @@ def dirname(filename): 'http://data/archive' """ - u = urllib.parse.urlparse(filename) + u = urlparse(filename) if u.scheme != "": return filename.rpartition("/")[0] @@ -2723,9 +2963,9 @@ def pathjoin(path1, path2): 'http://data/archive/file.nc' """ - u = urllib.parse.urlparse(path1) + u = urlparse(path1) if u.scheme != "": - return urllib.parse.urljoin(path1, path2) + return urljoin(path1, path2) return _os_path_join(path1, path2) @@ -3118,44 +3358,50 @@ def environment(display=True, paths=True): **Examples** >>> cf.environment() - Platform: Linux-4.15.0-54-generic-x86_64-with-glibc2.10 - HDF5 library: 1.10.6 - netcdf library: 4.8.0 - udunits2 library: /home/username/anaconda3/envs/cf-env/lib/libudunits2.so.0 - esmpy/ESMF: 8.4.1 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/esmpy/__init__.py - Python: 3.8.10 /home/username/anaconda3/envs/cf-env/bin/python - dask: 2022.6.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/dask/__init__.py - netCDF4: 1.5.6 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/netCDF4/__init__.py - psutil: 5.9.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/psutil/__init__.py - packaging: 21.3 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/packaging/__init__.py - numpy: 1.22.2 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/numpy/__init__.py - scipy: 1.10.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/scipy/__init__.py - matplotlib: 3.4.3 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/matplotlib/__init__.py - cftime: 1.6.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cftime/__init__.py - cfunits: 3.3.6 /home/username/cfunits/cfunits/__init__.py - cfplot: 3.1.18 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cfplot/__init__.py - cfdm: 1.10.1.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cfdm/__init__.py - cf: 3.14.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cf/__init__.py + Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.35 + HDF5 library: 1.12.2 + netcdf library: 4.9.3-development + udunits2 library: /home/user/lib/libudunits2.so.0 + esmpy/ESMF: 8.6.1 /home/user/lib/python3.12/site-packages/esmpy/__init__.py + Python: 3.12.2 /home/user/bin/python + dask: 2024.6.0 
/home/user/lib/python3.12/site-packages/dask/__init__.py + netCDF4: 1.6.5 /home/user/lib/python3.12/site-packages/netCDF4/__init__.py + h5netcdf: 1.3.0 /home/user/lib/python3.12/site-packages/h5netcdf/__init__.py + h5py: 3.11.0 /home/user/lib/python3.12/site-packages/h5py/__init__.py + s3fs: 2024.6.0 /home/user/lib/python3.12/site-packages/s3fs/__init__.py + psutil: 5.9.8 /home/user/lib/python3.12/site-packages/psutil/__init__.py + packaging: 23.2 /home/user/lib/python3.12/site-packages/packaging/__init__.py + numpy: 1.26.4 /home/user/lib/python3.12/site-packages/numpy/__init__.py + scipy: 1.13.0 /home/user/lib/python3.12/site-packages/scipy/__init__.py + matplotlib: 3.8.4 /home/user/lib/python3.12/site-packages/matplotlib/__init__.py + cftime: 1.6.3 /home/user/lib/python3.12/site-packages/cftime/__init__.py + cfunits: 3.3.7 /home/user/lib/python3.12/site-packages/cfunits/__init__.py + cfplot: 3.3.0 /home/user/lib/python3.12/site-packages/cfplot/__init__.py + cfdm: 1.11.2.0 /home/user/cfdm/cfdm/__init__.py + cf: NEXTVERSION /home/user/cf-python/cf/__init__.py >>> cf.environment(paths=False) - Platform: Linux-4.15.0-54-generic-x86_64-with-glibc2.10 - HDF5 library: 1.10.6 - netcdf library: 4.8.0 - udunits2 library: libudunits2.so.0 - esmpy/ESMF: 8.4.1 - Python: 3.8.10 - dask: 2022.6.0 - netCDF4: 1.5.6 - psutil: 5.9.0 - packaging: 21.3 - numpy: 1.22.2 - scipy: 1.10.0 - matplotlib: 3.4.3 - cftime: 1.6.0 - cfunits: 3.3.6 - cfplot: 3.1.18 - cfdm: 1.10.1.0 - cf: 3.14.0 + Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.35 + HDF5 library: 1.12.2 + netcdf library: 4.9.3-development + udunits2 library: /home/user/lib/libudunits2.so.0 + esmpy/ESMF: 8.6.1 + Python: 3.12.2 + dask: 2024.6.0 + netCDF4: 1.6.5 + h5netcdf: 1.3.0 + h5py: 3.11.0 + s3fs: 2024.6.0 + psutil: 5.9.8 + packaging: 23.2 + numpy: 1.26.4 + scipy: 1.13.0 + matplotlib: 3.8.4 + cftime: 1.6.3 + cfunits: 3.3.7 + cfplot: 3.3.0 + cfdm: 1.11.2.0 + cf: NEXTVERSION """ dependency_version_paths_mapping = { @@ -3174,6 +3420,9 @@ def environment(display=True, paths=True): "dask": _get_module_info("dask"), # Then Python libraries not related to CF "netCDF4": _get_module_info("netCDF4"), + "h5netcdf": _get_module_info("h5netcdf"), + "h5py": _get_module_info("h5py"), + "s3fs": _get_module_info("s3fs"), "psutil": _get_module_info("psutil"), "packaging": _get_module_info("packaging"), "numpy": _get_module_info("numpy"), diff --git a/cf/mixin/fielddomain.py b/cf/mixin/fielddomain.py index c8e4fb1bb2..1e9e1f67e1 100644 --- a/cf/mixin/fielddomain.py +++ b/cf/mixin/fielddomain.py @@ -2317,7 +2317,7 @@ def iscyclic(self, *identity, **filter_kwargs): {{filter_kwargs: optional}} - .. versionadded:: (cfdm) 3.9.0 + .. versionadded:: 3.9.0 :Returns: diff --git a/cf/query.py b/cf/query.py index 87ff0c1c5b..268ca7b6d0 100644 --- a/cf/query.py +++ b/cf/query.py @@ -166,10 +166,11 @@ class Query: evaluated. ====================== ============================================== - In general, each method must have the query value as it's only - parameter. The only exception is for `__query_isclose__`, which + In general, each method must have the query value as its only + parameter. The only exceptions are for `__query_isclose__`, which also requires the absolute and relative numerical tolerances to be - provided. + provided, and for `__query_wi__`, which also requires upper and + lower interval openness Booleans to be provided. 
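For illustration, here is a minimal sketch (not part of cf-python) of an object implementing the extended `__query_wi__` interface described above, taking the query value plus the two interval-openness Booleans that `Query._evaluate` now passes::

    # Sketch only: 'Window' is a hypothetical class. A 'wi' condition
    # evaluated against an instance would dispatch to this method with
    # the (lower, upper) query value and the two openness flags.
    class Window:
        def __init__(self, x):
            self.x = x

        def __query_wi__(self, value, open_lower, open_upper):
            lower, upper = value
            above = self.x > lower if open_lower else self.x >= lower
            below = self.x < upper if open_upper else self.x <= upper
            return above and below

    w = Window(4)
    print(w.__query_wi__((2, 4), False, False))  # True: 2 <= 4 <= 4
    print(w.__query_wi__((2, 4), False, True))   # False: upper bound open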
When the condition is on an attribute, or nested attributes, of the operand, the query interface method is looked for on the @@ -972,7 +973,7 @@ def _evaluate(self, x, parent_attr): if operator == "wi": _wi = getattr(x, "__query_wi__", None) if _wi is not None: - return _wi(value) + return _wi(value, self.open_lower, self.open_upper) if self.open_lower: lower_bound = x > value[0] diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py index f8437349c6..883cc7b5a2 100644 --- a/cf/read_write/netcdf/netcdfread.py +++ b/cf/read_write/netcdf/netcdfread.py @@ -208,8 +208,10 @@ def _create_data( # one dask chunk if data.npartitions == 1: data._cfa_set_write(True) + if ( not compression_index + and self.read_vars.get("cache") and self.implementation.get_construct_type(construct) != "field" ): @@ -251,17 +253,18 @@ def _create_data( coord_ncvar=coord_ncvar, ) + attributes = kwargs["attributes"] data = self._create_Data( cfa_array, ncvar, - units=kwargs["units"], - calendar=kwargs["calendar"], + units=attributes.get("units"), + calendar=attributes.get("calendar"), ) # Note: We don't cache elements from CFA variables, because # the data are in fragment files which have not been - # opened; and may not not even be openable, such as - # could be the case if a fragement was on tape storage. + # opened and may not not even be openable (such as could + # be the case if a fragment file was on tape storage). # Set the CFA write status to True iff each non-aggregated # axis has exactly one dask storage chunk @@ -503,6 +506,7 @@ def _cache_data_elements(self, data, ncvar): `None` """ + if data.data.get_compression_type(): # Don't get cached elements from arrays compressed by # convention, as they'll likely be wrong. @@ -649,13 +653,13 @@ def _create_cfanetcdfarray( :Returns: (`CFANetCDFArray`, `dict`) - The new `NetCDFArray` instance and dictionary of the - kwargs used to create it. + The new `CFANetCDFArray` instance and dictionary of + the kwargs used to create it. """ g = self.read_vars - # Get the kwargs needed to instantiate a general NetCDFArray + # Get the kwargs needed to instantiate a general netCDF array # instance kwargs = self._create_netcdfarray( ncvar, @@ -664,8 +668,8 @@ def _create_cfanetcdfarray( return_kwargs_only=True, ) - # Get rid of the incorrect shape of (). This will end up - # getting set correctly by the CFANetCDFArray instance. + # Get rid of the incorrect shape. This will end up getting set + # correctly by the CFANetCDFArray instance. kwargs.pop("shape", None) aggregated_data = g["cfa_aggregated_data"][ncvar] @@ -692,7 +696,11 @@ def _create_cfanetcdfarray( kwargs["instructions"] = " ".join(sorted(instructions)) # Use the kwargs to create a CFANetCDFArray instance - array = self.implementation.initialise_CFANetCDFArray(**kwargs) + if g["original_netCDF4"]: + array = self.implementation.initialise_CFANetCDF4Array(**kwargs) + else: + # h5netcdf + array = self.implementation.initialise_CFAH5netcdfArray(**kwargs) return array, kwargs @@ -727,19 +735,23 @@ def _create_cfanetcdfarray_term( :Returns: (`CFANetCDFArray`, `dict`) - The new `NetCDFArray` instance and dictionary of the - kwargs used to create it. + The new `CFANetCDFArray` instance and dictionary of + the kwargs used to create it. """ g = self.read_vars - # Get the kwargs needed to instantiate a general NetCDFArray + # Get the kwargs needed to instantiate a general netCDF array # instance kwargs = self._create_netcdfarray( ncvar, return_kwargs_only=True, ) + # Get rid of the incorrect shape. 
This will end up getting set + # correctly by the CFANetCDFArray instance. + kwargs.pop("shape", None) + instructions = [] aggregation_instructions = {} for t, term_ncvar in g["cfa_aggregated_data"][parent_ncvar].items(): @@ -754,8 +766,11 @@ def _create_cfanetcdfarray_term( kwargs["x"] = aggregation_instructions kwargs["instructions"] = " ".join(sorted(instructions)) - # Use the kwargs to create a CFANetCDFArray instance - array = self.implementation.initialise_CFANetCDFArray(**kwargs) + if g["original_netCDF4"]: + array = self.implementation.initialise_CFANetCDF4Array(**kwargs) + else: + # h5netcdf + array = self.implementation.initialise_CFAH5netcdfArray(**kwargs) return array, kwargs @@ -940,6 +955,7 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): aggregation_instructions = g["cfa_aggregation_instructions"] variable_attributes = g["variable_attributes"] + # Loop round aggregation instruction terms out = {} for x in self._parse_x( ncvar, @@ -954,10 +970,16 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): # Already processed this term continue - array = g["variables"][term_ncvar][...] - aggregation_instructions[term_ncvar] = self._cfa_conform_array( - array + variable = g["variables"][term_ncvar] + array = cfdm.netcdf_indexer( + variable, + mask=True, + unpack=True, + always_masked_array=False, + orthogonal_indexing=False, + copy=False, ) + aggregation_instructions[term_ncvar] = array[...] if term == "file": # Find URI substitutions that may be stored in the @@ -981,43 +1003,3 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data): g["cfa_aggregated_data"][ncvar] = out return out - - def _cfa_conform_array(self, array): - """Conform an array so that it is suitable for CFA processing. - - .. versionadded: 3.15.0 - - :Parameters: - - array: `np.ndarray` - The array to conform. - - :Returns: - - array: `np.ndarray` - The conformed array. - - """ - if isinstance(array, str): - # string - return np.array(array, dtype=f"S{len(array)}").astype("U") - - kind = array.dtype.kind - if kind == "O": - # string - return array.astype("U") - - if kind in "SU": - # char - if kind == "U": - array = array.astype("S") - - array = netCDF4.chartostring(array) - shape = array.shape - array = np.array([x.rstrip() for x in array.flat], dtype="S") - array = np.reshape(array, shape) - array = np.ma.masked_where(array == b"", array) - return array.astype("U") - - # number - return array diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py index 8890fe4da1..e2c8da0236 100644 --- a/cf/read_write/netcdf/netcdfwrite.py +++ b/cf/read_write/netcdf/netcdfwrite.py @@ -4,6 +4,7 @@ import dask.array as da import numpy as np +from ...data.dask_utils import cf_asanyarray from .netcdfread import NetCDFRead @@ -103,8 +104,10 @@ def _write_as_cfa(self, cfvar, construct_type, domain_axes): raise ValueError( f"Can't write {cfvar!r} as a CFA-netCDF " - "aggregation variable. Consider setting " - "cfa={'strict': False}" + "aggregation variable. Possible reasons for this " + "include 1) there is more than one Dask chunk " + "per fragment, and 2) data values have been " + "changed relative to those in the fragments." 
) return cfa_get_write @@ -464,7 +467,7 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar): ): f_ncdim = f"f_{ncdim}" if f_ncdim not in g["dimensions"]: - # Create a new fragement dimension + # Create a new fragment dimension self._write_dimension(f_ncdim, None, size=size) fragment_ncdimensions.append(f_ncdim) @@ -576,55 +579,6 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar): }, ) - def _convert_to_builtin_type(self, x): - """Convert a non-JSON-encodable object to a JSON-encodable - built-in type. - - Possible conversions are: - - ============== ============= ====================================== - Input object Output object numpy data types covered - ============== ============= ====================================== - numpy.bool_ bool bool - numpy.integer int int, int8, int16, int32, int64, uint8, - uint16, uint32, uint64 - numpy.floating float float, float16, float32, float64 - ============== ============= ====================================== - - .. versionadded:: 3.0.0 - - :Parameters: - - x: - - :Returns: - - 'int' or `float` or `bool` - - **Examples:** - - >>> type(_convert_to_builtin_type(numpy.bool_(True))) - bool - >>> type(_convert_to_builtin_type(numpy.array([1.0])[0])) - double - >>> type(_convert_to_builtin_type(numpy.array([2])[0])) - int - - """ - if isinstance(x, np.bool_): - return bool(x) - - if isinstance(x, np.integer): - return int(x) - - if isinstance(x, np.floating): - return float(x) - - raise TypeError( - f"{type(x)!r} object can't be converted to a JSON serializable " - f"type: {x!r}" - ) - def _check_valid(self, array, cfvar=None, attributes=None): """Checks for array values outside of the valid range. @@ -792,7 +746,10 @@ def _cfa_write_non_standard_terms( # dimensions, with one value per fragment. If a chunk has # more than one unique value then the fragment's value is # missing data. - dx = data.to_dask_array() + # + # '_cfa_unique' has its own call to 'cf_asanyarray', so + # we can set '_asanyarray=False'. + dx = data.to_dask_array(_asanyarray=False) dx_ind = tuple(range(dx.ndim)) out_ind = dx_ind dx = da.blockwise( @@ -850,6 +807,8 @@ def _cfa_unique(cls, a): data if there is not a unique value. """ + a = cf_asanyarray(a) + out_shape = (1,) * a.ndim a = np.unique(a) if np.ma.isMA(a): @@ -918,17 +877,17 @@ def _cfa_aggregation_instructions(self, data, cfvar): if len(file_details) != 1: if file_details: raise ValueError( - "Can't write CFA-netCDF aggregation variable from " - f"{cfvar!r} when the " - f"dask storage chunk defined by indices {indices} " - "spans two or more files" + f"Can't write {cfvar!r} as a CFA-netCDF " + "aggregation variable: Dask chunk defined by index " + f"{indices} spans two or more fragments. " + "A possible fix for this is to set chunks=None as " + "an argument of a prior call to cf.read" ) raise ValueError( - "Can't write CFA-netCDF aggregation variable from " - f"{cfvar!r} when the " - f"dask storage chunk defined by indices {indices} spans " - "zero files" + f"Can't write {cfvar!r} as a CFA-netCDF " + "aggregation variable: Dask chunk defined by index " + f"{indices} spans zero fragments." 
) filenames, addresses, formats = file_details.pop() @@ -1003,7 +962,10 @@ def _cfa_aggregation_instructions(self, data, cfvar): # Create the location array # ------------------------------------------------------------ dtype = np.dtype(np.int32) - if max(data.to_dask_array().chunksize) > np.iinfo(dtype).max: + if ( + max(data.to_dask_array(_asanyarray=False).chunksize) + > np.iinfo(dtype).max + ): dtype = np.dtype(np.int64) ndim = data.ndim diff --git a/cf/read_write/read.py b/cf/read_write/read.py index a7ad3f23af..c6ad881db9 100644 --- a/cf/read_write/read.py +++ b/cf/read_write/read.py @@ -58,21 +58,26 @@ def read( select_options=None, follow_symlinks=False, mask=True, + unpack=True, warn_valid=False, chunks="auto", domain=False, cfa=None, + netcdf_backend=None, + storage_options=None, + cache=True, ): """Read field or domain constructs from files. - The following file formats are supported: CF-netCDF, CFA-netCDF, - CDL, PP and UM fields datasets. + The following file formats are supported: netCDF, CFA-netCDF, CDL, + UM fields file, and PP. Input datasets are mapped to constructs in memory which are returned as elements of a `FieldList` or if the *domain* parameter is True, a `DomainList`. - NetCDF files may be on disk or on an OPeNDAP server. + NetCDF files may be on disk, on an OPeNDAP server, or in an S3 + object store. Any amount of files of any combination of file types may be read. @@ -408,14 +413,13 @@ def read( parameter. mask: `bool`, optional - If False then do not mask by convention when reading the - data of field or metadata constructs from disk. By default - data is masked by convention. + If True (the default) then mask by convention the data of + field and metadata constructs. - The masking by convention of a netCDF array depends on the - values of any of the netCDF variable attributes - ``_FillValue``, ``missing_value``, ``valid_min``, - ``valid_max`` and ``valid_range``. + A netCDF array is masked depending on the values of any of + the netCDF attributes ``_FillValue``, ``missing_value``, + ``_Unsigned``, ``valid_min``, ``valid_max``, and + ``valid_range``. The masking by convention of a PP or UM array depends on the value of BMDI in the lookup header. A value other than @@ -427,6 +431,16 @@ def read( .. versionadded:: 3.4.0 + unpack: `bool`, optional + If True, the default, then unpack arrays by convention + when the data is read from disk. + + Unpacking is determined by netCDF conventions for the + following variable attributes: ``add_offset``, + ``scale_factor``, and ``_Unsigned``. + + .. versionadded:: NEXTVERSION + warn_valid: `bool`, optional If True then print a warning for the presence of ``valid_min``, ``valid_max`` or ``valid_range`` properties @@ -652,18 +666,91 @@ def read( A dictionary whose key/value pairs define text substitutions to be applied to the fragment file names. Each key may be specified with or without the - ``${...}`` syntax. For instance, the following are - equivalent: ``{'base': 'sub'}``, ``{'${base}': 'sub'}``. - The substitutions are used in conjunction with, and take - precedence over, any that are stored in the CFA-netCDF - file by the ``substitutions`` attribute of the ``file`` - CFA aggregation instruction variable. + ``${*}`` syntax (where `*` represents any amount of any + characters). For instance, ``{'substitution': + 'replacement'}`` and ``{'${substitution}': 'replacement'}``' + are equivalent. 
The substitutions are used in + conjunction with, and take precedence over, any that are + stored in the CFA-netCDF file by the ``substitutions`` + attribute of the ``file`` fragment array variable. *Example:* - ``{'base': 'file:///data/'}`` + ``{'replacement': 'file:///data/'}`` .. versionadded:: 3.15.0 + netcdf_backend: `None` or `str`, optional + Specify which library to use for reading netCDF files. By + default, or if `None`, then the first one of `netCDF4` and + `h5netcdf` to successfully open the netCDF file is + used. Setting *netcdf_backend* to either ``'netCDF4'`` or + ``'h5netcdf'`` will force the use of that library. + + .. note:: The *netcdf_backend* parameter does not affect + the opening of netCDF fragment files that define + the data of aggregation variables. For these, it + is always the case that the first one of + `netCDF4` and `h5netcdf` to successfully open + the file is used. + + .. versionadded:: NEXTVERSION + + storage_options: `dict` or `None`, optional + Pass parameters to the backend file system driver, such as + username, password, server, port, etc. How the storage + options are interpreted depends on the location of the + file: + + **Local File System** + + Storage options are ignored for local files. + + **HTTP(S)** + + Storage options are ignored for files available across the + network via OPeNDAP. + + **S3-compatible services** + + The backend used is `s3fs`, and the storage options are + used to initialise an `s3fs.S3FileSystem` file system + object. By default, or if `None`, then *storage_options* + is taken as ``{}``. + + If the ``'endpoint_url'`` key is not in *storage_options*, + nor in a dictionary defined by the ``'client_kwargs'`` key + (both of which are the case when *storage_options* is + `None`), then one will be automatically inserted for + accessing an S3 file. For example, for a file name of + ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key + with value ``'https://store'`` would be created. To + disable this, set ``'endpoint_url'`` to `None`. + + *Parameter example:* + For a file name of ``'s3://store/data/file.nc'``, the + following are equivalent: ``None``, ``{}``, + ``{'endpoint_url': 'https://store'}``, and + ``{'client_kwargs': {'endpoint_url': 'https://store'}}`` + + *Parameter example:* + ``{'key': 'scaleway-api-key...', 'secret': + 'scaleway-secretkey...', 'endpoint_url': + 'https://s3.fr-par.scw.cloud', 'client_kwargs': + {'region_name': 'fr-par'}}`` + + .. versionadded:: NEXTVERSION + + cache: `bool`, optional + If True, the default, then cache the first and last array + elements of metadata constructs (not field constructs) for + fast future access. In addition, the second and + penultimate array elements will be cached from coordinate + bounds when there are two bounds per cell. For remote + data, setting *cache* to False may speed up the parsing of + the file. + + .. versionadded:: NEXTVERSION + + umversion: deprecated at version 3.0.0 Use the *um* parameter instead.
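For example, a minimal sketch of reading from an S3 object store with these keywords, assuming placeholder bucket, endpoint, and credentials::

    import cf

    fields = cf.read(
        "s3://my-bucket/data/file.nc",       # placeholder S3 location
        netcdf_backend="h5netcdf",           # force the h5netcdf library
        storage_options={
            "key": "my-access-key",          # placeholder credentials
            "secret": "my-secret-key",
            "endpoint_url": "https://store.example.org",
        },
        cache=False,                         # skip element caching for remote data
    )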
@@ -823,6 +910,8 @@ def read( cfa_options["substitutions"] = substitutions + cache = bool(cache) + # Initialise the output list of fields/domains if domain: out = DomainList() @@ -885,8 +974,8 @@ def read( file_glob = os.path.expanduser(os.path.expandvars(file_glob)) scheme = urlparse(file_glob).scheme - if scheme in ("https", "http"): - # Do not glob a URL + if scheme in ("https", "http", "s3"): + # Do not glob a remote URL files2 = (file_glob,) else: # Glob files on disk @@ -951,10 +1040,14 @@ def read( height_at_top_of_model=height_at_top_of_model, chunks=chunks, mask=mask, + unpack=unpack, warn_valid=warn_valid, select=select, domain=domain, cfa_options=cfa_options, + netcdf_backend=netcdf_backend, + storage_options=storage_options, + cache=cache, ) # -------------------------------------------------------- @@ -1064,11 +1157,15 @@ def _read_a_file( extra=None, height_at_top_of_model=None, mask=True, + unpack=True, warn_valid=False, chunks="auto", select=None, domain=False, cfa_options=None, + netcdf_backend=None, + storage_options=None, + cache=True, ): """Read the contents of a single file into a field list. @@ -1090,6 +1187,9 @@ def _read_a_file( mask: `bool`, optional See `cf.read` for details. + unpack: `bool`, optional + See `cf.read` for details. + verbose: `int` or `str` or `None`, optional See `cf.read` for details. @@ -1104,6 +1204,21 @@ def _read_a_file( .. versionadded:: 3.15.0 + storage_options: `dict` or `None`, optional + See `cf.read` for details. + + .. versionadded:: NEXTVERSION + + netcdf_backend: `str` or `None`, optional + See `cf.read` for details. + + .. versionadded:: NEXTVERSION + + cache: `bool`, optional + See `cf.read` for details. + + .. versionadded:: NEXTVERSION + :Returns: `FieldList` or `DomainList` @@ -1138,6 +1253,7 @@ def _read_a_file( "fmt": selected_fmt, "ignore_read_error": ignore_read_error, "cfa_options": cfa_options, + "cache": cache, } # ---------------------------------------------------------------- @@ -1175,8 +1291,11 @@ def _read_a_file( warnings=warnings, extra_read_vars=extra_read_vars, mask=mask, + unpack=unpack, warn_valid=warn_valid, domain=domain, + storage_options=storage_options, + netcdf_backend=netcdf_backend, ) except MaskError: # Some data required for field interpretation is missing, diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index 051562945a..e73166eba1 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -1973,8 +1973,10 @@ def create_data(self): recs = self.recs um_Units = self.um_Units - units = getattr(um_Units, "units", None) - calendar = getattr(um_Units, "calendar", None) + attributes = { + "units": getattr(um_Units, "units", None), + "calendar": getattr(um_Units, "calendar", None), + } data_type_in_file = self.data_type_in_file @@ -2015,8 +2017,7 @@ def create_data(self): fmt=fmt, word_size=self.word_size, byte_ordering=self.byte_ordering, - units=units, - calendar=calendar, + attributes=attributes, ) key = f"{klass_name}-{tokenize(subarray)}" @@ -2069,8 +2070,7 @@ def create_data(self): fmt=fmt, word_size=word_size, byte_ordering=byte_ordering, - units=units, - calendar=calendar, + attributes=attributes, ) key = f"{klass_name}-{tokenize(subarray)}" @@ -2120,8 +2120,7 @@ def create_data(self): fmt=fmt, word_size=word_size, byte_ordering=byte_ordering, - units=units, - calendar=calendar, + attributes=attributes, ) key = f"{klass_name}-{tokenize(subarray)}" @@ -3535,7 +3534,7 @@ def is_um_file(self, filename): """Whether or not a file is a PP file or UM fields file. 
Note that the file type is determined by inspecting the file's - content and any file suffix is not not considered. + content and any file suffix is not considered. :Parameters: diff --git a/cf/read_write/write.py b/cf/read_write/write.py index c3d0edb615..23a8dda3cd 100644 --- a/cf/read_write/write.py +++ b/cf/read_write/write.py @@ -97,22 +97,22 @@ def write( construct. - **NetCDF hierarchical groups** + **NetCDF-4 hierarchical groups** Hierarchical groups in CF provide a mechanism to structure - variables within netCDF4 datasets with well defined rules for + variables within netCDF-4 datasets with well defined rules for resolving references to out-of-group netCDF variables and dimensions. The group structure defined by a field construct's netCDF interface will, by default, be recreated in the output dataset. See the *group* parameter for details. - **NetCDF4 HDF chunk sizes** + **NetCDF-4 HDF chunk sizes** HDF5 chunksizes may be set on contruct's data. See the - `~cf.Data.nc_hdf5_chunksizes`, - `~cf.Data.nc_clear_hdf5_chunksizes` and - `~cf.Data.nc_set_hdf5_chunksizes` methods of a `Data` instance. + `~cf.Data.nc_hdf5_chunksizes`, `~cf.Data.nc_clear_hdf5_chunksizes` + and `~cf.Data.nc_set_hdf5_chunksizes` methods of a `Data` + instance. .. seealso:: `cf.read` @@ -121,7 +121,6 @@ def write( fields: (arbitrarily nested sequence of) `Field` or `FieldList` The field constructs to write to the file. - filename: `str` The output netCDF file name. Various type of expansion are applied to the file names. @@ -548,7 +547,7 @@ def write( variables. By default only auxiliary and scalar coordinate variables are included. - .. versionadded:: (cfdm) 3.7.0 + .. versionadded:: 3.7.0 omit_data: (sequence of) `str`, optional Do not write the data of the named construct types. diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py index 08990ad925..a06d1cc960 100644 --- a/cf/regrid/regrid.py +++ b/cf/regrid/regrid.py @@ -2465,7 +2465,8 @@ def create_esmpy_weights( from netCDF4 import Dataset from .. import __version__ - from ..data.array.netcdfarray import _lock + + from ..data.array.locks import netcdf_lock if ( max(dst_esmpy_field.data.size, src_esmpy_field.data.size) @@ -2491,7 +2492,7 @@ def create_esmpy_weights( if src_grid.ln_z: regrid_method += f", ln {src_grid.method} in vertical" - _lock.acquire() + netcdf_lock.acquire() nc = Dataset(weights_file, "w", format="NETCDF4") nc.title = ( @@ -2532,7 +2533,7 @@ def create_esmpy_weights( v[...] = col nc.close() - _lock.release() + netcdf_lock.release() if esmpy_regrid_operator is None: # Destroy esmpy objects (the esmpy.Grid objects exist even if diff --git a/cf/regrid/regridoperator.py b/cf/regrid/regridoperator.py index 997cd0d6e6..10a77bc641 100644 --- a/cf/regrid/regridoperator.py +++ b/cf/regrid/regridoperator.py @@ -727,9 +727,9 @@ def tosparse(self): # Read the weights from the weights file from netCDF4 import Dataset - from ..data.array.netcdfarray import _lock + from ..data.array.locks import netcdf_lock - _lock.acquire() + netcdf_lock.acquire() nc = Dataset(weights_file, "r") weights = nc.variables["S"][...] row = nc.variables["row"][...] 
@@ -746,7 +746,7 @@ def tosparse(self): row_start_index = 1 nc.close() - _lock.release() + netcdf_lock.release() else: raise ValueError( "Conversion to sparse array format requires at least " diff --git a/cf/test/individual_tests.sh b/cf/test/individual_tests.sh index fea95ef58b..425c7dd435 100755 --- a/cf/test/individual_tests.sh +++ b/cf/test/individual_tests.sh @@ -5,9 +5,9 @@ do echo "Running $file" python $file rc=$? - if [[ $rc != 0 ]]; then - exit $rc - fi +# if [[ $rc != 0 ]]; then +# exit $rc +# fi done file=setup_create_field.py diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index fd61d0e9af..e29c192062 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -1479,6 +1479,18 @@ def test_Data__getitem__(self): f = cf.Data([-999, 35], mask=[True, False]).reshape(2, 1) self.assertTrue(e.equals(f)) + # Chained subspaces reading from disk + f = cf.read(self.filename)[0] + d = f.data + a = d[:1, [1, 3, 4], :][:, [True, False, True], ::-2].array + b = d.array[:1, [1, 3, 4], :][:, [True, False, True], ::-2] + self.assertTrue((a == b).all()) + + d.__keepdims_indexing__ = False + a = d[0, [1, 3, 4], :][[True, False, True], ::-2].array + b = d.array[0, [1, 3, 4], :][[True, False, True], ::-2] + self.assertTrue((a == b).all()) + def test_Data__setitem__(self): """Test the assignment of data elements on Data.""" for hardmask in (False, True): @@ -3277,6 +3289,14 @@ def test_Data_rechunk(self): self.assertEqual(e.chunks, ((4,), (5,))) self.assertTrue(e.equals(d)) + # Test rechunking after a __getitem__ + e = d[:2].rechunk((2, 5)) + self.assertTrue(e.equals(d[:2])) + + d = cf.Data.ones((4, 5), chunks=(4, 5)) + e = d[:2].rechunk((1, 3)) + self.assertTrue(e.equals(d[:2])) + def test_Data_reshape(self): """Test the `reshape` Data method.""" a = np.arange(12).reshape(3, 4) @@ -4500,13 +4520,15 @@ def test_Data__str__(self): def test_Data_cull_graph(self): """Test `Data.cull`""" + # Note: The number of layers in the culled graphs include a + # `cf_asanyarray` layer d = cf.Data([1, 2, 3, 4, 5], chunks=3) d = d[:2] - self.assertEqual(len(dict(d.to_dask_array().dask)), 3) + self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 3) # Check that there are fewer keys after culling d.cull_graph() - self.assertEqual(len(dict(d.to_dask_array().dask)), 2) + self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 2) def test_Data_npartitions(self): """Test the `npartitions` Data property.""" @@ -4754,6 +4776,13 @@ def test_Data_pad_missing(self): with self.assertRaises(ValueError): d.pad_missing(99, to_size=99) + def test_Data_is_masked(self): + """Test Data.is_masked.""" + d = cf.Data(np.arange(6).reshape(2, 3)) + d[0, 0] = cf.masked + self.assertTrue(d[0].is_masked) + self.assertFalse(d[1].is_masked) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py index 3f10e1664e..92392b29f5 100644 --- a/cf/test/test_Field.py +++ b/cf/test/test_Field.py @@ -1466,7 +1466,7 @@ def test_Field_indices(self): shape = (1, 1, 1) self.assertEqual(g.shape, shape) - self.assertEqual(g.array.compressed(), 29) + self.assertEqual(np.ma.compressed(g.array), 29) if mode != "full": self.assertEqual(g.construct("longitude").array, 83) @@ -1484,7 +1484,7 @@ def test_Field_indices(self): shape = (1, 2, 2) self.assertEqual(g.shape, shape) - self.assertTrue((g.array.compressed() == [4, 29]).all()) + self.assertTrue((np.ma.compressed(g.array) == [4, 29]).all()) # Add 2-d auxiliary coordinates with bounds, so we can # 
properly test cf.contains values diff --git a/cf/test/test_FullArray.py b/cf/test/test_FullArray.py new file mode 100644 index 0000000000..63dcb84f34 --- /dev/null +++ b/cf/test/test_FullArray.py @@ -0,0 +1,55 @@ +import datetime +import faulthandler +import unittest + +import numpy as np + +faulthandler.enable() # to debug seg faults and timeouts + +import cf + + +class FullArrayTest(unittest.TestCase): + def test_FullValue_inspection(self): + full = 9 + f = cf.FullArray(full, np.dtype(int), shape=(2, 3, 4)) + self.assertEqual(f.get_full_value(), full) + self.assertEqual(f.shape, (2, 3, 4)) + self.assertEqual(f.dtype, np.dtype(int)) + self.assertIsNone(f.set_full_value(10)) + self.assertEqual(f.get_full_value(), 10) + + def test_FullValue_array(self): + full = 9 + f = cf.FullArray(full, np.dtype(int), shape=(2, 3, 4)) + self.assertTrue((f.array == np.full(f.shape, full)).all()) + + f = f[0, [True, False, True], ::3] + self.assertTrue((f.array == np.full((2, 1), full)).all()) + + def test_FullValue_masked_array(self): + full = np.ma.masked + f = cf.FullArray(full, np.dtype(int), shape=(2, 3)) + + a = np.ma.masked_all(f.shape, dtype=np.dtype(int)) + array = f.array + self.assertEqual(array.dtype, a.dtype) + self.assertTrue( + (np.ma.getmaskarray(array) == np.ma.getmaskarray(a)).all() + ) + + def test_FullValue_get_array(self): + full = 9 + f = cf.FullArray(full, np.dtype(int), shape=(2, 3)) + f = f[0, 1] + self.assertEqual(f.shape, ()) + + array = f._get_array(index=Ellipsis) + self.assertTrue((array == np.full((2, 3), full)).all()) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cf.environment() + print() + unittest.main(verbosity=2) diff --git a/cf/test/test_NetCDFArray.py b/cf/test/test_NetCDF4Array.py similarity index 56% rename from cf/test/test_NetCDFArray.py rename to cf/test/test_NetCDF4Array.py index c69a4654e7..0d049ff497 100644 --- a/cf/test/test_NetCDFArray.py +++ b/cf/test/test_NetCDF4Array.py @@ -5,6 +5,7 @@ import tempfile import unittest +import numpy as np from dask.base import tokenize faulthandler.enable() # to debug seg faults and timeouts @@ -13,7 +14,7 @@ n_tmpfiles = 1 tmpfiles = [ - tempfile.mkstemp("_test_NetCDFArray.nc", dir=os.getcwd())[1] + tempfile.mkstemp("_test_NetCDF4Array.nc", dir=os.getcwd())[1] for i in range(n_tmpfiles) ] (tmpfile1,) = tmpfiles @@ -31,15 +32,22 @@ def _remove_tmpfiles(): atexit.register(_remove_tmpfiles) -class NetCDFArrayTest(unittest.TestCase): - def test_NetCDFArray_del_file_location(self): - a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) +class NetCDF4ArrayTest(unittest.TestCase): + n = cf.NetCDF4Array( + filename="filename.nc", + address="x", + shape=(5, 8), + dtype=np.dtype(float), + ) + + def test_NetCDF4Array_del_file_location(self): + a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) b = a.del_file_location("/data1") self.assertIsNot(b, a) self.assertEqual(b.get_filenames(), ("/data2/file2",)) self.assertEqual(b.get_addresses(), ("tas2",)) - a = cf.NetCDFArray( + a = cf.NetCDF4Array( ("/data1/file1", "/data2/file1", "/data2/file2"), ("tas1", "tas1", "tas2"), ) @@ -52,18 +60,18 @@ def test_NetCDFArray_del_file_location(self): with self.assertRaises(ValueError): b.del_file_location("/data1/") - def test_NetCDFArray_file_locations(self): - a = cf.NetCDFArray("/data1/file1") + def test_NetCDF4Array_file_locations(self): + a = cf.NetCDF4Array("/data1/file1") self.assertEqual(a.file_locations(), ("/data1",)) - a = cf.NetCDFArray(("/data1/file1", 
"/data2/file2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file2")) self.assertEqual(a.file_locations(), ("/data1", "/data2")) - a = cf.NetCDFArray(("/data1/file1", "/data2/file2", "/data1/file2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file2", "/data1/file2")) self.assertEqual(a.file_locations(), ("/data1", "/data2", "/data1")) - def test_NetCDFArray_add_file_location(self): - a = cf.NetCDFArray("/data1/file1", "tas") + def test_NetCDF4Array_add_file_location(self): + a = cf.NetCDF4Array("/data1/file1", "tas") b = a.add_file_location("/home/user") self.assertIsNot(b, a) self.assertEqual( @@ -71,7 +79,7 @@ def test_NetCDFArray_add_file_location(self): ) self.assertEqual(b.get_addresses(), ("tas", "tas")) - a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"), ("tas1", "tas2")) b = a.add_file_location("/home/user") self.assertEqual( b.get_filenames(), @@ -84,7 +92,7 @@ def test_NetCDFArray_add_file_location(self): ) self.assertEqual(b.get_addresses(), ("tas1", "tas2", "tas1", "tas2")) - a = cf.NetCDFArray(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) b = a.add_file_location("/home/user") self.assertEqual( b.get_filenames(), @@ -92,24 +100,24 @@ def test_NetCDFArray_add_file_location(self): ) self.assertEqual(b.get_addresses(), ("tas1", "tas2", "tas1")) - a = cf.NetCDFArray(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) + a = cf.NetCDF4Array(("/data1/file1", "/data2/file1"), ("tas1", "tas2")) b = a.add_file_location("/data1/") self.assertEqual(b.get_filenames(), a.get_filenames()) self.assertEqual(b.get_addresses(), a.get_addresses()) - def test_NetCDFArray__dask_tokenize__(self): - a = cf.NetCDFArray("/data1/file1", "tas", shape=(12, 2), mask=False) + def test_NetCDF4Array__dask_tokenize__(self): + a = cf.NetCDF4Array("/data1/file1", "tas", shape=(12, 2), mask=False) self.assertEqual(tokenize(a), tokenize(a.copy())) - b = cf.NetCDFArray("/home/file2", "tas", shape=(12, 2)) + b = cf.NetCDF4Array("/home/file2", "tas", shape=(12, 2)) self.assertNotEqual(tokenize(a), tokenize(b)) - def test_NetCDFArray_multiple_files(self): + def test_NetCDF4Array_multiple_files(self): f = cf.example_field(0) cf.write(f, tmpfile1) # Create instance with non-existent file - n = cf.NetCDFArray( + n = cf.NetCDF4Array( filename=os.path.join("/bad/location", os.path.basename(tmpfile1)), address=f.nc_get_variable(), shape=f.shape, @@ -121,6 +129,40 @@ def test_NetCDFArray_multiple_files(self): self.assertEqual(len(n.get_filenames()), 2) self.assertTrue((n[...] 
== f.array).all()) + def test_NetCDF4Array_shape(self): + shape = (12, 73, 96) + a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) + self.assertEqual(a.shape, shape) + self.assertEqual(a.original_shape, shape) + a = a[::2] + self.assertEqual(a.shape, (shape[0] // 2,) + shape[1:]) + self.assertEqual(a.original_shape, shape) + + def test_NetCDF4Array_index(self): + shape = (12, 73, 96) + a = cf.NetCDF4Array("/home/file2", "tas", shape=shape) + self.assertEqual( + a.index(), + ( + slice( + None, + ), + ) + * len(shape), + ) + a = a[8:7:-1, 10:19:3, [15, 1, 4, 12]] + a = a[[0], [True, False, True], ::-2] + self.assertEqual(a.shape, (1, 2, 2)) + self.assertEqual( + a.index(), + (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)), + ) + + index = a.index(conform=False) + self.assertTrue((index[0] == [8]).all()) + self.assertTrue((index[1] == [10, 16]).all()) + self.assertTrue((index[2] == [12, 1]).all()) + if __name__ == "__main__": print("Run date:", datetime.datetime.now()) diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py new file mode 100644 index 0000000000..f14e063849 --- /dev/null +++ b/cf/test/test_active_storage.py @@ -0,0 +1,69 @@ +import atexit +import datetime +import faulthandler +import os +import tempfile +import unittest + +faulthandler.enable() # to debug seg faults and timeouts + +import cf + +try: + from activestorage import Active +except ModuleNotFoundError: + Active = None + +n_tmpfiles = 2 +tmpfiles = [ + tempfile.mkstemp("_test_active_storage.nc", dir=os.getcwd())[1] + for i in range(n_tmpfiles) +] +(tmpfile, tmpfile2) = tmpfiles + + +def _remove_tmpfiles(): + """Try to remove defined temporary files by deleting their paths.""" + for f in tmpfiles: + try: + os.remove(f) + except OSError: + pass + + +atexit.register(_remove_tmpfiles) + + +class ActiveStorageTest(unittest.TestCase): + @unittest.skipUnless(Active is not None, "Requires activestorage.Active") + def test_active_storage(self): + # No masked values + f = cf.example_field(0) + cf.write(f, tmpfile) + + f = cf.read(tmpfile, chunks={"latitude": (4, 1), "longitude": (3, 5)}) + f = f[0] + self.assertEqual(f.data.chunks, ((4, 1), (3, 5))) + + cf.active_storage(False) + self.assertFalse(cf.active_storage()) + f.collapse("mean", weights=False) + + local_array = f.collapse("mean", weights=False).array + + with cf.configuration(active_storage=True, active_storage_url="dummy"): + self.assertTrue(cf.active_storage()) + self.assertEqual(cf.active_storage_url(), "dummy") + active_array = f.collapse("mean", weights=False).array + + self.assertEqual(active_array, local_array) + + # TODOACTIVE: Test with masked values (not yet working in + # activestorage.Active) + + +if __name__ == "__main__": + print("Run date:", datetime.datetime.now()) + cf.environment() + print("") + unittest.main(verbosity=2) diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py index 9cc5e5eab1..f6cce13ae0 100644 --- a/cf/test/test_functions.py +++ b/cf/test/test_functions.py @@ -58,7 +58,7 @@ def test_configuration(self): self.assertIsInstance(org, dict) # Check all keys that should be there are, with correct value type: - self.assertEqual(len(org), 8) # update expected len if add new key(s) + self.assertEqual(len(org), 11) # update expected len if add new key(s) # Types expected: self.assertIsInstance(org["atol"], float) @@ -68,6 +68,8 @@ def test_configuration(self): self.assertIsInstance(org["bounds_combination_mode"], str) self.assertIsInstance(org["regrid_logging"], bool) 
self.assertIsInstance(org["tempdir"], str) + self.assertIsInstance(org["active_storage"], bool) + self.assertIsInstance(org["active_storage_max_requests"], int) # Log level may be input as an int but always given as # equiv. string self.assertIsInstance(org["log_level"], str) @@ -87,12 +89,20 @@ def test_configuration(self): "bounds_combination_mode": "XOR", "log_level": "INFO", "chunksize": 8e9, + "active_storage": True, + "active_storage_url": None, + "active_storage_max_requests": 100, } # Test the setting of each lone item. expected_post_set = dict(org) # copy for safety with mutable dict for setting, value in reset_values.items(): - cf.configuration(**{setting: value}) + try: + cf.configuration(**{setting: value}) + except ModuleNotFoundError as error: + print(f"WARNING: not testing {setting!r} due to: {error}") + continue + post_set = cf.configuration() # Expect a dict that is identical to the original to start diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 3347fee8ea..f0bf697fac 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -8,7 +8,7 @@ import tempfile import unittest -import numpy +import numpy as np faulthandler.enable() # to debug seg faults and timeouts @@ -93,26 +93,26 @@ def test_read_mask(self): cf.write(f, tmpfile) g = cf.read(tmpfile)[0] - self.assertEqual(numpy.ma.count(g.data.array), N - 2) + self.assertEqual(np.ma.count(g.data.array), N - 2) g = cf.read(tmpfile, mask=False)[0] - self.assertEqual(numpy.ma.count(g.data.array), N) + self.assertEqual(np.ma.count(g.data.array), N) g.apply_masking(inplace=True) - self.assertEqual(numpy.ma.count(g.data.array), N - 2) + self.assertEqual(np.ma.count(g.data.array), N - 2) f.set_property("_FillValue", 999) f.set_property("missing_value", -111) cf.write(f, tmpfile) g = cf.read(tmpfile)[0] - self.assertEqual(numpy.ma.count(g.data.array), N - 2) + self.assertEqual(np.ma.count(g.data.array), N - 2) g = cf.read(tmpfile, mask=False)[0] - self.assertEqual(numpy.ma.count(g.data.array), N) + self.assertEqual(np.ma.count(g.data.array), N) g.apply_masking(inplace=True) - self.assertEqual(numpy.ma.count(g.data.array), N - 2) + self.assertEqual(np.ma.count(g.data.array), N - 2) def test_read_directory(self): pwd = os.getcwd() + "/" @@ -562,38 +562,38 @@ def test_read_write_netCDF4_compress_shuffle(self): def test_write_datatype(self): f = cf.read(self.filename)[0] - self.assertEqual(f.dtype, numpy.dtype(float)) + self.assertEqual(f.dtype, np.dtype(float)) cf.write( f, tmpfile, fmt="NETCDF4", - datatype={numpy.dtype(float): numpy.dtype("float32")}, + datatype={np.dtype(float): np.dtype("float32")}, ) g = cf.read(tmpfile)[0] self.assertEqual( g.dtype, - numpy.dtype("float32"), + np.dtype("float32"), "datatype read in is " + str(g.dtype), ) # Keyword single f = cf.read(self.filename)[0] - self.assertEqual(f.dtype, numpy.dtype(float)) + self.assertEqual(f.dtype, np.dtype(float)) cf.write(f, tmpfile, fmt="NETCDF4", single=True) g = cf.read(tmpfile)[0] self.assertEqual( g.dtype, - numpy.dtype("float32"), + np.dtype("float32"), "datatype read in is " + str(g.dtype), ) # Keyword double f = g - self.assertEqual(f.dtype, numpy.dtype("float32")) + self.assertEqual(f.dtype, np.dtype("float32")) cf.write(f, tmpfile2, fmt="NETCDF4", double=True) g = cf.read(tmpfile2)[0] self.assertEqual( - g.dtype, numpy.dtype(float), "datatype read in is " + str(g.dtype) + g.dtype, np.dtype(float), "datatype read in is " + str(g.dtype) ) for single in (True, False): @@ -601,7 +601,7 @@ def test_write_datatype(self): 
with self.assertRaises(Exception): cf.write(g, double=double, single=single) - datatype = {numpy.dtype(float): numpy.dtype("float32")} + datatype = {np.dtype(float): np.dtype("float32")} with self.assertRaises(Exception): cf.write(g, datatype=datatype, single=True) @@ -898,8 +898,8 @@ def test_write_omit_data(self): g = g[0] # Check that the data are missing - self.assertFalse(g.array.count()) - self.assertFalse(g.construct("grid_latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertFalse(np.ma.count(g.construct("grid_latitude").array)) # Check that a dump works g.dump(display=False) @@ -909,16 +909,16 @@ def test_write_omit_data(self): # Check that only the field and dimension coordinate data are # missing - self.assertFalse(g.array.count()) - self.assertFalse(g.construct("grid_latitude").array.count()) - self.assertTrue(g.construct("latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertFalse(np.ma.count(g.construct("grid_latitude").array)) + self.assertTrue(np.ma.count(g.construct("latitude").array)) cf.write(f, tmpfile, omit_data="field") g = cf.read(tmpfile)[0] # Check that only the field data are missing - self.assertFalse(g.array.count()) - self.assertTrue(g.construct("grid_latitude").array.count()) + self.assertFalse(np.ma.count(g.array)) + self.assertTrue(np.ma.count(g.construct("grid_latitude").array)) @unittest.skipUnless( False, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION. TODO REPLACE URL" diff --git a/docs/cheat_sheet.html b/docs/cheat_sheet.html index ef89f98a11..52748b2401 100644 --- a/docs/cheat_sheet.html +++ b/docs/cheat_sheet.html @@ -133,7 +133,7 @@

 Related Topics

 Version 3.16.2 for version 1.11 of the CF conventions.

 This cheat sheet provides a summary of some key functions and methods in
 cf-python (also available as a printable PDF for
-pdf).
+pdf).
@@ -440,4 +440,4 @@

 Related Topics

- \ No newline at end of file + diff --git a/docs/source/cf_data_model.rst b/docs/source/cf_data_model.rst deleted file mode 100644 index 1838fe8a1d..0000000000 --- a/docs/source/cf_data_model.rst +++ /dev/null @@ -1,110 +0,0 @@ -.. currentmodule:: cf -.. default-role:: obj - -.. raw:: html - - - -.. role:: small - -.. _CF-data-model: - -**CF data model** -================= - ----- - -Version |release| for version |version| of the CF conventions. - -The CF (Climate and Forecast) metadata conventions -(http://cfconventions.org) provide a description of the physical -meaning of data and of their spatial and temporal properties and are -designed to promote the creation, processing, and sharing of climate -and forecasting data using netCDF files and libraries -(https://www.unidata.ucar.edu/software/netcdf). - -`The CF data model -`_ -identifies the fundamental elements ("constructs") of the CF -conventions and shows how they relate to each other, independently of -the netCDF encoding. - -The CF data model defines a **field construct** for storing data with -all of its metadata. It is defined as follows: - -.. glossary:: - - field construct - corresponds to a CF-netCDF data variable with all of its - metadata. It consists of - - - descriptive properties that apply to field construct as a whole - (e.g. the standard name), - - - a data array, - - - a **domain construct** that describes the locations of each cell - of the data array (i.e. the "domain"), - - - **metadata constructs** that describe the physical nature of the - data array, defined by - - .. glossary:: - - field ancillary constructs - corresponding to CF-netCDF ancillary variables - - cell method constructs - corresponding to a CF-netCDF cell_methods attribute of data - variable - - domain construct - that describes the locations of each cell of the domain. It may - exist independently of a **field construct** and consists of - - - descriptive properties that apply to domain construct as a whole, - - - **metadata constructs** that describe the locations of each cell - of the domain, defined by - - .. glossary:: - - domain axis constructs - corresponding to CF-netCDF dimensions or scalar coordinate - variables - - dimension coordinate constructs - corresponding to CF-netCDF coordinate variables or numeric - scalar coordinate variables - - auxiliary coordinate constructs - corresponding to CF-netCDF auxiliary coordinate variables and - non-numeric scalar coordinate variables - - coordinate reference constructs - corresponding to CF-netCDF grid mapping variables or the - formula_terms attribute of a coordinate variable - - domain ancillary constructs - corresponding to CF-netCDF variables named by the - formula_terms attribute of a coordinate variable - - cell measure constructs - corresponding to CF-netCDF cell measure variables - - domain topology constructs - corresponding to CF-netCDF UGRID mesh topology variables - - cell connectivity constructs - corresponding to CF-netCDF UGRID connectivity variables - ----- - - -| - -.. figure:: images/cfdm_field.svg - - *The constructs of the CF data model described using UML. The field construct corresponds to a CF-netCDF data variable. The domain construct provides the linkage between the field construct and the constructs which describe measurement locations and cell properties. 
It is useful to define an abstract generic coordinate construct that can be used to refer to coordinates when the their type (dimension or auxiliary coordinate construct) is not an issue.* - ----- diff --git a/docs/source/class.rst b/docs/source/class.rst index dfa24006c8..cd236ff624 100644 --- a/docs/source/class.rst +++ b/docs/source/class.rst @@ -77,24 +77,11 @@ Data classes :toctree: class/ cf.Data - cf.NetCDFArray - -Miscellaneous classes ---------------------- - -.. autosummary:: - :nosignatures: - :toctree: class/ - - cf.Flags - cf.Query - cf.TimeDuration - cf.Units - cf.RegridOperator - cf.Constant - cf.Configuration - - + cf.H5netcdfArray + cf.NetCDF4Array + cf.FullArray + cf.UMArray + Data compression classes ------------------------ @@ -111,3 +98,33 @@ Classes that support the creation and storage of compressed arrays. cf.RaggedContiguousArray cf.RaggedIndexedArray cf.RaggedIndexedContiguousArray + cf.SubsampledArray + +Data UGRID classes +------------------ + +Classes that support the creation and storage of UGRID-related arrays. + +.. autosummary:: + :nosignatures: + :toctree: class/ + + cf.BoundsFromNodesArray + cf.CellConnectivityArray + cf.PointTopologyArray + +Miscellaneous classes +--------------------- + +.. autosummary:: + :nosignatures: + :toctree: class/ + + cf.Flags + cf.Query + cf.TimeDuration + cf.Units + cf.RegridOperator + cf.Constant + cf.Configuration + diff --git a/docs/source/class/cf.AuxiliaryCoordinate.rst b/docs/source/class/cf.AuxiliaryCoordinate.rst index cfa9503c68..e3d69564ed 100644 --- a/docs/source/class/cf.AuxiliaryCoordinate.rst +++ b/docs/source/class/cf.AuxiliaryCoordinate.rst @@ -502,6 +502,9 @@ NetCDF ~cf.AuxiliaryCoordinate.nc_set_node_coordinate_variable ~cf.AuxiliaryCoordinate.nc_set_node_coordinate_variable_groups ~cf.AuxiliaryCoordinate.nc_clear_node_coordinate_variable_groups + ~cf.AuxiliaryCoordinate.nc_clear_hdf5_chunksizes + ~cf.AuxiliaryCoordinate.nc_hdf5_chunksizes + ~cf.AuxiliaryCoordinate.nc_set_hdf5_chunksizes Groups ^^^^^^ diff --git a/docs/source/class/cf.Bounds.rst b/docs/source/class/cf.Bounds.rst index 364076340e..4b31a8836e 100644 --- a/docs/source/class/cf.Bounds.rst +++ b/docs/source/class/cf.Bounds.rst @@ -414,6 +414,9 @@ NetCDF ~cf.Bounds.nc_get_dimension ~cf.Bounds.nc_has_dimension ~cf.Bounds.nc_set_dimension + ~cf.Bounds.nc_clear_hdf5_chunksizes + ~cf.Bounds.nc_hdf5_chunksizes + ~cf.Bounds.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.CellMeasure.rst b/docs/source/class/cf.CellMeasure.rst index 1488fce781..96a7727675 100644 --- a/docs/source/class/cf.CellMeasure.rst +++ b/docs/source/class/cf.CellMeasure.rst @@ -434,6 +434,9 @@ NetCDF ~cf.CellMeasure.nc_set_variable ~cf.CellMeasure.nc_get_external ~cf.CellMeasure.nc_set_external + ~cf.CellMeasure.nc_clear_hdf5_chunksizes + ~cf.CellMeasure.nc_hdf5_chunksizes + ~cf.CellMeasure.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.Count.rst b/docs/source/class/cf.Count.rst index 97250f29f1..64f71624fc 100644 --- a/docs/source/class/cf.Count.rst +++ b/docs/source/class/cf.Count.rst @@ -411,7 +411,10 @@ NetCDF ~cf.Count.nc_get_sample_dimension ~cf.Count.nc_has_sample_dimension ~cf.Count.nc_set_sample_dimension - + ~cf.Count.nc_clear_hdf5_chunksizes + ~cf.Count.nc_hdf5_chunksizes + ~cf.Count.nc_set_hdf5_chunksizes + CFA --- diff --git a/docs/source/class/cf.Data.rst b/docs/source/class/cf.Data.rst index d4d5fac596..eb1fc87826 100644 --- a/docs/source/class/cf.Data.rst +++ b/docs/source/class/cf.Data.rst @@ -591,6 +591,18 @@ Compression by 
convention ~cf.Data.compressed_array +Active storage +-------------- + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cf.Data.active_storage + Miscellaneous ------------- diff --git a/docs/source/class/cf.DimensionCoordinate.rst b/docs/source/class/cf.DimensionCoordinate.rst index 300034b4c5..b031250aa6 100644 --- a/docs/source/class/cf.DimensionCoordinate.rst +++ b/docs/source/class/cf.DimensionCoordinate.rst @@ -506,7 +506,10 @@ NetCDF ~cf.DimensionCoordinate.nc_del_variable ~cf.DimensionCoordinate.nc_get_variable ~cf.DimensionCoordinate.nc_has_variable - ~cf.DimensionCoordinate.nc_set_variable + ~cf.DimensionCoordinate.nc_set_variable + ~cf.DimensionCoordinate.nc_clear_hdf5_chunksizes + ~cf.DimensionCoordinate.nc_hdf5_chunksizes + ~cf.DimensionCoordinate.nc_set_hdf5_chunksizes Groups ^^^^^^ diff --git a/docs/source/class/cf.DomainAncillary.rst b/docs/source/class/cf.DomainAncillary.rst index c9e8643726..f9b5e2fa3d 100644 --- a/docs/source/class/cf.DomainAncillary.rst +++ b/docs/source/class/cf.DomainAncillary.rst @@ -463,6 +463,9 @@ NetCDF ~cf.DomainAncillary.nc_get_variable ~cf.DomainAncillary.nc_has_variable ~cf.DomainAncillary.nc_set_variable + ~cf.DomainAncillary.nc_clear_hdf5_chunksizes + ~cf.DomainAncillary.nc_hdf5_chunksizes + ~cf.DomainAncillary.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.Field.rst b/docs/source/class/cf.Field.rst index 2239992be8..eed7dad7ac 100644 --- a/docs/source/class/cf.Field.rst +++ b/docs/source/class/cf.Field.rst @@ -416,6 +416,9 @@ NetCDF ~cf.Field.nc_set_global_attribute ~cf.Field.nc_set_global_attributes ~cf.Field.ncdimensions + ~cf.Field.nc_clear_hdf5_chunksizes + ~cf.Field.nc_hdf5_chunksizes + ~cf.Field.nc_set_hdf5_chunksizes Groups ^^^^^^ diff --git a/docs/source/class/cf.FieldAncillary.rst b/docs/source/class/cf.FieldAncillary.rst index cf585d00a9..0afc0fa7a9 100644 --- a/docs/source/class/cf.FieldAncillary.rst +++ b/docs/source/class/cf.FieldAncillary.rst @@ -408,6 +408,9 @@ NetCDF ~cf.FieldAncillary.nc_get_variable ~cf.FieldAncillary.nc_has_variable ~cf.FieldAncillary.nc_set_variable + ~cf.FieldAncillary.nc_clear_hdf5_chunksizes + ~cf.FieldAncillary.nc_hdf5_chunksizes + ~cf.FieldAncillary.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.GatheredArray.rst b/docs/source/class/cf.GatheredArray.rst index f6693a2a33..8f2d3997b7 100644 --- a/docs/source/class/cf.GatheredArray.rst +++ b/docs/source/class/cf.GatheredArray.rst @@ -20,11 +20,12 @@ cf.GatheredArray ~cf.GatheredArray.compressed_dimensions ~cf.GatheredArray.conformed_data ~cf.GatheredArray.copy + ~cf.GatheredArray.get_attributes ~cf.GatheredArray.get_calendar ~cf.GatheredArray.get_compressed_axes ~cf.GatheredArray.get_compressed_dimension ~cf.GatheredArray.get_compression_type - ~cf.GatheredArray.get_filename' + ~cf.GatheredArray.get_filename ~cf.GatheredArray.get_filenames ~cf.GatheredArray.get_list ~cf.GatheredArray.get_Subarray diff --git a/docs/source/class/cf.H5netcdfArray.rst b/docs/source/class/cf.H5netcdfArray.rst new file mode 100644 index 0000000000..6b51c6bc5a --- /dev/null +++ b/docs/source/class/cf.H5netcdfArray.rst @@ -0,0 +1,142 @@ +.. currentmodule:: cf +.. default-role:: obj + +cf.H5netcdfArray +================ + +---- + +.. autoclass:: cf.H5netcdfArray + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. 
autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.get_compression_type + ~cf.H5netcdfArray.get_subspace + ~cf.H5netcdfArray.get_attributes + ~cf.H5netcdfArray.index + + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cf.H5netcdfArray.array + ~cf.H5netcdfArray.dtype + ~cf.H5netcdfArray.ndim + ~cf.H5netcdfArray.shape + ~cf.H5netcdfArray.size + ~cf.H5netcdfArray.original_shape + +Units +----- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.get_calendar + ~cf.H5netcdfArray.get_units + ~cf.H5netcdfArray.Units + +File +---- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.get_address + ~cf.H5netcdfArray.get_addresses + ~cf.H5netcdfArray.close + ~cf.H5netcdfArray.open + ~cf.H5netcdfArray.get_filename + ~cf.H5netcdfArray.get_filenames + ~cf.H5netcdfArray.get_format + ~cf.H5netcdfArray.get_formats + ~cf.H5netcdfArray.get_groups + ~cf.H5netcdfArray.get_mask + ~cf.H5netcdfArray.get_unpack + ~cf.H5netcdfArray.get_storage_options + ~cf.H5netcdfArray.add_file_location + ~cf.H5netcdfArray.del_file_location + ~cf.H5netcdfArray.file_locations + ~cf.H5netcdfArray._lock + +Miscellaneous +------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.copy + ~cf.H5netcdfArray.to_memory + +Active storage +-------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.active_storage + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.__dask_tokenize__ + ~cf.H5netcdfArray.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray._docstring_special_substitutions + ~cf.H5netcdfArray._docstring_substitutions + ~cf.H5netcdfArray._docstring_package_depth + ~cf.H5netcdfArray._docstring_method_exclusions + +Deprecated +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.H5netcdfArray.filename + ~cf.H5netcdfArray.get_missing_values diff --git a/docs/source/class/cf.Index.rst b/docs/source/class/cf.Index.rst index b72adb3d8b..46cc545cd6 100644 --- a/docs/source/class/cf.Index.rst +++ b/docs/source/class/cf.Index.rst @@ -412,6 +412,9 @@ NetCDF ~cf.Index.nc_get_sample_dimension ~cf.Index.nc_has_sample_dimension ~cf.Index.nc_set_sample_dimension + ~cf.Index.nc_clear_hdf5_chunksizes + ~cf.Index.nc_hdf5_chunksizes + ~cf.Index.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.List.rst b/docs/source/class/cf.List.rst index 2760506db6..5e8de3f544 100644 --- a/docs/source/class/cf.List.rst +++ b/docs/source/class/cf.List.rst @@ -404,6 +404,9 @@ NetCDF ~cf.List.nc_get_variable ~cf.List.nc_has_variable ~cf.List.nc_set_variable + ~cf.List.nc_clear_hdf5_chunksizes + ~cf.List.nc_hdf5_chunksizes + ~cf.List.nc_set_hdf5_chunksizes CFA --- diff --git a/docs/source/class/cf.NetCDF4Array.rst b/docs/source/class/cf.NetCDF4Array.rst new file mode 100644 index 0000000000..f3e668ec92 --- /dev/null +++ b/docs/source/class/cf.NetCDF4Array.rst @@ -0,0 +1,144 @@ +.. currentmodule:: cf +.. default-role:: obj + +cf.NetCDF4Array +=============== + +---- + +.. 
autoclass:: cf.NetCDF4Array + :no-members: + :no-inherited-members: + +Inspection +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.get_compression_type + ~cf.NetCDF4Array.get_subspace + ~cf.NetCDF4Array.get_attributes + ~cf.NetCDF4Array.index + + +.. rubric:: Attributes + +.. autosummary:: + :nosignatures: + :toctree: ../attribute/ + :template: attribute.rst + + ~cf.NetCDF4Array.array + ~cf.NetCDF4Array.dtype + ~cf.NetCDF4Array.ndim + ~cf.NetCDF4Array.shape + ~cf.NetCDF4Array.size + ~cf.NetCDF4Array.original_shape + +Units +----- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.get_calendar + ~cf.NetCDF4Array.get_units + ~cf.NetCDF4Array.Units + + + +File +---- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.get_address + ~cf.NetCDF4Array.get_addresses + ~cf.NetCDF4Array.close + ~cf.NetCDF4Array.open + ~cf.NetCDF4Array.get_filename + ~cf.NetCDF4Array.get_filenames + ~cf.NetCDF4Array.get_format + ~cf.NetCDF4Array.get_formats + ~cf.NetCDF4Array.get_groups + ~cf.NetCDF4Array.get_mask + ~cf.NetCDF4Array.get_unpack + ~cf.NetCDF4Array.get_storage_options + ~cf.NetCDF4Array.add_file_location + ~cf.NetCDF4Array.del_file_location + ~cf.NetCDF4Array.file_locations + ~cf.NetCDF4Array._lock + +Miscellaneous +------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.copy + ~cf.NetCDF4Array.to_memory + +Active storage +-------------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.active_storage + +Special +------- + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.__dask_tokenize__ + ~cf.NetCDF4Array.__getitem__ + +Docstring substitutions +----------------------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array._docstring_special_substitutions + ~cf.NetCDF4Array._docstring_substitutions + ~cf.NetCDF4Array._docstring_package_depth + ~cf.NetCDF4Array._docstring_method_exclusions + +Deprecated +---------- + +.. rubric:: Methods + +.. autosummary:: + :nosignatures: + :toctree: ../method/ + :template: method.rst + + ~cf.NetCDF4Array.filename + ~cf.NetCDF4Array.get_missing_values diff --git a/docs/source/class/cf.NetCDFArray.rst b/docs/source/class/cf.NetCDFArray.rst deleted file mode 100644 index 34d7bf0d65..0000000000 --- a/docs/source/class/cf.NetCDFArray.rst +++ /dev/null @@ -1,70 +0,0 @@ -.. currentmodule:: cf -.. default-role:: obj - -cf.NetCDFArray -============== - ----- - -.. autoclass:: cf.NetCDFArray - :no-members: - :no-inherited-members: - -.. rubric:: Methods - -.. 
autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cf.NetCDFArray.add_file_location - ~cf.NetCDFArray.close - ~cf.NetCDFArray.copy - ~cf.NetCDFArray.del_file_location - ~cf.NetCDFArray.file_locations - ~cf.NetCDFArray.filename - ~cf.NetCDFArray.get_address - ~cf.NetCDFArray.get_addresses - ~cf.NetCDFArray.get_format - ~cf.NetCDFArray.get_formats - ~cf.NetCDFArray.get_calendar - ~cf.NetCDFArray.get_compression_type - ~cf.NetCDFArray.get_filename - ~cf.NetCDFArray.get_filenames - ~cf.NetCDFArray.get_group - ~cf.NetCDFArray.get_groups - ~cf.NetCDFArray.get_mask - ~cf.NetCDFArray.get_missing_values - ~cf.NetCDFArray.get_ncvar - ~cf.NetCDFArray.get_subspace - ~cf.NetCDFArray.get_units - ~cf.NetCDFArray.get_varid - ~cf.NetCDFArray.open - ~cf.NetCDFArray.to_memory - ~cf.NetCDFArray.Units - -.. rubric:: Attributes - -.. autosummary:: - :nosignatures: - :toctree: ../attribute/ - :template: attribute.rst - - ~cf.NetCDFArray.array - ~cf.NetCDFArray.dtype - ~cf.NetCDFArray.file_address - ~cf.NetCDFArray.ndim - ~cf.NetCDFArray.shape - ~cf.NetCDFArray.size - -Special -------- - -.. rubric:: Methods - -.. autosummary:: - :nosignatures: - :toctree: ../method/ - :template: method.rst - - ~cf.NetCDFArray.__getitem__ diff --git a/docs/source/class/cf.RaggedContiguousArray.rst b/docs/source/class/cf.RaggedContiguousArray.rst index e1f6fd8dc1..1543586451 100644 --- a/docs/source/class/cf.RaggedContiguousArray.rst +++ b/docs/source/class/cf.RaggedContiguousArray.rst @@ -21,12 +21,13 @@ cf.RaggedContiguousArray ~cf.RaggedContiguousArray.compressed_dimensions ~cf.RaggedContiguousArray.conformed_data ~cf.RaggedContiguousArray.copy + ~cf.RaggedContiguousArray.get_attributes ~cf.RaggedContiguousArray.get_calendar ~cf.RaggedContiguousArray.get_compressed_axes ~cf.RaggedContiguousArray.get_compressed_dimension ~cf.RaggedContiguousArray.get_compression_type ~cf.RaggedContiguousArray.get_count - ~cf.RaggedContiguousArray.get_filename' + ~cf.RaggedContiguousArray.get_filename ~cf.RaggedContiguousArray.get_filenames ~cf.RaggedContiguousArray.get_index ~cf.RaggedContiguousArray.get_Subarray diff --git a/docs/source/class/cf.RaggedIndexedArray.rst b/docs/source/class/cf.RaggedIndexedArray.rst index abc4edc74a..2b0fb76075 100644 --- a/docs/source/class/cf.RaggedIndexedArray.rst +++ b/docs/source/class/cf.RaggedIndexedArray.rst @@ -20,12 +20,13 @@ cf.RaggedIndexedArray ~cf.RaggedIndexedArray.compressed_dimensions ~cf.RaggedIndexedArray.conformed_data ~cf.RaggedIndexedArray.copy + ~cf.RaggedIndexedArray.get_attributes ~cf.RaggedIndexedArray.get_calendar ~cf.RaggedIndexedArray.get_compressed_axes ~cf.RaggedIndexedArray.get_compressed_dimension ~cf.RaggedIndexedArray.get_compression_type ~cf.RaggedIndexedArray.get_count - ~cf.RaggedIndexedArray.get_filename' + ~cf.RaggedIndexedArray.get_filename ~cf.RaggedIndexedArray.get_filenames ~cf.RaggedIndexedArray.get_index ~cf.RaggedIndexedArray.get_Subarray diff --git a/docs/source/class/cf.RaggedIndexedContiguousArray.rst b/docs/source/class/cf.RaggedIndexedContiguousArray.rst index af27e95277..f8041c5a31 100644 --- a/docs/source/class/cf.RaggedIndexedContiguousArray.rst +++ b/docs/source/class/cf.RaggedIndexedContiguousArray.rst @@ -20,12 +20,13 @@ cf.RaggedIndexedContiguousArray ~cf.RaggedIndexedContiguousArray.compressed_dimensions ~cf.RaggedIndexedContiguousArray.conformed_data ~cf.RaggedIndexedContiguousArray.copy + ~cf.RaggedIndexedContiguousArray.get_attributes ~cf.RaggedIndexedContiguousArray.get_calendar 
~cf.RaggedIndexedContiguousArray.get_compressed_axes ~cf.RaggedIndexedContiguousArray.get_compressed_dimension ~cf.RaggedIndexedContiguousArray.get_compression_type ~cf.RaggedIndexedContiguousArray.get_count - ~cf.RaggedIndexedContiguousArray.get_filename' + ~cf.RaggedIndexedContiguousArray.get_filename ~cf.RaggedIndexedContiguousArray.get_filenames ~cf.RaggedIndexedContiguousArray.get_index ~cf.RaggedIndexedContiguousArray.get_Subarray diff --git a/docs/source/conf.py b/docs/source/conf.py index ac8ca82105..5e39feaf2b 100755 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -156,6 +156,8 @@ def _get_date(): "cfplot": ("https://ajheaps.github.io/cf-plot", None), "dask": ("https://docs.dask.org/en/latest", None), "matplotlib": ("https://matplotlib.org/stable/", None), + # REVIEW: h5: new intersphinx mapping + "h5netcdf": ("https://h5netcdf.org", None), } # This extension is meant to help with the common pattern of having diff --git a/docs/source/field_analysis.rst b/docs/source/field_analysis.rst index ea588ccfae..638ee2d073 100644 --- a/docs/source/field_analysis.rst +++ b/docs/source/field_analysis.rst @@ -342,15 +342,14 @@ construct. Collapse weights ^^^^^^^^^^^^^^^^ -.. The calculations of means, standard deviations and variances are, - by default, not weighted. For weights to be incorporated in the - collapse, the axes to be weighted must be identified with the - *weights* keyword. +.. warning:: By default, the collapse calculations are **not** + weighted. -For weights to be incorporated in the collapse, the axes to be -weighted must be identified with the *weights* keyword. A collapse by -a particular method is either never weighted, or may be weighted, or -is always weighted, as described in the following table: + For weights to be incorporated in the collapse, the + *weights* keyword must be set. + +A collapse by a particular method is either never weighted, or may be +weighted, or is always weighted, as described in the following table: ============================ ============================ ======== Method Description Weighted @@ -853,6 +852,51 @@ method constructs. : longitude(8) = [22.5, ..., 337.5] degrees_east : air_pressure(1) = [850.0] hPa +.. _Active-storage-collapses: + +Active storage collapses +^^^^^^^^^^^^^^^^^^^^^^^^ + +When the data being collapsed are stored remotely, the collapse +calculations may be carried out on a server that is close (in a +network distance sense) to the data, thereby removing the time and +power costs of transferring the entire un-collapsed data to the local +client. + +Whether or not this will occur for an individual `dask` chunk is +determined on a case-by-case basis, and will only be done if all of +the following criteria are met: + +* ``cf.active_storage()`` is `True`; + +* ``cf.active_storage_url()`` returns the URL of a valid active + storage server; + +* the `dask` chunk's data are defined by a netCDF-4 file on disk + (rather than in any other file format, or in memory); + +* it is possible to import the `activestorage.Active` class; + +* the method is one of those specified by + `cf.data.collapse.active_reduction_methods`; + +* the collapse is over all axes; + +* the collapse is unweighted; + +* the data are not numerically packed. + +If any of these conditions are not met then the `dask` chunk will be +collapsed "as usual", i.e. by retrieving the data to memory (if it is +not already there) and using the local client to perform the collapse +calculations. 
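The sketch below is an editorial illustration rather than part of the patch. It assumes that `cf.active_storage` and `cf.active_storage_url` (documented in the function reference added elsewhere in this patch) follow the same get/set calling pattern as cf's other configuration functions, and the server URL and file name are placeholders only:

.. code-block:: python
   :caption: *A minimal sketch of a collapse that is eligible for
             active storage reductions (placeholder URL and file
             name).*

   >>> import cf
   >>> cf.active_storage(True)  # allow active storage reductions
   >>> cf.active_storage_url('https://active-store.example.org')  # placeholder server
   >>> f = cf.read('file.nc')[0]  # netCDF-4 data on disk
   >>> # An unweighted mean over all axes meets the criteria listed
   >>> # above, so each Dask chunk's reduction may be pushed to the
   >>> # active storage server
   >>> mean = f.collapse('mean')

If any of the criteria are not met, the same call simply falls back to the usual local computation.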
+
+The performance improvements from using active storage operations
+will increase the closer, in a network sense, the active storage
+server is to the data storage. If the active storage server is
+sufficiently far away from the data then it could even be faster, and
+require less energy, to perform a non-active collapse on the local
+client.
+
 ----

 .. _ Other-statistical-operations:

diff --git a/docs/source/function.rst b/docs/source/function.rst
index df6a1bd5ad..8892f325c1 100644
--- a/docs/source/function.rst
+++ b/docs/source/function.rst
@@ -20,6 +20,7 @@ Reading and writing

    cf.read
    cf.write
+   cf.netcdf_lock

 Aggregation
 -----------
@@ -145,6 +146,19 @@ Resource management
    cf.TEMPDIR
    cf.TOTAL_MEMORY

+Active storage reductions
+-------------------------
+
+.. autosummary::
+   :nosignatures:
+   :toctree: function/
+   :template: function.rst
+
+   cf.active_storage
+   cf.active_storage_url
+   cf.active_storage_max_requests
+   cf.netcdf_lock
+
 Miscellaneous
 -------------

diff --git a/docs/source/index.rst b/docs/source/index.rst
index 2a7bc15e5e..b8b196a8d8 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -20,12 +20,10 @@
    :maxdepth: 1

    introduction
-   cf_data_model
    installation
    cheat_sheet
    recipes/index
    tutorial
-   analysis
    api_reference
    aggregation_rules
    performance

diff --git a/docs/source/installation.rst b/docs/source/installation.rst
index 5f58dabdd2..601fc487fc 100644
--- a/docs/source/installation.rst
+++ b/docs/source/installation.rst
@@ -201,10 +201,17 @@ Required
 * `cftime `_, version 1.6.2 or newer (note that this
   package may be installed with netCDF4).

+* `h5netcdf `_, version 1.3.0 or
+  newer.
+
+* `h5py `_, version 3.10.0 or newer.
+
+* `s3fs `_, version 2024.2.0 or newer.
+
 * `scipy `_, version 1.10.0 or newer.

-* `cfdm `_, version 1.11.1.0 or up to,
-  but not including, 1.11.2.0.
+* `cfdm `_, version 1.11.2.0 or up to,
+  but not including, 1.11.3.0.

 * `cfunits `_, version 3.3.7 or newer.

@@ -261,6 +268,13 @@ environments for which these features are not required.

 * `matplotlib `_, version 3.0.0 or newer.

+.. rubric:: Active storage collapses
+
+* `activestorage `_. This
+  library is not yet in PyPI (it will be soon), so in the interim you
+  must use a copy of the ``pyfive`` branch of the
+  https://github.com/NCAS-CMS/PyActiveStorage repository.
+
 ----

 .. _Tests:

diff --git a/docs/source/introduction.rst b/docs/source/introduction.rst
index 05e92dab60..3b4a5e1719 100644
--- a/docs/source/introduction.rst
+++ b/docs/source/introduction.rst
@@ -13,6 +13,12 @@

 .. role:: blue

+.. raw:: html
+
+
+.. role:: small
+
 **Introduction**
 ================
@@ -59,12 +65,17 @@ The `cf` package uses :ref:`Dask ` for all of its array
 manipulation and can:

 * read :term:`field constructs ` and :term:`domain
-  constructs ` from netCDF, CDL, PP and UM datasets,
+  constructs ` from netCDF, CDL, PP and UM datasets
+  with a choice of netCDF backends,
+
+* read files from OPeNDAP servers and S3 object stores,

 * create new field constructs in memory,

 * write and append field constructs to netCDF datasets on disk,

+* read, write, and manipulate UGRID mesh topologies,
+
 * read, write, and create coordinates defined by geometry cells,

 * read netCDF and CDL datasets containing hierarchical groups,

@@ -150,6 +161,101 @@ installed, which

 ----

+.. _CF-data-model:
+
+**CF data model**
+-----------------
+
+The CF (Climate and Forecast) metadata conventions
+(http://cfconventions.org) provide a description of the physical
+meaning of data and of their spatial and temporal properties, and are
+designed to promote the creation, processing, and sharing of climate
+and forecasting data using netCDF files and libraries
+(https://www.unidata.ucar.edu/software/netcdf).
+
+`The CF data model
+`_
+identifies the fundamental elements ("constructs") of the CF
+conventions and shows how they relate to each other, independently of
+the netCDF encoding.
+
+The CF data model defines a **field construct** for storing data with
+all of its metadata. It is defined in CF-|version| as follows:
+
+.. glossary::
+
+   field construct
+     corresponds to a CF-netCDF data variable with all of its
+     metadata. It consists of
+
+     - descriptive properties that apply to the field construct as a
+       whole (e.g. the standard name),
+
+     - a data array,
+
+     - a **domain construct** that describes the locations of each
+       cell of the data array (i.e. the "domain"),
+
+     - **metadata constructs** that describe the physical nature of
+       the data array, defined by
+
+       .. glossary::
+
+          field ancillary constructs
+            corresponding to CF-netCDF ancillary variables
+
+          cell method constructs
+            corresponding to a CF-netCDF cell_methods attribute of a
+            data variable
+
+   domain construct
+     that describes the locations of each cell of the domain. It may
+     exist independently of a **field construct** and consists of
+
+     - descriptive properties that apply to the domain construct as a
+       whole,
+
+     - **metadata constructs** that describe the locations of each
+       cell of the domain, defined by
+
+       .. glossary::
+
+          domain axis constructs
+            corresponding to CF-netCDF dimensions or scalar coordinate
+            variables
+
+          dimension coordinate constructs
+            corresponding to CF-netCDF coordinate variables or numeric
+            scalar coordinate variables
+
+          auxiliary coordinate constructs
+            corresponding to CF-netCDF auxiliary coordinate variables
+            and non-numeric scalar coordinate variables
+
+          coordinate reference constructs
+            corresponding to CF-netCDF grid mapping variables or the
+            formula_terms attribute of a coordinate variable
+
+          domain ancillary constructs
+            corresponding to CF-netCDF variables named by the
+            formula_terms attribute of a coordinate variable
+
+          cell measure constructs
+            corresponding to CF-netCDF cell measure variables
+
+          domain topology constructs
+            corresponding to CF-netCDF UGRID mesh topology variables
+
+          cell connectivity constructs
+            corresponding to CF-netCDF UGRID connectivity variables
+
+|
+
+.. figure:: images/cfdm_field.svg
+
+   *The constructs of the CF data model described using UML. The
+   field construct corresponds to a CF-netCDF data variable. The
+   domain construct provides the linkage between the field construct
+   and the constructs which describe measurement locations and cell
+   properties. It is useful to define an abstract generic coordinate
+   construct that can be used to refer to coordinates when their type
+   (dimension or auxiliary coordinate construct) is not an issue.*
+
+----
+
 **References**
 --------------

diff --git a/docs/source/performance.rst b/docs/source/performance.rst
index 74b3f85c00..a799af42a4 100644
--- a/docs/source/performance.rst
+++ b/docs/source/performance.rst
@@ -83,7 +83,7 @@ Some notable cases where non-lazy computations occur are:

   The weights may also be stored on disk for re-use in future
   sessions by using the ``weights_file`` keyword parameter.
- + * **Aggregation** When two or more field or domain constructs are aggregated to form a @@ -100,7 +100,8 @@ Some notable cases where non-lazy computations occur are: convention (such as compression by gathering, some discrete sampling geometries, etc.), the compression metadata, such as the "list" array for compression by gathering, are read from disk non-lazily - during the `cf.read` operation. + during the `cf.read` operation. The compressed data themselves are, + however, accessed lazily. ---- @@ -173,15 +174,15 @@ scheduler has been defined will use that scheduler. >>> import cf >>> import dask + >>> # To make cf computations use local processes: >>> dask.config.set(scheduler='processes') - >>> # cf computations will now use local processes + >>> # To make cf computations be single-threaded >>> dask.config.set(scheduler='synchronous') - >>> # cf computations will now be single-threaded + >>> # To make cf computations use local threads (the default) >>> dask.config.set(scheduler='threads') - >>> # cf computations will now use local threads (the default) + >>> # To make cf computations use a user-defined distributed cluster >>> from dask.distributed import Client >>> client = Client('127.0.0.1:8786') - >>> # cf computations will now use the defined distributed cluster Operations are stored by Dask in `task graphs `_ where each task @@ -212,7 +213,7 @@ basic data computation over four chunks: [18 19 20 21 22 23] [24 25 26 27 28 29]] >>> e = d ** 2 + d - >>> e.to_dask_array().visualize('dask_task_graph.png') + >>> e.to_dask_array().visualize('dask_task_graph.svg') >>> print(e.array) [[ 0 2 6 12 20 30] [ 42 56 72 90 110 132] @@ -220,27 +221,25 @@ basic data computation over four chunks: [342 380 420 462 506 552] [600 650 702 756 812 870]] -The image file ``dask_task_graph.png`` contains the visualisation of -the dask task graph, showing the operations on each chunk. The -operations were only executed when their result was requested with the -final ``e.array`` command. The boxes represent the data chunks and the -circles represent the operations to be performed on the chunks. The -five boxes in the bottom row are the starting data (i.e. the four -chunks of ``d`` and the scalar ``2``), and the four boxes in the top -row are the result of the computations which combine to produce the -values in ``e.array``. +The image file ``dask_task_graph.svg`` contains the visualisation of +the Dask task graph, showing the operations on each chunk: .. figure:: images/dask_task_graph.svg :scale: 8 % - *The dask task graph from dask_task_graph.png* - +The operations were only executed when their result was requested with +the final ``e.array`` command. The boxes in ``dask_task_graph.svg`` +represent the data chunks and the circles represent the operations to +be performed on the chunks. The five boxes in the bottom row are the +starting data (i.e. the four chunks of ``d`` and the scalar ``2``), +and the four boxes in the top row are the result of the computations +which combine to produce the values in ``e.array``. NetCDF file access ^^^^^^^^^^^^^^^^^^ -Note that reading from and writing to netCDF files is currently a -serial operation, i.e. only one Dask chunk can access any netCDF file +Note that reading from and writing to netCDF files are currently only +serial operations, i.e. only one Dask chunk can access any netCDF file at any given moment. This situation can result in slower-than-expected performance. When a thread-safe version of the netCDF-C library is available we hope to lift this restriction. 
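A brief editorial aside, not part of the patch: the chunk structure that such a task graph is built over can be inspected directly from the Dask array returned by `to_dask_array`, which the example above already uses. The ``chunks`` keyword to `cf.Data` is assumed here to accept Dask-style chunk sizes, and the values are illustrative only:

.. code-block:: python
   :caption: *Inspecting the Dask chunks that a computation will be
             split over (illustrative chunk sizes).*

   >>> import numpy
   >>> import cf
   >>> d = cf.Data(numpy.arange(30).reshape(5, 6), chunks=3)
   >>> dx = d.to_dask_array()
   >>> dx.numblocks  # number of chunks along each dimension
   (2, 2)
   >>> dx.chunks     # sizes of the chunks along each dimension
   ((3, 2), (3, 3))

Fewer, larger chunks mean less scheduling overhead, but also less parallelism and a larger memory footprint per task.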
diff --git a/docs/source/recipes/plot_16_recipe.py b/docs/source/recipes/plot_16_recipe.py new file mode 100644 index 0000000000..c0d0d76a22 --- /dev/null +++ b/docs/source/recipes/plot_16_recipe.py @@ -0,0 +1,59 @@ +""" +Plotting contour subplots with different projections +==================================================== + +In this recipe, we will plot the same data using different projections +as subplots to illustrate visually some available possibilities. + +""" + +# %% +# 1. Import cf-python and cf-plot: + +import cfplot as cfp + +import cf + +# %% +# 2. Read the field in: +f = cf.read("~/recipes/ggap.nc")[0] + +# %% +# 3. Create the file with subplots. If changing the number of subplots, +# ensure the number of rows * number of columns = the number of projections. +# Here we are doing 6 projections so 2 x 3 is fine: +cfp.gopen(rows=2, columns=3, bottom=0.2, file="projections.png") + +# %% +# 4. List the projection types to use. Here we are using +# Cylindrical/Default, North Pole Stereographic, South Pole Stereographic, +# Mollweide, Mercator and Robinson. However there are several other choices +# possible, see: +# https://ncas-cms.github.io/cf-plot/build/user_guide.html#appendixc. Our +# chosen list is: +projtypes = ["cyl", "npstere", "spstere", "moll", "merc", "robin"] + +# %% +# 5. Loop through the list of projection types and plot each as a sub-plot: +for i, proj in enumerate(projtypes): + # gpos has 1 added to the index because it takes 1 as its first value + cfp.gpos(i + 1) + cfp.mapset(proj=proj) + + # For the final plot only, add a colour bar to cover all the sub-plots + if i == len(projtypes) - 1: + cfp.con( + f.subspace(pressure=850), + lines=False, + title=proj, + colorbar_position=[0.1, 0.1, 0.8, 0.02], + colorbar_orientation="horizontal", + ) + else: + cfp.con( + f.subspace(pressure=850), + lines=False, + title=proj, + colorbar=False, + ) +cfp.gclose() diff --git a/docs/source/recipes/recipe_list.txt b/docs/source/recipes/recipe_list.txt index d93193c0a5..96b741c1ba 100644 --- a/docs/source/recipes/recipe_list.txt +++ b/docs/source/recipes/recipe_list.txt @@ -13,7 +13,7 @@ plot_06_recipe.html#sphx-glr-recipes-plot-06-recipe-py plot_07_recipe.html#sphx-glr-recipes-plot-07-recipe-py
plot_08_recipe.html#sphx-glr-recipes-plot-08-recipe-py -
+
plot_09_recipe.html#sphx-glr-recipes-plot-09-recipe-py
plot_10_recipe.html#sphx-glr-recipes-plot-10-recipe-py @@ -27,6 +27,8 @@ plot_13_recipe.html#sphx-glr-recipes-plot-13-recipe-py plot_14_recipe.html#sphx-glr-recipes-plot-14-recipe-py
plot_15_recipe.html#sphx-glr-recipes-plot-15-recipe-py +
+plot_16_recipe.html#sphx-glr-recipes-plot-16-recipe-py
plot_17_recipe.html#sphx-glr-recipes-plot-17-recipe-py
diff --git a/docs/source/releases.rst b/docs/source/releases.rst
index cf764279e7..73e1c3f23b 100644
--- a/docs/source/releases.rst
+++ b/docs/source/releases.rst
@@ -29,9 +29,9 @@ Documentation for all versions of cf.
 * `Version 3.15.3 `_ (2023-08-31)
 * `Version 3.15.2 `_ (2023-07-21)
 * `Version 3.15.1 `_ (2023-06-09)
-* `Version 3.15.0 `_ (2023-04-27)
-* `Version 3.14.1 `_ (2023-03-10)
-* `Version 3.14.0 `_ (2023-01-31 *first Dask release*)
+* `Version 3.15.0 `_ (2023-04-27)
+* `Version 3.14.1 `_ (2023-03-10)
+* `Version 3.14.0 `_ (2023-01-31 *first Dask release*)

 ----

diff --git a/docs/source/tutorial.rst b/docs/source/tutorial.rst
index 54467d81df..ceafb8815e 100644
--- a/docs/source/tutorial.rst
+++ b/docs/source/tutorial.rst
@@ -241,6 +241,9 @@ The `cf.read` function has optional parameters to
   read recursively, and to allow directories which resolve to
   symbolic links; and

+* choose either `netCDF4` or `h5netcdf` backends for accessing netCDF
+  files.
+
 * configure parameters for :ref:`reading PP and UM fields files
   `.

@@ -4606,19 +4609,20 @@ All the of above examples use arrays in memory to construct the data
 instances for the field and metadata constructs. It is, however,
 possible to create data from arrays that reside on disk. The `cf.read`
 function creates data in this manner. A pointer to an array in a
-netCDF file can be stored in a `cf.NetCDFArray` instance, which is is
-used to initialise a `cf.Data` instance.
+netCDF file can be stored in a `cf.NetCDF4Array` or
+`~cf.H5netcdfArray` instance, which is used to initialise a
+`cf.Data` instance.

 .. code-block:: python
    :caption: *Define a variable from a dataset with the netCDF package
-             and use it to create a NetCDFArray instance with which to
-             initialise a Data instance.*
+             and use it to create a NetCDF4Array instance with which
+             to initialise a Data instance.*

    >>> import netCDF4
    >>> nc = netCDF4.Dataset('file.nc', 'r')
    >>> v = nc.variables['ta']
-   >>> netcdf_array = cf.NetCDFArray(filename='file.nc', address='ta',
-   ...                               dtype=v.dtype, shape=v.shape)
+   >>> netcdf_array = cf.NetCDF4Array(filename='file.nc', address='ta',
+   ...                                dtype=v.dtype, shape=v.shape)
    >>> data_disk = cf.Data(netcdf_array)

@@ -4634,7 +4638,7 @@ used to initialise a `cf.Data` instance.

 Note that data type, number of dimensions, dimension sizes and number
 of elements of the array on disk that are used to initialise the
-`cf.NetCDFArray` instance are those expected by the CF data model,
+`cf.NetCDF4Array` instance are those expected by the CF data model,
 which may be different to those of the netCDF variable in the file
 (although they are the same in the above example). For example, a
 netCDF character array of shape ``(12, 9)`` is viewed in cf as a

diff --git a/release_docs b/release_docs
index bd9643b686..eb9c7ace24 100755
--- a/release_docs
+++ b/release_docs
@@ -12,7 +12,7 @@ version=`python -c "import cf; print(cf.__version__)"`
 sphinx_version=`python -c "import sphinx; print(sphinx.__version__)"`

 if [[ $sphinx_version != "2.4.5" ]] ; then
-    echo "ERROR: Must use sphinx version 2.4.5. Got $sphinx_version"
+    echo "ERROR: Must (sadly) use sphinx version 2.4.5. Got $sphinx_version"
     exit 3
 fi

diff --git a/requirements.txt b/requirements.txt
index 7af6236b81..8b01daddca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,9 +1,12 @@
 netCDF4>=1.6.5
 cftime>=1.6.2
 numpy>=1.22
-cfdm>=1.11.1.0, <1.11.2.0
+cfdm>=1.11.2.0, <1.11.3.0
 psutil>=0.6.0
 cfunits>=3.3.7
-dask>=2022.12.1
+dask>=2024.4.0
 packaging>=20.0
 scipy>=1.10.0
+h5netcdf>=1.3.0
+h5py>=3.10.0
+s3fs>=2024.2.0
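An editorial convenience, not part of the patch: the updated pins can be checked against an installed environment using only the standard library plus the ``packaging`` dependency that is already listed above. Only the requirement strings themselves are read; no cf-specific behaviour is assumed:

.. code-block:: python

   # Report whether each pinned requirement is satisfied by the
   # currently installed packages.
   from importlib.metadata import PackageNotFoundError, version

   from packaging.requirements import Requirement

   for line in open("requirements.txt"):
       line = line.strip()
       if not line or line.startswith("#"):
           continue
       req = Requirement(line)
       try:
           installed = version(req.name)
       except PackageNotFoundError:
           print(f"{req.name}: not installed")
           continue
       ok = req.specifier.contains(installed, prereleases=True)
       status = "ok" if ok else f"does not satisfy {req.specifier}"
       print(f"{req.name} {installed}: {status}")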