diff --git a/Changelog.rst b/Changelog.rst
index f0cf3a3579..14723cf7a4 100644
--- a/Changelog.rst
+++ b/Changelog.rst
@@ -14,6 +14,12 @@ version NEXTVERSION
* New keyword parameter to `cf.Field.derivative`:
``ignore_coordinate_units``
(https://github.com/NCAS-CMS/cf-python/issues/807)
+* Allow access to netCDF-4 files in S3 object stores
+ (https://github.com/NCAS-CMS/cf-python/issues/712)
+* New class `cf.H5netcdfArray`
+* New class `cf.NetCDF4Array`
+* New class `cf.CFAH5netcdfArray`
+* New class `cf.CFANetCDF4Array`
* Fix bug that sometimes puts an incorrect ``radian-1`` or
``radian-2`` in the returned units of the differential operator
methods and functions
@@ -27,6 +33,12 @@ version NEXTVERSION
* Fix bug where `cf.normalize_slice` doesn't correctly
handle certain cyclic slices
(https://github.com/NCAS-CMS/cf-python/issues/774)
+* New dependency: ``h5netcdf>=1.3.0``
+* New dependency: ``h5py>=3.10.0``
+* New dependency: ``s3fs>=2024.2.0``
+* Changed dependency: ``1.11.2.0<=cfdm<1.11.3.0``
+* Changed dependency: ``cfunits>=3.3.7``
+
----
@@ -141,6 +153,8 @@ version 3.16.0
* Changed dependency: ``1.11.0.0<=cfdm<1.11.1.0``
* New dependency: ``scipy>=1.10.0``
+----
+
version 3.15.4
--------------
@@ -279,7 +293,7 @@ version 3.14.1
----
-version 3.14.0 (*first Dask release*)
+version 3.14.0 (*first Dask version*)
-------------------------------------
**2023-01-31**
@@ -314,7 +328,7 @@ version 3.14.0 (*first Dask release*)
----
-version 3.13.1 (*last LAMA release*)
+version 3.13.1 (*last LAMA version*)
------------------------------------
**2022-10-17**
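
A minimal sketch of the S3 access path noted in the Changelog entries above, assuming that `cf.read` forwards a `storage_options` keyword to the underlying `s3fs.S3FileSystem` (the bucket URL and credentials are placeholders):

    import cf

    # Read a netCDF-4 file straight from an S3 object store.  The keys
    # mirror the storage_options documented for the new array classes;
    # all values here are illustrative placeholders.
    fields = cf.read(
        "s3://my-bucket/data/file.nc",
        storage_options={
            "key": "my-api-key",
            "secret": "my-secret-key",
            "endpoint_url": "https://s3.example.com",
        },
    )
    print(fields)
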
diff --git a/cf/__init__.py b/cf/__init__.py
index fe0ed8c07a..9e630d86ea 100644
--- a/cf/__init__.py
+++ b/cf/__init__.py
@@ -13,10 +13,17 @@
* read field constructs from netCDF, CDL, PP and UM datasets,
+* read field constructs and domain constructs from netCDF, CDL, PP and
+ UM datasets with a choice of netCDF backends,
+
+* read files from OPeNDAP servers and S3 object stores,
+
* create new field constructs in memory,
* write and append field constructs to netCDF datasets on disk,
+* read, write, and manipulate UGRID mesh topologies,
+
* read, write, and create coordinates defined by geometry cells,
* read netCDF and CDL datasets containing hierarchical groups,
@@ -74,8 +81,8 @@
"""
__Conventions__ = "CF-1.11"
-__date__ = "2024-04-26"
-__version__ = "3.16.2"
+__date__ = "2024-??-??"
+__version__ = "3.17.0"
_requires = (
"numpy",
@@ -199,8 +206,8 @@
)
# Check the version of cfdm
-_minimum_vn = "1.11.1.0"
-_maximum_vn = "1.11.2.0"
+_minimum_vn = "1.11.2.0"
+_maximum_vn = "1.11.3.0"
_cfdm_version = Version(cfdm.__version__)
if not Version(_minimum_vn) <= _cfdm_version < Version(_maximum_vn):
raise RuntimeError(
@@ -209,12 +216,6 @@
)
# Check the version of dask
-_minimum_vn = "2022.12.1"
-if Version(dask.__version__) < Version(_minimum_vn):
- raise RuntimeError(
- f"Bad dask version: cf requires dask>={_minimum_vn}. "
- f"Got {dask.__version__} at {dask.__file__}"
- )
# Check the version of Python
_minimum_vn = "3.8.0"
@@ -274,15 +275,19 @@
from .data.array import (
BoundsFromNodesArray,
CellConnectivityArray,
- CFANetCDFArray,
+ CFAH5netcdfArray,
+ CFANetCDF4Array,
FullArray,
GatheredArray,
+ H5netcdfArray,
NetCDFArray,
+ NetCDF4Array,
PointTopologyArray,
RaggedContiguousArray,
RaggedIndexedArray,
RaggedIndexedContiguousArray,
SubsampledArray,
+ UMArray,
)
from .data.fragment import (
diff --git a/cf/cellmethod.py b/cf/cellmethod.py
index 3bdfbf3659..dda1e343b0 100644
--- a/cf/cellmethod.py
+++ b/cf/cellmethod.py
@@ -56,7 +56,7 @@ class CellMethod(cfdm.CellMethod):
def __new__(cls, *args, **kwargs):
"""This must be overridden in subclasses.
- .. versionadded:: (cfdm) 3.7.0
+ .. versionadded:: 3.7.0
"""
instance = super().__new__(cls)
diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py
index b40a7dcd73..db8601627c 100644
--- a/cf/cfimplementation.py
+++ b/cf/cfimplementation.py
@@ -26,12 +26,15 @@
TiePointIndex,
)
from .data import Data
+
from .data.array import (
BoundsFromNodesArray,
CellConnectivityArray,
- CFANetCDFArray,
+ CFAH5netcdfArray,
+ CFANetCDF4Array,
GatheredArray,
- NetCDFArray,
+ H5netcdfArray,
+ NetCDF4Array,
PointTopologyArray,
RaggedContiguousArray,
RaggedIndexedArray,
@@ -112,65 +115,39 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs):
parent, construct, axes=axes, copy=copy, **kwargs
)
- def initialise_CFANetCDFArray(
- self,
- filename=None,
- address=None,
- dtype=None,
- mask=True,
- units=False,
- calendar=False,
- instructions=None,
- substitutions=None,
- term=None,
- x=None,
- **kwargs,
- ):
- """Return a `CFANetCDFArray` instance.
+ def initialise_CFANetCDF4Array(self, **kwargs):
+ """Return a `CFANetCDF4Array` instance.
:Parameters:
- filename: `str`
-
- address: (sequence of) `str` or `int`
-
- dytpe: `numpy.dtype`
-
- mask: `bool`, optional
+ kwargs: optional
+ Initialisation parameters to pass to the new instance.
- units: `str` or `None`, optional
+ :Returns:
- calendar: `str` or `None`, optional
+ `CFANetCDF4Array`
- instructions: `str`, optional
+ """
+ cls = self.get_class("CFANetCDF4Array")
+ return cls(**kwargs)
- substitutions: `dict`, optional
+ def initialise_CFAH5netcdfArray(self, **kwargs):
+ """Return a `CFAH5netcdfArray` instance.
- term: `str`, optional
+ .. versionadded:: NEXTVERSION
- x: `dict`, optional
+ :Parameters:
kwargs: optional
- Ignored.
+ Initialisation parameters to pass to the new instance.
:Returns:
- `CFANetCDFArray`
+ `CFAH5netcdfArray`
"""
- cls = self.get_class("CFANetCDFArray")
- return cls(
- filename=filename,
- address=address,
- dtype=dtype,
- mask=mask,
- units=units,
- calendar=calendar,
- instructions=instructions,
- substitutions=substitutions,
- term=term,
- x=x,
- )
+ cls = self.get_class("CFAH5netcdfArray")
+ return cls(**kwargs)
_implementation = CFImplementation(
@@ -179,7 +156,8 @@ def initialise_CFANetCDFArray(
CellConnectivity=CellConnectivity,
CellMeasure=CellMeasure,
CellMethod=CellMethod,
- CFANetCDFArray=CFANetCDFArray,
+ CFAH5netcdfArray=CFAH5netcdfArray,
+ CFANetCDF4Array=CFANetCDF4Array,
CoordinateReference=CoordinateReference,
DimensionCoordinate=DimensionCoordinate,
Domain=Domain,
@@ -202,7 +180,8 @@ def initialise_CFANetCDFArray(
BoundsFromNodesArray=BoundsFromNodesArray,
CellConnectivityArray=CellConnectivityArray,
GatheredArray=GatheredArray,
- NetCDFArray=NetCDFArray,
+ H5netcdfArray=H5netcdfArray,
+ NetCDF4Array=NetCDF4Array,
PointTopologyArray=PointTopologyArray,
RaggedContiguousArray=RaggedContiguousArray,
RaggedIndexedArray=RaggedIndexedArray,
@@ -236,7 +215,8 @@ def implementation():
'CellConnectivityArray': cf.data.array.cellconnectivityarray.CellConnectivityArray,
'CellMeasure': cf.cellmeasure.CellMeasure,
'CellMethod': cf.cellmethod.CellMethod,
- 'CFANetCDFArray': cf.data.array.cfanetcdfarray.CFANetCDFArray,
+ 'CFAH5netcdfArray': cf.data.array.cfah5netcdfarray.CFAH5netcdfArray,
+ 'CFANetCDF4Array': cf.data.array.cfanetcdf4array.CFANetCDF4Array,
'CoordinateReference': cf.coordinatereference.CoordinateReference,
'DimensionCoordinate': cf.dimensioncoordinate.DimensionCoordinate,
'Domain': cf.domain.Domain,
@@ -257,7 +237,8 @@ def implementation():
'PartNodeCountProperties': cf.partnodecountproperties.PartNodeCountProperties,
'Data': cf.data.data.Data,
'GatheredArray': cf.data.array.gatheredarray.GatheredArray,
- 'NetCDFArray': cf.data.array.netcdfarray.NetCDFArray,
+ 'H5netcdfArray': cf.data.array.h5netcdfarray.H5netcdfArray,
+ 'NetCDF4Array': cf.data.array.netcdf4array.NetCDF4Array,
     'PointTopologyArray': cf.data.array.pointtopologyarray.PointTopologyArray,
'RaggedContiguousArray': cf.data.array.raggedcontiguousarray.RaggedContiguousArray,
'RaggedIndexedArray': cf.data.array.raggedindexedarray.RaggedIndexedArray,
diff --git a/cf/constants.py b/cf/constants.py
index ea335f63ee..1472bd83d2 100644
--- a/cf/constants.py
+++ b/cf/constants.py
@@ -63,6 +63,9 @@
"LOG_LEVEL": logging.getLevelName(logging.getLogger().level),
"BOUNDS_COMBINATION_MODE": "AND",
"CHUNKSIZE": parse_bytes(_CHUNKSIZE),
+ "active_storage": False,
+ "active_storage_url": None,
+ "active_storage_max_requests": 100,
}
masked = np.ma.masked
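
The three new ``CONSTANTS`` entries back the active storage controls. A hedged sketch of how they would be queried and set, assuming accessor functions named after the keys (following the pattern used for the existing constants):

    import cf

    # Defaults taken from the CONSTANTS dictionary above
    print(cf.active_storage())        # False
    print(cf.active_storage_url())    # None

    # Enable active storage against a placeholder reduction server
    cf.active_storage(True)
    cf.active_storage_url("https://active.example.com")
    cf.active_storage_max_requests(50)
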
diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py
index c21f8916c5..cd2c53766b 100644
--- a/cf/data/array/__init__.py
+++ b/cf/data/array/__init__.py
@@ -1,9 +1,12 @@
from .boundsfromnodesarray import BoundsFromNodesArray
from .cellconnectivityarray import CellConnectivityArray
-from .cfanetcdfarray import CFANetCDFArray
+from .cfah5netcdfarray import CFAH5netcdfArray
+from .cfanetcdf4array import CFANetCDF4Array
from .fullarray import FullArray
from .gatheredarray import GatheredArray
+from .h5netcdfarray import H5netcdfArray
from .netcdfarray import NetCDFArray
+from .netcdf4array import NetCDF4Array
from .pointtopologyarray import PointTopologyArray
from .raggedcontiguousarray import RaggedContiguousArray
from .raggedindexedarray import RaggedIndexedArray
diff --git a/cf/data/array/cfah5netcdfarray.py b/cf/data/array/cfah5netcdfarray.py
new file mode 100644
index 0000000000..47c58bff06
--- /dev/null
+++ b/cf/data/array/cfah5netcdfarray.py
@@ -0,0 +1,10 @@
+from .h5netcdfarray import H5netcdfArray
+from .mixin import CFAMixin
+
+
+class CFAH5netcdfArray(CFAMixin, H5netcdfArray):
+ """A CFA-netCDF array accessed with `h5netcdf`
+
+ .. versionadded:: NEXTVERSION
+
+ """
diff --git a/cf/data/array/cfanetcdf4array.py b/cf/data/array/cfanetcdf4array.py
new file mode 100644
index 0000000000..b3b6b69d7a
--- /dev/null
+++ b/cf/data/array/cfanetcdf4array.py
@@ -0,0 +1,10 @@
+from .mixin import CFAMixin
+from .netcdf4array import NetCDF4Array
+
+
+class CFANetCDF4Array(CFAMixin, NetCDF4Array):
+ """A CFA-netCDF array accessed with `netCDF4`.
+
+ .. versionadded:: NEXTVERSION
+
+ """
diff --git a/cf/data/array/fullarray.py b/cf/data/array/fullarray.py
index f6c29858fb..81278c3407 100644
--- a/cf/data/array/fullarray.py
+++ b/cf/data/array/fullarray.py
@@ -1,11 +1,13 @@
import numpy as np
+from ...functions import indices_shape, parse_indices
from .abstract import Array
+from .mixin import IndexMixin
_FULLARRAY_HANDLED_FUNCTIONS = {}
-class FullArray(Array):
+class FullArray(IndexMixin, Array):
"""A array filled with a given value.
The array may be empty or all missing values.
@@ -19,8 +21,7 @@ def __init__(
fill_value=None,
dtype=None,
shape=None,
- units=False,
- calendar=False,
+ attributes=None,
source=None,
copy=True,
):
@@ -38,23 +39,22 @@ def __init__(
shape: `tuple`
The array dimension sizes.
- units: `str` or `None`, optional
- The units of the netCDF variable. Set to `None` to
- indicate that there are no units. If unset then the
- units will be set to `None` during the first
- `__getitem__` call.
+ {{init attributes: `dict` or `None`, optional}}
- calendar: `str` or `None`, optional
- The calendar of the netCDF variable. By default, or if
- set to `None`, then the CF default calendar is
- assumed, if applicable. If unset then the calendar
- will be set to `None` during the first `__getitem__`
- call.
+ .. versionadded:: NEXTVERSION
{{init source: optional}}
{{init copy: `bool`, optional}}
+ units: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
+ calendar: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
"""
super().__init__(source=source, copy=copy)
@@ -75,42 +75,14 @@ def __init__(
shape = None
try:
- units = source._get_component("units", False)
- except AttributeError:
- units = False
-
- try:
- calendar = source._get_component("calendar", None)
+ attributes = source._get_component("attributes", False)
except AttributeError:
- calendar = None
+ attributes = None
self._set_component("full_value", fill_value, copy=False)
self._set_component("dtype", dtype, copy=False)
self._set_component("shape", shape, copy=False)
- self._set_component("units", units, copy=False)
- self._set_component("calendar", calendar, copy=False)
-
- def __array__(self, *dtype):
- """The numpy array interface.
-
- .. versionadded:: 3.15.0
-
- :Parameters:
-
- dtype: optional
- Typecode or data-type to which the array is cast.
-
- :Returns:
-
- `numpy.ndarray`
- An independent numpy array of the data.
-
- """
- array = self[...]
- if not dtype:
- return array
- else:
- return array.astype(dtype[0], copy=False)
+ self._set_component("attributes", attributes, copy=False)
def __array_function__(self, func, types, args, kwargs):
"""The `numpy` `__array_function__` protocol.
@@ -128,54 +100,6 @@ def __array_function__(self, func, types, args, kwargs):
return _FULLARRAY_HANDLED_FUNCTIONS[func](*args, **kwargs)
- def __getitem__(self, indices):
- """x.__getitem__(indices) <==> x[indices]
-
- Returns a numpy array.
-
- """
- # If 'apply_indices' is False then we can make a filled numpy
- # array that already has the correct shape. Otherwise we need
- # to make one with shape 'self.shape' and then apply the
- # indices to that.
- apply_indices = False
-
- if indices is Ellipsis:
- array_shape = self.shape
- else:
- array_shape = []
- for i, size in zip(indices, self.shape):
- if not isinstance(i, slice):
- continue
-
- start, stop, step = i.indices(size)
- a, b = divmod(stop - start, step)
- if b:
- a += 1
-
- array_shape.append(a)
-
- if len(array_shape) != self.ndim:
- apply_indices = True
- array_shape = self.shape
-
- fill_value = self.get_full_value()
- if fill_value is np.ma.masked:
- array = np.ma.masked_all(array_shape, dtype=self.dtype)
- elif fill_value is not None:
- array = np.full(
- array_shape, fill_value=fill_value, dtype=self.dtype
- )
- else:
- array = np.empty(array_shape, dtype=self.dtype)
-
- if apply_indices:
- array = self.get_subspace(array, indices)
-
- self._set_units()
-
- return array
-
def __repr__(self):
"""Called by the `repr` built-in function.
@@ -196,6 +120,53 @@ def __str__(self):
return f"Filled with {fill_value!r}"
+ def _get_array(self, index=None):
+ """Returns the full array.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `__array__`, `index`
+
+ :Parameters:
+
+ {{index: `tuple` or `None`, optional}}
+
+ :Returns:
+
+ `numpy.ndarray`
+ The subspace.
+
+ """
+ if index is None:
+ shape = self.shape
+ else:
+ original_shape = self.original_shape
+ index = parse_indices(original_shape, index, keepdims=False)
+ shape = indices_shape(index, original_shape, keepdims=False)
+
+ fill_value = self.get_full_value()
+ if fill_value is np.ma.masked:
+ array = np.ma.masked_all(shape, dtype=self.dtype)
+ elif fill_value is not None:
+ array = np.full(shape, fill_value=fill_value, dtype=self.dtype)
+ else:
+ array = np.empty(shape, dtype=self.dtype)
+
+ return array
+
+ @property
+ def array(self):
+ """Return an independent numpy array containing the data.
+
+ .. versionadded:: NEXTVERSION
+
+ :Returns:
+
+ `numpy.ndarray`
+ An independent numpy array of the data.
+ """
+ return np.asanyarray(self)
+
@property
def dtype(self):
"""Data-type of the data elements."""
diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py
new file mode 100644
index 0000000000..02cd0f1cc5
--- /dev/null
+++ b/cf/data/array/h5netcdfarray.py
@@ -0,0 +1,81 @@
+import cfdm
+
+from ...mixin_container import Container
+from .locks import netcdf_lock
+from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin
+
+
+class H5netcdfArray(
+ ActiveStorageMixin,
+ IndexMixin,
+ FileArrayMixin,
+ ArrayMixin,
+ Container,
+ cfdm.H5netcdfArray,
+):
+ """A netCDF array accessed with `h5netcdf`.
+
+ **Active storage reductions**
+
+ An active storage reduction may be enabled with the `actify`
+ method. See `cf.data.collapse.Collapse` for details.
+
+ .. versionadded:: NEXTVERSION
+
+ """
+
+ def __dask_tokenize__(self):
+ """Return a value fully representative of the object.
+
+ .. versionadded:: NEXTVERSION
+
+ """
+ return super().__dask_tokenize__() + (self.get_mask(),)
+
+ @property
+ def _lock(self):
+ """Set the lock for use in `dask.array.from_array`.
+
+ Returns a lock object because concurrent reads are not
+ currently supported by the HDF5 library. The lock object will
+ be the same for all `NetCDF4Array` and `H5netcdfArray`
+ instances, regardless of the dataset they access, which means
+ that access to all netCDF and HDF files coordinates around the
+ same lock.
+
+ .. versionadded:: NEXTVERSION
+
+ """
+ return netcdf_lock
+
+ def _get_array(self, index=None):
+ """Returns a subspace of the dataset variable.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `__array__`, `index`
+
+ :Parameters:
+
+ {{index: `tuple` or `None`, optional}}
+
+ :Returns:
+
+ `numpy.ndarray`
+ The subspace.
+
+ """
+ if index is None:
+ index = self.index()
+
+ # We need to lock because the netCDF file is about to be accessed.
+ self._lock.acquire()
+
+ # It's cfdm.H5netcdfArray.__getitem__ that we want to
+ # call here, but we use 'Container' in super because
+ # that comes immediately before cfdm.H5netcdfArray in
+ # the method resolution order.
+ array = super(Container, self).__getitem__(index)
+
+ self._lock.release()
+ return array
diff --git a/cf/data/array/locks.py b/cf/data/array/locks.py
new file mode 100644
index 0000000000..5a7b2bd333
--- /dev/null
+++ b/cf/data/array/locks.py
@@ -0,0 +1,4 @@
+from dask.utils import SerializableLock
+
+# Global lock for netCDF file access
+netcdf_lock = SerializableLock()
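
This single module-level lock is what both backends hand to Dask, so that only one thread at a time goes through the HDF5/netCDF-C layer. A self-contained illustration of the pattern, with an in-memory stand-in for the file-backed array:

    import dask.array as da
    import numpy as np
    from dask.utils import SerializableLock

    netcdf_lock = SerializableLock()     # one lock shared by every array
    data = np.arange(1000.0)             # stand-in for a file variable
    x = da.from_array(data, chunks=250, lock=netcdf_lock)
    print(x.sum().compute())             # chunk reads are serialised by the lock
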
diff --git a/cf/data/array/mixin/__init__.py b/cf/data/array/mixin/__init__.py
index a9f7f75cb3..8e5dd7690d 100644
--- a/cf/data/array/mixin/__init__.py
+++ b/cf/data/array/mixin/__init__.py
@@ -1,3 +1,6 @@
+from .activestoragemixin import ActiveStorageMixin
from .arraymixin import ArrayMixin
+from .cfamixin import CFAMixin
from .compressedarraymixin import CompressedArrayMixin
from .filearraymixin import FileArrayMixin
+from .indexmixin import IndexMixin
diff --git a/cf/data/array/mixin/activestoragemixin.py b/cf/data/array/mixin/activestoragemixin.py
new file mode 100644
index 0000000000..024bb05d04
--- /dev/null
+++ b/cf/data/array/mixin/activestoragemixin.py
@@ -0,0 +1,28 @@
+class ActiveStorageMixin:
+ """Mixin class for enabling active storage operations.
+
+ .. versionadded:: NEXTVERSION
+
+ """
+
+ @property
+ def active_storage(self):
+ """Whether active storage operations are allowed.
+
+ Currently, active storage operations are allowed unless the
+ data are numerically packed.
+
+ .. versionadded:: NEXTVERSION
+
+ :Returns:
+
+ `bool`
+ `True` if active storage operations are allowed,
+ otherwise `False`.
+
+ """
+ attributes = self.get_attributes({})
+ if "add_offset" in attributes or "scale_factor" in attributes:
+ return False
+
+ return True
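
The property encodes one rule: numerically packed data cannot be reduced by active storage. The same check, pulled out as a plain function for illustration:

    def active_storage_allowed(attributes):
        # Packed data (scale_factor/add_offset present) must be unpacked
        # client-side, so active storage reductions are disallowed for it.
        return not ("add_offset" in attributes or "scale_factor" in attributes)

    print(active_storage_allowed({"units": "K"}))                        # True
    print(active_storage_allowed({"units": "K", "scale_factor": 0.01}))  # False
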
diff --git a/cf/data/array/mixin/arraymixin.py b/cf/data/array/mixin/arraymixin.py
index a167abec50..3468253b36 100644
--- a/cf/data/array/mixin/arraymixin.py
+++ b/cf/data/array/mixin/arraymixin.py
@@ -1,3 +1,5 @@
+import numpy as np
+
from ....units import Units
@@ -16,6 +18,22 @@ def __array_function__(self, func, types, args, kwargs):
"""
return NotImplemented
+ @property
+ def _meta(self):
+ """Normalise the array to an appropriate Dask meta object.
+
+ The Dask meta can be thought of as a suggestion to Dask. Dask
+ uses this meta to generate the task graph until it can infer
+ the actual metadata from the values. It does not force the
+ output to have the structure or dtype of the specified meta.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `dask.utils.meta_from_array`
+
+ """
+ return np.array((), dtype=self.dtype)
+
@property
def Units(self):
"""The `cf.Units` object containing the units of the array.
@@ -23,4 +41,4 @@ def Units(self):
.. versionadded:: 3.14.0
"""
- return Units(self.get_units(), self.get_calendar(None))
+ return Units(self.get_units(None), self.get_calendar(None))
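
The new `_meta` property replaces the removed `_dask_meta`: a zero-length array of the correct dtype is all Dask needs to plan the graph without reading any data. Roughly how such a meta is consumed:

    import dask.array as da
    import numpy as np

    meta = np.array((), dtype="float32")   # what _meta returns for float32 data
    x = da.from_array(np.arange(8, dtype="float32"), chunks=4, meta=meta)
    print(x.dtype, x.npartitions)          # float32 2
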
diff --git a/cf/data/array/cfanetcdfarray.py b/cf/data/array/mixin/cfamixin.py
similarity index 62%
rename from cf/data/array/cfanetcdfarray.py
rename to cf/data/array/mixin/cfamixin.py
index 86465309e9..6bcb01a468 100644
--- a/cf/data/array/cfanetcdfarray.py
+++ b/cf/data/array/mixin/cfamixin.py
@@ -4,36 +4,50 @@
import numpy as np
-from ..fragment import FullFragmentArray, NetCDFFragmentArray, UMFragmentArray
-from ..utils import chunk_locations, chunk_positions
-from .netcdfarray import NetCDFArray
+from ...utils import chunk_locations, chunk_positions
-# Store fragment array classes.
-_FragmentArray = {
- "nc": NetCDFFragmentArray,
- "um": UMFragmentArray,
- "full": FullFragmentArray,
-}
+class CFAMixin:
+ """Mixin class for a CFA array.
-class CFANetCDFArray(NetCDFArray):
- """A CFA aggregated array stored in a netCDF file.
-
- .. versionadded:: 3.14.0
+ .. versionadded:: NEXTVERSION
"""
+ def __new__(cls, *args, **kwargs):
+ """Store fragment array classes.
+
+ .. versionadded:: NEXTVERSION
+
+ """
+ # Import fragment array classes. Do this here (as opposed to
+ # outside the class) to avoid a circular import.
+ from ...fragment import (
+ FullFragmentArray,
+ NetCDFFragmentArray,
+ UMFragmentArray,
+ )
+
+ instance = super().__new__(cls)
+ instance._FragmentArray = {
+ "nc": NetCDFFragmentArray,
+ "um": UMFragmentArray,
+ "full": FullFragmentArray,
+ }
+ return instance
+
def __init__(
self,
filename=None,
address=None,
dtype=None,
mask=True,
- units=False,
- calendar=False,
+ unpack=True,
instructions=None,
substitutions=None,
term=None,
+ attributes=None,
+ storage_options=None,
source=None,
copy=True,
x=None,
@@ -43,39 +57,34 @@ def __init__(
:Parameters:
filename: (sequence of) `str`, optional
- The name of the CFA-netCDF file containing the
- array. If a sequence then it must contain one element.
+ The name of the CFA file containing the array. If a
+ sequence then it must contain one element.
address: (sequence of) `str`, optional
- The name of the CFA-netCDF aggregation variable for the
+ The name of the CFA aggregation variable for the
array. If a sequence then it must contain one element.
dtype: `numpy.dtype`
The data type of the aggregated data array. May be
`None` if the numpy data-type is not known (which can
- be the case for netCDF string types, for example).
+ be the case for some string types, for example).
mask: `bool`
If True (the default) then mask by convention when
reading data from disk.
- A netCDF array is masked depending on the values of any of
- the netCDF variable attributes ``valid_min``,
- ``valid_max``, ``valid_range``, ``_FillValue`` and
- ``missing_value``.
+            An array is masked depending on the values of any of
+ the variable attributes ``valid_min``, ``valid_max``,
+ ``valid_range``, ``_FillValue`` and ``missing_value``.
- units: `str` or `None`, optional
- The units of the aggregated data. Set to `None` to
- indicate that there are no units.
+ {{init unpack: `bool`, optional}}
- calendar: `str` or `None`, optional
- The calendar of the aggregated data. Set to `None` to
- indicate the CF default calendar, if applicable.
+ .. versionadded:: NEXTVERSION
instructions: `str`, optional
The ``aggregated_data`` attribute value as found on
- the CFA netCDF variable. If set then this will be used
- to improve the performance of `__dask_tokenize__`.
+ the CFA variable. If set then this will be used to
+ improve the performance of `__dask_tokenize__`.
substitutions: `dict`, optional
A dictionary whose key/value pairs define text
@@ -88,23 +97,66 @@ def __init__(
term: `str`, optional
The name of a non-standard aggregation instruction
term from which the array is to be created, instead of
- creating the aggregated data in the standard
- terms. If set then *address* must be the name of the
- term's CFA-netCDF aggregation instruction variable,
- which must be defined on the fragment dimensions and
- no others. Each value of the aggregation instruction
- variable will be broadcast across the shape of the
- corresponding fragment.
+ creating the aggregated data in the standard terms. If
+ set then *address* must be the name of the term's
+ aggregation instruction variable, which must be
+ defined on the fragment dimensions and no others. Each
+ value of the aggregation instruction variable will be
+ broadcast across the shape of the corresponding
+ fragment.
*Parameter example:*
``address='cfa_tracking_id', term='tracking_id'``
.. versionadded:: 3.15.0
+ storage_options: `dict` or `None`, optional
+ Key/value pairs to be passed on to the creation of
+ `s3fs.S3FileSystem` file systems to control the
+ opening of fragment files in S3 object stores. Ignored
+ for files not in an S3 object store, i.e. those whose
+ names do not start with ``s3:``.
+
+ By default, or if `None`, then *storage_options* is
+ taken as ``{}``.
+
+ If the ``'endpoint_url'`` key is not in
+ *storage_options* or is not in a dictionary defined by
+            the ``'client_kwargs'`` key (which is always the case
+ when *storage_options* is `None`), then one will be
+ automatically inserted for accessing a fragment S3
+ file. For example, for a file name of
+ ``'s3://store/data/file.nc'``, an ``'endpoint_url'``
+ key with value ``'https://store'`` would be created.
+
+ *Parameter example:*
+              ``{'key': 'scaleway-api-key...', 'secret':
+ 'scaleway-secretkey...', 'endpoint_url':
+ 'https://s3.fr-par.scw.cloud', 'client_kwargs':
+ {'region_name': 'fr-par'}}``
+
+ .. versionadded:: NEXTVERSION
+
+ {{init attributes: `dict` or `None`, optional}}
+
+ If *attributes* is `None`, the default, then the
+ attributes will be set from the netCDF variable during
+ the first `__getitem__` call.
+
+            .. versionadded:: NEXTVERSION
+
{{init source: optional}}
{{init copy: `bool`, optional}}
+ units: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
+ calendar: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
"""
if source is not None:
super().__init__(source=source, copy=copy)
@@ -135,90 +187,16 @@ def __init__(
term = None
elif filename is not None:
- aggregated_data = {}
-
- location = x["location"]
- ndim = location.shape[0]
-
- chunks = [i.compressed().tolist() for i in location]
- shape = [sum(c) for c in chunks]
- positions = chunk_positions(chunks)
- locations = chunk_locations(chunks)
-
- if term is not None:
- # --------------------------------------------------------
- # This fragment contains a constant value, not file
- # locations.
- # --------------------------------------------------------
- term = x[term]
- fragment_shape = term.shape
- aggregated_data = {
- frag_loc: {
- "location": loc,
- "fill_value": term[frag_loc].item(),
- "format": "full",
- }
- for frag_loc, loc in zip(positions, locations)
- }
- else:
- a = x["address"]
- f = x["file"]
- fmt = x["format"]
-
- extra_dimension = f.ndim > ndim
- if extra_dimension:
- # There is an extra non-fragment dimension
- fragment_shape = f.shape[:-1]
- else:
- fragment_shape = f.shape
-
- if not a.ndim:
- a = np.full(f.shape, a, dtype=a.dtype)
-
- if not fmt.ndim:
- fmt = np.full(fragment_shape, fmt, dtype=fmt.dtype)
-
- if extra_dimension:
- aggregated_data = {
- frag_loc: {
- "location": loc,
- "filename": f[frag_loc].tolist(),
- "address": a[frag_loc].tolist(),
- "format": fmt[frag_loc].item(),
- }
- for frag_loc, loc in zip(positions, locations)
- }
- else:
- aggregated_data = {
- frag_loc: {
- "location": loc,
- "filename": (f[frag_loc].item(),),
- "address": (a[frag_loc].item(),),
- "format": fmt[frag_loc].item(),
- }
- for frag_loc, loc in zip(positions, locations)
- }
-
- # Apply string substitutions to the fragment filenames
- if substitutions:
- for value in aggregated_data.values():
- filenames2 = []
- for filename in value["filename"]:
- for base, sub in substitutions.items():
- filename = filename.replace(base, sub)
-
- filenames2.append(filename)
-
- value["filename"] = filenames2
-
+ shape, fragment_shape, aggregated_data = self._parse_cfa(
+ x, term, substitutions
+ )
super().__init__(
filename=filename,
address=address,
shape=shape,
dtype=dtype,
mask=mask,
- units=units,
- calendar=calendar,
+ attributes=attributes,
copy=copy,
)
else:
@@ -227,8 +205,7 @@ def __init__(
address=address,
dtype=dtype,
mask=mask,
- units=units,
- calendar=calendar,
+ attributes=attributes,
copy=copy,
)
@@ -247,6 +224,134 @@ def __init__(
"substitutions", substitutions.copy(), copy=False
)
+ def _parse_cfa(self, x, term, substitutions):
+ """Parse the CFA aggregation instructions.
+
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
+
+ x: `dict`
+
+ term: `str` or `None`
+ The name of a non-standard aggregation instruction
+ term from which the array is to be created, instead of
+ creating the aggregated data in the standard
+ terms. Each value of the aggregation instruction
+ variable will be broadcast across the shape of the
+ corresponding fragment.
+
+ substitutions: `dict` or `None`
+ A dictionary whose key/value pairs define text
+ substitutions to be applied to the fragment file
+ names. Each key must be specified with the ``${...}``
+ syntax, for instance ``{'${base}': 'sub'}``.
+
+ :Returns:
+
+ 3-`tuple`
+ 1. The shape of the aggregated data.
+ 2. The shape of the array of fragments.
+ 3. The parsed aggregation instructions.
+
+ """
+ aggregated_data = {}
+
+ location = x["location"]
+ ndim = location.shape[0]
+ compressed = np.ma.compressed
+ chunks = [compressed(i).tolist() for i in location]
+ shape = [sum(c) for c in chunks]
+ positions = chunk_positions(chunks)
+ locations = chunk_locations(chunks)
+
+ if term is not None:
+ # --------------------------------------------------------
+ # Each fragment contains a constant value, not file
+ # locations.
+ # --------------------------------------------------------
+ term = x[term]
+ fragment_shape = term.shape
+ aggregated_data = {
+ frag_loc: {
+ "location": loc,
+ "fill_value": term[frag_loc].item(),
+ "format": "full",
+ }
+ for frag_loc, loc in zip(positions, locations)
+ }
+ else:
+ # --------------------------------------------------------
+ # Each fragment contains file locations
+ # --------------------------------------------------------
+ a = x["address"]
+ f = x["file"]
+ file_fmt = x["format"]
+
+ extra_dimension = f.ndim > ndim
+ if extra_dimension:
+ # There is an extra non-fragment dimension
+ fragment_shape = f.shape[:-1]
+ else:
+ fragment_shape = f.shape
+
+ if not a.ndim:
+ a = (a.item(),)
+ scalar_address = True
+ else:
+ scalar_address = False
+
+ if not file_fmt.ndim:
+ file_fmt = file_fmt.item()
+ scalar_fmt = True
+ else:
+ scalar_fmt = False
+
+ for frag_loc, location in zip(positions, locations):
+ if extra_dimension:
+ filename = compressed(f[frag_loc]).tolist()
+ if scalar_address:
+ address = a * len(filename)
+ else:
+                        address = compressed(a[frag_loc]).tolist()
+
+ if scalar_fmt:
+ fmt = file_fmt
+ else:
+ fmt = compressed(file_fmt[frag_loc]).tolist()
+ else:
+ filename = (f[frag_loc].item(),)
+ if scalar_address:
+ address = a
+ else:
+ address = (a[frag_loc].item(),)
+
+ if scalar_fmt:
+ fmt = file_fmt
+ else:
+ fmt = file_fmt[frag_loc].item()
+
+ aggregated_data[frag_loc] = {
+ "location": location,
+ "filename": filename,
+ "address": address,
+ "format": fmt,
+ }
+
+ # Apply string substitutions to the fragment filenames
+ if substitutions:
+ for value in aggregated_data.values():
+ filenames2 = []
+ for filename in value["filename"]:
+ for base, sub in substitutions.items():
+ filename = filename.replace(base, sub)
+
+ filenames2.append(filename)
+
+ value["filename"] = filenames2
+
+ return shape, fragment_shape, aggregated_data
+
def __dask_tokenize__(self):
"""Used by `dask.base.tokenize`.
@@ -298,12 +403,14 @@ def get_aggregated_data(self, copy=True):
>>> a.get_fragment_shape()
(2, 1, 1, 1)
>>> a.get_aggregated_data()
- {(0, 0, 0, 0): {'file': 'January-June.nc',
- 'address': 'temp',
+ {(0, 0, 0, 0): {
+ 'file': ('January-June.nc',),
+ 'address': ('temp',),
'format': 'nc',
'location': [(0, 6), (0, 1), (0, 73), (0, 144)]},
- (1, 0, 0, 0): {'file': 'July-December.nc',
- 'address': 'temp',
+ (1, 0, 0, 0): {
+ 'file': ('July-December.nc',),
+ 'address': ('temp',),
'format': 'nc',
'location': [(6, 12), (0, 1), (0, 73), (0, 144)]}}
@@ -357,6 +464,33 @@ def get_fragment_shape(self):
"""
return self._get_component("fragment_shape")
+ def get_storage_options(self):
+ """Return `s3fs.S3FileSystem` options for accessing S3 fragment files.
+
+ .. versionadded:: NEXTVERSION
+
+ :Returns:
+
+ `dict` or `None`
+ The `s3fs.S3FileSystem` options.
+
+ **Examples**
+
+ >>> f.get_storage_options()
+ {}
+
+ >>> f.get_storage_options()
+ {'anon': True}
+
+ >>> f.get_storage_options()
+        {'key': 'scaleway-api-key...',
+ 'secret': 'scaleway-secretkey...',
+ 'endpoint_url': 'https://s3.fr-par.scw.cloud',
+ 'client_kwargs': {'region_name': 'fr-par'}}
+
+ """
+ return super().get_storage_options(create_endpoint_url=False)
+
def get_term(self, default=ValueError()):
"""The CFA aggregation instruction term for the data, if set.
@@ -380,13 +514,18 @@ def get_term(self, default=ValueError()):
def subarray_shapes(self, shapes):
"""Create the subarray shapes.
+ A fragmented dimension (i.e. one spanned by two or more
+ fragments) will always have a subarray size equal to the
+ size of each of its fragments, overriding any other size
+ implied by the *shapes* parameter.
+
.. versionadded:: 3.14.0
.. seealso:: `subarrays`
:Parameters:
- shapes: `int`, sequence, `dict` or `str`, optional
+ shapes: `int`, sequence, `dict` or `str`, optional
Define the subarray shapes.
Any value accepted by the *chunks* parameter of the
@@ -427,7 +566,8 @@ def subarray_shapes(self, shapes):
from dask.array.core import normalize_chunks
- # Indices of fragmented dimensions
+ # Positions of fragmented dimensions (i.e. those spanned by
+ # two or more fragments)
f_dims = self.get_fragmented_dimensions()
shape = self.shape
@@ -440,8 +580,9 @@ def subarray_shapes(self, shapes):
zip(self.get_fragment_shape(), self.shape)
):
if dim in f_dims:
- # This aggregated dimension is spanned by more than
- # one fragment.
+ # This aggregated dimension is spanned by two or more
+ # fragments => set the chunks to be the same size as
+ # each fragment.
c = []
index = [0] * ndim
for j in range(n_fragments):
@@ -453,8 +594,8 @@ def subarray_shapes(self, shapes):
chunks.append(tuple(c))
else:
# This aggregated dimension is spanned by exactly one
- # fragment. Store None, for now, in the expectation
- # that it will get overwrittten.
+ # fragment => store `None` for now. This will get
+ # overwritten from 'shapes'.
chunks.append(None)
if isinstance(shapes, (str, Number)) or shapes is None:
@@ -657,18 +798,19 @@ def to_dask_array(self, chunks="auto"):
name = (f"{self.__class__.__name__}-{tokenize(self)}",)
dtype = self.dtype
- units = self.get_units()
+ units = self.get_units(None)
calendar = self.get_calendar(None)
aggregated_data = self.get_aggregated_data(copy=False)
# Set the chunk sizes for the dask array
chunks = self.subarray_shapes(chunks)
- if self.get_mask():
- fragment_arrays = _FragmentArray
- else:
- fragment_arrays = _FragmentArray.copy()
- fragment_arrays["nc"] = partial(_FragmentArray["nc"], mask=False)
+ fragment_arrays = self._FragmentArray
+ if not self.get_mask():
+ fragment_arrays = fragment_arrays.copy()
+ fragment_arrays["nc"] = partial(fragment_arrays["nc"], mask=False)
+
+ storage_options = self.get_storage_options()
dsk = {}
for (
@@ -691,6 +833,10 @@ def to_dask_array(self, chunks="auto"):
f"fragment dataset format: {fragment_format!r}"
)
+ if storage_options and kwargs["address"] == "nc":
+ # Pass on any file system options
+ kwargs["storage_options"] = storage_options
+
fragment = FragmentArray(
dtype=dtype,
shape=fragment_shape,
@@ -706,7 +852,7 @@ def to_dask_array(self, chunks="auto"):
key,
f_indices,
False,
- getattr(fragment, "_lock", False),
+ False,
)
# Return the dask array
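
For reference, the structure that `_parse_cfa` produces and `to_dask_array` consumes has the shape below (values adapted from the `get_aggregated_data` docstring example; key names follow `_parse_cfa` in this diff):

    aggregated_data = {
        (0, 0, 0, 0): {
            "location": [(0, 6), (0, 1), (0, 73), (0, 144)],
            "filename": ("January-June.nc",),
            "address": ("temp",),
            "format": "nc",
        },
        (1, 0, 0, 0): {
            "location": [(6, 12), (0, 1), (0, 73), (0, 144)],
            "filename": ("July-December.nc",),
            "address": ("temp",),
            "format": "nc",
        },
    }
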
diff --git a/cf/data/array/mixin/filearraymixin.py b/cf/data/array/mixin/filearraymixin.py
index 378567a23a..b5b314b9e2 100644
--- a/cf/data/array/mixin/filearraymixin.py
+++ b/cf/data/array/mixin/filearraymixin.py
@@ -1,8 +1,6 @@
from os import sep
from os.path import basename, dirname, join
-import numpy as np
-
from ....functions import _DEPRECATION_ERROR_ATTRIBUTE, abspath
@@ -26,20 +24,6 @@ def __dask_tokenize__(self):
self.get_addresses(),
)
- @property
- def _dask_meta(self):
- """The metadata for the containing dask array.
-
- This is the kind of array that will result from slicing the
- file array.
-
- .. versionadded:: 3.14.0
-
- .. seealso:: `dask.array.from_array`
-
- """
- return np.array((), dtype=self.dtype)
-
@property
def filename(self):
"""The name of the file containing the array.
diff --git a/cf/data/array/mixin/indexmixin.py b/cf/data/array/mixin/indexmixin.py
new file mode 100644
index 0000000000..d105ba943a
--- /dev/null
+++ b/cf/data/array/mixin/indexmixin.py
@@ -0,0 +1,364 @@
+from numbers import Integral
+
+import numpy as np
+from dask.array.slicing import normalize_index
+from dask.base import is_dask_collection
+
+from ....functions import indices_shape, parse_indices
+
+
+class IndexMixin:
+ """Mixin class for lazy indexing of a data array.
+
+    Data for a subspace are retrieved by casting the object as a
+ `numpy` array. See `__getitem__` for more details.
+
+ **Examples**
+
+ >>> a = cf.{{class}}(...)
+ >>> a.shape
+ (6, 5)
+ >>> print(np.asanyarray(a))
+    [[ 0  1  2  3  4]
+ [ 5 6 7 8 9]
+ [10 11 12 13 14]
+ [15 16 17 18 19]
+ [20 21 22 23 24]
+ [25 26 27 28 29]]
+ >>> a = a[::2, [1, 2, 4]]
+ >>> a = a[[True, False, True], :]
+ >>> a.shape
+ (2, 3)
+ >>> print(np.asanyarray(a))
+    [[ 1  2  4]
+     [21 22 24]]
+
+ .. versionadded:: NEXTVERSION
+
+ """
+
+ def __array__(self, *dtype):
+ """Convert the `{{class}}` into a `numpy` array.
+
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
+
+ dtype: optional
+ Typecode or data-type to which the array is cast.
+
+ :Returns:
+
+ `numpy.ndarray`
+ An independent `numpy` array of the subspace of the
+ data defined by the `indices` attribute.
+
+ """
+ array = self._get_array()
+ if dtype:
+ return array.astype(dtype[0], copy=False)
+
+ return array
+
+ def __getitem__(self, index):
+ """Returns a subspace of the data as a new `{{class}}`.
+
+ x.__getitem__(indices) <==> x[indices]
+
+ Subspaces created by indexing are lazy and are not applied
+ until the `{{class}}` object is converted to a `numpy` array,
+ by which time all lazily-defined subspaces will have been
+ converted to a single combined index which defines only the
+ actual elements that need to be retrieved from the original
+ data.
+
+ The combined index is orthogonal, meaning that the index for
+ each dimension is to be applied independently, regardless of
+ how that index was defined. For instance, the indices ``[[0,
+ 1], [1, 3], 0]`` and ``[:2, 1::2, 0]`` will give identical
+ results.
+
+ For example, if the original data has shape ``(12, 145, 192)``
+ and consecutive subspaces of ``[::2, [1, 3, 4], 96:]`` and
+ ``[[0, 5], [True, False, True], 0]`` are applied, then only
+    the elements defined by the combined index ``[[0, 10], [1, 4],
+ 96]`` will be retrieved from the data when `__array__` is
+ called.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `index`, `original_shape`, `__array__`,
+ `__getitem__`
+
+ :Returns:
+
+ `{{class}}`
+ The subspaced data.
+
+ """
+ shape0 = self.shape
+ index0 = self.index(conform=False)
+ original_shape = self.original_shape
+
+ index1 = parse_indices(shape0, index, keepdims=False)
+
+ new = self.copy()
+ new_indices = []
+ new_shape = []
+
+ i = 0
+ for ind0, original_size in zip(index0, original_shape):
+ if isinstance(ind0, Integral):
+ # The previous call to __getitem__ resulted in a
+ # dimension being removed (i.e. 'ind0' is
+ # integer-valued). Therefore 'index1' must have fewer
+ # elements than 'index0', so we need to "carry
+ # forward" the integer-valued index so that it is
+ # available at evaluation time.
+ new_indices.append(ind0)
+ continue
+
+ ind1 = index1[i]
+ size0 = shape0[i]
+ i += 1
+
+ # If this dimension is not subspaced by the new index then
+ # we don't need to update the old index.
+ if isinstance(ind1, slice) and ind1 == slice(None):
+ new_indices.append(ind0)
+ continue
+
+ # Still here? Then we have to work out the index of the
+ # full array that is equivalent to applying
+ # 'ind0' followed by 'ind1'.
+ if is_dask_collection(ind1):
+ # Note: This will never occur when this __getitem__ is
+ # being called from within a Dask graph, because
+ # any lazy indices will have already been
+ # computed as part of the whole graph execution;
+ # i.e. we don't have to worry about a
+ # compute-within-a-compute situation. (If this
+ # were not the case then we could add
+ # `scheduler="synchronous"` to the compute
+ # call.)
+ ind1 = ind1.compute()
+
+ if isinstance(ind0, slice):
+ if isinstance(ind1, slice):
+ # ind0: slice
+ # ind1: slice
+ start, stop, step = ind0.indices(original_size)
+ start1, stop1, step1 = ind1.indices(size0)
+ size1, mod1 = divmod(stop1 - start1, step1)
+
+ if mod1 != 0:
+ size1 += 1
+
+ start += start1 * step
+ step *= step1
+ stop = start + (size1 - 1) * step
+
+ if step > 0:
+ stop += 1
+ else:
+ stop -= 1
+
+ if stop < 0:
+ stop = None
+
+ new_index = slice(start, stop, step)
+ else:
+ # ind0: slice
+ # ind1: int, or array of int/bool
+ new_index = np.arange(*ind0.indices(original_size))[ind1]
+ else:
+ # ind0: array of int. If we made it to here then it
+ # can't be anything else. This is
+ # because we've dealt with ind0
+ # being a slice or an int, the
+ # very first ind0 is always
+ # slice(None), and a previous ind1
+ # that was an array of bool will
+ # have resulted in this ind0 being
+ # an array of int.
+ #
+ # ind1: anything
+ new_index = np.asanyarray(ind0)[ind1]
+
+ new_indices.append(new_index)
+
+ new._custom["index"] = tuple(new_indices)
+
+ # Find the shape defined by the new index
+ new_shape = indices_shape(new_indices, original_shape, keepdims=False)
+ new._set_component("shape", tuple(new_shape), copy=False)
+
+ return new
+
+ def __repr__(self):
+ """Called by the `repr` built-in function.
+
+ x.__repr__() <==> repr(x)
+
+ """
+        return (
+            f"<CF {self.__class__.__name__}{self.shape}: {self}>"
+        )
+
+ @property
+ def __asanyarray__(self):
+ """Whether the array is accessed by conversion to a `numpy` array.
+
+ .. versionadded:: NEXTVERSION
+
+ :Returns:
+
+ `True`
+
+ """
+ return True
+
+ def _get_array(self, index=None):
+ """Returns a subspace of the data as a `numpy` array.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `__array__`, `index`
+
+ :Parameters:
+
+ index: `tuple` or `None`, optional
+ Provide the indices that define the subspace. If
+ `None` then the `index` attribute is used.
+
+ :Returns:
+
+ `numpy.ndarray`
+ The subspace.
+
+ """
+        raise NotImplementedError(
+ f"Must implement {self.__class__.__name__}._get_array"
+ )
+
+ def index(self, conform=True):
+ """The index to be applied when converting to a `numpy` array.
+
+ The `shape` is defined by the `index` applied to the
+ `original_shape`.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `shape`, `original_shape`
+
+ :Parameters:
+
+ conform: `bool`, optional
+ If True, the default, then
+
+ * Convert a decreasing size 1 slice to an increasing
+ one.
+
+ * Convert, where possible, a sequence of integers to a
+ slice.
+
+ These transformations are to allow subspacing on data
+ objects that have restricted indexing functionality,
+ such as `h5py.Variable` objects.
+
+ If False then these transformations are not done.
+
+ :Returns:
+
+ `tuple`
+
+ **Examples**
+
+ >>> x.shape
+ (12, 145, 192)
+ >>> x.index()
+ (slice(None), slice(None), slice(None))
+ >>> x = x[8:7:-1, 10:19:3, [15, 1, 4, 12]]
+ >>> x = x[[0], [True, False, True], ::-2]
+ >>> x.shape
+ (1, 2, 2)
+ >>> x.index()
+ (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11))
+ >>> x.index(conform=False)
+ (array([8]), array([10, 16]), array([12, 1]))
+
+ """
+ ind = self._custom.get("index")
+ if ind is None:
+ # No indices have been applied yet, so define indices that
+ # are equivalent to Ellipsis, and set the original shape.
+ ind = (slice(None),) * self.ndim
+ self._custom["index"] = ind
+ self._custom["original_shape"] = self.shape
+ return ind
+
+ if not conform:
+ return ind
+
+ # Still here? Then conform the indices by:
+ #
+ # 1) Converting decreasing size 1 slices to increasing
+ # ones. This helps when the parent class can't cope with
+ # decreasing slices.
+ #
+ # 2) Converting, where possible, sequences of integers to
+ # slices. This helps when the parent class can't cope with
+ # indices that are sequences of integers.
+ ind = list(ind)
+ for n, (i, size) in enumerate(zip(ind[:], self.original_shape)):
+ if isinstance(i, slice):
+ if size == 1:
+ start, _, step = i.indices(size)
+ if step and step < 0:
+ # Decreasing slices are not universally
+ # accepted (e.g. `h5py` doesn't like them),
+ # but we can convert them to increasing ones.
+ ind[n] = slice(start, start + 1)
+ elif np.iterable(i):
+ i = normalize_index((i,), (size,))[0]
+ if i.size == 1:
+ # Convert a sequence of one integer into a slice
+ start = i.item()
+ ind[n] = slice(start, start + 1)
+ else:
+ # Convert a sequence of two or more evenly spaced
+ # integers into a slice.
+ step = np.unique(np.diff(i))
+ if step.size == 1:
+ start, stop = i[[0, -1]]
+ if stop >= start:
+ stop += 1
+ elif stop:
+ stop = -1
+ else:
+ stop = None
+
+ ind[n] = slice(start, stop, step.item())
+
+ return tuple(ind)
+
+ @property
+ def original_shape(self):
+ """The original shape of the data, before any subspacing.
+
+ The `shape` is defined by the result of subspacing the data in
+ its original shape with the indices given by `index`.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `index`, `shape`
+
+ """
+ out = self._custom.get("original_shape")
+ if out is None:
+ # No subspace has been defined yet
+ out = self.shape
+ self._custom["original_shape"] = out
+
+ return out
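
The index-combination rule implemented by `__getitem__` can be checked directly with `numpy`: applying two orthogonal subspaces one after the other must select the same elements as the single combined index.

    import numpy as np

    original = np.arange(12 * 6).reshape(12, 6)
    ind0 = (slice(None, None, 2), np.array([1, 3, 4]))   # first lazy subspace
    ind1 = ([0, 5], [True, False, True])                 # second lazy subspace

    # Combine per dimension, as IndexMixin does
    rows = np.arange(*ind0[0].indices(12))[ind1[0]]      # slice then integers
    cols = ind0[1][ind1[1]]                              # integers then booleans

    step_by_step = original[ind0[0]][:, ind0[1]][ind1[0]][:, ind1[1]]
    assert (original[np.ix_(rows, cols)] == step_by_step).all()
    print(rows, cols)                                    # [ 0 10] [1 4]
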
diff --git a/cf/data/array/netcdf4array.py b/cf/data/array/netcdf4array.py
new file mode 100644
index 0000000000..095bf2d3ad
--- /dev/null
+++ b/cf/data/array/netcdf4array.py
@@ -0,0 +1,80 @@
+import cfdm
+
+from ...mixin_container import Container
+from .locks import netcdf_lock
+from .mixin import ActiveStorageMixin, ArrayMixin, FileArrayMixin, IndexMixin
+
+
+class NetCDF4Array(
+ ActiveStorageMixin,
+ IndexMixin,
+ FileArrayMixin,
+ ArrayMixin,
+ Container,
+ cfdm.NetCDF4Array,
+):
+ """A netCDF array accessed with `netCDF4`.
+
+ **Active storage reductions**
+
+ An active storage reduction may be enabled with the `actify`
+ method. See `cf.data.collapse.Collapse` for details.
+
+ """
+
+ def __dask_tokenize__(self):
+ """Return a value fully representative of the object.
+
+ .. versionadded:: 3.15.0
+
+ """
+ return super().__dask_tokenize__() + (self.get_mask(),)
+
+ @property
+ def _lock(self):
+ """Set the lock for use in `dask.array.from_array`.
+
+ Returns a lock object because concurrent reads are not
+ currently supported by the netCDF and HDF libraries. The lock
+ object will be the same for all `NetCDF4Array` and
+ `H5netcdfArray` instances, regardless of the dataset they
+ access, which means that access to all netCDF and HDF files
+        is coordinated around the same lock.
+
+ .. versionadded:: 3.14.0
+
+ """
+ return netcdf_lock
+
+ def _get_array(self, index=None):
+ """Returns a subspace of the dataset variable.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `__array__`, `index`
+
+ :Parameters:
+
+ {{index: `tuple` or `None`, optional}}
+
+ :Returns:
+
+ `numpy.ndarray`
+ The subspace.
+
+ """
+ if index is None:
+ index = self.index()
+
+ # Note: We need to lock because the netCDF file is about to be
+ # accessed.
+ self._lock.acquire()
+
+ # Note: It's cfdm.NetCDFArray.__getitem__ that we want to call
+ # here, but we use 'Container' in super because that
+ # comes immediately before cfdm.NetCDFArray in the
+ # method resolution order.
+ array = super(Container, self).__getitem__(index)
+
+ self._lock.release()
+ return array
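
A hedged sketch of constructing one of these arrays directly. The file name and variable name are placeholders, and only keyword arguments that appear elsewhere in this diff are used:

    import numpy as np

    import cf

    a = cf.NetCDF4Array(
        filename="file.nc",          # placeholder file
        address="tas",               # placeholder variable name
        shape=(12, 64, 128),
        dtype=np.dtype("float32"),
    )
    print(a.shape)                   # no file access needed for this
    # Data are read, under the shared lock, only on conversion to numpy:
    # first_month = np.asanyarray(a[0])
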
diff --git a/cf/data/array/netcdfarray.py b/cf/data/array/netcdfarray.py
index 6dab56af1a..4642dff910 100644
--- a/cf/data/array/netcdfarray.py
+++ b/cf/data/array/netcdfarray.py
@@ -1,35 +1,16 @@
-import cfdm
-from dask.utils import SerializableLock
+class NetCDFArray:
+ """A netCDF array accessed with `netCDF4`.
-from ...mixin_container import Container
-from .mixin import ArrayMixin, FileArrayMixin
+ Deprecated at version NEXTVERSION and is no longer available. Use
+ `cf.NetCDF4Array` instead.
-# Global lock for netCDF file access
-_lock = SerializableLock()
+ """
+ def __init__(self, *args, **kwargs):
+ """**Initialisation**"""
+ from ...functions import DeprecationError
-class NetCDFArray(FileArrayMixin, ArrayMixin, Container, cfdm.NetCDFArray):
- """An array stored in a netCDF file."""
-
- def __dask_tokenize__(self):
- """Return a value fully representative of the object.
-
- .. versionadded:: 3.15.0
-
- """
- return super().__dask_tokenize__() + (self.get_mask(),)
-
- @property
- def _lock(self):
- """Set the lock for use in `dask.array.from_array`.
-
- Returns a lock object because concurrent reads are not
- currently supported by the netCDF-C library. The lock object
- will be the same for all `NetCDFArray` instances, regardless
- of the dataset they access, which means that access to all
- netCDF files coordinates around the same lock.
-
- .. versionadded:: 3.14.0
-
- """
- return _lock
+ raise DeprecationError(
+ f"{self.__class__.__name__} was deprecated at version NEXTVERSION "
+ "and is no longer available. Use cf.NetCDF4Array instead."
+ )
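
The old name now fails loudly rather than silently changing behaviour, which makes the migration to `cf.NetCDF4Array` explicit:

    import cf
    from cf.functions import DeprecationError

    try:
        cf.NetCDFArray()
    except DeprecationError as error:
        print(error)    # points the user at cf.NetCDF4Array
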
diff --git a/cf/data/array/umarray.py b/cf/data/array/umarray.py
index a5e24ddb74..510b9c97ee 100644
--- a/cf/data/array/umarray.py
+++ b/cf/data/array/umarray.py
@@ -1,19 +1,15 @@
import cfdm
-import numpy as np
from ...constants import _stash2standard_name
-from ...functions import (
- _DEPRECATION_ERROR_ATTRIBUTE,
- get_subspace,
- load_stash2standard_name,
- parse_indices,
-)
+from ...functions import _DEPRECATION_ERROR_ATTRIBUTE, load_stash2standard_name
from ...umread_lib.umfile import File, Rec
from .abstract import Array
-from .mixin import FileArrayMixin
+from .mixin import FileArrayMixin, IndexMixin
-class UMArray(FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array):
+class UMArray(
+ IndexMixin, FileArrayMixin, cfdm.data.mixin.FileArrayMixin, Array
+):
"""A sub-array stored in a PP or UM fields file."""
def __init__(
@@ -25,8 +21,7 @@ def __init__(
fmt=None,
word_size=None,
byte_ordering=None,
- units=False,
- calendar=False,
+ attributes=None,
source=None,
copy=True,
):
@@ -61,16 +56,15 @@ def __init__(
byte_ordering: `str`, optional
``'little_endian'`` or ``'big_endian'``
- units: `str` or `None`, optional
- The units of the fragment data. Set to `None` to
- indicate that there are no units. If unset then the
- units will be set during the first `__getitem__` call.
+ {{init attributes: `dict` or `None`, optional}}
- calendar: `str` or `None`, optional
- The calendar of the fragment data. Set to `None` to
- indicate the CF default calendar, if applicable. If
- unset then the calendar will be set during the first
- `__getitem__` call.
+ During the first `__getitem__` call, any of the
+ ``_FillValue``, ``add_offset``, ``scale_factor``,
+ ``units``, and ``calendar`` attributes which haven't
+ already been set will be inferred from the lookup
+ header and cached for future use.
+
+ .. versionadded:: NEXTVERSION
{{init source: optional}}
@@ -83,7 +77,7 @@ def __init__(
Deprecated at version 3.14.0.
header_offset: `int`
- Deprecated at version 3.15.0. use the *address*
+ Deprecated at version 3.15.0. Use the *address*
parameter instead.
data_offset: `int`, optional
@@ -92,6 +86,14 @@ def __init__(
disk_length: `int`, optional
Deprecated at version 3.15.0.
+ units: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
+ calendar: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
"""
super().__init__(source=source, copy=copy)
@@ -132,14 +134,9 @@ def __init__(
byte_ordering = None
try:
- units = source._get_component("units", False)
+ attributes = source._get_component("attributes", None)
except AttributeError:
- units = False
-
- try:
- calendar = source._get_component("calendar", False)
- except AttributeError:
- calendar = False
+ attributes = None
if filename is not None:
if isinstance(filename, str):
@@ -159,8 +156,7 @@ def __init__(
self._set_component("shape", shape, copy=False)
self._set_component("dtype", dtype, copy=False)
- self._set_component("units", units, copy=False)
- self._set_component("calendar", calendar, copy=False)
+ self._set_component("attributes", attributes, copy=False)
if fmt is not None:
self._set_component("fmt", fmt, copy=False)
@@ -174,75 +170,60 @@ def __init__(
# By default, close the UM file after data array access
self._set_component("close", True, copy=False)
- def __getitem__(self, indices):
- """Return a subspace of the array.
+ def _get_array(self, index=None):
+ """Returns a subspace of the dataset variable.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `__array__`, `index`
+
+ :Parameters:
+
+ {{index: `tuple` or `None`, optional}}
- x.__getitem__(indices) <==> x[indices]
+ :Returns:
- Returns a subspace of the array as an independent numpy array.
+ `numpy.ndarray`
+ The subspace.
"""
+ # Note: No need to lock the UM file - concurrent reads are OK.
+
+ if index is None:
+ index = self.index()
+
f, header_offset = self.open()
rec = self._get_rec(f, header_offset)
int_hdr = rec.int_hdr
real_hdr = rec.real_hdr
- array = rec.get_data().reshape(self.shape)
+ array = rec.get_data().reshape(self.original_shape)
self.close(f)
del f, rec
- if indices is not Ellipsis:
- indices = parse_indices(array.shape, indices)
- array = get_subspace(array, indices)
-
- # Set the units, if they haven't been set already.
- self._set_units(int_hdr)
-
- LBUSER2 = int_hdr.item(38)
- if LBUSER2 == 3:
- # Return the numpy array now if it is a boolean array
- self._set_component("dtype", np.dtype(bool), copy=False)
- return array.astype(bool)
-
- integer_array = LBUSER2 == 2
-
- # ------------------------------------------------------------
- # Convert to a masked array
- # ------------------------------------------------------------
- # Set the fill_value from BMDI
- fill_value = real_hdr.item(17)
- if fill_value != -1.0e30:
- # -1.0e30 is the flag for no missing data
- if integer_array:
- # The fill_value must be of the same type as the data
- # values
- fill_value = int(fill_value)
-
- # Mask any missing values
- mask = array == fill_value
- if mask.any():
- array = np.ma.masked_where(mask, array, copy=False)
-
- # ------------------------------------------------------------
- # Unpack the array using the scale_factor and add_offset, if
- # either is available
- # ------------------------------------------------------------
- # Treat BMKS as a scale_factor if it is neither 0 nor 1
- scale_factor = real_hdr.item(18)
- if scale_factor != 1.0 and scale_factor != 0.0:
- if integer_array:
- scale_factor = int(scale_factor)
-
- array *= scale_factor
-
- # Treat BDATUM as an add_offset if it is not 0
- add_offset = real_hdr.item(4)
- if add_offset != 0.0:
- if integer_array:
- add_offset = int(add_offset)
+ # Set the netCDF attributes for the data
+ attributes = self.get_attributes({})
+ self._set_units(int_hdr, attributes)
+ self._set_FillValue(int_hdr, real_hdr, attributes)
+ self._set_unpack(int_hdr, real_hdr, attributes)
+ self._set_component("attributes", attributes, copy=False)
+
+ # Get the data subspace, applying any masking and unpacking
+ array = cfdm.netcdf_indexer(
+ array,
+ mask=True,
+ unpack=True,
+ always_masked_array=False,
+ orthogonal_indexing=True,
+ attributes=attributes,
+ copy=False,
+ )
+ array = array[index]
- array += add_offset
+ if int_hdr.item(38) == 3:
+ # Convert the data to a boolean array
+ array = array.astype(bool)
# Set the data type
self._set_component("dtype", array.dtype, copy=False)
@@ -289,68 +270,144 @@ def _get_rec(self, f, header_offset):
# if r.hdr_offset == header_offset:
# return r
- def _set_units(self, int_hdr):
- """The units and calendar properties.
+ def _set_FillValue(self, int_hdr, real_hdr, attributes):
+ """Set the ``_FillValue`` attribute.
- These are set from inpection of the integer header, but only
- if they have already not been defined, either during {{class}}
- instantiation or by a previous call to `_set_units`.
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
+
+ int_hdr: `numpy.ndarray`
+ The integer header of the data.
+
+            real_hdr: `numpy.ndarray`
+ The real header of the data.
+
+ attributes: `dict`
+ The dictionary in which to store the new
+ attributes. If a new attribute exists then
+ *attributes* is updated in-place.
+
+ :Returns:
+
+            `None`
+
+ """
+ if "FillValue" in attributes:
+ return
+
+ # Set the fill_value from BMDI
+ _FillValue = real_hdr.item(17)
+ if _FillValue != -1.0e30:
+ # -1.0e30 is the flag for no missing data
+ if int_hdr.item(38) == 2:
+ # Must have an integer _FillValue for integer data
+ _FillValue = int(_FillValue)
+
+ attributes["_FillValue"] = _FillValue
+
+ def _set_units(self, int_hdr, attributes):
+ """Set the ``units`` attribute.
.. versionadded:: 3.14.0
+ .. versionadded:: NEXTVERSION
+
:Parameters:
int_hdr: `numpy.ndarray`
The integer header of the data.
+
+ attributes: `dict`
+ The dictionary in which to store the new
+                attributes. If a new attribute is created then
+ *attributes* is updated in-place.
+
:Returns:
- `tuple`
- The units and calendar values, either of which may be
- `None`.
+ `None`
+
+ """
+ if "units" in attributes:
+ return
+
+ units = None
+ if not _stash2standard_name:
+ load_stash2standard_name()
+
+ submodel = int_hdr.item(44)
+ stash = int_hdr.item(41)
+ records = _stash2standard_name.get((submodel, stash))
+ if records:
+ LBSRCE = int_hdr.item(37)
+ version, source = divmod(LBSRCE, 10000)
+ if version <= 0:
+ version = 405.0
+
+ for (
+ long_name,
+ units0,
+ valid_from,
+ valid_to,
+ standard_name,
+ cf_info,
+ condition,
+ ) in records:
+ if not self._test_version(
+ valid_from, valid_to, version
+ ) or not self._test_condition(condition, int_hdr):
+ continue
+
+ units = units0
+ break
+
+ attributes["units"] = units
+
+ def _set_unpack(self, int_hdr, real_hdr, attributes):
+ """Set the ``add_offset`` and ``scale_factor`` attributes.
+
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
+
+ int_hdr: `numpy.ndarray`
+ The integer header of the data.
+
+            real_hdr: `numpy.ndarray`
+ The real header of the data.
+
+ attributes: `dict`
+ The dictionary in which to store the new
+                attributes. If any new attributes are created then
+ *attributes* is updated in-place.
+
+ :Returns:
+
+            `None`
"""
- units = self._get_component("units", False)
- if units is False:
- units = None
-
- if not _stash2standard_name:
- load_stash2standard_name()
-
- submodel = int_hdr[44]
- stash = int_hdr[41]
- records = _stash2standard_name.get((submodel, stash))
- if records:
- LBSRCE = int_hdr[37]
- version, source = divmod(LBSRCE, 10000)
- if version <= 0:
- version = 405.0
-
- for (
- long_name,
- units0,
- valid_from,
- valid_to,
- standard_name,
- cf_info,
- condition,
- ) in records:
- if not self._test_version(
- valid_from, valid_to, version
- ) or not self._test_condition(condition, int_hdr):
- continue
-
- units = units0
- break
-
- self._set_component("units", units, copy=False)
-
- calendar = self._get_component("calendar", False)
- if calendar is False:
- calendar = None
- self._set_component("calendar", calendar, copy=False)
-
- return units, calendar
+ if "scale_factor" not in attributes:
+ # Treat BMKS as a scale_factor if it is neither 0 nor 1
+ scale_factor = real_hdr.item(18)
+ if scale_factor != 1.0 and scale_factor != 0.0:
+ if int_hdr.item(38) == 2:
+ # Must have an integer scale_factor for integer data
+ scale_factor = int(scale_factor)
+
+ attributes["scale_factor"] = scale_factor
+
+ if "add_offset" not in attributes:
+ # Treat BDATUM as an add_offset if it is not 0
+ add_offset = real_hdr.item(4)
+ if add_offset != 0.0:
+ if int_hdr.item(38) == 2:
+ # Must have an integer add_offset for integer data
+ add_offset = int(add_offset)
+
+ attributes["add_offset"] = add_offset
def _test_condition(self, condition, int_hdr):
"""Return `True` if a field satisfies a condition for a STASH
@@ -381,14 +438,14 @@ def _test_condition(self, condition, int_hdr):
return True
if condition == "true_latitude_longitude":
- LBCODE = int_hdr[15]
+ LBCODE = int_hdr.item(15)
# LBCODE 1: Unrotated regular lat/long grid
# LBCODE 2 = Regular lat/lon grid boxes (grid points are
# box centres)
if LBCODE in (1, 2):
return True
elif condition == "rotated_latitude_longitude":
- LBCODE = int_hdr[15]
+ LBCODE = int_hdr.item(15)
# LBCODE 101: Rotated regular lat/long grid
# LBCODE 102: Rotated regular lat/lon grid boxes (grid
# points are box centres)
@@ -686,7 +743,7 @@ def open(self):
**Examples**
>>> f.open()
- (, 4)
+ (, 4)
"""
return super().open(
diff --git a/cf/data/collapse/__init__.py b/cf/data/collapse/__init__.py
index 0de12360ea..ec720438f0 100644
--- a/cf/data/collapse/__init__.py
+++ b/cf/data/collapse/__init__.py
@@ -1 +1,2 @@
from .collapse import Collapse
+from .collapse_active import active_reduction_methods
diff --git a/cf/data/collapse/collapse.py b/cf/data/collapse/collapse.py
index 0220f3e17a..550c14959c 100644
--- a/cf/data/collapse/collapse.py
+++ b/cf/data/collapse/collapse.py
@@ -11,6 +11,47 @@
class Collapse(metaclass=DocstringRewriteMeta):
"""Container for functions that collapse dask arrays.
+ **Active storage reductions**
+
+ A collapse method (such as `max`, `var`, etc.) will attempt to
+ make use of active storage reduction on an individual `dask` chunk
+ when all of the following conditions are met:
+
+ * `cf.active_storage()` is True;
+
+    * `cf.active_storage_url()` returns the URL of a valid active
+ storage server;
+
+ * the `dask` chunk's data are defined by a netCDF-4 file on disk
+ (rather than in any other file format, or in memory);
+
+ * it is possible to import the `activestorage.Active` class;
+
+ * the method is one of those specified by
+ `cf.data.collapse.active_reduction_methods`;
+
+ * the collapse is over all axes;
+
+ * the collapse is unweighted;
+
+ * the data are not numerically packed.
+
+ If any of these conditions are not met then the `dask` chunk will
+ be collapsed "as usual", i.e. by retrieving the data to memory (if
+ it is not already there) and using the local client to perform the
+ collapse calculations.
+
+ .. note:: The performance improvements from using active storage
+ operations will increase the closer, in a network sense,
+ the active storage server is to the data storage. If the
+ active storage server is sufficiently far away from the
+              data then it could even be faster, and use less energy,
+              to do a non-active operation with the local client.
+
+ See `cf.data.collapse.collapse_active.actify` and
+ `cf.data.collapse.collapse_active.active_chunk_function` for
+ further details.
+
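+    **Examples**
+
+    A minimal sketch of how active storage reductions might be
+    enabled before a collapse. The server URL and file name are
+    hypothetical placeholders, and the configuration functions are
+    assumed to follow the usual `cf` constant-setting pattern:
+
+    >>> import cf
+    >>> old = cf.active_storage(True)
+    >>> old_url = cf.active_storage_url('https://active.example.org')
+    >>> d = cf.read('s3://bucket/data.nc')[0].data
+    >>> result = d.max()  # may be reduced on the active storage server
+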
.. versionadded:: 3.14.0
"""
@@ -52,7 +93,7 @@ def max(
a,
axis=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -80,7 +121,7 @@ def max(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -91,6 +132,7 @@ def max(
from .dask_collapse import cf_max_agg, cf_max_chunk, cf_max_combine
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_max_chunk
check_input_dtype(a)
@@ -113,7 +155,7 @@ def max_abs(
a,
axis=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -141,7 +183,8 @@ def max_abs(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
+
:Returns:
@@ -163,7 +206,7 @@ def mean(
axis=None,
weights=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -193,7 +236,7 @@ def mean(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -204,6 +247,7 @@ def mean(
from .dask_collapse import cf_mean_agg, cf_mean_chunk, cf_mean_combine
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_mean_chunk
check_input_dtype(a)
@@ -228,7 +272,7 @@ def mean_abs(
weights=None,
axis=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -258,7 +302,7 @@ def mean_abs(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -281,7 +325,7 @@ def mid_range(
axis=None,
dtype=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -309,7 +353,7 @@ def mid_range(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -324,6 +368,7 @@ def mid_range(
)
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_range_chunk
check_input_dtype(a, allowed="fi")
@@ -346,7 +391,7 @@ def min(
a,
axis=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -374,7 +419,7 @@ def min(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -385,6 +430,7 @@ def min(
from .dask_collapse import cf_min_agg, cf_min_chunk, cf_min_combine
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_min_chunk
check_input_dtype(a)
@@ -407,7 +453,7 @@ def min_abs(
a,
axis=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -435,7 +481,7 @@ def min_abs(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -456,7 +502,7 @@ def range(
a,
axis=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -484,7 +530,7 @@ def range(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -499,6 +545,7 @@ def range(
)
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_range_chunk
check_input_dtype(a, allowed="fi")
@@ -522,7 +569,7 @@ def rms(
axis=None,
weights=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -552,7 +599,8 @@ def rms(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
+
:Returns:
@@ -563,6 +611,7 @@ def rms(
from .dask_collapse import cf_mean_combine, cf_rms_agg, cf_rms_chunk
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_rms_chunk
check_input_dtype(a)
@@ -586,7 +635,7 @@ def sample_size(
a,
axis=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -614,7 +663,7 @@ def sample_size(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -629,6 +678,7 @@ def sample_size(
)
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_sample_size_chunk
check_input_dtype(a)
@@ -652,7 +702,7 @@ def sum(
axis=None,
weights=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -682,7 +732,7 @@ def sum(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -693,6 +743,7 @@ def sum(
from .dask_collapse import cf_sum_agg, cf_sum_chunk, cf_sum_combine
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_sum_chunk
check_input_dtype(a)
@@ -720,7 +771,7 @@ def sum_of_weights(
axis=None,
weights=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -750,7 +801,7 @@ def sum_of_weights(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -765,6 +816,7 @@ def sum_of_weights(
)
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_sum_of_weights_chunk
check_input_dtype(a)
@@ -789,7 +841,7 @@ def sum_of_weights2(
axis=None,
weights=None,
keepdims=False,
- mtol=None,
+ mtol=1,
split_every=None,
chunk_function=None,
):
@@ -819,7 +871,7 @@ def sum_of_weights2(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -830,17 +882,18 @@ def sum_of_weights2(
from .dask_collapse import (
cf_sum_agg,
cf_sum_combine,
- cf_sum_of_weights_chunk,
+ cf_sum_of_weights2_chunk,
)
if chunk_function is None:
- chunk_function = cf_sum_of_weights_chunk
+ # Default function for chunk calculations
+ chunk_function = cf_sum_of_weights2_chunk
check_input_dtype(a)
dtype = double_precision_dtype(weights, default="i8")
return reduction(
a,
- partial(chunk_function, square=True),
+ chunk_function,
partial(cf_sum_agg, mtol=mtol, original_shape=a.shape),
axis=axis,
keepdims=keepdims,
@@ -864,7 +917,7 @@ def unique(self, a, split_every=None, chunk_function=None):
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
:Returns:
@@ -875,6 +928,7 @@ def unique(self, a, split_every=None, chunk_function=None):
from .dask_collapse import cf_unique_agg, cf_unique_chunk
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_unique_chunk
check_input_dtype(a, "fibUS")
@@ -905,7 +959,7 @@ def var(
axis=None,
weights=None,
keepdims=False,
- mtol=None,
+ mtol=1,
ddof=None,
split_every=None,
chunk_function=None,
@@ -938,7 +992,12 @@ def var(
{{split_every: `int` or `dict`, optional}}
- {{chunk_function: callable, optional}}
+ {{chunk_function: callable or `None`, optional}}
+
+ A callable function must accept a *ddof* keyword
+ parameter that sets the delta degrees of freedom. See
+ `cf.data.collapse.dask_collapse.cf_var_chunk` for
+ details.
:Returns:
@@ -949,6 +1008,7 @@ def var(
from .dask_collapse import cf_var_agg, cf_var_chunk, cf_var_combine
if chunk_function is None:
+ # Default function for chunk calculations
chunk_function = cf_var_chunk
check_input_dtype(a)
diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py
new file mode 100644
index 0000000000..db8fc277f6
--- /dev/null
+++ b/cf/data/collapse/collapse_active.py
@@ -0,0 +1,286 @@
+import datetime
+import logging
+import time
+from functools import wraps
+from numbers import Integral
+
+try:
+ from activestorage import Active
+except ModuleNotFoundError:
+ pass
+
+from ...functions import (
+ active_storage,
+ active_storage_max_requests,
+ active_storage_url,
+ is_log_level_info,
+)
+
+logger = logging.getLogger(__name__)
+
+# --------------------------------------------------------------------
+# Specify which reduction methods are possible with active storage
+# --------------------------------------------------------------------
+active_reduction_methods = ("max", "mean", "min", "sum")
+
+
+class ActiveStorageError(Exception):
+ pass
+
+
+def actify(method):
+ """Decorator for active storage reductions on chunks.
+
+ Intended to decorate the ``cf_*_chunk`` methods in
+ `cf.data.collapse.dask_collapse`.
+
+ When a ``cf_*_chunk`` method is decorated, its computations will
+ be attempted in active storage. If that is not possible (due to
+ configuration settings, limitations on the type of reduction that
+    can be done in active storage, or a failure of the active storage
+    reduction) then the computations will be done locally "as usual".
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `active_chunk_function`
+
+ :Parameters:
+
+ method: `str`
+ The name of the reduction method.
+
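+    **Examples**
+
+    A sketch of the intended usage, mirroring how the chunk functions
+    in `cf.data.collapse.dask_collapse` are decorated:
+
+    >>> @actify("mean")
+    ... def cf_mean_chunk(x, weights=None, dtype="f8", **kwargs):
+    ...     ...
+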
+ """
+
+ def decorator(chunk_function):
+ @wraps(chunk_function)
+ def wrapper(*args, **kwargs):
+ try:
+ # Try doing an active storage reduction
+ out = active_chunk_function(method, *args, **kwargs)
+ except ActiveStorageError as error:
+ # The active storage reduction failed
+ logger.warning(
+ f"{error} => reverting to local computation"
+ ) # pragma: no cover
+ else:
+ if out is not None:
+ # The active storage reduction succeeded
+ return out
+
+ # Still here? Then using active storage was not
+ # appropriate, or else doing the active storage operation
+ # failed => do a local computation.
+ return chunk_function(*args, **kwargs)
+
+ return wrapper
+
+ return decorator
+
+
+def active_chunk_function(method, *args, **kwargs):
+ """Collapse data in a chunk with active storage.
+
+ Called by the `actify` decorator function.
+
+ If an active storage reduction is not appropriate then `None` is
+ returned.
+
+    If the active storage operation fails then an `ActiveStorageError`
+ is raised.
+
+ If the active storage operation is successful then a dictionary of
+ reduction components, similar to that returned by a ``cf_*_chunk``
+ method, is returned.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `actify`
+
+ :Parameters:
+
+ method: `str`
+ The name of the reduction method (e.g. ``'mean'``).
+
+ x: array_like
+ The data to be collapsed.
+
+ kwargs: optional
+ Extra keyword arguments that define the reduction.
+
+ :Returns:
+
+ `dict` or `None`
+ The reduced data in component form, or `None` if an active
+ storage reduction is not appropriate.
+
+ **Examples**
+
+ >>> d = active_chunk_function('sum', x)
+ >>> print(d)
+ {'N': 7008, 'sum': 7006221.66903949}
+
+ Active storage reduction is not (yet) possible for variances:
+
+ >>> d = active_chunk_function('variance', x, weights)
+ >>> print(d)
+ None
+
+ """
+ x = args[0]
+
+ # Dask reduction machinery
+ if kwargs.get("computing_meta"):
+ return x
+
+ # ----------------------------------------------------------------
+ # Return None if active storage reduction is not appropriate.
+ # Inside `actify`, this will trigger a local reduction to be
+ # carried out instead.
+ # ----------------------------------------------------------------
+ if not active_storage():
+ # Active storage is turned off
+ return
+
+ url = kwargs.get("active_storage_url")
+ if url is None:
+ url = active_storage_url().value
+ if url is None:
+ # Active storage is not possible when no active storage
+ # server URL has been provided
+ return
+
+ if method not in active_reduction_methods:
+ # Active storage is not (yet) available for this method
+ return
+
+ if not getattr(x, "active_storage", False):
+ # The data object 'x' is incompatible with active storage
+ # operations. E.g. it is a UMArray object, a numpy array, etc.
+ return
+
+ if len(args) == 2:
+ # Weights, if present, are always passed in as a positional
+ # parameter, never as a keyword parameter (see
+ # `dask.array.reductions.reduction` for details).
+ weights = args[1]
+ if weights is not None:
+ # Active storage is not (yet) allowed for weighted
+ # reductions
+ return
+
+ axis = kwargs.get("axis")
+ if axis is not None:
+ if isinstance(axis, Integral):
+ axis = (axis,)
+
+ if len(axis) < x.ndim:
+ # Active storage is not (yet) allowed for reductions over
+ # a subset of the axes
+ return
+
+ # ----------------------------------------------------------------
+ # Still here? Set up an Active instance that will carry out the
+ # active storage operation. If the operation fails, for any
+ # reason, then this will trigger (inside `actify`) a local
+ # reduction being carried out instead.
+ # ----------------------------------------------------------------
+ filename = x.get_filename()
+ address = x.get_address()
+ max_requests = active_storage_max_requests()
+ active_kwargs = {
+ "uri": "/".join(filename.split("/")[3:]),
+ "ncvar": address,
+ "storage_options": x.get_storage_options(),
+ "active_storage_url": url,
+ "storage_type": "s3",
+ "max_threads": max_requests,
+ }
+ # WARNING: The "uri", "storage_options", and "storage_type" keys
+ # of the `active_kwargs` dictionary are currently
+ # formatted according to the whims of the `Active` class
+ # (i.e. the pyfive branch of PyActiveStorage). Future
+ # versions of `Active` will have a better API, that will
+ # require improvements to `active_kwargs`.
+
+ index = x.index()
+
+ details = (
+ f"{method!r} (file={filename}, address={address}, url={url}, "
+ f"Dask chunk={index})"
+ )
+
+ info = is_log_level_info(logger)
+ if info:
+ # Do some detailed logging
+ start = time.time()
+ logger.info(
+ f"STARTED active storage {details}: {datetime.datetime.now()}"
+ ) # pragma: no cover
+
+ active = Active(**active_kwargs)
+ active.method = method
+ active.components = True
+
+ # Instruct the `Active` class to attempt an active storage
+ # reduction on the remote server
+ #
+ # WARNING: The `_version` API of `Active` is likely to change from
+ # the current version (i.e. the pyfive branch of
+ # PyActiveStorage)
+ active._version = 2
+
+ # ----------------------------------------------------------------
+ # Execute the active storage operation by indexing the Active
+ # instance
+ # ----------------------------------------------------------------
+ try:
+ d = active[index]
+ except Exception as error:
+ # Something went wrong with the active storage operations =>
+ # Raise an ActiveStorageError that will in turn trigger
+ # (inside `actify`) a local reduction to be carried out
+ # instead.
+ raise ActiveStorageError(
+            f"FAILED in active storage {details} ({error})"
+ )
+ else:
+ # Active storage reduction was successful
+ if info:
+ # Do some detailed logging
+ try:
+ md = active.metric_data
+ except AttributeError:
+ logger.info(
+ f"FINISHED active storage {details}: "
+ f"{time.time() - start:6.2f}s"
+ ) # pragma: no cover
+ else:
+ logger.info(
+ f"FINISHED active storage {details}: "
+ f"dataset chunks: {md['dataset chunks']}, "
+ f"load nc (s): {md['load nc time']:6.2f}, "
+ f"indexing (s): {md['indexing time (s)']:6.2f}, "
+ f"reduction (s): {md['reduction time (s)']:6.2f}, "
+ f"selection 2 (s): {md['selection 2 time (s)']:6.2f}, "
+ f"Total: {(time.time() - start):6.2f}s"
+ ) # pragma: no cover
+
+ # ----------------------------------------------------------------
+ # Active storage reduction was a success. Reformat the resulting
+ # components dictionary 'd' to match the output of the
+ # corresponding local chunk function (e.g. `cf_mean_chunk`).
+ # ----------------------------------------------------------------
+ if method == "max":
+ # Local chunk function `cf_max_chunk`
+ d = {"N": d["n"], "max": d["max"]}
+ elif method == "mean":
+ # Local chunk function `cf_mean_chunk`
+ d = {"N": d["n"], "sum": d["sum"], "V1": d["n"], "weighted": False}
+ elif method == "min":
+ # Local chunk function `cf_min_chunk`
+ d = {"N": d["n"], "min": d["min"]}
+ elif method == "sum":
+ # Local chunk function `cf_sum_chunk`
+ d = {"N": d["n"], "sum": d["sum"]}
+
+ return d
diff --git a/cf/data/collapse/dask_collapse.py b/cf/data/collapse/dask_collapse.py
index 05f83ce077..9476ffcb0b 100644
--- a/cf/data/collapse/dask_collapse.py
+++ b/cf/data/collapse/dask_collapse.py
@@ -1,7 +1,8 @@
"""Reduction functions intended to be passed to be dask.
-Most of these functions are expected to be set as *chunk*, *combine* and
-*aggregate* parameters of `dask.array.reduction`
+Most of these functions are expected to be passed to
+`dask.array.reduction` as its *chunk*, *combine* and *aggregate*
+parameters.
"""
@@ -15,6 +16,8 @@
from dask.core import flatten
from dask.utils import deepmap
+from ..dask_utils import cf_asanyarray
+from .collapse_active import actify
from .collapse_utils import double_precision_dtype
@@ -37,16 +40,15 @@ def mask_small_sample_size(x, N, axis, mtol, original_shape):
mtol: number
The sample size threshold below which collapsed values are
set to missing data. It is defined as a fraction (between
- 0 and 1 inclusive) of the contributing input data values.
+ 0 and 1 inclusive) of the contributing input data
+ values. A missing value in the output array occurs
+ whenever more than ``100*mtol%`` of its contributing input
+ array elements are missing data.
- The default of *mtol* is 1, meaning that a missing datum
+ The default of *mtol* is 1, meaning that a missing value
in the output array occurs whenever all of its
contributing input array elements are missing data.
- For other values, a missing datum in the output array
- occurs whenever more than ``100*mtol%`` of its
- contributing input array elements are missing data.
-
Note that for non-zero values of *mtol*, different
collapsed elements may have different sample sizes,
depending on the distribution of missing data in the input
@@ -126,7 +128,8 @@ def sum_weights_chunk(
N = cf_sample_size_chunk(x, **kwargs)["N"]
return N
- elif check_weights:
+
+ if check_weights:
w_min = weights.min()
if w_min <= 0:
raise ValueError(
@@ -227,6 +230,7 @@ def sum_sample_sizes(pairs, axis, computing_meta=False, **kwargs):
# --------------------------------------------------------------------
# mean
# --------------------------------------------------------------------
+@actify("mean")
def cf_mean_chunk(
x,
weights=None,
@@ -237,8 +241,13 @@ def cf_mean_chunk(
):
"""Chunk calculations for the mean.
- This function is passed to `dask.array.reduction` as its *chunk*
- parameter.
+ This function is passed to `dask.array.reduction` as its *chunk*
+ parameter.
+
+ Weights are interpreted as reliability weights, as opposed to
+ frequency weights. See
+ https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights
+ for details.
.. versionadded:: 3.14.0
@@ -267,11 +276,15 @@ def cf_mean_chunk(
if computing_meta:
return x
+ x = cf_asanyarray(x)
+ if weights is not None:
+ weights = cf_asanyarray(weights)
+
# N, sum
- d = cf_sum_chunk(x, weights, dtype=dtype, **kwargs)
+ d = cf_sum_chunk(x, weights=weights, dtype=dtype, **kwargs)
d["V1"] = sum_weights_chunk(
- x, weights, N=d["N"], check_weights=False, **kwargs
+ x, weights=weights, N=d["N"], check_weights=False, **kwargs
)
d["weighted"] = weights is not None
@@ -363,6 +376,7 @@ def cf_mean_agg(
# --------------------------------------------------------------------
# maximum
# --------------------------------------------------------------------
+@actify("max")
def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs):
"""Chunk calculations for the maximum.
@@ -381,12 +395,14 @@ def cf_max_chunk(x, dtype=None, computing_meta=False, **kwargs):
Dictionary with the keys:
* N: The sample size.
- * max: The maximum of `x``.
+ * max: The maximum of ``x``.
"""
if computing_meta:
return x
+ x = cf_asanyarray(x)
+
return {
"max": chunk.max(x, **kwargs),
"N": cf_sample_size_chunk(x, **kwargs)["N"],
@@ -514,6 +530,7 @@ def cf_mid_range_agg(
# --------------------------------------------------------------------
# minimum
# --------------------------------------------------------------------
+@actify("min")
def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs):
"""Chunk calculations for the minimum.
@@ -538,6 +555,8 @@ def cf_min_chunk(x, dtype=None, computing_meta=False, **kwargs):
if computing_meta:
return x
+ x = cf_asanyarray(x)
+
return {
"min": chunk.min(x, **kwargs),
"N": cf_sample_size_chunk(x, **kwargs)["N"],
@@ -617,6 +636,7 @@ def cf_min_agg(
# --------------------------------------------------------------------
# range
# --------------------------------------------------------------------
+@actify("range")
def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs):
"""Chunk calculations for the range.
@@ -636,12 +656,14 @@ def cf_range_chunk(x, dtype=None, computing_meta=False, **kwargs):
* N: The sample size.
* min: The minimum of ``x``.
- * max: The maximum of ``x`.
+ * max: The maximum of ``x``.
"""
if computing_meta:
return x
+ x = cf_asanyarray(x)
+
# N, max
d = cf_max_chunk(x, **kwargs)
@@ -727,12 +749,18 @@ def cf_range_agg(
# --------------------------------------------------------------------
# root mean square
# --------------------------------------------------------------------
+@actify("rms")
def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs):
"""Chunk calculations for the root mean square (RMS).
This function is passed to `dask.array.reduction` as its *chunk*
parameter.
+ Weights are interpreted as reliability weights, as opposed to
+ frequency weights. See
+ https://en.wikipedia.org/wiki/Weighted_arithmetic_mean#Reliability_weights
+ for details.
+
.. versionadded:: 3.14.0
:Parameters:
@@ -751,6 +779,8 @@ def cf_rms_chunk(x, weights=None, dtype="f8", computing_meta=False, **kwargs):
if computing_meta:
return x
+ x = cf_asanyarray(x)
+
return cf_mean_chunk(
np.multiply(x, x, dtype=dtype), weights=weights, dtype=dtype, **kwargs
)
@@ -803,6 +833,7 @@ def cf_rms_agg(
# --------------------------------------------------------------------
# sample size
# --------------------------------------------------------------------
+@actify("sample_size")
def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs):
"""Chunk calculations for the sample size.
@@ -826,6 +857,8 @@ def cf_sample_size_chunk(x, dtype="i8", computing_meta=False, **kwargs):
if computing_meta:
return x
+ x = cf_asanyarray(x)
+
if np.ma.isMA(x):
N = chunk.sum(np.ones_like(x, dtype=dtype), **kwargs)
else:
@@ -913,6 +946,7 @@ def cf_sample_size_agg(
# --------------------------------------------------------------------
# sum
# --------------------------------------------------------------------
+@actify("sum")
def cf_sum_chunk(
x,
weights=None,
@@ -951,7 +985,10 @@ def cf_sum_chunk(
if computing_meta:
return x
+ x = cf_asanyarray(x)
+
if weights is not None:
+ weights = cf_asanyarray(weights)
if check_weights:
w_min = weights.min()
if w_min <= 0:
@@ -1044,8 +1081,9 @@ def cf_sum_agg(
# --------------------------------------------------------------------
# sum of weights
# --------------------------------------------------------------------
+@actify("sum_of_weights")
def cf_sum_of_weights_chunk(
- x, weights=None, dtype="f8", computing_meta=False, square=False, **kwargs
+ x, weights=None, dtype="f8", computing_meta=False, **kwargs
):
"""Chunk calculations for the sum of the weights.
@@ -1054,9 +1092,50 @@ def cf_sum_of_weights_chunk(
:Parameters:
- square: `bool`, optional
- If True then calculate the sum of the squares of the
- weights.
+ See `dask.array.reductions` for details of the other
+ parameters.
+
+ :Returns:
+
+ `dict`
+ Dictionary with the keys:
+
+ * N: The sample size.
+ * sum: The sum of ``weights``.
+
+ """
+ if computing_meta:
+ return x
+
+ x = cf_asanyarray(x)
+ if weights is not None:
+ weights = cf_asanyarray(weights)
+
+ # N
+ d = cf_sample_size_chunk(x, **kwargs)
+
+ d["sum"] = sum_weights_chunk(
+ x, weights=weights, square=False, N=d["N"], **kwargs
+ )
+
+ return d
+
+
+# --------------------------------------------------------------------
+# sum of squares of weights
+# --------------------------------------------------------------------
+@actify("sum_of_weights2")
+def cf_sum_of_weights2_chunk(
+ x, weights=None, dtype="f8", computing_meta=False, **kwargs
+):
+ """Chunk calculations for the sum of the squares of the weights.
+
+ This function is passed to `dask.array.reduction` as its *chunk*
+ parameter.
+
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
See `dask.array.reductions` for details of the other
parameters.
@@ -1067,18 +1146,21 @@ def cf_sum_of_weights_chunk(
Dictionary with the keys:
* N: The sample size.
- * sum: The sum of ``weights``, or the sum of
- ``weights**2`` if *square* is True.
+ * sum: The sum of the squares of ``weights``.
"""
if computing_meta:
return x
+ x = cf_asanyarray(x)
+ if weights is not None:
+ weights = cf_asanyarray(weights)
+
# N
d = cf_sample_size_chunk(x, **kwargs)
d["sum"] = sum_weights_chunk(
- x, weights=weights, square=square, N=d["N"], **kwargs
+ x, weights=weights, square=True, N=d["N"], **kwargs
)
return d
@@ -1087,6 +1169,7 @@ def cf_sum_of_weights_chunk(
# --------------------------------------------------------------------
# unique
# --------------------------------------------------------------------
+@actify("unique")
def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs):
"""Chunk calculations for the unique values.
@@ -1110,6 +1193,8 @@ def cf_unique_chunk(x, dtype=None, computing_meta=False, **kwargs):
if computing_meta:
return x
+ x = cf_asanyarray(x)
+
return {"unique": np.unique(x)}
@@ -1148,18 +1233,37 @@ def cf_unique_agg(pairs, axis=None, computing_meta=False, **kwargs):
# --------------------------------------------------------------------
# variance
# --------------------------------------------------------------------
+@actify("var")
def cf_var_chunk(
x, weights=None, dtype="f8", computing_meta=False, ddof=None, **kwargs
):
- """Chunk calculations for the variance.
+ r"""Chunk calculations for the variance.
This function is passed to `dask.array.reduction` as its *chunk*
parameter.
+ For non-overlapping data sets, X_{i}, making up the aggregate data
+ set X=\bigcup _{i}X_{i}, the unweighted variance \sigma ^{2} is
+
+ \mu &={\frac {1}{\sum _{i}{N_{X_{i}}}}}\left(\sum
+ _{i}{N_{X_{i}}\mu _{X_{i}}}\right)
+
+ \sigma ^{2}&={\sqrt {{\frac {1}{\sum
+ _{i}{N_{X_{i}}-ddof}}}\left(\sum _{i}{\left[(N_{X_{i}}-1)\sigma
+ _{X_{i}}^{2}+N_{X_{i}}\mu _{X_{i}}^{2}\right]}-\left[\sum
+ _{i}{N_{X_{i}}}\right]\mu _{X}^{2}\right)}}
+
+    where X_{i}\cap X_{j}=\emptyset , \forall i \neq j.
+
+    >>> a = np.array([[1, 2, 3]])
+ >>> print(cf.data.dask_utils.cf_filled(a, -999))
+ [[1 2 3]]
+ >>> a = np.ma.array([[1, 2, 3]], mask=[[True, False, False]])
+ >>> print(cf.data.dask_utils.cf_filled(a, -999))
+ [[-999 2 3]]
+
+ """
+ a = cf_asanyarray(a)
+ return np.ma.filled(a, fill_value=fill_value)
+
+
+def cf_asanyarray(a):
+ """Convert to a `numpy` array.
+
+ Only do this if the input *a* has an `__asanyarray__` attribute
+ with value True.
+
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
+
+ a: array_like
+ The array.
+
+ :Returns:
+
+ The array converted to a `numpy` array, or the input array
+        unchanged if ``a.__asanyarray__`` is False.
+
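+    **Examples**
+
+    A minimal sketch: a plain `numpy` array has no ``__asanyarray__``
+    attribute, so it is returned unchanged.
+
+    >>> import numpy as np
+    >>> a = np.arange(3)
+    >>> cf_asanyarray(a) is a
+    True
+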
+ """
+ if getattr(a, "__asanyarray__", False):
+ return np.asanyarray(a)
+
+ return a
diff --git a/cf/data/data.py b/cf/data/data.py
index fd84d14bb4..61abf86bc7 100644
--- a/cf/data/data.py
+++ b/cf/data/data.py
@@ -13,7 +13,6 @@
import numpy as np
from cfdm import is_log_level_info
from dask import compute, delayed # noqa: F401
-from dask.array import Array
from dask.array.core import normalize_chunks
from dask.base import collections_to_dsk, is_dask_collection, tokenize
from dask.highlevelgraph import HighLevelGraph
@@ -44,11 +43,15 @@
from ..units import Units
from .collapse import Collapse
from .creation import generate_axis_identifiers, to_dask
+
from .dask_utils import (
_da_ma_allclose,
+ cf_asanyarray,
cf_contains,
cf_dt2rt,
+ cf_filled,
cf_harden_mask,
+ cf_is_masked,
cf_percentile,
cf_rt2dt,
cf_soften_mask,
@@ -370,16 +373,22 @@ def __init__(
super().__init__(
source=source, _use_array=_use_array and array is not None
)
-
if _use_array:
try:
- array = source.to_dask_array()
+ array = source.to_dask_array(_asanyarray=False)
except (AttributeError, TypeError):
- pass
+ try:
+ array = source.to_dask_array()
+ except (AttributeError, TypeError):
+ pass
+ else:
+ self._set_dask(array, copy=copy, clear=_NONE)
else:
- self._set_dask(array, copy=copy, clear=_NONE)
+ self._set_dask(
+ array, copy=copy, clear=_NONE, asanyarray=None
+ )
else:
- self._del_dask(None)
+ self._del_dask(None, clear=_NONE)
# Set the mask hardness
self.hardmask = getattr(source, "hardmask", _DEFAULT_HARDMASK)
@@ -427,6 +436,7 @@ def __init__(
return
# Still here? Then create a dask array and store it.
+ custom = self._custom
# Find out if the input data is compressed by convention
try:
@@ -440,17 +450,18 @@ def __init__(
"for compressed input arrays"
)
+ # Bring the compressed data into memory without
+ # decompressing it
if to_memory:
try:
array = array.to_memory()
except AttributeError:
pass
- try:
- array.get_filenames()
- except AttributeError:
- pass
- else:
+ if self._is_abstract_Array_subclass(array):
+ # Save the input array in case it's useful later. For
+ # compressed input arrays this will contain extra
+ # information, such as a count or index variable.
self._set_Array(array)
# Cast the input data as a dask array
@@ -463,7 +474,20 @@ def __init__(
# Set whether or not we're sure that the Data instance has a
# deterministic name
- self._custom["deterministic"] = not is_dask_collection(array)
+ is_dask = is_dask_collection(array)
+ custom["deterministic"] = not is_dask
+
+ # Set whether or not to call `np.asanyarray` on chunks to
+ # convert them to numpy arrays.
+ if is_dask:
+ # We don't know what's in the dask array, so we should
+ # assume that it might need converting to a numpy array.
+ custom["__asanyarray__"] = True
+ else:
+ # Use the array's __asanyarray__ value, if it has one.
+ custom["__asanyarray__"] = bool(
+ getattr(array, "__asanyarray__", False)
+ )
dx = to_dask(array, chunks, **kwargs)
@@ -487,7 +511,7 @@ def __init__(
self._Units = units
# Store the dask array
- self._set_dask(dx, clear=_NONE)
+ self._set_dask(dx, clear=_NONE, asanyarray=None)
# Override the data type
if dtype is not None:
@@ -621,9 +645,13 @@ def __contains__(self, value):
# are incompatible
return False
- value = value.to_dask_array()
+ # 'cf_contains' has its own calls to 'cf_asanyarray', so
+ # we can set '_asanyarray=False'.
+ value = value.to_dask_array(_asanyarray=False)
- dx = self.to_dask_array()
+ # 'cf_contains' has its own calls to 'cf_asanyarray', so we
+ # can set '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
out_ind = tuple(range(dx.ndim))
dx_ind = out_ind
@@ -756,7 +784,9 @@ def __len__(self):
TypeError: len() of unsized object
"""
- dx = self.to_dask_array()
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
if math.isnan(dx.size):
logger.debug("Computing data len: Performance may be degraded")
dx.compute_chunk_sizes()
@@ -874,10 +904,10 @@ def __getitem__(self, indices):
new = self.roll(
axis=tuple(roll.keys()), shift=tuple(roll.values())
)
- dx = new.to_dask_array()
+ dx = new.to_dask_array(_asanyarray=False)
else:
- new = self.copy(array=False)
- dx = self.to_dask_array()
+ new = self.copy()
+ dx = self.to_dask_array(_asanyarray=False)
# ------------------------------------------------------------
# Subspace the dask array
@@ -923,8 +953,12 @@ def __getitem__(self, indices):
# ------------------------------------------------------------
# Set the subspaced dask array
+ #
+        # * A subspaced chunk might not result in an array in memory,
+        #   so we set asanyarray=True to ensure that, if required, it
+        #   is converted at compute time.
# ------------------------------------------------------------
- new._set_dask(dx)
+ new._set_dask(dx, asanyarray=True)
# ------------------------------------------------------------
# Get the axis identifiers for the subspace
@@ -1117,6 +1151,7 @@ def __setitem__(self, indices, value):
# Do the assignment
self._set_subspace(dx, indices, value)
+ self._set_dask(dx)
# Unroll any axes that were rolled to enable a cyclic
# assignment
@@ -1136,15 +1171,23 @@ def __setitem__(self, indices, value):
self[indices] = reset
- # Remove elements made invalid by updating the `dask` array
- # in-place
- self._clear_after_dask_update(_ALL)
-
return
- # ----------------------------------------------------------------
- # Indexing behaviour attributes
- # ----------------------------------------------------------------
+ @property
+ def __asanyarray__(self):
+ """Whether or not chunks need conversion to `numpy` arrays.
+
+ .. versionadded:: NEXTVERSION
+
+        .. seealso:: `to_dask_array`, `todict`, `_set_dask`
+
+ :Returns:
+
+ `bool`
+
+ """
+ return self._custom.get("__asanyarray__", True)
+
@property
def __orthogonal_indexing__(self):
"""Flag to indicate that orthogonal indexing is supported.
@@ -1323,9 +1366,6 @@ def _clear_after_dask_update(self, clear=_ALL):
* If ``clear & _CFA`` is non-zero then the CFA write
status is set to `False`.
- * If ``clear`` is non-zero then the CFA term status is
- set to `False`.
-
By default *clear* is the ``_ALL`` integer-valued
constant, which results in all components being
removed.
@@ -1361,7 +1401,7 @@ def _clear_after_dask_update(self, clear=_ALL):
# Set the CFA write status to False
self._cfa_del_write()
- def _set_dask(self, array, copy=False, clear=_ALL):
+ def _set_dask(self, dx, copy=False, clear=_ALL, asanyarray=False):
"""Set the dask array.
.. versionadded:: 3.14.0
@@ -1371,7 +1411,7 @@ def _set_dask(self, array, copy=False, clear=_ALL):
:Parameters:
- array: `dask.array.Array`
+ dx: `dask.array.Array`
The array to be inserted.
copy: `bool`, optional
@@ -1384,12 +1424,18 @@ def _set_dask(self, array, copy=False, clear=_ALL):
results in all components being removed. See
`_clear_after_dask_update` for details.
+ asanyarray: `None` or `bool`, optional
+ If `None` then do nothing. Otherwise set the
+ `__asanyarray__` attribute to *asanyarray*.
+
+ .. versionadded:: NEXTVERSION
+
:Returns:
`None`
"""
- if array is NotImplemented:
+ if dx is NotImplemented:
logger.warning(
"WARNING: NotImplemented has been set in the place of a "
"dask array."
@@ -1402,16 +1448,20 @@ def _set_dask(self, array, copy=False, clear=_ALL):
"suitability (such as data type casting, "
"broadcasting, etc.). Note that the exception may be "
"difficult to diagnose, as dask will have silently "
- "trapped it and returned NotImplemented (for "
- "instance, see dask.array.core.elemwise). Print "
+ "trapped it and returned NotImplemented (see, for "
+ "instance, dask.array.core.elemwise). Print "
"statements in a local copy of dask are possibly the "
"way to go if the cause of the error is not obvious."
)
if copy:
- array = array.copy()
+ dx = dx.copy()
+
+ custom = self._custom
+ custom["dask"] = dx
+ if asanyarray is not None:
+ custom["__asanyarray__"] = bool(asanyarray)
- self._custom["dask"] = array
self._clear_after_dask_update(clear)
def _del_dask(self, default=ValueError(), clear=_ALL):
@@ -1513,6 +1563,20 @@ def _get_cached_elements(self):
return cache.copy()
+ def _is_abstract_Array_subclass(self, array):
+ """Whether or not an array is a type of Array.
+
+ :Parameters:
+
+ array:
+
+ :Returns:
+
+ `bool`
+
+ """
+ return isinstance(array, cfdm.Array)
+
def _set_cached_elements(self, elements):
"""Cache selected element values.
@@ -1520,7 +1584,7 @@ def _set_cached_elements(self, elements):
within its ``custom`` dictionary.
.. warning:: Never change ``_custom['cached_elements']``
- in-place.
+ in-place.
.. versionadded:: 3.14.0
@@ -2470,7 +2534,9 @@ def percentile(
else:
axes = tuple(sorted(d._parse_axes(axes)))
- dx = d.to_dask_array()
+ # 'cf_percentile' has its own call to 'cf_asanyarray', so we
+ # can set '_asanyarray=False'.
+ dx = d.to_dask_array(_asanyarray=False)
dtype = dx.dtype
shape = dx.shape
@@ -2535,7 +2601,7 @@ def percentile(
name = name[0]
graph = HighLevelGraph.from_collections(name, dsk, dependencies=[dx])
- dx = Array(graph, name, chunks=out_chunks, dtype=float)
+ dx = da.Array(graph, name, chunks=out_chunks, dtype=float)
d._set_dask(dx)
@@ -2586,11 +2652,9 @@ def persist(self, inplace=False):
"""
d = _inplace_enabled_define_and_cleanup(self)
-
dx = self.to_dask_array()
dx = dx.persist()
d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE)
-
return d
@_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0")
@@ -2628,7 +2692,8 @@ def ceil(self, inplace=False, i=False):
"""
d = _inplace_enabled_define_and_cleanup(self)
dx = d.to_dask_array()
- d._set_dask(da.ceil(dx))
+ dx = da.ceil(dx)
+ d._set_dask(dx)
return d
def cfa_get_term(self):
@@ -2753,7 +2818,8 @@ def compute(self): # noqa: F811
.. versionadded:: 3.14.0
- .. seealso:: `persist`, `array`, `datetime_array`
+ .. seealso:: `persist`, `array`, `datetime_array`,
+ `sparse_array`
:Returns:
@@ -2778,7 +2844,8 @@ def compute(self): # noqa: F811
[0., 0., 0.]])
"""
- a = self.to_dask_array().compute()
+ dx = self.to_dask_array()
+ a = dx.compute()
if np.ma.isMA(a):
if self.hardmask:
@@ -2966,8 +3033,8 @@ def convolution_filter(
dx = d.to_dask_array()
- # Cast to float to ensure that NaNs can be stored (as required
- # by cf_convolve1d)
+ # Cast to float to ensure that NaNs can be stored (so
+ # map_overlap can correctly assign the halos)
if dx.dtype != float:
dx = dx.astype(float, copy=False)
@@ -3153,9 +3220,13 @@ def rechunk(
"""
d = _inplace_enabled_define_and_cleanup(self)
- dx = d.to_dask_array()
+ dx = d.to_dask_array(_asanyarray=False)
dx = dx.rechunk(chunks, threshold, block_size_limit, balance)
- d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE)
+ # Dask rechunking is essentially a wrapper for __getitem__
+ # calls on the chunks, which means that we can use the same
+ # 'asanyarray' and 'clear' keyword values to `_set_dask` as
+        # are used in `__getitem__`.
+ d._set_dask(dx, clear=_ALL ^ _ARRAY ^ _CACHE, asanyarray=True)
return d
@@ -3206,7 +3277,9 @@ def _asdatetime(self, inplace=False):
)
if not d._isdatetime():
- dx = d.to_dask_array()
+ # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we
+ # can set '_asanyarray=False'.
+ dx = d.to_dask_array(_asanyarray=False)
dx = dx.map_blocks(cf_rt2dt, units=units, dtype=object)
d._set_dask(dx)
@@ -3261,7 +3334,9 @@ def _asreftime(self, inplace=False):
)
if d._isdatetime():
- dx = d.to_dask_array()
+ # 'cf_dt2rt' has its own call to 'cf_asanyarray', so we
+ # can set '_asanyarray=False'.
+ dx = d.to_dask_array(_asanyarray=False)
dx = dx.map_blocks(cf_dt2rt, units=units, dtype=float)
d._set_dask(dx)
@@ -3871,7 +3946,9 @@ def _regrid(
f"the shape of the regrid operator: {operator.src_shape}"
)
- dx = self.to_dask_array()
+ # 'regrid' has its own calls to 'cf_asanyarray', so we can set
+ # '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
# Rechunk so that each chunk contains data in the form
# expected by the regrid operator, i.e. the regrid axes all
@@ -3939,7 +4016,7 @@ def _regrid(
)
# Create a regridding function to apply to each chunk
- regrid_func = partial(
+ cf_regrid_func = partial(
regrid,
method=method,
src_shape=src_shape,
@@ -3959,7 +4036,7 @@ def _regrid(
# github.com/pangeo-data/pangeo/issues/334#issuecomment-403787663
dx = dx.map_blocks(
- regrid_func,
+ cf_regrid_func,
weights_dst_mask=weights_dst_mask,
ref_src_mask=src_mask,
chunks=regridded_chunks,
@@ -4113,8 +4190,11 @@ def concatenate(
processed_data.append(data1)
copied = not copy # to avoid making two copies in a given case
- # Get data as dask arrays and apply concatenation operation
- dxs = [d.to_dask_array() for d in processed_data]
+ # Get data as dask arrays and apply concatenation
+ # operation. We can set '_asanyarray=False' because at compute
+ # time the concatenation operation does not need to access the
+ # actual data.
+ dxs = [d.to_dask_array(_asanyarray=False) for d in processed_data]
dx = da.concatenate(dxs, axis=axis)
# Set the CFA write status
@@ -4142,8 +4222,18 @@ def concatenate(
cfa = _NONE
break
+ # Define the __asanyarray__ status
+ asanyarray = processed_data[0].__asanyarray__
+ for d in processed_data[1:]:
+ if d.__asanyarray__ != asanyarray:
+ # If and only if any two input Data objects have
+ # different __asanyarray__ values, then set
+ # asanyarray=True on the concatenation.
+ asanyarray = True
+ break
+
# Set the new dask array
- data0._set_dask(dx, clear=_ALL ^ cfa)
+ data0._set_dask(dx, clear=_ALL ^ cfa, asanyarray=asanyarray)
# Set appropriate cached elements
cached_elements = {}
@@ -4770,7 +4860,7 @@ def _axes(self, value):
# ----------------------------------------------------------------
@property
def chunks(self):
- """The chunk sizes for each dimension.
+ """The `dask` chunk sizes for each dimension.
.. versionadded:: 3.14.0
@@ -4787,7 +4877,9 @@ def chunks(self):
6
"""
- return self.to_dask_array().chunks
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ return self.to_dask_array(_asanyarray=False).chunks
# ----------------------------------------------------------------
# Attributes
@@ -4840,10 +4932,12 @@ def Units(self, value):
else:
dtype = _dtype_float
- func = partial(cf_units, from_units=old_units, to_units=value)
+ cf_func = partial(cf_units, from_units=old_units, to_units=value)
- dx = self.to_dask_array()
- dx = dx.map_blocks(func, dtype=dtype)
+ # 'cf_units' has its own call to 'cf_asanyarray', so we can
+ # set '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
+ dx = dx.map_blocks(cf_func, dtype=dtype)
# Setting equivalent units doesn't affect the CFA write
# status. Nor does it invalidate any cached values, but only
@@ -4854,7 +4948,7 @@ def Units(self, value):
cache = self._get_cached_elements()
if cache:
self._set_cached_elements(
- {index: func(value) for index, value in cache.items()}
+ {index: cf_func(value) for index, value in cache.items()}
)
self._Units = value
@@ -4909,16 +5003,17 @@ def dtype(self):
[1 2 3]
"""
- dx = self.to_dask_array()
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
return dx.dtype
@dtype.setter
def dtype(self, value):
- dx = self.to_dask_array()
-
# Only change the datatype if it's different to that of the
# dask array
- if dx.dtype != value:
+ if self.dtype != value:
+ dx = self.to_dask_array()
dx = dx.astype(value)
self._set_dask(dx)
@@ -5023,18 +5118,15 @@ def is_masked(self):
True
"""
-
- def is_masked(a):
- out = np.ma.is_masked(a)
- return np.array(out).reshape((1,) * a.ndim)
-
- dx = self.to_dask_array()
+ # 'cf_is_masked' has its own call to 'cf_asanyarray', so we
+ # can set '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
out_ind = tuple(range(dx.ndim))
dx_ind = out_ind
dx = da.blockwise(
- is_masked,
+ cf_is_masked,
out_ind,
dx,
dx_ind,
@@ -5071,7 +5163,9 @@ def nbytes(self):
24
"""
- dx = self.to_dask_array()
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
if math.isnan(dx.size):
logger.debug("Computing data nbytes: Performance may be degraded")
dx.compute_chunk_sizes()
@@ -5105,7 +5199,9 @@ def ndim(self):
0
"""
- dx = self.to_dask_array()
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
return dx.ndim
@property
@@ -5127,7 +5223,9 @@ def npartitions(self):
6
"""
- return self.to_dask_array().npartitions
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ return self.to_dask_array(_asanyarray=False).npartitions
@property
def numblocks(self):
@@ -5148,7 +5246,9 @@ def numblocks(self):
6
"""
- return self.to_dask_array().numblocks
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ return self.to_dask_array(_asanyarray=False).numblocks
@property
def shape(self):
@@ -5178,7 +5278,9 @@ def shape(self):
()
"""
- dx = self.to_dask_array()
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
if math.isnan(dx.size):
logger.debug("Computing data shape: Performance may be degraded")
dx.compute_chunk_sizes()
@@ -5217,7 +5319,9 @@ def size(self):
1
"""
- dx = self.to_dask_array()
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
size = dx.size
if math.isnan(size):
logger.debug("Computing data size: Performance may be degraded")
@@ -5268,13 +5372,25 @@ def array(self):
2000-12-01 00:00:00
"""
- array = self.compute().copy()
- if issparse(array):
- array = array.toarray()
- elif not isinstance(array, np.ndarray):
- array = np.asanyarray(array)
+ a = self.compute().copy()
+ if issparse(a):
+ a = a.toarray()
+ elif not isinstance(a, np.ndarray):
+ a = np.asanyarray(a)
+
+ if not a.size:
+ return a
+
+ # Set cached elements
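+        # (Cache the first and last elements; for a 2-d array with a
+        # trailing dimension of 2, such as coordinate bounds, also
+        # cache the second and penultimate elements; for a size-3
+        # array also cache the middle element.)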
+ items = [0, -1]
+ if a.ndim == 2 and a.shape[-1] == 2:
+ items.extend((1, -2))
+ elif a.size == 3:
+ items.append(1)
- return array
+ self._set_cached_elements({i: a.item(i) for i in items})
+
+ return a
@property
def datetime_array(self):
@@ -5424,7 +5540,8 @@ def arctan(self, inplace=False):
d = _inplace_enabled_define_and_cleanup(self)
dx = d.to_dask_array()
- d._set_dask(da.arctan(dx))
+ dx = da.arctan(dx)
+ d._set_dask(dx)
d.override_units(_units_radians, inplace=True)
@@ -5579,7 +5696,8 @@ def arcsinh(self, inplace=False):
d = _inplace_enabled_define_and_cleanup(self)
dx = d.to_dask_array()
- d._set_dask(da.arcsinh(dx))
+ dx = da.arcsinh(dx)
+ d._set_dask(dx)
d.override_units(_units_radians, inplace=True)
@@ -6399,7 +6517,9 @@ def convert_reference_time(
)
d.Units = units0
- dx = d.to_dask_array()
+        # 'cf_rt2dt' has its own call to 'cf_asanyarray', so we can set
+ # '_asanyarray=False'.
+ dx = d.to_dask_array(_asanyarray=False)
# Convert to the correct date-time objects
dx = dx.map_blocks(cf_rt2dt, units=units0, dtype=object)
@@ -6476,8 +6596,11 @@ def get_deterministic_name(self):
raise ValueError()
units = self._Units
+
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
return tokenize(
- self.to_dask_array().name,
+ self.to_dask_array(_asanyarray=False).name,
units.formatted(definition=True, names=True),
units._canonical_calendar,
)
@@ -6542,7 +6665,10 @@ def get_filenames(self):
"""
out = set()
- for a in self.todict().values():
+
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ for a in self.todict(_asanyarray=False).values():
try:
out.update(a.get_filenames())
except AttributeError:
@@ -6675,7 +6801,10 @@ def add_file_location(self, location):
location = abspath(location).rstrip(sep)
updated = False
- dsk = self.todict()
+
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ dsk = self.todict(_asanyarray=False)
for key, a in dsk.items():
try:
dsk[key] = a.add_file_location(location)
@@ -6688,9 +6817,9 @@ def add_file_location(self, location):
updated = True
if updated:
- dx = self.to_dask_array()
+ dx = self.to_dask_array(_asanyarray=False)
dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta)
- self._set_dask(dx, clear=_NONE)
+ self._set_dask(dx, clear=_NONE, asanyarray=None)
return location
@@ -7672,7 +7801,8 @@ def cos(self, inplace=False, i=False):
d.Units = _units_radians
dx = d.to_dask_array()
- d._set_dask(da.cos(dx))
+ dx = da.cos(dx)
+ d._set_dask(dx)
d.override_units(_units_1, inplace=True)
@@ -8115,7 +8245,9 @@ def unique(self, split_every=None):
# in the result.
d.soften_mask()
- dx = d.to_dask_array()
+ # The applicable chunk function will have its own call to
+ # 'cf_asanyarray', so we can set '_asanyarray=False'.
+ dx = d.to_dask_array(_asanyarray=False)
dx = Collapse().unique(dx, split_every=split_every)
d._set_dask(dx)
@@ -8376,7 +8508,6 @@ def equals(
# Apply a (dask) logical 'and' to confirm if both the mask and the
# data are equal for the pair of masked arrays:
result = da.logical_and(data_comparison, mask_comparison)
-
if not result.compute():
if is_log_level_info(logger):
logger.info(
@@ -8419,7 +8550,8 @@ def exp(self, inplace=False, i=False):
d.Units = _units_1
dx = d.to_dask_array()
- d._set_dask(da.exp(dx))
+ dx = da.exp(dx)
+ d._set_dask(dx)
return d
@@ -8478,6 +8610,13 @@ def insert_dimension(self, position=0, inplace=False):
data_axes.insert(position, axis)
d._axes = data_axes
+ # Update the HDF5 chunking strategy
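+        # (If explicit per-dimension chunk sizes have been set, give
+        # the new size-1 axis an HDF5 chunk size of 1.)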
+ chunksizes = d.nc_hdf5_chunksizes()
+ if isinstance(chunksizes, tuple):
+ chunksizes = list(chunksizes)
+ chunksizes.insert(position, 1)
+ d.nc_set_hdf5_chunksizes(chunksizes)
+
return d
@_deprecated_kwarg_check("size", version="3.14.0", removed_at="5.0.0")
@@ -8854,7 +8993,9 @@ def harden_mask(self):
[1 -- 3]
"""
- dx = self.to_dask_array()
+ # 'cf_harden_mask' has its own call to 'cf_asanyarray', so we
+ # can set '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
dx = dx.map_blocks(cf_harden_mask, dtype=self.dtype)
self._set_dask(dx, clear=_NONE)
self.hardmask = True
@@ -8974,7 +9115,9 @@ def soften_mask(self):
[ 1 999 3]
"""
- dx = self.to_dask_array()
+ # 'cf_soften_mask' has its own call to 'cf_asanyarray', so we
+ # can set '_asanyarray=False'.
+ dx = self.to_dask_array(_asanyarray=False)
dx = dx.map_blocks(cf_soften_mask, dtype=self.dtype)
self._set_dask(dx, clear=_NONE)
self.hardmask = False
@@ -9003,7 +9146,9 @@ def file_locations(self):
"""
out = set()
- for key, a in self.todict().items():
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ for key, a in self.todict(_asanyarray=False).items():
try:
out.update(a.file_locations())
except AttributeError:
@@ -9061,8 +9206,10 @@ def filled(self, fill_value=None, inplace=False):
f"data type {d.dtype.str!r}"
)
- dx = d.to_dask_array()
- dx = dx.map_blocks(np.ma.filled, fill_value=fill_value, dtype=d.dtype)
+ # 'cf_filled' has its own call to 'cf_asanyarray', so we can
+ # set '_asanyarray=False'.
+ dx = d.to_dask_array(_asanyarray=False)
+ dx = dx.map_blocks(cf_filled, fill_value=fill_value, dtype=d.dtype)
d._set_dask(dx)
return d
@@ -9690,7 +9837,7 @@ def override_calendar(self, calendar, inplace=False, i=False):
d._Units = Units(d.Units._units, calendar)
return d
- def to_dask_array(self, apply_mask_hardness=False):
+ def to_dask_array(self, apply_mask_hardness=False, _asanyarray=True):
"""Convert the data to a `dask` array.
.. warning:: By default, the mask hardness of the returned
@@ -9714,6 +9861,22 @@ def to_dask_array(self, apply_mask_hardness=False):
If True then force the mask hardness of the returned
array to be that given by the `hardmask` attribute.
+ _asanyarray: `bool`, optional
+ If True (the default) and the `__asanyarray__`
+ attribute is also `True`, then a `cf_asanyarray`
+ operation is added to the graph of the returned Dask
+ array. If False then this operation is not added.
+
+ In general, setting *_asanyarray* to False should only
+ be done if it is known that a) the returned Dask array
+ is never going to be computed; or b) a `cf_asanyarray`
+ operation is not needed because its functionality will be
+ provided by a new Dask graph layer that is going to be
+ created at a later stage. See
+ `cf.data.dask_utils.cf_asanyarray` for further details.
+
+ .. versionadded:: NEXTVERSION
+
:Returns:
`dask.array.Array`
@@ -9736,16 +9899,25 @@ def to_dask_array(self, apply_mask_hardness=False):
dask.array
"""
- if apply_mask_hardness and "dask" in self._custom:
+ dx = self._custom.get("dask")
+ if dx is None:
+ raise ValueError(f"{self.__class__.__name__} object has no data")
+
+ if apply_mask_hardness:
if self.hardmask:
self.harden_mask()
else:
self.soften_mask()
- try:
- return self._custom["dask"]
- except KeyError:
- raise ValueError(f"{self.__class__.__name__} object has no data")
+ dx = self._custom["dask"]
+ # Note: The mask hardness functions have their own calls
+ # to 'cf_asanyarray', so we don't need to worry about
+ # setting another one.
+ elif _asanyarray and self.__asanyarray__:
+ # Add a new cf_asanyarray layer to the output graph
+ dx = dx.map_blocks(cf_asanyarray, dtype=dx.dtype)
+
+ return dx
def datum(self, *index):
"""Return an element of the data array as a standard Python
@@ -10003,7 +10175,10 @@ def del_file_location(self, location):
location = abspath(location).rstrip(sep)
updated = False
- dsk = self.todict()
+
+ # The dask graph is never going to be computed, so we can set
+ # '_asanyarray=False'.
+ dsk = self.todict(_asanyarray=False)
for key, a in dsk.items():
try:
dsk[key] = a.del_file_location(location)
@@ -10016,9 +10191,9 @@ def del_file_location(self, location):
updated = True
if updated:
- dx = self.to_dask_array()
+ dx = self.to_dask_array(_asanyarray=False)
dx = da.Array(dsk, dx.name, dx.chunks, dx.dtype, dx._meta)
- self._set_dask(dx, clear=_NONE)
+ self._set_dask(dx, clear=_NONE, asanyarray=None)
return location
@@ -11184,7 +11359,10 @@ def where(
# Missing values could be affected, so make sure that the mask
# hardness has been applied.
- dx = d.to_dask_array(apply_mask_hardness=True)
+ #
+ # 'cf_where' has its own calls to 'cf_asanyarray', so we can
+ # set '_asanyarray=False'.
+ dx = d.to_dask_array(apply_mask_hardness=True, _asanyarray=False)
units = d.Units
@@ -11199,6 +11377,9 @@ def where(
condition = type(self).asdata(condition)
condition = where_broadcastable(d, condition, "condition")
+ # 'cf_where' has its own calls to 'cf_asanyarray', so we can
+ # set '_asanyarray=False'.
+ condition = condition.to_dask_array(_asanyarray=False)
# If x or y is self then change it to None. This prevents an
# unnecessary copy; and, at compute time, an unnecessary numpy
@@ -11242,9 +11423,7 @@ def where(
x, y = xy
# Apply the where operation
- dx = da.core.elemwise(
- cf_where, dx, da.asanyarray(condition), x, y, d.hardmask
- )
+ dx = da.core.elemwise(cf_where, dx, condition, x, y, d.hardmask)
d._set_dask(dx)
# Don't know (yet) if 'x' and 'y' have a deterministic names
@@ -11305,7 +11484,8 @@ def sin(self, inplace=False, i=False):
d.Units = _units_radians
dx = d.to_dask_array()
- d._set_dask(da.sin(dx))
+ dx = da.sin(dx)
+ d._set_dask(dx)
d.override_units(_units_1, inplace=True)
@@ -11365,7 +11545,8 @@ def sinh(self, inplace=False):
d.Units = _units_radians
dx = d.to_dask_array()
- d._set_dask(da.sinh(dx))
+ dx = da.sinh(dx)
+ d._set_dask(dx)
d.override_units(_units_1, inplace=True)
@@ -11423,7 +11604,8 @@ def cosh(self, inplace=False):
d.Units = _units_radians
dx = d.to_dask_array()
- d._set_dask(da.cosh(dx))
+ dx = da.cosh(dx)
+ d._set_dask(dx)
d.override_units(_units_1, inplace=True)
@@ -11466,10 +11648,10 @@ def cull_graph(self):
('array-21ea057f160746a3d3f0943bba945460', 0): array([1, 2, 3])}
"""
- dx = self.to_dask_array()
+ dx = self.to_dask_array(_asanyarray=False)
dsk, _ = cull(dx.dask, dx.__dask_keys__())
dx = da.Array(dsk, name=dx.name, chunks=dx.chunks, dtype=dx.dtype)
- self._set_dask(dx, clear=_NONE)
+ self._set_dask(dx, clear=_NONE, asanyarray=None)
@_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0")
@_inplace_enabled(default=False)
@@ -11526,7 +11708,8 @@ def tanh(self, inplace=False):
d.Units = _units_radians
dx = d.to_dask_array()
- d._set_dask(da.tanh(dx))
+ dx = da.tanh(dx)
+ d._set_dask(dx)
d.override_units(_units_1, inplace=True)
@@ -11673,6 +11856,14 @@ def squeeze(self, axes=None, inplace=False, i=False):
# Remove the squeezed axes names
d._axes = [axis for i, axis in enumerate(d._axes) if i not in iaxes]
+ # Update the HDF5 chunking strategy
+ chunksizes = d.nc_hdf5_chunksizes()
+ if isinstance(chunksizes, tuple):
+ chunksizes = [
+ size for i, size in enumerate(chunksizes) if i not in iaxes
+ ]
+ d.nc_set_hdf5_chunksizes(chunksizes)
+
return d
@_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0")
@@ -11729,27 +11920,45 @@ def tan(self, inplace=False, i=False):
d.Units = _units_radians
dx = d.to_dask_array()
- d._set_dask(da.tan(dx))
+ dx = da.tan(dx)
+ d._set_dask(dx)
d.override_units(_units_1, inplace=True)
return d
- def todict(self, optimize_graph=True):
+ def todict(
+ self, optimize_graph=True, apply_mask_hardness=False, _asanyarray=True
+ ):
"""Return a dictionary of the dask graph key/value pairs.
.. versionadded:: 3.15.0
- .. seealso:: `to_dask_array`, `tolist`
+ .. seealso:: `to_dask_array`
:Parameters:
- `optimize_graph`: `bool`
+ optimize_graph: `bool`
If True, the default, then prior to being converted to
a dictionary, the graph is optimised to remove unused
chunks. Note that optimising the graph can add a
considerable performance overhead.
+ apply_mask_hardness: `bool`, optional
+ If True then force the mask hardness of the returned
+ array to be that given by the `hardmask` attribute.
+
+ .. versionadded:: NEXTVERSION
+
+ _asanyarray: `bool`, optional
+ If True (the default) and the `__asanyarray__`
+ attribute is also `True`, then a `cf_asanyarray`
+ operation is added to the dictionary representation of
+ the Dask graph. If False then this operation is not
+ added. See `to_dask_array` for details.
+
+ .. versionadded:: NEXTVERSION
+
:Returns:
`dict`
@@ -11775,7 +11984,9 @@ def todict(self, optimize_graph=True):
0), (slice(0, 1, 1),))}
"""
- dx = self.to_dask_array()
+ dx = self.to_dask_array(
+ apply_mask_hardness=apply_mask_hardness, _asanyarray=_asanyarray
+ )
if optimize_graph:
return collections_to_dsk((dx,), optimize_graph=True)
@@ -11897,6 +12108,12 @@ def transpose(self, axes=None, inplace=False, i=False):
d._set_dask(dx)
+ # Update the HDF5 chunking strategy
+ chunksizes = d.nc_hdf5_chunksizes()
+ if isinstance(chunksizes, tuple):
+ chunksizes = [chunksizes[i] for i in axes]
+ d.nc_set_hdf5_chunksizes(chunksizes)
+
return d
@_deprecated_kwarg_check("i", version="3.0.0", removed_at="4.0.0")
@@ -11933,7 +12150,8 @@ def trunc(self, inplace=False, i=False):
"""
d = _inplace_enabled_define_and_cleanup(self)
dx = d.to_dask_array()
- d._set_dask(da.trunc(dx))
+ dx = da.trunc(dx)
+ d._set_dask(dx)
return d
@classmethod
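The `_asanyarray` keyword added in the hunks above is purely an optimisation hint. A minimal sketch of the intended usage, assuming a small in-memory `cf.Data` object:

    import cf

    d = cf.Data([1.0, 2.0, 3.0], "m")

    # The graph is only inspected here, never computed, so the extra
    # cf_asanyarray layer can safely be skipped.
    dsk = d.todict(_asanyarray=False)

    # Computation paths keep the default, so a cf_asanyarray layer is
    # appended whenever the underlying __asanyarray__ attribute is True.
    dx = d.to_dask_array()
    print(dx.compute())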
diff --git a/cf/data/fragment/__init__.py b/cf/data/fragment/__init__.py
index 2ce2dafa60..b7315107d4 100644
--- a/cf/data/fragment/__init__.py
+++ b/cf/data/fragment/__init__.py
@@ -1,3 +1,5 @@
from .fullfragmentarray import FullFragmentArray
+from .h5netcdffragmentarray import H5netcdfFragmentArray
from .netcdffragmentarray import NetCDFFragmentArray
+from .netcdf4fragmentarray import NetCDF4FragmentArray
from .umfragmentarray import UMFragmentArray
diff --git a/cf/data/fragment/fullfragmentarray.py b/cf/data/fragment/fullfragmentarray.py
index eecb50ef16..e2855b3003 100644
--- a/cf/data/fragment/fullfragmentarray.py
+++ b/cf/data/fragment/fullfragmentarray.py
@@ -16,8 +16,7 @@ def __init__(
shape=None,
aggregated_units=False,
aggregated_calendar=False,
- units=False,
- calendar=False,
+ attributes=None,
source=None,
copy=True,
):
@@ -41,17 +40,9 @@ def __init__(
fragment variable in that the latter may have fewer
size 1 dimensions.
- units: `str` or `None`, optional
- The units of the fragment data. Set to `None` to
- indicate that there are no units. If unset then the
- units will be set to `None` during the first
- `__getitem__` call.
+ {{init attributes: `dict` or `None`, optional}}
- calendar: `str` or `None`, optional
- The calendar of the fragment data. Set to `None` to
- indicate the CF default calendar, if applicable. If
- unset then the calendar will be set to `None` during
- the first `__getitem__` call.
+ .. versionadded:: NEXTVERSION
{{aggregated_units: `str` or `None`, optional}}
@@ -61,13 +52,20 @@ def __init__(
{{init copy: `bool`, optional}}
+ units: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
+ calendar: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
"""
super().__init__(
fill_value=fill_value,
dtype=dtype,
shape=shape,
- units=units,
- calendar=calendar,
+ attributes=attributes,
source=source,
copy=False,
)
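The change from separate ``units``/``calendar`` keywords to a single ``attributes`` dictionary applies to all of the fragment array classes below as well. A hedged sketch of the new calling pattern (the attribute values and shapes are made up for illustration):

    import numpy as np

    from cf.data.fragment import FullFragmentArray

    # units and calendar now travel inside one netCDF-style attributes
    # dictionary, alongside any other relevant attributes.
    attributes = {"units": "days since 2000-01-01", "calendar": "360_day"}

    fragment = FullFragmentArray(
        fill_value=0.0,
        dtype=np.dtype("float64"),
        shape=(1, 12),
        attributes=attributes,  # replaces the deprecated units=/calendar=
    )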
diff --git a/cf/data/fragment/h5netcdffragmentarray.py b/cf/data/fragment/h5netcdffragmentarray.py
new file mode 100644
index 0000000000..0b70976c7f
--- /dev/null
+++ b/cf/data/fragment/h5netcdffragmentarray.py
@@ -0,0 +1,97 @@
+from ..array.h5netcdfarray import H5netcdfArray
+from .mixin import FragmentArrayMixin
+
+
+class H5netcdfFragmentArray(FragmentArrayMixin, H5netcdfArray):
+ """A netCDF fragment array accessed with `h5netcdf`.
+
+ .. versionadded:: NEXTVERSION
+
+ """
+
+ def __init__(
+ self,
+ filename=None,
+ address=None,
+ dtype=None,
+ shape=None,
+ aggregated_units=False,
+ aggregated_calendar=False,
+ attributes=None,
+ storage_options=None,
+ source=None,
+ copy=True,
+ ):
+ """**Initialisation**
+
+ :Parameters:
+
+ filename: (sequence of `str`), optional
+ The names of the netCDF fragment files containing the
+ array.
+
+ address: (sequence of `str`), optional
+ The name of the netCDF variable containing the
+ fragment array.
+
+ dtype: `numpy.dtype`, optional
+ The data type of the aggregated array. May be `None`
+ if the numpy data-type is not known (which can be the
+ case for netCDF string types, for example). This may
+ differ from the data type of the netCDF fragment
+ variable.
+
+ shape: `tuple`, optional
+ The shape of the fragment within the aggregated
+ array. This may differ from the shape of the netCDF
+ fragment variable in that the latter may have fewer
+ size 1 dimensions.
+
+ {{init attributes: `dict` or `None`, optional}}
+
+ If *attributes* is `None`, the default, then the
+ attributes will be set from the netCDF variable during
+ the first `__getitem__` call.
+
+ {{aggregated_units: `str` or `None`, optional}}
+
+ {{aggregated_calendar: `str` or `None`, optional}}
+
+ {{init storage_options: `dict` or `None`, optional}}
+
+ {{init source: optional}}
+
+ {{init copy: `bool`, optional}}
+
+ """
+ super().__init__(
+ filename=filename,
+ address=address,
+ dtype=dtype,
+ shape=shape,
+ mask=True,
+ attributes=attributes,
+ storage_options=storage_options,
+ source=source,
+ copy=copy,
+ )
+
+ if source is not None:
+ try:
+ aggregated_units = source._get_component(
+ "aggregated_units", False
+ )
+ except AttributeError:
+ aggregated_units = False
+
+ try:
+ aggregated_calendar = source._get_component(
+ "aggregated_calendar", False
+ )
+ except AttributeError:
+ aggregated_calendar = False
+
+ self._set_component("aggregated_units", aggregated_units, copy=False)
+ self._set_component(
+ "aggregated_calendar", aggregated_calendar, copy=False
+ )
diff --git a/cf/data/fragment/mixin/fragmentarraymixin.py b/cf/data/fragment/mixin/fragmentarraymixin.py
index 544befd337..f02c779002 100644
--- a/cf/data/fragment/mixin/fragmentarraymixin.py
+++ b/cf/data/fragment/mixin/fragmentarraymixin.py
@@ -1,4 +1,4 @@
-from numbers import Integral
+from math import prod
import numpy as np
@@ -12,69 +12,69 @@ class FragmentArrayMixin:
"""
- def __getitem__(self, indices):
- """Returns a subspace of the fragment as a numpy array.
+ def _get_array(self, index=None):
+ """Returns a subspace of the dataset variable.
- x.__getitem__(indices) <==> x[indices]
+ .. versionadded:: NEXTVERSION
- Indexing is similar to numpy indexing, with the following
- differences:
+ .. seealso:: `__array__`, `index`
- * A dimension's index can't be rank-reducing, i.e. it can't
- be an integer, a scalar `numpy` array, nor a scalar `dask`
- array.
+ :Parameters:
- * When two or more dimension's indices are sequences of
- integers then these indices work independently along each
- dimension (similar to the way vector subscripts work in
- Fortran).
+ {{index: `tuple` or `None`, optional}}
- .. versionadded:: 3.15.0
+ It is important that there is a distinct value for each
+ fragment dimension, which is guaranteed when the
+ default of the `index` attribute is being used.
- """
- # TODOACTIVE: modify this for the case when
- # super().__getitem__(tuple(indices)) returns a
- # dictionary
+ :Returns:
+
+ `numpy.ndarray`
+ The subspace.
- indices = self._parse_indices(indices)
+ """
+ if index is None:
+ index = self.index()
try:
- array = super().__getitem__(tuple(indices))
+ array = super()._get_array(index)
except ValueError:
# A ValueError is expected to be raised when the fragment
# variable has fewer than 'self.ndim' dimensions (we know
- # this because because 'indices' has 'self.ndim'
+ # that this is the case because 'index' has 'self.ndim'
# elements).
- axis = self._size_1_axis(indices)
+ axis = self._size_1_axis(index)
if axis is not None:
# There is a unique size 1 index that must correspond
# to the missing dimension => Remove it from the
# indices, get the fragment array with the new
# indices; and then insert the missing size one
# dimension.
- indices.pop(axis)
- array = super().__getitem__(tuple(indices))
+ index = list(index)
+ index.pop(axis)
+ array = super()._get_array(tuple(index))
array = np.expand_dims(array, axis)
else:
# There are multiple size 1 indices so we don't know
# how many missing dimensions the fragment has, nor
# their positions => Get the full fragment array and
# then reshape it to the shape of the dask compute
- # chunk.
- array = super().__getitem__(Ellipsis)
- if array.size != self.size:
+ # chunk; and then apply the index.
+ array = super()._get_array(Ellipsis)
+ if array.size > prod(self.original_shape):
raise ValueError(
f"Can't get CFA fragment data from ({self}) when "
"the fragment has two or more missing size 1 "
"dimensions, whilst also spanning two or more "
- "dask compute chunks."
+ "Dask compute chunks."
"\n\n"
"Consider re-creating the data with exactly one "
- "dask compute chunk per fragment (e.g. by setting "
+ "Dask compute chunk per fragment (e.g. by setting "
"'chunks=None' as a keyword to cf.read)."
)
- array = array.reshape(self.shape)
+ array = array.reshape(self.original_shape)
+ array = array[index]
array = self._conform_to_aggregated_units(array)
return array
@@ -116,8 +116,8 @@ def _conform_to_aggregated_units(self, array):
if isinstance(array, dict):
# 'array' is a dictionary.
raise ValueError(
- "TODOACTIVE. This error is notification of an "
- "unreplaced placeholder for dealing with active "
+ "TODOACTIVE. Placeholder notification that "
+ "we can't yet deal with active "
"storage reductions on CFA fragments."
)
else:
@@ -128,87 +128,6 @@ def _conform_to_aggregated_units(self, array):
return array
- def _parse_indices(self, indices):
- """Parse the indices that retrieve the fragment data.
-
- Ellipses are replaced with the approriate number of `slice`
- instances, and rank-reducing indices (such as an integer or
- scalar array) are disallowed.
-
- .. versionadded:: 3.15.0
-
- :Parameters:
-
- indices: `tuple` or `Ellipsis`
- The array indices to be parsed.
-
- :Returns:
-
- `list`
- The parsed indices.
-
- **Examples**
-
- >>> a.shape
- (12, 1, 73, 144)
- >>> a._parse_indices([2, 4, 5], Ellipsis, slice(45, 67))
- [[2, 4, 5], slice(0, 1), slice(0, 73), slice(45, 67)]
- >>> a._parse_indices([2, 4, 5], [0], slice(None), slice(45, 67))
- [[2, 4, 5], [0], slice(0, 73), slice(45, 67)]
-
- """
- shape = self.shape
- if indices is Ellipsis:
- return [slice(0, n) for n in shape]
-
- indices = list(indices)
-
- # Check indices
- has_ellipsis = False
- for i, (index, n) in enumerate(zip(indices, shape)):
- if isinstance(index, slice):
- if index == slice(None):
- indices[i] = slice(0, n)
-
- continue
-
- if index is Ellipsis:
- has_ellipsis = True
- continue
-
- if isinstance(index, Integral) or not getattr(index, "ndim", True):
- # TODOCFA: what about [] or np.array([])?
-
- # 'index' is an integer or a scalar numpy/dask array
- raise ValueError(
- f"Can't subspace {self.__class__.__name__} with a "
- f"rank-reducing index: {index!r}"
- )
-
- if has_ellipsis:
- # Replace Ellipsis with one or more slices
- indices2 = []
- length = len(indices)
- n = self.ndim
- for index in indices:
- if index is Ellipsis:
- m = n - length + 1
- indices2.extend([slice(None)] * m)
- n -= m
- else:
- indices2.append(index)
- n -= 1
-
- length -= 1
-
- indices = indices2
-
- for i, (index, n) in enumerate(zip(indices, shape)):
- if index == slice(None):
- indices[i] = slice(0, n)
-
- return indices
-
def _size_1_axis(self, indices):
"""Find the position of a unique size 1 index.
@@ -216,6 +135,8 @@ def _size_1_axis(self, indices):
.. seealso:: `_parse_indices`, `__getitem__`
+
:Parameters:
indices: sequence of index
@@ -244,33 +165,11 @@ def _size_1_axis(self, indices):
None
"""
- axis = None
-
- n_size_1 = 0 # Number of size 1 indices
- for i, (index, n) in enumerate(zip(indices, self.shape)):
- try:
- x = index.indices(n)
- if abs(x[1] - x[0]) == 1:
- # Index is a size 1 slice
- n_size_1 += 1
- axis = i
- except AttributeError:
- try:
- if index.size == 1:
- # Index is a size 1 numpy or dask array
- n_size_1 += 1
- axis = i
- except AttributeError:
- if len(index) == 1:
- # Index is a size 1 list
- n_size_1 += 1
- axis = i
-
- if n_size_1 > 1:
- # There are two or more size 1 indices
- axis = None
-
- return axis
+ original_shape = self.original_shape
+ if original_shape.count(1):
+ return original_shape.index(1)
+
+ return
@property
def aggregated_Units(self):
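The reshape/expand_dims logic that the new `_get_array` relies on can be illustrated with plain NumPy (the shapes here are made up):

    import numpy as np

    # Shape of the fragment within the aggregated array (the CFA view).
    original_shape = (1, 6, 7)

    # The on-disk fragment variable is missing the size-1 dimension.
    fragment = np.arange(42.0).reshape(6, 7)

    # A unique size-1 axis can be found from original_shape, so it is
    # simply reinserted ...
    axis = original_shape.index(1)
    assert np.expand_dims(fragment, axis).shape == original_shape

    # ... otherwise the whole fragment is read, reshaped to the expected
    # shape, and only then indexed.
    index = (slice(None), slice(0, 3), slice(None))
    assert fragment.reshape(original_shape)[index].shape == (1, 3, 7)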
diff --git a/cf/data/fragment/netcdf4fragmentarray.py b/cf/data/fragment/netcdf4fragmentarray.py
new file mode 100644
index 0000000000..91f87dc2e8
--- /dev/null
+++ b/cf/data/fragment/netcdf4fragmentarray.py
@@ -0,0 +1,108 @@
+from ..array.netcdf4array import NetCDF4Array
+from .mixin import FragmentArrayMixin
+
+
+class NetCDF4FragmentArray(FragmentArrayMixin, NetCDF4Array):
+ """A netCDF fragment array accessed with `netCDF4`.
+
+ .. versionadded:: NEXTVERSION
+
+ """
+
+ def __init__(
+ self,
+ filename=None,
+ address=None,
+ dtype=None,
+ shape=None,
+ aggregated_units=False,
+ aggregated_calendar=False,
+ attributes=None,
+ storage_options=None,
+ source=None,
+ copy=True,
+ ):
+ """**Initialisation**
+
+ :Parameters:
+
+ filename: (sequence of `str`), optional
+ The names of the netCDF fragment files containing the
+ array.
+
+ address: (sequence of `str`), optional
+ The name of the netCDF variable containing the
+ fragment array.
+
+ dtype: `numpy.dtype`, optional
+ The data type of the aggregated array. May be `None`
+ if the numpy data-type is not known (which can be the
+ case for netCDF string types, for example). This may
+ differ from the data type of the netCDF fragment
+ variable.
+
+ shape: `tuple`, optional
+ The shape of the fragment within the aggregated
+ array. This may differ from the shape of the netCDF
+ fragment variable in that the latter may have fewer
+ size 1 dimensions.
+
+ {{init attributes: `dict` or `None`, optional}}
+
+ If *attributes* is `None`, the default, then the
+ attributes will be set from the netCDF variable during
+ the first `__getitem__` call.
+
+ {{aggregated_units: `str` or `None`, optional}}
+
+ {{aggregated_calendar: `str` or `None`, optional}}
+
+ {{init storage_options: `dict` or `None`, optional}}
+
+ {{init source: optional}}
+
+ {{init copy: `bool`, optional}}
+
+ """
+ super().__init__(
+ filename=filename,
+ address=address,
+ dtype=dtype,
+ shape=shape,
+ mask=True,
+ attributes=attributes,
+ storage_options=storage_options,
+ source=source,
+ copy=copy,
+ )
+
+ if source is not None:
+ try:
+ aggregated_units = source._get_component(
+ "aggregated_units", False
+ )
+ except AttributeError:
+ aggregated_units = False
+
+ try:
+ aggregated_calendar = source._get_component(
+ "aggregated_calendar", False
+ )
+ except AttributeError:
+ aggregated_calendar = False
+
+ self._set_component("aggregated_units", aggregated_units, copy=False)
+ self._set_component(
+ "aggregated_calendar", aggregated_calendar, copy=False
+ )
diff --git a/cf/data/fragment/netcdffragmentarray.py b/cf/data/fragment/netcdffragmentarray.py
index 578412f124..ee34501e94 100644
--- a/cf/data/fragment/netcdffragmentarray.py
+++ b/cf/data/fragment/netcdffragmentarray.py
@@ -1,11 +1,25 @@
-from ..array.netcdfarray import NetCDFArray
+import cfdm
+
+from ..array.abstract import Array
+from ..array.mixin import FileArrayMixin, IndexMixin
+from .h5netcdffragmentarray import H5netcdfFragmentArray
from .mixin import FragmentArrayMixin
+from .netcdf4fragmentarray import NetCDF4FragmentArray
+
+class NetCDFFragmentArray(
+ FragmentArrayMixin,
+ IndexMixin,
+ cfdm.data.mixin.NetCDFFileMixin,
+ FileArrayMixin,
+ cfdm.data.mixin.FileArrayMixin,
+ Array,
+):
+ """A netCDF fragment array.
-class NetCDFFragmentArray(FragmentArrayMixin, NetCDFArray):
- """A CFA fragment array stored in a netCDF file.
+ Access will be with either `netCDF4` or `h5netcdf`.
- .. versionadded:: 3.14.0
+ .. versionadded:: 3.15.0
"""
@@ -17,8 +31,8 @@ def __init__(
shape=None,
aggregated_units=False,
aggregated_calendar=False,
- units=False,
- calendar=None,
+ attributes=None,
+ storage_options=None,
source=None,
copy=True,
):
@@ -47,39 +61,66 @@ def __init__(
fragment variable in that the latter may have fewer
size 1 dimensions.
- units: `str` or `None`, optional
- The units of the fragment data. Set to `None` to
- indicate that there are no units. If unset then the
- units will be set during the first `__getitem__` call.
+ {{init attributes: `dict` or `None`, optional}}
- calendar: `str` or `None`, optional
- The calendar of the fragment data. Set to `None` to
- indicate the CF default calendar, if applicable. If
- unset then the calendar will be set during the first
- `__getitem__` call.
+ If *attributes* is `None`, the default, then the
+ attributes will be set from the netCDF variable during
+ the first `__getitem__` call.
+
+ .. versionadded:: NEXTVERSION
{{aggregated_units: `str` or `None`, optional}}
{{aggregated_calendar: `str` or `None`, optional}}
+ {{init storage_options: `dict` or `None`, optional}}
+
+ .. versionadded:: NEXTVERSION
+
{{init source: optional}}
{{init copy: `bool`, optional}}
+ units: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
+ calendar: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
"""
super().__init__(
- filename=filename,
- address=address,
- dtype=dtype,
- shape=shape,
- mask=True,
- units=units,
- calendar=calendar,
source=source,
copy=copy,
)
if source is not None:
+ try:
+ shape = source._get_component("shape", None)
+ except AttributeError:
+ shape = None
+
+ try:
+ filename = source._get_component("filename", None)
+ except AttributeError:
+ filename = None
+
+ try:
+ address = source._get_component("address", None)
+ except AttributeError:
+ address = None
+
+ try:
+ dtype = source._get_component("dtype", None)
+ except AttributeError:
+ dtype = None
+
+ try:
+ attributes = source._get_component("attributes", None)
+ except AttributeError:
+ attributes = None
+
try:
aggregated_units = source._get_component(
"aggregated_units", False
@@ -94,7 +135,105 @@ def __init__(
except AttributeError:
aggregated_calendar = False
+ try:
+ storage_options = source._get_component(
+ "storage_options", None
+ )
+ except AttributeError:
+ storage_options = None
+
+ if filename is not None:
+ if isinstance(filename, str):
+ filename = (filename,)
+ else:
+ filename = tuple(filename)
+
+ self._set_component("filename", filename, copy=False)
+
+ if address is not None:
+ if isinstance(address, (str, int)):
+ address = (address,)
+ else:
+ address = tuple(address)
+
+ self._set_component("address", address, copy=False)
+
+ if storage_options is not None:
+ self._set_component("storage_options", storage_options, copy=False)
+
+ self._set_component("shape", shape, copy=False)
+ self._set_component("dtype", dtype, copy=False)
+ self._set_component("attributes", attributes, copy=False)
+ self._set_component("mask", True, copy=False)
+
self._set_component("aggregated_units", aggregated_units, copy=False)
self._set_component(
"aggregated_calendar", aggregated_calendar, copy=False
)
+
+ # By default, close the file after data array access
+ self._set_component("close", True, copy=False)
+
+ def _get_array(self, index=None):
+ """Returns a subspace of the dataset variable.
+
+ The method acts as a factory for either a
+ `NetCDF4FragmentArray` or a `H5netcdfFragmentArray` class, and
+ it is the result of calling `!_get_array` on the newly created
+ instance that is returned.
+
+ An `H5netcdfFragmentArray` is only used if access via
+ `NetCDF4FragmentArray` fails with an exception other than
+ `FileNotFoundError`.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `__array__`, `index`
+
+ :Parameters:
+
+ {{index: `tuple` or `None`, optional}}
+
+ It is important that there is a distinct value for each
+ fragment dimension, which is guaranteed when the
+ default of the `index` attribute is being used.
+
+ :Returns:
+
+ `numpy.ndarray`
+ The subspace.
+
+ """
+ kwargs = {
+ "dtype": self.dtype,
+ "shape": self.shape,
+ "aggregated_units": self.get_aggregated_units(None),
+ "aggregated_calendar": self.get_aggregated_calendar(None),
+ "attributes": self.get_attributes(None),
+ "copy": False,
+ }
+
+ # Loop round the files, returning as soon as we find one that
+ # is accessible.
+ filenames = self.get_filenames()
+ for filename, address in zip(filenames, self.get_addresses()):
+ kwargs["filename"] = filename
+ kwargs["address"] = address
+ kwargs["storage_options"] = self.get_storage_options(
+ create_endpoint_url=False
+ )
+
+ try:
+ return NetCDF4FragmentArray(**kwargs)._get_array(index)
+ except FileNotFoundError:
+ pass
+ except Exception:
+ return H5netcdfFragmentArray(**kwargs)._get_array(index)
+
+ # Still here?
+ if not filenames:
+ raise FileNotFoundError("No fragment files")
+
+ if len(filenames) == 1:
+ raise FileNotFoundError(f"No such fragment file: {filenames[0]}")
+
+ raise FileNotFoundError(f"No such fragment files: {filenames}")
diff --git a/cf/data/fragment/umfragmentarray.py b/cf/data/fragment/umfragmentarray.py
index a30737f46d..9168225945 100644
--- a/cf/data/fragment/umfragmentarray.py
+++ b/cf/data/fragment/umfragmentarray.py
@@ -17,8 +17,8 @@ def __init__(
shape=None,
aggregated_units=False,
aggregated_calendar=False,
- units=False,
- calendar=False,
+ attributes=None,
+ storage_options=None,
source=None,
copy=True,
):
@@ -45,33 +45,41 @@ def __init__(
fragment variable in that the latter may have fewer
size 1 dimensions.
- units: `str` or `None`, optional
- The units of the fragment data. Set to `None` to
- indicate that there are no units. If unset then the
- units will be set during the first `__getitem__` call.
+ {{init attributes: `dict` or `None`, optional}}
- calendar: `str` or `None`, optional
- The calendar of the fragment data. Set to `None` to
- indicate the CF default calendar, if applicable. If
- unset then the calendar will be set during the first
- `__getitem__` call.
+ During the first `__getitem__` call, any of the
+ ``_FillValue``, ``add_offset``, ``scale_factor``,
+ ``units``, and ``calendar`` attributes which haven't
+ already been set will be inferred from the lookup
+ header and cached for future use.
+
+ .. versionadded:: NEXTVERSION
{{aggregated_units: `str` or `None`, optional}}
{{aggregated_calendar: `str` or `None`, optional}}
+ {{init storage_options: `dict` or `None`, optional}}
+
{{init source: optional}}
{{init copy: `bool`, optional}}
+ units: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
+ calendar: `str` or `None`, optional
+ Deprecated at version NEXTVERSION. Use the
+ *attributes* parameter instead.
+
"""
super().__init__(
filename=filename,
address=address,
dtype=dtype,
shape=shape,
- units=units,
- calendar=calendar,
+ attributes=attributes,
source=source,
copy=False,
)
diff --git a/cf/data/utils.py b/cf/data/utils.py
index c70d0f1c98..c7982fc857 100644
--- a/cf/data/utils.py
+++ b/cf/data/utils.py
@@ -780,7 +780,7 @@ def collapse(
The function that collapses the underlying `dask` array of
*d*. Must have the minimum signature (parameters and
default values) ``func(dx, axis=None, keepdims=False,
- mtol=None, split_every=None)`` (optionally including
+ mtol=1, split_every=None)`` (optionally including
``weights=None`` or ``ddof=None``), where ``dx`` is the
dask array contained in *d*.
@@ -829,23 +829,29 @@ def collapse(
mtol: number, optional
The sample size threshold below which collapsed values are
set to missing data. It is defined as a fraction (between
- 0 and 1 inclusive) of the contributing input data values.
+ 0 and 1 inclusive) of the contributing input data
+ values. A missing value in the output array occurs
+ whenever more than ``100*mtol%`` of its contributing input
+ array elements are missing data.
- The default of *mtol* is 1, meaning that a missing datum
+ The default of *mtol* is 1, meaning that a missing value
in the output array occurs whenever all of its
contributing input array elements are missing data.
- For other values, a missing datum in the output array
- occurs whenever more than ``100*mtol%`` of its
- contributing input array elements are missing data.
+ Note that for non-zero values of *mtol*, different
+ collapsed elements may have different sample sizes,
+ depending on the distribution of missing data in the input
+ data.
ddof: number, optional
- The delta degrees of freedom. The number of degrees of
- freedom used in the calculation is (N-*ddof*) where N
- represents the number of non-missing elements.
+ The delta degrees of freedom, a non-negative number. The
+ number of degrees of freedom used in the calculation is
+ ``N-ddof`` where ``N`` is the number of non-missing
+ elements. A value of 1 applies Bessel's correction. If the
+ calculation is weighted then *ddof* can only be 0 or 1.
- For collapse functions that do not have a ``ddof``
- parameter, *ddof* must be `None`.
+ For collapse functions for which delta degrees of freedom
+ is not applicable (such as `max`), *ddof* must be `None`.
split_every: `int` or `dict`, optional
Determines the depth of the recursive aggregation. See
@@ -872,7 +878,11 @@ def collapse(
if ddof is not None:
kwargs["ddof"] = ddof
- dx = d.to_dask_array()
+ # The applicable chunk function will have its own call to
+ # 'cf_asanyarray', so we can set '_asanyarray=False'. Also, setting
+ # _asanyarray=False will ensure that any active storage operations
+ # are not compromised.
+ dx = d.to_dask_array(_asanyarray=False)
dx = func(dx, **kwargs)
d._set_dask(dx)
@@ -985,8 +995,9 @@ def parse_weights(d, weights, axis=None):
w = []
shape = d.shape
axes = d._axes
+ Data = type(d)
for key, value in weights.items():
- value = type(d).asdata(value)
+ value = Data.asdata(value)
# Make sure axes are in ascending order
if key != tuple(sorted(key)):
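The `mtol` and `ddof` semantics documented above reduce to simple arithmetic; a small NumPy illustration (the sample values are made up):

    import numpy as np

    values = np.ma.masked_invalid([1.0, np.nan, np.nan, 4.0])
    missing_fraction = np.ma.count_masked(values) / values.size  # 0.5

    # mtol: the collapsed value is masked when more than 100*mtol% of the
    # contributing values are missing.
    print(missing_fraction > 1)     # False -> kept with the default mtol=1
    print(missing_fraction > 0.25)  # True  -> masked with mtol=0.25

    # ddof: the variance denominator is N - ddof, where N counts only the
    # non-missing values; ddof=1 applies Bessel's correction.
    N = values.count()                              # 2 non-missing values
    print(values.var(ddof=0), values.var(ddof=1))   # divide by 2, then by 1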
diff --git a/cf/docstring/docstring.py b/cf/docstring/docstring.py
index b54a9c82ed..a11e0129b9 100644
--- a/cf/docstring/docstring.py
+++ b/cf/docstring/docstring.py
@@ -282,14 +282,12 @@
The sample size threshold below which collapsed values
are set to missing data. It is defined as a fraction
(between 0 and 1 inclusive) of the contributing input
- data values.
-
- The default of *mtol* is 1, meaning that a missing
- datum in the output array occurs whenever all of its
+ data values. A missing value in the output array
+ occurs whenever more than ``100*mtol%`` of its
contributing input array elements are missing data.
- For other values, a missing datum in the output array
- occurs whenever more than ``100*mtol%`` of its
+ The default of *mtol* is 1, meaning that a missing
+ value in the output array occurs whenever all of its
contributing input array elements are missing data.
Note that for non-zero values of *mtol*, different
@@ -300,10 +298,10 @@
"{{ddof: number}}": """ddof: number
The delta degrees of freedom, a non-negative
number. The number of degrees of freedom used in the
- calculation is (N-*ddof*) where N represents the
- number of non-missing elements. A value of 1 applies
- Bessel's correction. If the calculation is weighted
- then *ddof* can only be 0 or 1.""",
+ calculation is ``N-ddof`` where ``N`` is the number of
+ non-missing elements. A value of 1 applies Bessel's
+ correction. If the calculation is weighted then *ddof*
+ can only be 0 or 1.""",
# split_every
"{{split_every: `int` or `dict`, optional}}": """split_every: `int` or `dict`, optional
Determines the depth of the recursive aggregation. If
@@ -324,11 +322,24 @@
value. A default can also be set globally with the
``split_every`` key in `dask.config`. See
`dask.array.reduction` for details.""",
+ # active_storage
+ "{{active_storage: `bool`, optional}}": """{{active_storage: `bool`, optional}}
+ If True then attempt to perform the collapse using
+ active storage reductions. However, if other necessary
+ conditions are not met (see `Collapse` for details)
+ then the reduction will be carried out locally, as
+ usual. When an active storage reduction on a chunk
+ fails at compute time, the reduction for that chunk is
+ carried out locally.
+
+ If False, the default, then the reduction will be
+ carried out locally.""",
# Collapse chunk_function
- "{{chunk_function: callable, optional}}": """{{chunk_function: callable, optional}}
+ "{{chunk_function: callable or `None`, optional}}": """{{chunk_function: callable or `None`, optional}}
Provides the ``chunk`` parameter to
- `dask.array.reduction`. If unset then an approriate
- default function will be used.""",
+ `dask.array.reduction`. If `None`, the default, then
+ an appropriate default function from
+ `cf.data.collapse.dask_collapse` will be used.""",
# Collapse weights
"{{Collapse weights: data_like or `None`, optional}}": """weights: data_like or `None`, optional
Weights associated with values of the array. By
@@ -621,6 +632,10 @@
"{{to_size: `int`, optional}}": """to_size: `int`, optional
Pad the axis after so that the new axis has the given
size.""",
+ # _get_array index
+ "{{index: `tuple` or `None`, optional}}": """index: `tuple` or `None`, optional
+ Provide the indices that define the subspace. If `None`
+ then the `index` attribute is used.""",
# subspace config options
"{{config: optional}}": """config: optional
Configure the subspace by specifying the mode of
diff --git a/cf/domain.py b/cf/domain.py
index 1fbdb73e1c..8889fdf97a 100644
--- a/cf/domain.py
+++ b/cf/domain.py
@@ -707,7 +707,7 @@ def identities(self):
equals e.g. ``'foo=bar'``.
* The netCDF variable name, preceded by ``'ncvar%'``.
- .. versionadded:: (cfdm) 1.9.0.0
+ .. versionadded:: 3.8.0
.. seealso:: `identity`
diff --git a/cf/field.py b/cf/field.py
index f6c90d215e..d73e2b7b49 100644
--- a/cf/field.py
+++ b/cf/field.py
@@ -547,8 +547,6 @@ def __getitem__(self, indices):
autocyclic={"no-op": True},
)
- new.set_data(new_data, axes=data_axes, copy=False)
-
return new
def __setitem__(self, indices, value):
@@ -5397,18 +5395,20 @@ def collapse(
**Collapse weights**
- The calculations of means, standard deviations and variances are,
- by default, **not weighted**. For weights to be incorporated in
- the collapse, the axes to be weighted must be identified with the
- *weights* keyword.
+ .. warning:: By default, the collapse calculations are **not**
+ weighted.
+
+ For weights to be incorporated in the collapse,
+ the *weights* keyword must be set.
Weights are either derived from the field construct's metadata
- (such as cell sizes), or may be provided explicitly in the form of
- other field constructs containing data of weights values. In
- either case, the weights actually used are those derived by the
- `weights` method of the field construct with the same weights
- keyword value. Collapsed axes that are not identified by the
- *weights* keyword are unweighted during the collapse operation.
+ (such as cell sizes), or may be provided explicitly in the
+ form of other field constructs containing data of weights
+ values. In either case, the weights actually used are those
+ derived by the `weights` method of the field construct with
+ the same *weights* keyword value. Collapsed axes that are not
+ identified by the *weights* keyword are unweighted during the
+ collapse operation.
*Example:*
Create a weighted time average:
@@ -5598,6 +5598,48 @@ def collapse(
... within_years=cf.seasons(), weights=True)
+ **Active storage collapses**
+
+ When the data being collapsed are stored remotely, the
+ collapse calculations may be carried out on a server that is,
+ ideally, close (in a network distance sense) to the data,
+ thereby removing the time and energy costs of transferring the
+ entire un-collapsed data to the local client. Whether or not
+ this will occur is determined on a case-by-case basis, and
+ will only be done if all of the following criteria are met:
+
+ * the collapse method is one of ``'mean'``, ``'maximum'``,
+ ``'minimum'``, or ``'sum'``;
+
+ * the collapse is over all axes;
+
+ * the collapse is unweighted;
+
+ * ``cf.active_storage()`` is `True`;
+
+ * a URL of the active storage server has been set with
+ `cf.active_storage_url`;
+
+ * the data values are in netCDF-4 files on disk (rather than
+ in any other file format, or in memory) and are not
+ numerically packed;
+
+ * the `!active_storage` attribute of the field's `Data` object
+ is `True`, indicating that active storage operations may be
+ possible. In general, it will only be `True` for data that
+ are in files on disk, are not compressed by convention and
+ have not had any other operations applied, apart from
+ subspacing;
+
+ * it is possible to import the external `activestorage.Active`
+ class.
+
+ The performance improvements from using active storage
+ operations will increase the closer the active storage server
+ is to the data storage. If the active storage server is
+ sufficiently far away from the data then it may be faster to
+ do a normal, non-active operation.
+
.. versionadded:: 1.0
.. seealso:: `bin`, `cell_area`, `convolution_filter`,
@@ -7058,10 +7100,13 @@ def collapse(
"collapse"
)
+ # Note: It is important that size 1 axes are also passed
+ # on to the Data collapse, because active storage
+ # collapses get confused if they're not there.
data_axes = f.get_data_axes()
iaxes = [
data_axes.index(axis)
- for axis in collapse_axes
+ for axis in collapse_axes_all_sizes
if axis in data_axes
]
@@ -7367,7 +7412,7 @@ def _collapse_grouped(
group=None,
group_span=None,
group_contiguous=False,
- mtol=None,
+ mtol=1,
ddof=None,
regroup=None,
coordinate=None,
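The active-storage criteria documented in the new collapse docstring text above amount to a short recipe. A sketch, assuming the optional `activestorage` library is installed, a placeholder reducer URL, and a field whose data live in an uncompressed, unpacked netCDF-4 file (the S3 location is also a placeholder):

    import cf

    f = cf.read("s3://bucket/data.nc")[0]

    with cf.active_storage(True), cf.active_storage_url(
        "https://reducer.example.org"
    ):
        # Unweighted collapse over all axes with a supported method, so an
        # active storage reduction is attempted; if any condition is not
        # met the collapse falls back to a normal local Dask reduction.
        result = f.collapse("mean")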
diff --git a/cf/functions.py b/cf/functions.py
index d186774d02..22820bc3db 100644
--- a/cf/functions.py
+++ b/cf/functions.py
@@ -2,11 +2,11 @@
import csv
import ctypes.util
import importlib
+import logging
import os
import platform
import re
import sys
-import urllib.parse
import warnings
from collections.abc import Iterable
from itertools import product
@@ -19,7 +19,7 @@
from os.path import expandvars as _os_path_expandvars
from os.path import join as _os_path_join
from os.path import relpath as _os_path_relpath
-from urllib.parse import urlparse
+from urllib.parse import urljoin, urlparse
import cfdm
import netCDF4
@@ -172,6 +172,9 @@ def configuration(
regrid_logging=None,
relaxed_identities=None,
bounds_combination_mode=None,
+ active_storage=None,
+ active_storage_url=None,
+ active_storage_max_requests=None,
of_fraction=None,
collapse_parallel_mode=None,
free_memory_factor=None,
@@ -190,6 +193,9 @@ def configuration(
* `regrid_logging`
* `relaxed_identities`
* `bounds_combination_mode`
+ * `active_storage`
+ * `active_storage_url`
+ * `active_storage_max_requests`
These are all constants that apply throughout cf, except for in
specific functions only if overridden by the corresponding keyword
@@ -208,7 +214,9 @@ def configuration(
.. seealso:: `atol`, `rtol`, `tempdir`, `chunksize`,
`total_memory`, `log_level`, `regrid_logging`,
- `relaxed_identities`, `bounds_combination_mode`
+ `relaxed_identities`, `bounds_combination_mode`,
+ `active_storage`, `active_storage_url`,
+ `active_storage_max_requests`
:Parameters:
@@ -260,6 +268,24 @@ def configuration(
construct identity. The default is to not change the
current value.
+ active_storage: `bool` or `Constant`, optional
+ The new value (either True to enable active storage
+ reductions or False to disable them). The default is to
+ not change the current behaviour.
+
+ .. versionadded:: NEXTVERSION
+
+ active_storage_url: `str` or `None` or `Constant`, optional
+ The new value (either a new URL string or `None` to remove
+ the URL). The default is to not change the value.
+
+ .. versionadded:: NEXTVERSION
+
+ active_storage_max_requests: `int` or `Constant`, optional
+ The new value. The default is to not change the value.
+
+ .. versionadded:: NEXTVERSION
+
of_fraction: `float` or `Constant`, optional
Deprecated at version 3.14.0 and is no longer
available.
@@ -289,7 +315,10 @@ def configuration(
'relaxed_identities': False,
'log_level': 'WARNING',
'bounds_combination_mode': 'AND',
- 'chunksize': 82873466.88000001}
+ 'chunksize': 82873466.88000001,
+ 'active_storage': False,
+ 'active_storage_url': None,
+ 'active_storage_max_requests': 100}
>>> cf.chunksize(7.5e7) # any change to one constant...
82873466.88000001
>>> cf.configuration()['chunksize'] # ...is reflected in the configuration
@@ -303,7 +332,10 @@ def configuration(
'relaxed_identities': False,
'log_level': 'WARNING',
'bounds_combination_mode': 'AND',
- 'chunksize': 75000000.0}
+ 'chunksize': 75000000.0,
+ 'active_storage': False,
+ 'active_storage_url': None,
+ 'active_storage_max_requests': 100}
>>> cf.configuration() # the items set have been updated accordingly
{'rtol': 2.220446049250313e-16,
'atol': 2.220446049250313e-16,
@@ -312,7 +344,10 @@ def configuration(
'relaxed_identities': False,
'log_level': 'INFO',
'bounds_combination_mode': 'AND',
- 'chunksize': 75000000.0}
+ 'chunksize': 75000000.0,
+ 'active_storage': False,
+ 'active_storage_url': None,
+ 'active_storage_max_requests': 100}
Use as a context manager:
@@ -324,7 +359,9 @@ def configuration(
'relaxed_identities': False,
'log_level': 'INFO',
'bounds_combination_mode': 'AND',
- 'chunksize': 75000000.0}
+ 'chunksize': 75000000.0,
+ 'active_storage': False,
+ 'active_storage_url': None}
>>> with cf.configuration(atol=9, rtol=10):
... print(cf.configuration())
...
@@ -335,7 +372,10 @@ def configuration(
'relaxed_identities': False,
'log_level': 'INFO',
'bounds_combination_mode': 'AND',
- 'chunksize': 75000000.0}
+ 'chunksize': 75000000.0,
+ 'active_storage': False,
+ 'active_storage_url': None,
+ 'active_storage_max_requests': 100}
>>> print(cf.configuration())
{'rtol': 2.220446049250313e-16,
'atol': 2.220446049250313e-16,
@@ -344,7 +384,10 @@ def configuration(
'relaxed_identities': False,
'log_level': 'INFO',
'bounds_combination_mode': 'AND',
- 'chunksize': 75000000.0}
+ 'chunksize': 75000000.0,
+ 'active_storage': False,
+ 'active_storage_url': None,
+ 'active_storage_max_requests': 100}
"""
if of_fraction is not None:
@@ -375,6 +418,9 @@ def configuration(
new_regrid_logging=regrid_logging,
new_relaxed_identities=relaxed_identities,
bounds_combination_mode=bounds_combination_mode,
+ active_storage=active_storage,
+ active_storage_url=active_storage_url,
+ active_storage_max_requests=active_storage_max_requests,
)
@@ -424,6 +470,9 @@ def _configuration(_Configuration, **kwargs):
"new_regrid_logging": regrid_logging,
"new_relaxed_identities": relaxed_identities,
"bounds_combination_mode": bounds_combination_mode,
+ "active_storage": active_storage,
+ "active_storage_url": active_storage_url,
+ "active_storage_max_requests": active_storage_max_requests,
}
old_values = {}
@@ -553,11 +602,17 @@ class regrid_logging(ConstantAccess):
**Examples**
- >>> cf.regrid_logging()
+ >>> print(cf.regrid_logging())
+ False
+ >>> print(cf.regrid_logging(True))
False
- >>> cf.regrid_logging(True)
+ >>> print(cf.regrid_logging())
+ True
+ >>> with cf.regrid_logging(False):
+ ... print(cf.regrid_logging())
+ ...
False
- >>> cf.regrid_logging()
+ >>> print(cf.regrid_logging())
True
"""
@@ -682,13 +737,19 @@ class relaxed_identities(ConstantAccess):
>>> org = cf.relaxed_identities()
>>> org
False
- >>> cf.relaxed_identities(True)
+ >>> print(cf.relaxed_identities(True))
False
- >>> cf.relaxed_identities()
+ >>> print(cf.relaxed_identities())
+ True
+ >>> print(cf.relaxed_identities(org))
True
- >>> cf.relaxed_identities(org)
+ >>> print(cf.relaxed_identities())
+ False
+ >>> with cf.relaxed_identities(True):
+ ... print(cf.relaxed_identities())
+ ...
True
- >>> cf.relaxed_identities()
+ >>> print(cf.relaxed_identities())
False
"""
@@ -805,18 +866,24 @@ class tempdir(ConstantAccess):
:Returns:
- `str`
- The directory prior to the change, or the current
- directory if no new value was specified.
+ `Constant`
+ The directory name prior to the change, or the name of the
+ current directory if no new value was specified.
**Examples**
- >>> cf.tempdir()
+ >>> print(cf.tempdir())
'/tmp'
>>> old = cf.tempdir('/home/me/tmp')
- >>> cf.tempdir(old)
+ >>> print(cf.tempdir(old))
'/home/me/tmp'
- >>> cf.tempdir()
+ >>> print(cf.tempdir())
+ '/tmp'
+ >>> with cf.tempdir('~/NEW_TMP'):
+ ... print(cf.tempdir())
+ ...
+ /home/me/NEW_TMP
+ >>> print(cf.tempdir())
'/tmp'
"""
@@ -1082,11 +1149,6 @@ class bounds_combination_mode(ConstantAccess):
OR
>>> print(cf.bounds_combination_mode(old))
OR
- >>> print(cf.bounds_combination_mode())
- AND
-
- Use as a context manager:
-
>>> print(cf.bounds_combination_mode())
AND
>>> with cf.bounds_combination_mode('XOR'):
@@ -1135,6 +1197,218 @@ def _parse(cls, arg):
return arg
+class active_storage(ConstantAccess):
+ """Whether or not to attempt active storage reductions.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `active_storage_max_requests`, `active_storage_url`,
+ `configuration`
+
+ :Parameters:
+
+ arg: `bool` or `Constant`, optional
+ Provide a value that will apply to all subsequent
+ operations.
+
+ :Returns:
+
+ `Constant`
+ The value prior to the change, or the current value if no
+ new value was specified.
+
+ **Examples**
+
+ >>> cf.active_storage()
+ False
+ >>> with cf.active_storage(True):
+ ... print(cf.active_storage())
+ ...
+ True
+ >>> cf.active_storage()
+ False
+ >>> cf.active_storage(True)
+ False
+ >>> cf.active_storage()
+ True
+
+ """
+
+ _name = "active_storage"
+
+ def _parse(cls, arg):
+ """Parse a new constant value.
+
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
+
+ cls:
+ This class.
+
+ arg:
+ The given new constant value.
+
+ :Returns:
+
+ A version of the new constant value suitable for
+ insertion into the `CONSTANTS` dictionary.
+
+ """
+ try:
+ from activestorage import Active # noqa: F401
+ except ModuleNotFoundError as error:
+ if arg:
+ raise ModuleNotFoundError(
+ f"Can't enable active storage operations: {error}"
+ )
+
+ return bool(arg)
+
+
+class active_storage_url(ConstantAccess):
+ """The URL location of the active storage reducer.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `active_storage`, `active_storage_max_requests`,
+ `configuration`
+
+ :Parameters:
+
+ arg: `str` or `None` or `Constant`, optional
+ Provide a value that will apply to all subsequent
+ operations.
+
+ :Returns:
+
+ `Constant`
+ The value prior to the change, or the current value if no
+ new value was specified.
+
+ **Examples**
+
+ >>> print(cf.active_storage_url())
+ None
+ >>> with cf.active_storage_url('http://active/storage/location'):
+ ... print(cf.active_storage_url())
+ ...
+ 'http://active/storage/location'
+ >>> print(cf.active_storage_url())
+ None
+ >>> print(cf.active_storage_url('http://other/location'))
+ None
+ >>> cf.active_storage_url()
+ 'http://other/location'
+
+ """
+
+ _name = "active_storage_url"
+
+ def _parse(cls, arg):
+ """Parse a new constant value.
+
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
+
+ cls:
+ This class.
+
+ arg:
+ The given new constant value.
+
+ :Returns:
+
+ A version of the new constant value suitable for
+ insertion into the `CONSTANTS` dictionary.
+
+ """
+ if arg is None:
+ return arg
+
+ return str(arg)
+
+
+class active_storage_max_requests(ConstantAccess):
+ """Concurrent active storage server requests per `dask` chunk.
+
+ This is the maximum number of concurrent requests per `dask` chunk
+ that are sent to the active storage server by an `Active`
+ instance. The default is ``100``. The optimum number may be
+ estimated by :math:`N = N_{max} / \min(N_{PE}, N_{chunk})`, where
+
+ * :math:`N_{max}` is the maximum number of requests that the
+ active storage server can process concurrently before its
+ performance is degraded;
+
+ * :math:`N_{PE}` is the number of available PEs on the local client;
+
+ * :math:`N_{chunk}` is the number of `dask` chunks being reduced with
+ active storage.
+
+ This formula only applies to cases where all `dask` chunks for the
+ collapse operation are utilising active storage. If some are not
+ then :math:`N` will likely be underestimated.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `active_storage`, `active_storage_url`,
+ `configuration`
+
+ :Parameters:
+
+ arg: `int` or `Constant`, optional
+ Provide a value that will apply to all subsequent
+ operations.
+
+ :Returns:
+
+ `Constant`
+ The value prior to the change, or the current value if no
+ new value was specified.
+
+ **Examples**
+
+ >>> print(cf.active_storage_max_requests())
+ 100
+ >>> with cf.active_storage_max_requests(25):
+ ... print(cf.active_storage_max_requests())
+ ...
+ 25
+ >>> print(cf.active_storage_max_requests())
+ 100
+ >>> print(cf.active_storage_max_requests(12))
+ 100
+ >>> cf.active_storage_max_requests()
+ 12
+
+ """
+
+ _name = "active_storage_max_requests"
+
+ def _parse(cls, arg):
+ """Parse a new constant value.
+
+ .. versionadded:: NEXTVERSION
+
+ :Parameters:
+
+ cls:
+ This class.
+
+ arg:
+ The given new constant value.
+
+ :Returns:
+
+ A version of the new constant value suitable for
+ insertion into the `CONSTANTS` dictionary.
+
+ """
+ return int(arg)
+
+
def CF():
"""The version of the CF conventions.
@@ -1275,6 +1549,27 @@ def total_memory():
return CONSTANTS["TOTAL_MEMORY"]
+def is_log_level_info(logger):
+ """Return True if and only if log level is at least as verbose as INFO.
+
+ .. versionadded:: NEXTVERSION
+
+ .. seealso:: `log_level`
+
+ :Parameters:
+
+ logger: `logging.Logger`
+ The logger in use.
+
+ :Returns:
+
+ `bool`
+ Whether or not the log level is at least INFO.
+
+ """
+ return logger.parent.level <= logging.INFO
+
+
# --------------------------------------------------------------------
# Aliases (for back-compatibility etc.):
# --------------------------------------------------------------------
@@ -2143,61 +2438,6 @@ def normalize_slice(index, size, cyclic=False):
return slice(start, stop, step)
-def get_subspace(array, indices):
- """Return a subspace defined by the given indices of an array.
-
- Subset the input numpy array with the given indices. Indexing is
- similar to that of a numpy array. The differences to numpy array
- indexing are:
-
- 1. An integer index i takes the i-th element but does not reduce
- the rank of the output array by one.
-
- 2. When more than one dimension's slice is a 1-d boolean array or
- 1-d sequence of integers then these indices work independently
- along each dimension (similar to the way vector subscripts work
- in Fortran).
-
- Indices must contain an index for each dimension of the input array.
-
- :Parameters:
-
- array: `numpy.ndarray`
-
- indices: `list`
-
- """
- gg = [i for i, x in enumerate(indices) if not isinstance(x, slice)]
- len_gg = len(gg)
-
- if len_gg < 2:
- # ------------------------------------------------------------
- # At most one axis has a list-of-integers index so we can do a
- # normal numpy subspace
- # ------------------------------------------------------------
- return array[tuple(indices)]
-
- else:
- # ------------------------------------------------------------
- # At least two axes have list-of-integers indices so we can't
- # do a normal numpy subspace
- # ------------------------------------------------------------
- if np.ma.isMA(array):
- take = np.ma.take
- else:
- take = np.take
-
- indices = indices[:]
- for axis in gg:
- array = take(array, indices[axis], axis=axis)
- indices[axis] = slice(None)
-
- if len_gg < len(indices):
- array = array[tuple(indices)]
-
- return array
-
-
_equals = cfdm.Data()._equals
@@ -2646,7 +2886,7 @@ def relpath(filename, start=None):
'http://data/archive/file.nc'
"""
- u = urllib.parse.urlparse(filename)
+ u = urlparse(filename)
if u.scheme != "":
return filename
@@ -2684,7 +2924,7 @@ def dirname(filename):
'http://data/archive'
"""
- u = urllib.parse.urlparse(filename)
+ u = urlparse(filename)
if u.scheme != "":
return filename.rpartition("/")[0]
@@ -2723,9 +2963,9 @@ def pathjoin(path1, path2):
'http://data/archive/file.nc'
"""
- u = urllib.parse.urlparse(path1)
+ u = urlparse(path1)
if u.scheme != "":
- return urllib.parse.urljoin(path1, path2)
+ return urljoin(path1, path2)
return _os_path_join(path1, path2)
@@ -3118,44 +3358,50 @@ def environment(display=True, paths=True):
**Examples**
>>> cf.environment()
- Platform: Linux-4.15.0-54-generic-x86_64-with-glibc2.10
- HDF5 library: 1.10.6
- netcdf library: 4.8.0
- udunits2 library: /home/username/anaconda3/envs/cf-env/lib/libudunits2.so.0
- esmpy/ESMF: 8.4.1 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/esmpy/__init__.py
- Python: 3.8.10 /home/username/anaconda3/envs/cf-env/bin/python
- dask: 2022.6.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/dask/__init__.py
- netCDF4: 1.5.6 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/netCDF4/__init__.py
- psutil: 5.9.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/psutil/__init__.py
- packaging: 21.3 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/packaging/__init__.py
- numpy: 1.22.2 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/numpy/__init__.py
- scipy: 1.10.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/scipy/__init__.py
- matplotlib: 3.4.3 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/matplotlib/__init__.py
- cftime: 1.6.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cftime/__init__.py
- cfunits: 3.3.6 /home/username/cfunits/cfunits/__init__.py
- cfplot: 3.1.18 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cfplot/__init__.py
- cfdm: 1.10.1.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cfdm/__init__.py
- cf: 3.14.0 /home/username/anaconda3/envs/cf-env/lib/python3.8/site-packages/cf/__init__.py
+ Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.35
+ HDF5 library: 1.12.2
+ netcdf library: 4.9.3-development
+ udunits2 library: /home/user/lib/libudunits2.so.0
+ esmpy/ESMF: 8.6.1 /home/user/lib/python3.12/site-packages/esmpy/__init__.py
+ Python: 3.12.2 /home/user/bin/python
+ dask: 2024.6.0 /home/user/lib/python3.12/site-packages/dask/__init__.py
+ netCDF4: 1.6.5 /home/user/lib/python3.12/site-packages/netCDF4/__init__.py
+ h5netcdf: 1.3.0 /home/user/lib/python3.12/site-packages/h5netcdf/__init__.py
+ h5py: 3.11.0 /home/user/lib/python3.12/site-packages/h5py/__init__.py
+ s3fs: 2024.6.0 /home/user/lib/python3.12/site-packages/s3fs/__init__.py
+ psutil: 5.9.8 /home/user/lib/python3.12/site-packages/psutil/__init__.py
+ packaging: 23.2 /home/user/lib/python3.12/site-packages/packaging/__init__.py
+ numpy: 1.26.4 /home/user/lib/python3.12/site-packages/numpy/__init__.py
+ scipy: 1.13.0 /home/user/lib/python3.12/site-packages/scipy/__init__.py
+ matplotlib: 3.8.4 /home/user/lib/python3.12/site-packages/matplotlib/__init__.py
+ cftime: 1.6.3 /home/user/lib/python3.12/site-packages/cftime/__init__.py
+ cfunits: 3.3.7 /home/user/lib/python3.12/site-packages/cfunits/__init__.py
+ cfplot: 3.3.0 /home/user/lib/python3.12/site-packages/cfplot/__init__.py
+ cfdm: 1.11.2.0 /home/user/cfdm/cfdm/__init__.py
+ cf: NEXTVERSION /home/user/cf-python/cf/__init__.py
>>> cf.environment(paths=False)
- Platform: Linux-4.15.0-54-generic-x86_64-with-glibc2.10
- HDF5 library: 1.10.6
- netcdf library: 4.8.0
- udunits2 library: libudunits2.so.0
- esmpy/ESMF: 8.4.1
- Python: 3.8.10
- dask: 2022.6.0
- netCDF4: 1.5.6
- psutil: 5.9.0
- packaging: 21.3
- numpy: 1.22.2
- scipy: 1.10.0
- matplotlib: 3.4.3
- cftime: 1.6.0
- cfunits: 3.3.6
- cfplot: 3.1.18
- cfdm: 1.10.1.0
- cf: 3.14.0
+ Platform: Linux-5.15.0-122-generic-x86_64-with-glibc2.35
+ HDF5 library: 1.12.2
+ netcdf library: 4.9.3-development
+    udunits2 library: libudunits2.so.0
+ esmpy/ESMF: 8.6.1
+ Python: 3.12.2
+ dask: 2024.6.0
+ netCDF4: 1.6.5
+ h5netcdf: 1.3.0
+ h5py: 3.11.0
+ s3fs: 2024.6.0
+ psutil: 5.9.8
+ packaging: 23.2
+ numpy: 1.26.4
+ scipy: 1.13.0
+ matplotlib: 3.8.4
+ cftime: 1.6.3
+ cfunits: 3.3.7
+ cfplot: 3.3.0
+ cfdm: 1.11.2.0
+ cf: NEXTVERSION
"""
dependency_version_paths_mapping = {
@@ -3174,6 +3420,9 @@ def environment(display=True, paths=True):
"dask": _get_module_info("dask"),
# Then Python libraries not related to CF
"netCDF4": _get_module_info("netCDF4"),
+ "h5netcdf": _get_module_info("h5netcdf"),
+ "h5py": _get_module_info("h5py"),
+ "s3fs": _get_module_info("s3fs"),
"psutil": _get_module_info("psutil"),
"packaging": _get_module_info("packaging"),
"numpy": _get_module_info("numpy"),
diff --git a/cf/mixin/fielddomain.py b/cf/mixin/fielddomain.py
index c8e4fb1bb2..1e9e1f67e1 100644
--- a/cf/mixin/fielddomain.py
+++ b/cf/mixin/fielddomain.py
@@ -2317,7 +2317,7 @@ def iscyclic(self, *identity, **filter_kwargs):
{{filter_kwargs: optional}}
- .. versionadded:: (cfdm) 3.9.0
+ .. versionadded:: 3.9.0
:Returns:
diff --git a/cf/query.py b/cf/query.py
index 87ff0c1c5b..268ca7b6d0 100644
--- a/cf/query.py
+++ b/cf/query.py
@@ -166,10 +166,11 @@ class Query:
evaluated.
====================== ==============================================
- In general, each method must have the query value as it's only
- parameter. The only exception is for `__query_isclose__`, which
+ In general, each method must have the query value as its only
+ parameter. The only exceptions are for `__query_isclose__`, which
also requires the absolute and relative numerical tolerances to be
- provided.
+ provided, and for `__query_wi__`, which also requires upper and
+ lower interval openness Booleans to be provided.
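+
+    For instance, a class could support ``wi`` conditions with a
+    method along the following lines (an illustrative sketch of a
+    user-defined class, not part of cf-python itself)::
+
+        def __query_wi__(self, value, open_lower, open_upper):
+            # value = (lower_bound, upper_bound); the Booleans say
+            # whether each end of the interval is open
+            lower = self > value[0] if open_lower else self >= value[0]
+            upper = self < value[1] if open_upper else self <= value[1]
+            return lower & upper
+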
When the condition is on an attribute, or nested attributes, of
the operand, the query interface method is looked for on the
@@ -972,7 +973,7 @@ def _evaluate(self, x, parent_attr):
if operator == "wi":
_wi = getattr(x, "__query_wi__", None)
if _wi is not None:
- return _wi(value)
+ return _wi(value, self.open_lower, self.open_upper)
if self.open_lower:
lower_bound = x > value[0]
diff --git a/cf/read_write/netcdf/netcdfread.py b/cf/read_write/netcdf/netcdfread.py
index f8437349c6..883cc7b5a2 100644
--- a/cf/read_write/netcdf/netcdfread.py
+++ b/cf/read_write/netcdf/netcdfread.py
@@ -208,8 +208,10 @@ def _create_data(
# one dask chunk
if data.npartitions == 1:
data._cfa_set_write(True)
+
if (
not compression_index
+ and self.read_vars.get("cache")
and self.implementation.get_construct_type(construct)
!= "field"
):
@@ -251,17 +253,18 @@ def _create_data(
coord_ncvar=coord_ncvar,
)
+ attributes = kwargs["attributes"]
data = self._create_Data(
cfa_array,
ncvar,
- units=kwargs["units"],
- calendar=kwargs["calendar"],
+ units=attributes.get("units"),
+ calendar=attributes.get("calendar"),
)
# Note: We don't cache elements from CFA variables, because
# the data are in fragment files which have not been
- # opened; and may not not even be openable, such as
- # could be the case if a fragement was on tape storage.
+        # opened and may not even be openable (such as could
+ # be the case if a fragment file was on tape storage).
# Set the CFA write status to True iff each non-aggregated
# axis has exactly one dask storage chunk
@@ -503,6 +506,7 @@ def _cache_data_elements(self, data, ncvar):
`None`
"""
+
if data.data.get_compression_type():
# Don't get cached elements from arrays compressed by
# convention, as they'll likely be wrong.
@@ -649,13 +653,13 @@ def _create_cfanetcdfarray(
:Returns:
(`CFANetCDFArray`, `dict`)
- The new `NetCDFArray` instance and dictionary of the
- kwargs used to create it.
+ The new `CFANetCDFArray` instance and dictionary of
+ the kwargs used to create it.
"""
g = self.read_vars
- # Get the kwargs needed to instantiate a general NetCDFArray
+ # Get the kwargs needed to instantiate a general netCDF array
# instance
kwargs = self._create_netcdfarray(
ncvar,
@@ -664,8 +668,8 @@ def _create_cfanetcdfarray(
return_kwargs_only=True,
)
- # Get rid of the incorrect shape of (). This will end up
- # getting set correctly by the CFANetCDFArray instance.
+ # Get rid of the incorrect shape. This will end up getting set
+ # correctly by the CFANetCDFArray instance.
kwargs.pop("shape", None)
aggregated_data = g["cfa_aggregated_data"][ncvar]
@@ -692,7 +696,11 @@ def _create_cfanetcdfarray(
kwargs["instructions"] = " ".join(sorted(instructions))
# Use the kwargs to create a CFANetCDFArray instance
- array = self.implementation.initialise_CFANetCDFArray(**kwargs)
+ if g["original_netCDF4"]:
+ array = self.implementation.initialise_CFANetCDF4Array(**kwargs)
+ else:
+ # h5netcdf
+ array = self.implementation.initialise_CFAH5netcdfArray(**kwargs)
return array, kwargs
@@ -727,19 +735,23 @@ def _create_cfanetcdfarray_term(
:Returns:
(`CFANetCDFArray`, `dict`)
- The new `NetCDFArray` instance and dictionary of the
- kwargs used to create it.
+ The new `CFANetCDFArray` instance and dictionary of
+ the kwargs used to create it.
"""
g = self.read_vars
- # Get the kwargs needed to instantiate a general NetCDFArray
+ # Get the kwargs needed to instantiate a general netCDF array
# instance
kwargs = self._create_netcdfarray(
ncvar,
return_kwargs_only=True,
)
+ # Get rid of the incorrect shape. This will end up getting set
+ # correctly by the CFANetCDFArray instance.
+ kwargs.pop("shape", None)
+
instructions = []
aggregation_instructions = {}
for t, term_ncvar in g["cfa_aggregated_data"][parent_ncvar].items():
@@ -754,8 +766,11 @@ def _create_cfanetcdfarray_term(
kwargs["x"] = aggregation_instructions
kwargs["instructions"] = " ".join(sorted(instructions))
- # Use the kwargs to create a CFANetCDFArray instance
- array = self.implementation.initialise_CFANetCDFArray(**kwargs)
+ if g["original_netCDF4"]:
+ array = self.implementation.initialise_CFANetCDF4Array(**kwargs)
+ else:
+ # h5netcdf
+ array = self.implementation.initialise_CFAH5netcdfArray(**kwargs)
return array, kwargs
@@ -940,6 +955,7 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data):
aggregation_instructions = g["cfa_aggregation_instructions"]
variable_attributes = g["variable_attributes"]
+ # Loop round aggregation instruction terms
out = {}
for x in self._parse_x(
ncvar,
@@ -954,10 +970,16 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data):
# Already processed this term
continue
- array = g["variables"][term_ncvar][...]
- aggregation_instructions[term_ncvar] = self._cfa_conform_array(
- array
+ variable = g["variables"][term_ncvar]
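+            # Read the term variable, applying netCDF masking and
+            # unpacking by convention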
+ array = cfdm.netcdf_indexer(
+ variable,
+ mask=True,
+ unpack=True,
+ always_masked_array=False,
+ orthogonal_indexing=False,
+ copy=False,
)
+ aggregation_instructions[term_ncvar] = array[...]
if term == "file":
# Find URI substitutions that may be stored in the
@@ -981,43 +1003,3 @@ def _cfa_parse_aggregated_data(self, ncvar, aggregated_data):
g["cfa_aggregated_data"][ncvar] = out
return out
-
- def _cfa_conform_array(self, array):
- """Conform an array so that it is suitable for CFA processing.
-
- .. versionadded: 3.15.0
-
- :Parameters:
-
- array: `np.ndarray`
- The array to conform.
-
- :Returns:
-
- array: `np.ndarray`
- The conformed array.
-
- """
- if isinstance(array, str):
- # string
- return np.array(array, dtype=f"S{len(array)}").astype("U")
-
- kind = array.dtype.kind
- if kind == "O":
- # string
- return array.astype("U")
-
- if kind in "SU":
- # char
- if kind == "U":
- array = array.astype("S")
-
- array = netCDF4.chartostring(array)
- shape = array.shape
- array = np.array([x.rstrip() for x in array.flat], dtype="S")
- array = np.reshape(array, shape)
- array = np.ma.masked_where(array == b"", array)
- return array.astype("U")
-
- # number
- return array
diff --git a/cf/read_write/netcdf/netcdfwrite.py b/cf/read_write/netcdf/netcdfwrite.py
index 8890fe4da1..e2c8da0236 100644
--- a/cf/read_write/netcdf/netcdfwrite.py
+++ b/cf/read_write/netcdf/netcdfwrite.py
@@ -4,6 +4,7 @@
import dask.array as da
import numpy as np
+from ...data.dask_utils import cf_asanyarray
from .netcdfread import NetCDFRead
@@ -103,8 +104,10 @@ def _write_as_cfa(self, cfvar, construct_type, domain_axes):
raise ValueError(
f"Can't write {cfvar!r} as a CFA-netCDF "
- "aggregation variable. Consider setting "
- "cfa={'strict': False}"
+ "aggregation variable. Possible reasons for this "
+ "include 1) there is more than one Dask chunk "
+ "per fragment, and 2) data values have been "
+ "changed relative to those in the fragments."
)
return cfa_get_write
@@ -464,7 +467,7 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar):
):
f_ncdim = f"f_{ncdim}"
if f_ncdim not in g["dimensions"]:
- # Create a new fragement dimension
+ # Create a new fragment dimension
self._write_dimension(f_ncdim, None, size=size)
fragment_ncdimensions.append(f_ncdim)
@@ -576,55 +579,6 @@ def _create_cfa_data(self, ncvar, ncdimensions, data, cfvar):
},
)
- def _convert_to_builtin_type(self, x):
- """Convert a non-JSON-encodable object to a JSON-encodable
- built-in type.
-
- Possible conversions are:
-
- ============== ============= ======================================
- Input object Output object numpy data types covered
- ============== ============= ======================================
- numpy.bool_ bool bool
- numpy.integer int int, int8, int16, int32, int64, uint8,
- uint16, uint32, uint64
- numpy.floating float float, float16, float32, float64
- ============== ============= ======================================
-
- .. versionadded:: 3.0.0
-
- :Parameters:
-
- x:
-
- :Returns:
-
- 'int' or `float` or `bool`
-
- **Examples:**
-
- >>> type(_convert_to_builtin_type(numpy.bool_(True)))
- bool
- >>> type(_convert_to_builtin_type(numpy.array([1.0])[0]))
- double
- >>> type(_convert_to_builtin_type(numpy.array([2])[0]))
- int
-
- """
- if isinstance(x, np.bool_):
- return bool(x)
-
- if isinstance(x, np.integer):
- return int(x)
-
- if isinstance(x, np.floating):
- return float(x)
-
- raise TypeError(
- f"{type(x)!r} object can't be converted to a JSON serializable "
- f"type: {x!r}"
- )
-
def _check_valid(self, array, cfvar=None, attributes=None):
"""Checks for array values outside of the valid range.
@@ -792,7 +746,10 @@ def _cfa_write_non_standard_terms(
# dimensions, with one value per fragment. If a chunk has
# more than one unique value then the fragment's value is
# missing data.
- dx = data.to_dask_array()
+ #
+ # '_cfa_unique' has its own call to 'cf_asanyarray', so
+ # we can set '_asanyarray=False'.
+ dx = data.to_dask_array(_asanyarray=False)
dx_ind = tuple(range(dx.ndim))
out_ind = dx_ind
dx = da.blockwise(
@@ -850,6 +807,8 @@ def _cfa_unique(cls, a):
data if there is not a unique value.
"""
+ a = cf_asanyarray(a)
+
out_shape = (1,) * a.ndim
a = np.unique(a)
if np.ma.isMA(a):
@@ -918,17 +877,17 @@ def _cfa_aggregation_instructions(self, data, cfvar):
if len(file_details) != 1:
if file_details:
raise ValueError(
- "Can't write CFA-netCDF aggregation variable from "
- f"{cfvar!r} when the "
- f"dask storage chunk defined by indices {indices} "
- "spans two or more files"
+ f"Can't write {cfvar!r} as a CFA-netCDF "
+ "aggregation variable: Dask chunk defined by index "
+ f"{indices} spans two or more fragments. "
+ "A possible fix for this is to set chunks=None as "
+ "an argument of a prior call to cf.read"
)
raise ValueError(
- "Can't write CFA-netCDF aggregation variable from "
- f"{cfvar!r} when the "
- f"dask storage chunk defined by indices {indices} spans "
- "zero files"
+ f"Can't write {cfvar!r} as a CFA-netCDF "
+ "aggregation variable: Dask chunk defined by index "
+ f"{indices} spans zero fragments."
)
filenames, addresses, formats = file_details.pop()
@@ -1003,7 +962,10 @@ def _cfa_aggregation_instructions(self, data, cfvar):
# Create the location array
# ------------------------------------------------------------
dtype = np.dtype(np.int32)
- if max(data.to_dask_array().chunksize) > np.iinfo(dtype).max:
+ if (
+ max(data.to_dask_array(_asanyarray=False).chunksize)
+ > np.iinfo(dtype).max
+ ):
dtype = np.dtype(np.int64)
ndim = data.ndim
diff --git a/cf/read_write/read.py b/cf/read_write/read.py
index a7ad3f23af..c6ad881db9 100644
--- a/cf/read_write/read.py
+++ b/cf/read_write/read.py
@@ -58,21 +58,26 @@ def read(
select_options=None,
follow_symlinks=False,
mask=True,
+ unpack=True,
warn_valid=False,
chunks="auto",
domain=False,
cfa=None,
+ netcdf_backend=None,
+ storage_options=None,
+ cache=True,
):
"""Read field or domain constructs from files.
- The following file formats are supported: CF-netCDF, CFA-netCDF,
- CDL, PP and UM fields datasets.
+ The following file formats are supported: netCDF, CFA-netCDF, CDL,
+ UM fields file, and PP.
Input datasets are mapped to constructs in memory which are
returned as elements of a `FieldList` or if the *domain* parameter
is True, a `DomainList`.
- NetCDF files may be on disk or on an OPeNDAP server.
+ NetCDF files may be on disk, on an OPeNDAP server, or in an S3
+ object store.
    Any number of files of any combination of file types may be read.
@@ -408,14 +413,13 @@ def read(
parameter.
mask: `bool`, optional
- If False then do not mask by convention when reading the
- data of field or metadata constructs from disk. By default
- data is masked by convention.
+ If True (the default) then mask by convention the data of
+ field and metadata constructs.
- The masking by convention of a netCDF array depends on the
- values of any of the netCDF variable attributes
- ``_FillValue``, ``missing_value``, ``valid_min``,
- ``valid_max`` and ``valid_range``.
+ A netCDF array is masked depending on the values of any of
+ the netCDF attributes ``_FillValue``, ``missing_value``,
+ ``_Unsigned``, ``valid_min``, ``valid_max``, and
+ ``valid_range``.
The masking by convention of a PP or UM array depends on
the value of BMDI in the lookup header. A value other than
@@ -427,6 +431,16 @@ def read(
.. versionadded:: 3.4.0
+ unpack: `bool`, optional
+ If True, the default, then unpack arrays by convention
+ when the data is read from disk.
+
+ Unpacking is determined by netCDF conventions for the
+ following variable attributes: ``add_offset``,
+ ``scale_factor``, and ``_Unsigned``.
+
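+            For instance, under these conventions a variable stored
+            with ``scale_factor`` and ``add_offset`` attributes is
+            unpacked as ``unpacked = packed*scale_factor +
+            add_offset``.
+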
+ .. versionadded:: NEXTVERSION
+
warn_valid: `bool`, optional
If True then print a warning for the presence of
``valid_min``, ``valid_max`` or ``valid_range`` properties
@@ -652,18 +666,91 @@ def read(
A dictionary whose key/value pairs define text
substitutions to be applied to the fragment file
names. Each key may be specified with or without the
- ``${...}`` syntax. For instance, the following are
- equivalent: ``{'base': 'sub'}``, ``{'${base}': 'sub'}``.
- The substitutions are used in conjunction with, and take
- precedence over, any that are stored in the CFA-netCDF
- file by the ``substitutions`` attribute of the ``file``
- CFA aggregation instruction variable.
+            ``${*}`` syntax (where `*` represents any number of
+            characters). For instance, ``{'substitution':
+            'replacement'}`` and ``{'${substitution}': 'replacement'}``
+            are equivalent. The substitutions are used in
+            conjunction with, and take precedence over, any that are
+            stored in the CFA-netCDF file by the ``substitutions``
+            attribute of the ``file`` fragment array variable.
*Example:*
- ``{'base': 'file:///data/'}``
+ ``{'replacement': 'file:///data/'}``
.. versionadded:: 3.15.0
+ netcdf_backend: `None` or `str`, optional
+ Specify which library to use for reading netCDF files. By
+ default, or if `None`, then the first one of `netCDF4` and
+            default, or if `None`, then the first one of `netCDF4`
+            and `h5netcdf` to successfully open the netCDF file is
+            used. Setting *netcdf_backend* to either ``'netCDF4'``
+            or ``'h5netcdf'`` will force the use of that library.
+ .. note:: The *netcdf_backend* parameter does not affect
+ the opening of netCDF fragment files that define
+ the data of aggregation variables. For these, it
+ is always the case that the first one of
+ `netCDF4` and `h5netcdf` to successfully open
+ the file is used.
+
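+            *Parameter example:*
+              To force the use of `h5netcdf` (the file name here is
+              just a placeholder):
+              ``cf.read('file.nc', netcdf_backend='h5netcdf')``
+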
+ .. versionadded:: NEXTVERSION
+
+ storage_options: `dict` or `None`, optional
+ Pass parameters to the backend file system driver, such as
+ username, password, server, port, etc. How the storage
+ options are interpreted depends on the location of the
+ file:
+
+ **Local File System**
+
+ Storage options are ignored for local files.
+
+ **HTTP(S)**
+
+ Storage options are ignored for files available across the
+ network via OPeNDAP.
+
+ **S3-compatible services**
+
+ The backend used is `s3fs`, and the storage options are
+ used to initialise an `s3fs.S3FileSystem` file system
+ object. By default, or if `None`, then *storage_options*
+ is taken as ``{}``.
+
+ If the ``'endpoint_url'`` key is not in *storage_options*,
+ nor in a dictionary defined by the ``'client_kwargs'`` key
+ (both of which are the case when *storage_options* is
+ `None`), then one will be automatically inserted for
+ accessing an S3 file. For example, for a file name of
+ ``'s3://store/data/file.nc'``, an ``'endpoint_url'`` key
+ with value ``'https://store'`` would be created. To
+ disable this, set ``'endpoint_url'`` to `None`.
+
+ *Parameter example:*
+ For a file name of ``'s3://store/data/file.nc'``, the
+ following are equivalent: ``None``, ``{}``,
+ ``{'endpoint_url': 'https://store'}``, and
+ ``{'client_kwargs': {'endpoint_url': 'https://store'}}``
+
+ *Parameter example:*
+ ``{'key': 'scaleway-api-key...', 'secret':
+ 'scaleway-secretkey...', 'endpoint_url':
+ 'https://s3.fr-par.scw.cloud', 'client_kwargs':
+ {'region_name': 'fr-par'}}``
+
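+            *Parameter example:*
+              To read from an S3 object store with explicit
+              credentials (all values here are placeholders):
+              ``cf.read('s3://store/data/file.nc',
+              storage_options={'key': '<access-key>', 'secret':
+              '<secret-key>', 'endpoint_url': 'https://store'})``
+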
+ .. versionadded:: NEXTVERSION
+
+ cache: `bool`, optional
+ If True, the default, then cache the first and last array
+ elements of metadata constructs (not field constructs) for
+ fast future access. In addition, the second and
+ penultimate array elements will be cached from coordinate
+ bounds when there are two bounds per cell. For remote
+ data, setting *cache* to False may speed up the parsing of
+ the file.
+
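+            *Parameter example:*
+              To speed up the parsing of a remote file whose data
+              are not immediately needed (the URL is a placeholder):
+              ``cf.read('https://server/data/file.nc', cache=False)``
+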
+ .. versionadded:: NEXTVERSION
+
umversion: deprecated at version 3.0.0
Use the *um* parameter instead.
@@ -823,6 +910,8 @@ def read(
cfa_options["substitutions"] = substitutions
+ cache = bool(cache)
+
# Initialise the output list of fields/domains
if domain:
out = DomainList()
@@ -885,8 +974,8 @@ def read(
file_glob = os.path.expanduser(os.path.expandvars(file_glob))
scheme = urlparse(file_glob).scheme
- if scheme in ("https", "http"):
- # Do not glob a URL
+ if scheme in ("https", "http", "s3"):
+ # Do not glob a remote URL
files2 = (file_glob,)
else:
# Glob files on disk
@@ -951,10 +1040,14 @@ def read(
height_at_top_of_model=height_at_top_of_model,
chunks=chunks,
mask=mask,
+ unpack=unpack,
warn_valid=warn_valid,
select=select,
domain=domain,
cfa_options=cfa_options,
+ netcdf_backend=netcdf_backend,
+ storage_options=storage_options,
+ cache=cache,
)
# --------------------------------------------------------
@@ -1064,11 +1157,15 @@ def _read_a_file(
extra=None,
height_at_top_of_model=None,
mask=True,
+ unpack=True,
warn_valid=False,
chunks="auto",
select=None,
domain=False,
cfa_options=None,
+ netcdf_backend=None,
+ storage_options=None,
+ cache=True,
):
"""Read the contents of a single file into a field list.
@@ -1090,6 +1187,9 @@ def _read_a_file(
mask: `bool`, optional
See `cf.read` for details.
+ unpack: `bool`, optional
+ See `cf.read` for details.
+
verbose: `int` or `str` or `None`, optional
See `cf.read` for details.
@@ -1104,6 +1204,21 @@ def _read_a_file(
.. versionadded:: 3.15.0
+ storage_options: `dict` or `None`, optional
+ See `cf.read` for details.
+
+ .. versionadded:: NEXTVERSION
+
+ netcdf_backend: `str` or `None`, optional
+ See `cf.read` for details.
+
+ .. versionadded:: NEXTVERSION
+
+ cache: `bool`, optional
+ See `cf.read` for details.
+
+ .. versionadded:: NEXTVERSION
+
:Returns:
`FieldList` or `DomainList`
@@ -1138,6 +1253,7 @@ def _read_a_file(
"fmt": selected_fmt,
"ignore_read_error": ignore_read_error,
"cfa_options": cfa_options,
+ "cache": cache,
}
# ----------------------------------------------------------------
@@ -1175,8 +1291,11 @@ def _read_a_file(
warnings=warnings,
extra_read_vars=extra_read_vars,
mask=mask,
+ unpack=unpack,
warn_valid=warn_valid,
domain=domain,
+ storage_options=storage_options,
+ netcdf_backend=netcdf_backend,
)
except MaskError:
# Some data required for field interpretation is missing,
diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py
index 051562945a..e73166eba1 100644
--- a/cf/read_write/um/umread.py
+++ b/cf/read_write/um/umread.py
@@ -1973,8 +1973,10 @@ def create_data(self):
recs = self.recs
um_Units = self.um_Units
- units = getattr(um_Units, "units", None)
- calendar = getattr(um_Units, "calendar", None)
+ attributes = {
+ "units": getattr(um_Units, "units", None),
+ "calendar": getattr(um_Units, "calendar", None),
+ }
data_type_in_file = self.data_type_in_file
@@ -2015,8 +2017,7 @@ def create_data(self):
fmt=fmt,
word_size=self.word_size,
byte_ordering=self.byte_ordering,
- units=units,
- calendar=calendar,
+ attributes=attributes,
)
key = f"{klass_name}-{tokenize(subarray)}"
@@ -2069,8 +2070,7 @@ def create_data(self):
fmt=fmt,
word_size=word_size,
byte_ordering=byte_ordering,
- units=units,
- calendar=calendar,
+ attributes=attributes,
)
key = f"{klass_name}-{tokenize(subarray)}"
@@ -2120,8 +2120,7 @@ def create_data(self):
fmt=fmt,
word_size=word_size,
byte_ordering=byte_ordering,
- units=units,
- calendar=calendar,
+ attributes=attributes,
)
key = f"{klass_name}-{tokenize(subarray)}"
@@ -3535,7 +3534,7 @@ def is_um_file(self, filename):
"""Whether or not a file is a PP file or UM fields file.
Note that the file type is determined by inspecting the file's
- content and any file suffix is not not considered.
+ content and any file suffix is not considered.
:Parameters:
diff --git a/cf/read_write/write.py b/cf/read_write/write.py
index c3d0edb615..23a8dda3cd 100644
--- a/cf/read_write/write.py
+++ b/cf/read_write/write.py
@@ -97,22 +97,22 @@ def write(
construct.
- **NetCDF hierarchical groups**
+ **NetCDF-4 hierarchical groups**
Hierarchical groups in CF provide a mechanism to structure
- variables within netCDF4 datasets with well defined rules for
+ variables within netCDF-4 datasets with well defined rules for
resolving references to out-of-group netCDF variables and
dimensions. The group structure defined by a field construct's
netCDF interface will, by default, be recreated in the output
dataset. See the *group* parameter for details.
- **NetCDF4 HDF chunk sizes**
+ **NetCDF-4 HDF chunk sizes**
    HDF5 chunksizes may be set on a construct's data. See the
- `~cf.Data.nc_hdf5_chunksizes`,
- `~cf.Data.nc_clear_hdf5_chunksizes` and
- `~cf.Data.nc_set_hdf5_chunksizes` methods of a `Data` instance.
+ `~cf.Data.nc_hdf5_chunksizes`, `~cf.Data.nc_clear_hdf5_chunksizes`
+ and `~cf.Data.nc_set_hdf5_chunksizes` methods of a `Data`
+ instance.
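+
+    For example, HDF5 chunks of shape ``(1, 73, 96)`` could be
+    requested for a field construct's data with (an illustrative
+    sketch, in which the chunk shape is arbitrary)::
+
+       >>> f.data.nc_set_hdf5_chunksizes([1, 73, 96])
+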
.. seealso:: `cf.read`
@@ -121,7 +121,6 @@ def write(
fields: (arbitrarily nested sequence of) `Field` or `FieldList`
The field constructs to write to the file.
-
filename: `str`
The output netCDF file name. Various type of expansion are
applied to the file names.
@@ -548,7 +547,7 @@ def write(
variables. By default only auxiliary and scalar coordinate
variables are included.
- .. versionadded:: (cfdm) 3.7.0
+ .. versionadded:: 3.7.0
omit_data: (sequence of) `str`, optional
Do not write the data of the named construct types.
diff --git a/cf/regrid/regrid.py b/cf/regrid/regrid.py
index 08990ad925..a06d1cc960 100644
--- a/cf/regrid/regrid.py
+++ b/cf/regrid/regrid.py
@@ -2465,7 +2465,8 @@ def create_esmpy_weights(
from netCDF4 import Dataset
from .. import __version__
- from ..data.array.netcdfarray import _lock
+
+ from ..data.array.locks import netcdf_lock
if (
max(dst_esmpy_field.data.size, src_esmpy_field.data.size)
@@ -2491,7 +2492,7 @@ def create_esmpy_weights(
if src_grid.ln_z:
regrid_method += f", ln {src_grid.method} in vertical"
- _lock.acquire()
+ netcdf_lock.acquire()
nc = Dataset(weights_file, "w", format="NETCDF4")
nc.title = (
@@ -2532,7 +2533,7 @@ def create_esmpy_weights(
v[...] = col
nc.close()
- _lock.release()
+ netcdf_lock.release()
if esmpy_regrid_operator is None:
# Destroy esmpy objects (the esmpy.Grid objects exist even if
diff --git a/cf/regrid/regridoperator.py b/cf/regrid/regridoperator.py
index 997cd0d6e6..10a77bc641 100644
--- a/cf/regrid/regridoperator.py
+++ b/cf/regrid/regridoperator.py
@@ -727,9 +727,9 @@ def tosparse(self):
# Read the weights from the weights file
from netCDF4 import Dataset
- from ..data.array.netcdfarray import _lock
+ from ..data.array.locks import netcdf_lock
- _lock.acquire()
+ netcdf_lock.acquire()
nc = Dataset(weights_file, "r")
weights = nc.variables["S"][...]
row = nc.variables["row"][...]
@@ -746,7 +746,7 @@ def tosparse(self):
row_start_index = 1
nc.close()
- _lock.release()
+ netcdf_lock.release()
else:
raise ValueError(
"Conversion to sparse array format requires at least "
diff --git a/cf/test/individual_tests.sh b/cf/test/individual_tests.sh
index fea95ef58b..425c7dd435 100755
--- a/cf/test/individual_tests.sh
+++ b/cf/test/individual_tests.sh
@@ -5,9 +5,9 @@ do
echo "Running $file"
python $file
rc=$?
- if [[ $rc != 0 ]]; then
- exit $rc
- fi
+# if [[ $rc != 0 ]]; then
+# exit $rc
+# fi
done
file=setup_create_field.py
diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py
index fd61d0e9af..e29c192062 100644
--- a/cf/test/test_Data.py
+++ b/cf/test/test_Data.py
@@ -1479,6 +1479,18 @@ def test_Data__getitem__(self):
f = cf.Data([-999, 35], mask=[True, False]).reshape(2, 1)
self.assertTrue(e.equals(f))
+ # Chained subspaces reading from disk
+ f = cf.read(self.filename)[0]
+ d = f.data
+ a = d[:1, [1, 3, 4], :][:, [True, False, True], ::-2].array
+ b = d.array[:1, [1, 3, 4], :][:, [True, False, True], ::-2]
+ self.assertTrue((a == b).all())
+
+ d.__keepdims_indexing__ = False
+ a = d[0, [1, 3, 4], :][[True, False, True], ::-2].array
+ b = d.array[0, [1, 3, 4], :][[True, False, True], ::-2]
+ self.assertTrue((a == b).all())
+
def test_Data__setitem__(self):
"""Test the assignment of data elements on Data."""
for hardmask in (False, True):
@@ -3277,6 +3289,14 @@ def test_Data_rechunk(self):
self.assertEqual(e.chunks, ((4,), (5,)))
self.assertTrue(e.equals(d))
+ # Test rechunking after a __getitem__
+ e = d[:2].rechunk((2, 5))
+ self.assertTrue(e.equals(d[:2]))
+
+ d = cf.Data.ones((4, 5), chunks=(4, 5))
+ e = d[:2].rechunk((1, 3))
+ self.assertTrue(e.equals(d[:2]))
+
def test_Data_reshape(self):
"""Test the `reshape` Data method."""
a = np.arange(12).reshape(3, 4)
@@ -4500,13 +4520,15 @@ def test_Data__str__(self):
def test_Data_cull_graph(self):
"""Test `Data.cull`"""
+        # Note: The number of layers in the culled graphs includes a
+ # `cf_asanyarray` layer
d = cf.Data([1, 2, 3, 4, 5], chunks=3)
d = d[:2]
- self.assertEqual(len(dict(d.to_dask_array().dask)), 3)
+ self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 3)
# Check that there are fewer keys after culling
d.cull_graph()
- self.assertEqual(len(dict(d.to_dask_array().dask)), 2)
+ self.assertEqual(len(dict(d.to_dask_array(_asanyarray=False).dask)), 2)
def test_Data_npartitions(self):
"""Test the `npartitions` Data property."""
@@ -4754,6 +4776,13 @@ def test_Data_pad_missing(self):
with self.assertRaises(ValueError):
d.pad_missing(99, to_size=99)
+ def test_Data_is_masked(self):
+ """Test Data.is_masked."""
+ d = cf.Data(np.arange(6).reshape(2, 3))
+ d[0, 0] = cf.masked
+ self.assertTrue(d[0].is_masked)
+ self.assertFalse(d[1].is_masked)
+
if __name__ == "__main__":
print("Run date:", datetime.datetime.now())
diff --git a/cf/test/test_Field.py b/cf/test/test_Field.py
index 3f10e1664e..92392b29f5 100644
--- a/cf/test/test_Field.py
+++ b/cf/test/test_Field.py
@@ -1466,7 +1466,7 @@ def test_Field_indices(self):
shape = (1, 1, 1)
self.assertEqual(g.shape, shape)
- self.assertEqual(g.array.compressed(), 29)
+ self.assertEqual(np.ma.compressed(g.array), 29)
if mode != "full":
self.assertEqual(g.construct("longitude").array, 83)
@@ -1484,7 +1484,7 @@ def test_Field_indices(self):
shape = (1, 2, 2)
self.assertEqual(g.shape, shape)
- self.assertTrue((g.array.compressed() == [4, 29]).all())
+ self.assertTrue((np.ma.compressed(g.array) == [4, 29]).all())
# Add 2-d auxiliary coordinates with bounds, so we can
# properly test cf.contains values
diff --git a/cf/test/test_FullArray.py b/cf/test/test_FullArray.py
new file mode 100644
index 0000000000..63dcb84f34
--- /dev/null
+++ b/cf/test/test_FullArray.py
@@ -0,0 +1,55 @@
+import datetime
+import faulthandler
+import unittest
+
+import numpy as np
+
+faulthandler.enable() # to debug seg faults and timeouts
+
+import cf
+
+
+class FullArrayTest(unittest.TestCase):
+ def test_FullValue_inspection(self):
+ full = 9
+ f = cf.FullArray(full, np.dtype(int), shape=(2, 3, 4))
+ self.assertEqual(f.get_full_value(), full)
+ self.assertEqual(f.shape, (2, 3, 4))
+ self.assertEqual(f.dtype, np.dtype(int))
+ self.assertIsNone(f.set_full_value(10))
+ self.assertEqual(f.get_full_value(), 10)
+
+ def test_FullValue_array(self):
+ full = 9
+ f = cf.FullArray(full, np.dtype(int), shape=(2, 3, 4))
+ self.assertTrue((f.array == np.full(f.shape, full)).all())
+
+ f = f[0, [True, False, True], ::3]
+ self.assertTrue((f.array == np.full((2, 1), full)).all())
+
+ def test_FullValue_masked_array(self):
+ full = np.ma.masked
+ f = cf.FullArray(full, np.dtype(int), shape=(2, 3))
+
+ a = np.ma.masked_all(f.shape, dtype=np.dtype(int))
+ array = f.array
+ self.assertEqual(array.dtype, a.dtype)
+ self.assertTrue(
+ (np.ma.getmaskarray(array) == np.ma.getmaskarray(a)).all()
+ )
+
+ def test_FullValue_get_array(self):
+ full = 9
+ f = cf.FullArray(full, np.dtype(int), shape=(2, 3))
+ f = f[0, 1]
+ self.assertEqual(f.shape, ())
+
+ array = f._get_array(index=Ellipsis)
+ self.assertTrue((array == np.full((2, 3), full)).all())
+
+
+if __name__ == "__main__":
+ print("Run date:", datetime.datetime.now())
+ cf.environment()
+ print()
+ unittest.main(verbosity=2)
diff --git a/cf/test/test_NetCDFArray.py b/cf/test/test_NetCDF4Array.py
similarity index 56%
rename from cf/test/test_NetCDFArray.py
rename to cf/test/test_NetCDF4Array.py
index c69a4654e7..0d049ff497 100644
--- a/cf/test/test_NetCDFArray.py
+++ b/cf/test/test_NetCDF4Array.py
@@ -5,6 +5,7 @@
import tempfile
import unittest
+import numpy as np
from dask.base import tokenize
faulthandler.enable() # to debug seg faults and timeouts
@@ -13,7 +14,7 @@
n_tmpfiles = 1
tmpfiles = [
- tempfile.mkstemp("_test_NetCDFArray.nc", dir=os.getcwd())[1]
+ tempfile.mkstemp("_test_NetCDF4Array.nc", dir=os.getcwd())[1]
for i in range(n_tmpfiles)
]
(tmpfile1,) = tmpfiles
@@ -31,15 +32,22 @@ def _remove_tmpfiles():
atexit.register(_remove_tmpfiles)
-class NetCDFArrayTest(unittest.TestCase):
- def test_NetCDFArray_del_file_location(self):
- a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2"))
+class NetCDF4ArrayTest(unittest.TestCase):
+ n = cf.NetCDF4Array(
+ filename="filename.nc",
+ address="x",
+ shape=(5, 8),
+ dtype=np.dtype(float),
+ )
+
+ def test_NetCDF4Array_del_file_location(self):
+ a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"), ("tas1", "tas2"))
b = a.del_file_location("/data1")
self.assertIsNot(b, a)
self.assertEqual(b.get_filenames(), ("/data2/file2",))
self.assertEqual(b.get_addresses(), ("tas2",))
- a = cf.NetCDFArray(
+ a = cf.NetCDF4Array(
("/data1/file1", "/data2/file1", "/data2/file2"),
("tas1", "tas1", "tas2"),
)
@@ -52,18 +60,18 @@ def test_NetCDFArray_del_file_location(self):
with self.assertRaises(ValueError):
b.del_file_location("/data1/")
- def test_NetCDFArray_file_locations(self):
- a = cf.NetCDFArray("/data1/file1")
+ def test_NetCDF4Array_file_locations(self):
+ a = cf.NetCDF4Array("/data1/file1")
self.assertEqual(a.file_locations(), ("/data1",))
- a = cf.NetCDFArray(("/data1/file1", "/data2/file2"))
+ a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"))
self.assertEqual(a.file_locations(), ("/data1", "/data2"))
- a = cf.NetCDFArray(("/data1/file1", "/data2/file2", "/data1/file2"))
+ a = cf.NetCDF4Array(("/data1/file1", "/data2/file2", "/data1/file2"))
self.assertEqual(a.file_locations(), ("/data1", "/data2", "/data1"))
- def test_NetCDFArray_add_file_location(self):
- a = cf.NetCDFArray("/data1/file1", "tas")
+ def test_NetCDF4Array_add_file_location(self):
+ a = cf.NetCDF4Array("/data1/file1", "tas")
b = a.add_file_location("/home/user")
self.assertIsNot(b, a)
self.assertEqual(
@@ -71,7 +79,7 @@ def test_NetCDFArray_add_file_location(self):
)
self.assertEqual(b.get_addresses(), ("tas", "tas"))
- a = cf.NetCDFArray(("/data1/file1", "/data2/file2"), ("tas1", "tas2"))
+ a = cf.NetCDF4Array(("/data1/file1", "/data2/file2"), ("tas1", "tas2"))
b = a.add_file_location("/home/user")
self.assertEqual(
b.get_filenames(),
@@ -84,7 +92,7 @@ def test_NetCDFArray_add_file_location(self):
)
self.assertEqual(b.get_addresses(), ("tas1", "tas2", "tas1", "tas2"))
- a = cf.NetCDFArray(("/data1/file1", "/data2/file1"), ("tas1", "tas2"))
+ a = cf.NetCDF4Array(("/data1/file1", "/data2/file1"), ("tas1", "tas2"))
b = a.add_file_location("/home/user")
self.assertEqual(
b.get_filenames(),
@@ -92,24 +100,24 @@ def test_NetCDFArray_add_file_location(self):
)
self.assertEqual(b.get_addresses(), ("tas1", "tas2", "tas1"))
- a = cf.NetCDFArray(("/data1/file1", "/data2/file1"), ("tas1", "tas2"))
+ a = cf.NetCDF4Array(("/data1/file1", "/data2/file1"), ("tas1", "tas2"))
b = a.add_file_location("/data1/")
self.assertEqual(b.get_filenames(), a.get_filenames())
self.assertEqual(b.get_addresses(), a.get_addresses())
- def test_NetCDFArray__dask_tokenize__(self):
- a = cf.NetCDFArray("/data1/file1", "tas", shape=(12, 2), mask=False)
+ def test_NetCDF4Array__dask_tokenize__(self):
+ a = cf.NetCDF4Array("/data1/file1", "tas", shape=(12, 2), mask=False)
self.assertEqual(tokenize(a), tokenize(a.copy()))
- b = cf.NetCDFArray("/home/file2", "tas", shape=(12, 2))
+ b = cf.NetCDF4Array("/home/file2", "tas", shape=(12, 2))
self.assertNotEqual(tokenize(a), tokenize(b))
- def test_NetCDFArray_multiple_files(self):
+ def test_NetCDF4Array_multiple_files(self):
f = cf.example_field(0)
cf.write(f, tmpfile1)
# Create instance with non-existent file
- n = cf.NetCDFArray(
+ n = cf.NetCDF4Array(
filename=os.path.join("/bad/location", os.path.basename(tmpfile1)),
address=f.nc_get_variable(),
shape=f.shape,
@@ -121,6 +129,40 @@ def test_NetCDFArray_multiple_files(self):
self.assertEqual(len(n.get_filenames()), 2)
self.assertTrue((n[...] == f.array).all())
+ def test_NetCDF4Array_shape(self):
+ shape = (12, 73, 96)
+ a = cf.NetCDF4Array("/home/file2", "tas", shape=shape)
+ self.assertEqual(a.shape, shape)
+ self.assertEqual(a.original_shape, shape)
+ a = a[::2]
+ self.assertEqual(a.shape, (shape[0] // 2,) + shape[1:])
+ self.assertEqual(a.original_shape, shape)
+
+ def test_NetCDF4Array_index(self):
+ shape = (12, 73, 96)
+ a = cf.NetCDF4Array("/home/file2", "tas", shape=shape)
+ self.assertEqual(
+ a.index(),
+ (
+ slice(
+ None,
+ ),
+ )
+ * len(shape),
+ )
+ a = a[8:7:-1, 10:19:3, [15, 1, 4, 12]]
+ a = a[[0], [True, False, True], ::-2]
+ self.assertEqual(a.shape, (1, 2, 2))
+ self.assertEqual(
+ a.index(),
+ (slice(8, 9, None), slice(10, 17, 6), slice(12, -1, -11)),
+ )
+
+ index = a.index(conform=False)
+ self.assertTrue((index[0] == [8]).all())
+ self.assertTrue((index[1] == [10, 16]).all())
+ self.assertTrue((index[2] == [12, 1]).all())
+
if __name__ == "__main__":
print("Run date:", datetime.datetime.now())
diff --git a/cf/test/test_active_storage.py b/cf/test/test_active_storage.py
new file mode 100644
index 0000000000..f14e063849
--- /dev/null
+++ b/cf/test/test_active_storage.py
@@ -0,0 +1,69 @@
+import atexit
+import datetime
+import faulthandler
+import os
+import tempfile
+import unittest
+
+faulthandler.enable() # to debug seg faults and timeouts
+
+import cf
+
+try:
+ from activestorage import Active
+except ModuleNotFoundError:
+ Active = None
+
+n_tmpfiles = 2
+tmpfiles = [
+ tempfile.mkstemp("_test_active_storage.nc", dir=os.getcwd())[1]
+ for i in range(n_tmpfiles)
+]
+(tmpfile, tmpfile2) = tmpfiles
+
+
+def _remove_tmpfiles():
+ """Try to remove defined temporary files by deleting their paths."""
+ for f in tmpfiles:
+ try:
+ os.remove(f)
+ except OSError:
+ pass
+
+
+atexit.register(_remove_tmpfiles)
+
+
+class ActiveStorageTest(unittest.TestCase):
+ @unittest.skipUnless(Active is not None, "Requires activestorage.Active")
+ def test_active_storage(self):
+ # No masked values
+ f = cf.example_field(0)
+ cf.write(f, tmpfile)
+
+ f = cf.read(tmpfile, chunks={"latitude": (4, 1), "longitude": (3, 5)})
+ f = f[0]
+ self.assertEqual(f.data.chunks, ((4, 1), (3, 5)))
+
+ cf.active_storage(False)
+ self.assertFalse(cf.active_storage())
+ f.collapse("mean", weights=False)
+
+ local_array = f.collapse("mean", weights=False).array
+
+ with cf.configuration(active_storage=True, active_storage_url="dummy"):
+ self.assertTrue(cf.active_storage())
+ self.assertEqual(cf.active_storage_url(), "dummy")
+ active_array = f.collapse("mean", weights=False).array
+
+ self.assertEqual(active_array, local_array)
+
+ # TODOACTIVE: Test with masked values (not yet working in
+ # activestorage.Active)
+
+
+if __name__ == "__main__":
+ print("Run date:", datetime.datetime.now())
+ cf.environment()
+ print("")
+ unittest.main(verbosity=2)
diff --git a/cf/test/test_functions.py b/cf/test/test_functions.py
index 9cc5e5eab1..f6cce13ae0 100644
--- a/cf/test/test_functions.py
+++ b/cf/test/test_functions.py
@@ -58,7 +58,7 @@ def test_configuration(self):
self.assertIsInstance(org, dict)
# Check all keys that should be there are, with correct value type:
- self.assertEqual(len(org), 8) # update expected len if add new key(s)
+ self.assertEqual(len(org), 11) # update expected len if add new key(s)
# Types expected:
self.assertIsInstance(org["atol"], float)
@@ -68,6 +68,8 @@ def test_configuration(self):
self.assertIsInstance(org["bounds_combination_mode"], str)
self.assertIsInstance(org["regrid_logging"], bool)
self.assertIsInstance(org["tempdir"], str)
+ self.assertIsInstance(org["active_storage"], bool)
+ self.assertIsInstance(org["active_storage_max_requests"], int)
# Log level may be input as an int but always given as
# equiv. string
self.assertIsInstance(org["log_level"], str)
@@ -87,12 +89,20 @@ def test_configuration(self):
"bounds_combination_mode": "XOR",
"log_level": "INFO",
"chunksize": 8e9,
+ "active_storage": True,
+ "active_storage_url": None,
+ "active_storage_max_requests": 100,
}
# Test the setting of each lone item.
expected_post_set = dict(org) # copy for safety with mutable dict
for setting, value in reset_values.items():
- cf.configuration(**{setting: value})
+ try:
+ cf.configuration(**{setting: value})
+ except ModuleNotFoundError as error:
+ print(f"WARNING: not testing {setting!r} due to: {error}")
+ continue
+
post_set = cf.configuration()
# Expect a dict that is identical to the original to start
diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py
index 3347fee8ea..f0bf697fac 100644
--- a/cf/test/test_read_write.py
+++ b/cf/test/test_read_write.py
@@ -8,7 +8,7 @@
import tempfile
import unittest
-import numpy
+import numpy as np
faulthandler.enable() # to debug seg faults and timeouts
@@ -93,26 +93,26 @@ def test_read_mask(self):
cf.write(f, tmpfile)
g = cf.read(tmpfile)[0]
- self.assertEqual(numpy.ma.count(g.data.array), N - 2)
+ self.assertEqual(np.ma.count(g.data.array), N - 2)
g = cf.read(tmpfile, mask=False)[0]
- self.assertEqual(numpy.ma.count(g.data.array), N)
+ self.assertEqual(np.ma.count(g.data.array), N)
g.apply_masking(inplace=True)
- self.assertEqual(numpy.ma.count(g.data.array), N - 2)
+ self.assertEqual(np.ma.count(g.data.array), N - 2)
f.set_property("_FillValue", 999)
f.set_property("missing_value", -111)
cf.write(f, tmpfile)
g = cf.read(tmpfile)[0]
- self.assertEqual(numpy.ma.count(g.data.array), N - 2)
+ self.assertEqual(np.ma.count(g.data.array), N - 2)
g = cf.read(tmpfile, mask=False)[0]
- self.assertEqual(numpy.ma.count(g.data.array), N)
+ self.assertEqual(np.ma.count(g.data.array), N)
g.apply_masking(inplace=True)
- self.assertEqual(numpy.ma.count(g.data.array), N - 2)
+ self.assertEqual(np.ma.count(g.data.array), N - 2)
def test_read_directory(self):
pwd = os.getcwd() + "/"
@@ -562,38 +562,38 @@ def test_read_write_netCDF4_compress_shuffle(self):
def test_write_datatype(self):
f = cf.read(self.filename)[0]
- self.assertEqual(f.dtype, numpy.dtype(float))
+ self.assertEqual(f.dtype, np.dtype(float))
cf.write(
f,
tmpfile,
fmt="NETCDF4",
- datatype={numpy.dtype(float): numpy.dtype("float32")},
+ datatype={np.dtype(float): np.dtype("float32")},
)
g = cf.read(tmpfile)[0]
self.assertEqual(
g.dtype,
- numpy.dtype("float32"),
+ np.dtype("float32"),
"datatype read in is " + str(g.dtype),
)
# Keyword single
f = cf.read(self.filename)[0]
- self.assertEqual(f.dtype, numpy.dtype(float))
+ self.assertEqual(f.dtype, np.dtype(float))
cf.write(f, tmpfile, fmt="NETCDF4", single=True)
g = cf.read(tmpfile)[0]
self.assertEqual(
g.dtype,
- numpy.dtype("float32"),
+ np.dtype("float32"),
"datatype read in is " + str(g.dtype),
)
# Keyword double
f = g
- self.assertEqual(f.dtype, numpy.dtype("float32"))
+ self.assertEqual(f.dtype, np.dtype("float32"))
cf.write(f, tmpfile2, fmt="NETCDF4", double=True)
g = cf.read(tmpfile2)[0]
self.assertEqual(
- g.dtype, numpy.dtype(float), "datatype read in is " + str(g.dtype)
+ g.dtype, np.dtype(float), "datatype read in is " + str(g.dtype)
)
for single in (True, False):
@@ -601,7 +601,7 @@ def test_write_datatype(self):
with self.assertRaises(Exception):
cf.write(g, double=double, single=single)
- datatype = {numpy.dtype(float): numpy.dtype("float32")}
+ datatype = {np.dtype(float): np.dtype("float32")}
with self.assertRaises(Exception):
cf.write(g, datatype=datatype, single=True)
@@ -898,8 +898,8 @@ def test_write_omit_data(self):
g = g[0]
# Check that the data are missing
- self.assertFalse(g.array.count())
- self.assertFalse(g.construct("grid_latitude").array.count())
+ self.assertFalse(np.ma.count(g.array))
+ self.assertFalse(np.ma.count(g.construct("grid_latitude").array))
# Check that a dump works
g.dump(display=False)
@@ -909,16 +909,16 @@ def test_write_omit_data(self):
# Check that only the field and dimension coordinate data are
# missing
- self.assertFalse(g.array.count())
- self.assertFalse(g.construct("grid_latitude").array.count())
- self.assertTrue(g.construct("latitude").array.count())
+ self.assertFalse(np.ma.count(g.array))
+ self.assertFalse(np.ma.count(g.construct("grid_latitude").array))
+ self.assertTrue(np.ma.count(g.construct("latitude").array))
cf.write(f, tmpfile, omit_data="field")
g = cf.read(tmpfile)[0]
# Check that only the field data are missing
- self.assertFalse(g.array.count())
- self.assertTrue(g.construct("grid_latitude").array.count())
+ self.assertFalse(np.ma.count(g.array))
+ self.assertTrue(np.ma.count(g.construct("grid_latitude").array))
@unittest.skipUnless(
False, "URL TEST: UNRELIABLE FLAKEY URL DESTINATION. TODO REPLACE URL"
diff --git a/docs/cheat_sheet.html b/docs/cheat_sheet.html
index ef89f98a11..52748b2401 100644
--- a/docs/cheat_sheet.html
+++ b/docs/cheat_sheet.html
@@ -133,7 +133,7 @@ Related Topics
Version 3.16.2 for version 1.11 of the CF conventions.
This cheat sheet provides a summary of some key functions and methods in
cf-python (also available as a printable PDF for
-pdf
).
+pdf
).
@@ -440,4 +440,4 @@ Related Topics