diff --git a/Changelog.rst b/Changelog.rst index efef383fb9..d75cef1554 100644 --- a/Changelog.rst +++ b/Changelog.rst @@ -1,3 +1,17 @@ +Version NEXTVERSION +-------------- + +**2026-??-??** + +* New default backend for netCDF-4 in `cf.read` that allows parallel + reading: (https://github.com/NCAS-CMS/cf-python/issues/912) +* New optional backend for netCDF-3 in `cf.read` that allows parallel + reading: ``netcdf_file`` + (https://github.com/NCAS-CMS/cf-python/issues/912) +* Changed dependency: ``cfdm>=1.13.1.0, <1.13.2.0`` + +---- + Version 3.19.0 -------------- diff --git a/cf/__init__.py b/cf/__init__.py index 125aa6286f..3684c41b6d 100644 --- a/cf/__init__.py +++ b/cf/__init__.py @@ -153,6 +153,8 @@ H5netcdfArray, NetCDFArray, NetCDF4Array, + Netcdf_fileArray, + PyfiveArray, PointTopologyArray, RaggedContiguousArray, RaggedIndexedArray, diff --git a/cf/cfimplementation.py b/cf/cfimplementation.py index 543a1fca9b..75667c0b63 100644 --- a/cf/cfimplementation.py +++ b/cf/cfimplementation.py @@ -34,7 +34,9 @@ GatheredArray, H5netcdfArray, NetCDF4Array, + Netcdf_fileArray, PointTopologyArray, + PyfiveArray, RaggedContiguousArray, RaggedIndexedArray, RaggedIndexedContiguousArray, @@ -147,7 +149,9 @@ def set_construct(self, parent, construct, axes=None, copy=True, **kwargs): GatheredArray=GatheredArray, H5netcdfArray=H5netcdfArray, NetCDF4Array=NetCDF4Array, + Netcdf_fileArray=Netcdf_fileArray, PointTopologyArray=PointTopologyArray, + PyfiveArray=PyfiveArray, Quantization=Quantization, RaggedContiguousArray=RaggedContiguousArray, RaggedIndexedArray=RaggedIndexedArray, @@ -204,7 +208,9 @@ def implementation(): 'GatheredArray': cf.data.array.gatheredarray.GatheredArray, 'H5netcdfArray': cf.data.array.h5netcdfarray.H5netcdfArray, 'NetCDF4Array': cf.data.array.netcdf4array.NetCDF4Array, + 'Netcdf_fileArray': cf.data.array.netcdf_filearray.Netcdf_fileArray, 'PointTopologyArray': , + 'PyfiveArray': cf.data.array.pyfivearray.PyfiveArray, 'Quantization': 
cf.quantization.Quantization, 'RaggedContiguousArray': cf.data.array.raggedcontiguousarray.RaggedContiguousArray, 'RaggedIndexedArray': cf.data.array.raggedindexedarray.RaggedIndexedArray, diff --git a/cf/data/array/__init__.py b/cf/data/array/__init__.py index 2b3d03c54f..df4ac9b536 100644 --- a/cf/data/array/__init__.py +++ b/cf/data/array/__init__.py @@ -6,7 +6,9 @@ from .h5netcdfarray import H5netcdfArray from .netcdfarray import NetCDFArray from .netcdf4array import NetCDF4Array +from .netcdf_filearray import Netcdf_fileArray from .pointtopologyarray import PointTopologyArray +from .pyfivearray import PyfiveArray from .raggedcontiguousarray import RaggedContiguousArray from .raggedindexedarray import RaggedIndexedArray from .raggedindexedcontiguousarray import RaggedIndexedContiguousArray diff --git a/cf/data/array/h5netcdfarray.py b/cf/data/array/h5netcdfarray.py index 8b7d7e8685..e54ceb4e13 100644 --- a/cf/data/array/h5netcdfarray.py +++ b/cf/data/array/h5netcdfarray.py @@ -9,7 +9,7 @@ class H5netcdfArray( Container, cfdm.H5netcdfArray, ): - """A netCDF array accessed with `h5netcdf`. + """A netCDF array accessed with `h5netcdf` using the `h5py` backend. .. versionadded:: 3.16.3 diff --git a/cf/data/array/pyfivearray.py b/cf/data/array/pyfivearray.py new file mode 100644 index 0000000000..e514a0d54f --- /dev/null +++ b/cf/data/array/pyfivearray.py @@ -0,0 +1,16 @@ +import cfdm + +from ...mixin_container import Container +from .mixin import ActiveStorageMixin + + +class PyfiveArray( + ActiveStorageMixin, + Container, + cfdm.PyfiveArray, +): + """A netCDF array accessed with `pyfive`. + + .. 
versionadded:: NEXTVERSION + + """ diff --git a/cf/data/collapse/collapse_active.py b/cf/data/collapse/collapse_active.py index 425f2769e6..79fae066f6 100644 --- a/cf/data/collapse/collapse_active.py +++ b/cf/data/collapse/collapse_active.py @@ -186,35 +186,35 @@ def active_chunk_function(method, *args, **kwargs): # with `cf.active_storage(True)` from activestorage import Active - filename = x.get_filename() - address = x.get_address() - max_requests = active_storage_max_requests() + info = is_log_level_info(logger) + + max_requests = active_storage_max_requests().value + storage_options = None + address = None + dataset = x.get_variable(None) + if dataset is None: + # Dataset is a string, not a variable object. + storage_options = x.get_storage_options() + address = x.get_address() + dataset = x.get_filename() + active_kwargs = { - "uri": "/".join(filename.split("/")[3:]), + "dataset": dataset, "ncvar": address, - "storage_options": x.get_storage_options(), + "axis": axis, + "storage_options": storage_options, "active_storage_url": url, - "storage_type": "s3", "max_threads": max_requests, } - # WARNING: The "uri", "storage_options", and "storage_type" keys - # of the `active_kwargs` dictionary are currently - # formatted according to the whims of the `Active` class - # (i.e. the pyfive branch of PyActiveStorage). Future - # versions of `Active` will have a better API, that will - # require improvements to `active_kwargs`. 
index = x.index() - - details = ( - f"{method!r} (file={filename}, address={address}, url={url}, " - f"Dask chunk={index})" - ) - - info = is_log_level_info(logger) if info: # Do some detailed logging start = time.time() + details = ( + f"{method!r} (dataset={dataset!r}, ncvar={address}, " + f"Dask chunk={index})" + ) logger.info( f"STARTED active storage {details}: {datetime.datetime.now()}" ) # pragma: no cover @@ -227,8 +227,7 @@ def active_chunk_function(method, *args, **kwargs): # reduction on the remote server # # WARNING: The `_version` API of `Active` is likely to change from - # the current version (i.e. the pyfive branch of - # PyActiveStorage) + # the current version active._version = 2 # ---------------------------------------------------------------- diff --git a/cf/data/utils.py b/cf/data/utils.py index bcd8aef289..81a77201a0 100644 --- a/cf/data/utils.py +++ b/cf/data/utils.py @@ -423,10 +423,12 @@ def collapse( kwargs["ddof"] = ddof # The applicable chunk function will have its own call to - # 'cfdm_to_memory', so we can set '_force_to_memory=False'. Also, - # setting _force_to_memory=False will ensure that any active - # storage operations are not compromised. - dx = d.to_dask_array(_force_to_memory=False) + # 'cfdm_to_memory', so we can set '_force_to_memory=False'. + # Setting _force_to_memory=False will also ensure that any active + # storage operations are not compromised. We can set + # _force_mask_hardness=False because collapse operations do not + # need to ever unset masked values. + dx = d.to_dask_array(_force_mask_hardness=False, _force_to_memory=False) dx = func(dx, **kwargs) d._set_dask(dx) diff --git a/cf/flags.py b/cf/flags.py index 361c35b683..fb4fe1ef0c 100644 --- a/cf/flags.py +++ b/cf/flags.py @@ -454,5 +454,5 @@ def sort(self): for attr in ("_flag_values", "_flag_meanings", "_flag_masks"): if hasattr(self, attr): - array = getattr(self, attr).view() - array[...] 
= array[indices] + array = getattr(self, attr)[indices] + setattr(self, attr, array) diff --git a/cf/functions.py b/cf/functions.py index c4f7abe5fb..c527a649d6 100644 --- a/cf/functions.py +++ b/cf/functions.py @@ -3184,60 +3184,6 @@ def environment(display=True, paths=True): environment is printed and `None` is returned. Otherwise the description is returned as a string. - **Examples** - - >>> cf.environment() - Platform: Linux-6.8.0-60-generic-x86_64-with-glibc2.39 - Python: 3.12.8 /home/miniconda3/bin/python - packaging: 24.2 /home/miniconda3/lib/python3.12/site-packages/packaging/__init__.py - numpy: 2.2.6 /home/miniconda3/lib/python3.12/site-packages/numpy/__init__.py - cfdm.core: 1.12.2.0 /home/miniconda3/lib/python3.12/site-packages/cfdm/cfdm/core/__init__.py - udunits2 library: libudunits2.so.0 - HDF5 library: 1.14.2 - netcdf library: 4.9.4-development - netCDF4: 1.7.2 /home/miniconda3/lib/python3.12/site-packages/netCDF4/__init__.py - h5netcdf: 1.3.0 /home/miniconda3/lib/python3.12/site-packages/h5netcdf/__init__.py - h5py: 3.12.1 /home/miniconda3/lib/python3.12/site-packages/h5py/__init__.py - zarr: 3.1.3 /home/miniconda3/lib/python3.12/site-packages/zarr/__init__.py - s3fs: 2024.12.0 /home/miniconda3/lib/python3.12/site-packages/s3fs/__init__.py - scipy: 1.15.1 /home/miniconda3/lib/python3.12/site-packages/scipy/__init__.py - dask: 2025.5.1 /home/miniconda3/lib/python3.12/site-packages/dask/__init__.py - distributed: 2025.5.1 /home/miniconda3/lib/python3.12/site-packages/distributed/__init__.py - cftime: 1.6.4.post1 /home/miniconda3/lib/python3.12/site-packages/cftime/__init__.py - cfunits: 3.3.7 /home/miniconda3/lib/python3.12/site-packages/cfunits/__init__.py - cfdm: 1.12.2.0 /home/miniconda3/lib/python3.12/site-packages/cfdm/__init__.py - esmpy/ESMF: 8.7.0 /home/miniconda3/lib/python3.12/site-packages/esmpy/__init__.py - psutil: 6.1.1 /home/miniconda3/lib/python3.12/site-packages/psutil/__init__.py - matplotlib: 3.10.0 
/home/miniconda3/lib/python3.12/site-packages/matplotlib/__init__.py - cfplot: 3.4.0 /home/miniconda3/lib/python3.12/site-packages/cfplot/__init__.py - cf: 3.18.0 /home/miniconda3/lib/python3.12/site-packages/cf/__init__.py - - >>> cf.environment(paths=False) - Platform: Linux-6.8.0-60-generic-x86_64-with-glibc2.39 - Python: 3.12.8 - packaging: 24.2 - numpy: 2.2.6 - cfdm.core: 1.12.2.0 - udunits2 library: libudunits2.so.0 - HDF5 library: 1.14.2 - netcdf library: 4.9.4-development - netCDF4: 1.7.2 - h5netcdf: 1.3.0 - h5py: 3.12.1 - zarr: 3.1.3 - s3fs: 2024.12.0 - scipy: 1.15.1 - dask: 2025.5.1 - distributed: 2025.5.1 - cftime: 1.6.4.post1 - cfunits: 3.3.7 - cfdm: 1.12.2.0 - esmpy/ESMF: 8.7.0 - psutil: 6.1.1 - matplotlib: 3.10.0 - cfplot: 3.4.0 - cf: 3.18.0 - """ # Get cfdm env out = cfdm.environment(display=False, paths=paths) diff --git a/cf/mixin/propertiesdatabounds.py b/cf/mixin/propertiesdatabounds.py index 1150449a56..720e2f7c3c 100644 --- a/cf/mixin/propertiesdatabounds.py +++ b/cf/mixin/propertiesdatabounds.py @@ -18,7 +18,9 @@ ) from ..functions import equivalent as cf_equivalent from ..functions import inspect as cf_inspect -from ..functions import parse_indices +from ..functions import ( + parse_indices, +) from ..functions import size as cf_size from ..query import Query from ..units import Units diff --git a/cf/read_write/um/umread.py b/cf/read_write/um/umread.py index 9daf1c4612..ae8409b2a8 100644 --- a/cf/read_write/um/umread.py +++ b/cf/read_write/um/umread.py @@ -34,7 +34,6 @@ _cached_model_level_number_coordinate = {} _cached_regular_array = {} _cached_regular_bounds = {} -_cached_data = {} # -------------------------------------------------------------------- # Constants @@ -1138,7 +1137,7 @@ def __init__( config={ "axis": xaxis, "coord": xc, - "period": self.get_data(np.array(360.0), xc.Units), + "period": Data(360.0, xc.Units), }, ) @@ -1798,11 +1797,11 @@ def coord_data( """ if array is not None: - data = self.get_data(array, units, fill_value) 
+ data = Data(array, units, fill_value=fill_value) self.implementation.set_data(c, data, copy=False) if bounds is not None: - data = self.get_data(bounds, units, fill_value, bounds=True) + data = Data(bounds, units, fill_value=fill_value) bounds = self.implementation.initialise_Bounds() self.implementation.set_data(bounds, data, copy=False) self.implementation.set_bounds(c, bounds, copy=False) @@ -3215,7 +3214,7 @@ def xy_coordinate(self, axiscode, axis): if X and bounds is not None: autocyclic["cyclic"] = abs(bounds[0, 0] - bounds[-1, -1]) == 360.0 - autocyclic["period"] = self.get_data(np.array(360.0), units) + autocyclic["period"] = Data(360.0, units) autocyclic["axis"] = axis_key autocyclic["coord"] = dc @@ -3225,63 +3224,6 @@ def xy_coordinate(self, axiscode, axis): return key, dc, axis_key - def get_data(self, array, units, fill_value=None, bounds=False): - """Create data, or get it from the cache. - - .. versionadded:: 3.15.0 - - :Parameters: - - array: `np.ndarray` - The data. - - units: `Units - The units. - - fill_value: scalar - The fill value. - - bounds: `bool` - Whether or not the data are bounds of 1-d coordinates. - - :Returns: - - `Data` - An independent copy of the new data. - - """ - from dask.base import tokenize - - token = tokenize(array, units) - data = _cached_data.get(token) - if data is None: - data = Data(array, units=units, fill_value=fill_value) - if not bounds: - if array.size == 1: - value = array.item(0) - data._set_cached_elements({0: value, -1: value}) - else: - data._set_cached_elements( - { - 0: array.item(0), - 1: array.item(1), - -1: array.item(-1), - } - ) - else: - data._set_cached_elements( - { - 0: array.item(0), - 1: array.item(1), - -2: array.item(-2), - -1: array.item(-1), - } - ) - - _cached_data[token] = data - - return data.copy() - def site_coordinates_from_extra_data(self): """Create site-related coordinates from extra data. 
@@ -3648,7 +3590,7 @@ def _open_um_file( pass raise DatasetTypeError( - f"Can't interpret {filename} as a PP or UM dataset" + f"\nCan't interpret {filename} as a PP or UM dataset" ) self._um_file = f diff --git a/cf/test/test_Data.py b/cf/test/test_Data.py index d64ebdbeba..9fe41f0282 100644 --- a/cf/test/test_Data.py +++ b/cf/test/test_Data.py @@ -171,14 +171,7 @@ def test_Data_equals(self): # for strict equality, including equality of data type. d2 = cf.Data(a.astype(np.float32), "m", chunks=chunksize) self.assertTrue(d2.equals(d2.copy())) - with self.assertLogs(level=-1) as catch: - self.assertFalse(d2.equals(d, verbose=2)) - self.assertTrue( - any( - "Data: Different data types: float32 != int64" in log_msg - for log_msg in catch.output - ) - ) + self.assertFalse(d2.equals(d)) e = cf.Data(a, "s", chunks=chunksize) # different units to d self.assertTrue(e.equals(e.copy())) @@ -387,14 +380,8 @@ def test_Data_equals(self): for log_msg in catch.output ) ) - with self.assertLogs(level=-1) as catch: - self.assertFalse(s1.equals(s3, verbose=2)) - self.assertTrue( - any( - "Data: Different data types: int64 != 3.10 this can be replaced by 'assertNoLogs' method. - logger.warning( - "Log warning to prevent test error on empty log." 
- ) - - self.assertFalse(d2.equals(d, verbose=verbosity_level)) - self.assertIs( - any( - "Data: Different data types: float32 != int64" - in log_msg - for log_msg in catch.output - ), - expect_to_see_msg, - ) + self.assertFalse(d2.equals(d)) # Test ignore_data_type parameter self.assertTrue(d2.equals(d, ignore_data_type=True)) @@ -1490,7 +1457,7 @@ def test_Data__getitem__(self): self.assertTrue(e.equals(f)) # Chained subspaces reading from disk - f = cf.read(self.filename, netcdf_backend="h5netcdf")[0] + f = cf.read(self.filename)[0] d = f.data a = d[:1, [1, 3, 4], :][:, [True, False, True], ::-2].array diff --git a/cf/test/test_read_write.py b/cf/test/test_read_write.py index 93d58c1d2e..441c68d23a 100644 --- a/cf/test/test_read_write.py +++ b/cf/test/test_read_write.py @@ -619,23 +619,29 @@ def test_write_reference_datetime(self): ) def test_read_write_unlimited(self): - for fmt in ("NETCDF4", "NETCDF3_CLASSIC"): - f = self.f1.copy() + f = cf.read(self.filename)[0] + for fmt in self.netcdf_fmts: domain_axes = f.domain_axes() domain_axes["domainaxis0"].nc_set_unlimited(True) - cf.write(f, tmpfile, fmt=fmt) + cf.write(f, tmpfile, fmt=fmt, cfa=None) - f = cf.read(tmpfile)[0] - domain_axes = f.domain_axes() - self.assertTrue(domain_axes["domainaxis0"].nc_is_unlimited()) + if fmt in self.netcdf3_fmts: + # Note: netcdf_file backend does not support unlimited + # dimensions + backend = "netCDF4" + else: + backend = None + + g = cf.read(tmpfile, netcdf_backend=backend)[0] + domain_axes = g.domain_axes() + self.assertTrue(domain_axes["domainaxis0"].nc_is_unlimited(), fmt) - fmt = "NETCDF4" - f = self.f1.copy() domain_axes = f.domain_axes() + domain_axes["domainaxis0"].nc_set_unlimited(True) domain_axes["domainaxis2"].nc_set_unlimited(True) - cf.write(f, tmpfile, fmt=fmt) + cf.write(f, tmpfile, fmt="NETCDF4", cfa=None) f = cf.read(tmpfile)[0] domain_axes = f.domain_axes() @@ -875,11 +881,10 @@ def test_write_omit_data(self): # True, "URL TEST: UNRELIABLE FLAKEY URL 
DESTINATION. TODO REPLACE URL" # ) def test_read_url(self): - """Test reading urls.""" + """Test reading remote url.""" for scheme in ("http", "https"): remote = f"{scheme}:///psl.noaa.gov/thredds/dodsC/Datasets/cru/crutem5/Monthlies/air.mon.anom.nobs.nc" - # Check that cf can access it - f = cf.read(remote) + f = cf.read(remote, netcdf_backend="netCDF4") self.assertEqual(len(f), 1) @unittest.skipUnless( diff --git a/docs/source/installation.rst b/docs/source/installation.rst index 13c9f0ef88..7b07eb4de4 100644 --- a/docs/source/installation.rst +++ b/docs/source/installation.rst @@ -225,8 +225,8 @@ Required * `scipy `_, version 1.10.0 or newer. -* `cfdm `_, version 1.13.0.0 or up to, - but not including, 1.13.1.0. +* `cfdm `_, version 1.13.1.0 or up to, + but not including, 1.13.2.0. * `cfunits `_, version 3.3.7 or newer. diff --git a/recipes-docs/source/recipes-source/plot_08_recipe.py b/recipes-docs/source/recipes-source/plot_08_recipe.py index 63427f62a7..6045f51448 100644 --- a/recipes-docs/source/recipes-source/plot_08_recipe.py +++ b/recipes-docs/source/recipes-source/plot_08_recipe.py @@ -9,11 +9,10 @@ # 1. Import cf-python, cf-plot, numpy and scipy.stats: import cfplot as cfp -import cf - import numpy as np import scipy.stats as stats +import cf # %% # 2. Three functions are defined: diff --git a/recipes-docs/source/recipes-source/plot_12_recipe.py b/recipes-docs/source/recipes-source/plot_12_recipe.py index b09db0b29f..5304194b19 100644 --- a/recipes-docs/source/recipes-source/plot_12_recipe.py +++ b/recipes-docs/source/recipes-source/plot_12_recipe.py @@ -13,8 +13,8 @@ # %% # 1. 
Import cf-python, cf-plot and matplotlib.pyplot: -import matplotlib.pyplot as plt import cfplot as cfp +import matplotlib.pyplot as plt import cf diff --git a/recipes-docs/source/recipes-source/plot_13_recipe.py b/recipes-docs/source/recipes-source/plot_13_recipe.py index bf0398713e..9b658597d8 100644 --- a/recipes-docs/source/recipes-source/plot_13_recipe.py +++ b/recipes-docs/source/recipes-source/plot_13_recipe.py @@ -18,13 +18,11 @@ # in next steps. import cartopy.crs as ccrs -import matplotlib.patches as mpatches - import cfplot as cfp +import matplotlib.patches as mpatches import cf - # %% # 2. Read and select the SST by index and look at its contents: sst = cf.read("~/recipes/ERA5_monthly_averaged_SST.nc")[0] diff --git a/recipes-docs/source/recipes-source/plot_17_recipe.py b/recipes-docs/source/recipes-source/plot_17_recipe.py index c94769e2ba..a66c90b518 100644 --- a/recipes-docs/source/recipes-source/plot_17_recipe.py +++ b/recipes-docs/source/recipes-source/plot_17_recipe.py @@ -11,8 +11,8 @@ # %% # 1. Import cf-python and cf-plot: -import matplotlib.pyplot as plt import cfplot as cfp +import matplotlib.pyplot as plt import cf diff --git a/recipes-docs/source/recipes-source/plot_18_recipe.py b/recipes-docs/source/recipes-source/plot_18_recipe.py index f0eae36e35..3beb9d0db9 100644 --- a/recipes-docs/source/recipes-source/plot_18_recipe.py +++ b/recipes-docs/source/recipes-source/plot_18_recipe.py @@ -10,15 +10,15 @@ """ +import cfplot as cfp + # %% # 1. Import cf-python, cf-plot and other required packages: import matplotlib.pyplot as plt import scipy.stats.mstats as mstats -import cfplot as cfp import cf - # %% # 2. Read the data in and unpack the Fields from FieldLists using indexing. 
# In our example We are investigating the influence of the land height on diff --git a/recipes-docs/source/recipes-source/plot_19_recipe.py b/recipes-docs/source/recipes-source/plot_19_recipe.py index 02d493dc21..ceb9db1c5c 100644 --- a/recipes-docs/source/recipes-source/plot_19_recipe.py +++ b/recipes-docs/source/recipes-source/plot_19_recipe.py @@ -9,10 +9,11 @@ maxima. """ +import cfplot as cfp + # %% # 1. Import cf-python, cf-plot and other required packages: import matplotlib.pyplot as plt -import cfplot as cfp import cf diff --git a/recipes-docs/source/recipes-source/plot_23_recipe.py b/recipes-docs/source/recipes-source/plot_23_recipe.py index 4ae11c3863..29537803af 100644 --- a/recipes-docs/source/recipes-source/plot_23_recipe.py +++ b/recipes-docs/source/recipes-source/plot_23_recipe.py @@ -18,14 +18,13 @@ # sphinx_gallery_thumbnail_number = 2 # sphinx_gallery_end_ignore +import cfplot as cfp +import dask.array as da import matplotlib.pyplot as plt import numpy as np -import dask.array as da -import cfplot as cfp import cf - # %% # 2. Read example data field constructs, and set region for our plots: @@ -171,4 +170,3 @@ # create your figure with cf-plot with placeholders for your other plots, # then add subplots by accessing the ``cfp.plotvars.master_plot`` object, # and finally redraw the figure containing the new plots. -