Merge pull request #1074 from owenlittlejohns/update-to-virtualizarr-2.x

betolink · web-flow · commit 249c99d81c68 · 2025-09-01T11:51:30.000-05:00
Update open_virtual_mfdataset to use virtualizarr v2.x.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -27,6 +27,9 @@ and this project uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
 - `GESDISC` should be `GES_DISC` in docstrings.
   ([#1037](https://github.com/nsidc/earthaccess/issues/1037))
   ([@abarciauskas-bgse](https://github.com/abarciauskas-bgse))
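+- `open_virtual_mfdataset` now uses `virtualizarr` v2 with `obstore` in place of `fsspec`, and Zarr has been updated to v3 (xref #967). The `load` parameter has been removed from `open_virtual_dataset` and `open_virtual_mfdataset`.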
+  ([#1074](https://github.com/nsidc/earthaccess/issues/1074))
+  ([@owenlittlejohns](https://github.com/owenlittlejohns))
 
 ### Added
 
@@ -46,7 +49,7 @@ and this project uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
   ([@Sherwin-14](https://github.com/Sherwin-14))
   ([@mfisher87](https://github.com/mfisher87))
 - `download` now returns Path consistently.
-  ([#595])(https://github.com/nsidc/earthaccess/issues/595)
+  ([#595](https://github.com/nsidc/earthaccess/issues/595))
   ([@Sherwin-14](https://github.com/Sherwin-14))
 - Users may now authenticate with an existing Earthdata login token with
   environment variable `EARTHDATA_TOKEN`
@@ -764,9 +767,6 @@ _Conception!_
 - Basic object formatting.
 
 [Unreleased]: https://github.com/nsidc/earthaccess/compare/v0.14.0...HEAD
-[0.14.0]: https://github.com/nsidc/earthaccess/compare/v0.13.0...v0.14.0
-[0.13.0]: https://github.com/nsidc/earthaccess/compare/v0.12.0...v0.13.0
-[0.12.0]: https://github.com/nsidc/earthaccess/compare/v0.11.0...v0.12.0
 [0.11.0]: https://github.com/nsidc/earthaccess/releases/tag/v0.11.0
 [0.10.0]: https://github.com/nsidc/earthaccess/releases/tag/v0.10.0
 [0.9.0]: https://github.com/nsidc/earthaccess/releases/tag/v0.9.0
diff --git a/earthaccess/dmrpp_zarr.py b/earthaccess/dmrpp_zarr.py
@@ -1,6 +1,13 @@
 from __future__ import annotations
 
-from typing import TYPE_CHECKING, Any
+import warnings
+from typing import TYPE_CHECKING, Any, Literal
+from urllib.parse import urlparse
+
+from obstore.auth.earthdata import NasaEarthdataCredentialProvider
+from obstore.store import HTTPStore, S3Store
+from virtualizarr.parsers import DMRPPParser
+from virtualizarr.registry import ObjectStoreRegistry
 
 import earthaccess
 
@@ -12,9 +19,8 @@ def open_virtual_mfdataset(
     granules: list[earthaccess.DataGranule],
     group: str | None = None,
     access: str = "indirect",
-    load: bool = False,
     preprocess: callable | None = None,  # type: ignore
-    parallel: bool = True,
+    parallel: Literal["dask", "lithops", False] = "dask",
     **xr_combine_nested_kwargs: Any,
 ) -> xr.Dataset:
     """Open multiple granules as a single virtual xarray Dataset.
@@ -30,14 +36,10 @@ def open_virtual_mfdataset(
             Path to the netCDF4 group in the given file to open. If None, the root group will be opened. If the DMR++ file does not have groups, this parameter is ignored.
         access:
             The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine.
-        load:
-            Create an xarray dataset with indexes and lazy loaded data.
-
-            When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays. This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
         preprocess:
             A function to apply to each virtual dataset before combining
         parallel:
-            Open the virtual datasets in parallel (using dask.delayed)
+            How to open the virtual datasets in parallel: `"dask"` (using dask.delayed), `"lithops"`, or `False` to disable parallel opening.
         xr_combine_nested_kwargs:
             Xarray arguments describing how to concatenate the datasets. Keyword arguments for xarray.combine_nested.
             See [https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html](https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html)
@@ -48,7 +50,7 @@ def open_virtual_mfdataset(
     Examples:
         ```python
         >>> results = earthaccess.search_data(count=5, temporal=("2024"), short_name="MUR-JPL-L4-GLOB-v4.1")
-        >>> vds = earthaccess.open_virtual_mfdataset(results, access="indirect", load=False, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
+        >>> vds = earthaccess.open_virtual_mfdataset(results, access="indirect", concat_dim="time", coords="minimal", compat="override", combine_attrs="drop_conflicts")
         >>> vds
         <xarray.Dataset> Size: 29GB
         Dimensions:           (time: 5, lat: 17999, lon: 36000)
@@ -68,7 +70,7 @@ def open_virtual_mfdataset(
             title:                      Daily MUR SST, Final product
 
         >>> vds.virtualize.to_kerchunk("mur_combined.json", format="json")
-        >>> vds = open_virtual_mfdataset(results, access="indirect", load=True, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
+        >>> vds = open_virtual_mfdataset(results, access="indirect", concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
         >>> vds
         <xarray.Dataset> Size: 143GB
         Dimensions:           (time: 5, lat: 17999, lon: 36000)
@@ -89,66 +91,64 @@ def open_virtual_mfdataset(
         ```
     """
     import virtualizarr as vz
-    import xarray as xr
+
+    if len(granules) == 0:
+        raise ValueError("No granules provided. At least one granule is required.")
+
+    parsed_url = urlparse(granules[0].data_links(access=access)[0])
 
     if access == "direct":
-        fs = earthaccess.get_s3_filesystem(results=granules)  # type: ignore
-        fs.storage_options["anon"] = False
-    else:
-        fs = earthaccess.get_fsspec_https_session()
-    if parallel:
-        import dask
-
-        # wrap _open_virtual_dataset and preprocess with delayed
-        open_ = dask.delayed(vz.open_virtual_dataset)  # type: ignore
-        if preprocess is not None:
-            preprocess = dask.delayed(preprocess)  # type: ignore
-    else:
-        open_ = vz.open_virtual_dataset  # type: ignore
-    vdatasets = []
-    # Get list of virtual datasets (or dask delayed objects)
-    for g in granules:
-        vdatasets.append(
-            open_(
-                filepath=g.data_links(access=access)[0] + ".dmrpp",
-                filetype="dmrpp",  # type: ignore
-                group=group,
-                indexes={},
-                reader_options={"storage_options": fs.storage_options},
-            )
+        credentials_endpoint, region = get_granule_credentials_endpoint_and_region(
+            granules[0]
         )
-    if preprocess is not None:
-        vdatasets = [preprocess(ds) for ds in vdatasets]
-    if parallel:
-        vdatasets = dask.compute(vdatasets)[0]  # type: ignore
-    if len(vdatasets) == 1:
-        vds = vdatasets[0]
+        bucket = parsed_url.netloc
+
+        s3_store = S3Store(
+            bucket=bucket,
+            region=region,
+            credential_provider=NasaEarthdataCredentialProvider(credentials_endpoint),
+            virtual_hosted_style_request=False,
+            client_options={"allow_http": True},
+        )
+        obstore_registry = ObjectStoreRegistry({f"s3://{bucket}": s3_store})
     else:
-        vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs)
-    if load:
-        refs = vds.virtualize.to_kerchunk(filepath=None, format="dict")
-        protocol = "s3" if "s3" in fs.protocol else fs.protocol
-        return xr.open_dataset(
-            "reference://",
-            engine="zarr",
-            chunks={},
-            backend_kwargs={
-                "consolidated": False,
-                "storage_options": {
-                    "fo": refs,  # codespell:ignore
-                    "remote_protocol": protocol,
-                    "remote_options": fs.storage_options,
+        domain = parsed_url.netloc
+        http_store = HTTPStore.from_url(
+            f"https://{domain}",
+            client_options={
+                "default_headers": {
+                    "Authorization": f"Bearer {earthaccess.__auth__.token['access_token']}",
                 },
             },
         )
-    return vds
+        obstore_registry = ObjectStoreRegistry({f"https://{domain}": http_store})
+
+    granule_dmrpp_urls = [
+        granule.data_links(access=access)[0] + ".dmrpp" for granule in granules
+    ]
+
+    with warnings.catch_warnings():
+        warnings.filterwarnings(
+            "ignore",
+            message="Numcodecs codecs*",
+            category=UserWarning,
+        )
+        vmfdataset = vz.open_virtual_mfdataset(
+            urls=granule_dmrpp_urls,
+            registry=obstore_registry,
+            parser=DMRPPParser(group=group),
+            preprocess=preprocess,
+            parallel=parallel,
+            **xr_combine_nested_kwargs,
+        )
+
+    return vmfdataset
 
 
 def open_virtual_dataset(
     granule: earthaccess.DataGranule,
     group: str | None = None,
     access: str = "indirect",
-    load: bool = False,
 ) -> xr.Dataset:
     """Open a granule as a single virtual xarray Dataset.
 
@@ -163,18 +163,14 @@ def open_virtual_dataset(
             Path to the netCDF4 group in the given file to open. If None, the root group will be opened. If the DMR++ file does not have groups, this parameter is ignored.
         access:
             The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine.
-        load:
-            Create an xarray dataset with indexes and lazy loaded data.
-
-            When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays. This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
 
     Returns:
         xarray.Dataset
 
     Examples:
         ```python
         >>> results = earthaccess.search_data(count=2, temporal=("2023"), short_name="SWOT_L2_LR_SSH_Expert_2.0")
-        >>> vds =  earthaccess.open_virtual_dataset(results[0], access="indirect", load=False)
+        >>> vds =  earthaccess.open_virtual_dataset(results[0], access="indirect")
         >>> vds
         <xarray.Dataset> Size: 149MB
         Dimensions:                                (num_lines: 9866, num_pixels: 69,
@@ -194,7 +190,40 @@ def open_virtual_dataset(
         granules=[granule],
         group=group,
         access=access,
-        load=load,
         parallel=False,
         preprocess=None,
     )
+
+
+def get_granule_credentials_endpoint_and_region(
+    granule: earthaccess.DataGranule,
+) -> tuple[str, str]:
+    """Retrieve the S3 credentials endpoint and region for a direct access granule link.
+
+    Parameters:
+        granule:
+            The first granule being included in the virtual dataset.
+
+    Returns:
+        credentials_endpoint:
+            The S3 credentials endpoint. If this information is in the UMM-G record, then it is used from there. If not, a query for the collection is performed and the information is taken from the UMM-C record.
+        region:
+            Region for the data. Defaults to us-west-2. If the credentials endpoint is retrieved from the UMM-C record for the collection, the Region information is also used from UMM-C.
+
+    """
+    credentials_endpoint = granule.get_s3_credentials_endpoint()
+    region = "us-west-2"
+
+    if credentials_endpoint is None:
+        collection_results = earthaccess.search_datasets(
+            count=1,
+            concept_id=granule["meta"]["collection-concept-id"],
+        )
+        collection_s3_bucket = collection_results[0].s3_bucket()
+        credentials_endpoint = collection_s3_bucket.get("S3CredentialsAPIEndpoint")
+        region = collection_s3_bucket.get("Region", "us-west-2")
+
+    if credentials_endpoint is None:
+        raise ValueError("The collection did not provide an S3CredentialsAPIEndpoint")
+
+    return credentials_endpoint, region
diff --git a/environment.yml b/environment.yml
@@ -1,6 +1,7 @@
 name: earthaccess
 channels:
   - conda-forge
+  - nodefaults
 dependencies:
   # This environment bootstraps pip, the actual dev environment
   # is installed and managed with pip
diff --git a/pyproject.toml b/pyproject.toml
@@ -40,8 +40,8 @@ dependencies = [
   "python-cmr >=0.10.0",
   "pqdm >=0.1",
   "requests >=2.26",
-  "s3fs >=2022.11",
-  "fsspec >=2022.11",
+  "s3fs >=2025.2",
+  "fsspec >=2025.2",
   "tinynetrc >=1.3.1",
   "tenacity >=9.0",
   "multimethod >=1.8",
@@ -57,19 +57,20 @@ Changelog = "https://github.com/nsidc/earthaccess/blob/main/CHANGELOG.md"
 
 [project.optional-dependencies]
 kerchunk = [
-  "kerchunk",
+  "kerchunk>=0.2.9",
   "dask",
-  "h5py >=3.8.0",
+  "h5py >=3.14.0",
   "h5netcdf",
   "xarray",
-  "zarr >=2.12.0, <3.0.0a",
+  "zarr >=3.1.1",
 ]
 virtualizarr = [
   "numpy >=1.26.4",
-  "zarr >=2.12.0, <3.0.0a",
-  "virtualizarr >=1.2.0",
+  "zarr >=3.1.1",
+  "virtualizarr >=2.1.1",
   "dask",
-  "h5py >=3.8.0",
+  "h5py >=3.14.0",
+  "obstore >= 0.8.0",
 ]
 dev = [
   "bump-my-version >=0.10.0",
@@ -79,7 +80,7 @@ dev = [
   "uv >=0.4.7",
 ]
 test = [
-  "zarr >=2.12.0, <3.0.0a",
+  "zarr >=3.1.1",
   "numpy >=1.26.4",
   "mypy >=1.11.2",
   "pytest >=8.3",
diff --git a/tests/integration/test_kerchunk.py b/tests/integration/test_kerchunk.py
@@ -53,23 +53,22 @@ def test_consolidate_metadata(tmp_path, granules, output):
 
     # Open with kerchunk consolidated metadata file
     kwargs = {"outfile": tmp_path / "metadata.json"} if output == "file" else {}
-    metadata = earthaccess.consolidate_metadata(
+    earthaccess.consolidate_metadata(
         granules, access="indirect", kerchunk_options={"concat_dims": "Time"}, **kwargs
     )
 
     fs = earthaccess.get_fsspec_https_session()
-    result = xr.open_dataset(
-        "reference://",
-        engine="zarr",
-        chunks={},
-        backend_kwargs={
-            "consolidated": False,
-            "storage_options": {
-                "fo": metadata,
-                "remote_protocol": "https",
-                "remote_options": fs.storage_options,
+    # This test should eventually be refactored to use virtualizarr
+    if output == "file":
+        result = xr.open_dataset(
+            str(tmp_path / "metadata.json"),
+            engine="kerchunk",
+            backend_kwargs={
+                "storage_options": {
+                    "remote_protocol": "https",
+                    "remote_options": fs.storage_options,
+                },
             },
-        },
-    )
+        )
 
-    xr.testing.assert_equal(result, expected)
+        xr.testing.assert_equal(result, expected)
diff --git a/tests/integration/test_virtualizarr.py b/tests/integration/test_virtualizarr.py
@@ -32,7 +32,5 @@ def granule(request):
 
 def test_open_virtual_dataset(granule):
     # Simply check that the dmrpp can be found, parsed, and loaded. Actual parser result is checked in virtualizarr
-    vds = earthaccess.open_virtual_dataset(granule, load=False)
+    vds = earthaccess.open_virtual_dataset(granule)
     assert vds is not None
-    vds_load = earthaccess.open_virtual_dataset(granule, load=True)
-    assert vds_load is not None
diff --git a/tests/unit/test_dmrpp_zarr.py b/tests/unit/test_dmrpp_zarr.py
diff --git a/uv.lock b/uv.lock