Skip to content

Commit 249c99d

Browse files
authored
Merge pull request #1074 from owenlittlejohns/update-to-virtualizarr-2.x
Update open_virtual_mfdataset to use virtualizarr v2.x.
2 parents e2f4bae + 2cb9fec commit 249c99d

8 files changed

Lines changed: 2535 additions & 1616 deletions

File tree

CHANGELOG.md

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,9 @@ and this project uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
2727
- `GESDISC` should be `GES_DISC` in docstrings.
2828
([#1037](https://github.com/nsidc/earthaccess/issues/1037))
2929
([@abarciauskas-bgse](https://github.com/abarciauskas-bgse))
30+
- `open_virtual_mfdataset` now uses `virtualizarr` v2, and `obstore` in place of `fsspec`. Updated Zarr to V3 xref #967.
31+
([#1074](https://github.com/nsidc/earthaccess/issues/1074))
32+
([@owenlittlejohns](https://github.com/owenlittlejohns))
3033

3134
### Added
3235

@@ -46,7 +49,7 @@ and this project uses [Semantic Versioning](https://semver.org/spec/v2.0.0.html)
4649
([@Sherwin-14](https://github.com/Sherwin-14))
4750
([@mfisher87](https://github.com/mfisher87))
4851
- `download` now returns Path consistently.
49-
([#595])(https://github.com/nsidc/earthaccess/issues/595)
52+
([#595])(<https://github.com/nsidc/earthaccess/issues/595>)
5053
([@Sherwin-14](https://github.com/Sherwin-14))
5154
- Users may now authenticate with an existing Earthdata login token with
5255
environment variable `EARTHDATA_TOKEN`
@@ -764,9 +767,6 @@ _Conception!_
764767
- Basic object formatting.
765768

766769
[Unreleased]: https://github.com/nsidc/earthaccess/compare/v0.14.0...HEAD
767-
[0.14.0]: https://github.com/nsidc/earthaccess/compare/v0.13.0...v0.14.0
768-
[0.13.0]: https://github.com/nsidc/earthaccess/compare/v0.12.0...v0.13.0
769-
[0.12.0]: https://github.com/nsidc/earthaccess/compare/v0.11.0...v0.12.0
770770
[0.11.0]: https://github.com/nsidc/earthaccess/releases/tag/v0.11.0
771771
[0.10.0]: https://github.com/nsidc/earthaccess/releases/tag/v0.10.0
772772
[0.9.0]: https://github.com/nsidc/earthaccess/releases/tag/v0.9.0

earthaccess/dmrpp_zarr.py

Lines changed: 92 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,13 @@
11
from __future__ import annotations
22

3-
from typing import TYPE_CHECKING, Any
3+
import warnings
4+
from typing import TYPE_CHECKING, Any, Literal
5+
from urllib.parse import urlparse
6+
7+
from obstore.auth.earthdata import NasaEarthdataCredentialProvider
8+
from obstore.store import HTTPStore, S3Store
9+
from virtualizarr.parsers import DMRPPParser
10+
from virtualizarr.registry import ObjectStoreRegistry
411

512
import earthaccess
613

@@ -12,9 +19,8 @@ def open_virtual_mfdataset(
1219
granules: list[earthaccess.DataGranule],
1320
group: str | None = None,
1421
access: str = "indirect",
15-
load: bool = False,
1622
preprocess: callable | None = None, # type: ignore
17-
parallel: bool = True,
23+
parallel: Literal["dask", "lithops", False] = "dask",
1824
**xr_combine_nested_kwargs: Any,
1925
) -> xr.Dataset:
2026
"""Open multiple granules as a single virtual xarray Dataset.
@@ -30,14 +36,10 @@ def open_virtual_mfdataset(
3036
Path to the netCDF4 group in the given file to open. If None, the root group will be opened. If the DMR++ file does not have groups, this parameter is ignored.
3137
access:
3238
The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine.
33-
load:
34-
Create an xarray dataset with indexes and lazy loaded data.
35-
36-
When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays. This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
3739
preprocess:
3840
A function to apply to each virtual dataset before combining
3941
parallel:
40-
Open the virtual datasets in parallel (using dask.delayed)
42+
Open the virtual datasets in parallel (using dask.delayed or lithops)
4143
xr_combine_nested_kwargs:
4244
Xarray arguments describing how to concatenate the datasets. Keyword arguments for xarray.combine_nested.
4345
See [https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html](https://docs.xarray.dev/en/stable/generated/xarray.combine_nested.html)
@@ -48,7 +50,7 @@ def open_virtual_mfdataset(
4850
Examples:
4951
```python
5052
>>> results = earthaccess.search_data(count=5, temporal=("2024"), short_name="MUR-JPL-L4-GLOB-v4.1")
51-
>>> vds = earthaccess.open_virtual_mfdataset(results, access="indirect", load=False, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
53+
>>> vds = earthaccess.open_virtual_mfdataset(results, access="indirect", load=False, concat_dim="time", coords="minimal", compat="override", combine_attrs="drop_conflicts")
5254
>>> vds
5355
<xarray.Dataset> Size: 29GB
5456
Dimensions: (time: 5, lat: 17999, lon: 36000)
@@ -68,7 +70,7 @@ def open_virtual_mfdataset(
6870
title: Daily MUR SST, Final product
6971
7072
>>> vds.virtualize.to_kerchunk("mur_combined.json", format="json")
71-
>>> vds = open_virtual_mfdataset(results, access="indirect", load=True, concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
73+
>>> vds = open_virtual_mfdataset(results, access="indirect", concat_dim="time", coords='minimal', compat='override', combine_attrs="drop_conflicts")
7274
>>> vds
7375
<xarray.Dataset> Size: 143GB
7476
Dimensions: (time: 5, lat: 17999, lon: 36000)
@@ -89,66 +91,64 @@ def open_virtual_mfdataset(
8991
```
9092
"""
9193
import virtualizarr as vz
92-
import xarray as xr
94+
95+
if len(granules) == 0:
96+
raise ValueError("No granules provided. At least one granule is required.")
97+
98+
parsed_url = urlparse(granules[0].data_links(access=access)[0])
9399

94100
if access == "direct":
95-
fs = earthaccess.get_s3_filesystem(results=granules) # type: ignore
96-
fs.storage_options["anon"] = False
97-
else:
98-
fs = earthaccess.get_fsspec_https_session()
99-
if parallel:
100-
import dask
101-
102-
# wrap _open_virtual_dataset and preprocess with delayed
103-
open_ = dask.delayed(vz.open_virtual_dataset) # type: ignore
104-
if preprocess is not None:
105-
preprocess = dask.delayed(preprocess) # type: ignore
106-
else:
107-
open_ = vz.open_virtual_dataset # type: ignore
108-
vdatasets = []
109-
# Get list of virtual datasets (or dask delayed objects)
110-
for g in granules:
111-
vdatasets.append(
112-
open_(
113-
filepath=g.data_links(access=access)[0] + ".dmrpp",
114-
filetype="dmrpp", # type: ignore
115-
group=group,
116-
indexes={},
117-
reader_options={"storage_options": fs.storage_options},
118-
)
101+
credentials_endpoint, region = get_granule_credentials_endpoint_and_region(
102+
granules[0]
119103
)
120-
if preprocess is not None:
121-
vdatasets = [preprocess(ds) for ds in vdatasets]
122-
if parallel:
123-
vdatasets = dask.compute(vdatasets)[0] # type: ignore
124-
if len(vdatasets) == 1:
125-
vds = vdatasets[0]
104+
bucket = parsed_url.netloc
105+
106+
s3_store = S3Store(
107+
bucket=bucket,
108+
region=region,
109+
credential_provider=NasaEarthdataCredentialProvider(credentials_endpoint),
110+
virtual_hosted_style_request=False,
111+
client_options={"allow_http": True},
112+
)
113+
obstore_registry = ObjectStoreRegistry({f"s3://{bucket}": s3_store})
126114
else:
127-
vds = xr.combine_nested(vdatasets, **xr_combine_nested_kwargs)
128-
if load:
129-
refs = vds.virtualize.to_kerchunk(filepath=None, format="dict")
130-
protocol = "s3" if "s3" in fs.protocol else fs.protocol
131-
return xr.open_dataset(
132-
"reference://",
133-
engine="zarr",
134-
chunks={},
135-
backend_kwargs={
136-
"consolidated": False,
137-
"storage_options": {
138-
"fo": refs, # codespell:ignore
139-
"remote_protocol": protocol,
140-
"remote_options": fs.storage_options,
115+
domain = parsed_url.netloc
116+
http_store = HTTPStore.from_url(
117+
f"https://{domain}",
118+
client_options={
119+
"default_headers": {
120+
"Authorization": f"Bearer {earthaccess.__auth__.token['access_token']}",
141121
},
142122
},
143123
)
144-
return vds
124+
obstore_registry = ObjectStoreRegistry({f"https://{domain}": http_store})
125+
126+
granule_dmrpp_urls = [
127+
granule.data_links(access=access)[0] + ".dmrpp" for granule in granules
128+
]
129+
130+
with warnings.catch_warnings():
131+
warnings.filterwarnings(
132+
"ignore",
133+
message="Numcodecs codecs*",
134+
category=UserWarning,
135+
)
136+
vmfdataset = vz.open_virtual_mfdataset(
137+
urls=granule_dmrpp_urls,
138+
registry=obstore_registry,
139+
parser=DMRPPParser(group=group),
140+
preprocess=preprocess,
141+
parallel=parallel,
142+
**xr_combine_nested_kwargs,
143+
)
144+
145+
return vmfdataset
145146

146147

147148
def open_virtual_dataset(
148149
granule: earthaccess.DataGranule,
149150
group: str | None = None,
150151
access: str = "indirect",
151-
load: bool = False,
152152
) -> xr.Dataset:
153153
"""Open a granule as a single virtual xarray Dataset.
154154
@@ -163,18 +163,14 @@ def open_virtual_dataset(
163163
Path to the netCDF4 group in the given file to open. If None, the root group will be opened. If the DMR++ file does not have groups, this parameter is ignored.
164164
access:
165165
The access method to use. One of "direct" or "indirect". Use direct when running on AWS, use indirect when running on a local machine.
166-
load:
167-
Create an xarray dataset with indexes and lazy loaded data.
168-
169-
When true, creates a lazy loaded, numpy/dask backed xarray dataset with indexes. Note that when `load=True` all the data is now available to access but not loaded into memory. When `load=False` a virtual xarray dataset is created with ManifestArrays. This virtual dataset is a view over the underlying metadata and chunks and allows creation and concatenation of zarr reference files. This virtual dataset cannot load data on it's own and see https://virtualizarr.readthedocs.io/en/latest/ for more information on virtual xarray datasets.
170166
171167
Returns:
172168
xarray.Dataset
173169
174170
Examples:
175171
```python
176172
>>> results = earthaccess.search_data(count=2, temporal=("2023"), short_name="SWOT_L2_LR_SSH_Expert_2.0")
177-
>>> vds = earthaccess.open_virtual_dataset(results[0], access="indirect", load=False)
173+
>>> vds = earthaccess.open_virtual_dataset(results[0], access="indirect")
178174
>>> vds
179175
<xarray.Dataset> Size: 149MB
180176
Dimensions: (num_lines: 9866, num_pixels: 69,
@@ -194,7 +190,40 @@ def open_virtual_dataset(
194190
granules=[granule],
195191
group=group,
196192
access=access,
197-
load=load,
198193
parallel=False,
199194
preprocess=None,
200195
)
196+
197+
198+
def get_granule_credentials_endpoint_and_region(
199+
granule: earthaccess.DataGranule,
200+
) -> tuple[str, str]:
201+
"""Retrieve credentials endpoint for direct access granule link.
202+
203+
Parameters:
204+
granule:
205+
The first granule being included in the virtual dataset.
206+
207+
Returns:
208+
credentials_endpoint:
209+
The S3 credentials endpoint. If this information is in the UMM-G record, then it is used from there. If not, a query for the collection is performed and the information is taken from the UMM-C record.
210+
region:
211+
Region for the data. Defaults to us-west-2. If the credentials endpoint is retrieved from the UMM-C record for the collection, the Region information is also used from UMM-C.
212+
213+
"""
214+
credentials_endpoint = granule.get_s3_credentials_endpoint()
215+
region = "us-west-2"
216+
217+
if credentials_endpoint is None:
218+
collection_results = earthaccess.search_datasets(
219+
count=1,
220+
concept_id=granule["meta"]["collection-concept-id"],
221+
)
222+
collection_s3_bucket = collection_results[0].s3_bucket()
223+
credentials_endpoint = collection_s3_bucket.get("S3CredentialsAPIEndpoint")
224+
region = collection_s3_bucket.get("Region", "us-west-2")
225+
226+
if credentials_endpoint is None:
227+
raise ValueError("The collection did not provide an S3CredentialsAPIEndpoint")
228+
229+
return credentials_endpoint, region

environment.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
name: earthaccess
22
channels:
33
- conda-forge
4+
- nodefaults
45
dependencies:
56
# This environment bootstraps pip, the actual dev environment
67
# is installed and managed with pip

pyproject.toml

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ dependencies = [
4040
"python-cmr >=0.10.0",
4141
"pqdm >=0.1",
4242
"requests >=2.26",
43-
"s3fs >=2022.11",
44-
"fsspec >=2022.11",
43+
"s3fs >=2025.2",
44+
"fsspec >=2025.2",
4545
"tinynetrc >=1.3.1",
4646
"tenacity >=9.0",
4747
"multimethod >=1.8",
@@ -57,19 +57,20 @@ Changelog = "https://github.com/nsidc/earthaccess/blob/main/CHANGELOG.md"
5757

5858
[project.optional-dependencies]
5959
kerchunk = [
60-
"kerchunk",
60+
"kerchunk>=0.2.9",
6161
"dask",
62-
"h5py >=3.8.0",
62+
"h5py >=3.14.0",
6363
"h5netcdf",
6464
"xarray",
65-
"zarr >=2.12.0, <3.0.0a",
65+
"zarr >=3.1.1",
6666
]
6767
virtualizarr = [
6868
"numpy >=1.26.4",
69-
"zarr >=2.12.0, <3.0.0a",
70-
"virtualizarr >=1.2.0",
69+
"zarr >=3.1.1",
70+
"virtualizarr >=2.1.1",
7171
"dask",
72-
"h5py >=3.8.0",
72+
"h5py >=3.14.0",
73+
"obstore >= 0.8.0",
7374
]
7475
dev = [
7576
"bump-my-version >=0.10.0",
@@ -79,7 +80,7 @@ dev = [
7980
"uv >=0.4.7",
8081
]
8182
test = [
82-
"zarr >=2.12.0, <3.0.0a",
83+
"zarr >=3.1.1",
8384
"numpy >=1.26.4",
8485
"mypy >=1.11.2",
8586
"pytest >=8.3",

tests/integration/test_kerchunk.py

Lines changed: 13 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -53,23 +53,22 @@ def test_consolidate_metadata(tmp_path, granules, output):
5353

5454
# Open with kerchunk consolidated metadata file
5555
kwargs = {"outfile": tmp_path / "metadata.json"} if output == "file" else {}
56-
metadata = earthaccess.consolidate_metadata(
56+
earthaccess.consolidate_metadata(
5757
granules, access="indirect", kerchunk_options={"concat_dims": "Time"}, **kwargs
5858
)
5959

6060
fs = earthaccess.get_fsspec_https_session()
61-
result = xr.open_dataset(
62-
"reference://",
63-
engine="zarr",
64-
chunks={},
65-
backend_kwargs={
66-
"consolidated": False,
67-
"storage_options": {
68-
"fo": metadata,
69-
"remote_protocol": "https",
70-
"remote_options": fs.storage_options,
61+
# This test should be eventually refactored to use virtualizarr
62+
if output == "file":
63+
result = xr.open_dataset(
64+
str(tmp_path / "metadata.json"),
65+
engine="kerchunk",
66+
backend_kwargs={
67+
"storage_options": {
68+
"remote_protocol": "https",
69+
"remote_options": fs.storage_options,
70+
},
7171
},
72-
},
73-
)
72+
)
7473

75-
xr.testing.assert_equal(result, expected)
74+
xr.testing.assert_equal(result, expected)

tests/integration/test_virtualizarr.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,5 @@ def granule(request):
3232

3333
def test_open_virtual_dataset(granule):
3434
# Simply check that the dmrpp can be found, parsed, and loaded. Actual parser result is checked in virtualizarr
35-
vds = earthaccess.open_virtual_dataset(granule, load=False)
35+
vds = earthaccess.open_virtual_dataset(granule)
3636
assert vds is not None
37-
vds_load = earthaccess.open_virtual_dataset(granule, load=True)
38-
assert vds_load is not None

0 commit comments

Comments
 (0)