diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 41bc8c8baeae998a8d10ea57e7e852e25fb69ccc..1170ee0d620ad913ab0ccc577bc4748c5a4d621c 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -30,6 +30,9 @@ pages: - micromamba activate simplestac - micromamba env list - pip install -e . + - version=`fordead --version| sed -E 's/.*([0-9]+\.[0-9]+\.[0-9]+).*/\1/'` + - echo $version + - echo "{\"version\":\"$version\"}" > badges.json - cp -r examples docs/ - jupytext --to markdown docs/examples/*.py - portray as_html -c pyproject_doc.toml @@ -37,7 +40,7 @@ pages: artifacts: paths: - public - only: - - main - - pages + - version + rules: + - if: $CI_COMMIT_BRANCH == "pages" || $CI_COMMIT_BRANCH == "main" diff --git a/CHANGELOG.md b/CHANGELOG.md index 6387fd98751302a3760bbcf7e37a29a0241c84c6..4720be6a9b606a53b48e9c46bd5d890983ed96ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,7 @@ - function `harmonize_sen2cor_offset`: adds an `offset` property to the assets so it is taken into account by `to_xarray`. - method `ItemCollection.drop_duplicates`: drop duplicated ID returned by pgstac. - method `ItemCollection.drop_non_raster`: drop non raster assets. +- method `ItemCollection.to_xarray` default arg `gdal_env`: now it inlcudes GDAL retries in case of failure while reading url - function `extract_points` and method `ItemCollection.extract_points` to extract points time series. - `writer_args` to `ItemCollection.apply_...` methods and function in order to specify the outputs format, e.g. the encoding. - in local.py, `start_datetime` and `end_datetime` can now be used instead of `datetime` in the template used to build a local ItemCollection. diff --git a/README.md b/README.md index 50e1d07d640055c3bc5be07c325d7cf38649f5ff..b8023335aaf7a0f4d13682013f8eda22780b579f 100644 --- a/README.md +++ b/README.md @@ -5,6 +5,7 @@ __[Documentation](https://umr-tetis.pages.mia.inra.fr/stac/simplestac)__ [](https://www.r-project.org/Licenses/GPL-3) [](https://www.python.org) [](https://forgemia.inra.fr/umr-tetis/stac/simplestac/pipelines/main/latest) +[](https://forgemia.inra.fr/umr-tetis/stac/simplestac) # Features diff --git a/simplestac/local.py b/simplestac/local.py index dc5c0918bc135fa5a413400412e8d994d13ac4b1..dfe946b11930482ee0815137b5d8c20f52fbdd33 100644 --- a/simplestac/local.py +++ b/simplestac/local.py @@ -153,7 +153,7 @@ def stac_proj_info(bbox, gsd, meta): return proj_info -def stac_raster_info(meta, tags, scales, offsets): +def stac_raster_info(meta, tags, gsd, scales, offsets): """Raster information returned in the STAC format. Parameters @@ -162,6 +162,8 @@ def stac_raster_info(meta, tags, scales, offsets): Metadata dict returned by rasterio. tags : dict Tags returned by rasterio. + gsd : float + Ground sample distance. scales : list Scales returned by rasterio. offsets : list @@ -170,8 +172,15 @@ def stac_raster_info(meta, tags, scales, offsets): Returns ------- dict - bands - with prefix `raster:` + STAC extension raster information, with prefix `raster:bands` + + See also + -------- + simplestac.local.get_rio_info + + Notes + ----- + See https://github.com/stac-extensions/raster """ bands = [{}] if "nodata" in meta and meta["nodata"] is not None: @@ -180,8 +189,10 @@ def stac_raster_info(meta, tags, scales, offsets): bands[0]["sampling"] = tags["AREA_OR_POINT"].lower() if "dtype" in meta: bands[0]["datatype"] = meta["dtype"] - if "resolution" in tags: - bands[0]["spatial_resolution"] = tags["resolution"] + + # 'resolution' is not always in tags, thus gsd is used instead. + bands[0]["spatial_resolution"] = gsd + if scales is not None: bands[0]["scale"] = scales[0] if offsets is not None: @@ -347,7 +358,7 @@ def stac_asset_info_from_raster(band_file, band_fmt=None): # It could be set at the item level otherwise. proj_info = stac_proj_info(bbox, gsd, meta) stac_fields.update(proj_info) - raster_info = stac_raster_info(meta, tags, scales, offsets) + raster_info = stac_raster_info(meta, tags, gsd, scales, offsets) stac_fields.update(raster_info) return stac_fields diff --git a/simplestac/utils.py b/simplestac/utils.py index 9ed7813daf1abdc5d3bc469de23d41aefafa50cc..92912964d83f6aca6f7590367c38a8738d7a322e 100644 --- a/simplestac/utils.py +++ b/simplestac/utils.py @@ -27,6 +27,34 @@ logger = logging.getLogger(__name__) #### Generic functions and classes #### +# Adds GDAL_HTTP_MAX_RETRY and GDAL_HTTP_RETRY_DELAY to +# stackstac.rio_reader.DEFAULT_GDAL_ENV +# https://github.com/microsoft/PlanetaryComputerExamples/issues/279 +# while waiting for a PR to be merged: https://github.com/gjoseph92/stackstac/pull/232 +# See also https://github.com/gjoseph92/stackstac/issues/18 +DEFAULT_GDAL_ENV = stackstac.rio_reader.LayeredEnv( + always=dict( + GDAL_HTTP_MULTIRANGE="YES", # unclear if this actually works + GDAL_HTTP_MERGE_CONSECUTIVE_RANGES="YES", + # ^ unclear if this works either. won't do much when our dask chunks are aligned to the dataset's chunks. + GDAL_HTTP_MAX_RETRY="5", + GDAL_HTTP_RETRY_DELAY="1", + ), + open=dict( + GDAL_DISABLE_READDIR_ON_OPEN="EMPTY_DIR", + # ^ stop GDAL from requesting `.aux` and `.msk` files from the bucket (speeds up `open` time a lot) + VSI_CACHE=True + # ^ cache HTTP requests for opening datasets. This is critical for `ThreadLocalRioDataset`, + # which re-opens the same URL many times---having the request cached makes subsequent `open`s + # in different threads snappy. + ), + read=dict( + VSI_CACHE=False + # ^ *don't* cache HTTP requests for actual data. We don't expect to re-request data, + # so this would just blow out the HTTP cache that we rely on to make repeated `open`s fast + # (see above) + ), +) S2_THEIA_BANDS = [f"B{i+1}" for i in range(12)]+["B8A"] S2_SEN2COR_BANDS = [f"B{i+1:02}" for i in range(12)]+["B8A"] @@ -58,7 +86,7 @@ class ExtendPystacClasses: if not inplace: return x - def to_xarray(self, xy_coords="center", bbox=None, geometry=None, **kwargs): + def to_xarray(self, xy_coords="center", bbox=None, geometry=None, gdal_env=DEFAULT_GDAL_ENV, **kwargs): """Returns a DASK xarray() This is a proxy to stackstac.stac @@ -92,7 +120,7 @@ class ExtendPystacClasses: with warnings.catch_warnings(): warnings.filterwarnings("ignore", category=UserWarning) try: - arr = stackstac.stack(self, xy_coords=xy_coords, **kwargs) + arr = stackstac.stack(self, xy_coords=xy_coords, gdal_env=gdal_env, **kwargs) except ValueError as e: if "Cannot automatically compute the resolution" in str(e): raise ValueError(str(e)+"\nOr drop non-raster assets from collection with ItemCollection.drop_non_raster()") @@ -102,6 +130,8 @@ class ExtendPystacClasses: if bbox is not None: arr = arr.rio.clip_box(*bbox) if geometry is not None: + if isinstance(geometry, gpd.GeoDataFrame): + geometry = geometry.geometry if hasattr(geometry, 'crs') and not geometry.crs.equals(arr.rio.crs): logger.debug(f"Reprojecting geometry from {geometry.crs} to {arr.rio.crs}") geometry = geometry.to_crs(arr.rio.crs) @@ -337,6 +367,8 @@ class ExtendPystacClasses: inplace : bool, optional Whether to modify the collection in place. Defaults to False. In that case, a cloned collection is returned. + datetime : datetime, optional + A datetime to filter the items with. Defaults to None. bbox : tuple, optional A bounding box to clip_box the items with. Defaults to None. geometry : shapely.geometry, optional @@ -566,12 +598,16 @@ class ItemCollection(pystac.ItemCollection, ExtendPystacClasses): DEFAULT_REMOVE_PROPS = ['.*percentage', 'eo:cloud_cover', '.*mean_solar.*'] def write_assets(x: Union[ItemCollection, pystac.Item], - output_dir: str, bbox=None, update=True, + output_dir: str, + bbox=None, + geometry=None, + update=True, xy_coords='center', remove_item_props=DEFAULT_REMOVE_PROPS, overwrite=False, progress=True, writer_args=None, + inplace=False, **kwargs): """ Writes item(s) assets to the specified output directory. @@ -586,7 +622,17 @@ def write_assets(x: Union[ItemCollection, pystac.Item], output_dir : str The directory to write the assets to. bbox : Optional - The bounding box to clip the assets to. + Argument forwarded to ItemCollection.to_xarray. + The bounding box (in the CRS of the items) to clip the assets to. + geometry : Optional + Argument forwarded to ItemCollection.to_xarray to rioxarray.clip the assets to. + Usually a GeoDataFrame or GeoSeries. + See notes. + update : bool, optional + Whether to update the item properties with the new asset paths. + Defaults to True. + xy_coords : str, optional + The coordinate system to use for the x and y coordinates of the remove_item_props : list of str List of regex patterns to remove from item properties. If None, no properties are removed. @@ -595,6 +641,8 @@ def write_assets(x: Union[ItemCollection, pystac.Item], writer_args : dict, optional Arguments to pass to write_raster for each asset. Defaults to `None`. See Notes for an example. + inplace : bool, optional + Whether to modify the input collection in place or clone it. Defaults to False (i.e. clone). **kwargs Additional keyword arguments passed to write_raster. @@ -602,15 +650,44 @@ def write_assets(x: Union[ItemCollection, pystac.Item], ------- ItemCollection The item collection with the metadata updated with local asset paths. + + Notes + ----- + Arguments `bbox` and `geometry` are to ways to clip the assets before writing. + Although they look similar, they may lead to different results. + First, `bbox` does not have CRS, thus it is to the user to know + in which CRS x.to_xarray() will be before being clipped. If geometry is used instead, + it is automatically converted to the collection xarray CRS. + Second, as we use the default arguments for rio.clip and rio.clip_box, + the clip_box with bbox will contain all touched pixels while the clip with geometry will + contain only pixels whose center is within the polygon (all_touched=False). + Adding a buffer of resolution/2 could be a workaround to avoid that, + i.e. keep all touched pixels while clipping with a geometry. + + The `writer_args` argument can be used to specify the writing arguments (e.g. encoding) for specific assets. + Thus, it must be a dictionary with the keys corresponding to asset keys. + If the asset key is not in `writer_args`, the `kwargs` are passed to `write_raster`. + The following example would encode the B02 band as int16, and the rest of the assets as float: + writer_args = { + "B02": { + "encoding": { + "dtype": "int16", + } + } + } + """ if isinstance(x, pystac.Item): - x = [x] + x = ItemCollection([x]) + + if not inplace: + x = x.clone() output_dir = Path(output_dir).expand() items = [] for item in tqdm(x, disable=not progress): ic = ItemCollection([item], clone_items=True) - arr = ic.to_xarray(bbox=bbox, xy_coords=xy_coords, ).squeeze("time") + arr = ic.to_xarray(bbox=bbox, geometry=geometry,xy_coords=xy_coords, ).squeeze("time") item_dir = (output_dir / item.id).mkdir_p() for b in arr.band.values: filename = '_'.join([item.id, b+'.tif']) @@ -649,7 +726,8 @@ def write_assets(x: Union[ItemCollection, pystac.Item], logger.info(f'Item "{item.id}" is empty, skipping it.') item_dir.rmtree_p() - return ItemCollection(items, clone_items=False) + if not inplace: + return x def update_item_properties(x: pystac.Item, remove_item_props=DEFAULT_REMOVE_PROPS): """Update item bbox, geometry and proj:epsg introspecting assets. @@ -957,7 +1035,7 @@ def harmonize_sen2cor_offset(x, bands=S2_SEN2COR_BANDS, inplace=False): item.assets[asset].extra_fields["raster:bands"] = [dict(offset=-1000)] else: item.assets[asset].extra_fields["raster:bands"] = [dict(offset=0)] - if inplace: + if not inplace: return x def extract_points(x, points, method=None, tolerance=None, drop=False): diff --git a/tests/conftest.py b/tests/conftest.py index f4ae73329a17dd359ead9831afb568975109d796..39b8819af22c460cde503ea1ec99c885acd00a59 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -7,6 +7,7 @@ logging.basicConfig(level=logging.INFO) here = Path(__file__).parent download_script = here / "download_data.py" +print("Downloading the test data...") exec(open(download_script).read()) @pytest.fixture(scope="session") diff --git a/tests/test_local.py b/tests/test_local.py index 3471050eacc44322c622fb492298d52b7f048c85..49e7263cbda76fba14e365b08ab97000ea8dc2ce 100644 --- a/tests/test_local.py +++ b/tests/test_local.py @@ -14,6 +14,10 @@ def test_build(s2scene_dir): col = build_item_collection(s2scene_dir, collection_format()) assert len(col) == len(s2scene_dir.dirs()) assert len(col[0].assets) == 11 + extra_fields = col[0].assets["B02"].extra_fields + raster_bands = extra_fields["raster:bands"][0] + assert raster_bands["spatial_resolution"] == 10 + def test_datetime(s2scene_dir): fmt = collection_format() diff --git a/tests/test_remote.py b/tests/test_remote.py index 33339827156793213f4aa0e1e96f7d1fdd9bc140..5a293ace764cad6171afa7993cf9e883e6a8d992 100644 --- a/tests/test_remote.py +++ b/tests/test_remote.py @@ -1,6 +1,8 @@ from simplestac.utils import write_assets, ItemCollection, harmonize_sen2cor_offset import planetary_computer as pc import pystac_client +from tempfile import TemporaryDirectory +import numpy as np URL = "https://planetarycomputer.microsoft.com/api/stac/v1" @@ -80,6 +82,11 @@ def test_write_assets(roi, s2scene_pc_dir): assert len(item.assets) == len(s2scene_pc_dir.dirs()[0].files("*.tif")) assert item.assets["B08"].href.startswith(s2scene_pc_dir) assert new_col[0].assets["B08"].extra_fields["raster:bands"][0]["scale"] == 0.001 + + with TemporaryDirectory(prefix="simplestac-tests_") as tempdir: + new_col2 = write_assets(col, tempdir, geometry=roi.buffer(5), encoding=encoding) + assert len(new_col2) == len(new_col) + assert new_col2[0].bbox == new_col[0].bbox