Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Possible approach for stacking items #75

Draft
wants to merge 3 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions intake_stac/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,11 @@
from pkg_resources import DistributionNotFound, get_distribution

from .catalog import StacCatalog, StacCollection, StacItem, StacItemCollection # noqa: F401
from .drivers import RioxarraySource # noqa: F401

# NOTE: "The drivers ['rioxarray'] do not specify entry_points" but in setup.py...
# intake.register_driver('rioxarray', RioxarraySource)
# register_container('xarray', RioxarraySource) (override intake_xarray?)

try:
__version__ = get_distribution(__name__).version
Expand Down
236 changes: 149 additions & 87 deletions intake_stac/catalog.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
# STAC catalog asset 'type' determines intake driver:
# https://github.com/radiantearth/stac-spec/blob/master/item-spec/item-spec.md#media-types
default_type = 'application/rasterio'
default_driver = 'rasterio'
default_driver = 'rioxarray'

drivers = {
'application/netcdf': 'netcdf',
Expand All @@ -20,13 +20,13 @@
'application/x-parquet': 'parquet',
'application/x-hdf': 'netcdf',
'application/x-hdf5': 'netcdf',
'application/rasterio': 'rasterio',
'image/vnd.stac.geotiff': 'rasterio',
'image/vnd.stac.geotiff; cloud-optimized=true': 'rasterio',
'image/x.geotiff': 'rasterio',
'image/tiff; application=geotiff': 'rasterio',
'image/tiff; application=geotiff; profile=cloud-optimized': 'rasterio', # noqa: E501
'image/jp2': 'rasterio',
'application/rasterio': 'rioxarray',
'image/vnd.stac.geotiff': 'rioxarray',
'image/vnd.stac.geotiff; cloud-optimized=true': 'rioxarray',
'image/x.geotiff': 'rioxarray',
'image/tiff; application=geotiff': 'rioxarray',
'image/tiff; application=geotiff; profile=cloud-optimized': 'rioxarray', # noqa: E501
'image/jp2': 'rioxarray',
'image/png': 'xarray_image',
'image/jpg': 'xarray_image',
'image/jpeg': 'xarray_image',
Expand Down Expand Up @@ -101,13 +101,78 @@ def serialize(self):
"""
return self.yaml()

def stack_items(
self, items, assets, path_as_pattern=None, concat_dim='band', override_coords=None
):
"""
Experimental. Create an xarray.DataArray from a bunch of STAC Items

Parameters
----------
items: list of STAC item id strings
assets : list of strings representing the different assets
(assset key or eo:bands "common_name" e.g. ['B4', B5'], ['red', 'nir'])
path_as_pattern : pattern string to extract coordinates from asset href
concat_dim : name of concatenation dimension for xarray.DataArray
override_coords : list of custom coordinate names

Returns
-------
CombinedAssets instance with mapping of asset names to xarray coordinates

Examples
-------
source = cat.stack_items(['S2A_36MYB_20200814_0_L2A','S2A_36MYB_20200811_0_L2A'],
['red', 'nir'])
da = stack().to_dask()
"""
common2band = self[items[0]]._get_band_name_mapping()
metadatas = {'items': {}}
hrefs = []
for item in items:
metadatas['items'][item] = {'STAC': self[item].metadata, 'assets': {}}
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the way i ended up structuring this it ended up being basically a loop around StacItem.stack_bands() so could probably just call that method instead of a bunch of duplicate code?

stac_assets = self[item]._stac_obj.assets
for key in assets:

if key in stac_assets:
asset = stac_assets.get(key)
elif key in common2band:
asset = stac_assets.get(common2band[key])
else:
raise ValueError(
f'Asset "{key}" not found in asset keys {list(common2band.values())}'
f' or eo:bands common_names {list(common2band.keys())}'
)

asset_metadata = asset.properties
asset_metadata['key'] = key
metadatas['items'][item]['assets'][asset.href] = asset_metadata
hrefs.append(asset.href)

configDict = {}
configDict['name'] = 'item_stack'
configDict['description'] = 'stack of assets from multiple items'
configDict['args'] = dict(
chunks={},
concat_dim=concat_dim,
path_as_pattern=path_as_pattern,
urlpath=hrefs,
override_coords=override_coords,
)
configDict['metadata'] = metadatas

stack = CombinedAssets(configDict)

return stack


class StacCatalog(AbstractStacCatalog):
"""
Maps Intake Catalog to a STAC Catalog
https://pystac.readthedocs.io/en/latest/api.html?#catalog-spec
"""

# NOTE: name must match driver in setup.py entrypoints
name = 'stac_catalog'
_stac_cls = pystac.Catalog

Expand Down Expand Up @@ -172,7 +237,7 @@ class StacItemCollection(AbstractStacCatalog):
https://pystac.readthedocs.io/en/latest/api.html?#single-file-stac-extension
"""

name = 'stac_itemcollection'
name = 'stac_item_collection'
_stac_cls = pystac.Catalog

def _load(self):
Expand Down Expand Up @@ -241,111 +306,106 @@ def _get_metadata(self, **kwargs):
metadata.update(kwargs)
return metadata

def _get_band_info(self):
def _get_band_name_mapping(self):
"""
Return list of band info dictionaries (name, common_name, etc.)...
Return dictionary mapping common name to asset name
eo:bands extension has [{'name': 'B01', 'common_name': 'coastal']
return {'coastal':'B01'}
"""
band_info = []
try:
# NOTE: ensure we test these scenarios
# FileNotFoundError: [Errno 2] No such file or directory: '/catalog.json'
# NOTE: maybe return entire dataframe w/ central_wavelength etc?
common2band = {}
# 1. try to get directly from item metadata
if 'eo' in self._stac_obj.stac_extensions:
eo = self._stac_obj.ext['eo']
for band in eo.bands:
common2band[band.common_name] = band.name

# 2. go a level up to collection metadata
if common2band == {}:
collection = self._stac_obj.get_collection()
if 'item-assets' in collection.stac_extensions:
for val in collection.ext['item_assets']:
if 'eo:bands' in val:
band_info.append(val.get('eo:bands')[0])
else:
band_info = collection.summaries['eo:bands']
# Can simplify after item-assets extension implemented in Pystac
# https://github.com/stac-utils/pystac/issues/132
for asset, meta in collection.extra_fields['item_assets'].items():
eo = meta.get('eo:bands')
if eo:
for entry in eo:
common_name = entry.get('common_name')
if common_name:
common2band[common_name] = asset

except Exception:
for band in self._stac_obj.ext['eo'].get_bands():
band_info.append(band.to_dict())
finally:
if not band_info:
raise ValueError(
'Unable to parse "eo:bands" information from STAC Collection or Item Assets'
)
return band_info
return common2band

def stack_bands(self, bands, path_as_pattern=None, concat_dim='band'):
def stack_assets(self, assets, path_as_pattern=None, concat_dim='band', override_coords=None):
Comment on lines -270 to +338
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

renamed again to be less EO-specific and more in line with the STAC assets. i don't think too many people are using this library yet, so i'm not concerned with disruptive changes.

"""
Stack the listed bands over the ``band`` dimension.
Stack the listed assets over the ``band`` dimension.

This method only works for STAC Items using the 'eo' Extension
https://github.com/radiantearth/stac-spec/tree/master/extensions/eo

NOTE: This method is not aware of geotransform information. It *assumes*
bands for a given STAC Item have the same coordinate reference system (CRS).
WARNING: This method is not aware of geotransform information. It *assumes*
assets for a given STAC Item have the same coordinate reference system (CRS).
This is usually the case for a given multi-band satellite acquisition.
Coordinate alignment is performed automatically upon calling the
`to_dask()` method to load into an Xarray DataArray if bands have diffent
ground sample distance (gsd) or array shapes.
Comment on lines -280 to -282
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i learned the hard way that this isn't exactly true. it works for landsat bands that were aligned based on GSD, but here are two with the same CRS and transform, different GSD that xarray fails to do what we expect... i'll open an issue to illustrate this (https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/36/M/YB/2020/8/S2A_36MYB_20200814_0_L2A/B04.tif, https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/36/M/YB/2020/8/S2A_36MYB_20200814_0_L2A/B05.tif)

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So it might be best to raise a warning if CRS, Affine, and GSD do not match for requested stacked assets

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hi @scottyhq,
i stumbled upon this, do you have a suggestion how to go about stacking (sen2) bands with different GSD to be later used for lazy-loading in a dask array?
Thanks,
Vitus

Copy link
Collaborator Author

@scottyhq scottyhq Mar 11, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@vitusbenson i'd recommend explicitly interpolating, or using a gdal vrt

url4 = 'https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/36/M/YB/2020/8/S2A_36MYB_20200814_0_L2A/B04.tif'
url5 = 'https://sentinel-cogs.s3.us-west-2.amazonaws.com/sentinel-s2-l2a-cogs/36/M/YB/2020/8/S2A_36MYB_20200814_0_L2A/B05.tif'
da4 = rioxarray.open_rasterio(url4, chunks='auto').assign_coords(band=['B4']) #gsd=10
da5 = rioxarray.open_rasterio(url5, chunks='auto').assign_coords(band=['B5']) #gsd=20
da5_interp = da5.interp(y=da4['y'], x=da4['x'],  method='nearest')
da = xr.concat([da5_interp, da4], dim='band') #gsd=10

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks - I'll have to give that a shot to compare.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thank you! I wasn't aware of rioxarray's interpolation capabilities - really useful!


See the following documentation for dealing with different CRS or GSD:
http://xarray.pydata.org/en/stable/interpolation.html
https://corteva.github.io/rioxarray/stable/examples/reproject_match.html

Parameters
----------
bands : list of strings representing the different bands
(e.g. ['B4', B5'], ['red', 'nir']).
assets : list of strings representing the different assets
(assset key or eo:bands "common_name" e.g. ['B4', B5'], ['red', 'nir'])
path_as_pattern : pattern string to extract coordinates from asset href
concat_dim : name of concatenation dimension for xarray.DataArray
override_coords : list of custom coordinate names

Returns
-------
StacAsset with mapping of Asset names to Xarray bands
CombinedAssets instance with mapping of asset names to xarray coordinates

Examples
-------
stack = item.stack_bands(['nir','red'])
da = stack(chunks=dict(band=1, x=2048, y=2048)).to_dask()
stack = item.stack_assets(['nir','red'])
da = stack(chunks=True).to_dask()

stack = item.stack_bands(['B4','B5'], path_as_pattern='{band}.TIF')
stack = item.stack_assets(['B4','B5'], path_as_pattern='{band}.TIF')
da = stack(chunks=dict(band=1, x=2048, y=2048)).to_dask()
"""
if 'eo' not in self._stac_obj.stac_extensions:
raise ValueError('STAC Item must implement "eo" extension to use this method')

band_info = self._get_band_info()
configDict = {}
metadatas = {}
titles = []
metadatas = {'items': {self.name: {'STAC': self.metadata, 'assets': {}}}}
hrefs = []
types = []
assets = self._stac_obj.assets
for band in bands:
# band can be band id, name or common_name
if band in assets:
info = next((b for b in band_info if b.get('id', b.get('name')) == band), None,)
common2band = self._get_band_name_mapping()
stac_assets = self._stac_obj.assets
for key in assets:
if key in stac_assets:
asset = stac_assets.get(key)
elif key in common2band:
asset = stac_assets.get(common2band[key])
else:
info = next((b for b in band_info if b.get('common_name') == band), None)
if info is not None:
band = info.get('id', info.get('name'))

if band not in assets or info is None:
valid_band_names = []
for b in band_info:
valid_band_names.append(b.get('id', b.get('name')))
valid_band_names.append(b.get('common_name'))
raise ValueError(
f'{band} not found in list of eo:bands in collection.'
f'Valid values: {sorted(list(set(valid_band_names)))}'
f'Asset "{key}" not found in asset keys {list(common2band.values())}'
f' or eo:bands common_names {list(common2band.keys())}'
)
asset = assets.get(band)
metadatas[band] = asset.to_dict()
titles.append(band)
types.append(asset.media_type)
hrefs.append(asset.href)

unique_types = set(types)
if len(unique_types) != 1:
raise ValueError(
f'Stacking failed: bands must have type, multiple found: {unique_types}'
)
# map *HREF* to metadata to do fancy things when opening it
asset_metadata = asset.properties
asset_metadata['key'] = key
asset_metadata['item'] = self.name
metadatas['items'][self.name]['assets'][asset.href] = asset_metadata
hrefs.append(asset.href)

configDict['name'] = '_'.join(bands)
configDict['description'] = ', '.join(titles)
configDict['name'] = self.name
configDict['description'] = ', '.join(assets)
# NOTE: these are args for driver __init__ method
configDict['args'] = dict(
chunks={}, concat_dim=concat_dim, path_as_pattern=path_as_pattern, urlpath=hrefs
chunks={},
concat_dim=concat_dim,
path_as_pattern=path_as_pattern,
urlpath=hrefs,
override_coords=override_coords,
)
configDict['metadata'] = metadatas

return CombinedAssets(configDict)
# instantiate to allow item.stack_assets(['red','nir']).to_dask() ?
stack = CombinedAssets(configDict)

return stack


class StacAsset(LocalCatalogEntry):
Expand Down Expand Up @@ -443,7 +503,7 @@ def _get_driver(self, asset):
)
entry_type = asset.media_type

# if mimetype not registered try rasterio driver
# if mimetype not registered try rioxarray driver
driver = drivers.get(entry_type, default_driver)

return driver
Expand All @@ -453,7 +513,7 @@ def _get_args(self, asset, driver):
Optional keyword arguments to pass to intake driver
"""
args = {'urlpath': asset.href}
if driver in ['netcdf', 'rasterio', 'xarray_image']:
if driver in ['netcdf', 'rasterio', 'rioxarray', 'xarray_image']:
# NOTE: force using dask?
args.update(chunks={})

Expand All @@ -465,14 +525,16 @@ class CombinedAssets(LocalCatalogEntry):
Maps multiple STAC Item Assets to 1 Intake Catalog Entry
"""

_stac_cls = None

def __init__(self, configDict):
"""
configDict = intake Entry dictionary from stack_bands() method
configDict = intake Entry intialization dictionary
"""
super().__init__(
name=configDict['name'],
description=configDict['description'],
driver='rasterio', # stack_bands only relevant to rasterio driver?
driver='rioxarray',
direct_access=True,
args=configDict['args'],
metadata=configDict['metadata'],
Expand Down
Loading