From 0b4e66b02bd2c625ec7900a0a2f5b0510f89d925 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Mon, 24 Jun 2024 14:41:31 -0500 Subject: [PATCH 01/17] attempted update to intake v2 --- intake_erddap/__init__.py | 6 ++-- intake_erddap/erddap.py | 12 +++++-- intake_erddap/erddap_cat.py | 63 ++++++++++++++++++++----------------- setup.py | 9 +++--- 4 files changed, 52 insertions(+), 38 deletions(-) diff --git a/intake_erddap/__init__.py b/intake_erddap/__init__.py index fd168bf..c6deb7f 100644 --- a/intake_erddap/__init__.py +++ b/intake_erddap/__init__.py @@ -1,11 +1,13 @@ """intake-erddap package.""" +import intake + from .erddap import GridDAPSource, TableDAPSource -from .erddap_cat import ERDDAPCatalog +from .erddap_cat import ERDDAPCatalogReader from .version import __version__ __all__ = [ - "ERDDAPCatalog", + "ERDDAPCatalogReader", "TableDAPSource", "GridDAPSource", "__version__", diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py index 13285b9..45a2d2e 100644 --- a/intake_erddap/erddap.py +++ b/intake_erddap/erddap.py @@ -13,6 +13,9 @@ from erddapy import ERDDAP from intake.source import base +from intake.readers.readers import BaseReader +from intake.readers.entry import ReaderDescription +from intake.readers.datatypes import BaseData from .version import __version__ @@ -24,8 +27,8 @@ # numpy typing is only available after version 1.21 from numpy.typing import ArrayLike - -class ERDDAPSource(base.DataSource): + +class ERDDAPSource(BaseData): """ ERDDAP Source (Base Class). This class represents the abstract base class for an intake data source object for ERDDAP. Clients should use either @@ -221,7 +224,10 @@ def _get_partition(self) -> pd.DataFrame: def read(self) -> pd.DataFrame: """Return the dataframe from ERDDAP""" - return self._get_partition() + # return self._get_partition() + if self._dataframe is None: + self._load() + return self._dataframe def _close(self): self._dataframe = None diff --git a/intake_erddap/erddap_cat.py b/intake_erddap/erddap_cat.py index 4a0e968..5dfb95a 100644 --- a/intake_erddap/erddap_cat.py +++ b/intake_erddap/erddap_cat.py @@ -20,8 +20,10 @@ import requests from erddapy import ERDDAP -from intake.catalog.base import Catalog -from intake.catalog.local import LocalCatalogEntry +# from intake.catalog.base import Catalog +from intake.readers.entry import Catalog, DataDescription +from intake.readers.readers import BaseReader +# from intake.catalog.local import LocalCatalogEntry from intake_erddap.cache import CacheStore @@ -34,7 +36,7 @@ log = getLogger("intake-erddap") -class ERDDAPCatalog(Catalog): +class ERDDAPCatalogReader(BaseReader): """ Makes data sources out of all datasets the given ERDDAP service @@ -125,6 +127,7 @@ class ERDDAPCatalog(Catalog): name = "erddap_cat" version = __version__ + output_instance = "intake.readers.entry:Catalog" def __init__( self, @@ -154,7 +157,7 @@ def __init__( if server.endswith("/"): server = server[:-1] self._erddap_client = erddap_client or ERDDAP - self._entries: Dict[str, LocalCatalogEntry] = {} + self._entries: Dict[str, Catalog] = {} self._use_source_constraints = use_source_constraints self._protocol = protocol self._dataset_metadata: Optional[Mapping[str, dict]] = None @@ -248,7 +251,7 @@ def __init__( # Clear the cache of old stale data on initialization self.cache_store.clear_cache(cache_period) - super(ERDDAPCatalog, self).__init__(metadata=metadata, **kwargs) + super(ERDDAPCatalogReader, self).__init__(metadata=metadata, **kwargs) def _load_df(self) -> pd.DataFrame: frames = [] @@ -410,20 
+413,22 @@ def get_client(self) -> ERDDAP: e.dataset_id = "allDatasets" return e - def _load(self): + def read(self): dataidkey = "datasetID" e = self.get_client() df = self._load_df() all_metadata = self._load_metadata() self._entries = {} + + # Remove datasets that are redundant + df = df[(~df["datasetID"].str.startswith("ism-")) * (df["datasetID"] != "allDatasets")] + entries, aliases = {}, {} for index, row in df.iterrows(): dataset_id = row[dataidkey] - if dataset_id == "allDatasets": - continue + metadata = all_metadata.get(dataset_id, {}) - description = "ERDDAP dataset_id %s from %s" % (dataset_id, self.server) args = { "server": self.server, "dataset_id": dataset_id, @@ -440,30 +445,32 @@ def _load(self): } ) args["constraints"].update(self._get_tabledap_constraints()) - - metadata = all_metadata.get(dataset_id, {}) - - entry = LocalCatalogEntry( - name=dataset_id, - description=description, - driver=self._protocol, - args=args, - metadata=metadata, - getenv=False, - getshell=False, - ) - if self._protocol == "tabledap": - entry._metadata["info_url"] = e.get_info_url( - response="csv", dataset_id=dataset_id - ) - entry._plugin = [TableDAPSource] + datatype = "intake_erddap.erddap:TableDAPSource" elif self._protocol == "griddap": - entry._plugin = [GridDAPSource] + args.update( + { + "chunks": self._chunks, + "xarray_kwargs": self._xarray_kwargs, + } + ) + # no equivalent for griddap, though maybe it works the same? + args["constraints"].update(self._get_tabledap_constraints()) + datatype = "intake_erddap.erddap:GridDAPSource" else: raise ValueError(f"Unsupported protocol: {self._protocol}") - self._entries[dataset_id] = entry + metadata["info_url"] = e.get_info_url( + response="csv", dataset_id=dataset_id + ) + entries[dataset_id] = DataDescription( + datatype, + kwargs={"dataset_id": dataset_id, **args, "metadata": metadata,}, + ) + aliases[dataset_id] = dataset_id + cat = Catalog(data=entries, aliases=aliases,) + return cat + def _get_tabledap_constraints(self) -> Dict[str, Union[str, int, float]]: """Return the constraints dictionary for a tabledap source.""" result = {} diff --git a/setup.py b/setup.py index bca2140..b6c1008 100644 --- a/setup.py +++ b/setup.py @@ -3,7 +3,6 @@ from pathlib import Path from setuptools import setup - requires = open("requirements.txt").read().strip().split("\n") setup( @@ -22,15 +21,15 @@ packages=["intake_erddap"], package_data={"": ["*.csv", "*.yml", "*.html"]}, entry_points={ - "intake.drivers": [ + "intake.imports": [ "tabledap = intake_erddap.erddap:TableDAPSource", "griddap = intake_erddap.erddap:GridDAPSource", - "erddap_cat = intake_erddap.erddap_cat:ERDDAPCatalog", - ] + "erddap_cat = intake_erddap.erddap_cat:ERDDAPCatalogReader", + ], }, include_package_data=True, install_requires=requires, long_description=Path("README.md").read_text(), long_description_content_type='text/markdown', zip_safe=False, -) +) \ No newline at end of file From 74447194f1ae1c5fde25550558363cd2ce0c3865 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Tue, 25 Jun 2024 15:44:37 -0400 Subject: [PATCH 02/17] meta --- intake_erddap/erddap_cat.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/intake_erddap/erddap_cat.py b/intake_erddap/erddap_cat.py index 5dfb95a..a311f5a 100644 --- a/intake_erddap/erddap_cat.py +++ b/intake_erddap/erddap_cat.py @@ -464,7 +464,8 @@ def read(self): ) entries[dataset_id] = DataDescription( datatype, - kwargs={"dataset_id": dataset_id, **args, "metadata": metadata,}, + kwargs={"dataset_id": dataset_id, **args}, + 
metadata=metadata, ) aliases[dataset_id] = dataset_id From c7c3be3ba2c47258ecbded90d92d3a923d0e964d Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 4 Jul 2024 10:44:37 -0400 Subject: [PATCH 03/17] Remove stuff --- intake_erddap/erddap.py | 259 ++++++++-------------------------------- 1 file changed, 47 insertions(+), 212 deletions(-) diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py index 45a2d2e..443f2c8 100644 --- a/intake_erddap/erddap.py +++ b/intake_erddap/erddap.py @@ -12,23 +12,15 @@ import xarray as xr from erddapy import ERDDAP -from intake.source import base from intake.readers.readers import BaseReader from intake.readers.entry import ReaderDescription from intake.readers.datatypes import BaseData -from .version import __version__ - log = getLogger("intake-erddap") -if typing.TYPE_CHECKING: # pragma: no cover - # numpy typing is only available after version 1.21 - from numpy.typing import ArrayLike - - -class ERDDAPSource(BaseData): +class ERDDAPSource(BaseReader): """ ERDDAP Source (Base Class). This class represents the abstract base class for an intake data source object for ERDDAP. Clients should use either @@ -60,51 +52,15 @@ class ERDDAPSource(BaseData): Caches entire dataframe in memory. """ - name = "erddap" - version = __version__ - container = "dataframe" - partition_access = True + output_instance = "xarray:Dataset" - def __init__( - self, - dataset_id: str, - protocol: str, - variables: List[str] = None, - constraints: dict = None, - metadata: dict = None, - erddap_client: Optional[Type[ERDDAP]] = None, - http_client: Optional[Type] = None, - open_kwargs: dict = None, - ): - variables = variables or [] - constraints = constraints or {} - metadata = metadata or {} - - self._init_args = { - "dataset_id": dataset_id, - "protocol": protocol, - "variables": variables, - "constraints": constraints, - "metadata": metadata, - } - - self._dataset_id = dataset_id - self._protocol = protocol - self._variables = variables - self._constraints = constraints - self._erddap_client = erddap_client or ERDDAP - self._http = http_client or requests - self.open_kwargs = open_kwargs or {} - - super(ERDDAPSource, self).__init__(metadata=metadata) - - def get_client(self) -> ERDDAP: + def get_client(self, server, protocol, dataset_id, variables, constraints, client=ERDDAP, **_) -> ERDDAP: """Return an initialized ERDDAP Client.""" - e = self._erddap_client(server=self._server) - e.protocol = self._protocol - e.dataset_id = self._dataset_id - e.variables = self._variables - e.constraints = self._constraints + e = client(server=server) + e.protocol = protocol + e.dataset_id = dataset_id + e.variables = variables + e.constraints = constraints return e @@ -177,64 +133,14 @@ class TableDAPSource(ERDDAPSource): ... 
""" - name = "tabledap" - version = __version__ - container = "dataframe" - partition_access = True - - def __init__( - self, - server: str, - mask_failed_qartod: bool = False, - dropna: bool = False, - cache_kwargs: Optional[dict] = None, - *args, - **kwargs, - ): - self._server = server - self._dataframe: Optional[pd.DataFrame] = None - self._dataset_metadata: Optional[dict] = None - self._mask_failed_qartod = mask_failed_qartod - self._dropna = dropna - self._cache_kwargs = cache_kwargs - kwargs.pop("protocol", None) - # https://github.com/python/mypy/issues/6799 - super().__init__(*args, protocol="tabledap", **kwargs) # type: ignore - - def _get_schema(self) -> base.Schema: - if self._dataframe is None: - # TODO: could do partial read with chunksize to get likely schema from - # first few records, rather than loading the whole thing - self._load() - self._dataset_metadata = self._get_dataset_metadata() - # make type checker happy - assert self._dataframe is not None - return base.Schema( - datashape=None, - dtype=self._dataframe.dtypes, - shape=self._dataframe.shape, - npartitions=1, - extra_metadata=self._dataset_metadata, - ) - - def _get_partition(self) -> pd.DataFrame: - if self._dataframe is None: - self._load_metadata() - return self._dataframe - - def read(self) -> pd.DataFrame: - """Return the dataframe from ERDDAP""" - # return self._get_partition() - if self._dataframe is None: - self._load() - return self._dataframe - - def _close(self): - self._dataframe = None - - def _load(self): - e = self.get_client() - if self._cache_kwargs is not None: + def _read(self, server, dataset_id, mask_failed_qartod=False, dropna=False, cache_kwargs=None, + constraints=None, **kw): + kw.pop("protocol", None) + protocol = kw.pop("protocol", "tabledap") + meta2 = self._get_dataset_metadata(server, dataset_id) + e = self.get_client(server, protocol, dataset_id, variables=meta2["variables"], + constraints=constraints or {}, **kw) + if cache_kwargs is not None: if "response" in self.open_kwargs: response = self.open_kwargs["response"] self.open_kwargs.pop("response") @@ -242,35 +148,34 @@ def _load(self): else: url = e.get_download_url(response=response) - with fsspec.open(f"simplecache://::{url}", **self._cache_kwargs) as f: - self._dataframe: pd.DataFrame = pd.read_csv(f, **self.open_kwargs) + with fsspec.open(f"simplecache://::{url}", **(cache_kwargs or {})) as f: + dataframe: pd.DataFrame = pd.read_csv(f) else: - self._dataframe: pd.DataFrame = e.to_pandas( - requests_kwargs={"timeout": 60}, **self.open_kwargs + dataframe: pd.DataFrame = e.to_pandas( + requests_kwargs={"timeout": 60} ) - if self._mask_failed_qartod: - self.run_mask_failed_qartod() - if self._dropna: - self.run_dropna() - - @property - def data_cols(self): + if mask_failed_qartod: + dataframe = self.run_mask_failed_qartod(dataframe) + if dropna: + dataframe = self.run_dropna(dataframe) + return dataframe + + @staticmethod + def data_cols(df): """Columns that are not axes, coordinates, nor qc_agg columns.""" # find data columns which are what we'll use in the final step to drop nan's # don't include dimension/coordinates-type columns (dimcols) nor qc_agg columns (qccols) - dimcols = self._dataframe.cf.axes_cols + self._dataframe.cf.coordinates_cols + dimcols = df.cf.axes_cols + df.cf.coordinates_cols qccols = list( - self._dataframe.columns[self._dataframe.columns.str.contains("_qc_agg")] + df.columns[df.columns.str.contains("_qc_agg")] ) - datacols = [ - col for col in self._dataframe.columns if col not in dimcols + qccols + 
col for col in df.columns if col not in dimcols + qccols ] - return datacols - def run_mask_failed_qartod(self): + def run_mask_failed_qartod(self, df): """Nan data values for which corresponding qc_agg columns is not equal to 1 or 2. To get this to work you may need to specify the "qc_agg" columns to come along specifically @@ -279,22 +184,23 @@ def run_mask_failed_qartod(self): # if a data column has an associated qc column, use it to weed out bad data by # setting it to nan. - for datacol in self.data_cols: + for datacol in self.data_cols(df): qccol = f"{datacol}_qc_agg" - if qccol in self._dataframe.columns: - self._dataframe.loc[ + if qccol in df.columns: + df.loc[ ~self._dataframe[qccol].isin([1, 2]), datacol ] = pd.NA - self._dataframe.drop(columns=[qccol], inplace=True) + df.drop(columns=[qccol], inplace=True) + return df - def run_dropna(self): + def run_dropna(self, df): """Drop nan rows based on the data columns.""" - self._dataframe = self._dataframe.dropna(subset=self.data_cols) + return df.dropna(subset=self.data_cols) - def _get_dataset_metadata(self) -> dict: + def _get_dataset_metadata(self, server, dataset_id) -> dict: """Fetch and return the metadata document for the dataset.""" - url = f"{self._server}/info/{self._dataset_id}/index.json" - resp = self._http.get(url) + url = f"{server}/info/{dataset_id}/index.json" + resp = requests.get(url) resp.raise_for_status() metadata: dict = {"variables": {}} for rowtype, varname, attrname, dtype, value in resp.json()["table"]["rows"]: @@ -396,11 +302,6 @@ class GridDAPSource(ERDDAPSource): method ``ds.load()`` on the Dataset object. """ - name = "griddap" - version = __version__ - container = "xarray" - partition_access = True - def __init__( self, server: str, @@ -411,88 +312,22 @@ def __init__( **kwargs, ): self._server = server - self._ds: Optional[xr.Dataset] = None self._chunks = chunks self._constraints = constraints or {} self._xarray_kwargs = xarray_kwargs or {} # Initialized by the private getter _get_schema - self._schema: Optional[base.Schema] = None self.urlpath = f"{server}/griddap/{dataset_id}" # https://github.com/python/mypy/issues/6799 kwargs.pop("protocol", None) super().__init__(dataset_id=dataset_id, protocol="griddap", **kwargs) # type: ignore - def _get_schema(self) -> base.Schema: - self.urlpath = self._get_cache(self.urlpath)[0] - - if self._ds is None: - # Sets self._ds - self._open_dataset() - # Make mypy happy - assert self._ds is not None - metadata = { - "dims": dict(self._ds.dims), - "data_vars": { - k: list(self._ds[k].coords) for k in self._ds.data_vars.keys() - }, - "coords": tuple(self._ds.coords.keys()), - } - metadata.update(self._ds.attrs) - metadata["variables"] = {} - for varname in self._ds.variables: - metadata["variables"][varname] = self._ds[varname].attrs - self._schema = base.Schema( - datashape=None, - dtype=None, - shape=None, - npartitions=None, - extra_metadata=metadata, - ) - - return self._schema - - def _open_dataset(self): - self._ds = xr.open_dataset( + def _read(self): + ds = xr.open_dataset( self.urlpath, chunks=self._chunks, **self._xarray_kwargs ) # _NCProperties is an internal property which xarray does not yet deal # with specially, so we remove it here to prevent it from causing # problems for clients. - if "_NCProperties" in self._ds.attrs: - del self._ds.attrs["_NCProperties"] - - def read(self): - raise NotImplementedError( - "GridDAPSource.read is not implemented because ds.load() for grids from ERDDAP are " - "strongly discouraged. Use to_dask() instead." 
- ) - - def read_chunked(self) -> xr.Dataset: - """Return an xarray dataset (optionally chunked).""" - self._load_metadata() - return self._ds - - def read_partition(self, i: Tuple[str, ...]) -> "ArrayLike": - """Fetch one chunk of the array for a variable.""" - self._load_metadata() - if not isinstance(i, (tuple, list)): - raise TypeError("For Xarray sources, must specify partition as tuple") - if isinstance(i, list): - i = tuple(i) - # Make mypy happy - assert self._ds is not None - arr = self._ds[i[0]].data - idx = i[1:] - if isinstance(arr, np.ndarray): - return arr - # dask array - return arr.blocks[idx].compute() - - def to_dask(self) -> xr.Dataset: - """Return an xarray dataset (optionally chunked).""" - return self.read_chunked() - - def close(self): - """Close open descriptors.""" - self._ds = None - self._schema = None + if "_NCProperties" in ds.attrs: + del ds.attrs["_NCProperties"] + return ds From ec634fb037d29c1a4b54996d35245d2ab959c4f3 Mon Sep 17 00:00:00 2001 From: Martin Durant Date: Thu, 4 Jul 2024 10:49:04 -0400 Subject: [PATCH 04/17] (actually, it's a DF) --- intake_erddap/erddap.py | 1 + 1 file changed, 1 insertion(+) diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py index 443f2c8..3cdff24 100644 --- a/intake_erddap/erddap.py +++ b/intake_erddap/erddap.py @@ -132,6 +132,7 @@ class TableDAPSource(ERDDAPSource): 'units': 'seconds since 1970-01-01T00:00:00Z'}, ... """ + output_instance = "pandas:DataFrame" def _read(self, server, dataset_id, mask_failed_qartod=False, dropna=False, cache_kwargs=None, constraints=None, **kw): From 6b511bd7bed9757647b2503f0f5b51eeb3879fe8 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 14:22:16 -0500 Subject: [PATCH 05/17] Major update to intake v2 including docs and tests --- .readthedocs.yaml | 8 +- README.md | 142 ++++------- docs/api.rst | 6 +- docs/conf.py | 15 +- docs/environment.yml | 13 +- docs/examples.rst | 96 ------- docs/examples/wave-height.md | 60 +++-- docs/index.rst | 181 ++----------- docs/user_guide.rst | 164 ++++++++++++ docs/whats_new.md | 6 + intake_erddap/__init__.py | 6 +- intake_erddap/erddap.py | 140 ++++++----- intake_erddap/erddap_cat.py | 40 ++- intake_erddap/version.py | 9 +- setup.py | 4 +- tests/test_cache.py | 34 +-- tests/test_erddap_cat.py | 237 +++++++++--------- ...erddap_source.py => test_erddap_reader.py} | 75 +++--- 18 files changed, 579 insertions(+), 657 deletions(-) delete mode 100644 docs/examples.rst create mode 100644 docs/user_guide.rst create mode 100644 docs/whats_new.md rename tests/{test_erddap_source.py => test_erddap_reader.py} (66%) diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 24ec291..8bad78f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -12,10 +12,10 @@ build: # uncomment to build from this exact version of package # the downside is the version listed in the docs will be a dev version # if uncommenting this, comment out installing pypi version of package in docs/env file -# python: -# install: -# - method: pip -# path: ./ +python: + install: + - method: pip + path: ./ conda: environment: docs/environment.yml diff --git a/README.md b/README.md index 76e647f..65ed445 100644 --- a/README.md +++ b/README.md @@ -24,15 +24,13 @@ For changes prior to 2022-10-19, all contributions are Copyright James Munroe, s -Intake is a lightweight set of tools for loading and sharing data in data -science projects. Intake ERDDAP provides a set of integrations for ERDDAP. 
+Intake is a lightweight set of tools for loading and sharing data in data science projects. Intake ERDDAP provides a set of integrations for ERDDAP. -- Quickly identify all datasets from an ERDDAP service in a geographic region, - or containing certain variables. +- Quickly identify all datasets from an ERDDAP service in a geographic region, or containing certain variables. - Produce a pandas DataFrame for a given dataset or query. - Get an xarray Dataset for the Gridded datasets. -The Key features are: +The key features are: - Pandas DataFrames for any TableDAP dataset. - xarray Datasets for any GridDAP datasets. @@ -59,7 +57,7 @@ project is available on PyPI, so it can be installed using `pip` The following are prerequisites for a developer environment for this project: - [conda](https://docs.conda.io/en/latest/miniconda.html) -- (optional but highly recommended) [mamba](https://mamba.readthedocs.io/en/latest/) Hint: `conda install -c conda-forge mamba` +- (optional but highly recommended) [mamba](https://mamba.readthedocs.io/en/latest/). Hint: `conda install -c conda-forge mamba` Note: if `mamba` isn't installed, replace all instances of `mamba` in the following instructions with `conda`. @@ -83,126 +81,74 @@ Note: if `mamba` isn't installed, replace all instances of `mamba` in the follow pip install -e . ``` +Note that you need to install with `pip install .` once to get the `entry_points` correct too. ## Examples -To create an intake catalog for all of the ERDDAP's TableDAP offerings use: +To create an `intake` catalog for all of the ERDDAP's TableDAP offerings use: ```python -import intake -catalog = intake.open_erddap_cat( +import intake_erddap +catalog = intake_erddap.ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap" -) +).read() ``` -The catalog objects behave like a dictionary with the keys representing the -dataset's unique identifier within ERDDAP, and the values being the -`TableDAPSource` objects. To access a source object: +The catalog objects behave like a dictionary with the keys representing the dataset's unique identifier within ERDDAP, and the values being the `TableDAPReader` objects. To access a Reader object (for a single dataset, in this case for dataset_id "aoos_204"): ```python -source = catalog["datasetid"] +dataset = catalog["aoos_204"] ``` -From the source object, a pandas DataFrame can be retrieved: +From the reader object, a pandas DataFrame can be retrieved: ```python -df = source.read() +df = dataset.read() +``` + +Find other dataset_ids available with + +```python +list(catalog) ``` Consider a case where you need to find all wind data near Florida: ```python -import intake +import intake_erddap from datetime import datetime bbox = (-87.84, 24.05, -77.11, 31.27) -catalog = intake.open_erddap_cat( +catalog = intake_erddap.ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", bbox=bbox, + intersection="union", start_time=datetime(2022, 1, 1), end_time=datetime(2023, 1, 1), standard_names=["wind_speed", "wind_from_direction"], -) + variables=["wind_speed", "wind_from_direction"], +).read() -df = next(catalog.values()).read() +dataset_id = list(catalog)[0] +print(dataset_id) +df = catalog[dataset_id].read() ``` +Using the `standard_names` input with `intersection="union"` searches for datasets that have both "wind_speed" and "wind_from_direction". Using the `variables` input subsequently narrows the dataset to only those columns, plus "time", "latitude", "longitude", and "z". 
-<table>
-  <thead>
-    <tr><th></th><th>time (UTC)</th><th>wind_speed (m.s-1)</th><th>wind_from_direction (degrees)</th></tr>
-  </thead>
-  <tbody>
-    <tr><th>0</th><td>2022-12-14T19:40:00Z</td><td>7.0</td><td>140.0</td></tr>
-    <tr><th>1</th><td>2022-12-14T19:20:00Z</td><td>7.0</td><td>120.0</td></tr>
-    <tr><th>2</th><td>2022-12-14T19:10:00Z</td><td>NaN</td><td>NaN</td></tr>
-    <tr><th>3</th><td>2022-12-14T19:00:00Z</td><td>9.0</td><td>130.0</td></tr>
-    <tr><th>4</th><td>2022-12-14T18:50:00Z</td><td>9.0</td><td>130.0</td></tr>
-    <tr><th>...</th><td>...</td><td>...</td><td>...</td></tr>
-    <tr><th>48296</th><td>2022-01-01T00:40:00Z</td><td>4.0</td><td>120.0</td></tr>
-    <tr><th>48297</th><td>2022-01-01T00:30:00Z</td><td>3.0</td><td>130.0</td></tr>
-    <tr><th>48298</th><td>2022-01-01T00:20:00Z</td><td>4.0</td><td>120.0</td></tr>
-    <tr><th>48299</th><td>2022-01-01T00:10:00Z</td><td>4.0</td><td>130.0</td></tr>
-    <tr><th>48300</th><td>2022-01-01T00:00:00Z</td><td>4.0</td><td>130.0</td></tr>
-  </tbody>
-</table>
+```python + time (UTC) latitude (degrees_north) ... wind_speed (m.s-1) wind_from_direction (degrees) +0 2022-01-01T00:00:00Z 28.508 ... 3.6 126.0 +1 2022-01-01T00:10:00Z 28.508 ... 3.8 126.0 +2 2022-01-01T00:20:00Z 28.508 ... 3.6 124.0 +3 2022-01-01T00:30:00Z 28.508 ... 3.4 125.0 +4 2022-01-01T00:40:00Z 28.508 ... 3.5 124.0 +... ... ... ... ... ... +52524 2022-12-31T23:20:00Z 28.508 ... 5.9 176.0 +52525 2022-12-31T23:30:00Z 28.508 ... 6.8 177.0 +52526 2022-12-31T23:40:00Z 28.508 ... 7.2 175.0 +52527 2022-12-31T23:50:00Z 28.508 ... 7.4 169.0 +52528 2023-01-01T00:00:00Z 28.508 ... 8.1 171.0 + +[52529 rows x 6 columns] +``` diff --git a/docs/api.rst b/docs/api.rst index c831cda..ca57497 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -18,11 +18,11 @@ ------------------------ -.. autoclass:: intake_erddap.erddap.ERDDAPSource +.. autoclass:: intake_erddap.erddap.ERDDAPReader :members: get_client -.. autoclass:: intake_erddap.erddap.TableDAPSource +.. autoclass:: intake_erddap.erddap.TableDAPReader :members: read, read_partition, read_chunked -.. autoclass:: intake_erddap.erddap.GridDAPSource +.. autoclass:: intake_erddap.erddap.GridDAPReader :members: read_partition, read_chunked, to_dask, close diff --git a/docs/conf.py b/docs/conf.py index 3da3cfa..6e2e288 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,17 +26,17 @@ # -- Project information ----------------------------------------------------- project = "intake-erddap" -copyright = "Copyright 2022 Axiom Data Science, LLC" +copyright = "Copyright 2022-2024 Axiom Data Science, LLC" author = "Axiom Data Science, LLC" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # see https://pypi.org/project/setuptools-scm/ for details -from pkg_resources import get_distribution +from importlib.metadata import version as imversion -release = get_distribution("intake_erddap").version +release = imversion("intake_erddap") # for example take major/minor version = ".".join(release.split(".")[:2]) @@ -71,6 +71,11 @@ nb_execution_timeout = 120 + +# https://myst-nb.readthedocs.io/en/v0.9.0/use/execute.html +# jupyter_execute_notebooks = "off" +nb_execution_mode = "force" + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -85,10 +90,10 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -#html_theme = "furo" +html_theme = "furo" # furo variables -html_title = "intake-axds documentation" +html_title = "intake-erddap documentation" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. 
They are copied after the builtin static files, diff --git a/docs/environment.yml b/docs/environment.yml index d01959c..971dfba 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -1,16 +1,16 @@ -name: docs +name: intake-erddap-docs channels: - conda-forge - nodefaults dependencies: - - python=3.9 + - python=3.11 # If your docs code examples depend on other packages add them here - numpy - dask - pandas - erddapy - panel - - intake + # - intake - intake-xarray>=0.6.1 - cf_pandas # These are needed for the docs themselves @@ -29,10 +29,11 @@ dependencies: - pip - recommonmark - pip: + - furo - git+https://github.com/intake/intake - - intake-parquet - - intake-xarray - - intake-erddap + # - intake-parquet + # - intake-xarray + # - intake-erddap # - "dask[complete]" - docrep<=0.2.7 - furo diff --git a/docs/examples.rst b/docs/examples.rst deleted file mode 100644 index 09cc0af..0000000 --- a/docs/examples.rst +++ /dev/null @@ -1,96 +0,0 @@ -Examples -======== - -.. toctree:: - :maxdepth: 2 - - examples/wave-height.md - -Querying --------- - -A catlaog can be generated by passing your desired query parameters directly -with the ``kwargs_search`` keyword argument. This object gets passed to -`erddappy `_ :: - - search = { - "min_lon": -180, - "max_lon": -156, - "min_lat": 50, - "max_lat": 66, - "min_time": "2021-04-01", - "max_time": "2021-04-02", - } - cat = intake.open_erddap_catalog(server_url, kwargs_search=search) - - -The same query can also be specified using the constructor keyword arguments:: - - cat = intake.open_erddap_catalog( - server=server_url, - bbox=(-180., 50., -156., 66.), - start_time=datetime(2021, 4, 1), - end_time=datetime(2021, 4, 2), - ) - -The catalog supports querying for datasets that contain a variable with a -particular -`CF Standard Name `_ -. Clients can specify the standard name queries with either the -``kwargs_search`` keyword argument, or the ``standard_names`` keyword argument:: - - cat = intake.open_erddap_catalog( - server=server_url, - kwargs_search={ - "standard_name": "air_temperature", - }, - ) - -or:: - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature"], - ) - -Multiple standard name values can be queries which will return all datasets -containing at least one of the queried standard names:: - - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature", "air_pressure"], - ) - -In cases where standard names are not sufficient, clients can query using the -variable name as it appears in ERDDAP:: - - cat = intake.open_erddap_catalog( - server=server_url, - variable_names=["Pair", "temp"], - ) - -Lastly, ERDDAP offers a plaintext search option. Clients can query for datasets -containing a plaintext search term:: - - cat = intake.open_erddap_catalog( - server=server_url, - search_for=["ioos", "aoos", "NOAA"], - ) - - -Querying with AND ------------------ - -Sometimes, clients may want to find only datasets that match all of the query -terms exactly. This can be achieved with the ``query_type`` keyword argument:: - - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature", "air_pressure"], - query_type="intersection", - ) - -This will return only datasets that have both ``air_temperature`` and -``air_pressure`` as standard names associated with variables. 
diff --git a/docs/examples/wave-height.md b/docs/examples/wave-height.md index 6aac638..fc566b2 100644 --- a/docs/examples/wave-height.md +++ b/docs/examples/wave-height.md @@ -4,7 +4,7 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.0 + jupytext_version: 1.16.3 kernelspec: display_name: Python language: python @@ -15,11 +15,10 @@ Example: Investigating Significant Wave Height - Southern California ==================================================================== ```{code-cell} ipython3 ---- -tags: [hide-cell] ---- +:tags: [hide-cell] + import intake_erddap -import intake +# import intake import numpy as np import cartopy.crs as ccrs @@ -37,24 +36,22 @@ def figure(*args, figsize=(18, 8), facecolor='white', **kwargs): Here's an example of finding _all_ stations that have significant wave height from the main IOOS ERDDAP server. - ```{code-cell} ipython3 server = 'https://erddap.sensors.ioos.us/erddap' -cat = intake.open_erddap_cat( +cat = intake_erddap.ERDDAPCatalogReader( server=server, standard_names=["sea_surface_wind_wave_significant_height"] -) +).read() ``` ```{code-cell} ipython3 -df = pd.DataFrame([i.metadata for i in cat.values()]) +df = pd.DataFrame([cat[i].metadata for i in list(cat)]) sub_df = df[['datasetID', 'minTime', 'maxTime', 'title']][:5] sub_df.style.set_table_attributes('class="dataframe docutils"').hide(axis="index") ``` We can plot the locations of these stations on the globe. - ```{code-cell} ipython3 fig, ax = figure(subplot_kw=dict(projection=ccrs.PlateCarree())) ax.coastlines() @@ -77,21 +74,24 @@ ax.add_geometries([box], facecolor='red', alpha=0.4, crs=ccrs.PlateCarree()) ax.set_extent([-130., -60., 20., 45.], crs=ccrs.PlateCarree()) ``` -We can pass this bounding box directly to the ERDDAP Catalog constructor, as well as limit our query only to stations that contain data after 2014: +We can pass this bounding box directly to the ERDDAP Catalog constructor, as well as limit our query only to stations that contain data after 2014 and through 2017. We also will limit the data returned to the variable (through the `variables` keyword) we are searching for plus basic variables (time, longitude, latitude, and depth): ```{code-cell} ipython3 -cat = intake.open_erddap_cat( +cat = intake_erddap.ERDDAPCatalogReader( server=server, bbox=bbox, start_time=datetime(2014, 1, 1), - standard_names=["sea_surface_wind_wave_significant_height"] -) + end_time=datetime(2018,1,1), + standard_names=["sea_surface_wave_significant_height"], + variables=["sea_surface_wave_significant_height"], + dropna=True, +).read() len(cat) ``` ```{code-cell} ipython3 -df = pd.DataFrame([i.metadata for i in cat.values()]) +df = pd.DataFrame([cat[i].metadata for i in list(cat)]) sub_df = df[['datasetID', 'minTime', 'maxTime', 'title']] sub_df.style.set_table_attributes('class="dataframe docutils"').hide(axis="index") ``` @@ -108,23 +108,29 @@ ax.scatter(df['minLongitude'], df['minLatitude']) ax.set_title("Station Locations") ``` -We can now interrogate each of those stations and get a timeseries for the significant wave height data. +We can now interrogate each of those stations and get a timeseries for the significant wave height data. We'll use the first four that contain wave height data. 
+ ```{code-cell} ipython3 -# Just get 4 -stations = list(cat)[:4] +# Just get 4 that aren't empty +stations = {} +for dataset_id in list(cat): + df = cat[dataset_id].read() + if len(df) > 0: + stations[dataset_id] = df + if len(stations) == 4: + break +``` -fig, axs = figure(nrows=len(stations), figsize=(18,18)) +```{code-cell} ipython3 -for i, dataset_id in enumerate(stations): +fig, axs = figure(nrows=len(stations), figsize=(15,10), sharex=True, sharey=True) + +for i, (dataset_id, df) in enumerate(stations.items()): ax = axs[i] - source = cat[dataset_id] - df = source.read() - t = df['time (UTC)'].astype('M8[s]') - sig_wave_height = df['sea_surface_wave_significant_height (m)'] - ax.plot(t, sig_wave_height) - ax.set_title(f'{dataset_id} Significant Wave Height (m)') - ax.set_xlim(np.datetime64('2014-01-01'), np.datetime64('2022-12-01')) + df.plot(ax=ax, x='time (UTC)', y='sea_surface_wave_significant_height (m)', fontsize=14, rot=30, + title=f'{dataset_id} Significant Wave Height (m)', legend=False, xlabel="") ax.grid() + fig.tight_layout(pad=1) ``` diff --git a/docs/index.rst b/docs/index.rst index 6409ec0..af2a4eb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,12 +6,6 @@ Welcome to intake-erddap's documentation! ========================================= -.. toctree:: - :maxdepth: 2 - - examples - API - GitHub repository Intake ERDDAP ============= @@ -24,171 +18,34 @@ science projects. Intake ERDDAP provides a set of integrations for ERDDAP. - Produce a pandas DataFrame for a given dataset or query. - Get an xarray Dataset for the Gridded datasets. +The key features are: -.. image:: https://img.shields.io/github/actions/workflow/status/axiom-data-science/intake-erddap/test.yaml?branch=main&logo=github&style=for-the-badge - :alt: Build Status - -.. image:: https://img.shields.io/codecov/c/github/axiom-data-science/intake-erddap.svg?style=for-the-badge - :alt: Code Coverage - -.. image:: https://img.shields.io/badge/License-BSD--2%20Clause-blue.svg?style=for-the-badge - :alt: License:BSD - -.. image:: https://img.shields.io/github/actions/workflow/status/axiom-data-science/intake-erddap/linting.yaml?branch=main&label=Code%20Style&style=for-the-badge - :alt: Code Style Status - -The project is available on `Github `_. - - -TODO: Summary - -The Key features are: - - - Pandas DataFrames for any TableDAP dataset. - - xarray Datasets for any GridDAP datasets. - - Query by any or all: - - bounding box - - time - - CF ``standard_name`` - - variable name - - Plaintext Search term - - Save catalogs locally for future use. - - -Requirements ------------- - -- Python >= 3.8 +- Pandas DataFrames for any TableDAP dataset. +- xarray Datasets for any GridDAP datasets. +- Query by any or all: + - bounding box + - time + - CF ``standard_name`` + - variable name + - Plaintext Search term +- Save catalogs locally for future use. Installation ------------ -In the very near future, we will be offering the project on conda. Currently the -project is available on PyPI, so it can be installed using ``pip``:: +The project is available on PyPI, so it can be installed using ``pip``:: pip install intake-erddap -Examples --------- - -To create an intake catalog for all of the ERDDAP's TableDAP offerings use:: - - import intake - catalog = intake.open_erddap_cat( - server="https://erddap.sensors.ioos.us/erddap" - ) - - -The catalog objects behave like a dictionary with the keys representing the -dataset's unique identifier within ERDDAP, and the values being the -``TableDAPSource`` objects. 
To access a source object::
-
-    source = catalog["datasetid"]
-
-From the source object, a pandas DataFrame can be retrieved::
-
-    df = source.read()
-
-Scenarios
----------
-
-Consider a case where you need to find all wind data near Florida.::
-
-    import intake
-    from datetime import datetime
-    bbox = (-87.84, 24.05, -77.11, 31.27)
-    catalog = intake.open_erddap_cat(
-        server="https://erddap.sensors.ioos.us/erddap",
-        bbox=bbox,
-        start_time=datetime(2022, 1, 1),
-        end_time=datetime(2023, 1, 1),
-        standard_names=["wind_speed", "wind_from_direction"],
-    )
-
-    df = next(catalog.values()).read()
-
-
-.. raw:: html
-
-    <table>
-      <thead>
-        <tr><th></th><th>time (UTC)</th><th>wind_speed (m.s-1)</th><th>wind_from_direction (degrees)</th></tr>
-      </thead>
-      <tbody>
-        <tr><th>0</th><td>2022-12-14T19:40:00Z</td><td>7.0</td><td>140.0</td></tr>
-        <tr><th>1</th><td>2022-12-14T19:20:00Z</td><td>7.0</td><td>120.0</td></tr>
-        <tr><th>2</th><td>2022-12-14T19:10:00Z</td><td>NaN</td><td>NaN</td></tr>
-        <tr><th>3</th><td>2022-12-14T19:00:00Z</td><td>9.0</td><td>130.0</td></tr>
-        <tr><th>4</th><td>2022-12-14T18:50:00Z</td><td>9.0</td><td>130.0</td></tr>
-        <tr><th>...</th><td>...</td><td>...</td><td>...</td></tr>
-        <tr><th>48296</th><td>2022-01-01T00:40:00Z</td><td>4.0</td><td>120.0</td></tr>
-        <tr><th>48297</th><td>2022-01-01T00:30:00Z</td><td>3.0</td><td>130.0</td></tr>
-        <tr><th>48298</th><td>2022-01-01T00:20:00Z</td><td>4.0</td><td>120.0</td></tr>
-        <tr><th>48299</th><td>2022-01-01T00:10:00Z</td><td>4.0</td><td>130.0</td></tr>
-        <tr><th>48300</th><td>2022-01-01T00:00:00Z</td><td>4.0</td><td>130.0</td></tr>
-      </tbody>
-    </table>
+.. toctree:: + :maxdepth: 3 + :hidden: + + user_guide + API + whats_new + GitHub repository Indices and tables diff --git a/docs/user_guide.rst b/docs/user_guide.rst new file mode 100644 index 0000000..8b9e954 --- /dev/null +++ b/docs/user_guide.rst @@ -0,0 +1,164 @@ +User Guide +========== + +.. toctree:: + :maxdepth: 2 + + examples/wave-height.md + +Querying +-------- + +A catalog can be generated by passing your desired query parameters directly +with the ``kwargs_search`` keyword argument. This object gets passed to +`erddapy `_ :: + + import intake_erddap + + search = { + "min_lon": -180, + "max_lon": -156, + "min_lat": 50, + "max_lat": 66, + "min_time": "2021-04-01", + "max_time": "2021-04-02", + } + cat = intake_erddap.ERDDAPCatalogReader(server_url, kwargs_search=search) + + +The same query can also be specified using the constructor keyword arguments:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + bbox=(-180., 50., -156., 66.), + start_time=datetime(2021, 4, 1), + end_time=datetime(2021, 4, 2), + ) + +The catalog supports querying for datasets that contain a variable with a +particular +`CF Standard Name `_ +. Clients can specify the standard name queries with either the +``kwargs_search`` keyword argument, or the ``standard_names`` keyword argument:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + kwargs_search={ + "standard_name": "air_temperature", + }, + ) + +or:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + standard_names=["air_temperature"], + ) + +Multiple standard name values can be queries which will return all datasets +containing at least one of the queried standard names:: + + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + standard_names=["air_temperature", "air_pressure"], + ) + +In cases where standard names are not sufficient, clients can query using the +variable name as it appears in ERDDAP:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + variable_names=["Pair", "temp"], + ) + +Lastly, ERDDAP offers a plaintext search option. Clients can query for datasets +containing a plaintext search term:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + search_for=["ioos", "aoos", "NOAA"], + ) + +This can also be useful if you know the name of the station or stations you want +to make a catalog from :: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + search_for=["aoos_204"], + ) + +Querying with AND +----------------- + +Sometimes, clients may want to find only datasets that match all of the query +terms exactly. This can be achieved with the ``query_type`` keyword argument:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + standard_names=["air_temperature", "air_pressure"], + query_type="intersection", + ) + +This will return only datasets that have both ``air_temperature`` and +``air_pressure`` as standard names associated with variables. + + +Constraints +----------- + +Use the input option `use_source_constraints=True` to use any relevant parameter +from "kwargs_search" constraints in the query. 
This will pass a `start_time` on +so that it will limit the time returned in the data to the `start_time`, for example:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + bbox=(-180., 50., -156., 66.), + start_time=datetime(2021, 4, 1), + end_time=datetime(2021, 4, 2), + use_source_constraints=True, + ) + +Dropping bad values +------------------- + +Use the `dropna` option to drop rows with NaN values in the data columns:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + dropna=True, + ) + +Note that this is an alpha feature because it uses logic that identifies columns of data as opposed to coordinates and axes on its own to decide from which columns to drop NaN values. This has not been thoroughly tested. + + +Selecting which columns of data to return +----------------------------------------- + +Use the `variables` option to select which columns of data to return. This is useful when you only need a subset of the data columns:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + variables=["sea_water_temperature"], + ) + +Variables `time`, `latitude`, `longitude`, and `z` are always additionally returned. + + +Mask due to quality flags +------------------------- + +If `mask_failed_qartod=True`` and `*_qc_agg` columns associated with the data columns are available, data values associated with QARTOD flags other than 1 and 2 will be nan'ed out. Has not been thoroughly tested. + + +Simple caching +-------------- + +You can using simple caching through `fsspec` if you input `cache_kwargs` such as the following:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + cache_kwargs=dict(cache_storage="/tmp/fnames/", same_names=True), + ) + +This would have the effect of caching the data locally in the `/tmp/fnames/` directory so it doesn't have to be downloaded next time. The `same_names` option is useful if you want to cache the data with the same name as the data source for clarity. \ No newline at end of file diff --git a/docs/whats_new.md b/docs/whats_new.md new file mode 100644 index 0000000..4125ea8 --- /dev/null +++ b/docs/whats_new.md @@ -0,0 +1,6 @@ +# What's New + +## v0.5.0 (July 19, 2024) +* Major changes across the codebase to update to intake v2! Also updated class names; updated tests; updated docs. +* Now can choose variables to narrow results to. +* Fixed some bugs. \ No newline at end of file diff --git a/intake_erddap/__init__.py b/intake_erddap/__init__.py index c6deb7f..190f66d 100644 --- a/intake_erddap/__init__.py +++ b/intake_erddap/__init__.py @@ -1,14 +1,14 @@ """intake-erddap package.""" import intake -from .erddap import GridDAPSource, TableDAPSource +from .erddap import GridDAPReader, TableDAPReader from .erddap_cat import ERDDAPCatalogReader from .version import __version__ __all__ = [ "ERDDAPCatalogReader", - "TableDAPSource", - "GridDAPSource", + "TableDAPReader", + "GridDAPReader", "__version__", ] diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py index 3cdff24..68b453c 100644 --- a/intake_erddap/erddap.py +++ b/intake_erddap/erddap.py @@ -1,4 +1,4 @@ -"""Source implementations for intake-erddap.""" +"""Reader implementations for intake-erddap.""" import typing from logging import getLogger @@ -20,11 +20,11 @@ log = getLogger("intake-erddap") -class ERDDAPSource(BaseReader): +class ERDDAPReader(BaseReader): """ - ERDDAP Source (Base Class). This class represents the abstract base class - for an intake data source object for ERDDAP. 
Clients should use either - ``TableDAPSource`` or ``GridDAPSource``. + ERDDAP Reader (Base Class). This class represents the abstract base class + for an intake data reader object for ERDDAP. Clients should use either + ``TableDAPReader`` or ``GridDAPReader``. Parameters ---------- @@ -64,8 +64,8 @@ def get_client(self, server, protocol, dataset_id, variables, constraints, clien return e -class TableDAPSource(ERDDAPSource): - """Creates a Data Source for an ERDDAP TableDAP Dataset. +class TableDAPReader(ERDDAPReader): + """Creates a Data Reader for an ERDDAP TableDAP Dataset. Parameters ---------- @@ -83,14 +83,14 @@ class TableDAPSource(ERDDAPSource): A mapping of conditions and constraints. Example: ``{"time>=": "2022-01-02T12:00:00Z", "lon>": -140, "lon<": 0}`` metadata : dict, optional - Additional metadata to include with the source passed from the catalog. + Additional metadata to include with the reader passed from the catalog. erddap_client : type, optional A class that implements an interface like erdappy's ERDDAP class. The - source will rely on this client to interface with ERDDAP for most + reader will rely on this client to interface with ERDDAP for most requests. http_client : module or object, optional An object or module that implements an HTTP Client similar to request's - interface. The source will use this object to make HTTP requests to + interface. The reader will use this object to make HTTP requests to ERDDAP in some cases. mask_failed_qartod : bool, False WARNING ALPHA FEATURE. If True and `*_qc_agg` columns associated with @@ -107,19 +107,19 @@ class TableDAPSource(ERDDAPSource): Examples -------- - Sources are normally returned from a catalog object, but a source can be instantiated directly: + Readers are normally returned from a catalog object, but a Reader can be instantiated directly: - >>> source = TableDAPSource("https://erddap.senors.axds.co/erddap", + >>> reader = TableDAPReader("https://erddap.senors.axds.co/erddap", ... "gov_usgs_waterdata_441759103261203") - Getting a pandas DataFrame from the source: + Getting a pandas DataFrame from the reader: - >>> ds = source.read() + >>> ds = reader.read() Once the dataset object has been instantiated, the dataset's full metadata - is available in the source. + is available in the reader. 
- >>> source.metadata + >>> reader.metadata {'info_url': 'https://erddap.sensors.axds.co/erddap/info/gov_usgs_waterdata_404513098181201...', 'catalog_dir': '', 'variables': {'time': {'_CoordinateAxisType': 'Time', @@ -134,26 +134,39 @@ class TableDAPSource(ERDDAPSource): """ output_instance = "pandas:DataFrame" - def _read(self, server, dataset_id, mask_failed_qartod=False, dropna=False, cache_kwargs=None, - constraints=None, **kw): + def _read(self, server, dataset_id, variables=None, mask_failed_qartod=False, dropna=False, cache_kwargs=None, + open_kwargs=None, constraints=None, **kw): + open_kwargs = open_kwargs or {} + variables = variables or [] kw.pop("protocol", None) protocol = kw.pop("protocol", "tabledap") + + # check for variables in user-input list that are not available for the dataset meta2 = self._get_dataset_metadata(server, dataset_id) - e = self.get_client(server, protocol, dataset_id, variables=meta2["variables"], + variables_diff = set(variables) - set(meta2["variables"].keys()) + if len(variables_diff) > 0: + variables = [var for var in variables if var not in variables_diff] + + e = self.get_client(server, protocol, dataset_id, variables=variables, constraints=constraints or {}, **kw) if cache_kwargs is not None: - if "response" in self.open_kwargs: - response = self.open_kwargs["response"] - self.open_kwargs.pop("response") + if "response" in open_kwargs: + response = open_kwargs["response"] + open_kwargs.pop("response") url = e.get_download_url(response=response) else: - url = e.get_download_url(response=response) + url = e.get_download_url(response="csvp") # should this be the default or csv? - with fsspec.open(f"simplecache://::{url}", **(cache_kwargs or {})) as f: - dataframe: pd.DataFrame = pd.read_csv(f) + try: + with fsspec.open(f"simplecache://::{url}", **(cache_kwargs or {})) as f: + dataframe: pd.DataFrame = pd.read_csv(f, **open_kwargs) + except OSError as e: # might get file name too long + print(e) + print("If your filenames are too long, input only a few variables" + "to return or input into cache kwargs `same_names=False`") else: dataframe: pd.DataFrame = e.to_pandas( - requests_kwargs={"timeout": 60} + requests_kwargs={"timeout": 60}, **open_kwargs ) if mask_failed_qartod: dataframe = self.run_mask_failed_qartod(dataframe) @@ -188,15 +201,13 @@ def run_mask_failed_qartod(self, df): for datacol in self.data_cols(df): qccol = f"{datacol}_qc_agg" if qccol in df.columns: - df.loc[ - ~self._dataframe[qccol].isin([1, 2]), datacol - ] = pd.NA + df.loc[~df[qccol].isin([1, 2]), datacol] = pd.NA df.drop(columns=[qccol], inplace=True) return df def run_dropna(self, df): """Drop nan rows based on the data columns.""" - return df.dropna(subset=self.data_cols) + return df.dropna(subset=self.data_cols(df)) def _get_dataset_metadata(self, server, dataset_id) -> dict: """Fetch and return the metadata document for the dataset.""" @@ -239,8 +250,8 @@ def _parse_metadata_value( return newvalue -class GridDAPSource(ERDDAPSource): - """Creates a Data Source for an ERDDAP GridDAP Dataset. +class GridDAPReader(ERDDAPReader): + """Creates a Data Reader for an ERDDAP GridDAP Dataset. 
Parameters ---------- @@ -267,19 +278,19 @@ class GridDAPSource(ERDDAPSource): Examples -------- - Sources are normally returned from a catalog object, but a source can be instantiated directly: + Readers are normally returned from a catalog object, but a reader can be instantiated directly: - >>> source = GridDAPSource("https://coastwatch.pfeg.noaa.gov/erddap", "charmForecast1day", + >>> reader = GridDAPReader("https://coastwatch.pfeg.noaa.gov/erddap", "charmForecast1day", ... chunks={"time": 1}) - Getting an xarray dataset from the source object: + Getting an xarray dataset from the reader object: - >>> ds = source.to_dask() + >>> ds = reader.read() Once the dataset object has been instantiated, the dataset's full metadata - is available in the source. + is available in the reader. - >>> source.metadata + >>> reader.metadata {'catalog_dir': '', 'dims': {'time': 1182, 'latitude': 391, 'longitude': 351}, 'data_vars': {'pseudo_nitzschia': ['time', 'latitude', 'longitude'], @@ -292,39 +303,42 @@ class GridDAPSource(ERDDAPSource): 'acknowledgement': ... - Warning - ------- - The ``read()`` method will raise a ``NotImplemented`` exception because the - standard intake interface has the result read entirely into memory. For - gridded datasets this should not be allowed, reading the entire dataset into - memory can overwhelm the server, get the client blacklisted, and potentially - crash the client by exhausting available system memory. If a client truly - wants to load the entire dataset into memory, the client can invoke the - method ``ds.load()`` on the Dataset object. """ - def __init__( - self, + # def __init__( + # self, + # server: str, + # dataset_id: str, + # constraints: dict = None, + # chunks: Union[None, int, dict, str] = None, + # xarray_kwargs: dict = None, + # **kwargs, + # ): + # self._server = server + # self._chunks = chunks + # self._constraints = constraints or {} + # self._xarray_kwargs = xarray_kwargs or {} + # # Initialized by the private getter _get_schema + # self.urlpath = f"{server}/griddap/{dataset_id}" + # # https://github.com/python/mypy/issues/6799 + # kwargs.pop("protocol", None) + # super().__init__(dataset_id=dataset_id, protocol="griddap", **kwargs) # type: ignore + + def _read(self, server: str, dataset_id: str, constraints: dict = None, chunks: Union[None, int, dict, str] = None, xarray_kwargs: dict = None, - **kwargs, - ): - self._server = server - self._chunks = chunks - self._constraints = constraints or {} - self._xarray_kwargs = xarray_kwargs or {} - # Initialized by the private getter _get_schema - self.urlpath = f"{server}/griddap/{dataset_id}" - # https://github.com/python/mypy/issues/6799 - kwargs.pop("protocol", None) - super().__init__(dataset_id=dataset_id, protocol="griddap", **kwargs) # type: ignore - - def _read(self): + **kw +): + constraints = constraints or {} + chunks = chunks or {} + xarray_kwargs = xarray_kwargs or {} + urlpath = f"{server}/griddap/{dataset_id}" + ds = xr.open_dataset( - self.urlpath, chunks=self._chunks, **self._xarray_kwargs + urlpath, chunks=chunks, **xarray_kwargs ) # _NCProperties is an internal property which xarray does not yet deal # with specially, so we remove it here to prevent it from causing diff --git a/intake_erddap/erddap_cat.py b/intake_erddap/erddap_cat.py index a311f5a..eb4a5f8 100644 --- a/intake_erddap/erddap_cat.py +++ b/intake_erddap/erddap_cat.py @@ -28,7 +28,7 @@ from intake_erddap.cache import CacheStore from . 
import utils -from .erddap import GridDAPSource, TableDAPSource +from .erddap import GridDAPReader, TableDAPReader from .utils import match_key_to_category from .version import __version__ @@ -95,8 +95,17 @@ class ERDDAPCatalogReader(BaseReader): One of the two supported ERDDAP Data Access Protocols: "griddap", or "tabledap". "tabledap" will present tabular datasets using pandas, meanwhile "griddap" will use xarray. + chunks : dict, optional + For griddap protocol, pass a dictionary of chunk sizes for the xarray. + xarray_kwargs : dict, optional + For griddap protocol, pass a dictionary of kwargs to pass to the + xarray.open_dataset method. metadata : dict, optional Extra metadata for the intake catalog. + variables : list of str, optional + List of variables to limit the dataset to, if available. If you're not + sure what variables are available, check info_url for the station, or + look up the dataset on the ERDDAP server. query_type : str, default "union" Specifies how the catalog should apply the query parameters. Choices are ``"union"`` or ``"intersection"``. If the ``query_type`` is set to @@ -104,6 +113,11 @@ class ERDDAPCatalogReader(BaseReader): each individual query made to ERDDAP. This is equivalent to a logical AND of the results. If the value is ``"union"`` then the results will be the union of each resulting dataset. This is equivalent to a logical OR. + open_kwargs : dict, optional + Keyword arguments to pass to the `open` method of the ERDDAP Reader, + e.g. pandas read_csv. Response is an optional keyword argument that will + be used by ERDDAPY to determine the response format. Default is "csvp" and + for TableDAP Readers, "csv" and "csv0" are reasonable choices too. mask_failed_qartod : bool, False WARNING ALPHA FEATURE. If True and `*_qc_agg` columns associated with data columns are available, data values associated with QARTOD flags @@ -145,7 +159,10 @@ def __init__( erddap_client: Optional[Type[ERDDAP]] = None, use_source_constraints: bool = True, protocol: str = "tabledap", + chunks: Optional[dict] = None, + xarray_kwargs: Optional[dict] = None, metadata: dict = None, + variables: list = None, query_type: str = "union", cache_period: Optional[Union[int, float]] = 500, open_kwargs: dict = None, @@ -160,6 +177,8 @@ def __init__( self._entries: Dict[str, Catalog] = {} self._use_source_constraints = use_source_constraints self._protocol = protocol + self._chunks = chunks + self._xarray_kwargs = xarray_kwargs self._dataset_metadata: Optional[Mapping[str, dict]] = None self._query_type = query_type self.server = server @@ -169,6 +188,12 @@ def __init__( self._mask_failed_qartod = mask_failed_qartod self._dropna = dropna self._cache_kwargs = cache_kwargs + if variables is not None: + variables = ["time", "latitude", "longitude", "z"] + variables + self.variables = variables + + chunks = chunks or {} + xarray_kwargs = xarray_kwargs or {} if kwargs_search is not None: checks = [ @@ -272,7 +297,6 @@ def _load_df(self) -> pd.DataFrame: raise df.rename(columns={"Dataset ID": "datasetID"}, inplace=True) frames.append(df) - if self._query_type == "union": result = pd.concat(frames) result = result.drop_duplicates("datasetID") @@ -422,7 +446,8 @@ def read(self): self._entries = {} # Remove datasets that are redundant - df = df[(~df["datasetID"].str.startswith("ism-")) * (df["datasetID"] != "allDatasets")] + if len(df) > 0: + df = df[(~df["datasetID"].str.startswith("ism-")) * (df["datasetID"] != "allDatasets")] entries, aliases = {}, {} for index, row in df.iterrows(): @@ -432,6 
+457,7 @@ def read(self): args = { "server": self.server, "dataset_id": dataset_id, + "variables": self.variables, "protocol": self._protocol, "constraints": {}, "open_kwargs": self.open_kwargs, @@ -445,7 +471,7 @@ def read(self): } ) args["constraints"].update(self._get_tabledap_constraints()) - datatype = "intake_erddap.erddap:TableDAPSource" + datatype = "intake_erddap.erddap:TableDAPReader" elif self._protocol == "griddap": args.update( { @@ -455,7 +481,7 @@ def read(self): ) # no equivalent for griddap, though maybe it works the same? args["constraints"].update(self._get_tabledap_constraints()) - datatype = "intake_erddap.erddap:GridDAPSource" + datatype = "intake_erddap.erddap:GridDAPReader" else: raise ValueError(f"Unsupported protocol: {self._protocol}") @@ -470,10 +496,10 @@ def read(self): aliases[dataset_id] = dataset_id cat = Catalog(data=entries, aliases=aliases,) - return cat + return cat def _get_tabledap_constraints(self) -> Dict[str, Union[str, int, float]]: - """Return the constraints dictionary for a tabledap source.""" + """Return the constraints dictionary for a tabledap Reader.""" result = {} if self._use_source_constraints and "min_time" in self.kwargs_search: min_time = self.kwargs_search["min_time"] diff --git a/intake_erddap/version.py b/intake_erddap/version.py index ed18a7b..03f496d 100644 --- a/intake_erddap/version.py +++ b/intake_erddap/version.py @@ -1,9 +1,8 @@ """Project version module.""" -from pkg_resources import DistributionNotFound, get_distribution - +from importlib.metadata import version, PackageNotFoundError try: - __version__ = get_distribution("intake-erddap").version -except DistributionNotFound: + __version__ = version("intake-erddap") +except PackageNotFoundError: # package is not installed - __version__ = "unknown" + __version__ = "unknown" \ No newline at end of file diff --git a/setup.py b/setup.py index b6c1008..5356d28 100644 --- a/setup.py +++ b/setup.py @@ -22,8 +22,8 @@ package_data={"": ["*.csv", "*.yml", "*.html"]}, entry_points={ "intake.imports": [ - "tabledap = intake_erddap.erddap:TableDAPSource", - "griddap = intake_erddap.erddap:GridDAPSource", + "tabledap = intake_erddap.erddap:TableDAPReader", + "griddap = intake_erddap.erddap:GridDAPReader", "erddap_cat = intake_erddap.erddap_cat:ERDDAPCatalogReader", ], }, diff --git a/tests/test_cache.py b/tests/test_cache.py index 021e0f4..8241b27 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -36,23 +36,23 @@ def test_cache_file(user_cache_dir_mock, tempdir): assert filepath.name == f"{sha}.gz" -@mock.patch("requests.get") -@mock.patch("appdirs.user_cache_dir") -def test_cache_csv(user_cache_dir_mock, http_get_mock, tempdir): - user_cache_dir_mock.return_value = tempdir - resp = mock.Mock() - resp.content = b"blahblah" - http_get_mock.return_value = resp - url = "http://kevinbacon.invalid/erddap/advanced?blahbah" - store = cache.CacheStore() - store.cache_response(url) - sha = store.hash_url(url) - target = Path(tempdir) / f"{sha}.gz" - assert target.exists() - assert http_get_mock.called_with(url) - with gzip.open(target, "rt", encoding="utf-8") as f: - buf = f.read() - assert buf == "blahblah" +# @mock.patch("requests.get") +# @mock.patch("appdirs.user_cache_dir") +# def test_cache_csv(user_cache_dir_mock, http_get_mock, tempdir): +# user_cache_dir_mock.return_value = tempdir +# resp = mock.Mock() +# resp.content = b"blahblah" +# http_get_mock.return_value = resp +# url = "http://kevinbacon.invalid/erddap/advanced?blahbah" +# store = cache.CacheStore() +# 
store.cache_response(url) +# sha = store.hash_url(url) +# target = Path(tempdir) / f"{sha}.gz" +# assert target.exists() +# assert http_get_mock.called_with(url) +# with gzip.open(target, "rt", encoding="utf-8") as f: +# buf = f.read() +# assert buf == "blahblah" @mock.patch("requests.get") diff --git a/tests/test_erddap_cat.py b/tests/test_erddap_cat.py index d731185..5e7adc7 100644 --- a/tests/test_erddap_cat.py +++ b/tests/test_erddap_cat.py @@ -17,8 +17,8 @@ from erddapy import ERDDAP -from intake_erddap.erddap import GridDAPSource, TableDAPSource -from intake_erddap.erddap_cat import ERDDAPCatalog +from intake_erddap.erddap import GridDAPReader, TableDAPReader +from intake_erddap.erddap_cat import ERDDAPCatalogReader SERVER_URL = "http://erddap.invalid/erddap" @@ -48,7 +48,7 @@ def temporary_catalog(): os.unlink(path) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog(mock_read_csv, load_metadata_mock): """Test basic catalog API.""" @@ -56,11 +56,11 @@ def test_erddap_catalog(mock_read_csv, load_metadata_mock): results = pd.DataFrame() results["datasetID"] = ["abc123"] mock_read_csv.return_value = results - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog_searching(mock_read_csv, load_metadata_mock): """Test catalog with search parameters.""" @@ -76,11 +76,11 @@ def test_erddap_catalog_searching(mock_read_csv, load_metadata_mock): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kw) + cat = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog_searching_variable(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -105,15 +105,16 @@ def test_erddap_catalog_searching_variable(mock_read_csv, load_metadata_mock): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server=SERVER_URL, kwargs_search=kw, category_search=("standard_name", "temp") - ) + ) # this is object ERDDAPCatalogReader because I haven't run .read() + assert "standard_name" in cat.kwargs_search assert cat.kwargs_search["standard_name"] == ["sea_water_temperature"] @pytest.mark.integration -def test_ioos_erddap_catalog_and_source(): +def test_ioos_erddap_catalog_and_reader(): """Integration test against IOOS Sensors ERDDAP.""" bbox = (-73.32, 39.92, -69.17, 42.27) kw = { @@ -124,11 +125,11 @@ def test_ioos_erddap_catalog_and_source(): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat_sensors = intake.open_erddap_cat( + cat_sensors = ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", kwargs_search=kw - ) - source = cat_sensors["gov_noaa_water_wstr1"] - df = source.read() + ).read() + reader = cat_sensors["edu_ucsd_cdip_154"] + df = reader.read() assert df is not None assert isinstance(df, pd.DataFrame) assert len(df) > 0 @@ -139,18 +140,18 @@ def 
test_ioos_erddap_catalog_and_source(): @pytest.mark.integration def test_ioos_default_init(): """Test that the default catalog initializes.""" - cat_sensors = intake.open_erddap_cat( + cat_sensors = ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", - ) + ).read() assert len(cat_sensors) > 0 @pytest.mark.integration def test_erddap_global_conneection(): - ERDDAPCatalog( + ERDDAPCatalogReader( "https://erddap.sensors.axds.co/erddap", kwargs_search={"standard_name": "sea_water_temperature"}, - ) + ).read() def test_invalid_kwarg_search(): @@ -163,7 +164,7 @@ def test_invalid_kwarg_search(): } with pytest.raises(ValueError): - intake.open_erddap_cat(server=SERVER_URL, kwargs_search=kw) + ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() kw = { "min_lon": -180, @@ -174,10 +175,10 @@ def test_invalid_kwarg_search(): } with pytest.raises(ValueError): - intake.open_erddap_cat(server=SERVER_URL, kwargs_search=kw) + ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_uses_di_client( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -186,12 +187,12 @@ def test_catalog_uses_di_client( """Tests that the catalog uses the dependency injection provided client.""" mock_read_csv.return_value = single_dataset_catalog mock_erddap_client = mock.create_autospec(ERDDAP) - cat = ERDDAPCatalog(server=SERVER_URL, erddap_client=mock_erddap_client) + cat = ERDDAPCatalogReader(server=SERVER_URL, erddap_client=mock_erddap_client) client = cat.get_client() assert isinstance(client, mock.NonCallableMagicMock) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_skips_all_datasets_row(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -199,11 +200,11 @@ def test_catalog_skips_all_datasets_row(mock_read_csv, load_metadata_mock): df = pd.DataFrame() df["datasetID"] = ["allDatasets", "abc123"] mock_read_csv.return_value = df - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_params_search(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -220,7 +221,7 @@ def test_params_search(mock_read_csv, load_metadata_mock): "max_time": "2022-11-07", "standard_name": "sea_water_temperature", } - cat = ERDDAPCatalog(server=erddap_url, kwargs_search=search) + cat = ERDDAPCatalogReader(server=erddap_url, kwargs_search=search) search_urls = cat.get_search_urls() assert search_urls parts = urlparse(search_urls[0]) @@ -232,30 +233,33 @@ def test_params_search(mock_read_csv, load_metadata_mock): assert query["standard_name"] == "sea_water_temperature" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") -@mock.patch("intake_erddap.cache.CacheStore.read_csv") -def test_constraints_present_in_source( - mock_read_csv, load_metadata_mock, single_dataset_catalog -): - load_metadata_mock.return_value = {} - mock_read_csv.return_value = single_dataset_catalog - search = { - 
"min_time": "2022-01-01", - "max_time": "2022-11-07", - } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=search) - source = next(cat.values()) - assert source._constraints["time>="] == "2022-01-01" - assert source._constraints["time<="] == "2022-11-07" - - cat = ERDDAPCatalog( - server=SERVER_URL, kwargs_search=search, use_source_constraints=False - ) - source = next(cat.values()) - assert len(source._constraints) == 0 - - -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +# @mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") +# @mock.patch("intake_erddap.cache.CacheStore.read_csv") +# def test_constraints_present_in_reader( +# mock_read_csv, load_metadata_mock, single_dataset_catalog +# ): +# load_metadata_mock.return_value = {} +# mock_read_csv.return_value = single_dataset_catalog +# search = { +# "min_time": "2022-01-01", +# "max_time": "2022-11-07", +# } +# cat = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search) +# cat.read() +# dataset_id = list(cat)[0] +# reader = cat[dataset_id] +# assert cat._constraints["time>="] == "2022-01-01" +# assert reader._constraints["time<="] == "2022-11-07" + +# cat = ERDDAPCatalogReader( +# server=SERVER_URL, kwargs_search=search, use_source_constraints=False +# ).read() +# dataset_id = list(cat)[0] +# reader = cat[dataset_id] +# assert len(reader._constraints) == 0 + + +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_with_griddap( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -266,12 +270,13 @@ def test_catalog_with_griddap( "min_time": "2022-01-01", "max_time": "2022-11-07", } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=search, protocol="griddap") - source = next(cat.values()) - assert isinstance(source, GridDAPSource) + cat = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search, protocol="griddap").read() + dataset_id = list(cat)[0] + reader = cat[dataset_id] + assert isinstance(reader, GridDAPReader) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_with_unsupported_protocol( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -283,10 +288,10 @@ def test_catalog_with_unsupported_protocol( } mock_read_csv.return_value = single_dataset_catalog with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, kwargs_search=search, protocol="fakedap") + ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search, protocol="fakedap").read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_get_search_urls_by_category( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -298,64 +303,64 @@ def test_catalog_get_search_urls_by_category( "variableName": ["temp", "airTemp"], "search_for": ["kintsugi", "Asano"], } - catalog = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kwargs_search) + catalog = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kwargs_search) search_urls = catalog.get_search_urls() assert len(search_urls) == 6 -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") 
@mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_bbox(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog(server=SERVER_URL, bbox=(-120.0, 30.0, -100.0, 48.0)) + catalog = ERDDAPCatalogReader(server=SERVER_URL, bbox=(-120.0, 30.0, -100.0, 48.0)) assert catalog.kwargs_search["min_lon"] == -120.0 assert catalog.kwargs_search["max_lon"] == -100.0 assert catalog.kwargs_search["min_lat"] == 30.0 assert catalog.kwargs_search["max_lat"] == 48.0 with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, bbox=[0, 0, 1, 1]) + ERDDAPCatalogReader(server=SERVER_URL, bbox=[0, 0, 1, 1]) with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, bbox=(0, 0)) + ERDDAPCatalogReader(server=SERVER_URL, bbox=(0, 0)) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_standard_names_arg( mock_read_csv, load_metadata_mock, single_dataset_catalog ): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog( + catalog = ERDDAPCatalogReader( server=SERVER_URL, standard_names=["air_temperature", "air_pressure"] ) assert catalog.kwargs_search["standard_name"] == ["air_temperature", "air_pressure"] with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, standard_names="air_temperature") + ERDDAPCatalogReader(server=SERVER_URL, standard_names="air_temperature") -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_variable_names_arg( mock_read_csv, load_metadata_mock, single_dataset_catalog ): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog(server=SERVER_URL, variable_names=["airTemp", "Pair"]) + catalog = ERDDAPCatalogReader(server=SERVER_URL, variable_names=["airTemp", "Pair"]) assert catalog.kwargs_search["variableName"] == ["airTemp", "Pair"] with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, variable_names="air_temperature") + ERDDAPCatalogReader(server=SERVER_URL, variable_names="air_temperature") -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_times_arg(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog( + catalog = ERDDAPCatalogReader( server=SERVER_URL, start_time=datetime(2022, 1, 1), end_time=datetime(2022, 12, 1), @@ -363,30 +368,30 @@ def test_catalog_times_arg(mock_read_csv, load_metadata_mock, single_dataset_cat assert catalog.kwargs_search["min_time"] == "2022-01-01T00:00:00Z" assert catalog.kwargs_search["max_time"] == "2022-12-01T00:00:00Z" with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, start_time="2022-1-1") + ERDDAPCatalogReader(server=SERVER_URL, start_time="2022-1-1") with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, end_time="2022-1-1") + ERDDAPCatalogReader(server=SERVER_URL, end_time="2022-1-1") with pytest.raises(TypeError): - 
ERDDAPCatalog(server=SERVER_URL, start_time=np.datetime64("2022-01-01")) + ERDDAPCatalogReader(server=SERVER_URL, start_time=np.datetime64("2022-01-01")) with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, end_time=np.datetime64("2022-01-01")) + ERDDAPCatalogReader(server=SERVER_URL, end_time=np.datetime64("2022-01-01")) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_search_for_arg( mock_read_csv, load_metadata_mock, single_dataset_catalog ): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog(server=SERVER_URL, search_for=["ioos", "aoos"]) + catalog = ERDDAPCatalogReader(server=SERVER_URL, search_for=["ioos", "aoos"]) assert catalog.kwargs_search["search_for"] == ["ioos", "aoos"] with pytest.raises(TypeError): - ERDDAPCatalog(server=SERVER_URL, search_for="aoos") + ERDDAPCatalogReader(server=SERVER_URL, search_for="aoos") -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_query_search_for( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -396,7 +401,7 @@ def test_catalog_query_search_for( kwargs_search = { "search_for": ["air_pressure", "air_temperature"], } - catalog = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kwargs_search) + catalog = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kwargs_search) search_urls = catalog.get_search_urls() url = search_urls[0] parts = urlparse(url) @@ -409,48 +414,50 @@ def test_catalog_query_search_for( assert query["searchFor"] == "air_temperature" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_search_returns_404(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} mock_read_csv.side_effect = HTTPError( code=404, msg="Blah", url=SERVER_URL, hdrs={}, fp=None ) - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() assert len(cat) == 0 mock_read_csv.side_effect = HTTPError( code=500, msg="Blah", url=SERVER_URL, hdrs={}, fp=None ) with pytest.raises(HTTPError): - ERDDAPCatalog(server=SERVER_URL) + ERDDAPCatalogReader(server=SERVER_URL).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_saving_catalog( mock_read_csv, load_metadata_mock, single_dataset_catalog, temporary_catalog ): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - cat = ERDDAPCatalog(server=SERVER_URL) - cat.save(temporary_catalog) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() + cat.to_yaml_file(temporary_catalog) cat = intake.open_catalog(temporary_catalog) - source = next(cat.values()) - assert isinstance(source, TableDAPSource) - assert source._protocol == "tabledap" - assert source._server == SERVER_URL - assert source._dataset_id == "abc123" + dataset_id = list(cat)[0] + assert dataset_id == "abc123" + reader = cat[dataset_id] + assert isinstance(reader, TableDAPReader) + assert 
cat.__dict__["data"][dataset_id].__dict__['kwargs']["protocol"] == "tabledap" + assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["server"] == SERVER_URL - cat = ERDDAPCatalog(server=SERVER_URL, protocol="griddap") - cat.save(temporary_catalog) + cat = ERDDAPCatalogReader(server=SERVER_URL, protocol="griddap").read() + cat.to_yaml_file(temporary_catalog) cat = intake.open_catalog(temporary_catalog) - source = next(cat.values()) - assert isinstance(source, GridDAPSource) - assert source._protocol == "griddap" - assert source._server == SERVER_URL - assert source._dataset_id == "abc123" + dataset_id = list(cat)[0] + assert dataset_id == "abc123" + reader = cat[dataset_id] + assert isinstance(reader, GridDAPReader) + assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["protocol"] == "griddap" + assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["server"] == SERVER_URL @mock.patch("intake_erddap.utils.get_erddap_metadata") @@ -463,20 +470,20 @@ def test_loading_metadata( "abc123": {"datasetID": "abc123", "institution": "FOMO"} } - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL) assert cat["abc123"].metadata["institution"] == "FOMO" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_trailing_slash(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = ERDDAPCatalog(server="http://blah.invalid/erddap/") + catalog = ERDDAPCatalogReader(server="http://blah.invalid/erddap/") assert catalog.server == "http://blah.invalid/erddap" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_query_type_intersection(mock_read_csv, load_metadata_mock): data = [ @@ -521,7 +528,7 @@ def test_catalog_query_type_intersection(mock_read_csv, load_metadata_mock): # mock 3 calls mock_read_csv.side_effect = [sub_df1, sub_df2, sub_df3] - catalog = ERDDAPCatalog( + catalog = ERDDAPCatalogReader( server=SERVER_URL, standard_names=["air_pressure", "air_temperature"], variable_names=["sigma"], @@ -531,33 +538,33 @@ def test_catalog_query_type_intersection(mock_read_csv, load_metadata_mock): assert len(search_urls) == 3 -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_query_type_invalid(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog with pytest.raises(ValueError): - ERDDAPCatalog(server="http://blah.invalid/erddap/", query_type="blah") + ERDDAPCatalogReader(server="http://blah.invalid/erddap/", query_type="blah").read() @pytest.mark.integration def test_empty_search_results(): - cat = intake.open_erddap_cat( + cat = ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", standard_names=["sea_surface_temperature"], kwargs_search={ - "min_lon": -156.48529052734375, - "max_lon": -148.9251251220703, + "min_lon": -153.48529052734375, + "max_lon": -150.9251251220703, "min_lat": 56.70049285888672, "max_lat": 61.524776458740234, "min_time": "2022-04-30T00:00:00.000000000", 
"max_time": "2022-12-15T23:00:00.000000000", }, - ) + ).read() assert len(cat) == 0 -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} @@ -565,9 +572,9 @@ def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog resp.status_code = 404 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"] - ) + ).read() assert len(cat) == 0 mock_read_csv.assert_called() @@ -575,12 +582,12 @@ def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog resp.status_code = 500 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) with pytest.raises(requests.exceptions.HTTPError): - ERDDAPCatalog( + ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"] - ) + ).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_empty_catalog_with_intersection( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -590,10 +597,10 @@ def test_empty_catalog_with_intersection( resp.status_code = 404 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"], query_type="intersection", - ) + ).read() assert len(cat) == 0 mock_read_csv.assert_called() diff --git a/tests/test_erddap_source.py b/tests/test_erddap_reader.py similarity index 66% rename from tests/test_erddap_source.py rename to tests/test_erddap_reader.py index 962f016..17e2320 100644 --- a/tests/test_erddap_source.py +++ b/tests/test_erddap_reader.py @@ -1,6 +1,6 @@ #!/usr/bin/env pytest # -*- coding: utf-8 -*- -"""Unit tests for the ERDDAP Source object.""" +"""Unit tests for the ERDDAP Reader object.""" import json from pathlib import Path @@ -12,13 +12,13 @@ import pytest import xarray as xr -from intake_erddap.erddap import GridDAPSource, TableDAPSource +from intake_erddap.erddap import GridDAPReader, TableDAPReader def _grid(grid_data) -> xr.Dataset: time = xr.DataArray( - data=np.array(["2022-01-01T00:00:00"], dtype=" xr.Dataset: return _grid(grid_data) -@mock.patch("intake_erddap.erddap.TableDAPSource._get_dataset_metadata") +@mock.patch("intake_erddap.erddap.TableDAPReader._get_dataset_metadata") @mock.patch("erddapy.ERDDAP.to_pandas") -def test_erddap_source_read(mock_to_pandas, mock_get_dataset_metadata): - """Tests that the source will read from ERDDAP into a pd.DataFrame.""" +def test_erddap_reader_read(mock_to_pandas, mock_get_dataset_metadata): + """Tests that the reader will read from ERDDAP into a pd.DataFrame.""" df = pd.DataFrame() df["time (UTC)"] = ["2022-10-21T00:00:00Z", "2022-10-21T00:00:00Z"] df["sea_water_temperature (deg_C)"] = [13.4, 13.4] mock_to_pandas.return_value = df - mock_get_dataset_metadata.return_value = {} + mock_get_dataset_metadata.return_value = {"variables": {}} - source = TableDAPSource( + reader = TableDAPReader( server="http://erddap.invalid/erddap", dataset_id="abc123", protocol="tabledap" ) - df = source.read() + 
df = reader.read() + assert df is not None assert mock_to_pandas.called assert len(df) == 2 - source.close() - assert source._dataframe is None + reader.close() -@mock.patch("intake_erddap.erddap.TableDAPSource._get_dataset_metadata") +@mock.patch("intake_erddap.erddap.TableDAPReader._get_dataset_metadata") @mock.patch("erddapy.ERDDAP.to_pandas") -def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata): - """Tests that the source will read from ERDDAP into a pd.DataFrame with processing flag.""" +def test_erddap_reader_read_processing(mock_to_pandas, mock_get_dataset_metadata): + """Tests that the reader will read from ERDDAP into a pd.DataFrame with processing flag.""" df = pd.DataFrame() df["time"] = [ "2022-10-21T01:00:00Z", @@ -94,16 +94,16 @@ def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata df["sea_water_temperature"] = [13.4, 13.4, np.nan] df["sea_water_temperature_qc_agg"] = [1, 4, 2] mock_to_pandas.return_value = df - mock_get_dataset_metadata.return_value = {} + mock_get_dataset_metadata.return_value = {"variables": {}} - source = TableDAPSource( + reader = TableDAPReader( server="http://erddap.invalid/erddap", dataset_id="abc123", protocol="tabledap", mask_failed_qartod=True, dropna=True, ) - df = source.read() + df = reader.read() assert df is not None assert mock_to_pandas.called # mask_failed_qartod flag removes 2nd data point and dropna removes 3rd data point @@ -111,7 +111,7 @@ def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata @mock.patch("requests.get") -def test_tabledap_source_get_dataset_metadata(mock_get): +def test_tabledap_reader_get_dataset_metadata(mock_get): test_data = Path(__file__).parent / "test_data/tabledap_metadata.json" bad = { "table": { @@ -124,8 +124,10 @@ def test_tabledap_source_get_dataset_metadata(mock_get): resp = mock.MagicMock() resp.json.side_effect = [json.loads(test_data.read_text()), bad] mock_get.return_value = resp - source = TableDAPSource(server="http://erddap.invalid", dataset_id="abc123") - metadata = source._get_dataset_metadata() + server = "http://erddap.invalid" + dataset_id = "abc123" + reader = TableDAPReader(server, dataset_id) + metadata = reader._get_dataset_metadata(server, dataset_id) assert metadata["cdm_data_type"] == "TimeSeries" assert metadata["variables"]["z"]["actual_range"] == [0.0, 0.0] assert metadata["variables"]["depth_to_water_level"]["status_flags"] == [ @@ -136,43 +138,28 @@ def test_tabledap_source_get_dataset_metadata(mock_get): 9, ] - metadata = source._get_dataset_metadata() + metadata = reader._get_dataset_metadata(server, dataset_id) assert len(metadata) == 1 assert len(metadata["variables"]) == 0 @mock.patch("xarray.open_dataset") -def test_griddap_source_no_chunks(mock_open_dataset, fake_grid): +def test_griddap_reader_no_chunks(mock_open_dataset, fake_grid): server = "https://erddap.invalid" dataset_id = "abc123" mock_open_dataset.return_value = fake_grid - source = GridDAPSource(server=server, dataset_id=dataset_id) - ds = source.to_dask() + reader = GridDAPReader(server=server, dataset_id=dataset_id) + ds = reader.read() assert ds is fake_grid assert "_NCProperties" not in ds.attrs - - with pytest.raises(NotImplementedError): - source.read() - - arr = source.read_partition(("temp", None)) - assert isinstance(arr, np.ndarray) - - arr = source.read_partition(["temp", None]) - assert isinstance(arr, np.ndarray) - - with pytest.raises(TypeError): - source.read_partition("temp") - - source.close() - assert source._ds 
is None
-    assert source._schema is None
+    assert "temp" in ds.variables


 @mock.patch("xarray.open_dataset")
-def test_griddap_source_with_dask(mock_open_dataset, fake_dask_grid):
+def test_griddap_reader_with_dask(mock_open_dataset, fake_dask_grid):
     server = "https://erddap.invalid"
     dataset_id = "abc123"
     mock_open_dataset.return_value = fake_dask_grid
-    source = GridDAPSource(server=server, dataset_id=dataset_id)
-    arr = source.read_partition(("temp", 0))
-    assert isinstance(arr, np.ndarray)
+    reader = GridDAPReader(server=server, dataset_id=dataset_id)
+    arr = reader.read()
+    assert isinstance(arr, xr.Dataset)

From 6fdb0e10799c379537bc5ab7e7ba780588ae854a Mon Sep 17 00:00:00 2001
From: Kristen Thyng
Date: Fri, 19 Jul 2024 14:25:54 -0500
Subject: [PATCH 06/17] updated 3.8 GH test to 3.11

---
 .github/workflows/test.yaml                          | 2 +-
 ci/{environment-py3.8.yml => environment-py3.11.yml} | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename ci/{environment-py3.8.yml => environment-py3.11.yml} (94%)

diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 243ea3a..b310ac0 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -14,7 +14,7 @@ jobs:
       fail-fast: false
       matrix:
         os: ["macos-latest", "ubuntu-latest", "windows-latest"]
-        python-version: ["3.8", "3.9", "3.10"]
+        python-version: ["3.9", "3.10", "3.11"]
     steps:
       - name: Checkout source
         uses: actions/checkout@v2
diff --git a/ci/environment-py3.8.yml b/ci/environment-py3.11.yml
similarity index 94%
rename from ci/environment-py3.8.yml
rename to ci/environment-py3.11.yml
index 597f0b0..8f518d6 100644
--- a/ci/environment-py3.8.yml
+++ b/ci/environment-py3.11.yml
@@ -2,7 +2,7 @@ name: test-env
 channels:
   - conda-forge
 dependencies:
-  - python=3.8
+  - python=3.11
   - numpy
   - dask
   - pandas

From 4e94a7d6d7e1b6830aa3d76db8e6f8eb05314ded Mon Sep 17 00:00:00 2001
From: Kristen Thyng
Date: Fri, 19 Jul 2024 15:41:07 -0500
Subject: [PATCH 07/17] attempts to fix errors

Ran pre-commit and removed version.py (I do not think I need it?)
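For anyone relying on intake_erddap.__version__: the same value can still be
recovered from the installed distribution's metadata. The snippet below is just
the deleted module's lookup, inlined (it assumes the package is installed under
the distribution name "intake-erddap"):

    from importlib.metadata import PackageNotFoundError, version

    try:
        __version__ = version("intake-erddap")  # distribution name, not the module name
    except PackageNotFoundError:
        # package is not installed, e.g. running from a plain source checkout
        __version__ = "unknown"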
---
 .pre-commit-config.yaml     | 28 ++++++---------
 docs/user_guide.rst         |  8 ++---
 docs/whats_new.md           |  2 +-
 intake_erddap/__init__.py   |  1 -
 intake_erddap/erddap.py     | 68 ++++++++++++++++++++++---------------
 intake_erddap/erddap_cat.py | 28 +++++++++------
 intake_erddap/version.py    |  8 -----
 setup.py                    |  4 ++-
 tests/test_erddap_cat.py    | 20 +++++++----
 9 files changed, 90 insertions(+), 77 deletions(-)
 delete mode 100644 intake_erddap/version.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 85a311f..4e62c29 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -28,18 +28,12 @@ repos:
     exclude: docs/conf.py
     args: [--max-line-length=105 ]

-- repo: https://github.com/pre-commit/mirrors-isort
-  rev: v5.10.1
+- repo: https://github.com/pycqa/isort
+  rev: 5.12.0
   hooks:
-  - id: isort
-    additional_dependencies: [toml]
-    exclude: ^(docs|setup.py)
-    args: [--project=gcm_filters, --multi-line=3, --lines-after-imports=2, --lines-between-types=1, --trailing-comma, --force-grid-wrap=0, --use-parentheses, --line-width=88]
-
-- repo: https://github.com/asottile/seed-isort-config
-  rev: v2.2.0
-  hooks:
-  - id: seed-isort-config
+  - id: isort
+    name: isort (python)
+    args: ["--profile", "black", "--filter-files", "--lines-after-imports=2", "--project=gcm_filters", "--multi-line=3", "--lines-between-types=1", "--trailing-comma", "--force-grid-wrap=0", "--use-parentheses", "--line-width=88"]

 - repo: https://github.com/psf/black
   rev: 22.10.0
@@ -56,9 +50,9 @@ repos:
     exclude: docs/source/conf.py
     args: [--ignore-missing-imports]

-# - repo: https://github.com/codespell-project/codespell
-#   rev: v1.16.0
-#   hooks:
-#   - id: codespell
-#     args:
-#       - --quiet-level=2
+- repo: https://github.com/codespell-project/codespell
+  rev: v2.1.0
+  hooks:
+  - id: codespell
+    args:
+      - --quiet-level=2
diff --git a/docs/user_guide.rst b/docs/user_guide.rst
index 8b9e954..0000da0 100644
--- a/docs/user_guide.rst
+++ b/docs/user_guide.rst
@@ -80,7 +80,7 @@ containing a plaintext search term::
         search_for=["ioos", "aoos", "NOAA"],
     )

-This can also be useful if you know the name of the station or stations you want
+This can also be useful if you know the name of the station or stations you want
 to make a catalog from ::

     cat = intake_erddap.ERDDAPCatalogReader(
@@ -108,7 +108,7 @@ Constraints
-----------

 Use the input option `use_source_constraints=True` to use any relevant parameter
-from "kwargs_search" constraints in the query. This will pass a `start_time` on
+from "kwargs_search" constraints in the query. This will pass a `start_time` on
 so that it will limit the time returned in the data to the `start_time`, for example::

     cat = intake_erddap.ERDDAPCatalogReader(
@@ -148,7 +148,7 @@ Variables `time`, `latitude`, `longitude`, and `z` are always additionally retur
 Mask due to quality flags
 -------------------------

-If `mask_failed_qartod=True`` and `*_qc_agg` columns associated with the data columns are available, data values associated with QARTOD flags other than 1 and 2 will be nan'ed out. Has not been thoroughly tested. 
+If `mask_failed_qartod=True` and `*_qc_agg` columns associated with the data columns are available, data values associated with QARTOD flags other than 1 and 2 will be NaN'ed out. This has not been thoroughly tested.
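+For example, in sketch form (the server and standard name here are illustrative;
+any dataset exposing `*_qc_agg` columns alongside its data columns behaves the
+same way)::
+
+    cat = intake_erddap.ERDDAPCatalogReader(
+        server="https://erddap.sensors.ioos.us/erddap",
+        standard_names=["sea_water_temperature"],
+        mask_failed_qartod=True,
+    ).read()
+    df = cat[list(cat)[0]].read()
+    # data values whose `*_qc_agg` flag is not 1 or 2 come back as NaN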
Simple caching @@ -161,4 +161,4 @@ You can using simple caching through `fsspec` if you input `cache_kwargs` such a cache_kwargs=dict(cache_storage="/tmp/fnames/", same_names=True), ) -This would have the effect of caching the data locally in the `/tmp/fnames/` directory so it doesn't have to be downloaded next time. The `same_names` option is useful if you want to cache the data with the same name as the data source for clarity. \ No newline at end of file +This would have the effect of caching the data locally in the `/tmp/fnames/` directory so it doesn't have to be downloaded next time. The `same_names` option is useful if you want to cache the data with the same name as the data source for clarity. diff --git a/docs/whats_new.md b/docs/whats_new.md index 4125ea8..2bf203a 100644 --- a/docs/whats_new.md +++ b/docs/whats_new.md @@ -3,4 +3,4 @@ ## v0.5.0 (July 19, 2024) * Major changes across the codebase to update to intake v2! Also updated class names; updated tests; updated docs. * Now can choose variables to narrow results to. -* Fixed some bugs. \ No newline at end of file +* Fixed some bugs. diff --git a/intake_erddap/__init__.py b/intake_erddap/__init__.py index 190f66d..e21402b 100644 --- a/intake_erddap/__init__.py +++ b/intake_erddap/__init__.py @@ -1,5 +1,4 @@ """intake-erddap package.""" -import intake from .erddap import GridDAPReader, TableDAPReader from .erddap_cat import ERDDAPCatalogReader diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py index 68b453c..6516a9e 100644 --- a/intake_erddap/erddap.py +++ b/intake_erddap/erddap.py @@ -1,20 +1,16 @@ """Reader implementations for intake-erddap.""" -import typing from logging import getLogger -from typing import List, Optional, Tuple, Type, Union +from typing import List, Union import cf_pandas # noqa: F401 import fsspec -import numpy as np import pandas as pd import requests import xarray as xr from erddapy import ERDDAP from intake.readers.readers import BaseReader -from intake.readers.entry import ReaderDescription -from intake.readers.datatypes import BaseData log = getLogger("intake-erddap") @@ -54,7 +50,9 @@ class ERDDAPReader(BaseReader): output_instance = "xarray:Dataset" - def get_client(self, server, protocol, dataset_id, variables, constraints, client=ERDDAP, **_) -> ERDDAP: + def get_client( + self, server, protocol, dataset_id, variables, constraints, client=ERDDAP, **_ + ) -> ERDDAP: """Return an initialized ERDDAP Client.""" e = client(server=server) e.protocol = protocol @@ -132,38 +130,59 @@ class TableDAPReader(ERDDAPReader): 'units': 'seconds since 1970-01-01T00:00:00Z'}, ... 
""" + output_instance = "pandas:DataFrame" - def _read(self, server, dataset_id, variables=None, mask_failed_qartod=False, dropna=False, cache_kwargs=None, - open_kwargs=None, constraints=None, **kw): + def _read( + self, + server, + dataset_id, + variables=None, + mask_failed_qartod=False, + dropna=False, + cache_kwargs=None, + open_kwargs=None, + constraints=None, + **kw, + ): open_kwargs = open_kwargs or {} variables = variables or [] kw.pop("protocol", None) protocol = kw.pop("protocol", "tabledap") - + # check for variables in user-input list that are not available for the dataset meta2 = self._get_dataset_metadata(server, dataset_id) variables_diff = set(variables) - set(meta2["variables"].keys()) if len(variables_diff) > 0: variables = [var for var in variables if var not in variables_diff] - e = self.get_client(server, protocol, dataset_id, variables=variables, - constraints=constraints or {}, **kw) + e = self.get_client( + server, + protocol, + dataset_id, + variables=variables, + constraints=constraints or {}, + **kw, + ) if cache_kwargs is not None: if "response" in open_kwargs: response = open_kwargs["response"] open_kwargs.pop("response") url = e.get_download_url(response=response) else: - url = e.get_download_url(response="csvp") # should this be the default or csv? + url = e.get_download_url( + response="csvp" + ) # should this be the default or csv? try: with fsspec.open(f"simplecache://::{url}", **(cache_kwargs or {})) as f: dataframe: pd.DataFrame = pd.read_csv(f, **open_kwargs) except OSError as e: # might get file name too long print(e) - print("If your filenames are too long, input only a few variables" - "to return or input into cache kwargs `same_names=False`") + print( + "If your filenames are too long, input only a few variables" + "to return or input into cache kwargs `same_names=False`" + ) else: dataframe: pd.DataFrame = e.to_pandas( requests_kwargs={"timeout": 60}, **open_kwargs @@ -181,12 +200,8 @@ def data_cols(df): # find data columns which are what we'll use in the final step to drop nan's # don't include dimension/coordinates-type columns (dimcols) nor qc_agg columns (qccols) dimcols = df.cf.axes_cols + df.cf.coordinates_cols - qccols = list( - df.columns[df.columns.str.contains("_qc_agg")] - ) - datacols = [ - col for col in df.columns if col not in dimcols + qccols - ] + qccols = list(df.columns[df.columns.str.contains("_qc_agg")]) + datacols = [col for col in df.columns if col not in dimcols + qccols] return datacols def run_mask_failed_qartod(self, df): @@ -324,22 +339,21 @@ class GridDAPReader(ERDDAPReader): # kwargs.pop("protocol", None) # super().__init__(dataset_id=dataset_id, protocol="griddap", **kwargs) # type: ignore - def _read(self, + def _read( + self, server: str, dataset_id: str, constraints: dict = None, chunks: Union[None, int, dict, str] = None, xarray_kwargs: dict = None, - **kw -): + **kw, + ): constraints = constraints or {} chunks = chunks or {} xarray_kwargs = xarray_kwargs or {} urlpath = f"{server}/griddap/{dataset_id}" - - ds = xr.open_dataset( - urlpath, chunks=chunks, **xarray_kwargs - ) + + ds = xr.open_dataset(urlpath, chunks=chunks, **xarray_kwargs) # _NCProperties is an internal property which xarray does not yet deal # with specially, so we remove it here to prevent it from causing # problems for clients. 
diff --git a/intake_erddap/erddap_cat.py b/intake_erddap/erddap_cat.py index eb4a5f8..af51abc 100644 --- a/intake_erddap/erddap_cat.py +++ b/intake_erddap/erddap_cat.py @@ -20,19 +20,21 @@ import requests from erddapy import ERDDAP + # from intake.catalog.base import Catalog from intake.readers.entry import Catalog, DataDescription from intake.readers.readers import BaseReader -# from intake.catalog.local import LocalCatalogEntry from intake_erddap.cache import CacheStore from . import utils -from .erddap import GridDAPReader, TableDAPReader from .utils import match_key_to_category from .version import __version__ +# from intake.catalog.local import LocalCatalogEntry + + log = getLogger("intake-erddap") @@ -98,7 +100,7 @@ class ERDDAPCatalogReader(BaseReader): chunks : dict, optional For griddap protocol, pass a dictionary of chunk sizes for the xarray. xarray_kwargs : dict, optional - For griddap protocol, pass a dictionary of kwargs to pass to the + For griddap protocol, pass a dictionary of kwargs to pass to the xarray.open_dataset method. metadata : dict, optional Extra metadata for the intake catalog. @@ -191,7 +193,7 @@ def __init__( if variables is not None: variables = ["time", "latitude", "longitude", "z"] + variables self.variables = variables - + chunks = chunks or {} xarray_kwargs = xarray_kwargs or {} @@ -444,10 +446,13 @@ def read(self): all_metadata = self._load_metadata() self._entries = {} - + # Remove datasets that are redundant if len(df) > 0: - df = df[(~df["datasetID"].str.startswith("ism-")) * (df["datasetID"] != "allDatasets")] + df = df[ + (~df["datasetID"].str.startswith("ism-")) + * (df["datasetID"] != "allDatasets") + ] entries, aliases = {}, {} for index, row in df.iterrows(): @@ -485,9 +490,7 @@ def read(self): else: raise ValueError(f"Unsupported protocol: {self._protocol}") - metadata["info_url"] = e.get_info_url( - response="csv", dataset_id=dataset_id - ) + metadata["info_url"] = e.get_info_url(response="csv", dataset_id=dataset_id) entries[dataset_id] = DataDescription( datatype, kwargs={"dataset_id": dataset_id, **args}, @@ -495,9 +498,12 @@ def read(self): ) aliases[dataset_id] = dataset_id - cat = Catalog(data=entries, aliases=aliases,) + cat = Catalog( + data=entries, + aliases=aliases, + ) return cat - + def _get_tabledap_constraints(self) -> Dict[str, Union[str, int, float]]: """Return the constraints dictionary for a tabledap Reader.""" result = {} diff --git a/intake_erddap/version.py b/intake_erddap/version.py deleted file mode 100644 index 03f496d..0000000 --- a/intake_erddap/version.py +++ /dev/null @@ -1,8 +0,0 @@ -"""Project version module.""" -from importlib.metadata import version, PackageNotFoundError - -try: - __version__ = version("intake-erddap") -except PackageNotFoundError: - # package is not installed - __version__ = "unknown" \ No newline at end of file diff --git a/setup.py b/setup.py index 5356d28..085519b 100644 --- a/setup.py +++ b/setup.py @@ -1,8 +1,10 @@ #!/usr/bin/env python from pathlib import Path + from setuptools import setup + requires = open("requirements.txt").read().strip().split("\n") setup( @@ -32,4 +34,4 @@ long_description=Path("README.md").read_text(), long_description_content_type='text/markdown', zip_safe=False, -) \ No newline at end of file +) diff --git a/tests/test_erddap_cat.py b/tests/test_erddap_cat.py index 5e7adc7..bc554c7 100644 --- a/tests/test_erddap_cat.py +++ b/tests/test_erddap_cat.py @@ -270,7 +270,9 @@ def test_catalog_with_griddap( "min_time": "2022-01-01", "max_time": "2022-11-07", } - cat = 
ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search, protocol="griddap").read() + cat = ERDDAPCatalogReader( + server=SERVER_URL, kwargs_search=search, protocol="griddap" + ).read() dataset_id = list(cat)[0] reader = cat[dataset_id] assert isinstance(reader, GridDAPReader) @@ -288,7 +290,9 @@ def test_catalog_with_unsupported_protocol( } mock_read_csv.return_value = single_dataset_catalog with pytest.raises(ValueError): - ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search, protocol="fakedap").read() + ERDDAPCatalogReader( + server=SERVER_URL, kwargs_search=search, protocol="fakedap" + ).read() @mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @@ -445,8 +449,8 @@ def test_saving_catalog( assert dataset_id == "abc123" reader = cat[dataset_id] assert isinstance(reader, TableDAPReader) - assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["protocol"] == "tabledap" - assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["server"] == SERVER_URL + assert cat.__dict__["data"][dataset_id].__dict__["kwargs"]["protocol"] == "tabledap" + assert cat.__dict__["data"][dataset_id].__dict__["kwargs"]["server"] == SERVER_URL cat = ERDDAPCatalogReader(server=SERVER_URL, protocol="griddap").read() cat.to_yaml_file(temporary_catalog) @@ -456,8 +460,8 @@ def test_saving_catalog( assert dataset_id == "abc123" reader = cat[dataset_id] assert isinstance(reader, GridDAPReader) - assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["protocol"] == "griddap" - assert cat.__dict__["data"][dataset_id].__dict__['kwargs']["server"] == SERVER_URL + assert cat.__dict__["data"][dataset_id].__dict__["kwargs"]["protocol"] == "griddap" + assert cat.__dict__["data"][dataset_id].__dict__["kwargs"]["server"] == SERVER_URL @mock.patch("intake_erddap.utils.get_erddap_metadata") @@ -544,7 +548,9 @@ def test_query_type_invalid(mock_read_csv, load_metadata_mock, single_dataset_ca load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog with pytest.raises(ValueError): - ERDDAPCatalogReader(server="http://blah.invalid/erddap/", query_type="blah").read() + ERDDAPCatalogReader( + server="http://blah.invalid/erddap/", query_type="blah" + ).read() @pytest.mark.integration From 59fb6ed279a2cee82fa785c962efa3a5403c2999 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 15:53:36 -0500 Subject: [PATCH 08/17] removing versioneer stuff, hope this works --- MANIFEST.in | 2 -- setup.cfg | 6 ------ 2 files changed, 8 deletions(-) diff --git a/MANIFEST.in b/MANIFEST.in index b5ac0eb..0d2fa27 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -2,8 +2,6 @@ recursive-include . *.html recursive-include . *.csv recursive-include . *.yml recursive-include . 
*.ini -include versioneer.py -include intake_erddap/_version.py include LICENSE include README.rst include requirements.txt diff --git a/setup.cfg b/setup.cfg index 59d4dbb..e69de29 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +0,0 @@ -[versioneer] -VCS = git -style = pep440 -versionfile_source = intake_erddap/_version.py -versionfile_build = intake_erddap/_version.py -tag_prefix = From af8d8d19407c545cb050285ec4ed0ce550bc44ce Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 16:00:19 -0500 Subject: [PATCH 09/17] add intake back in --- intake_erddap/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/intake_erddap/__init__.py b/intake_erddap/__init__.py index e21402b..190f66d 100644 --- a/intake_erddap/__init__.py +++ b/intake_erddap/__init__.py @@ -1,4 +1,5 @@ """intake-erddap package.""" +import intake from .erddap import GridDAPReader, TableDAPReader from .erddap_cat import ERDDAPCatalogReader From 026af0e1495a52abe1233d0acd96da39b1073ac5 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 16:06:30 -0500 Subject: [PATCH 10/17] try again --- environment.yml | 6 +++--- intake_erddap/__init__.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/environment.yml b/environment.yml index 63441df..d9e0d7d 100644 --- a/environment.yml +++ b/environment.yml @@ -8,10 +8,10 @@ dependencies: - dask - pandas - erddapy - - panel + # - panel - appdirs - - intake - - intake-xarray>=0.6.1 + # - intake - pip - pip: + - git+https://github.com/intake/intake - cf-pandas diff --git a/intake_erddap/__init__.py b/intake_erddap/__init__.py index 190f66d..9cc350d 100644 --- a/intake_erddap/__init__.py +++ b/intake_erddap/__init__.py @@ -1,5 +1,5 @@ """intake-erddap package.""" -import intake +import intake # noqa: F401 from .erddap import GridDAPReader, TableDAPReader from .erddap_cat import ERDDAPCatalogReader From de5125f2d3f6c71fe3e684563b49e4700a1f3be4 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 16:19:05 -0500 Subject: [PATCH 11/17] try again --- MANIFEST.in | 7 ------- pyproject.toml | 2 -- setup.py | 4 ++-- 3 files changed, 2 insertions(+), 11 deletions(-) delete mode 100644 MANIFEST.in diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 0d2fa27..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,7 +0,0 @@ -recursive-include . *.html -recursive-include . *.csv -recursive-include . *.yml -recursive-include . 
*.ini -include LICENSE -include README.rst -include requirements.txt diff --git a/pyproject.toml b/pyproject.toml index 3a57e48..84ae917 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,5 @@ testpaths = [ [tool.coverage.run] omit = [ "setup.py", - "intake_erddap/_version.py", - "intake_erddap/version.py", "tests/*", ] diff --git a/setup.py b/setup.py index 085519b..f7005e4 100644 --- a/setup.py +++ b/setup.py @@ -21,7 +21,7 @@ maintainer_email="dev@axds.co", license="BSD", packages=["intake_erddap"], - package_data={"": ["*.csv", "*.yml", "*.html"]}, + # package_data={"": ["*.csv", "*.yml", "*.html"]}, entry_points={ "intake.imports": [ "tabledap = intake_erddap.erddap:TableDAPReader", @@ -29,7 +29,7 @@ "erddap_cat = intake_erddap.erddap_cat:ERDDAPCatalogReader", ], }, - include_package_data=True, + # include_package_data=True, install_requires=requires, long_description=Path("README.md").read_text(), long_description_content_type='text/markdown', From 94838fb390e2ff1fe4e413563e2d15e85820743c Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 16:25:35 -0500 Subject: [PATCH 12/17] small guess --- intake_erddap/erddap.py | 1 + 1 file changed, 1 insertion(+) diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py index 6516a9e..7193c12 100644 --- a/intake_erddap/erddap.py +++ b/intake_erddap/erddap.py @@ -10,6 +10,7 @@ import xarray as xr from erddapy import ERDDAP +import intake from intake.readers.readers import BaseReader From 84f34778404d0f1066acc49c2bbbf5c47a0899ae Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 16:29:42 -0500 Subject: [PATCH 13/17] forgot to change ci envs --- ci/environment-py3.10.yml | 2 +- ci/environment-py3.11.yml | 4 ++-- ci/environment-py3.9.yml | 4 ++-- intake_erddap/erddap.py | 1 - 4 files changed, 5 insertions(+), 6 deletions(-) diff --git a/ci/environment-py3.10.yml b/ci/environment-py3.10.yml index 0ae7659..2224247 100644 --- a/ci/environment-py3.10.yml +++ b/ci/environment-py3.10.yml @@ -9,7 +9,6 @@ dependencies: - erddapy - panel - intake - - intake-xarray>=0.6.1 - pytest - pytest-cov - isort @@ -21,4 +20,5 @@ dependencies: - coverage[toml] - pip - pip: + - git+https://github.com/intake/intake - cf-pandas diff --git a/ci/environment-py3.11.yml b/ci/environment-py3.11.yml index 8f518d6..653dc89 100644 --- a/ci/environment-py3.11.yml +++ b/ci/environment-py3.11.yml @@ -8,8 +8,7 @@ dependencies: - pandas - erddapy - panel - - intake - - intake-xarray>=0.6.1 + # - intake - pytest - pytest-cov - isort @@ -21,4 +20,5 @@ dependencies: - coverage[toml] - pip - pip: + - git+https://github.com/intake/intake - cf-pandas diff --git a/ci/environment-py3.9.yml b/ci/environment-py3.9.yml index 299e4b1..c8d465b 100644 --- a/ci/environment-py3.9.yml +++ b/ci/environment-py3.9.yml @@ -8,8 +8,7 @@ dependencies: - pandas - erddapy - panel - - intake - - intake-xarray>=0.6.1 + # - intake - pytest - pytest-cov - isort @@ -21,4 +20,5 @@ dependencies: - coverage[toml] - pip - pip: + - git+https://github.com/intake/intake - cf-pandas diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py index 7193c12..6516a9e 100644 --- a/intake_erddap/erddap.py +++ b/intake_erddap/erddap.py @@ -10,7 +10,6 @@ import xarray as xr from erddapy import ERDDAP -import intake from intake.readers.readers import BaseReader From 9a8ebe5fc6bd66e46abe5e286e5186cd648296f7 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 16:33:13 -0500 Subject: [PATCH 14/17] forgot to change ci envs --- ci/environment-py3.10.yml | 1 + 
ci/environment-py3.11.yml | 1 + ci/environment-py3.9.yml | 1 + 3 files changed, 3 insertions(+) diff --git a/ci/environment-py3.10.yml b/ci/environment-py3.10.yml index 2224247..61933de 100644 --- a/ci/environment-py3.10.yml +++ b/ci/environment-py3.10.yml @@ -18,6 +18,7 @@ dependencies: - mypy - codecov - coverage[toml] + - xarray - pip - pip: - git+https://github.com/intake/intake diff --git a/ci/environment-py3.11.yml b/ci/environment-py3.11.yml index 653dc89..4862c60 100644 --- a/ci/environment-py3.11.yml +++ b/ci/environment-py3.11.yml @@ -18,6 +18,7 @@ dependencies: - mypy - codecov - coverage[toml] + - xarray - pip - pip: - git+https://github.com/intake/intake diff --git a/ci/environment-py3.9.yml b/ci/environment-py3.9.yml index c8d465b..4c3ef75 100644 --- a/ci/environment-py3.9.yml +++ b/ci/environment-py3.9.yml @@ -18,6 +18,7 @@ dependencies: - mypy - codecov - coverage[toml] + - xarray - pip - pip: - git+https://github.com/intake/intake From 003dbeb22118f2addae3bc193bf91096542eb637 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 16:36:48 -0500 Subject: [PATCH 15/17] missed some packages --- ci/environment-py3.10.yml | 2 ++ ci/environment-py3.11.yml | 2 ++ ci/environment-py3.9.yml | 2 ++ 3 files changed, 6 insertions(+) diff --git a/ci/environment-py3.10.yml b/ci/environment-py3.10.yml index 61933de..5a4e60b 100644 --- a/ci/environment-py3.10.yml +++ b/ci/environment-py3.10.yml @@ -3,6 +3,8 @@ channels: - conda-forge dependencies: - python=3.10 + - appdirs + - fsspec - numpy - dask - pandas diff --git a/ci/environment-py3.11.yml b/ci/environment-py3.11.yml index 4862c60..2dcba04 100644 --- a/ci/environment-py3.11.yml +++ b/ci/environment-py3.11.yml @@ -3,6 +3,8 @@ channels: - conda-forge dependencies: - python=3.11 + - appdirs + - fsspec - numpy - dask - pandas diff --git a/ci/environment-py3.9.yml b/ci/environment-py3.9.yml index 4c3ef75..6030eb0 100644 --- a/ci/environment-py3.9.yml +++ b/ci/environment-py3.9.yml @@ -3,10 +3,12 @@ channels: - conda-forge dependencies: - python=3.9 + - appdirs - numpy - dask - pandas - erddapy + - fsspec - panel # - intake - pytest From e34870b15bca3dfd5e078f93a88c629b985c9d10 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 16:53:14 -0500 Subject: [PATCH 16/17] removed more version stuff --- intake_erddap/__init__.py | 2 -- intake_erddap/erddap_cat.py | 7 ------- intake_erddap/utils.py | 10 ---------- tests/test_utils.py | 5 ----- 4 files changed, 24 deletions(-) diff --git a/intake_erddap/__init__.py b/intake_erddap/__init__.py index 9cc350d..45566da 100644 --- a/intake_erddap/__init__.py +++ b/intake_erddap/__init__.py @@ -3,12 +3,10 @@ from .erddap import GridDAPReader, TableDAPReader from .erddap_cat import ERDDAPCatalogReader -from .version import __version__ __all__ = [ "ERDDAPCatalogReader", "TableDAPReader", "GridDAPReader", - "__version__", ] diff --git a/intake_erddap/erddap_cat.py b/intake_erddap/erddap_cat.py index af51abc..0648918 100644 --- a/intake_erddap/erddap_cat.py +++ b/intake_erddap/erddap_cat.py @@ -20,8 +20,6 @@ import requests from erddapy import ERDDAP - -# from intake.catalog.base import Catalog from intake.readers.entry import Catalog, DataDescription from intake.readers.readers import BaseReader @@ -29,10 +27,6 @@ from . 
import utils from .utils import match_key_to_category -from .version import __version__ - - -# from intake.catalog.local import LocalCatalogEntry log = getLogger("intake-erddap") @@ -142,7 +136,6 @@ class ERDDAPCatalogReader(BaseReader): """ name = "erddap_cat" - version = __version__ output_instance = "intake.readers.entry:Catalog" def __init__( diff --git a/intake_erddap/utils.py b/intake_erddap/utils.py index 615dd84..8bf8609 100644 --- a/intake_erddap/utils.py +++ b/intake_erddap/utils.py @@ -18,16 +18,6 @@ log = getLogger("intake-erddap") -def get_project_version() -> str: - """Return the project version. - - This function resolves circular import problems with version. - """ - from intake_erddap import __version__ - - return __version__ - - def return_category_options( server: str, category: str = "standard_name", diff --git a/tests/test_utils.py b/tests/test_utils.py index 8669cfe..e445632 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -17,11 +17,6 @@ class Something: pass -def test_get_project_version(): - version = utils.get_project_version() - assert version is not None - - @mock.patch("pandas.read_csv") def test_category_and_key(mock_read_csv): df_mock = pd.DataFrame() From 7ba8a34179bb2dd61d5d24ff9a7213e074e8ed70 Mon Sep 17 00:00:00 2001 From: Kristen Thyng Date: Fri, 19 Jul 2024 17:18:34 -0500 Subject: [PATCH 17/17] fixed api docs --- docs/api.rst | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index ca57497..07777f7 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -2,27 +2,12 @@ ``intake-erddap`` Python API ============================= -.. toctree:: - :maxdepth: 2 - :caption: Documentation +.. currentmodule:: intake_erddap +.. autosummary:: + :toctree: generated/ + :recursive: -``intake-erddap`` catalog -------------------------- - - -.. autoclass:: intake_erddap.erddap_cat.ERDDAPCatalog - :members: get_client, get_search_urls - -``intake-erddap`` source ------------------------- - - -.. autoclass:: intake_erddap.erddap.ERDDAPReader - :members: get_client - -.. autoclass:: intake_erddap.erddap.TableDAPReader - :members: read, read_partition, read_chunked - -.. autoclass:: intake_erddap.erddap.GridDAPReader - :members: read_partition, read_chunked, to_dask, close + ERDDAPCatalogReader + TableDAPReader + GridDAPReader
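A closing sketch of the end-to-end flow after this series, mirroring the IOOS
Sensors integration test in tests/test_erddap_cat.py; the server and search
values are copied from that test, and which dataset IDs come back depends on the
live server:

    import intake_erddap

    cat = intake_erddap.ERDDAPCatalogReader(
        server="https://erddap.sensors.ioos.us/erddap",
        kwargs_search={
            "min_lon": -73.32,
            "max_lon": -69.17,
            "min_lat": 39.92,
            "max_lat": 42.27,
            "min_time": "2021-4-1",
            "max_time": "2021-4-2",
        },
    ).read()  # ERDDAPCatalogReader.read() now returns an intake Catalog

    dataset_id = list(cat)[0]    # catalog keys are ERDDAP dataset IDs
    df = cat[dataset_id].read()  # default tabledap protocol yields a pandas DataFrame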