diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 243ea3a..b310ac0 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: os: ["macos-latest", "ubuntu-latest", "windows-latest"] - python-version: ["3.8", "3.9", "3.10"] + python-version: ["3.9", "3.10", "3.11"] steps: - name: Checkout source uses: actions/checkout@v2 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 85a311f..4e62c29 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -28,18 +28,12 @@ repos: exclude: docs/conf.py args: [--max-line-length=105 ] -- repo: https://github.com/pre-commit/mirrors-isort - rev: v5.10.1 +- repo: https://github.com/pycqa/isort + rev: 5.12.0 hooks: - - id: isort - additional_dependencies: [toml] - exclude: ^(docs|setup.py) - args: [--project=gcm_filters, --multi-line=3, --lines-after-imports=2, --lines-between-types=1, --trailing-comma, --force-grid-wrap=0, --use-parentheses, --line-width=88] - -- repo: https://github.com/asottile/seed-isort-config - rev: v2.2.0 - hooks: - - id: seed-isort-config + - id: isort + name: isort (python) + args: ["--profile", "black", "--filter-files", "--lines-after-imports=2", "--project=gcm_filters", "--multi-line=3", "--lines-between-types=1", "--trailing-comma", "--force-grid-wrap=0", "--use-parentheses", "--line-width=88"] - repo: https://github.com/psf/black rev: 22.10.0 @@ -56,9 +50,9 @@ repos: exclude: docs/source/conf.py args: [--ignore-missing-imports] -# - repo: https://github.com/codespell-project/codespell -# rev: v1.16.0 -# hooks: -# - id: codespell -# args: -# - --quiet-level=2 +- repo: https://github.com/codespell-project/codespell + rev: v2.1.0 + hooks: + - id: codespell + args: + - --quiet-level=2 diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 24ec291..8bad78f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -12,10 +12,10 @@ build: # uncomment to build from this exact version of package # the downside is the version listed in the docs will be a dev version # if uncommenting this, comment out installing pypi version of package in docs/env file -# python: -# install: -# - method: pip -# path: ./ +python: + install: + - method: pip + path: ./ conda: environment: docs/environment.yml diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index b5ac0eb..0000000 --- a/MANIFEST.in +++ /dev/null @@ -1,9 +0,0 @@ -recursive-include . *.html -recursive-include . *.csv -recursive-include . *.yml -recursive-include . *.ini -include versioneer.py -include intake_erddap/_version.py -include LICENSE -include README.rst -include requirements.txt diff --git a/README.md b/README.md index 76e647f..65ed445 100644 --- a/README.md +++ b/README.md @@ -24,15 +24,13 @@ For changes prior to 2022-10-19, all contributions are Copyright James Munroe, s -Intake is a lightweight set of tools for loading and sharing data in data -science projects. Intake ERDDAP provides a set of integrations for ERDDAP. +Intake is a lightweight set of tools for loading and sharing data in data science projects. Intake ERDDAP provides a set of integrations for ERDDAP. -- Quickly identify all datasets from an ERDDAP service in a geographic region, - or containing certain variables. +- Quickly identify all datasets from an ERDDAP service in a geographic region, or containing certain variables. - Produce a pandas DataFrame for a given dataset or query. - Get an xarray Dataset for the Gridded datasets. 
-The Key features are: +The key features are: - Pandas DataFrames for any TableDAP dataset. - xarray Datasets for any GridDAP datasets. @@ -59,7 +57,7 @@ project is available on PyPI, so it can be installed using `pip` The following are prerequisites for a developer environment for this project: - [conda](https://docs.conda.io/en/latest/miniconda.html) -- (optional but highly recommended) [mamba](https://mamba.readthedocs.io/en/latest/) Hint: `conda install -c conda-forge mamba` +- (optional but highly recommended) [mamba](https://mamba.readthedocs.io/en/latest/). Hint: `conda install -c conda-forge mamba` Note: if `mamba` isn't installed, replace all instances of `mamba` in the following instructions with `conda`. @@ -83,126 +81,74 @@ Note: if `mamba` isn't installed, replace all instances of `mamba` in the follow pip install -e . ``` +Note that you need to install with `pip install .` once to get the `entry_points` correct too. ## Examples -To create an intake catalog for all of the ERDDAP's TableDAP offerings use: +To create an `intake` catalog for all of the ERDDAP's TableDAP offerings use: ```python -import intake -catalog = intake.open_erddap_cat( +import intake_erddap +catalog = intake_erddap.ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap" -) +).read() ``` -The catalog objects behave like a dictionary with the keys representing the -dataset's unique identifier within ERDDAP, and the values being the -`TableDAPSource` objects. To access a source object: +The catalog objects behave like a dictionary with the keys representing the dataset's unique identifier within ERDDAP, and the values being the `TableDAPReader` objects. To access a Reader object (for a single dataset, in this case for dataset_id "aoos_204"): ```python -source = catalog["datasetid"] +dataset = catalog["aoos_204"] ``` -From the source object, a pandas DataFrame can be retrieved: +From the reader object, a pandas DataFrame can be retrieved: ```python -df = source.read() +df = dataset.read() +``` + +Find other dataset_ids available with + +```python +list(catalog) ``` Consider a case where you need to find all wind data near Florida: ```python -import intake +import intake_erddap from datetime import datetime bbox = (-87.84, 24.05, -77.11, 31.27) -catalog = intake.open_erddap_cat( +catalog = intake_erddap.ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", bbox=bbox, + intersection="union", start_time=datetime(2022, 1, 1), end_time=datetime(2023, 1, 1), standard_names=["wind_speed", "wind_from_direction"], -) + variables=["wind_speed", "wind_from_direction"], +).read() -df = next(catalog.values()).read() +dataset_id = list(catalog)[0] +print(dataset_id) +df = catalog[dataset_id].read() ``` +Using the `standard_names` input with `intersection="union"` searches for datasets that have both "wind_speed" and "wind_from_direction". Using the `variables` input subsequently narrows the dataset to only those columns, plus "time", "latitude", "longitude", and "z". - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-                  time (UTC)  wind_speed (m.s-1)  wind_from_direction (degrees)
-0       2022-12-14T19:40:00Z                 7.0                          140.0
-1       2022-12-14T19:20:00Z                 7.0                          120.0
-2       2022-12-14T19:10:00Z                 NaN                            NaN
-3       2022-12-14T19:00:00Z                 9.0                          130.0
-4       2022-12-14T18:50:00Z                 9.0                          130.0
-...                      ...                 ...                            ...
-48296   2022-01-01T00:40:00Z                 4.0                          120.0
-48297   2022-01-01T00:30:00Z                 3.0                          130.0
-48298   2022-01-01T00:20:00Z                 4.0                          120.0
-48299   2022-01-01T00:10:00Z                 4.0                          130.0
-48300   2022-01-01T00:00:00Z                 4.0                          130.0
+```python + time (UTC) latitude (degrees_north) ... wind_speed (m.s-1) wind_from_direction (degrees) +0 2022-01-01T00:00:00Z 28.508 ... 3.6 126.0 +1 2022-01-01T00:10:00Z 28.508 ... 3.8 126.0 +2 2022-01-01T00:20:00Z 28.508 ... 3.6 124.0 +3 2022-01-01T00:30:00Z 28.508 ... 3.4 125.0 +4 2022-01-01T00:40:00Z 28.508 ... 3.5 124.0 +... ... ... ... ... ... +52524 2022-12-31T23:20:00Z 28.508 ... 5.9 176.0 +52525 2022-12-31T23:30:00Z 28.508 ... 6.8 177.0 +52526 2022-12-31T23:40:00Z 28.508 ... 7.2 175.0 +52527 2022-12-31T23:50:00Z 28.508 ... 7.4 169.0 +52528 2023-01-01T00:00:00Z 28.508 ... 8.1 171.0 + +[52529 rows x 6 columns] +``` diff --git a/ci/environment-py3.10.yml b/ci/environment-py3.10.yml index 0ae7659..5a4e60b 100644 --- a/ci/environment-py3.10.yml +++ b/ci/environment-py3.10.yml @@ -3,13 +3,14 @@ channels: - conda-forge dependencies: - python=3.10 + - appdirs + - fsspec - numpy - dask - pandas - erddapy - panel - intake - - intake-xarray>=0.6.1 - pytest - pytest-cov - isort @@ -19,6 +20,8 @@ dependencies: - mypy - codecov - coverage[toml] + - xarray - pip - pip: + - git+https://github.com/intake/intake - cf-pandas diff --git a/ci/environment-py3.8.yml b/ci/environment-py3.11.yml similarity index 71% rename from ci/environment-py3.8.yml rename to ci/environment-py3.11.yml index 597f0b0..2dcba04 100644 --- a/ci/environment-py3.8.yml +++ b/ci/environment-py3.11.yml @@ -2,14 +2,15 @@ name: test-env channels: - conda-forge dependencies: - - python=3.8 + - python=3.11 + - appdirs + - fsspec - numpy - dask - pandas - erddapy - panel - - intake - - intake-xarray>=0.6.1 + # - intake - pytest - pytest-cov - isort @@ -19,6 +20,8 @@ dependencies: - mypy - codecov - coverage[toml] + - xarray - pip - pip: + - git+https://github.com/intake/intake - cf-pandas diff --git a/ci/environment-py3.9.yml b/ci/environment-py3.9.yml index 299e4b1..6030eb0 100644 --- a/ci/environment-py3.9.yml +++ b/ci/environment-py3.9.yml @@ -3,13 +3,14 @@ channels: - conda-forge dependencies: - python=3.9 + - appdirs - numpy - dask - pandas - erddapy + - fsspec - panel - - intake - - intake-xarray>=0.6.1 + # - intake - pytest - pytest-cov - isort @@ -19,6 +20,8 @@ dependencies: - mypy - codecov - coverage[toml] + - xarray - pip - pip: + - git+https://github.com/intake/intake - cf-pandas diff --git a/docs/api.rst b/docs/api.rst index c831cda..07777f7 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -2,27 +2,12 @@ ``intake-erddap`` Python API ============================= -.. toctree:: - :maxdepth: 2 - :caption: Documentation +.. currentmodule:: intake_erddap +.. autosummary:: + :toctree: generated/ + :recursive: -``intake-erddap`` catalog -------------------------- - - -.. autoclass:: intake_erddap.erddap_cat.ERDDAPCatalog - :members: get_client, get_search_urls - -``intake-erddap`` source ------------------------- - - -.. autoclass:: intake_erddap.erddap.ERDDAPSource - :members: get_client - -.. autoclass:: intake_erddap.erddap.TableDAPSource - :members: read, read_partition, read_chunked - -.. 
autoclass:: intake_erddap.erddap.GridDAPSource - :members: read_partition, read_chunked, to_dask, close + ERDDAPCatalogReader + TableDAPReader + GridDAPReader diff --git a/docs/conf.py b/docs/conf.py index 3da3cfa..6e2e288 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -26,17 +26,17 @@ # -- Project information ----------------------------------------------------- project = "intake-erddap" -copyright = "Copyright 2022 Axiom Data Science, LLC" +copyright = "Copyright 2022-2024 Axiom Data Science, LLC" author = "Axiom Data Science, LLC" # The version info for the project you're documenting, acts as replacement for # |version| and |release|, also used in various other places throughout the # built documents. # see https://pypi.org/project/setuptools-scm/ for details -from pkg_resources import get_distribution +from importlib.metadata import version as imversion -release = get_distribution("intake_erddap").version +release = imversion("intake_erddap") # for example take major/minor version = ".".join(release.split(".")[:2]) @@ -71,6 +71,11 @@ nb_execution_timeout = 120 + +# https://myst-nb.readthedocs.io/en/v0.9.0/use/execute.html +# jupyter_execute_notebooks = "off" +nb_execution_mode = "force" + # Add any paths that contain templates here, relative to this directory. templates_path = ["_templates"] @@ -85,10 +90,10 @@ # The theme to use for HTML and HTML Help pages. See the documentation for # a list of builtin themes. # -#html_theme = "furo" +html_theme = "furo" # furo variables -html_title = "intake-axds documentation" +html_title = "intake-erddap documentation" # Add any paths that contain custom static files (such as style sheets) here, # relative to this directory. They are copied after the builtin static files, diff --git a/docs/environment.yml b/docs/environment.yml index d01959c..971dfba 100644 --- a/docs/environment.yml +++ b/docs/environment.yml @@ -1,16 +1,16 @@ -name: docs +name: intake-erddap-docs channels: - conda-forge - nodefaults dependencies: - - python=3.9 + - python=3.11 # If your docs code examples depend on other packages add them here - numpy - dask - pandas - erddapy - panel - - intake + # - intake - intake-xarray>=0.6.1 - cf_pandas # These are needed for the docs themselves @@ -29,10 +29,11 @@ dependencies: - pip - recommonmark - pip: + - furo - git+https://github.com/intake/intake - - intake-parquet - - intake-xarray - - intake-erddap + # - intake-parquet + # - intake-xarray + # - intake-erddap # - "dask[complete]" - docrep<=0.2.7 - furo diff --git a/docs/examples.rst b/docs/examples.rst deleted file mode 100644 index 09cc0af..0000000 --- a/docs/examples.rst +++ /dev/null @@ -1,96 +0,0 @@ -Examples -======== - -.. toctree:: - :maxdepth: 2 - - examples/wave-height.md - -Querying --------- - -A catlaog can be generated by passing your desired query parameters directly -with the ``kwargs_search`` keyword argument. This object gets passed to -`erddappy `_ :: - - search = { - "min_lon": -180, - "max_lon": -156, - "min_lat": 50, - "max_lat": 66, - "min_time": "2021-04-01", - "max_time": "2021-04-02", - } - cat = intake.open_erddap_catalog(server_url, kwargs_search=search) - - -The same query can also be specified using the constructor keyword arguments:: - - cat = intake.open_erddap_catalog( - server=server_url, - bbox=(-180., 50., -156., 66.), - start_time=datetime(2021, 4, 1), - end_time=datetime(2021, 4, 2), - ) - -The catalog supports querying for datasets that contain a variable with a -particular -`CF Standard Name `_ -. 
Clients can specify the standard name queries with either the -``kwargs_search`` keyword argument, or the ``standard_names`` keyword argument:: - - cat = intake.open_erddap_catalog( - server=server_url, - kwargs_search={ - "standard_name": "air_temperature", - }, - ) - -or:: - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature"], - ) - -Multiple standard name values can be queries which will return all datasets -containing at least one of the queried standard names:: - - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature", "air_pressure"], - ) - -In cases where standard names are not sufficient, clients can query using the -variable name as it appears in ERDDAP:: - - cat = intake.open_erddap_catalog( - server=server_url, - variable_names=["Pair", "temp"], - ) - -Lastly, ERDDAP offers a plaintext search option. Clients can query for datasets -containing a plaintext search term:: - - cat = intake.open_erddap_catalog( - server=server_url, - search_for=["ioos", "aoos", "NOAA"], - ) - - -Querying with AND ------------------ - -Sometimes, clients may want to find only datasets that match all of the query -terms exactly. This can be achieved with the ``query_type`` keyword argument:: - - - cat = intake.open_erddap_catalog( - server=server_url, - standard_names=["air_temperature", "air_pressure"], - query_type="intersection", - ) - -This will return only datasets that have both ``air_temperature`` and -``air_pressure`` as standard names associated with variables. diff --git a/docs/examples/wave-height.md b/docs/examples/wave-height.md index 6aac638..fc566b2 100644 --- a/docs/examples/wave-height.md +++ b/docs/examples/wave-height.md @@ -4,7 +4,7 @@ jupytext: extension: .md format_name: myst format_version: 0.13 - jupytext_version: 1.14.0 + jupytext_version: 1.16.3 kernelspec: display_name: Python language: python @@ -15,11 +15,10 @@ Example: Investigating Significant Wave Height - Southern California ==================================================================== ```{code-cell} ipython3 ---- -tags: [hide-cell] ---- +:tags: [hide-cell] + import intake_erddap -import intake +# import intake import numpy as np import cartopy.crs as ccrs @@ -37,24 +36,22 @@ def figure(*args, figsize=(18, 8), facecolor='white', **kwargs): Here's an example of finding _all_ stations that have significant wave height from the main IOOS ERDDAP server. - ```{code-cell} ipython3 server = 'https://erddap.sensors.ioos.us/erddap' -cat = intake.open_erddap_cat( +cat = intake_erddap.ERDDAPCatalogReader( server=server, standard_names=["sea_surface_wind_wave_significant_height"] -) +).read() ``` ```{code-cell} ipython3 -df = pd.DataFrame([i.metadata for i in cat.values()]) +df = pd.DataFrame([cat[i].metadata for i in list(cat)]) sub_df = df[['datasetID', 'minTime', 'maxTime', 'title']][:5] sub_df.style.set_table_attributes('class="dataframe docutils"').hide(axis="index") ``` We can plot the locations of these stations on the globe. 
- ```{code-cell} ipython3 fig, ax = figure(subplot_kw=dict(projection=ccrs.PlateCarree())) ax.coastlines() @@ -77,21 +74,24 @@ ax.add_geometries([box], facecolor='red', alpha=0.4, crs=ccrs.PlateCarree()) ax.set_extent([-130., -60., 20., 45.], crs=ccrs.PlateCarree()) ``` -We can pass this bounding box directly to the ERDDAP Catalog constructor, as well as limit our query only to stations that contain data after 2014: +We can pass this bounding box directly to the ERDDAP Catalog constructor, as well as limit our query only to stations that contain data after 2014 and through 2017. We also will limit the data returned to the variable (through the `variables` keyword) we are searching for plus basic variables (time, longitude, latitude, and depth): ```{code-cell} ipython3 -cat = intake.open_erddap_cat( +cat = intake_erddap.ERDDAPCatalogReader( server=server, bbox=bbox, start_time=datetime(2014, 1, 1), - standard_names=["sea_surface_wind_wave_significant_height"] -) + end_time=datetime(2018,1,1), + standard_names=["sea_surface_wave_significant_height"], + variables=["sea_surface_wave_significant_height"], + dropna=True, +).read() len(cat) ``` ```{code-cell} ipython3 -df = pd.DataFrame([i.metadata for i in cat.values()]) +df = pd.DataFrame([cat[i].metadata for i in list(cat)]) sub_df = df[['datasetID', 'minTime', 'maxTime', 'title']] sub_df.style.set_table_attributes('class="dataframe docutils"').hide(axis="index") ``` @@ -108,23 +108,29 @@ ax.scatter(df['minLongitude'], df['minLatitude']) ax.set_title("Station Locations") ``` -We can now interrogate each of those stations and get a timeseries for the significant wave height data. +We can now interrogate each of those stations and get a timeseries for the significant wave height data. We'll use the first four that contain wave height data. + ```{code-cell} ipython3 -# Just get 4 -stations = list(cat)[:4] +# Just get 4 that aren't empty +stations = {} +for dataset_id in list(cat): + df = cat[dataset_id].read() + if len(df) > 0: + stations[dataset_id] = df + if len(stations) == 4: + break +``` -fig, axs = figure(nrows=len(stations), figsize=(18,18)) +```{code-cell} ipython3 -for i, dataset_id in enumerate(stations): +fig, axs = figure(nrows=len(stations), figsize=(15,10), sharex=True, sharey=True) + +for i, (dataset_id, df) in enumerate(stations.items()): ax = axs[i] - source = cat[dataset_id] - df = source.read() - t = df['time (UTC)'].astype('M8[s]') - sig_wave_height = df['sea_surface_wave_significant_height (m)'] - ax.plot(t, sig_wave_height) - ax.set_title(f'{dataset_id} Significant Wave Height (m)') - ax.set_xlim(np.datetime64('2014-01-01'), np.datetime64('2022-12-01')) + df.plot(ax=ax, x='time (UTC)', y='sea_surface_wave_significant_height (m)', fontsize=14, rot=30, + title=f'{dataset_id} Significant Wave Height (m)', legend=False, xlabel="") ax.grid() + fig.tight_layout(pad=1) ``` diff --git a/docs/index.rst b/docs/index.rst index 6409ec0..af2a4eb 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -6,12 +6,6 @@ Welcome to intake-erddap's documentation! ========================================= -.. toctree:: - :maxdepth: 2 - - examples - API - GitHub repository Intake ERDDAP ============= @@ -24,171 +18,34 @@ science projects. Intake ERDDAP provides a set of integrations for ERDDAP. - Produce a pandas DataFrame for a given dataset or query. - Get an xarray Dataset for the Gridded datasets. +The key features are: -.. 
image:: https://img.shields.io/github/actions/workflow/status/axiom-data-science/intake-erddap/test.yaml?branch=main&logo=github&style=for-the-badge - :alt: Build Status - -.. image:: https://img.shields.io/codecov/c/github/axiom-data-science/intake-erddap.svg?style=for-the-badge - :alt: Code Coverage - -.. image:: https://img.shields.io/badge/License-BSD--2%20Clause-blue.svg?style=for-the-badge - :alt: License:BSD - -.. image:: https://img.shields.io/github/actions/workflow/status/axiom-data-science/intake-erddap/linting.yaml?branch=main&label=Code%20Style&style=for-the-badge - :alt: Code Style Status - -The project is available on `Github `_. - - -TODO: Summary - -The Key features are: - - - Pandas DataFrames for any TableDAP dataset. - - xarray Datasets for any GridDAP datasets. - - Query by any or all: - - bounding box - - time - - CF ``standard_name`` - - variable name - - Plaintext Search term - - Save catalogs locally for future use. - - -Requirements ------------- - -- Python >= 3.8 +- Pandas DataFrames for any TableDAP dataset. +- xarray Datasets for any GridDAP datasets. +- Query by any or all: + - bounding box + - time + - CF ``standard_name`` + - variable name + - Plaintext Search term +- Save catalogs locally for future use. Installation ------------ -In the very near future, we will be offering the project on conda. Currently the -project is available on PyPI, so it can be installed using ``pip``:: +The project is available on PyPI, so it can be installed using ``pip``:: pip install intake-erddap -Examples --------- - -To create an intake catalog for all of the ERDDAP's TableDAP offerings use:: - - import intake - catalog = intake.open_erddap_cat( - server="https://erddap.sensors.ioos.us/erddap" - ) - - -The catalog objects behave like a dictionary with the keys representing the -dataset's unique identifier within ERDDAP, and the values being the -``TableDAPSource`` objects. To access a source object:: - - source = catalog["datasetid"] - -From the source object, a pandas DataFrame can be retrieved:: - - df = source.read() - -Scenarios ---------- - -Consider a case where you need to find all wind data near Florida.:: - - import intake - from datetime import datetime - bbox = (-87.84, 24.05, -77.11, 31.27) - catalog = intake.open_erddap_cat( - server="https://erddap.sensors.ioos.us/erddap", - bbox=bbox, - start_time=datetime(2022, 1, 1), - end_time=datetime(2023, 1, 1), - standard_names=["wind_speed", "wind_from_direction"], - ) - - df = next(catalog.values()).read() - - -.. raw:: html - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-                  time (UTC)  wind_speed (m.s-1)  wind_from_direction (degrees)
-0       2022-12-14T19:40:00Z                 7.0                          140.0
-1       2022-12-14T19:20:00Z                 7.0                          120.0
-2       2022-12-14T19:10:00Z                 NaN                            NaN
-3       2022-12-14T19:00:00Z                 9.0                          130.0
-4       2022-12-14T18:50:00Z                 9.0                          130.0
-...                      ...                 ...                            ...
-48296   2022-01-01T00:40:00Z                 4.0                          120.0
-48297   2022-01-01T00:30:00Z                 3.0                          130.0
-48298   2022-01-01T00:20:00Z                 4.0                          120.0
-48299   2022-01-01T00:10:00Z                 4.0                          130.0
-48300   2022-01-01T00:00:00Z                 4.0                          130.0
+.. toctree:: + :maxdepth: 3 + :hidden: + + user_guide + API + whats_new + GitHub repository Indices and tables diff --git a/docs/user_guide.rst b/docs/user_guide.rst new file mode 100644 index 0000000..0000da0 --- /dev/null +++ b/docs/user_guide.rst @@ -0,0 +1,164 @@ +User Guide +========== + +.. toctree:: + :maxdepth: 2 + + examples/wave-height.md + +Querying +-------- + +A catalog can be generated by passing your desired query parameters directly +with the ``kwargs_search`` keyword argument. This object gets passed to +`erddapy `_ :: + + import intake_erddap + + search = { + "min_lon": -180, + "max_lon": -156, + "min_lat": 50, + "max_lat": 66, + "min_time": "2021-04-01", + "max_time": "2021-04-02", + } + cat = intake_erddap.ERDDAPCatalogReader(server_url, kwargs_search=search) + + +The same query can also be specified using the constructor keyword arguments:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + bbox=(-180., 50., -156., 66.), + start_time=datetime(2021, 4, 1), + end_time=datetime(2021, 4, 2), + ) + +The catalog supports querying for datasets that contain a variable with a +particular +`CF Standard Name `_ +. Clients can specify the standard name queries with either the +``kwargs_search`` keyword argument, or the ``standard_names`` keyword argument:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + kwargs_search={ + "standard_name": "air_temperature", + }, + ) + +or:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + standard_names=["air_temperature"], + ) + +Multiple standard name values can be queries which will return all datasets +containing at least one of the queried standard names:: + + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + standard_names=["air_temperature", "air_pressure"], + ) + +In cases where standard names are not sufficient, clients can query using the +variable name as it appears in ERDDAP:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + variable_names=["Pair", "temp"], + ) + +Lastly, ERDDAP offers a plaintext search option. Clients can query for datasets +containing a plaintext search term:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + search_for=["ioos", "aoos", "NOAA"], + ) + +This can also be useful if you know the name of the station or stations you want +to make a catalog from :: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + search_for=["aoos_204"], + ) + +Querying with AND +----------------- + +Sometimes, clients may want to find only datasets that match all of the query +terms exactly. This can be achieved with the ``query_type`` keyword argument:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + standard_names=["air_temperature", "air_pressure"], + query_type="intersection", + ) + +This will return only datasets that have both ``air_temperature`` and +``air_pressure`` as standard names associated with variables. + + +Constraints +----------- + +Use the input option `use_source_constraints=True` to use any relevant parameter +from "kwargs_search" constraints in the query. 
This will pass a `start_time` on +so that it will limit the time returned in the data to the `start_time`, for example:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + bbox=(-180., 50., -156., 66.), + start_time=datetime(2021, 4, 1), + end_time=datetime(2021, 4, 2), + use_source_constraints=True, + ) + +Dropping bad values +------------------- + +Use the `dropna` option to drop rows with NaN values in the data columns:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + dropna=True, + ) + +Note that this is an alpha feature because it uses logic that identifies columns of data as opposed to coordinates and axes on its own to decide from which columns to drop NaN values. This has not been thoroughly tested. + + +Selecting which columns of data to return +----------------------------------------- + +Use the `variables` option to select which columns of data to return. This is useful when you only need a subset of the data columns:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + variables=["sea_water_temperature"], + ) + +Variables `time`, `latitude`, `longitude`, and `z` are always additionally returned. + + +Mask due to quality flags +------------------------- + +If `mask_failed_qartod=True`` and `*_qc_agg` columns associated with the data columns are available, data values associated with QARTOD flags other than 1 and 2 will be nan'ed out. Has not been thoroughly tested. + + +Simple caching +-------------- + +You can using simple caching through `fsspec` if you input `cache_kwargs` such as the following:: + + cat = intake_erddap.ERDDAPCatalogReader( + server=server_url, + cache_kwargs=dict(cache_storage="/tmp/fnames/", same_names=True), + ) + +This would have the effect of caching the data locally in the `/tmp/fnames/` directory so it doesn't have to be downloaded next time. The `same_names` option is useful if you want to cache the data with the same name as the data source for clarity. diff --git a/docs/whats_new.md b/docs/whats_new.md new file mode 100644 index 0000000..2bf203a --- /dev/null +++ b/docs/whats_new.md @@ -0,0 +1,6 @@ +# What's New + +## v0.5.0 (July 19, 2024) +* Major changes across the codebase to update to intake v2! Also updated class names; updated tests; updated docs. +* Now can choose variables to narrow results to. +* Fixed some bugs. 
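
The changelog above describes the v0.5.0 API shift in prose only; a minimal sketch of what the migration looks like in user code, assuming the server URL and query pattern used in this diff's README examples (network access required, and the first dataset returned depends on the live server):

```python
from datetime import datetime

import intake_erddap

# Pre-0.5.0 code called intake.open_erddap_cat(...) and indexed into a catalog
# of TableDAPSource objects. With intake v2, a Reader builds the catalog and
# .read() materializes it.
catalog = intake_erddap.ERDDAPCatalogReader(
    server="https://erddap.sensors.ioos.us/erddap",
    start_time=datetime(2022, 1, 1),
    end_time=datetime(2023, 1, 1),
    standard_names=["wind_speed"],
    variables=["wind_speed"],  # new in v0.5.0: narrow results to these columns
).read()

# Entries are keyed by ERDDAP dataset_id; each entry resolves to a TableDAPReader.
dataset_id = list(catalog)[0]
df = catalog[dataset_id].read()  # pandas DataFrame
print(df.head())
```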
diff --git a/environment.yml b/environment.yml index 63441df..d9e0d7d 100644 --- a/environment.yml +++ b/environment.yml @@ -8,10 +8,10 @@ dependencies: - dask - pandas - erddapy - - panel + # - panel - appdirs - - intake - - intake-xarray>=0.6.1 + # - intake - pip - pip: + - git+https://github.com/intake/intake - cf-pandas diff --git a/intake_erddap/__init__.py b/intake_erddap/__init__.py index fd168bf..45566da 100644 --- a/intake_erddap/__init__.py +++ b/intake_erddap/__init__.py @@ -1,12 +1,12 @@ """intake-erddap package.""" -from .erddap import GridDAPSource, TableDAPSource -from .erddap_cat import ERDDAPCatalog -from .version import __version__ +import intake # noqa: F401 + +from .erddap import GridDAPReader, TableDAPReader +from .erddap_cat import ERDDAPCatalogReader __all__ = [ - "ERDDAPCatalog", - "TableDAPSource", - "GridDAPSource", - "__version__", + "ERDDAPCatalogReader", + "TableDAPReader", + "GridDAPReader", ] diff --git a/intake_erddap/erddap.py b/intake_erddap/erddap.py index 13285b9..6516a9e 100644 --- a/intake_erddap/erddap.py +++ b/intake_erddap/erddap.py @@ -1,35 +1,26 @@ -"""Source implementations for intake-erddap.""" -import typing +"""Reader implementations for intake-erddap.""" from logging import getLogger -from typing import List, Optional, Tuple, Type, Union +from typing import List, Union import cf_pandas # noqa: F401 import fsspec -import numpy as np import pandas as pd import requests import xarray as xr from erddapy import ERDDAP -from intake.source import base - -from .version import __version__ +from intake.readers.readers import BaseReader log = getLogger("intake-erddap") -if typing.TYPE_CHECKING: # pragma: no cover - # numpy typing is only available after version 1.21 - from numpy.typing import ArrayLike - - -class ERDDAPSource(base.DataSource): +class ERDDAPReader(BaseReader): """ - ERDDAP Source (Base Class). This class represents the abstract base class - for an intake data source object for ERDDAP. Clients should use either - ``TableDAPSource`` or ``GridDAPSource``. + ERDDAP Reader (Base Class). This class represents the abstract base class + for an intake data reader object for ERDDAP. Clients should use either + ``TableDAPReader`` or ``GridDAPReader``. Parameters ---------- @@ -57,56 +48,22 @@ class ERDDAPSource(base.DataSource): Caches entire dataframe in memory. 
""" - name = "erddap" - version = __version__ - container = "dataframe" - partition_access = True + output_instance = "xarray:Dataset" - def __init__( - self, - dataset_id: str, - protocol: str, - variables: List[str] = None, - constraints: dict = None, - metadata: dict = None, - erddap_client: Optional[Type[ERDDAP]] = None, - http_client: Optional[Type] = None, - open_kwargs: dict = None, - ): - variables = variables or [] - constraints = constraints or {} - metadata = metadata or {} - - self._init_args = { - "dataset_id": dataset_id, - "protocol": protocol, - "variables": variables, - "constraints": constraints, - "metadata": metadata, - } - - self._dataset_id = dataset_id - self._protocol = protocol - self._variables = variables - self._constraints = constraints - self._erddap_client = erddap_client or ERDDAP - self._http = http_client or requests - self.open_kwargs = open_kwargs or {} - - super(ERDDAPSource, self).__init__(metadata=metadata) - - def get_client(self) -> ERDDAP: + def get_client( + self, server, protocol, dataset_id, variables, constraints, client=ERDDAP, **_ + ) -> ERDDAP: """Return an initialized ERDDAP Client.""" - e = self._erddap_client(server=self._server) - e.protocol = self._protocol - e.dataset_id = self._dataset_id - e.variables = self._variables - e.constraints = self._constraints + e = client(server=server) + e.protocol = protocol + e.dataset_id = dataset_id + e.variables = variables + e.constraints = constraints return e -class TableDAPSource(ERDDAPSource): - """Creates a Data Source for an ERDDAP TableDAP Dataset. +class TableDAPReader(ERDDAPReader): + """Creates a Data Reader for an ERDDAP TableDAP Dataset. Parameters ---------- @@ -124,14 +81,14 @@ class TableDAPSource(ERDDAPSource): A mapping of conditions and constraints. Example: ``{"time>=": "2022-01-02T12:00:00Z", "lon>": -140, "lon<": 0}`` metadata : dict, optional - Additional metadata to include with the source passed from the catalog. + Additional metadata to include with the reader passed from the catalog. erddap_client : type, optional A class that implements an interface like erdappy's ERDDAP class. The - source will rely on this client to interface with ERDDAP for most + reader will rely on this client to interface with ERDDAP for most requests. http_client : module or object, optional An object or module that implements an HTTP Client similar to request's - interface. The source will use this object to make HTTP requests to + interface. The reader will use this object to make HTTP requests to ERDDAP in some cases. mask_failed_qartod : bool, False WARNING ALPHA FEATURE. If True and `*_qc_agg` columns associated with @@ -148,19 +105,19 @@ class TableDAPSource(ERDDAPSource): Examples -------- - Sources are normally returned from a catalog object, but a source can be instantiated directly: + Readers are normally returned from a catalog object, but a Reader can be instantiated directly: - >>> source = TableDAPSource("https://erddap.senors.axds.co/erddap", + >>> reader = TableDAPReader("https://erddap.senors.axds.co/erddap", ... "gov_usgs_waterdata_441759103261203") - Getting a pandas DataFrame from the source: + Getting a pandas DataFrame from the reader: - >>> ds = source.read() + >>> ds = reader.read() Once the dataset object has been instantiated, the dataset's full metadata - is available in the source. + is available in the reader. 
- >>> source.metadata + >>> reader.metadata {'info_url': 'https://erddap.sensors.axds.co/erddap/info/gov_usgs_waterdata_404513098181201...', 'catalog_dir': '', 'variables': {'time': {'_CoordinateAxisType': 'Time', @@ -174,97 +131,80 @@ class TableDAPSource(ERDDAPSource): ... """ - name = "tabledap" - version = __version__ - container = "dataframe" - partition_access = True + output_instance = "pandas:DataFrame" - def __init__( + def _read( self, - server: str, - mask_failed_qartod: bool = False, - dropna: bool = False, - cache_kwargs: Optional[dict] = None, - *args, - **kwargs, + server, + dataset_id, + variables=None, + mask_failed_qartod=False, + dropna=False, + cache_kwargs=None, + open_kwargs=None, + constraints=None, + **kw, ): - self._server = server - self._dataframe: Optional[pd.DataFrame] = None - self._dataset_metadata: Optional[dict] = None - self._mask_failed_qartod = mask_failed_qartod - self._dropna = dropna - self._cache_kwargs = cache_kwargs - kwargs.pop("protocol", None) - # https://github.com/python/mypy/issues/6799 - super().__init__(*args, protocol="tabledap", **kwargs) # type: ignore - - def _get_schema(self) -> base.Schema: - if self._dataframe is None: - # TODO: could do partial read with chunksize to get likely schema from - # first few records, rather than loading the whole thing - self._load() - self._dataset_metadata = self._get_dataset_metadata() - # make type checker happy - assert self._dataframe is not None - return base.Schema( - datashape=None, - dtype=self._dataframe.dtypes, - shape=self._dataframe.shape, - npartitions=1, - extra_metadata=self._dataset_metadata, + open_kwargs = open_kwargs or {} + variables = variables or [] + kw.pop("protocol", None) + protocol = kw.pop("protocol", "tabledap") + + # check for variables in user-input list that are not available for the dataset + meta2 = self._get_dataset_metadata(server, dataset_id) + variables_diff = set(variables) - set(meta2["variables"].keys()) + if len(variables_diff) > 0: + variables = [var for var in variables if var not in variables_diff] + + e = self.get_client( + server, + protocol, + dataset_id, + variables=variables, + constraints=constraints or {}, + **kw, ) - - def _get_partition(self) -> pd.DataFrame: - if self._dataframe is None: - self._load_metadata() - return self._dataframe - - def read(self) -> pd.DataFrame: - """Return the dataframe from ERDDAP""" - return self._get_partition() - - def _close(self): - self._dataframe = None - - def _load(self): - e = self.get_client() - if self._cache_kwargs is not None: - if "response" in self.open_kwargs: - response = self.open_kwargs["response"] - self.open_kwargs.pop("response") + if cache_kwargs is not None: + if "response" in open_kwargs: + response = open_kwargs["response"] + open_kwargs.pop("response") url = e.get_download_url(response=response) else: - url = e.get_download_url(response=response) + url = e.get_download_url( + response="csvp" + ) # should this be the default or csv? 
- with fsspec.open(f"simplecache://::{url}", **self._cache_kwargs) as f: - self._dataframe: pd.DataFrame = pd.read_csv(f, **self.open_kwargs) + try: + with fsspec.open(f"simplecache://::{url}", **(cache_kwargs or {})) as f: + dataframe: pd.DataFrame = pd.read_csv(f, **open_kwargs) + except OSError as e: # might get file name too long + print(e) + print( + "If your filenames are too long, input only a few variables" + "to return or input into cache kwargs `same_names=False`" + ) else: - self._dataframe: pd.DataFrame = e.to_pandas( - requests_kwargs={"timeout": 60}, **self.open_kwargs + dataframe: pd.DataFrame = e.to_pandas( + requests_kwargs={"timeout": 60}, **open_kwargs ) - if self._mask_failed_qartod: - self.run_mask_failed_qartod() - if self._dropna: - self.run_dropna() - - @property - def data_cols(self): + if mask_failed_qartod: + dataframe = self.run_mask_failed_qartod(dataframe) + if dropna: + dataframe = self.run_dropna(dataframe) + return dataframe + + @staticmethod + def data_cols(df): """Columns that are not axes, coordinates, nor qc_agg columns.""" # find data columns which are what we'll use in the final step to drop nan's # don't include dimension/coordinates-type columns (dimcols) nor qc_agg columns (qccols) - dimcols = self._dataframe.cf.axes_cols + self._dataframe.cf.coordinates_cols - qccols = list( - self._dataframe.columns[self._dataframe.columns.str.contains("_qc_agg")] - ) - - datacols = [ - col for col in self._dataframe.columns if col not in dimcols + qccols - ] - + dimcols = df.cf.axes_cols + df.cf.coordinates_cols + qccols = list(df.columns[df.columns.str.contains("_qc_agg")]) + datacols = [col for col in df.columns if col not in dimcols + qccols] return datacols - def run_mask_failed_qartod(self): + def run_mask_failed_qartod(self, df): """Nan data values for which corresponding qc_agg columns is not equal to 1 or 2. To get this to work you may need to specify the "qc_agg" columns to come along specifically @@ -273,22 +213,21 @@ def run_mask_failed_qartod(self): # if a data column has an associated qc column, use it to weed out bad data by # setting it to nan. - for datacol in self.data_cols: + for datacol in self.data_cols(df): qccol = f"{datacol}_qc_agg" - if qccol in self._dataframe.columns: - self._dataframe.loc[ - ~self._dataframe[qccol].isin([1, 2]), datacol - ] = pd.NA - self._dataframe.drop(columns=[qccol], inplace=True) + if qccol in df.columns: + df.loc[~df[qccol].isin([1, 2]), datacol] = pd.NA + df.drop(columns=[qccol], inplace=True) + return df - def run_dropna(self): + def run_dropna(self, df): """Drop nan rows based on the data columns.""" - self._dataframe = self._dataframe.dropna(subset=self.data_cols) + return df.dropna(subset=self.data_cols(df)) - def _get_dataset_metadata(self) -> dict: + def _get_dataset_metadata(self, server, dataset_id) -> dict: """Fetch and return the metadata document for the dataset.""" - url = f"{self._server}/info/{self._dataset_id}/index.json" - resp = self._http.get(url) + url = f"{server}/info/{dataset_id}/index.json" + resp = requests.get(url) resp.raise_for_status() metadata: dict = {"variables": {}} for rowtype, varname, attrname, dtype, value in resp.json()["table"]["rows"]: @@ -326,8 +265,8 @@ def _parse_metadata_value( return newvalue -class GridDAPSource(ERDDAPSource): - """Creates a Data Source for an ERDDAP GridDAP Dataset. +class GridDAPReader(ERDDAPReader): + """Creates a Data Reader for an ERDDAP GridDAP Dataset. 
Parameters ---------- @@ -354,19 +293,19 @@ class GridDAPSource(ERDDAPSource): Examples -------- - Sources are normally returned from a catalog object, but a source can be instantiated directly: + Readers are normally returned from a catalog object, but a reader can be instantiated directly: - >>> source = GridDAPSource("https://coastwatch.pfeg.noaa.gov/erddap", "charmForecast1day", + >>> reader = GridDAPReader("https://coastwatch.pfeg.noaa.gov/erddap", "charmForecast1day", ... chunks={"time": 1}) - Getting an xarray dataset from the source object: + Getting an xarray dataset from the reader object: - >>> ds = source.to_dask() + >>> ds = reader.read() Once the dataset object has been instantiated, the dataset's full metadata - is available in the source. + is available in the reader. - >>> source.metadata + >>> reader.metadata {'catalog_dir': '', 'dims': {'time': 1182, 'latitude': 391, 'longitude': 351}, 'data_vars': {'pseudo_nitzschia': ['time', 'latitude', 'longitude'], @@ -379,114 +318,45 @@ class GridDAPSource(ERDDAPSource): 'acknowledgement': ... - Warning - ------- - The ``read()`` method will raise a ``NotImplemented`` exception because the - standard intake interface has the result read entirely into memory. For - gridded datasets this should not be allowed, reading the entire dataset into - memory can overwhelm the server, get the client blacklisted, and potentially - crash the client by exhausting available system memory. If a client truly - wants to load the entire dataset into memory, the client can invoke the - method ``ds.load()`` on the Dataset object. """ - name = "griddap" - version = __version__ - container = "xarray" - partition_access = True - - def __init__( + # def __init__( + # self, + # server: str, + # dataset_id: str, + # constraints: dict = None, + # chunks: Union[None, int, dict, str] = None, + # xarray_kwargs: dict = None, + # **kwargs, + # ): + # self._server = server + # self._chunks = chunks + # self._constraints = constraints or {} + # self._xarray_kwargs = xarray_kwargs or {} + # # Initialized by the private getter _get_schema + # self.urlpath = f"{server}/griddap/{dataset_id}" + # # https://github.com/python/mypy/issues/6799 + # kwargs.pop("protocol", None) + # super().__init__(dataset_id=dataset_id, protocol="griddap", **kwargs) # type: ignore + + def _read( self, server: str, dataset_id: str, constraints: dict = None, chunks: Union[None, int, dict, str] = None, xarray_kwargs: dict = None, - **kwargs, + **kw, ): - self._server = server - self._ds: Optional[xr.Dataset] = None - self._chunks = chunks - self._constraints = constraints or {} - self._xarray_kwargs = xarray_kwargs or {} - # Initialized by the private getter _get_schema - self._schema: Optional[base.Schema] = None - self.urlpath = f"{server}/griddap/{dataset_id}" - # https://github.com/python/mypy/issues/6799 - kwargs.pop("protocol", None) - super().__init__(dataset_id=dataset_id, protocol="griddap", **kwargs) # type: ignore - - def _get_schema(self) -> base.Schema: - self.urlpath = self._get_cache(self.urlpath)[0] - - if self._ds is None: - # Sets self._ds - self._open_dataset() - # Make mypy happy - assert self._ds is not None - metadata = { - "dims": dict(self._ds.dims), - "data_vars": { - k: list(self._ds[k].coords) for k in self._ds.data_vars.keys() - }, - "coords": tuple(self._ds.coords.keys()), - } - metadata.update(self._ds.attrs) - metadata["variables"] = {} - for varname in self._ds.variables: - metadata["variables"][varname] = self._ds[varname].attrs - self._schema = base.Schema( - 
datashape=None, - dtype=None, - shape=None, - npartitions=None, - extra_metadata=metadata, - ) - - return self._schema + constraints = constraints or {} + chunks = chunks or {} + xarray_kwargs = xarray_kwargs or {} + urlpath = f"{server}/griddap/{dataset_id}" - def _open_dataset(self): - self._ds = xr.open_dataset( - self.urlpath, chunks=self._chunks, **self._xarray_kwargs - ) + ds = xr.open_dataset(urlpath, chunks=chunks, **xarray_kwargs) # _NCProperties is an internal property which xarray does not yet deal # with specially, so we remove it here to prevent it from causing # problems for clients. - if "_NCProperties" in self._ds.attrs: - del self._ds.attrs["_NCProperties"] - - def read(self): - raise NotImplementedError( - "GridDAPSource.read is not implemented because ds.load() for grids from ERDDAP are " - "strongly discouraged. Use to_dask() instead." - ) - - def read_chunked(self) -> xr.Dataset: - """Return an xarray dataset (optionally chunked).""" - self._load_metadata() - return self._ds - - def read_partition(self, i: Tuple[str, ...]) -> "ArrayLike": - """Fetch one chunk of the array for a variable.""" - self._load_metadata() - if not isinstance(i, (tuple, list)): - raise TypeError("For Xarray sources, must specify partition as tuple") - if isinstance(i, list): - i = tuple(i) - # Make mypy happy - assert self._ds is not None - arr = self._ds[i[0]].data - idx = i[1:] - if isinstance(arr, np.ndarray): - return arr - # dask array - return arr.blocks[idx].compute() - - def to_dask(self) -> xr.Dataset: - """Return an xarray dataset (optionally chunked).""" - return self.read_chunked() - - def close(self): - """Close open descriptors.""" - self._ds = None - self._schema = None + if "_NCProperties" in ds.attrs: + del ds.attrs["_NCProperties"] + return ds diff --git a/intake_erddap/erddap_cat.py b/intake_erddap/erddap_cat.py index 4a0e968..0648918 100644 --- a/intake_erddap/erddap_cat.py +++ b/intake_erddap/erddap_cat.py @@ -20,21 +20,19 @@ import requests from erddapy import ERDDAP -from intake.catalog.base import Catalog -from intake.catalog.local import LocalCatalogEntry +from intake.readers.entry import Catalog, DataDescription +from intake.readers.readers import BaseReader from intake_erddap.cache import CacheStore from . import utils -from .erddap import GridDAPSource, TableDAPSource from .utils import match_key_to_category -from .version import __version__ log = getLogger("intake-erddap") -class ERDDAPCatalog(Catalog): +class ERDDAPCatalogReader(BaseReader): """ Makes data sources out of all datasets the given ERDDAP service @@ -93,8 +91,17 @@ class ERDDAPCatalog(Catalog): One of the two supported ERDDAP Data Access Protocols: "griddap", or "tabledap". "tabledap" will present tabular datasets using pandas, meanwhile "griddap" will use xarray. + chunks : dict, optional + For griddap protocol, pass a dictionary of chunk sizes for the xarray. + xarray_kwargs : dict, optional + For griddap protocol, pass a dictionary of kwargs to pass to the + xarray.open_dataset method. metadata : dict, optional Extra metadata for the intake catalog. + variables : list of str, optional + List of variables to limit the dataset to, if available. If you're not + sure what variables are available, check info_url for the station, or + look up the dataset on the ERDDAP server. query_type : str, default "union" Specifies how the catalog should apply the query parameters. Choices are ``"union"`` or ``"intersection"``. 
If the ``query_type`` is set to @@ -102,6 +109,11 @@ class ERDDAPCatalog(Catalog): each individual query made to ERDDAP. This is equivalent to a logical AND of the results. If the value is ``"union"`` then the results will be the union of each resulting dataset. This is equivalent to a logical OR. + open_kwargs : dict, optional + Keyword arguments to pass to the `open` method of the ERDDAP Reader, + e.g. pandas read_csv. Response is an optional keyword argument that will + be used by ERDDAPY to determine the response format. Default is "csvp" and + for TableDAP Readers, "csv" and "csv0" are reasonable choices too. mask_failed_qartod : bool, False WARNING ALPHA FEATURE. If True and `*_qc_agg` columns associated with data columns are available, data values associated with QARTOD flags @@ -124,7 +136,7 @@ class ERDDAPCatalog(Catalog): """ name = "erddap_cat" - version = __version__ + output_instance = "intake.readers.entry:Catalog" def __init__( self, @@ -142,7 +154,10 @@ def __init__( erddap_client: Optional[Type[ERDDAP]] = None, use_source_constraints: bool = True, protocol: str = "tabledap", + chunks: Optional[dict] = None, + xarray_kwargs: Optional[dict] = None, metadata: dict = None, + variables: list = None, query_type: str = "union", cache_period: Optional[Union[int, float]] = 500, open_kwargs: dict = None, @@ -154,9 +169,11 @@ def __init__( if server.endswith("/"): server = server[:-1] self._erddap_client = erddap_client or ERDDAP - self._entries: Dict[str, LocalCatalogEntry] = {} + self._entries: Dict[str, Catalog] = {} self._use_source_constraints = use_source_constraints self._protocol = protocol + self._chunks = chunks + self._xarray_kwargs = xarray_kwargs self._dataset_metadata: Optional[Mapping[str, dict]] = None self._query_type = query_type self.server = server @@ -166,6 +183,12 @@ def __init__( self._mask_failed_qartod = mask_failed_qartod self._dropna = dropna self._cache_kwargs = cache_kwargs + if variables is not None: + variables = ["time", "latitude", "longitude", "z"] + variables + self.variables = variables + + chunks = chunks or {} + xarray_kwargs = xarray_kwargs or {} if kwargs_search is not None: checks = [ @@ -248,7 +271,7 @@ def __init__( # Clear the cache of old stale data on initialization self.cache_store.clear_cache(cache_period) - super(ERDDAPCatalog, self).__init__(metadata=metadata, **kwargs) + super(ERDDAPCatalogReader, self).__init__(metadata=metadata, **kwargs) def _load_df(self) -> pd.DataFrame: frames = [] @@ -269,7 +292,6 @@ def _load_df(self) -> pd.DataFrame: raise df.rename(columns={"Dataset ID": "datasetID"}, inplace=True) frames.append(df) - if self._query_type == "union": result = pd.concat(frames) result = result.drop_duplicates("datasetID") @@ -410,7 +432,7 @@ def get_client(self) -> ERDDAP: e.dataset_id = "allDatasets" return e - def _load(self): + def read(self): dataidkey = "datasetID" e = self.get_client() df = self._load_df() @@ -418,15 +440,22 @@ def _load(self): self._entries = {} + # Remove datasets that are redundant + if len(df) > 0: + df = df[ + (~df["datasetID"].str.startswith("ism-")) + * (df["datasetID"] != "allDatasets") + ] + + entries, aliases = {}, {} for index, row in df.iterrows(): dataset_id = row[dataidkey] - if dataset_id == "allDatasets": - continue + metadata = all_metadata.get(dataset_id, {}) - description = "ERDDAP dataset_id %s from %s" % (dataset_id, self.server) args = { "server": self.server, "dataset_id": dataset_id, + "variables": self.variables, "protocol": self._protocol, "constraints": {}, "open_kwargs": 
self.open_kwargs, @@ -440,32 +469,36 @@ def _load(self): } ) args["constraints"].update(self._get_tabledap_constraints()) - - metadata = all_metadata.get(dataset_id, {}) - - entry = LocalCatalogEntry( - name=dataset_id, - description=description, - driver=self._protocol, - args=args, - metadata=metadata, - getenv=False, - getshell=False, - ) - if self._protocol == "tabledap": - entry._metadata["info_url"] = e.get_info_url( - response="csv", dataset_id=dataset_id - ) - entry._plugin = [TableDAPSource] + datatype = "intake_erddap.erddap:TableDAPReader" elif self._protocol == "griddap": - entry._plugin = [GridDAPSource] + args.update( + { + "chunks": self._chunks, + "xarray_kwargs": self._xarray_kwargs, + } + ) + # no equivalent for griddap, though maybe it works the same? + args["constraints"].update(self._get_tabledap_constraints()) + datatype = "intake_erddap.erddap:GridDAPReader" else: raise ValueError(f"Unsupported protocol: {self._protocol}") - self._entries[dataset_id] = entry + metadata["info_url"] = e.get_info_url(response="csv", dataset_id=dataset_id) + entries[dataset_id] = DataDescription( + datatype, + kwargs={"dataset_id": dataset_id, **args}, + metadata=metadata, + ) + aliases[dataset_id] = dataset_id + + cat = Catalog( + data=entries, + aliases=aliases, + ) + return cat def _get_tabledap_constraints(self) -> Dict[str, Union[str, int, float]]: - """Return the constraints dictionary for a tabledap source.""" + """Return the constraints dictionary for a tabledap Reader.""" result = {} if self._use_source_constraints and "min_time" in self.kwargs_search: min_time = self.kwargs_search["min_time"] diff --git a/intake_erddap/utils.py b/intake_erddap/utils.py index 615dd84..8bf8609 100644 --- a/intake_erddap/utils.py +++ b/intake_erddap/utils.py @@ -18,16 +18,6 @@ log = getLogger("intake-erddap") -def get_project_version() -> str: - """Return the project version. - - This function resolves circular import problems with version. 
- """ - from intake_erddap import __version__ - - return __version__ - - def return_category_options( server: str, category: str = "standard_name", diff --git a/intake_erddap/version.py b/intake_erddap/version.py deleted file mode 100644 index ed18a7b..0000000 --- a/intake_erddap/version.py +++ /dev/null @@ -1,9 +0,0 @@ -"""Project version module.""" -from pkg_resources import DistributionNotFound, get_distribution - - -try: - __version__ = get_distribution("intake-erddap").version -except DistributionNotFound: - # package is not installed - __version__ = "unknown" diff --git a/pyproject.toml b/pyproject.toml index 3a57e48..84ae917 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,5 @@ testpaths = [ [tool.coverage.run] omit = [ "setup.py", - "intake_erddap/_version.py", - "intake_erddap/version.py", "tests/*", ] diff --git a/setup.cfg b/setup.cfg index 59d4dbb..e69de29 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +0,0 @@ -[versioneer] -VCS = git -style = pep440 -versionfile_source = intake_erddap/_version.py -versionfile_build = intake_erddap/_version.py -tag_prefix = diff --git a/setup.py b/setup.py index bca2140..f7005e4 100644 --- a/setup.py +++ b/setup.py @@ -1,6 +1,7 @@ #!/usr/bin/env python from pathlib import Path + from setuptools import setup @@ -20,15 +21,15 @@ maintainer_email="dev@axds.co", license="BSD", packages=["intake_erddap"], - package_data={"": ["*.csv", "*.yml", "*.html"]}, + # package_data={"": ["*.csv", "*.yml", "*.html"]}, entry_points={ - "intake.drivers": [ - "tabledap = intake_erddap.erddap:TableDAPSource", - "griddap = intake_erddap.erddap:GridDAPSource", - "erddap_cat = intake_erddap.erddap_cat:ERDDAPCatalog", - ] + "intake.imports": [ + "tabledap = intake_erddap.erddap:TableDAPReader", + "griddap = intake_erddap.erddap:GridDAPReader", + "erddap_cat = intake_erddap.erddap_cat:ERDDAPCatalogReader", + ], }, - include_package_data=True, + # include_package_data=True, install_requires=requires, long_description=Path("README.md").read_text(), long_description_content_type='text/markdown', diff --git a/tests/test_cache.py b/tests/test_cache.py index 021e0f4..8241b27 100644 --- a/tests/test_cache.py +++ b/tests/test_cache.py @@ -36,23 +36,23 @@ def test_cache_file(user_cache_dir_mock, tempdir): assert filepath.name == f"{sha}.gz" -@mock.patch("requests.get") -@mock.patch("appdirs.user_cache_dir") -def test_cache_csv(user_cache_dir_mock, http_get_mock, tempdir): - user_cache_dir_mock.return_value = tempdir - resp = mock.Mock() - resp.content = b"blahblah" - http_get_mock.return_value = resp - url = "http://kevinbacon.invalid/erddap/advanced?blahbah" - store = cache.CacheStore() - store.cache_response(url) - sha = store.hash_url(url) - target = Path(tempdir) / f"{sha}.gz" - assert target.exists() - assert http_get_mock.called_with(url) - with gzip.open(target, "rt", encoding="utf-8") as f: - buf = f.read() - assert buf == "blahblah" +# @mock.patch("requests.get") +# @mock.patch("appdirs.user_cache_dir") +# def test_cache_csv(user_cache_dir_mock, http_get_mock, tempdir): +# user_cache_dir_mock.return_value = tempdir +# resp = mock.Mock() +# resp.content = b"blahblah" +# http_get_mock.return_value = resp +# url = "http://kevinbacon.invalid/erddap/advanced?blahbah" +# store = cache.CacheStore() +# store.cache_response(url) +# sha = store.hash_url(url) +# target = Path(tempdir) / f"{sha}.gz" +# assert target.exists() +# assert http_get_mock.called_with(url) +# with gzip.open(target, "rt", encoding="utf-8") as f: +# buf = f.read() +# assert buf == "blahblah" 
@mock.patch("requests.get") diff --git a/tests/test_erddap_cat.py b/tests/test_erddap_cat.py index d731185..bc554c7 100644 --- a/tests/test_erddap_cat.py +++ b/tests/test_erddap_cat.py @@ -17,8 +17,8 @@ from erddapy import ERDDAP -from intake_erddap.erddap import GridDAPSource, TableDAPSource -from intake_erddap.erddap_cat import ERDDAPCatalog +from intake_erddap.erddap import GridDAPReader, TableDAPReader +from intake_erddap.erddap_cat import ERDDAPCatalogReader SERVER_URL = "http://erddap.invalid/erddap" @@ -48,7 +48,7 @@ def temporary_catalog(): os.unlink(path) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog(mock_read_csv, load_metadata_mock): """Test basic catalog API.""" @@ -56,11 +56,11 @@ def test_erddap_catalog(mock_read_csv, load_metadata_mock): results = pd.DataFrame() results["datasetID"] = ["abc123"] mock_read_csv.return_value = results - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog_searching(mock_read_csv, load_metadata_mock): """Test catalog with search parameters.""" @@ -76,11 +76,11 @@ def test_erddap_catalog_searching(mock_read_csv, load_metadata_mock): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kw) + cat = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_erddap_catalog_searching_variable(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -105,15 +105,16 @@ def test_erddap_catalog_searching_variable(mock_read_csv, load_metadata_mock): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server=SERVER_URL, kwargs_search=kw, category_search=("standard_name", "temp") - ) + ) # this is object ERDDAPCatalogReader because I haven't run .read() + assert "standard_name" in cat.kwargs_search assert cat.kwargs_search["standard_name"] == ["sea_water_temperature"] @pytest.mark.integration -def test_ioos_erddap_catalog_and_source(): +def test_ioos_erddap_catalog_and_reader(): """Integration test against IOOS Sensors ERDDAP.""" bbox = (-73.32, 39.92, -69.17, 42.27) kw = { @@ -124,11 +125,11 @@ def test_ioos_erddap_catalog_and_source(): "min_time": "2021-4-1", "max_time": "2021-4-2", } - cat_sensors = intake.open_erddap_cat( + cat_sensors = ERDDAPCatalogReader( server="https://erddap.sensors.ioos.us/erddap", kwargs_search=kw - ) - source = cat_sensors["gov_noaa_water_wstr1"] - df = source.read() + ).read() + reader = cat_sensors["edu_ucsd_cdip_154"] + df = reader.read() assert df is not None assert isinstance(df, pd.DataFrame) assert len(df) > 0 @@ -139,18 +140,18 @@ def test_ioos_erddap_catalog_and_source(): @pytest.mark.integration def test_ioos_default_init(): """Test that the default catalog initializes.""" - cat_sensors = intake.open_erddap_cat( + cat_sensors = ERDDAPCatalogReader( 
server="https://erddap.sensors.ioos.us/erddap", - ) + ).read() assert len(cat_sensors) > 0 @pytest.mark.integration def test_erddap_global_conneection(): - ERDDAPCatalog( + ERDDAPCatalogReader( "https://erddap.sensors.axds.co/erddap", kwargs_search={"standard_name": "sea_water_temperature"}, - ) + ).read() def test_invalid_kwarg_search(): @@ -163,7 +164,7 @@ def test_invalid_kwarg_search(): } with pytest.raises(ValueError): - intake.open_erddap_cat(server=SERVER_URL, kwargs_search=kw) + ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() kw = { "min_lon": -180, @@ -174,10 +175,10 @@ def test_invalid_kwarg_search(): } with pytest.raises(ValueError): - intake.open_erddap_cat(server=SERVER_URL, kwargs_search=kw) + ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kw).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_uses_di_client( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -186,12 +187,12 @@ def test_catalog_uses_di_client( """Tests that the catalog uses the dependency injection provided client.""" mock_read_csv.return_value = single_dataset_catalog mock_erddap_client = mock.create_autospec(ERDDAP) - cat = ERDDAPCatalog(server=SERVER_URL, erddap_client=mock_erddap_client) + cat = ERDDAPCatalogReader(server=SERVER_URL, erddap_client=mock_erddap_client) client = cat.get_client() assert isinstance(client, mock.NonCallableMagicMock) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_skips_all_datasets_row(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -199,11 +200,11 @@ def test_catalog_skips_all_datasets_row(mock_read_csv, load_metadata_mock): df = pd.DataFrame() df["datasetID"] = ["allDatasets", "abc123"] mock_read_csv.return_value = df - cat = ERDDAPCatalog(server=SERVER_URL) + cat = ERDDAPCatalogReader(server=SERVER_URL).read() assert list(cat) == ["abc123"] -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_params_search(mock_read_csv, load_metadata_mock): load_metadata_mock.return_value = {} @@ -220,7 +221,7 @@ def test_params_search(mock_read_csv, load_metadata_mock): "max_time": "2022-11-07", "standard_name": "sea_water_temperature", } - cat = ERDDAPCatalog(server=erddap_url, kwargs_search=search) + cat = ERDDAPCatalogReader(server=erddap_url, kwargs_search=search) search_urls = cat.get_search_urls() assert search_urls parts = urlparse(search_urls[0]) @@ -232,30 +233,33 @@ def test_params_search(mock_read_csv, load_metadata_mock): assert query["standard_name"] == "sea_water_temperature" -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") -@mock.patch("intake_erddap.cache.CacheStore.read_csv") -def test_constraints_present_in_source( - mock_read_csv, load_metadata_mock, single_dataset_catalog -): - load_metadata_mock.return_value = {} - mock_read_csv.return_value = single_dataset_catalog - search = { - "min_time": "2022-01-01", - "max_time": "2022-11-07", - } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=search) - source = next(cat.values()) - assert source._constraints["time>="] == "2022-01-01" - assert 
source._constraints["time<="] == "2022-11-07" - - cat = ERDDAPCatalog( - server=SERVER_URL, kwargs_search=search, use_source_constraints=False - ) - source = next(cat.values()) - assert len(source._constraints) == 0 - - -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +# @mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") +# @mock.patch("intake_erddap.cache.CacheStore.read_csv") +# def test_constraints_present_in_reader( +# mock_read_csv, load_metadata_mock, single_dataset_catalog +# ): +# load_metadata_mock.return_value = {} +# mock_read_csv.return_value = single_dataset_catalog +# search = { +# "min_time": "2022-01-01", +# "max_time": "2022-11-07", +# } +# cat = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=search) +# cat.read() +# dataset_id = list(cat)[0] +# reader = cat[dataset_id] +# assert cat._constraints["time>="] == "2022-01-01" +# assert reader._constraints["time<="] == "2022-11-07" + +# cat = ERDDAPCatalogReader( +# server=SERVER_URL, kwargs_search=search, use_source_constraints=False +# ).read() +# dataset_id = list(cat)[0] +# reader = cat[dataset_id] +# assert len(reader._constraints) == 0 + + +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_with_griddap( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -266,12 +270,15 @@ def test_catalog_with_griddap( "min_time": "2022-01-01", "max_time": "2022-11-07", } - cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=search, protocol="griddap") - source = next(cat.values()) - assert isinstance(source, GridDAPSource) + cat = ERDDAPCatalogReader( + server=SERVER_URL, kwargs_search=search, protocol="griddap" + ).read() + dataset_id = list(cat)[0] + reader = cat[dataset_id] + assert isinstance(reader, GridDAPReader) -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_with_unsupported_protocol( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -283,10 +290,12 @@ def test_catalog_with_unsupported_protocol( } mock_read_csv.return_value = single_dataset_catalog with pytest.raises(ValueError): - ERDDAPCatalog(server=SERVER_URL, kwargs_search=search, protocol="fakedap") + ERDDAPCatalogReader( + server=SERVER_URL, kwargs_search=search, protocol="fakedap" + ).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_get_search_urls_by_category( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -298,64 +307,64 @@ def test_catalog_get_search_urls_by_category( "variableName": ["temp", "airTemp"], "search_for": ["kintsugi", "Asano"], } - catalog = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kwargs_search) + catalog = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kwargs_search) search_urls = catalog.get_search_urls() assert len(search_urls) == 6 -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_catalog_bbox(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} mock_read_csv.return_value = single_dataset_catalog - catalog = 
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_with_griddap(
     mock_read_csv, load_metadata_mock, single_dataset_catalog
@@ -266,12 +270,15 @@
         "min_time": "2022-01-01",
         "max_time": "2022-11-07",
     }
-    cat = ERDDAPCatalog(server=SERVER_URL, kwargs_search=search, protocol="griddap")
-    source = next(cat.values())
-    assert isinstance(source, GridDAPSource)
+    cat = ERDDAPCatalogReader(
+        server=SERVER_URL, kwargs_search=search, protocol="griddap"
+    ).read()
+    dataset_id = list(cat)[0]
+    reader = cat[dataset_id]
+    assert isinstance(reader, GridDAPReader)


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_with_unsupported_protocol(
     mock_read_csv, load_metadata_mock, single_dataset_catalog
@@ -283,10 +290,12 @@
     }
     mock_read_csv.return_value = single_dataset_catalog
     with pytest.raises(ValueError):
-        ERDDAPCatalog(server=SERVER_URL, kwargs_search=search, protocol="fakedap")
+        ERDDAPCatalogReader(
+            server=SERVER_URL, kwargs_search=search, protocol="fakedap"
+        ).read()


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_get_search_urls_by_category(
     mock_read_csv, load_metadata_mock, single_dataset_catalog
@@ -298,64 +307,64 @@
         "variableName": ["temp", "airTemp"],
         "search_for": ["kintsugi", "Asano"],
     }
-    catalog = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kwargs_search)
+    catalog = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kwargs_search)
     search_urls = catalog.get_search_urls()
     assert len(search_urls) == 6


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_bbox(mock_read_csv, load_metadata_mock, single_dataset_catalog):
     load_metadata_mock.return_value = {}
     mock_read_csv.return_value = single_dataset_catalog
-    catalog = ERDDAPCatalog(server=SERVER_URL, bbox=(-120.0, 30.0, -100.0, 48.0))
+    catalog = ERDDAPCatalogReader(server=SERVER_URL, bbox=(-120.0, 30.0, -100.0, 48.0))
     assert catalog.kwargs_search["min_lon"] == -120.0
     assert catalog.kwargs_search["max_lon"] == -100.0
     assert catalog.kwargs_search["min_lat"] == 30.0
     assert catalog.kwargs_search["max_lat"] == 48.0

     with pytest.raises(TypeError):
-        ERDDAPCatalog(server=SERVER_URL, bbox=[0, 0, 1, 1])
+        ERDDAPCatalogReader(server=SERVER_URL, bbox=[0, 0, 1, 1])

     with pytest.raises(ValueError):
-        ERDDAPCatalog(server=SERVER_URL, bbox=(0, 0))
+        ERDDAPCatalogReader(server=SERVER_URL, bbox=(0, 0))


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_standard_names_arg(
     mock_read_csv, load_metadata_mock, single_dataset_catalog
 ):
     load_metadata_mock.return_value = {}
     mock_read_csv.return_value = single_dataset_catalog
-    catalog = ERDDAPCatalog(
+    catalog = ERDDAPCatalogReader(
         server=SERVER_URL, standard_names=["air_temperature", "air_pressure"]
     )
     assert catalog.kwargs_search["standard_name"] == ["air_temperature", "air_pressure"]

     with pytest.raises(TypeError):
-        ERDDAPCatalog(server=SERVER_URL, standard_names="air_temperature")
+        ERDDAPCatalogReader(server=SERVER_URL, standard_names="air_temperature")


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_variable_names_arg(
     mock_read_csv, load_metadata_mock, single_dataset_catalog
 ):
     load_metadata_mock.return_value = {}
     mock_read_csv.return_value = single_dataset_catalog
-    catalog = ERDDAPCatalog(server=SERVER_URL, variable_names=["airTemp", "Pair"])
+    catalog = ERDDAPCatalogReader(server=SERVER_URL, variable_names=["airTemp", "Pair"])
     assert catalog.kwargs_search["variableName"] == ["airTemp", "Pair"]

     with pytest.raises(TypeError):
-        ERDDAPCatalog(server=SERVER_URL, variable_names="air_temperature")
+        ERDDAPCatalogReader(server=SERVER_URL, variable_names="air_temperature")


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_times_arg(mock_read_csv, load_metadata_mock, single_dataset_catalog):
     load_metadata_mock.return_value = {}
     mock_read_csv.return_value = single_dataset_catalog
-    catalog = ERDDAPCatalog(
+    catalog = ERDDAPCatalogReader(
         server=SERVER_URL,
         start_time=datetime(2022, 1, 1),
         end_time=datetime(2022, 12, 1),
@@ -363,30 +372,30 @@
     assert catalog.kwargs_search["min_time"] == "2022-01-01T00:00:00Z"
     assert catalog.kwargs_search["max_time"] == "2022-12-01T00:00:00Z"
     with pytest.raises(ValueError):
-        ERDDAPCatalog(server=SERVER_URL, start_time="2022-1-1")
+        ERDDAPCatalogReader(server=SERVER_URL, start_time="2022-1-1")
     with pytest.raises(ValueError):
-        ERDDAPCatalog(server=SERVER_URL, end_time="2022-1-1")
+        ERDDAPCatalogReader(server=SERVER_URL, end_time="2022-1-1")
     with pytest.raises(TypeError):
-        ERDDAPCatalog(server=SERVER_URL, start_time=np.datetime64("2022-01-01"))
+        ERDDAPCatalogReader(server=SERVER_URL, start_time=np.datetime64("2022-01-01"))
     with pytest.raises(TypeError):
-        ERDDAPCatalog(server=SERVER_URL, end_time=np.datetime64("2022-01-01"))
+        ERDDAPCatalogReader(server=SERVER_URL, end_time=np.datetime64("2022-01-01"))


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_search_for_arg(
     mock_read_csv, load_metadata_mock, single_dataset_catalog
 ):
     load_metadata_mock.return_value = {}
     mock_read_csv.return_value = single_dataset_catalog
-    catalog = ERDDAPCatalog(server=SERVER_URL, search_for=["ioos", "aoos"])
+    catalog = ERDDAPCatalogReader(server=SERVER_URL, search_for=["ioos", "aoos"])
     assert catalog.kwargs_search["search_for"] == ["ioos", "aoos"]

     with pytest.raises(TypeError):
-        ERDDAPCatalog(server=SERVER_URL, search_for="aoos")
+        ERDDAPCatalogReader(server=SERVER_URL, search_for="aoos")


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_query_search_for(
     mock_read_csv, load_metadata_mock, single_dataset_catalog
@@ -396,7 +405,7 @@
     kwargs_search = {
         "search_for": ["air_pressure", "air_temperature"],
     }
-    catalog = ERDDAPCatalog(server=SERVER_URL, kwargs_search=kwargs_search)
+    catalog = ERDDAPCatalogReader(server=SERVER_URL, kwargs_search=kwargs_search)
     search_urls = catalog.get_search_urls()
     url = search_urls[0]
     parts = urlparse(url)
@@ -409,48 +418,50 @@
     assert query["searchFor"] == "air_temperature"


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_search_returns_404(mock_read_csv, load_metadata_mock):
     load_metadata_mock.return_value = {}
     mock_read_csv.side_effect = HTTPError(
         code=404, msg="Blah", url=SERVER_URL, hdrs={}, fp=None
     )
-    cat = ERDDAPCatalog(server=SERVER_URL)
+    cat = ERDDAPCatalogReader(server=SERVER_URL).read()
     assert len(cat) == 0

     mock_read_csv.side_effect = HTTPError(
         code=500, msg="Blah", url=SERVER_URL, hdrs={}, fp=None
     )
     with pytest.raises(HTTPError):
-        ERDDAPCatalog(server=SERVER_URL)
+        ERDDAPCatalogReader(server=SERVER_URL).read()


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_saving_catalog(
     mock_read_csv, load_metadata_mock, single_dataset_catalog, temporary_catalog
 ):
     load_metadata_mock.return_value = {}
     mock_read_csv.return_value = single_dataset_catalog
-    cat = ERDDAPCatalog(server=SERVER_URL)
-    cat.save(temporary_catalog)
+    cat = ERDDAPCatalogReader(server=SERVER_URL).read()
+    cat.to_yaml_file(temporary_catalog)
     cat = intake.open_catalog(temporary_catalog)
-    source = next(cat.values())
-    assert isinstance(source, TableDAPSource)
-    assert source._protocol == "tabledap"
-    assert source._server == SERVER_URL
-    assert source._dataset_id == "abc123"
+    dataset_id = list(cat)[0]
+    assert dataset_id == "abc123"
+    reader = cat[dataset_id]
+    assert isinstance(reader, TableDAPReader)
+    assert cat.__dict__["data"][dataset_id].__dict__["kwargs"]["protocol"] == "tabledap"
+    assert cat.__dict__["data"][dataset_id].__dict__["kwargs"]["server"] == SERVER_URL

-    cat = ERDDAPCatalog(server=SERVER_URL, protocol="griddap")
-    cat.save(temporary_catalog)
+    cat = ERDDAPCatalogReader(server=SERVER_URL, protocol="griddap").read()
+    cat.to_yaml_file(temporary_catalog)
     cat = intake.open_catalog(temporary_catalog)
-    source = next(cat.values())
-    assert isinstance(source, GridDAPSource)
-    assert source._protocol == "griddap"
-    assert source._server == SERVER_URL
-    assert source._dataset_id == "abc123"
+    dataset_id = list(cat)[0]
+    assert dataset_id == "abc123"
+    reader = cat[dataset_id]
+    assert isinstance(reader, GridDAPReader)
+    assert cat.__dict__["data"][dataset_id].__dict__["kwargs"]["protocol"] == "griddap"
+    assert cat.__dict__["data"][dataset_id].__dict__["kwargs"]["server"] == SERVER_URL


 @mock.patch("intake_erddap.utils.get_erddap_metadata")
@@ -463,20 +474,20 @@ def test_loading_metadata(
         "abc123": {"datasetID": "abc123", "institution": "FOMO"}
     }

-    cat = ERDDAPCatalog(server=SERVER_URL)
+    cat = ERDDAPCatalogReader(server=SERVER_URL)
     assert cat["abc123"].metadata["institution"] == "FOMO"


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_trailing_slash(mock_read_csv, load_metadata_mock, single_dataset_catalog):
     load_metadata_mock.return_value = {}
     mock_read_csv.return_value = single_dataset_catalog
-    catalog = ERDDAPCatalog(server="http://blah.invalid/erddap/")
+    catalog = ERDDAPCatalogReader(server="http://blah.invalid/erddap/")
     assert catalog.server == "http://blah.invalid/erddap"


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_catalog_query_type_intersection(mock_read_csv, load_metadata_mock):
     data = [
@@ -521,7 +532,7 @@
     # mock 3 calls
     mock_read_csv.side_effect = [sub_df1, sub_df2, sub_df3]

-    catalog = ERDDAPCatalog(
+    catalog = ERDDAPCatalogReader(
         server=SERVER_URL,
         standard_names=["air_pressure", "air_temperature"],
         variable_names=["sigma"],
@@ -531,33 +542,35 @@
     assert len(search_urls) == 3


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_query_type_invalid(mock_read_csv, load_metadata_mock, single_dataset_catalog):
     load_metadata_mock.return_value = {}
     mock_read_csv.return_value = single_dataset_catalog
     with pytest.raises(ValueError):
-        ERDDAPCatalog(server="http://blah.invalid/erddap/", query_type="blah")
+        ERDDAPCatalogReader(
+            server="http://blah.invalid/erddap/", query_type="blah"
+        ).read()


 @pytest.mark.integration
 def test_empty_search_results():
-    cat = intake.open_erddap_cat(
+    cat = ERDDAPCatalogReader(
         server="https://erddap.sensors.ioos.us/erddap",
         standard_names=["sea_surface_temperature"],
         kwargs_search={
-            "min_lon": -156.48529052734375,
-            "max_lon": -148.9251251220703,
+            "min_lon": -153.48529052734375,
+            "max_lon": -150.9251251220703,
             "min_lat": 56.70049285888672,
             "max_lat": 61.524776458740234,
             "min_time": "2022-04-30T00:00:00.000000000",
             "max_time": "2022-12-15T23:00:00.000000000",
         },
-    )
+    ).read()
     assert len(cat) == 0


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog):
     load_metadata_mock.return_value = {}
@@ -565,9 +578,9 @@
     resp.status_code = 404
     mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp)

-    cat = ERDDAPCatalog(
+    cat = ERDDAPCatalogReader(
         server="http://blah.invalid/erddap", standard_names=["air_temperature"]
-    )
+    ).read()
     assert len(cat) == 0
     mock_read_csv.assert_called()

@@ -575,12 +588,12 @@
     resp.status_code = 500
     mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp)
     with pytest.raises(requests.exceptions.HTTPError):
-        ERDDAPCatalog(
+        ERDDAPCatalogReader(
             server="http://blah.invalid/erddap", standard_names=["air_temperature"]
-        )
+        ).read()


-@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata")
+@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata")
 @mock.patch("intake_erddap.cache.CacheStore.read_csv")
 def test_empty_catalog_with_intersection(
     mock_read_csv, load_metadata_mock, single_dataset_catalog
@@ -590,10 +603,10 @@
     resp.status_code = 404
     mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp)

-    cat = ERDDAPCatalog(
+    cat = ERDDAPCatalogReader(
         server="http://blah.invalid/erddap",
         standard_names=["air_temperature"],
         query_type="intersection",
-    )
+    ).read()
     assert len(cat) == 0
     mock_read_csv.assert_called()
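The two empty-catalog tests above fix the error-handling contract: a 404 from the ERDDAP search endpoint means "no matching datasets" and yields an empty catalog, while any other HTTP error propagates. A hypothetical sketch of that pattern; `safe_search` and its signature are illustrative, not the library's API:

```python
# Sketch of the assumed contract, not intake_erddap's actual implementation.
import requests


def safe_search(read_csv, url: str):
    try:
        return read_csv(url)
    except requests.exceptions.HTTPError as e:
        if e.response is not None and e.response.status_code == 404:
            return None  # no results; the caller builds an empty catalog
        raise  # 500s and other failures surface to the caller
```
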
@mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog): load_metadata_mock.return_value = {} @@ -565,9 +578,9 @@ def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog resp.status_code = 404 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"] - ) + ).read() assert len(cat) == 0 mock_read_csv.assert_called() @@ -575,12 +588,12 @@ def test_empty_catalog(mock_read_csv, load_metadata_mock, single_dataset_catalog resp.status_code = 500 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) with pytest.raises(requests.exceptions.HTTPError): - ERDDAPCatalog( + ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"] - ) + ).read() -@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalog._load_metadata") +@mock.patch("intake_erddap.erddap_cat.ERDDAPCatalogReader._load_metadata") @mock.patch("intake_erddap.cache.CacheStore.read_csv") def test_empty_catalog_with_intersection( mock_read_csv, load_metadata_mock, single_dataset_catalog @@ -590,10 +603,10 @@ def test_empty_catalog_with_intersection( resp.status_code = 404 mock_read_csv.side_effect = requests.exceptions.HTTPError(response=resp) - cat = ERDDAPCatalog( + cat = ERDDAPCatalogReader( server="http://blah.invalid/erddap", standard_names=["air_temperature"], query_type="intersection", - ) + ).read() assert len(cat) == 0 mock_read_csv.assert_called() diff --git a/tests/test_erddap_source.py b/tests/test_erddap_reader.py similarity index 66% rename from tests/test_erddap_source.py rename to tests/test_erddap_reader.py index 962f016..17e2320 100644 --- a/tests/test_erddap_source.py +++ b/tests/test_erddap_reader.py @@ -1,6 +1,6 @@ #!/usr/bin/env pytest # -*- coding: utf-8 -*- -"""Unit tests for the ERDDAP Source object.""" +"""Unit tests for the ERDDAP Reader object.""" import json from pathlib import Path @@ -12,13 +12,13 @@ import pytest import xarray as xr -from intake_erddap.erddap import GridDAPSource, TableDAPSource +from intake_erddap.erddap import GridDAPReader, TableDAPReader def _grid(grid_data) -> xr.Dataset: time = xr.DataArray( - data=np.array(["2022-01-01T00:00:00"], dtype=" xr.Dataset: return _grid(grid_data) -@mock.patch("intake_erddap.erddap.TableDAPSource._get_dataset_metadata") +@mock.patch("intake_erddap.erddap.TableDAPReader._get_dataset_metadata") @mock.patch("erddapy.ERDDAP.to_pandas") -def test_erddap_source_read(mock_to_pandas, mock_get_dataset_metadata): - """Tests that the source will read from ERDDAP into a pd.DataFrame.""" +def test_erddap_reader_read(mock_to_pandas, mock_get_dataset_metadata): + """Tests that the reader will read from ERDDAP into a pd.DataFrame.""" df = pd.DataFrame() df["time (UTC)"] = ["2022-10-21T00:00:00Z", "2022-10-21T00:00:00Z"] df["sea_water_temperature (deg_C)"] = [13.4, 13.4] mock_to_pandas.return_value = df - mock_get_dataset_metadata.return_value = {} + mock_get_dataset_metadata.return_value = {"variables": {}} - source = TableDAPSource( + reader = TableDAPReader( server="http://erddap.invalid/erddap", dataset_id="abc123", protocol="tabledap" ) - df = source.read() + df = reader.read() + assert df is not None assert mock_to_pandas.called assert len(df) == 2 - source.close() - assert source._dataframe is None + reader.close() 
-@mock.patch("intake_erddap.erddap.TableDAPSource._get_dataset_metadata") +@mock.patch("intake_erddap.erddap.TableDAPReader._get_dataset_metadata") @mock.patch("erddapy.ERDDAP.to_pandas") -def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata): - """Tests that the source will read from ERDDAP into a pd.DataFrame with processing flag.""" +def test_erddap_reader_read_processing(mock_to_pandas, mock_get_dataset_metadata): + """Tests that the reader will read from ERDDAP into a pd.DataFrame with processing flag.""" df = pd.DataFrame() df["time"] = [ "2022-10-21T01:00:00Z", @@ -94,16 +94,16 @@ def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata df["sea_water_temperature"] = [13.4, 13.4, np.nan] df["sea_water_temperature_qc_agg"] = [1, 4, 2] mock_to_pandas.return_value = df - mock_get_dataset_metadata.return_value = {} + mock_get_dataset_metadata.return_value = {"variables": {}} - source = TableDAPSource( + reader = TableDAPReader( server="http://erddap.invalid/erddap", dataset_id="abc123", protocol="tabledap", mask_failed_qartod=True, dropna=True, ) - df = source.read() + df = reader.read() assert df is not None assert mock_to_pandas.called # mask_failed_qartod flag removes 2nd data point and dropna removes 3rd data point @@ -111,7 +111,7 @@ def test_erddap_source_read_processing(mock_to_pandas, mock_get_dataset_metadata @mock.patch("requests.get") -def test_tabledap_source_get_dataset_metadata(mock_get): +def test_tabledap_reader_get_dataset_metadata(mock_get): test_data = Path(__file__).parent / "test_data/tabledap_metadata.json" bad = { "table": { @@ -124,8 +124,10 @@ def test_tabledap_source_get_dataset_metadata(mock_get): resp = mock.MagicMock() resp.json.side_effect = [json.loads(test_data.read_text()), bad] mock_get.return_value = resp - source = TableDAPSource(server="http://erddap.invalid", dataset_id="abc123") - metadata = source._get_dataset_metadata() + server = "http://erddap.invalid" + dataset_id = "abc123" + reader = TableDAPReader(server, dataset_id) + metadata = reader._get_dataset_metadata(server, dataset_id) assert metadata["cdm_data_type"] == "TimeSeries" assert metadata["variables"]["z"]["actual_range"] == [0.0, 0.0] assert metadata["variables"]["depth_to_water_level"]["status_flags"] == [ @@ -136,43 +138,28 @@ def test_tabledap_source_get_dataset_metadata(mock_get): 9, ] - metadata = source._get_dataset_metadata() + metadata = reader._get_dataset_metadata(server, dataset_id) assert len(metadata) == 1 assert len(metadata["variables"]) == 0 @mock.patch("xarray.open_dataset") -def test_griddap_source_no_chunks(mock_open_dataset, fake_grid): +def test_griddap_reader_no_chunks(mock_open_dataset, fake_grid): server = "https://erddap.invalid" dataset_id = "abc123" mock_open_dataset.return_value = fake_grid - source = GridDAPSource(server=server, dataset_id=dataset_id) - ds = source.to_dask() + reader = GridDAPReader(server=server, dataset_id=dataset_id) + ds = reader.read() assert ds is fake_grid assert "_NCProperties" not in ds.attrs - - with pytest.raises(NotImplementedError): - source.read() - - arr = source.read_partition(("temp", None)) - assert isinstance(arr, np.ndarray) - - arr = source.read_partition(["temp", None]) - assert isinstance(arr, np.ndarray) - - with pytest.raises(TypeError): - source.read_partition("temp") - - source.close() - assert source._ds is None - assert source._schema is None + assert "temp" in ds.variables @mock.patch("xarray.open_dataset") -def test_griddap_source_with_dask(mock_open_dataset, 
-def test_griddap_source_with_dask(mock_open_dataset, fake_dask_grid):
+def test_griddap_reader_with_dask(mock_open_dataset, fake_dask_grid):
     server = "https://erddap.invalid"
     dataset_id = "abc123"
     mock_open_dataset.return_value = fake_dask_grid
-    source = GridDAPSource(server=server, dataset_id=dataset_id)
-    arr = source.read_partition(("temp", 0))
-    assert isinstance(arr, np.ndarray)
+    reader = GridDAPReader(server=server, dataset_id=dataset_id)
+    arr = reader.read()
+    assert isinstance(arr, xr.Dataset)
diff --git a/tests/test_utils.py b/tests/test_utils.py
index 8669cfe..e445632 100644
--- a/tests/test_utils.py
+++ b/tests/test_utils.py
@@ -17,11 +17,6 @@ class Something:
     pass


-def test_get_project_version():
-    version = utils.get_project_version()
-    assert version is not None
-
-
 @mock.patch("pandas.read_csv")
 def test_category_and_key(mock_read_csv):
     df_mock = pd.DataFrame()
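Taken together, the catalog tests above also demonstrate the persistence round trip introduced in this change (`to_yaml_file` plus plain `intake.open_catalog`). A minimal usage sketch, assuming a reachable ERDDAP server; the URL and file name are placeholders:

```python
# Build a catalog, persist it to YAML, and reload it with plain intake.
import intake

from intake_erddap.erddap_cat import ERDDAPCatalogReader

cat = ERDDAPCatalogReader(server="https://erddap.example.org/erddap").read()
cat.to_yaml_file("erddap_catalog.yaml")

# Later (or on another machine), entries come back as reader objects.
cat = intake.open_catalog("erddap_catalog.yaml")
for dataset_id in list(cat):
    print(dataset_id, type(cat[dataset_id]))
```
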