diff --git a/.gitignore b/.gitignore index b1bd05e..dbbefbd 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,6 @@ +# Launcher +update.sh + # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] diff --git a/.readthedocs.yaml b/.readthedocs.yaml index b4c2f31..ea527a5 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -7,9 +7,9 @@ version: 2 # Set the version of Python and other tools you might need build: - os: ubuntu-20.04 + os: ubuntu-22.04 tools: - python: "3.8" + python: "3.10" # Build documentation in the docs/ directory with Sphinx sphinx: diff --git a/.zenodo.json b/.zenodo.json index 7acf567..a573784 100644 --- a/.zenodo.json +++ b/.zenodo.json @@ -3,17 +3,22 @@ { "orcid": "0000-0002-0826-4487", "affiliation": "UMR Marbec, IRD", - "name": "Gaetan Morand" + "name": "Morand, Gaetan" }, { "affiliation": "UMR Marbec, IRD", - "name": "Sylvain Poulain" + "name": "Poulain, Sylvain" + }, + { + "orcid": "0000-0002-3519-6141", + "affiliation": "UMR Marbec, IRD", + "name": "Barde, Julien" } ], "license": "GPL-3.0", - "title": "GeoEnrich v0.5.8: a new tool for scientists to painlessly enrich species occurrence data with environmental variables", + "title": "GeoEnrich v0.6.2: a new tool for scientists to painlessly enrich species occurrence data with environmental variables", "related_identifiers": [ { diff --git a/CHANGELOG.md b/CHANGELOG.md index 9e090ec..3d1563e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,22 @@ +## v0.6.2 + +#### New functions: + - Accept semicolon-delimited CSV files for personal variable catalogs + - Use closest lower depth instead of closest depth to stay in the water column + + +## v0.6.1 + +#### Bug fixes: + - Fixed metadata generation when downloading Copernicus data + +## v0.6 + +#### New functions: + - Added support for the new Copernicus data store (using copernicusmarine API) + - Added support for 'nearest' mode for data recovery at depth. + + ## v0.5.8 #### New functions: diff --git a/README.md b/README.md index a9b1097..3b60254 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# **geoenrich 0.5.8** +# **geoenrich 0.6.2** [![Read the Docs](https://img.shields.io/readthedocs/geoenrich)](https://geoenrich.readthedocs.io/en/latest/) [![License](https://img.shields.io/github/license/morand-g/geoenrich?color=green)](https://github.com/morand-g/geoenrich/blob/main/LICENSE) @@ -29,9 +29,10 @@ Documentation on [Read the Docs](https://geoenrich.readthedocs.io). ![Illustration of an occurrence dataset enriched with bathymetry data](https://github.com/morand-g/geoenrich/blob/main/geoenrich/data/readme_illus_1.png?raw=true "Illustration of an occurrence dataset enriched with bathymetry data") +# Acknowledgment This project is being developed as part of the G2OI project, cofinanced by the European union, the Reunion region, and the French Republic. 
-Union Européenne     Région Réunion     République Française +Union Européenne     Région Réunion     République Française ## Installation diff --git a/docker/Dockerfile b/docker/Dockerfile index a53dd9b..12114ac 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,11 +1,11 @@ -FROM tiangolo/uwsgi-nginx-flask:python3.8 +FROM tiangolo/uwsgi-nginx-flask:python3.10 RUN apt-get update -RUN apt-get install ffmpeg libsm6 libxext6 python3-mpi4py -y +RUN apt-get install ffmpeg libsm6 libxext6 -y -RUN python3 -m pip install geoenrich mpi4py +RUN python3 -m pip install geoenrich RUN rm /app/* RUN curl https://raw.githubusercontent.com/morand-g/geoenrich/main/docker/initialize.py -o /home/initialize.py -RUN old_path="'./'" && new_path="'/app/data/'" && sed -i "s%$old_path%$new_path%g" /usr/local/lib/python3.8/site-packages/geoenrich/credentials_example.py +RUN old_path="'./'" && new_path="'/app/data/'" && sed -i "s%$old_path%$new_path%g" /usr/local/lib/python3.10/site-packages/geoenrich/credentials_example.py diff --git a/docker/app/main.py b/docker/app/main.py index 47ab9a1..92fd16d 100644 --- a/docker/app/main.py +++ b/docker/app/main.py @@ -16,7 +16,7 @@ app = Flask(__name__) # enable debugging mode -app.config["DEBUG"] = True +app.config["DEBUG"] = False # App variables app.config['UPLOAD_FOLDER'] = 'static/uploads/' diff --git a/docker/app/templates/home.html b/docker/app/templates/home.html index 0b83464..10198ac 100644 --- a/docker/app/templates/home.html +++ b/docker/app/templates/home.html @@ -79,16 +79,14 @@

GeoEnrich online

- - - - + + @@ -96,7 +94,6 @@

GeoEnrich online

- @@ -104,12 +101,9 @@

GeoEnrich online

- - + - - diff --git a/docker/docker-compose.yml b/docker/docker-compose.yml index 5b6e233..21e9353 100644 --- a/docker/docker-compose.yml +++ b/docker/docker-compose.yml @@ -13,7 +13,6 @@ services: - "8080:8080" environment: - FLASK_APP=main.py - - FLASK_DEBUG=1 - 'RUN=flask run --host=0.0.0.0 --port=8080' command: bash -c "python /home/initialize.py && flask run --host=0.0.0.0 --port=8080" diff --git a/docker/initialize.py b/docker/initialize.py index 47c674f..8d4be36 100644 --- a/docker/initialize.py +++ b/docker/initialize.py @@ -53,9 +53,9 @@ Path('/app/static/assets/' + asset).open('wb').write(r.content) if Path('/app/conf/credentials.py').exists(): - shutil.copy(Path('/app/conf/credentials.py'), Path('/usr/local/lib/python3.8/site-packages/geoenrich/credentials.py')) + shutil.copy(Path('/app/conf/credentials.py'), Path('/usr/local/lib/python3.10/site-packages/geoenrich/credentials.py')) if Path('/app/conf/personal_catalog.csv').exists(): - shutil.copy(Path('/app/conf/personal_catalog.csv'), Path('/usr/local/lib/python3.8/site-packages/geoenrich/data/personal_catalog.csv')) + shutil.copy(Path('/app/conf/personal_catalog.csv'), Path('/usr/local/lib/python3.10/site-packages/geoenrich/data/personal_catalog.csv')) -print('Initialization complete.') \ No newline at end of file +print('Initialization complete.') diff --git a/docs/requirements.txt b/docs/requirements.txt index a80726c..07eef1e 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ numpy pandas geopandas -netCDF4==1.5.8 +netCDF4 python-dwca-reader tqdm opencv-python @@ -14,3 +14,5 @@ geojson_rewind geomet nbsphinx sphinx_copybutton +sphinx_rtd_theme +copernicusmarine diff --git a/docs/source/conf.py b/docs/source/conf.py index 1f781db..9fbfa7e 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -21,11 +21,11 @@ # -- Project information ----------------------------------------------------- project = 'geoenrich' -copyright = '2022, Gaétan Morand (UMR Marbec). Project under GNU GPL v3 license' +copyright = '2024, Gaétan Morand (UMR Marbec). Project under GNU GPL v3 license' author = 'Gaétan Morand (UMR Marbec)' # The full version, including alpha/beta/rc tags -release = '0.5.8' +release = '0.6.2' # -- General configuration --------------------------------------------------- diff --git a/docs/source/enrichment.rst b/docs/source/enrichment.rst index 048fe1c..76136e4 100644 --- a/docs/source/enrichment.rst +++ b/docs/source/enrichment.rst @@ -33,6 +33,8 @@ Other functions (for internal use) .. autofunction:: geoenrich.enrichment.enrich_compute +.. autofunction:: geoenrich.enrichment.enrich_copernicus + .. autofunction:: geoenrich.enrichment.enrich_download .. autofunction:: geoenrich.enrichment.get_enrichment_id diff --git a/docs/source/examples.rst b/docs/source/examples.rst index c3cb422..d0d2d6e 100644 --- a/docs/source/examples.rst +++ b/docs/source/examples.rst @@ -19,27 +19,31 @@ You may also use a custom csv file that does not follow any standard. In this ca A column with a unique ID is mandatory, to be able to link downloaded data to the corresponding occurrence. Date, latitude, and longitude columns are mandatory. Here is an exemple of such a file: .. 
list-table:: turtles.csv - :widths: 20 20 20 20 20 + :widths: 10 20 20 10 20 20 :header-rows: 1 * - ID - Lat - Lon + - Depth - Day - Comments * - turtle1 - -28.752241 - 154.8926541 + - 12 - 2018-07-29 - bottom feeding * - turtle2 - 2.5754611 - 72.964164 + - 4 - 2019-02-13 - cruising * - turtle3 - -21.2871554 - 55.316446 + - 3 - 2021-01-05 - resting @@ -49,7 +53,8 @@ This file can be imported the following way:: id_col = 'ID', date_col = 'Day', lat_col = 'Lat', - lon_col = 'Lon') + lon_col = 'Lon', + depth_col = 'Depth') The date parser should work with any common date format. If you encounter problems with a custom date format, you can try to provide an explicit format string using the *date_format* parameter. See *strptime* documentation `here `_. diff --git a/docs/source/index.rst b/docs/source/index.rst index 2efacd8..693e798 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -1,4 +1,4 @@ -geoenrich 0.5.8 documentation +geoenrich 0.6.2 documentation ============================== |Read the Docs| |License| |PyPI| |Python versions| |Last commit| |DOI| @@ -6,7 +6,7 @@ geoenrich 0.5.8 documentation GeoEnrich provides functionalities to enrich georeferenced events (such as species occurrences) with environmental data from satellites or models. Users can specify a geographic or temporal buffer to include data in the neighbourhood of occurrences into their analyses. Two main outputs are available: a simple summary of the variable in the requested area, or the full data (as a geotiff raster, a png image, or a numpy array). -Sea surface temperature, chlorophyll, and 40 other environmental variables are available natively, and others can easily be added by the user. This package is intended for large numbers of occurrences: local storage is implemented to avoid redundant requests to remote servers. +Sea surface temperature, chlorophyll, and 40 other environmental variables are available natively, and other sources can easily be added by the user. This package is intended for large numbers of occurrences: local storage is implemented to avoid redundant requests to remote servers. The package provides functions to retrieve occurrence data directly from GBIF, or open a custom dataset from any source. Arbitrary areas defined by the user can also be enriched. diff --git a/docs/source/install.rst b/docs/source/install.rst index e887baf..46c85a1 100644 --- a/docs/source/install.rst +++ b/docs/source/install.rst @@ -5,7 +5,7 @@ Installation instructions for Python 1. Work environment ------------------- -This package was tested on Ubuntu 20.04 with Python 3.8. +This package was tested on Ubuntu 20.04 with Python 3.8 and on Ubuntu 22.04 with Python 3.10. It should work on other operating systems and with other versions of Python 3, but this wasn't tested yet. 2. Prerequisites @@ -13,6 +13,8 @@ It should work on other operating systems and with other versions of Python 3, b Assuming you have Python3 and pip installed. This is automatic in all recent Linux distributions. Otherwise instructions are available here: `Python `_ and `pip `_. +If you want to use Copernicus data, you need to install Copernicus Marine API (`instructions `_) and set it up with your Copernicus account (`instructions `_). + 3. Installation --------------- @@ -53,14 +55,20 @@ There is also a dictionary named *dap_creds* that is intended to store credentia 4.2. Adding other data sources ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -At the same location, there is a *catalog.csv* file that already contains a list of available variables. 
If you want to use a dataset from Copernicus, you first need to register on `their website `_ and write your credentials in the *credentials.py* file. +At the same location, there is a *catalog.csv* file that already contains a list of available variables. -If you need additional variables, you can update add a *personal_catalog.csv* file to the same folder (template on `GitHub `_). Three columns are compulsory: +If you need additional variables, you can add a *personal_catalog.csv* file to the same folder (template on `GitHub `_). Three columns are compulsory: - *variable*: A unique name for that variable (user defined). It needs to be different from the variable names already in the built-in catalog. - *url*: OpenDAP URL. - *varname*: Name of the variable in the remote dataset. +If the required variable is from a Copernicus data set, the fields are slightly different: + +- *variable*: A unique name for that variable (user defined). It needs to be different from the variable names already in the built-in catalog. +- *source*: Must be set to "Copernicus" +- *url*: Copernicus Dataset ID +- *varname*: Name of the variable in the remote dataset. 6. Using the package -------------------- diff --git a/docs/source/r-install.rst b/docs/source/r-install.rst index 240aed7..8814ba1 100644 --- a/docs/source/r-install.rst +++ b/docs/source/r-install.rst @@ -7,6 +7,8 @@ Installation instructions for R Assuming you have a version of R installed on your computer, as well as Python3 and pip. This is automatic in all recent Linux distributions. Otherwise instructions are available here: `Python `_ and `pip `_. +If you want to use Copernicus data, you need to install Copernicus Marine API (`instructions `_) and set it up with your Copernicus account (`instructions `_). + 2. Installation @@ -64,14 +66,20 @@ There is also a dictionary named *dap_creds* that is intended to store credentia 3.2. Adding other data sources ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -At the same location, there is a *catalog.csv* file that already contains a list of available variables. If you want to use a dataset from Copernicus, you first need to register on `their website `_ and write your credentials in the *credentials.py* file. +At the same location, there is a *catalog.csv* file that already contains a list of available variables. -If you need additional variables, you can update add a *personal_catalog.csv* file to the same folder (template on `GitHub `_). Three columns are compulsory: +If you need additional variables, you can add a *personal_catalog.csv* file to the same folder (template on `GitHub `_). Three columns are compulsory: - *variable*: A unique name for that variable (user defined). It needs to be different from the variable names already in the built-in catalog. - *url*: OpenDAP URL. - *varname*: Name of the variable in the remote dataset. +If the required variable is from a Copernicus data set, the fields are slightly different: + +- *variable*: A unique name for that variable (user defined). It needs to be different from the variable names already in the built-in catalog. +- *source*: Must be set to "Copernicus" +- *url*: Copernicus Dataset ID +- *varname*: Name of the variable in the remote dataset. 4. Using the package -------------------- diff --git a/docs/source/satellite.rst b/docs/source/satellite.rst index 4942faa..5554133 100644 --- a/docs/source/satellite.rst +++ b/docs/source/satellite.rst @@ -16,10 +16,14 @@ Other functions (for internal use) .. autofunction:: geoenrich.satellite.create_nc_calculated +.. 
autofunction:: geoenrich.satellite.create_nc_copernicus + .. autofunction:: geoenrich.satellite.ellipsoid_mask .. autofunction:: geoenrich.satellite.get_metadata +.. autofunction:: geoenrich.satellite.get_metadata_copernicus + .. autofunction:: geoenrich.satellite.get_var_catalog .. autofunction:: geoenrich.satellite.insert_multidimensional_slice diff --git a/geoenrich/data/catalog.csv b/geoenrich/data/catalog.csv index 1349b06..f9aeb84 100644 --- a/geoenrich/data/catalog.csv +++ b/geoenrich/data/catalog.csv @@ -1,45 +1,38 @@ variable,source,spatial_resolution,time_resolution,time_coverage,depth_levels,url,varname -backscattering3d,Copernicus,0.25°,7d,1998-01-01 - ongoing,36 levels,https://my.cmems-du.eu/thredds/dodsC/cmems_obs_glo_bgc3d_rep_weekly,bbp bathymetry,NOAA,0.017°,,,surface,https://www.ngdc.noaa.gov/thredds/dodsC/global/ETOPO1_Bed_g_gmt4.nc,z -chlorophyll,Copernicus,4km,1d,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-gapfree-multi-4km_P1D,CHL -chlorophyll3d,Copernicus,0.25°,7d,1998-01-01 - ongoing,36 levels,https://my.cmems-du.eu/thredds/dodsC/cmems_obs_glo_bgc3d_rep_weekly,chl -chlorophyll-occi,OCCI,0.042°,1d,1997-09-06 - 2021-12-31,surface,https://www.oceancolour.org/thredds/dodsC/CCI_ALL-v5.0-DAILY,chlor_a -current3d-u,Copernicus,0.25°,7d,1993-01-01 - ongoing,50 levels,https://nrt.cmems-du.eu/thredds/dodsC/dataset-armor-3d-rep-weekly,ugo -current3d-v,Copernicus,0.25°,7d,1993-01-01 - ongoing,50 levels,https://nrt.cmems-du.eu/thredds/dodsC/dataset-armor-3d-rep-weekly,vgo -diatoms,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,DIATO -dinophytes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,DINO +chlorophyll,Copernicus,4km,1d,1997 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-gapfree-multi-4km_P1D,CHL +current3d-u,Copernicus,0.25°,7d,1993-01-01 - 2022-12-28,50 levels,dataset-armor-3d-rep-weekly,ugo +current3d-v,Copernicus,0.25°,7d,1993-01-01 - 2022-12-28,50 levels,dataset-armor-3d-rep-weekly,vgo +diatoms,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,DIATO +dinophytes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,DINO eke,,,,,,calculated,eke fsle,Aviso+,0.04°,1d,1994-01-04 – ongoing,surface,https://tds.aviso.altimetry.fr/thredds/dodsC/dataset-duacs-dt-global-allsat-madt-fsle,fsle_max fsle-orientation,Aviso+,0.04°,1d,1994-01-04 – ongoing,surface,https://tds.aviso.altimetry.fr/thredds/dodsC/dataset-duacs-dt-global-allsat-madt-fsle,theta_max -geos-current-u,Copernicus,0.25°,1d,1993-01-01 - 2019-12-31,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-sl_glo_phy-ssh_my_allsat-l4-duacs-0.25deg_P1D,ugosa -geos-current-u2,Copernicus,0.25°,1d,2019-12-01 - ongoing,surface,https://nrt.cmems-du.eu/thredds/dodsC/dataset-duacs-nrt-global-merged-allsat-phy-l4,ugosa -geos-current-v,Copernicus,0.25°,1d,1993-01-01 - 2019-12-31,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-sl_glo_phy-ssh_my_allsat-l4-duacs-0.25deg_P1D,vgosa -geos-current-v2,Copernicus,0.25°,1d,2019-12-01 - ongoing,surface,https://nrt.cmems-du.eu/thredds/dodsC/dataset-duacs-nrt-global-merged-allsat-phy-l4,vgosa -green-algae,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,GREEN 
-haptophytes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,HAPTO -microphytoplankton,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,MICRO -mixed-layer-thickness,Copernicus,0.25°,7d,1993-01-01 - ongoing,surface,https://nrt.cmems-du.eu/thredds/dodsC/dataset-armor-3d-rep-weekly,mlotst -nanophytoplankton,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,NANO -ocean-heat-content,NOAA,1°,3m,1955-01-01 - ongoing,surface,https://www.ncei.noaa.gov/thredds-ocean/dodsC/woa/heat_content/heat_content/heat_content_anomaly_0-700_seasonal.nc,h18_hc -organic-carbon3d,Copernicus,0.25°,7d,1998-01-01 - ongoing,36 levels,https://my.cmems-du.eu/thredds/dodsC/cmems_obs_glo_bgc3d_rep_weekly,poc -oxygen,Copernicus,0.25°,1d,1993-01-01 - 2020-12-31,75 levels,https://my.cmems-du.eu/thredds/dodsC/cmems_mod_glo_bgc_my_0.25_P1D-m,o2 -oxygen2,Copernicus,0.25°,1d,2019-05-04 - ongoing,50 levels,https://nrt.cmems-du.eu/thredds/dodsC/global-analysis-forecast-bio-001-028-daily,o2 -ph,Copernicus,0.25°,1m,1993-01-01 - 2020-12-31,75 levels,https://my.cmems-du.eu/thredds/dodsC/cmems_mod_glo_bgc_my_0.25_P1M-m,ph -ph2,Copernicus,0.25°,1d,2019-05-04 - ongoing,50 levels,https://nrt.cmems-du.eu/thredds/dodsC/global-analysis-forecast-bio-001-028-daily,ph -picophytoplankton,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PICO -primary-production,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-pp_my_l4-multi-4km_P1M,PP -prochlorophytes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PROCHLO -prokaryotes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PROKAR -salinity,SMOS+Aquarius,0.25°,7d,2011-09-01 - 2021-09-08,surface,[FillMismatch]https://thredds.jpl.nasa.gov/thredds/dodsC/SalinityDensity/OISSS_L4_multimission_7day_v1.nc,sss -salinity3d,Copernicus,0.25°,7d,1993-01-01 - ongoing,50 levels,https://nrt.cmems-du.eu/thredds/dodsC/dataset-armor-3d-rep-weekly,so -sla,Copernicus,0.25°,1d,1993-01-01 - 2019-12-31,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-sl_glo_phy-ssh_my_allsat-l4-duacs-0.25deg_P1D,sla -sla2,Copernicus,0.25°,1d,2019-12-01 - ongoing,surface,https://nrt.cmems-du.eu/thredds/dodsC/dataset-duacs-nrt-global-merged-allsat-phy-l4,sla -sst,Copernicus,0.05°,1d,1981-10-01 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/METOFFICE-GLO-SST-L4-REP-OBS-SST,analysed_sst -surface-current-u,Copernicus,0.25°,3h,1993-10-01 - ongoing,"surface, 15m",https://my.cmems-du.eu/thredds/dodsC/dataset-uv-rep-hourly,uo -surface-current-v,Copernicus,0.25°,3h,1993-10-01 - ongoing,"surface, 15m",https://my.cmems-du.eu/thredds/dodsC/dataset-uv-rep-hourly,vo +geos-current-u,Copernicus,0.25°,1d,1993-01-01 - 2023-06-07,surface,cmems_obs-sl_glo_phy-ssh_my_allsat-l4-duacs-0.25deg_P1D,ugosa +geos-current-v,Copernicus,0.25°,1d,1993-01-01 - 2023-06-07,surface,cmems_obs-sl_glo_phy-ssh_my_allsat-l4-duacs-0.25deg_P1D,vgosa +green-algae,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,GREEN +haptophytes,Copernicus,4km,1m,1997-09-04 - 
ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,HAPTO +microphytoplankton,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,MICRO +mixed-layer-thickness,Copernicus,0.083°,1d,1993-01-01 - 2021-06-30,surface,cmems_mod_glo_phy_my_0.083deg_P1D-m,mlotst +nanophytoplankton,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,NANO +nitrate,Copernicus,0.25°,1d,1993-01-01 - 2022-12-31,75 levels,cmems_mod_glo_bgc_my_0.25deg_P1D-m,no3 +organic-carbon3d,Copernicus,0.25°,7d,1998-01-01 - 2021-12-29,36 levels,cmems_obs_glo_bgc3d_rep_weekly,poc +oxygen,Copernicus,0.25°,1d,1993-01-01 - 2022-12-31,75 levels,cmems_mod_glo_bgc_my_0.25deg_P1D-m,o2 +ph,Copernicus,0.25°,1m,1985-01-01 - 2022-12-01,surface,dataset-carbon-rep-monthly,ph +picophytoplankton,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PICO +primary-production,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-pp_my_l4-multi-4km_P1M,PP +prochlorophytes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PROCHLO +prokaryotes,Copernicus,4km,1m,1997-09-04 - ongoing,surface,cmems_obs-oc_glo_bgc-plankton_my_l4-multi-4km_P1M,PROKAR +salinity3d,Copernicus,0.083°,1d,1993-01-01 - ongoing,50 levels,cmems_mod_glo_phy_my_0.083deg_P1D-m,so +ssh,Copernicus,0.083°,1d,1993-01-01 - ongoing,50 levels,cmems_mod_glo_phy_my_0.083deg_P1D-m,zos +sst,Copernicus,0.05°,1d,1981-10-01 - ongoing,surface,METOFFICE-GLO-SST-L4-REP-OBS-SST,analysed_sst +surface-current-u,Copernicus,0.25°,1h,1993-01-01 - ongoing,"surface, 15m",cmems_obs_mob_glo_phy-cur_my_0.25deg_PT1H-i,uo +surface-current-v,Copernicus,0.25°,1h,1993-01-01 - ongoing,"surface, 15m",cmems_obs_mob_glo_phy-cur_my_0.25deg_PT1H-i,vo surface-wind-u,CCMP,0.25°,6h,1987-07-02 - 2011-12-31,surface,http://apdrc.soest.hawaii.edu:80/dods/public_data/satellite_product/CCMP/6hourly_v2,uwnd surface-wind-v,CCMP,0.25°,6h,1987-07-02 - 2011-12-31,surface,http://apdrc.soest.hawaii.edu:80/dods/public_data/satellite_product/CCMP/6hourly_v2,vwnd -temperature3d,Copernicus,0.25°,7d,1993-01-01 - ongoing,50 levels,https://nrt.cmems-du.eu/thredds/dodsC/dataset-armor-3d-rep-weekly,to -wave-height,Copernicus,2°,1d,2002-01-01 - ongoing,surface,https://my.cmems-du.eu/thredds/dodsC/cmems_obs-wave_glo_phy-swh_my_multi-l4-2deg_P1D,VAVH_INST -wind-u,Copernicus,0.125°,1h,2020-07-01 – ongoing,surface,https://nrt.cmems-du.eu/thredds/dodsC/cmems_obs-wind_glo_phy_nrt_l4_0.125deg_PT1H,eastward_wind -wind-v,Copernicus,0.125°,1h,2020-07-01 – ongoing,surface,https://nrt.cmems-du.eu/thredds/dodsC/cmems_obs-wind_glo_phy_nrt_l4_0.125deg_PT1H,northward_wind +temperature3d,Copernicus,0.083°,1d,1993-01-01 - ongoing,50 levels,cmems_mod_glo_phy_my_0.083deg_P1D-m,thetao +wave-height,Copernicus,0.2°,3h,1993-01-01 - 2023-04-30,surface,cmems_mod_glo_wav_my_0.2deg_PT3H-i,VHM0 +wind-u,Copernicus,0.125°,1h,2007-01-11 – ongoing,surface,cmems_obs-wind_glo_phy_my_l4_0.125deg_PT1H,eastward_wind +wind-v,Copernicus,0.125°,1h,2007-01-11 – ongoing,surface,cmems_obs-wind_glo_phy_my_l4_0.125deg_PT1H,northward_wind +wind-u-old,Copernicus,0.25°,1h,1994-06-01 – 2009-10-31,surface,cmems_obs-wind_glo_phy_my_l4_0.25deg_PT1H,eastward_wind +wind-v-old,Copernicus,0.25°,1h,1994-06-01 – 2009-10-31,surface,cmems_obs-wind_glo_phy_my_l4_0.25deg_PT1H,northward_wind diff --git a/geoenrich/data/webapp_turtles.csv b/geoenrich/data/webapp_turtles.csv index b292f13..a36f96b 100644 --- 
a/geoenrich/data/webapp_turtles.csv +++ b/geoenrich/data/webapp_turtles.csv @@ -1,4 +1,4 @@ -id,latitude,longitude,date,Comments -turtle1,-28.752241,154.8926541,2018-07-29,bottom feeding -turtle2,2.5754611,72.964164,2019-02-13,cruising -turtle3,-21.2871554,55.316446,2021-01-05,resting +id,latitude,longitude,depth,date,Comments +turtle1,-28.752241,154.8926541,14,2018-07-29,bottom feeding +turtle2,2.5754611,72.964164,3,2019-02-13,cruising +turtle3,-21.2871554,55.316446,8,2021-01-05,resting diff --git a/geoenrich/dataloader.py b/geoenrich/dataloader.py index a82971b..6b8945c 100644 --- a/geoenrich/dataloader.py +++ b/geoenrich/dataloader.py @@ -82,7 +82,7 @@ def request_from_gbif(taxon_key, override = False): for e in l['results']: preds = e['request']['predicate']['predicates'] for predicate in preds: - if predicate['key'] == 'TAXON_KEY' and predicate['value'] == str(taxonKey): + if predicate['key'] == 'TAXON_KEY' and predicate['value'] == str(taxon_key): existing = True if not(override): print('Request already made on ' + e['created']) @@ -90,7 +90,7 @@ def request_from_gbif(taxon_key, override = False): request_id = e['key'] if not(existing) or override: - req = ['taxonKey = {}'.format(taxonKey), 'hasCoordinate = True'] + req = ['taxonKey = {}'.format(taxon_key), 'hasCoordinate = True'] res = occ.download(req, user=gbif_username, pwd=gbif_pw, email = email, pred_type='and') return(res[0]) @@ -195,7 +195,7 @@ def open_dwca(path = None, taxonKey = None, max_number = 10000): -def import_occurrences_csv(path, id_col, date_col, lat_col, lon_col, date_format = None, +def import_occurrences_csv(path, id_col, date_col, lat_col, lon_col, depth_col = None, date_format = None, crs="EPSG:4326", *args, **kwargs): @@ -211,6 +211,7 @@ def import_occurrences_csv(path, id_col, date_col, lat_col, lon_col, date_format date_col (int or str): Name or index of the column containing occurrence dates. lat_col (int or str): Name or index of the column containing occurrence latitudes (decimal degrees). lon_col (int or str): Name or index of the column containing occurrence longitudes (decimal degrees). + depth_col (int or str): Name or index of the column containing occurrence depths (meters from the surface). date_format (str): To avoid date parsing mistakes, specify your date format (according to strftime syntax). crs (str): Crs of the provided coordinates. 
Returns: @@ -218,8 +219,11 @@ def import_occurrences_csv(path, id_col, date_col, lat_col, lon_col, date_format """ # Load file + if depth_col is None: + columns = [id_col, date_col, lat_col, lon_col] + else: + columns = [id_col, date_col, lat_col, lon_col, depth_col] - columns = [id_col, date_col, lat_col, lon_col] rawdf = pd.read_csv(path, usecols = columns, index_col = id_col, *args, **kwargs) idf = rawdf.dropna(subset = [lat_col, lon_col]) @@ -228,7 +232,10 @@ def import_occurrences_csv(path, id_col, date_col, lat_col, lon_col, date_format print('Dropped {} rows with missing coordinates'.format(len(rawdf) - len(idf))) # Convert Lat/Long to GEOS POINT - idf['geometry'] = gpd.points_from_xy(idf[lon_col], idf[lat_col], crs=crs) + if depth_col is None: + idf['geometry'] = gpd.points_from_xy(idf[lon_col], idf[lat_col], crs=crs) + else: + idf['geometry'] = gpd.points_from_xy(idf[lon_col], idf[lat_col], idf[depth_col].abs(), crs=crs) # Remove rows with no event date idf['eventDate'] = pd.to_datetime(idf[date_col], errors = 'coerce', format = date_format) diff --git a/geoenrich/enrichment.py b/geoenrich/enrichment.py index fcf0860..7e04d4d 100644 --- a/geoenrich/enrichment.py +++ b/geoenrich/enrichment.py @@ -19,9 +19,15 @@ from tqdm import tqdm +import copernicusmarine +import xarray as xr + import geoenrich from geoenrich.satellite import * +import logging +logging.getLogger("copernicus_marine_root_logger").setLevel("WARN") + try: from geoenrich.credentials import * except: @@ -57,7 +63,7 @@ def enrich(dataset_ref, var_id, geo_buff = None, time_buff = None, depth_request var_id (str): ID of the variable to download. geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers). time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date. - depth_request (str): Used when depth is a dimension. 'surface' only downloads surface data. Anything else downloads everything. + depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data. downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key. slice (int tuple): Slice of the enrichment file to use for enrichment. maxpoints(int): Maximum number of points to download. 
@@ -69,6 +75,8 @@ def enrich(dataset_ref, var_id, geo_buff = None, time_buff = None, depth_request original, enrichment_metadata = load_enrichment_file(dataset_ref) + print(f"Starting enrichment for variable '{var_id}' on dataset '{dataset_ref}'...") + input_type = enrichment_metadata['input_type'] enrichments = enrichment_metadata['enrichments'] @@ -93,6 +101,10 @@ def enrich(dataset_ref, var_id, geo_buff = None, time_buff = None, depth_request if var_source['url'] == 'calculated': indices = enrich_compute(to_enrich, var_id, geo_buff, time_buff, downsample) + elif var_source['source'] == 'Copernicus': + indices = enrich_copernicus(to_enrich, var_source['varname'], var_id, var_source['url'], + geo_buff, time_buff, depth_request, downsample, maxpoints, + force_download) else: indices = enrich_download( to_enrich, var_source['varname'], var_id, var_source['url'], geo_buff, time_buff, depth_request, downsample, maxpoints, @@ -187,7 +199,8 @@ def enrich_compute(geodf, var_id, geo_buff, time_buff, downsample): dimdict_2, _ = get_metadata(remote_ds, firstvar) remote_ds.close() t1, t2 = min(dimdict_2['time']['vals']), max(dimdict_2['time']['vals']) - geodf2 = geodf[(geodf['mint'] >= t1) & (geodf['maxt'] <= t2)] + time_res = (t2 - t1) / (len(dimdict_2['time']['vals']) - 1) + geodf2 = geodf[(geodf['mint'] >= t1 - time_res) & (geodf['maxt'] <= t2 + time_res)] print('Ignoring {} rows because data is not available at these dates'.format(len(geodf) - len(geodf2))) else: geodf2 = geodf @@ -241,7 +254,7 @@ def enrich_download(geodf, varname, var_id, url, geo_buff, time_buff, depth_requ url (str): Dataset url (including credentials if needed). geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers). time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date. - depth_request (str): For 4D data: 'surface' only download surface data. Anything else downloads everything. + depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data. downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key. maxpoints(int): Maximum number of points to download. force_download(bool): If True, download data regardless of cache status. @@ -290,7 +303,8 @@ def enrich_download(geodf, varname, var_id, url, geo_buff, time_buff, depth_requ if 'time' in dimdict: t1, t2 = min(dimdict['time']['vals']), max(dimdict['time']['vals']) - geodf2 = geodf[(geodf['mint'] >= t1) & (geodf['maxt'] <= t2)] + time_res = (t2 - t1) / (len(dimdict['time']['vals']) - 1) + geodf2 = geodf[(geodf['mint'] >= t1 - time_res) & (geodf['maxt'] <= t2 + time_res)] print('Ignoring {} rows because data is not available at these dates'.format(len(geodf) - len(geodf2))) else: geodf2 = geodf @@ -339,6 +353,116 @@ def enrich_download(geodf, varname, var_id, url, geo_buff, time_buff, depth_requ return(res) +def enrich_copernicus(geodf, varname, var_id, dataset_id, geo_buff, time_buff, depth_request, downsample, maxpoints, force_download): + + """ + Download Copernicus data for the requested occurrences and buffer into local netcdf file. + Calculate and return indices of the data of interest in the ncdf file. + + Args: + geodf (geopandas.GeoDataFrame): Data to be enriched. + varname(str): Variable name in the dataset. 
+ var_id (str): ID of the variable to download. + dataset_id (str): Copernicus dataset ID. + geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers). + time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date. + depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data. + downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key. + maxpoints(int): Maximum number of points to download. + force_download(bool): If True, download data regardless of cache status. + + Returns: + pandas.DataFrame: DataFrame with indices of relevant data in the netCDF file. + + """ + + # Get netcdf metadata + + remote_ds = copernicusmarine.open_dataset(dataset_id = dataset_id) + + dimdict, var = get_metadata_copernicus(remote_ds, varname) + var['var_id'] = var_id + + # Add bounds if occurrences + + if 'minx' not in geodf.columns: + if geo_buff is None or (time_buff is None and 'time' in dimdict): + raise BufferError('Please specify time_buff and geo_buff.') + geodf = add_bounds(geodf, geo_buff, time_buff) + + + # Check if local netcdf files already exist + + if not(Path(sat_path, var_id + '.nc').exists()) or \ + not(Path(sat_path, var_id + '_downloaded.nc').exists()): + + create_nc_copernicus(get_var_catalog()[var_id]) + + # Backup local netCDF files + + timestamp = datetime.now().strftime('%d-%H-%M') + shutil.copy2( str(Path(sat_path, var_id + '.nc')), + str(Path(sat_path, var_id + '.nc.' + timestamp))) + shutil.copy2( str(Path(sat_path, var_id + '_downloaded.nc')), + str(Path(sat_path, var_id + '_downloaded.nc.' + timestamp))) + + # Load files + + local_ds = nc.Dataset(str(Path(sat_path, var_id + '.nc.' + timestamp)), mode ='r+') + bool_ds = nc.Dataset(str(Path(sat_path, var_id + '_downloaded.nc.' 
+ timestamp)), mode ='r+') + + # Remove out of timeframe datapoints + + if 'time' in dimdict: + t1, t2 = min(dimdict['time']['vals']), max(dimdict['time']['vals']) + time_res = (t2 - t1) / (len(dimdict['time']['vals']) - 1) + geodf2 = geodf[(geodf['mint'] >= t1 - time_res) & (geodf['maxt'] <= t2 + time_res)] + print('Ignoring {} rows because data is not available at these dates'.format(len(geodf) - len(geodf2))) + else: + geodf2 = geodf + + # Apply query to each row sequentially + + if not(len(geodf2)): + print('No data in input dataframe.') + return(pd.DataFrame()) + + geodf2['ind'] = geodf2.apply(calculate_indices, axis = 1, args = (dimdict, var, depth_request, downsample)) + + if maxpoints is not None and (s:= checksize(geodf2['ind'])) > maxpoints: + + print(f"You are requesting a download of {s:,} points and the limit is set to {maxpoints:,}\n" + "Please reduce your buffer size, your number of occurrences, or use geoenrich locally") + res = pd.DataFrame() + + else: + res = geodf2.progress_apply(row_enrich, axis=1, args = (remote_ds, local_ds, bool_ds, dimdict, var, depth_request, downsample, force_download), + result_type = 'expand') + + # Update time variable in local dataset if needed + + if 'time' in dimdict and local_ds.variables[dimdict['time']['name']][:].mask.any(): + + local_ds.variables[dimdict['time']['name']][:] = remote_ds.variables[dimdict['time']['name']][:] + + + # Close datasets + + local_ds.close() + bool_ds.close() + remote_ds.close() + + + # Remove backup + + Path(sat_path, var_id + '.nc').unlink() + Path(sat_path, var_id + '_downloaded.nc').unlink() + + Path(sat_path, var_id + '.nc.' + timestamp).rename(Path(sat_path, var_id + '.nc')) + Path(sat_path, var_id + '_downloaded.nc.' + timestamp).rename(Path(sat_path, var_id + '_downloaded.nc')) + + print('Enrichment over') + return(res) def checksize(ind): @@ -412,6 +536,9 @@ def add_bounds(geodf1, geo_buff, time_buff): geodf['bestt'] = pd.to_datetime(geodf['eventDate']) geodf['maxt'] = pd.to_datetime(geodf['eventDate'] + buff2) + if geodf['geometry'].z.notna().any(): + geodf['bestz'] = geodf['geometry'].z + return(geodf) @@ -432,7 +559,7 @@ def row_enrich(row, remote_ds, local_ds, bool_ds, dimdict, var, depth_request, d bool_ds (netCDF4.Dataset): Local dataset recording whether data has already been downloaded. dimdict (dict): Dictionary of dimensions as returned by :func:`geoenrich.satellite.get_metadata`. var (dict): Variable dictionary as returned by :func:`geoenrich.satellite.get_metadata`. - depth_request (str): For 4D data: 'surface' only download surface data. Anything else downloads everything. + depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data. downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key. force_download(bool): If True, download data regardless of cache status. Returns: @@ -552,7 +679,7 @@ def calculate_indices(row, dimdict, var, depth_request, downsample): row (pandas.Series): GeoDataFrame row to enrich. dimdict (dict): Dictionary of dimensions as returned by geoenrich.satellite.get_metadata. var (dict): Variable dictionary as returned by geoenrich.satellite.get_metadata. - depth_request (str): For 4D data: 'surface' only download surface data. Anything else downloads everything. + depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data. 
downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key. Returns: dict: Dictionary of indices for each dimension (keys are standard dimension names). @@ -597,15 +724,24 @@ def calculate_indices(row, dimdict, var, depth_request, downsample): t1 = np.argmin( np.abs( dimdict['time']['vals'] - row['bestt'] ) ) ind['time']['best'] = t1 - # if depth is a dimension, either select surface layer or return everything + # if depth is a dimension, select surface layer, nearest lower value or everything if ('depth' in dimdict) and (dimdict['depth']['name'] in var['params']): - if depth_request == 'surface': - d1 = np.argmin( np.abs( dimdict['depth']['vals'] ) ) + if depth_request == 'nearest' and pd.notna(row['bestz']): + diffs = (row['bestz'] - dimdict['depth']['vals']).astype('float') + diffs[diffs < 0] = np.nan + d1 = np.nanargmin(diffs) ind['depth'] = {'min': d1, 'max': d1, 'best': d1, 'step': 1} - else: + + elif depth_request == 'all': ind['depth'] = {'min': 0, 'max': len(dimdict['depth']['vals']) - 1, 'best': None, 'step': 1} + else: + # Surface + d1 = np.argmin( np.abs( dimdict['depth']['vals'] ) ) + ind['depth'] = {'min': d1, 'max': d1, 'best': d1, 'step': 1} + + for dim in downsample: ind[dim]['step'] = downsample[dim] + 1 @@ -650,7 +786,7 @@ def download_data(remote_ds, local_ds, bool_ds, var, dimdict, ind, force_downloa elif ('time' in ind) and (check.ndim == len(ind)): - # If time is a dimension, check wich timepoints already have the data. + # If time is a dimension, check which timepoints already have the data. time_pos = var['params'].index(dimdict['time']['name']) expected_lentime = 1 + (ind['time']['max'] - ind['time']['min']) // ind['time']['step'] @@ -951,7 +1087,7 @@ def get_enrichment_id(enrichments, var_id, geo_buff, time_buff, depth_request, d var_id (str): ID of the variable to download. geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers). time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date. - depth_request (str): Used when depth is a dimension. 'surface' only downloads surface data. Anything else downloads everything. + depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data. downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key. Returns: @@ -988,7 +1124,7 @@ def save_enrichment_config(dataset_ref, enrichment_id, var_id, geo_buff, time_bu var_id (str): ID of the variable to download. geo_buff (int): Geographic buffer for which to download data around occurrence point (kilometers). time_buff (float list): Time bounds for which to download data around occurrence day (days). For instance, time_buff = [-7, 0] will download data from 7 days before the occurrence to the occurrence date. - depth_request (str): Used when depth is a dimension. 'surface' only downloads surface data. Anything else downloads everything. + depth_request (str): For 4D data: 'all' -> data for all depths. 'nearest' -> closest lower available depth. Anything else downloads surface data. downsample (dict): Number of points to skip between each downloaded point, for each dimension, using its standard name as a key. 
Returns: None diff --git a/geoenrich/exports.py b/geoenrich/exports.py index c314c34..d8909d0 100644 --- a/geoenrich/exports.py +++ b/geoenrich/exports.py @@ -665,7 +665,7 @@ def export_raster(dataset_ref, occ_id, var_id, path = Path('./'), geo_buff = Non print('Abort. Array is smaller than 2x2 pixels.') -def collate_npy(ds_ref, data_path, output_res = 32, slice = None, dimension3 = {'surface-current-u': 2}): +def collate_npy(ds_ref, data_path, output_res = 32, slice = None, dimension3 = {'example-var': 2}): """ Export a 3D numpy array with all layers for each occurrence of a dataset. diff --git a/geoenrich/satellite.py b/geoenrich/satellite.py index 3e0a8f2..c398f35 100644 --- a/geoenrich/satellite.py +++ b/geoenrich/satellite.py @@ -8,9 +8,11 @@ from pathlib import Path from datetime import datetime -from cftime import num2date, num2pydate +import pytz +from cftime import num2date, num2pydate, date2num import geoenrich +import copernicusmarine try: from geoenrich.credentials import * @@ -47,10 +49,11 @@ def get_metadata(ds, varname): if 'months since' in ds.variables[name].__dict__['units']: times = num2date(ds.variables[name][:], ds.variables[name].__dict__['units'], '360_day') else: + cal = getattr(ds.variables[name], 'calendar', 'gregorian') if varname in ['uwnd', 'vwnd']: - times = num2pydate(ds.variables[name][:] - 725563, 'days since 1987-01-01 00:00:00') + times = num2pydate(ds.variables[name][:] - 725563, 'days since 1987-01-01 00:00:00', cal) else: - times = num2pydate(ds.variables[name][:], ds.variables[name].__dict__['units']) + times = num2pydate(ds.variables[name][:], ds.variables[name].__dict__['units'], cal) times = pd.Series([datetime(*d.timetuple()[:-3]) for d in times]) item = {'name': name, 'standard_name': 'time', 'vals': times, 'unit': None} dimdict[name] = item @@ -105,6 +108,77 @@ def get_metadata(ds, varname): return dimdict, var +def get_metadata_copernicus(ds, varname): + + """ + Download and format useful metadata on dimensions and variables from a Copernicus dataset. + Generate a dictionary where dimensions can be accessed both with their original name and their standard name (if available). + + Args: + ds (xarray.Dataset): Dataset of interest. + varname (str): Name of the variable of interest in the dataset. + Returns: + dict, dict: dictionary with standardized information on dimensions, dictionary with information on the variable. 
+ """ + + dimdict = {} + var = None + + for name in ds.variables: + + # Format time dimension + + if name in ['time', 'time_agg']: + item = {'name': name, 'standard_name': 'time', 'vals': pd.Series(ds.variables['time']), 'unit': None} + dimdict[name] = item + dimdict['time'] = item + + # Format lon & lat dimensions + + elif ('standard_name' in ds.variables[name].attrs) and (ds.variables[name].attrs['standard_name'] in ['longitude', 'latitude', 'depth']): + + item = {'name': name, + 'standard_name': ds.variables[name].attrs['standard_name'], + 'vals': ds.variables[name].data, + 'unit': ds.variables[name].attrs['units']} + dimdict[name] = item + dimdict[ds.variables[name].attrs['standard_name']] = item + + # Format requested variable + + elif name == varname: + + var = {'name':name, + 'unit': ds.variables[name].attrs['units'], + 'params': ds.variables[name].dims} + + if 'standard_name' in ds.variables[name].attrs: + var['standard_name'] = ['standard_name'] + + + # Search for latitude and longitude in case standard names were not provided + + elif name in ['lat', 'latitude']: + + item = {'name': name, + 'standard_name': 'latitude', + 'vals': ds.variables[name].data, + 'unit': ds.variables[name].attrs['units']} + dimdict[name] = item + dimdict['latitude'] = item + + elif name in ['lon', 'longitude']: + + item = {'name': name, + 'standard_name': 'longitude', + 'vals': ds.variables[name].data, + 'unit': ds.variables[name].attrs['units']} + dimdict[name] = item + dimdict['longitude'] = item + + + return dimdict, var + def get_var_catalog(): @@ -123,7 +197,15 @@ def get_var_catalog(): var_catalog = pd.read_csv(path / 'data' / 'catalog.csv', index_col = 0).to_dict('index') if (path / 'data' / 'personal_catalog.csv').exists(): - pers_vars = pd.read_csv(path / 'data' / 'personal_catalog.csv', index_col = 0).to_dict('index') + cat_comma = pd.read_csv(path / 'data' / 'personal_catalog.csv', index_col = 0) + cat_semicolon = pd.read_csv(path / 'data' / 'personal_catalog.csv', index_col = 0, sep = ';') + + if len(cat_semicolon.columns) > len(cat_comma.columns): + pers_cat = cat_semicolon + else: + pers_cat = cat_comma + + pers_vars = pers_cat.to_dict('index') var_catalog = {**var_catalog, **pers_vars} for v in var_catalog: @@ -155,6 +237,7 @@ def create_nc(var): pathd = Path(sat_path, var['var_id'] + '_downloaded.nc') remote_ds = nc.Dataset(var['url']) + varname = var['varname'] dimdict, var = get_metadata(remote_ds, varname) @@ -182,13 +265,85 @@ def create_nc(var): local_ds.createVariable(varname, variable.dtype, variable.dimensions, zlib = True) local_ds.variables[varname].setncatts({k: variable.getncattr(k) for k in variable.ncattrs()}) - bool_ds.createVariable(varname, 'B', remote_ds.variables[varname].dimensions, zlib = True, fill_value = 0) + bool_ds.createVariable(varname, 'B', variable.dimensions, zlib = True, fill_value = 0) local_ds.close() bool_ds.close() remote_ds.close() +def create_nc_copernicus(var): + + """ + Create empty netcdf file for requested variable for subsequent local storage. + Same dimensions as the online dataset. + + Args: + var (dict): Variable dictionary, as returned by :func:`geoenrich.satellite.get_var_catalog`. 
+ Returns: + None + """ + + path = Path(sat_path, var['var_id'] + '.nc') + pathd = Path(sat_path, var['var_id'] + '_downloaded.nc') + + remote_ds = copernicusmarine.open_dataset(dataset_id = var['url']) + + + varname = var['varname'] + dimdict, var = get_metadata_copernicus(remote_ds, varname) + + local_ds = nc.Dataset(str(path), mode = 'w') + local_ds.set_fill_off() + bool_ds = nc.Dataset(str(pathd), mode = 'w') + + for name, length in remote_ds.sizes.items(): + if ('time' in dimdict) and (name == dimdict['time']['name']): + local_ds.createDimension(name, None) + bool_ds.createDimension(name, None) + else: + local_ds.createDimension(name, length) + bool_ds.createDimension(name, length) + + + # Time conversion + unix_epoch = np.datetime64(0, 's') + one_second = np.timedelta64(1, 's') + + for name, variable in remote_ds.variables.items(): + if (name in dimdict) and (dimdict[name]['standard_name'] in ['time', 'latitude', 'longitude', 'depth']): + + if dimdict[name]['standard_name'] == 'time': + times = [] + for t in variable.data: + seconds_since_epoch = (t - unix_epoch) / one_second + d = datetime.fromtimestamp(seconds_since_epoch, pytz.utc) + times.append(date2num(d, "hours since 1950-01-01 00:00:00", calendar = 'gregorian')) + + local_ds.createVariable(name, 'u4', variable.dims, zlib= True) + local_ds.variables[name][:] = np.array(times) + local_ds.variables[name].axis = "T" + local_ds.variables[name].coverage_content_type = "coordinate" + local_ds.variables[name].standard_name = "time" + local_ds.variables[name].calendar = 'gregorian' + local_ds.variables[name].units = "hours since 1950-01-01 00:00:00" + + else: + local_ds.createVariable(name, variable.dtype, variable.dims, zlib= True) + local_ds.variables[name][:] = variable.data + local_ds.variables[name].setncatts(variable.attrs) + + + variable = remote_ds.variables[varname] + local_ds.createVariable(varname, variable.dtype, variable.dims, zlib = True) + local_ds.variables[varname].setncatts(variable.attrs) + + bool_ds.createVariable(varname, 'B', variable.dims, zlib = True, fill_value = 0) + + local_ds.close() + bool_ds.close() + remote_ds.close() + def create_nc_calculated(var_id): diff --git a/setup.cfg b/setup.cfg index 3df90dc..0eb8422 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,6 +1,6 @@ [metadata] name = geoenrich -version = 0.5.8 +version = 0.6.2 author = Gaétan Morand (UMR Marbec) author_email = gaetan.morand@ird.fr description = A package to enrich your geo-referenced data (e.g. species occurrences) with environmental data. @@ -31,7 +31,7 @@ install_requires = numpy pandas>=2.0.0 geopandas - netCDF4==1.5.8 + netCDF4 python-dwca-reader tqdm opencv-python @@ -42,3 +42,4 @@ install_requires = appdirs geojson_rewind geomet + copernicusmarine
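To illustrate the *personal_catalog.csv* layout for a Copernicus variable described in *install.rst* and *r-install.rst* above, here is a minimal sketch of such a row. The user-defined name *my-temperature3d* is hypothetical; the dataset ID and *varname* are taken from the built-in catalog entry for *temperature3d*. As of v0.6.2 the file may be comma- or semicolon-delimited::

    variable,source,url,varname
    my-temperature3d,Copernicus,cmems_mod_glo_phy_my_0.083deg_P1D-m,thetao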
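On the user side, the two additions (the *depth_col* argument of *import_occurrences_csv* and the 'nearest' *depth_request* mode) combine as in the following sketch. The file name, dataset reference and buffer values are illustrative only, and the enrichment-file creation step, which is unchanged by this changeset, is elided::

    from geoenrich.dataloader import import_occurrences_csv
    from geoenrich.enrichment import enrich

    # Load occurrences with the new depth_col argument
    # (column names follow the turtles.csv example from docs/source/examples.rst).
    gdf = import_occurrences_csv(path = 'turtles.csv',
                                 id_col = 'ID',
                                 date_col = 'Day',
                                 lat_col = 'Lat',
                                 lon_col = 'Lon',
                                 depth_col = 'Depth')

    # ... create the enrichment file for this GeoDataFrame as before (not shown) ...

    # 'temperature3d' is a 4D Copernicus variable in the built-in catalog.
    # depth_request = 'nearest' keeps, for each occurrence, the closest available
    # depth level at or below the recorded depth, so the point stays in the water column.
    enrich('turtles', 'temperature3d',
           geo_buff = 50,            # kilometers around each occurrence (illustrative)
           time_buff = [-7, 0],      # days before the occurrence (docstring example)
           depth_request = 'nearest')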