From adaefd2c30f049a436977e843f0f18ea06a0fa10 Mon Sep 17 00:00:00 2001 From: Yingquan Li Date: Fri, 15 Nov 2024 14:29:27 -0500 Subject: [PATCH] Descr: Adding the Gold team's notebook: MAST Bulk Download through AWS. --- ..._MAST_Data_Bulk_Download_through_AWS.ipynb | 328 ++++++++++++++++++ .../bulk_download/requirements.txt | 1 + 2 files changed, 329 insertions(+) create mode 100644 notebooks/multi_mission/bulk_download/20241108_MAST_Data_Bulk_Download_through_AWS.ipynb create mode 100644 notebooks/multi_mission/bulk_download/requirements.txt diff --git a/notebooks/multi_mission/bulk_download/20241108_MAST_Data_Bulk_Download_through_AWS.ipynb b/notebooks/multi_mission/bulk_download/20241108_MAST_Data_Bulk_Download_through_AWS.ipynb new file mode 100644 index 000000000..438e73ea0 --- /dev/null +++ b/notebooks/multi_mission/bulk_download/20241108_MAST_Data_Bulk_Download_through_AWS.ipynb @@ -0,0 +1,328 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# **MAST Data Bulk Download through AWS**\n", + "Enhance the MAST user experience for astronomers and scientists such that the data download per mission is targeted and seamless!" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Learning Goals\n", + "By using this notebook, an astronomer/scientist will:\n", + "* Understand that downloading data and files in bulk from AWS is feasible.\n", + "* Make targeted queries to MAST using parameters such as: `right ascension`, `declination`, `observation` and more.\n", + "* Filter the resulting products by using parameters such as: `productType`, `productSubGroupDescription`, `productGroupDescription`, `mrp_only`, and more.\n", + "* Use this notebook to programmatically download *.fits* files locally, which may be much easier than the equivalent UI web tool." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Table of Contents\n", + "* Introduction\n", + "* Imports\n", + "* Two Core Functions from astroquery: `query_criteria()` and `filter_products()`\n", + "* The 3-Step Data Download Process" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Introduction\n", + "This notebook contains some sample code to bulk download files from MAST, with examples provided for `GALEX` and `Pan-STARRS (PS1)`. This notebook can be generalized to query data from other missions too such as: `SWIFT`, `HST`, or `IUE`. Please feel free to modify the code to your particular use case! If you have any questions, please don't hesitate to reach out to archive@stsci.edu." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Imports\n", + "* `Observations` from *astroquery.mast* to query the Barbara A. Mikulski Archive for Space Telescopes (MAST)." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "INFO: Using the S3 STScI public dataset [astroquery.mast.cloud]\n" + ] + } + ], + "source": [ + "from astroquery.mast import Observations\n", + "\n", + "# Turning on access to the cloud dataset.\n", + "Observations.enable_cloud_dataset()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Two Core Functions from astroquery: `query_criteria()` and `filter_products()`\n", + "\n", + "`query_criteria()` and `filter_products()` are two methods of the `Observations` class in `astroquery.mast` that enable us to make queries and then filter the corresponding products.\n", + "\n", + "All the parameters that we could use in `query_criteria()` are shown below."
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " Column Name Column Label Data Type Units Description Examples/Valid Values \n", + "--------------------- ------------------------- --------- ---------- ------------------------------------------------------------------------ --------------------------------------------------------------------------------------------------------------------\n", + " intentType Observation Type string Whether observation is for science or calibration. Valid values: science, calibration\n", + " obs_collection Mission string Collection E.g. SWIFT, PS1, HST, IUE\n", + " provenance_name Provenance Name string Provenance name, or source of data E.g. TASOC, CALSTIS, PS1\n", + " instrument_name Instrument string Instrument Name E.g. WFPC2/WFC, UVOT, STIS/CCD\n", + " project Project string Processing project E.g. HST, HLA, EUVE, hlsp_legus\n", + " filters Filters string Instrument filters F469N, NUV, FUV, LOW DISP, MIRROR\n", + " wavelength_region Waveband string Energy Band EUV, XRAY, OPTICAL\n", + " target_name Target Name string Target Name Ex. COMET-67P-CHURYUMOV-GER-UPDATE\n", + "target_classification Target Classification string Type of target Ex. COMET;COMET BEING ORBITED BY THE ROSETTA SPACECRAFT;SOLAR SYSTEM\n", + " obs_id Observation ID string Observation identifier, given by mission U24Z0101T, N4QF18030\n", + " s_ra RA float deg Observation Right Ascension May be displayed in the Portal as hh:mm:ss.sss, but should be searched as decimal\n", + " s_dec Dec float deg Observation Declination May be displayed in the Portal as hh:mm:ss.sss, but should be searched as decimal\n", + " proposal_id Proposal ID string Proposal ID E.g. 
EGCJC, 11360, 9401\n", + " proposal_pi Principal Investigator string Principal investigator's last name Chander, Chu, Malkin\n", + " obs_title Observation Title string Observation description from proposal Age-dating Star Clusters in M101\n", + " dataproduct_type Product Type string Type of product Valid values: IMAGE, SPECTRUM, SED, TIMESERIES, VISIBILITY, EVENTLIST, CUBE, CATALOG, ENGINEERING, NULL\n", + " calib_level Calibration Level integer Calibration level 0 = raw, 1 = uncalibrated, 2 = calibrated, 3 = science product, 4 = contributed science product\n", + " t_min Start Time float MJD Observation start datetime May be displayed in the Portal as YYY-MM-DD HH:MM, but should be searched as MJD\n", + " t_max End Time float MJD Observation end datetime May be displayed in the Portal as YYY-MM-DD HH:MM, but should be searched as MJD\n", + " t_obs_release Release Date float MJD Dataset release date May be displayed in the Portal as YYY-MM-DD HH:MM, but should be searched as MJD\n", + " t_exptime Exposure Length float sec Exposure time \n", + " em_min Min. Wavelength float nm Minimum Wavelength \n", + " em_max Max. Wavelength float nm Maximum Wavelength \n", + " objID Object ID integer Plane ID of observation at given calibration level Long integer, e.g. 2012969445\n", + " s_region s_region string ICRS Shape STC/S Footprint Will be ICRS circle or polygon. E.g. CIRCLE ICRS 17.71740689 -58.40043015 0.625\n", + " jpegURL jpegURL string Preview Image URL https://archive.stsci.edu/hst/previews/N4QF/N4QF18090.jpg\n", + " distance Distance (\") float arcsec Angular separation between searched coordinates and center of obsevation \n", + " obsid Product Group ID integer Database identifier for obs_id Long integer, e.g. 
2007590987\n", + " dataRights Data Rights string Data Rights valid values: public,exclusive_access,restricted\n", + " mtFlag Moving Target boolean Moving Target Flag If True, observation contains a moving target, if False or absent observation may or may not contain a moving target\n", + " srcDen Number of Catalog Objects float Number of cataloged objects found in observation \n", + " dataURL Data URL string Data URL \n", + " proposal_type Proposal Type string Type of telescope proposal Eg. 3PI, GO, GO/DD, HLA, GII, AIS\n", + " sequence_number Sequence Number integer Sequence number, e.g. Kepler quarter or TESS sector \n" + ] + } + ], + "source": [ + "Observations.get_metadata(\"observations\").pprint(max_lines=-1, max_width=-1)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "All the fields that we could filter by in `filter_products()` are listed **[here](https://mast.stsci.edu/api/v0/_productsfields.html)**." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# The 3-Step Data Download Process\n", + "* **STEP 1**: Get the products after making a specific query.\n", + "* **STEP 2**: Filter the products based on specific parameters.\n", + "* **STEP 3**: Download the files locally via Python." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**STEP 1**: When filtering an observation using the function `query_criteria()`, you must specify two coordinates for the right ascension and two coordinates for the declination. This forms a box to limit the search area. You must also supply a mission that you would want to search from such as 'GALEX' or 'PS1'.\n", + "\n", + "If you would like to filter by other parameters, see the other filter parameters above. Please modify this code for your specific use case!"
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "175841" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# - Ex.: s_ra: 30.2,31.2\n", + "# s_dec: -10.25,-9.25\n", + "# obs_collection: GALEX, PS1, SWIFT, etc.\n", + "\n", + "obs = Observations.query_criteria(s_ra=[30.2,31.2], s_dec=[-10.25,-9.25], obs_collection=\"GALEX\")\n", + "prod = Observations.get_product_list(obs)\n", + "len(prod)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**STEP 2**: Now we can use `filter_products()` to select specific products. Right now, this code is configured such that you can filter based on *productType*, *productSubGroupDescription*, *productGroupDescription*, and *mrp_only*. The valid filter parameters for GALEX and Pan-STARRS are outlined below as examples. Please use only these parameters + corresponding values, unless you see another parameter in the documentation (see above) that you would like to use. Please use the right filter products for your specific mission by referring to the documentation (see above)!\n", + "\n", + "**GALEX Example**\n", + "* productType: *AUXILIARY*, *CATALOG*, *INFO*, *PREVIEW*, *SCIENCE*, *THUMBNAIL*\n", + "* productSubGroupDescription: *Catalog Only*, *Imaging Only*, *Spectra Only*, *Spectral Image Strips Only*, *Whole Field Images Only*\n", + "* productGroupDescription: *Minimum Recommended Products*\n", + "* mrp_only: *True*, *False*.\n", + "\n", + "**Pan-STARRS (PS1) Example**\n", + "* productType: *AUXILIARY*, *CATALOG*, *INFO*, *SCIENCE*\n", + "* productSubGroupDescription: - \n", + "* productGroupDescription: *Minimum Recommended Products*\n", + "* mrp_only: *True*, *False*\n", + "\n", + "Note that *productSubGroupDescription* and *productGroupDescription* may not be needed when filtering for Pan-STARRS products. 
An example for 'GALEX' is provided below as well as an example for PS1. Please modify this code for your specific use case!\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "1785" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Table length=5\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "
obsIDobs_collectiondataproduct_typeobs_iddescriptiontypedataURIproductTypeproductGroupDescriptionproductSubGroupDescriptionproductDocumentationURLprojectprvversionproposal_idproductFilenamesizeparent_obsiddataRightscalib_levelfilters
str7str3str5str43str32str1str71str9str28str1str1str3str3str1str54int64str7str10int64str1
1971976PS1imagerings.v3.skycell.1062.040.stk.gstack data imageCmast:PS1/product/rings.v3.skycell.1062.040.stk.g.unconv.fitsSCIENCEMinimum Recommended Products----3PIpv3--rings.v3.skycell.1062.040.stk.g.unconv.fits668073601971976PUBLIC3g
1971977PS1imagerings.v3.skycell.1062.040.stk.istack data imageCmast:PS1/product/rings.v3.skycell.1062.040.stk.i.unconv.fitsSCIENCEMinimum Recommended Products----3PIpv3--rings.v3.skycell.1062.040.stk.i.unconv.fits674352001971977PUBLIC3i
1971978PS1imagerings.v3.skycell.1062.040.stk.rstack data imageCmast:PS1/product/rings.v3.skycell.1062.040.stk.r.unconv.fitsSCIENCEMinimum Recommended Products----3PIpv3--rings.v3.skycell.1062.040.stk.r.unconv.fits676396801971978PUBLIC3r
1971979PS1imagerings.v3.skycell.1062.040.stk.ystack data imageCmast:PS1/product/rings.v3.skycell.1062.040.stk.y.unconv.fitsSCIENCEMinimum Recommended Products----3PIpv3--rings.v3.skycell.1062.040.stk.y.unconv.fits678844801971979PUBLIC3y
1971980PS1imagerings.v3.skycell.1062.040.stk.zstack data imageCmast:PS1/product/rings.v3.skycell.1062.040.stk.z.unconv.fitsSCIENCEMinimum Recommended Products----3PIpv3--rings.v3.skycell.1062.040.stk.z.unconv.fits672163201971980PUBLIC3z
" + ], + "text/plain": [ + "\n", + " obsID obs_collection dataproduct_type ... dataRights calib_level filters\n", + " str7 str3 str5 ... str10 int64 str1 \n", + "------- -------------- ---------------- ... ---------- ----------- -------\n", + "1971976 PS1 image ... PUBLIC 3 g\n", + "1971977 PS1 image ... PUBLIC 3 i\n", + "1971978 PS1 image ... PUBLIC 3 r\n", + "1971979 PS1 image ... PUBLIC 3 y\n", + "1971980 PS1 image ... PUBLIC 3 z" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# - Ex. (GALEX): productType: SCIENCE\n", + "# productSubGroupDescription: Imaging Only\n", + "# productGroupDescription: Minimum Recommended Products\n", + "# mrp_only: True\n", + "\n", + "# - Ex. (PS1): productType: \n", + "# productSubGroupDescription: \n", + "# productGroupDescription: \n", + "# mrp_only: True\n", + "\n", + "# Use this for the 'GALEX' example.\n", + "filt_prod = Observations.filter_products(\n", + " prod,\n", + " productType=\"SCIENCE\",\n", + " productSubGroupDescription=\"Imaging Only\",\n", + " productGroupDescription=\"Minimum Recommended Products\",\n", + " mrp_only=True\n", + ")\n", + "\n", + "# Shows how many files are left after applying the filter.\n", + "display(len(filt_prod))\n", + "\n", + "# Shows the first 5 files from the filtered table.\n", + "display(filt_prod[0:5])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**STEP 3**: Download the files to your local computer. The line below will download the first five files only. Please modify this code for your specific use case, especially if you need to download more than five files!" 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "Observations.download_products(filt_prod[0:5], cloud_only=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# About this Notebook\n", + "\n", + "* **Authors**: Yingquan Li, Bernie Shao\n", + "* **Keywords**: GALEX, Pan-STARRS, Bulk Download, Python, AWS\n", + "* **Updated On**: 2024-11-08\n", + "\n", + "For support, please contact the Archive HelpDesk at archive@stsci.edu." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "dev", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/notebooks/multi_mission/bulk_download/requirements.txt b/notebooks/multi_mission/bulk_download/requirements.txt new file mode 100644 index 000000000..bf3d5bab9 --- /dev/null +++ b/notebooks/multi_mission/bulk_download/requirements.txt @@ -0,0 +1 @@ +astropy >= 6.1.4 \ No newline at end of file