From ea6901803a3b849c20c1ade42c34180201486002 Mon Sep 17 00:00:00 2001 From: Peter Teuben Date: Wed, 9 Oct 2024 18:19:40 -0400 Subject: [PATCH 01/11] fuller test to also test large data and URL based downloads --- notebooks/developer/test_dysh_data.py | 55 +++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 notebooks/developer/test_dysh_data.py diff --git a/notebooks/developer/test_dysh_data.py b/notebooks/developer/test_dysh_data.py new file mode 100644 index 00000000..a337475a --- /dev/null +++ b/notebooks/developer/test_dysh_data.py @@ -0,0 +1,55 @@ +#!/usr/bin/env python3 + +""" +Testing dysh_data ( a full test is not possible in CI's pytest) + +""" + +import os +from astropy.io import fits +import astropy.units as u + +from dysh.fits.sdfitsload import SDFITSLoad +from dysh.fits.gbtfitsload import GBTFITSLoad +from dysh.util.files import dysh_data + +# some more liberal panda dataframe display options +import pandas as pd +pd.set_option('display.max_rows', None) +pd.set_option('display.max_columns', None) +pd.set_option('display.width', 1000) + +# pd.options.display.max_columns = None + +dd = os.environ["DYSH_DATA"] +print("Current value of DYSH_DATA:", dd) + + +#%% debugging + +import dysh +dysh.log.init_logging(3) # 0=ERROR 1=WARNING 2=INFO 3=DEBUG + +#%% + +# A number of these could easily fail if you don't have your $DYSH_DATA set and populated +# but they should always work at GBO + +print(dysh_data(example="nod-KFPA/data/TGBT22A_503_02.raw.vegas.trim.fits", verbose=True)) +print(dysh_data(example="nod-KFPA/data/TGBT22A_503_02.raw.vegas")) +print(dysh_data(example="nod")) +print(dysh_data(example="getps")) +print(dysh_data(example="test1")) + +print(dysh_data(test="test1")) +print(dysh_data(test="getps")) +print(dysh_data(test="TRCO_230413_Ka/TRCO_230413_Ka_scan43.fits")) + +print(dysh_data(accept='AGBT22A_325_15/AGBT22A_325_15.raw.vegas')) +print(dysh_data(accept='nod1')) + +print(dysh_data("AGBT21B_024_01")) +print(dysh_data("AGBT21B_024_01/AGBT21B_024_01.raw.vegas")) + + + From 56519cf4b5f426f981cba64fc5b3388011e28ae9 Mon Sep 17 00:00:00 2001 From: Peter Teuben Date: Wed, 9 Oct 2024 18:20:12 -0400 Subject: [PATCH 02/11] remote the wget option; fix a filename issue --- src/dysh/util/files.py | 40 ++++++++++------------------------------ 1 file changed, 10 insertions(+), 30 deletions(-) diff --git a/src/dysh/util/files.py b/src/dysh/util/files.py index 1848e895..197ad0fb 100755 --- a/src/dysh/util/files.py +++ b/src/dysh/util/files.py @@ -15,16 +15,8 @@ import sys from pathlib import Path -try: - from dysh.util.download import from_url - - use_wget = False -except: - import wget # might get deprecated - - use_wget = True +from dysh.util.download import from_url import dysh.util as util - from ..util import minimum_string_match _debug = False @@ -130,7 +122,7 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, _url = "http://www.gb.nrao.edu/dysh/" # base of all things dysh _example_data = "/home/dysh/public_html/example_data" # GBO direct access _test_data = "/home/dysh/public_html/test_data" # not used ?? - _acceptance_testing = "/home/dysh/acceptance_testing/data" # not in public_html ?? + _accept_data = "/home/dysh/acceptance_testing/data" # not in public_html ?? # fmt:on # 1. find out if there is a dysh_data (or use $DYSH_DATA, or a .dyshrc config?) @@ -142,14 +134,10 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, # - throw!? # ? e.g. dysh_data('foo.fits') -> sdfits='foo.fits' - global _debug if dysh_data == None and "DYSH_DATA" in os.environ: dysh_data = Path(os.environ["DYSH_DATA"]) if verbose: - _debug = True - if _debug: print("DYSH_DATA:", dysh_data) - print("USE_WGET: ", use_wget) # 2. Process whichever one of 'sdfits=', 'test=', 'example=', and 'accept=' is present @@ -200,7 +188,7 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, fn = util.get_project_testdata() / my_test else: fn = util.get_project_testdata() / my_test - if _debug: + if verbose: print("final:", fn) if fn.exists(): # @todo this catches files and directories return fn @@ -233,16 +221,12 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, print("Odd-2, did not find", fn) # last resort, try getting it via wget, but it will then be a local file in the current directory url = _url + "/example_data/" + my_example - if _debug: + if verbose: print("url:", url) - # @todo how to use Path() here ???? + filename = url.split("/")[-1] if not os.path.exists(filename): - filename = url.split("/")[-1] print(f"Downloading {filename} from {url}") - if use_wget: - wget.download(url, out=filename) - else: - filename = from_url(url) + filename = from_url(url) print(f"\nRetrieved {filename}") else: print(f"{filename} already downloaded") @@ -274,16 +258,12 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, print("Odd-2, did not find", fn) # last resort, try getting it via wget, but it will then be a local file in the current directory url = _url + "/acceptance_testing/data/" + my_accept - if _debug: + if verbose: print("url:", url) - # @todo how to use Path() here ???? - if not os.path.exists(filename): # @todo filename not known if file didn't exist - filename = url.split("/")[-1] + filename = url.split("/")[-1] + if not os.path.exists(filename): print(f"Downloading {filename} from {url}") - if use_wget: - wget.download(url, out=filename) - else: - filename = from_url(url) + filename = from_url(url) print(f"\nRetrieved {filename}") else: print(f"{filename} already downloaded") From c9e2f7137d169764e3b11edac47d3b0d586e09e6 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 9 Oct 2024 23:13:42 +0000 Subject: [PATCH 03/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- notebooks/developer/test_dysh_data.py | 3 --- src/dysh/util/files.py | 5 +++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/notebooks/developer/test_dysh_data.py b/notebooks/developer/test_dysh_data.py index a337475a..244dabfa 100644 --- a/notebooks/developer/test_dysh_data.py +++ b/notebooks/developer/test_dysh_data.py @@ -50,6 +50,3 @@ print(dysh_data("AGBT21B_024_01")) print(dysh_data("AGBT21B_024_01/AGBT21B_024_01.raw.vegas")) - - - diff --git a/src/dysh/util/files.py b/src/dysh/util/files.py index 197ad0fb..4bab602d 100755 --- a/src/dysh/util/files.py +++ b/src/dysh/util/files.py @@ -15,8 +15,9 @@ import sys from pathlib import Path -from dysh.util.download import from_url import dysh.util as util +from dysh.util.download import from_url + from ..util import minimum_string_match _debug = False @@ -223,7 +224,7 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, url = _url + "/example_data/" + my_example if verbose: print("url:", url) - filename = url.split("/")[-1] + filename = url.split("/")[-1] if not os.path.exists(filename): print(f"Downloading {filename} from {url}") filename = from_url(url) From 0198f50c26b4d0df0e64c5f1fb47d5f328e8ba92 Mon Sep 17 00:00:00 2001 From: Peter Teuben Date: Tue, 15 Oct 2024 17:30:40 -0400 Subject: [PATCH 04/11] implement GBTIDL's "offline" mode within sdfits= --- src/dysh/util/files.py | 47 ++++++++++++++++++++++++++++++++++-------- 1 file changed, 38 insertions(+), 9 deletions(-) diff --git a/src/dysh/util/files.py b/src/dysh/util/files.py index 4bab602d..5a576c1b 100755 --- a/src/dysh/util/files.py +++ b/src/dysh/util/files.py @@ -126,6 +126,30 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, _accept_data = "/home/dysh/acceptance_testing/data" # not in public_html ?? # fmt:on + def sdfits_offline(fn): + """ fn is an sdfits= filename that was shown to exist + If fn contains only one name + """ + if fn.is_file(): + return fn + if not fn.is_dir(): + print(f"{fn} is not a file nor a directory, dunno how to proceed") + return None + # find all fits files one level deep + ff = list(fn.glob("*/*.fits")) + if len(ff) == 0: + return fn + # ensure there is only a single parent + parents = [] + for f in ff: + parents.append(f.parent) + parents = list(set(parents)) + if len(parents) > 1: + print(f"{fn} does not contain a single fits tree: {parents}") + # @todo throw ? or return the first one? + + return parents[0] + # 1. find out if there is a dysh_data (or use $DYSH_DATA, or a .dyshrc config?) # - if present, API dysh_data is used # - if present, $DYSH_DATA is used @@ -157,14 +181,13 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, print("# -----------------") os.system(cmd) return None - print(type(dysh_data)) if dysh_data != None: - fn = dysh_data / Path("sdfits") / sdfits + fn = dysh_data / Path("sdfits") / sdfits # normally user is using a private sdfits if fn.exists(): - return fn - fn = Path("/home/sdfits/") / sdfits + return sdfits_offline(fn) + fn = Path("/home/sdfits/") / sdfits # expected at GBO if fn.exists(): - return fn + return sdfits_offline(fn) print(f"could not handle sdfits={sdfits} yet") return None @@ -227,8 +250,11 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, filename = url.split("/")[-1] if not os.path.exists(filename): print(f"Downloading {filename} from {url}") - filename = from_url(url) - print(f"\nRetrieved {filename}") + try: + filename = from_url(url) + print(f"\nRetrieved {filename}") + except: + print(f"\nFailing to retrieve example {filename} ") else: print(f"{filename} already downloaded") return filename @@ -264,8 +290,11 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, filename = url.split("/")[-1] if not os.path.exists(filename): print(f"Downloading {filename} from {url}") - filename = from_url(url) - print(f"\nRetrieved {filename}") + try: + filename = from_url(url) + print(f"\nRetrieved {filename}") + except: + print(f"\nFailing to retrieve accept {filename}") else: print(f"{filename} already downloaded") return filename From da3f0d7e619484fb9449da7ccd4422731501bd61 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 15 Oct 2024 21:32:19 +0000 Subject: [PATCH 05/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/dysh/util/files.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/dysh/util/files.py b/src/dysh/util/files.py index 5a576c1b..dd8fb012 100755 --- a/src/dysh/util/files.py +++ b/src/dysh/util/files.py @@ -127,8 +127,8 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, # fmt:on def sdfits_offline(fn): - """ fn is an sdfits= filename that was shown to exist - If fn contains only one name + """fn is an sdfits= filename that was shown to exist + If fn contains only one name """ if fn.is_file(): return fn @@ -147,7 +147,7 @@ def sdfits_offline(fn): if len(parents) > 1: print(f"{fn} does not contain a single fits tree: {parents}") # @todo throw ? or return the first one? - + return parents[0] # 1. find out if there is a dysh_data (or use $DYSH_DATA, or a .dyshrc config?) From b8507700dc361cd5616e177c6fd20b4c47819ec9 Mon Sep 17 00:00:00 2001 From: Peter Teuben Date: Tue, 15 Oct 2024 17:37:22 -0400 Subject: [PATCH 06/11] return None if file could not be found or does not exist --- src/dysh/util/files.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/dysh/util/files.py b/src/dysh/util/files.py index 5a576c1b..8ee9bb79 100755 --- a/src/dysh/util/files.py +++ b/src/dysh/util/files.py @@ -188,7 +188,7 @@ def sdfits_offline(fn): fn = Path("/home/sdfits/") / sdfits # expected at GBO if fn.exists(): return sdfits_offline(fn) - print(f"could not handle sdfits={sdfits} yet") + # print(f"could not handle sdfits={sdfits} yet") return None # test: this should also be allowed to use util.get_project_testdata() as well @@ -255,6 +255,7 @@ def sdfits_offline(fn): print(f"\nRetrieved {filename}") except: print(f"\nFailing to retrieve example {filename} ") + return None else: print(f"{filename} already downloaded") return filename @@ -295,6 +296,7 @@ def sdfits_offline(fn): print(f"\nRetrieved {filename}") except: print(f"\nFailing to retrieve accept {filename}") + return None else: print(f"{filename} already downloaded") return filename From 38c08eacd1997a36efe51f1a28aaff30d7b8c8d2 Mon Sep 17 00:00:00 2001 From: Peter Teuben Date: Tue, 15 Oct 2024 17:37:56 -0400 Subject: [PATCH 07/11] notes on bad files, they should return None --- notebooks/developer/test_dysh_data.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/notebooks/developer/test_dysh_data.py b/notebooks/developer/test_dysh_data.py index 244dabfa..486d9682 100644 --- a/notebooks/developer/test_dysh_data.py +++ b/notebooks/developer/test_dysh_data.py @@ -39,6 +39,9 @@ print(dysh_data(example="nod-KFPA/data/TGBT22A_503_02.raw.vegas")) print(dysh_data(example="nod")) print(dysh_data(example="getps")) +print(dysh_data(example="getps0")) # bad url +print(dysh_data(example="getps1")) # bad url +print(dysh_data(example="positionswitch/data/AGBT05B_047_01/AGBT05B_047_01.raw.acs/")) print(dysh_data(example="test1")) print(dysh_data(test="test1")) @@ -50,3 +53,4 @@ print(dysh_data("AGBT21B_024_01")) print(dysh_data("AGBT21B_024_01/AGBT21B_024_01.raw.vegas")) +print(dysh_data("junk")) # return None, file does not exist From 3a3b25fb795d6c7ee4b77b09d60f08a5b442efc7 Mon Sep 17 00:00:00 2001 From: Peter Teuben Date: Wed, 30 Oct 2024 10:34:43 -0400 Subject: [PATCH 08/11] documentation cleanup --- src/dysh/util/files.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/dysh/util/files.py b/src/dysh/util/files.py index 159b1259..58d4d1cd 100755 --- a/src/dysh/util/files.py +++ b/src/dysh/util/files.py @@ -76,27 +76,31 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, verbose=False): r""" - Simplified access to GBO data without needing an absolute path. @todo pending configuration discussion + Resolves the filename within the GBO dysh data system without the need for an absolute path. - By default it will detect the GBO system, users or developers that are not on the GBO system and need - access to data could rsync various data trees to avoid repeated downloads and use the $DYSH_DATA env.var. - access to data could rsync various data trees to avoid repeated downloads. + Currently configured to work at GBO, where for example /home/sdfits exists. For other sites users + need to configure a $DYSH_DATA directory, properly populated with (symlinks to) project and test data, + as described below. Optionally, an explicit dysh_data= can be given, which overrides any possible $DYSH_DATA + environment (or configuration) that may exist. + + Only one of the keywords sdfits=, test=, example=, accept= can be given to probe for data. They are + processed in that order, whichever comes first. - For example inside their $HOME/dysh_data/ one could set - export DYSH_DATA=$HOME/dysh_data Locations of various dysh_data directory roots: ($DYSH is the repo root for developers) ----------------------------------------------- - keyword location method $DYSH_DATA root - ------- -------- ------ --------------- - sdfits: /home/sdfits - $DYSH_DATA/sdfits - test: $DYSH/testdata util.get_project_testdata() $DYSH_DATA/testdata - example: /home/dysh/example_data - $DYSH_DATA/example_data - accept: /home/dysh/acceptance_testing - $DYSH_DATA/acceptance_testing + keyword location at GBO $DYSH_DATA root + ------- --------------- --------------- + sdfits= /home/sdfits $DYSH_DATA/sdfits + test= $DYSH/testdata $DYSH_DATA/testdata + example= /home/dysh/example_data $DYSH_DATA/example_data + accept= /home/dysh/acceptance_testing $DYSH_DATA/acceptance_testing + + Note: test= resolves to the same filename as the util.get_project_testdata() function - Examples of use include mnemonics or full paths: - ------------------------------------------------ + Examples of use including mnemonics or full paths: + -------------------------------------------------- fn = dysh_data(test='getps') fn = dysh_data(example='getfs') fn = dysh_data(example='onoff-L/data/TGBT21A_501_11.raw.vegas') @@ -115,8 +119,7 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, wget for as long we want to support that. astropy caching is also an option 4) directories (names not ending on .fits) cannot be downloaded using wget - 5) use python-dotenv for configuration? - key=val + 5) configuration TBD """ # fmt:off @@ -129,6 +132,7 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, def sdfits_offline(fn): """fn is an sdfits= filename that was shown to exist If fn contains only one name + See also GBTOffline() """ if fn.is_file(): return fn @@ -164,7 +168,7 @@ def sdfits_offline(fn): if verbose: print("DYSH_DATA:", dysh_data) - # 2. Process whichever one of 'sdfits=', 'test=', 'example=', and 'accept=' is present + # 2. Process whichever one of 'sdfits=', 'test=', 'example=', and 'accept=' is present (in that order) # sdfits: the main place where GBO data reside From 1e2795988b42ca06943199564b262d49083a30c8 Mon Sep 17 00:00:00 2001 From: Peter Teuben Date: Thu, 31 Oct 2024 21:24:49 -0400 Subject: [PATCH 09/11] remove references to wget ensure return type to dysh_data is always a Path, or None --- src/dysh/util/files.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/dysh/util/files.py b/src/dysh/util/files.py index 58d4d1cd..64ad8501 100755 --- a/src/dysh/util/files.py +++ b/src/dysh/util/files.py @@ -116,9 +116,9 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, this will keep GBO people happy. Offsite a symlink should also work. 3) if none of those gave a valid name, it will fall back to making a URL by prepending http://www.gb.nrao.edu/dysh/ and using - wget for as long we want to support that. + from_url for as long we want to support that. astropy caching is also an option - 4) directories (names not ending on .fits) cannot be downloaded using wget + 4) directories (names not ending on .fits) cannot be downloaded using from_url 5) configuration TBD """ @@ -223,7 +223,7 @@ def sdfits_offline(fn): print("Could not find", fn) return None - # example: these can also obtain data via wget (or perhaps astropy caching???) + # example: these can also obtain data via from_url (or perhaps astropy caching???) if example != None: if example == "?": @@ -247,7 +247,7 @@ def sdfits_offline(fn): if fn.exists(): return fn print("Odd-2, did not find", fn) - # last resort, try getting it via wget, but it will then be a local file in the current directory + # last resort, try getting it via from_url, but it will then be a local file in the current directory url = _url + "/example_data/" + my_example if verbose: print("url:", url) @@ -262,9 +262,9 @@ def sdfits_offline(fn): return None else: print(f"{filename} already downloaded") - return filename + return Path(filename) - # accept: acceptance_testing/data - wget not recommended (does not work on multifile fits) + # accept: acceptance_testing/data - from_url not recommended (does not work on multifile fits) if accept != None: if accept == "?": @@ -288,7 +288,7 @@ def sdfits_offline(fn): if fn.exists(): return fn print("Odd-2, did not find", fn) - # last resort, try getting it via wget, but it will then be a local file in the current directory + # last resort, try getting it via from_url, but it will then be a local file in the current directory url = _url + "/acceptance_testing/data/" + my_accept if verbose: print("url:", url) @@ -303,7 +303,7 @@ def sdfits_offline(fn): return None else: print(f"{filename} already downloaded") - return filename + return Path(filename) print("You have not given one of: sdfits=, test=, example=, accept=") print("or use =? as argument to get a list of valid shortcuts") From 5654da421744ee72a30029a77c0aff5577825570 Mon Sep 17 00:00:00 2001 From: Peter Teuben Date: Thu, 31 Oct 2024 21:26:28 -0400 Subject: [PATCH 10/11] add a test to ensure from_url() works --- src/dysh/util/tests/test_files.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/dysh/util/tests/test_files.py b/src/dysh/util/tests/test_files.py index 363441a8..fd738ab0 100644 --- a/src/dysh/util/tests/test_files.py +++ b/src/dysh/util/tests/test_files.py @@ -1,7 +1,7 @@ import numpy as np import dysh.util.files as duf - +from pathlib import Path class TestUtil: """Test dysh.util files functions""" @@ -20,4 +20,7 @@ def test_dysh_data(self): # sdfits= # skipping # dysh_data= - assert duf.dysh_data(sdfits="foo.fits", dysh_data="/tmp") == None + assert duf.dysh_data("foo.fits", dysh_data="/tmp") == None + # this assume DYSH_DATA is not present + f2 = duf.dysh_data(example="test1") + assert f2 == Path("AGBT05B_047_01.raw.acs.fits") From 6187342b4b3b552b00c651a5bbaff9554e7c7283 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 1 Nov 2024 01:28:04 +0000 Subject: [PATCH 11/11] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- src/dysh/util/tests/test_files.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/dysh/util/tests/test_files.py b/src/dysh/util/tests/test_files.py index fd738ab0..d03c1051 100644 --- a/src/dysh/util/tests/test_files.py +++ b/src/dysh/util/tests/test_files.py @@ -1,7 +1,9 @@ +from pathlib import Path + import numpy as np import dysh.util.files as duf -from pathlib import Path + class TestUtil: """Test dysh.util files functions"""