Skip to content

Commit

Permalink
Merge pull request #399 from GreenBankObservatory/dysh_data2
Browse files Browse the repository at this point in the history
Dysh data2
  • Loading branch information
teuben authored Nov 1, 2024
2 parents 97567ff + dd52c68 commit 46cd44f
Show file tree
Hide file tree
Showing 3 changed files with 138 additions and 61 deletions.
56 changes: 56 additions & 0 deletions notebooks/developer/test_dysh_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#!/usr/bin/env python3

"""
Testing dysh_data (a full test is not possible in CI's pytest)
"""

import os
from astropy.io import fits
import astropy.units as u

from dysh.fits.sdfitsload import SDFITSLoad
from dysh.fits.gbtfitsload import GBTFITSLoad
from dysh.util.files import dysh_data

# some more liberal pandas DataFrame display options
import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)

# pd.options.display.max_columns = None

# Show which data root dysh_data() would pick up from the environment.
# .get() is used so that an unset $DYSH_DATA prints None instead of aborting
# the whole script with a KeyError before any lookup has been tried.
dd = os.environ.get("DYSH_DATA")
print("Current value of DYSH_DATA:", dd)


#%% debugging

import dysh
dysh.log.init_logging(3)   # 0=ERROR 1=WARNING 2=INFO 3=DEBUG

#%%

# A number of these could easily fail if you don't have your $DYSH_DATA set and populated,
# but they should always work at GBO.

# example= : full relative paths, mnemonics, and a directory name
print(dysh_data(example="nod-KFPA/data/TGBT22A_503_02.raw.vegas.trim.fits", verbose=True))
print(dysh_data(example="nod-KFPA/data/TGBT22A_503_02.raw.vegas"))
print(dysh_data(example="nod"))
print(dysh_data(example="getps"))
print(dysh_data(example="getps0"))   # bad url
print(dysh_data(example="getps1"))   # bad url
print(dysh_data(example="positionswitch/data/AGBT05B_047_01/AGBT05B_047_01.raw.acs/"))
print(dysh_data(example="test1"))

# test= : mnemonics and a relative path
print(dysh_data(test="test1"))
print(dysh_data(test="getps"))
print(dysh_data(test="TRCO_230413_Ka/TRCO_230413_Ka_scan43.fits"))

# accept= : a relative path and a mnemonic
print(dysh_data(accept='AGBT22A_325_15/AGBT22A_325_15.raw.vegas'))
print(dysh_data(accept='nod1'))

# the first positional argument of dysh_data() is sdfits=
print(dysh_data("AGBT21B_024_01"))
print(dysh_data("AGBT21B_024_01/AGBT21B_024_01.raw.vegas"))
print(dysh_data("junk"))    # return None, file does not exist
136 changes: 76 additions & 60 deletions src/dysh/util/files.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,15 +15,8 @@
import sys
from pathlib import Path

try:
from dysh.util.download import from_url

use_wget = False
except:
import wget # might get deprecated

use_wget = True
import dysh.util as util
from dysh.util.download import from_url

from ..util import minimum_string_match

Expand Down Expand Up @@ -83,27 +76,31 @@

def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None, verbose=False):
r"""
Simplified access to GBO data without needing an absolute path. @todo pending configuration discussion
Resolves the filename within the GBO dysh data system without the need for an absolute path.
Currently configured to work at GBO, where for example /home/sdfits exists. For other sites users
need to configure a $DYSH_DATA directory, properly populated with (symlinks to) project and test data,
as described below. Optionally, an explicit dysh_data= can be given, which overrides any possible $DYSH_DATA
environment (or configuration) that may exist.
By default it will detect the GBO system, users or developers that are not on the GBO system and need
access to data could rsync various data trees to avoid repeated downloads and use the $DYSH_DATA env.var.
access to data could rsync various data trees to avoid repeated downloads.
Only one of the keywords sdfits=, test=, example=, accept= can be given to probe for data. They are
processed in that order, whichever comes first.
For example inside their $HOME/dysh_data/ one could set
export DYSH_DATA=$HOME/dysh_data
Locations of various dysh_data directory roots: ($DYSH is the repo root for developers)
-----------------------------------------------
keyword location method $DYSH_DATA root
------- -------- ------ ---------------
sdfits: /home/sdfits - $DYSH_DATA/sdfits
test: $DYSH/testdata util.get_project_testdata() $DYSH_DATA/testdata
example: /home/dysh/example_data - $DYSH_DATA/example_data
accept: /home/dysh/acceptance_testing - $DYSH_DATA/acceptance_testing
keyword location at GBO $DYSH_DATA root
------- --------------- ---------------
sdfits= /home/sdfits $DYSH_DATA/sdfits
test= $DYSH/testdata $DYSH_DATA/testdata
example= /home/dysh/example_data $DYSH_DATA/example_data
accept= /home/dysh/acceptance_testing $DYSH_DATA/acceptance_testing
Note: test= resolves to the same filename as the util.get_project_testdata() function
Examples of use include mnemonics or full paths:
------------------------------------------------
Examples of use including mnemonics or full paths:
--------------------------------------------------
fn = dysh_data(test='getps')
fn = dysh_data(example='getfs')
fn = dysh_data(example='onoff-L/data/TGBT21A_501_11.raw.vegas')
Expand All @@ -119,20 +116,44 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None,
this will keep GBO people happy. Offsite a symlink should also work.
3) if none of those gave a valid name, it will fall back to making a URL
by prepending http://www.gb.nrao.edu/dysh/ and using
wget for as long we want to support that.
from_url for as long as we want to support that.
astropy caching is also an option
4) directories (names not ending on .fits) cannot be downloaded using wget
5) use python-dotenv for configuration?
key=val
4) directories (names not ending in .fits) cannot be downloaded using from_url
5) configuration TBD
"""
# fmt:off
_url = "http://www.gb.nrao.edu/dysh/" # base of all things dysh
_example_data = "/home/dysh/public_html/example_data" # GBO direct access
_test_data = "/home/dysh/public_html/test_data" # not used ??
_acceptance_testing = "/home/dysh/acceptance_testing/data" # not in public_html ??
_accept_data = "/home/dysh/acceptance_testing/data" # not in public_html ??
# fmt:on

def sdfits_offline(fn):
    """Resolve an existing sdfits= path to the file or directory to load.

    Parameters
    ----------
    fn : pathlib.Path
        Path that the caller has already found under an sdfits root.

    Returns
    -------
    pathlib.Path or None
        * ``fn`` itself if it is a regular file;
        * ``None`` (after printing a message) if it is neither a file nor a directory;
        * ``fn`` itself if no ``*/*.fits`` files exist one level below it;
        * otherwise the sub-directory of ``fn`` that holds those fits files.
          If several sub-directories hold fits files a message is printed and
          the first one in sorted order is returned.

    See also GBTOffline()
    """
    if fn.is_file():
        return fn
    if not fn.is_dir():
        print(f"{fn} is not a file nor a directory, dunno how to proceed")
        return None
    # collect the distinct directories that hold fits files one level deep;
    # sorted so the choice below does not depend on set iteration order
    parents = sorted({f.parent for f in fn.glob("*/*.fits")})
    if not parents:
        return fn
    # ensure there is only a single parent
    if len(parents) > 1:
        print(f"{fn} does not contain a single fits tree: {parents}")
        # @todo throw ? for now the first one (in sorted order) is returned

    return parents[0]

# 1. find out if there is a dysh_data (or use $DYSH_DATA, or a .dyshrc config?)
# - if present, API dysh_data is used
# - if present, $DYSH_DATA is used
Expand All @@ -142,16 +163,12 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None,
# - throw!?
# ? e.g. dysh_data('foo.fits') -> sdfits='foo.fits'

global _debug
if dysh_data == None and "DYSH_DATA" in os.environ:
dysh_data = Path(os.environ["DYSH_DATA"])
if verbose:
_debug = True
if _debug:
print("DYSH_DATA:", dysh_data)
print("USE_WGET: ", use_wget)

# 2. Process whichever one of 'sdfits=', 'test=', 'example=', and 'accept=' is present
# 2. Process whichever one of 'sdfits=', 'test=', 'example=', and 'accept=' is present (in that order)

# sdfits: the main place where GBO data reside

Expand All @@ -168,15 +185,14 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None,
print("# -----------------")
os.system(cmd)
return None
print(type(dysh_data))
if dysh_data != None:
fn = dysh_data / Path("sdfits") / sdfits
fn = dysh_data / Path("sdfits") / sdfits # normally user is using a private sdfits
if fn.exists():
return fn
fn = Path("/home/sdfits/") / sdfits
return sdfits_offline(fn)
fn = Path("/home/sdfits/") / sdfits # expected at GBO
if fn.exists():
return fn
print(f"could not handle sdfits={sdfits} yet")
return sdfits_offline(fn)
# print(f"could not handle sdfits={sdfits} yet")
return None

# test: this should also be allowed to use util.get_project_testdata() as well
Expand All @@ -200,14 +216,14 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None,
fn = util.get_project_testdata() / my_test
else:
fn = util.get_project_testdata() / my_test
if _debug:
if verbose:
print("final:", fn)
if fn.exists(): # @todo this catches files and directories
return fn
print("Could not find", fn)
return None

# example: these can also obtain data via wget (or perhaps astropy caching???)
# example: these can also obtain data via from_url (or perhaps astropy caching???)

if example != None:
if example == "?":
Expand All @@ -231,24 +247,24 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None,
if fn.exists():
return fn
print("Odd-2, did not find", fn)
# last resort, try getting it via wget, but it will then be a local file in the current directory
# last resort, try getting it via from_url, but it will then be a local file in the current directory
url = _url + "/example_data/" + my_example
if _debug:
if verbose:
print("url:", url)
# @todo how to use Path() here ????
filename = url.split("/")[-1]
if not os.path.exists(filename):
filename = url.split("/")[-1]
print(f"Downloading {filename} from {url}")
if use_wget:
wget.download(url, out=filename)
else:
try:
filename = from_url(url)
print(f"\nRetrieved {filename}")
print(f"\nRetrieved {filename}")
except:
print(f"\nFailing to retrieve example {filename} ")
return None
else:
print(f"{filename} already downloaded")
return filename
return Path(filename)

# accept: acceptance_testing/data - wget not recommended (does not work on multifile fits)
# accept: acceptance_testing/data - from_url not recommended (does not work on multifile fits)

if accept != None:
if accept == "?":
Expand All @@ -272,22 +288,22 @@ def dysh_data(sdfits=None, test=None, example=None, accept=None, dysh_data=None,
if fn.exists():
return fn
print("Odd-2, did not find", fn)
# last resort, try getting it via wget, but it will then be a local file in the current directory
# last resort, try getting it via from_url, but it will then be a local file in the current directory
url = _url + "/acceptance_testing/data/" + my_accept
if _debug:
if verbose:
print("url:", url)
# @todo how to use Path() here ????
if not os.path.exists(filename): # @todo filename not known if file didn't exist
filename = url.split("/")[-1]
filename = url.split("/")[-1]
if not os.path.exists(filename):
print(f"Downloading {filename} from {url}")
if use_wget:
wget.download(url, out=filename)
else:
try:
filename = from_url(url)
print(f"\nRetrieved {filename}")
print(f"\nRetrieved {filename}")
except:
print(f"\nFailing to retrieve accept {filename}")
return None
else:
print(f"{filename} already downloaded")
return filename
return Path(filename)

print("You have not given one of: sdfits=, test=, example=, accept=")
print("or use =? as argument to get a list of valid shortcuts")
Expand Down
7 changes: 6 additions & 1 deletion src/dysh/util/tests/test_files.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from pathlib import Path

import numpy as np

import dysh.util.files as duf
Expand All @@ -20,4 +22,7 @@ def test_dysh_data(self):
# sdfits=
# skipping
# dysh_data=
assert duf.dysh_data(sdfits="foo.fits", dysh_data="/tmp") == None
assert duf.dysh_data("foo.fits", dysh_data="/tmp") == None
# this assumes DYSH_DATA is not present
f2 = duf.dysh_data(example="test1")
assert f2 == Path("AGBT05B_047_01.raw.acs.fits")

0 comments on commit 46cd44f

Please sign in to comment.