
Commit

Merge pull request #46 from CBIIT/CRDCDH-2168-001-pgu
CRDCDH 2168
n2iw authored Jan 15, 2025
2 parents 7315ad6 + a5e7b51 commit 3ecc0c1
Showing 6 changed files with 79 additions and 55 deletions.
5 changes: 0 additions & 5 deletions README-technical.md
@@ -57,11 +57,6 @@ Usage of the CLI tool:
Following arguments are needed to read important data from manifest, conditional required when type = “data file”

-m --manifest, path to manifest file, conditional required when type = “data file”
-n --name-field
-s --size-field
-m --md5-field
-i --id-field
-o --omit-DCF-prefix

CLI configuration module will validate and combine parameters from CLI and/or config file
If config_file is given, then everything else is potentially optional (if it’s included in config file)
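The README excerpt above notes that the CLI configuration module validates and combines parameters from the command line and/or a config file, with explicitly supplied CLI values filling in or overriding config-file values. A minimal, hypothetical sketch of that merge behavior is shown below; the merge function and the trimmed argument set are illustrative only, not the tool's actual implementation.

```python
import argparse
import yaml  # PyYAML

def merge_cli_and_config(cli_args, config_path):
    """Illustrative merge: config file supplies defaults, explicit CLI values win."""
    merged = {}
    if config_path:
        with open(config_path, encoding="utf-8") as fh:
            merged.update(yaml.safe_load(fh).get("Config") or {})
    # Any CLI argument that was actually supplied overrides the config file.
    merged.update({k: v for k, v in cli_args.items() if v is not None})
    return merged

parser = argparse.ArgumentParser()
parser.add_argument("--config-file")
parser.add_argument("-t", "--type")
parser.add_argument("-f", "--manifest")
cli = vars(parser.parse_args())
settings = merge_cli_and_config(cli, cli.pop("config_file"))
```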
6 changes: 0 additions & 6 deletions README.md
@@ -90,12 +90,6 @@ You can put a manifest in the same folder with the data files, or you can put it
- data: local path to the folder that contains the data files to be uploaded
- manifest: local path to the manifest file
- id-field: column name in the manifest file that contains file IDs(Keys). Please refer to the data model to determine which property is the ID/Key property.
- omit-DCF-prefix: for most data commons, this should be set to “false”. One exception is ICDC, which should be set to “true”.
- name-field: column name in the manifest file that contains data file names
- size-field: column name in the manifest file that contains data file sizes
- md5-field: column name in the manifest file that contains data file MD5 checksums
- id-field: column name in the manifest file that contains data file ID
- omit-DCF-prefix: boolean to define if need DCF prefix "dg.4DFC"
- retries: number of retries the CLI tool will perform after a failed upload
- overwrite: if set to “true”, CLI will upload a data file to overwrite the data file with same name that already exists in the Data Hub target storage. If set to “false”, CLI will not upload a data file if a data file with the same name exists in the Data Hub target storage.
- dryrun: if set to “true”, CLI will not upload any data files to the Data Hub target storage. If set to “false”, CLI will upload data files to the Data Hub target storage.
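Several of the field-mapping entries listed above are removed from the uploader configuration by this commit; that mapping is now retrieved from the Data Hub at run time via the new GraphQL call later in this diff. A hypothetical minimal configuration for a data file upload, limited to keys that appear in this document (authentication and other settings intentionally omitted, values are placeholders), might look like this:

```python
import yaml

# Hypothetical minimal settings after this change; paths and the submission ID
# are placeholders, and keys such as credentials are intentionally omitted.
config = {
    "Config": {
        "submission": "<submission ID>",
        "type": "data file",
        "data": "/path/to/data_files",
        "manifest": "/path/to/manifest.tsv",
        "retries": 3,
        "overwrite": False,
        "dryrun": True,
    }
}
print(yaml.safe_dump(config, sort_keys=False))
```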
15 changes: 0 additions & 15 deletions configs/uploader-file-config.example.yml
@@ -17,21 +17,6 @@ Config:
# path to manifest file, conditional required when type = data file
manifest: /path_to_the_manifest_file

# Property name for Key/ID property of file node. For example, "file_id" is the Key/ID property of file node in CDS.
id-field: file_id

# Whether to omit DCF prefix when generating file IDs. For example, false means include DCF prefix when generating file IDs.
omit-DCF-prefix: false

# file name header name in the manifest file. For example, "file_name" is the header name for CDS.
name-field: file_name

# file size header name in the manifest file. For example, "file_size" is the header name for CDS.
size-field: file_size

# file md5 header name in the manifest file. For example, "md5sum" is the header name for CDS.
md5-field: md5sum

# file uploading retries
retries: 3

37 changes: 37 additions & 0 deletions src/common/graphql_client.py
@@ -145,6 +145,43 @@ def update_batch(self, batchID, uploaded_files):
self.log.debug(e)
self.log.exception(f'Update batch failed - internal error. Please try again and contact the helpdesk if this error persists.')
return False

# 4) get_data_file_config()
def get_data_file_config(self, submissionID):
body = f"""
query {{
retrieveFileNodeConfig (submissionID: \"{submissionID}\") {{
id_field,
name_field,
size_field,
md5_field,
omit_DCF_prefix
}}
}}
"""
try:
response = requests.post(url=self.url, headers=self.headers, json={"query": body})
status = response.status_code
self.log.info(f"get_data_file_config response status code: {status}.")
if status == 200:
results = response.json()
if results.get("errors"):
self.log.error(f'Get data file config failed: {results.get("errors")[0].get("message")}.')
else:
data_file_config = results.get("data").get("retrieveFileNodeConfig")
if data_file_config:
return True, data_file_config
else:
self.log.error('Get data file config failed!')
return False, None
else:
self.log.error(f'Get data file config failed (code: {status}) - internal error. Please try again and contact the helpdesk if this error persists.')
return False, None

except Exception as e:
self.log.debug(e)
self.log.exception(f'Get data file config failed - internal error. Please try again and contact the helpdesk if this error persists.')
return False, None



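The new get_data_file_config method wraps a single GraphQL call against the retrieveFileNodeConfig query. A self-contained sketch of the same request is shown below, useful for exercising the query outside the uploader; the endpoint URL and authorization header are placeholders, while the query text and field names are taken directly from the code added in this commit.

```python
import requests

API_URL = "https://example.org/api/graphql"        # placeholder endpoint
HEADERS = {"Authorization": "Bearer <api token>"}  # placeholder credentials

def fetch_file_node_config(submission_id):
    """Standalone version of the retrieveFileNodeConfig call added above."""
    query = f"""
    query {{
      retrieveFileNodeConfig (submissionID: "{submission_id}") {{
        id_field
        name_field
        size_field
        md5_field
        omit_DCF_prefix
      }}
    }}
    """
    resp = requests.post(url=API_URL, headers=HEADERS, json={"query": query})
    if resp.status_code != 200:
        return None
    payload = resp.json()
    if payload.get("errors"):
        return None
    # Expected shape: {"id_field": ..., "name_field": ..., "size_field": ...,
    #                  "md5_field": ..., "omit_DCF_prefix": ...}
    return payload.get("data", {}).get("retrieveFileNodeConfig")
```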
53 changes: 26 additions & 27 deletions src/upload_config.py
@@ -21,11 +21,6 @@ def __init__(self):
parser.add_argument('--dryrun', default=False, type=bool, help='Only check original file, won\'t copy any files, optional, default is false')
#args for data file type
parser.add_argument('-f', '--manifest', help='path to manifest file, conditional required when type = “data file"')
parser.add_argument('-n', '--name-field', help='header file name in manifest, optional, default value is "file_name"')
parser.add_argument('-s', '--size-field', help='header file size in manifest, optional, default value is "file_size"')
parser.add_argument('-m', '--md5-field', help='header md5 name in manifest, optional, default value is "md5sum"')
parser.add_argument('-i', '--id-field', help='header file ID name in manifest, optional, default value is "file_id"')
parser.add_argument('-o', '--omit-DCF-prefix', help='boolean to define if need DCF prefix "dg.4DFC"')

parser.add_argument('-r', '--retries', default=3, type=int, help='file uploading retries, optional, default value is 3')

@@ -103,7 +98,7 @@ def validate(self):
return False
elif type not in UPLOAD_TYPES:
self.log.critical(f'Configuration error in "type": “{type}” is not valid. Valid “type” value can be one of [“data file”, “metadata”]')
return False
return False
else:
if type == TYPE_FILE: #data file
#check manifest
@@ -117,27 +112,6 @@
return False

self.data[PRE_MANIFEST] = manifest
#check header names in manifest file
file_name_header= self.data.get(FILE_NAME_FIELD)
if file_name_header is None:
self.data[FILE_NAME_FIELD] = FILE_NAME_DEFAULT

file_size_header = self.data.get(FILE_SIZE_FIELD)
if file_size_header is None:
self.data[FILE_SIZE_FIELD] = FILE_SIZE_DEFAULT

md5_header = self.data.get(FILE_MD5_FIELD)
if md5_header is None:
self.data[FILE_MD5_FIELD] = MD5_DEFAULT

file_id_header= self.data.get(FILE_ID_FIELD)
if file_id_header is None:
self.log.critical(f'file id field is required.')
return False

omit_dcf_prefix = self.data.get(OMIT_DCF_PREFIX)
if omit_dcf_prefix is None:
self.data[OMIT_DCF_PREFIX] = False

filepath = self.data.get(FILE_DIR)
if filepath is None:
@@ -154,4 +128,29 @@
self.data[FROM_S3] = True

return True

def validate_file_config(self, data_file_config):
#check header names in manifest file
file_name_header= data_file_config.get(FILE_NAME_FIELD.replace("-", "_"))
self.data[FILE_NAME_FIELD] = file_name_header if file_name_header else FILE_NAME_DEFAULT

file_size_header = data_file_config.get(FILE_SIZE_FIELD.replace("-", "_"))
self.data[FILE_SIZE_FIELD] = file_size_header if file_size_header else FILE_SIZE_DEFAULT

md5_header = data_file_config.get(FILE_MD5_FIELD.replace("-", "_"))
self.data[FILE_MD5_FIELD] = md5_header if md5_header else MD5_DEFAULT

file_id_header= data_file_config.get(FILE_ID_FIELD.replace("-", "_"))
if file_id_header is None:
self.log.critical(f'file id field is required.')
return False

self.data[FILE_ID_FIELD] = file_id_header

omit_dcf_prefix = data_file_config.get(OMIT_DCF_PREFIX.replace("-", "_"))
self.data[OMIT_DCF_PREFIX] = False if omit_dcf_prefix is None else omit_dcf_prefix

return True



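The new validate_file_config method copies the server-supplied field names into self.data, translating each dashed configuration key (for example name-field) into the underscored GraphQL field name (name_field), falling back to a default when a value is missing, and refusing to proceed when the required ID field is absent. A self-contained sketch of that mapping is shown below; the constant values are assumptions inferred from the removed CLI help text and the query in this diff, not the module's actual definitions.

```python
# Assumed constant values, inferred from the removed CLI help text and the
# GraphQL field names in this commit; the real constants live elsewhere in the repo.
FILE_NAME_FIELD, FILE_NAME_DEFAULT = "name-field", "file_name"
FILE_SIZE_FIELD, FILE_SIZE_DEFAULT = "size-field", "file_size"
FILE_MD5_FIELD, MD5_DEFAULT = "md5-field", "md5sum"
FILE_ID_FIELD = "id-field"
OMIT_DCF_PREFIX = "omit-DCF-prefix"

def apply_file_config(data, data_file_config):
    """Sketch of validate_file_config: fold server field names into the config dict."""
    for key, default in ((FILE_NAME_FIELD, FILE_NAME_DEFAULT),
                         (FILE_SIZE_FIELD, FILE_SIZE_DEFAULT),
                         (FILE_MD5_FIELD, MD5_DEFAULT)):
        value = data_file_config.get(key.replace("-", "_"))
        data[key] = value if value else default

    file_id_header = data_file_config.get(FILE_ID_FIELD.replace("-", "_"))
    if file_id_header is None:   # the ID field is required and has no default
        return False
    data[FILE_ID_FIELD] = file_id_header

    omit = data_file_config.get(OMIT_DCF_PREFIX.replace("-", "_"))
    data[OMIT_DCF_PREFIX] = False if omit is None else omit
    return True

# Example: a response shaped like the one returned by retrieveFileNodeConfig
server_config = {"id_field": "file_id", "name_field": "file_name",
                 "size_field": "file_size", "md5_field": "md5sum",
                 "omit_DCF_prefix": False}
settings = {}
assert apply_file_config(settings, server_config)
```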
18 changes: 16 additions & 2 deletions src/uploader.py
@@ -28,9 +28,24 @@ def controller():
log.error("Failed to upload files: missing required valid parameter(s)!")
print("Failed to upload files: invalid parameter(s)! Please check log file in tmp folder for details.")
return 1

#step 2: validate file or metadata
configs = config.data
apiInvoker = APIInvoker(configs)
# get data file config
if configs[UPLOAD_TYPE] == TYPE_FILE:
# retrieve data file configuration
result, data_file_config = apiInvoker.get_data_file_config(configs["submission"])
if not result or not data_file_config:
log.error("Failed to upload files: can't get data file config!")
print("Failed to upload files: can't get data file config! Please check log file in tmp folder for details.")
return 1

if not config.validate_file_config(data_file_config):
log.error("Failed to upload files: invalid file config!")
print("Failed to upload files: invalid file config! Please check log file in tmp folder for details.")
return 1

validator = FileValidator(configs)
if not validator.validate():
log.error("Failed to upload files: found invalid file(s)!")
@@ -40,7 +55,6 @@

if validator.invalid_count == 0:
#step 3: create a batch
apiInvoker = APIInvoker(configs)
# file_array = [{"fileName": item[FILE_NAME_DEFAULT], "size": item[FILE_SIZE_DEFAULT]} for item in file_list]
file_array = [ item[FILE_NAME_DEFAULT] for item in file_list]
newBatch = None
