From 94be4189190ccc0e7c928a4b97dad6072bc0c0a6 Mon Sep 17 00:00:00 2001 From: Vamsi Thiriveedhi Date: Tue, 20 Aug 2024 20:50:17 -0400 Subject: [PATCH 1/2] BUG: use trim to remove any extraneous spaces while parsing s3 url in manifest --- idc_index/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idc_index/index.py b/idc_index/index.py index c183403b..45bc08a0 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -665,7 +665,7 @@ def _validate_update_manifest_and_get_download_size( SELECT manifest_cp_cmd, REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, - REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url, + TRIM(REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '')) AS s3_url, FROM manifest_df ) SELECT From 206118ccd329a261668b37becac7a3749ee49e37 Mon Sep 17 00:00:00 2001 From: Andrey Fedorov Date: Wed, 21 Aug 2024 09:48:20 -0400 Subject: [PATCH 2/2] ENH: simplify s3_url extraction --- idc_index/index.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/idc_index/index.py b/idc_index/index.py index 45bc08a0..3083e09d 100644 --- a/idc_index/index.py +++ b/idc_index/index.py @@ -665,7 +665,7 @@ def _validate_update_manifest_and_get_download_size( SELECT manifest_cp_cmd, REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){{3}}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid, - TRIM(REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '')) AS s3_url, + REGEXP_EXTRACT(manifest_cp_cmd, 's3://\\S+') AS s3_url, FROM manifest_df ) SELECT