Skip to content

Commit

Permalink
style: format SQL and overrule ruff in this regard
Browse files Browse the repository at this point in the history
  • Loading branch information
fedorov committed May 3, 2024
1 parent fb2b595 commit 654b2c8
Showing 1 changed file with 62 additions and 47 deletions.
109 changes: 62 additions & 47 deletions idc_index/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -512,34 +512,45 @@ def _validate_update_manifest_and_get_download_size(
# Next, extract the CRDC series UUID from series_aws_url in the index and
# check whether every series in the manifest is present in the index

# ruff: noqa
sql = """
PRAGMA disable_progress_bar;
with index_temp as
(select
seriesInstanceUID,
series_aws_url,
series_size_MB,
regexp_extract(series_aws_url, '(?:.*?\\/){3}([^\\/?#]+)', 1) index_crdc_series_uuid
from index_df_copy),
manifest_temp as (
select
manifest_cp_cmd,
regexp_extract(manifest_cp_cmd, '(?:.*?\\/){3}([^\\/?#]+)', 1) as manifest_crdc_series_uuid,
regexp_replace(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') as s3_url,
from
manifest_df
)
select
seriesInstanceuid,
s3_url,
series_size_MB,
index_crdc_series_uuid==manifest_crdc_series_uuid as crdc_series_uuid_match,
s3_url==series_aws_url as s3_url_match,
CASE WHEN s3_url==series_aws_url THEN 'aws' ELSE 'unknown' END as endpoint
from
manifest_temp
left join index_temp on index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid
"""
WITH
index_temp AS (
SELECT
seriesInstanceUID,
series_aws_url,
series_size_MB,
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){3}([^\\/?#]+)', 1) index_crdc_series_uuid
FROM
index_df_copy),
manifest_temp AS (
SELECT
manifest_cp_cmd,
REGEXP_EXTRACT(manifest_cp_cmd, '(?:.*?\\/){3}([^\\/?#]+)', 1) AS manifest_crdc_series_uuid,
REGEXP_REPLACE(regexp_replace(manifest_cp_cmd, 'cp ', ''), '\\s[^\\s]*$', '') AS s3_url,
FROM
manifest_df )
SELECT
seriesInstanceuid,
s3_url,
series_size_MB,
index_crdc_series_uuid==manifest_crdc_series_uuid AS crdc_series_uuid_match,
s3_url==series_aws_url AS s3_url_match,
CASE
WHEN s3_url==series_aws_url THEN 'aws'
ELSE
'unknown'
END
AS endpoint
FROM
manifest_temp
LEFT JOIN
index_temp
ON
index_temp.index_crdc_series_uuid = manifest_temp.manifest_crdc_series_uuid
"""
# ruff: noqa: end
merged_df = duckdb.query(sql).df()

if validate_manifest:
Expand Down Expand Up @@ -695,30 +706,34 @@ def _parse_s5cmd_sync_output_and_generate_synced_manifest(
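# bind the index to the local name referenced by the SQL query below
# (NOTE: plain assignment - this is a reference to self.index, not a copy)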
index_df_copy = self.index

# ruff: noqa
sql = """
PRAGMA disable_progress_bar;
with index_temp as
(select
*,
regexp_extract(series_aws_url, '(?:.*?\\/){3}([^\\/?#]+)', 1) index_crdc_series_uuid
from index_df_copy),
sync_temp as (
select
distinct
concat(regexp_extract(s5cmd_output, 'cp (s3://[^/]+/[^/]+)/.*', 1), '/*') as s3_url,
regexp_extract(concat(regexp_extract(s5cmd_output, 'cp (s3://[^/]+/[^/]+)/.*', 1), '/*'),'(?:.*?\\/){3}([^\\/?#]+)',1) as sync_crdc_instance_uuid
from
stdout_df
)
select
distinct
seriesInstanceUID,
series_size_MB,
s3_url
from
sync_temp
left join index_temp on index_temp.index_crdc_series_uuid = sync_temp.sync_crdc_instance_uuid
WITH
index_temp AS (
SELECT
*,
REGEXP_EXTRACT(series_aws_url, '(?:.*?\\/){3}([^\\/?#]+)', 1) index_crdc_series_uuid
FROM
index_df_copy),
sync_temp AS (
SELECT
DISTINCT CONCAT(REGEXP_EXTRACT(s5cmd_output, 'cp (s3://[^/]+/[^/]+)/.*', 1), '/*') AS s3_url,
REGEXP_EXTRACT(CONCAT(REGEXP_EXTRACT(s5cmd_output, 'cp (s3://[^/]+/[^/]+)/.*', 1), '/*'),'(?:.*?\\/){3}([^\\/?#]+)',1) AS sync_crdc_instance_uuid
FROM
stdout_df )
SELECT
DISTINCT seriesInstanceUID,
series_size_MB,
s3_url
FROM
sync_temp
LEFT JOIN
index_temp
ON
index_temp.index_crdc_series_uuid = sync_temp.sync_crdc_instance_uuid
"""
# ruff: noqa: end
merged_df = duckdb.query(sql).df()
sync_size = merged_df["series_size_MB"].sum()
sync_size_rounded = round(sync_size, 2)
Expand Down

0 comments on commit 654b2c8

Please sign in to comment.