From a871c45b69d74dbb262e4ececceca4bad1bae8ba Mon Sep 17 00:00:00 2001 From: Stephen Soltesz Date: Thu, 28 Apr 2022 15:05:16 -0400 Subject: [PATCH] Create web100 passthrough view for extended web100 views (#138) * Create web100 static passthrough view for extended web100 views * Rename web100_static to web100 for consistency --- cloudbuild.yaml | 4 ++-- transform/create_static_tables.sh | 5 +--- transform/{web100_static.sql => web100.sql} | 2 +- views/create_dataset_views.sh | 23 ++++++++++--------- views/ndt/web100.sql | 9 ++++++++ .../extended_web100_downloads.sql | 2 +- .../extended_web100_uploads.sql | 2 +- 7 files changed, 27 insertions(+), 20 deletions(-) rename transform/{web100_static.sql => web100.sql} (99%) create mode 100644 views/ndt/web100.sql diff --git a/cloudbuild.yaml b/cloudbuild.yaml index 1cc916c..93006bd 100644 --- a/cloudbuild.yaml +++ b/cloudbuild.yaml @@ -22,8 +22,8 @@ steps: # Use cbif condition: only run these steps in one of these projects. - PROJECT_IN=mlab-sandbox,mlab-staging args: - - /workspace/views/create_dataset_views.sh self $PROJECT_ID $PROJECT_ID - /workspace/transform/create_static_tables.sh $PROJECT_ID + - /workspace/views/create_dataset_views.sh self $PROJECT_ID $PROJECT_ID # Deployments to oti and measurement-lab. - name: gcr.io/$PROJECT_ID/gcloud-jsonnet-cbif @@ -31,6 +31,6 @@ steps: # Use cbif condition: only run these steps in one of these projects. - PROJECT_IN=mlab-oti args: - - /workspace/views/create_dataset_views.sh self $PROJECT_ID $PROJECT_ID - /workspace/transform/create_static_tables.sh $PROJECT_ID + - /workspace/views/create_dataset_views.sh self $PROJECT_ID $PROJECT_ID - /workspace/views/create_dataset_views.sh self $PROJECT_ID measurement-lab diff --git a/transform/create_static_tables.sh b/transform/create_static_tables.sh index 17ab3e3..32661d0 100755 --- a/transform/create_static_tables.sh +++ b/transform/create_static_tables.sh @@ -19,10 +19,7 @@ cd ${BASEDIR} function create_table() { local query_file=${1:?Please provide query file} - local table=$( grep 'CREATE TABLE' $query_file | awk '{print $3}' ) - bq query --project_id=$PROJECT --nouse_legacy_sql "$( cat $query_file )" - echo "Created table $PROJECT.$table successfully" } -create_table ./web100_static.sql +create_table ./web100.sql diff --git a/transform/web100_static.sql b/transform/web100.sql similarity index 99% rename from transform/web100_static.sql rename to transform/web100.sql index fa3d709..99543f8 100644 --- a/transform/web100_static.sql +++ b/transform/web100.sql @@ -5,7 +5,7 @@ -- for queries. -- -- Always create within local project. -CREATE TABLE IF NOT EXISTS ndt.web100_static +CREATE TABLE IF NOT EXISTS ndt.web100 PARTITION BY date OPTIONS ( require_partition_filter=true diff --git a/views/create_dataset_views.sh b/views/create_dataset_views.sh index 07cc291..9216961 100755 --- a/views/create_dataset_views.sh +++ b/views/create_dataset_views.sh @@ -88,6 +88,18 @@ create_view ${SRC_PROJECT} ${DST_PROJECT} ndt_raw ./ndt_raw/hopannotation1.sql create_view ${SRC_PROJECT} ${DST_PROJECT} ndt_raw ./ndt_raw/scamper1.sql create_view ${SRC_PROJECT} ${DST_PROJECT} ndt_raw ./ndt_raw/tcpinfo.sql +# Public pass-through views for joined tables. +if [[ ${DST_PROJECT} = "measurement-lab" ]] ; then + # NOTE: these steps can only be applied in the public measurement-lab + # project because in other M-Lab projects, these targets are actual + # tables. Only in measurement-lab can we create these views. + create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/ndt5.sql + create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/ndt7.sql + create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/tcpinfo.sql + create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/scamper1.sql + create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/web100.sql +fi + # NDT extended (mixed parsers) create_view ${DST_PROJECT} ${DST_PROJECT} ndt_intermediate ./ndt_intermediate/extended_ndt5_downloads.sql create_view ${DST_PROJECT} ${DST_PROJECT} ndt_intermediate ./ndt_intermediate/extended_ndt5_uploads.sql @@ -103,17 +115,6 @@ create_view ${DST_PROJECT} ${DST_PROJECT} ndt ./ndt/unified_uploads_20201026x.sq create_view ${DST_PROJECT} ${DST_PROJECT} ndt ./ndt/unified_uploads.sql create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/scamper1_hopannotation1.sql -# Public pass-through views for joined tables. -if [[ ${DST_PROJECT} = "measurement-lab" ]] ; then - # NOTE: these steps can only be applied in the public measurement-lab - # project because in other M-Lab projects, these targets are actual - # tables. Only in measurement-lab can we create these views. - create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/ndt5.sql - create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/ndt7.sql - create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/tcpinfo.sql - create_view ${SRC_PROJECT} ${DST_PROJECT} ndt ./ndt/scamper1.sql -fi - # traceroute. create_view ${SRC_PROJECT} ${DST_PROJECT} traceroute ./traceroute/scamper1.sql create_view ${SRC_PROJECT} ${DST_PROJECT} traceroute ./traceroute/paris1_legacy.sql diff --git a/views/ndt/web100.sql b/views/ndt/web100.sql new file mode 100644 index 0000000..c333b01 --- /dev/null +++ b/views/ndt/web100.sql @@ -0,0 +1,9 @@ +-- +-- This view is a pass-through for date partitioned ndt web100 data. The data +-- in this table is a static transformation of data from the v1 data pipeline +-- for the ndt web100 dataset. It is "static" because it is not actively +-- reprocessed. While it uses standard column conventions, the schema is not +-- guaranteed to be backward compatible b/c there is currently no parser that +-- supports reprocessing this format. +-- +SELECT * FROM `{{.ProjectID}}.ndt.web100` diff --git a/views/ndt_intermediate/extended_web100_downloads.sql b/views/ndt_intermediate/extended_web100_downloads.sql index d49f171..410e1e0 100644 --- a/views/ndt_intermediate/extended_web100_downloads.sql +++ b/views/ndt_intermediate/extended_web100_downloads.sql @@ -38,7 +38,7 @@ WITH PreCleanWeb100 AS ( parser.ArchiveURL, parser.Filename ) AS Web100parser, - FROM `{{.ProjectID}}.ndt.web100_static` + FROM `{{.ProjectID}}.ndt.web100` WHERE raw.web100.snap.Duration IS NOT NULL AND raw.web100.snap.State IS NOT NULL diff --git a/views/ndt_intermediate/extended_web100_uploads.sql b/views/ndt_intermediate/extended_web100_uploads.sql index da70e31..f316c9b 100644 --- a/views/ndt_intermediate/extended_web100_uploads.sql +++ b/views/ndt_intermediate/extended_web100_uploads.sql @@ -38,7 +38,7 @@ WITH PreCleanWeb100 AS ( parser.ArchiveURL, parser.Filename ) AS Web100parser, - FROM `{{.ProjectID}}.ndt.web100_static` + FROM `{{.ProjectID}}.ndt.web100` WHERE raw.web100.snap.Duration IS NOT NULL AND raw.web100.snap.State IS NOT NULL