From 308180ea1b3725309a4f7de629115e28ac3ca0bc Mon Sep 17 00:00:00 2001
From: floriscalkoen
Date: Tue, 20 Feb 2024 11:48:50 +0100
Subject: [PATCH 1/3] exclude names column

---
 open_buildings/overture/add_columns.py | 28 +++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

diff --git a/open_buildings/overture/add_columns.py b/open_buildings/overture/add_columns.py
index a1451f5..b68f467 100644
--- a/open_buildings/overture/add_columns.py
+++ b/open_buildings/overture/add_columns.py
@@ -4,15 +4,17 @@
 # parquet to geoparquet.
 
 
+import glob
 import os
-import duckdb
-import time
-import tempfile
+import shutil
 import subprocess
-import glob
-from duckdb.typing import *
+import tempfile
+import time
+
+import duckdb
 import mercantile
-import shutil
+from duckdb.typing import *
+
 
 def lat_lon_to_quadkey(lat: DOUBLE, lon: DOUBLE, level: INTEGER) -> VARCHAR:
     # Convert latitude and longitude to tile using mercantile
@@ -44,6 +46,7 @@ def add_quadkey(con):
         );
     """)
 
+
 def add_country_iso(con, country_parquet_path):
     # Load country parquet file into duckdb
     con.execute(f"CREATE TABLE countries AS SELECT * FROM read_parquet('{country_parquet_path}')")
@@ -88,8 +91,8 @@ def process_parquet_file(input_parquet_path, output_folder, country_parquet_path
 
     con.execute('LOAD spatial;')
 
-    # Load parquet file into duckdb
-    con.execute(f"CREATE TABLE buildings AS SELECT * FROM read_parquet('{input_parquet_path}')")
+    # NOTE: exclude names column because it's all NULL and causes InternalException: INTERNAL Error: Attempted to dereference unique_ptr that is NULL!
+    con.execute(f"CREATE OR REPLACE TABLE buildings AS SELECT * EXCLUDE(names) FROM read_parquet('{input_parquet_path}')")
 
     if add_quadkey_option:
         add_quadkey(con)
@@ -126,7 +129,14 @@ def process_parquet_files(input_path, output_folder, country_parquet_path, overw
         process_parquet_file(input_path, output_folder, country_parquet_path, overwrite, add_quadkey_option, add_country_iso_option, verbose)
 
 # Call the function - uncomment if you want to call this directly from python and put values in here.
+import pathlib
+
+release_version = "overture_02-15" # Example version, adjust as necessary
+data_dir = pathlib.Path.home() / "data" / "src" / f"{release_version}" / "theme=buildings" / "type=building"
+out_dir = pathlib.Path.home() / "data" / "prc" / f"{release_version}" / "theme=buildings" / "type=building"
+
+input_path = data_dir / "part-00041-a34b09ea-399f-4872-b0b1-084a81bbb42f-c000.zstd.parquet"
 #input_path = '/Volumes/fastdata/overture/s3-data/buildings/'
 #output_folder = '/Volumes/fastdata/overture/refined-parquet/'
 #country_parquet_path = '/Volumes/fastdata/overture/countries.parquet'
-#process_parquet_files(input_path, output_folder, country_parquet_path, overwrite=False, add_quadkey_option=True, add_country_iso_option=True)
\ No newline at end of file
+process_parquet_files(input_path, out_dir, "", overwrite=False, add_quadkey_option=True, add_country_iso_option=False)
\ No newline at end of file

From f34b31d5878eaa6850a49e282004413bdbcdd556 Mon Sep 17 00:00:00 2001
From: floriscalkoen
Date: Tue, 20 Feb 2024 19:50:40 +0100
Subject: [PATCH 2/3] bash script to sync overture to local

---
 scripts/bash/sync-overture-to-local.sh | 4 ++++
 1 file changed, 4 insertions(+)
 create mode 100644 scripts/bash/sync-overture-to-local.sh

diff --git a/scripts/bash/sync-overture-to-local.sh b/scripts/bash/sync-overture-to-local.sh
new file mode 100644
index 0000000..b3114ef
--- /dev/null
+++ b/scripts/bash/sync-overture-to-local.sh
@@ -0,0 +1,4 @@
+#!/bin/bash
+mkdir -p ~/data/overture-02-15
+cd ~/data/overture-02-15
+aws s3 sync --no-sign-request s3://overturemaps-us-west-2/release/2024-02-15-alpha.0/ .

From f7d1098ec23cad165c497e538ce672ad0a579043 Mon Sep 17 00:00:00 2001
From: floriscalkoen
Date: Tue, 20 Feb 2024 19:58:43 +0100
Subject: [PATCH 3/3] sync script

---
 scripts/bash/sync-overture-to-local.sh | 34 +++++++++++++++++++++++---
 1 file changed, 31 insertions(+), 3 deletions(-)

diff --git a/scripts/bash/sync-overture-to-local.sh b/scripts/bash/sync-overture-to-local.sh
index b3114ef..387537a 100644
--- a/scripts/bash/sync-overture-to-local.sh
+++ b/scripts/bash/sync-overture-to-local.sh
@@ -1,4 +1,32 @@
 #!/bin/bash
-mkdir -p ~/data/overture-02-15
-cd ~/data/overture-02-15
-aws s3 sync --no-sign-request s3://overturemaps-us-west-2/release/2024-02-15-alpha.0/ .
+
+DEFAULT_RELEASE="2024-02-15-alpha.0"
+DEFAULT_DESTINATION="$HOME/data/src/overture/$DEFAULT_RELEASE"
+
+while getopts ":d:r:" opt; do
+  case ${opt} in
+    d) # Process option for the destination
+      DESTINATION=$OPTARG
+      ;;
+    r) # Process option for the release
+      RELEASE=$OPTARG
+      ;;
+    \?)
+      echo "Usage: cmd [-d destination] [-r release]"
+      ;;
+  esac
+done
+
+# Set the destination directory based on the provided destination or release argument
+DESTINATION="${DESTINATION:-$DEFAULT_DESTINATION}"
+RELEASE="${RELEASE:-$DEFAULT_RELEASE}"
+
+mkdir -p "${DESTINATION}"
+cd "${DESTINATION}"
+
+aws s3 sync --no-sign-request "s3://overturemaps-us-west-2/release/${RELEASE}/" .
+
+# Verification step to ensure all files are transferred correctly
+# This is a simple re-sync operation; any missing or incomplete files will be re-downloaded
+echo "Verifying file transfer..."
+aws s3 sync --no-sign-request "s3://overturemaps-us-west-2/release/${RELEASE}/" . --dryrun