Skip to content

Commit

Permalink
Script to generate TPCH data in ORC format
Browse files Browse the repository at this point in the history
  • Loading branch information
Jefffrey committed Sep 19, 2024
1 parent 679be88 commit 62e4ebc
Show file tree
Hide file tree
Showing 3 changed files with 157 additions and 0 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Cargo.lock

venv
benchmark_data

private/
*.txt
121 changes: 121 additions & 0 deletions scripts/convert_tpch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

import pyarrow as pa
from pyarrow import orc
from pyarrow import csv

tables = [
"customer",
"lineitem",
"nation",
"orders",
"part",
"partsupp",
"region",
"supplier"
]

# Datatypes based on:
# https://github.com/apache/datafusion/blob/3b93cc952b889cec2364ad2490ae18ecddb3ca49/benchmarks/src/tpch/mod.rs#L50-L134
schemas = {
"customer": pa.schema([
pa.field("c_custkey", pa.int64()),
pa.field("c_name", pa.string()),
pa.field("c_address", pa.string()),
pa.field("c_nationkey", pa.int64()),
pa.field("c_phone", pa.string()),
pa.field("c_acctbal", pa.decimal128(15, 2)),
pa.field("c_mktsegment", pa.string()),
pa.field("c_comment", pa.string()),
]),
"lineitem": pa.schema([
pa.field("l_orderkey", pa.int64()),
pa.field("l_partkey", pa.int64()),
pa.field("l_suppkey", pa.int64()),
pa.field("l_linenumber", pa.int32()),
pa.field("l_quantity", pa.decimal128(15, 2)),
pa.field("l_extendedprice", pa.decimal128(15, 2)),
pa.field("l_discount", pa.decimal128(15, 2)),
pa.field("l_tax", pa.decimal128(15, 2)),
pa.field("l_returnflag", pa.string()),
pa.field("l_linestatus", pa.string()),
pa.field("l_shipdate", pa.date32()),
pa.field("l_commitdate", pa.date32()),
pa.field("l_receiptdate", pa.date32()),
pa.field("l_shipinstruct", pa.string()),
pa.field("l_shipmode", pa.string()),
pa.field("l_comment", pa.string()),
]),
"nation": pa.schema([
pa.field("n_nationkey", pa.int64()),
pa.field("n_name", pa.string()),
pa.field("n_regionkey", pa.int64()),
pa.field("n_comment", pa.string()),
]),
"orders": pa.schema([
pa.field("o_orderkey", pa.int64()),
pa.field("o_custkey", pa.int64()),
pa.field("o_orderstatus", pa.string()),
pa.field("o_totalprice", pa.decimal128(15, 2)),
pa.field("o_orderdate", pa.date32()),
pa.field("o_orderpriority", pa.string()),
pa.field("o_clerk", pa.string()),
pa.field("o_shippriority", pa.int32()),
pa.field("o_comment", pa.string()),
]),
"part": pa.schema([
pa.field("p_partkey", pa.int64()),
pa.field("p_name", pa.string()),
pa.field("p_mfgr", pa.string()),
pa.field("p_brand", pa.string()),
pa.field("p_type", pa.string()),
pa.field("p_size", pa.int32()),
pa.field("p_container", pa.string()),
pa.field("p_retailprice", pa.decimal128(15, 2)),
pa.field("p_comment", pa.string()),
]),
"partsupp": pa.schema([
pa.field("ps_partkey", pa.int64()),
pa.field("ps_suppkey", pa.int64()),
pa.field("ps_availqty", pa.int32()),
pa.field("ps_supplycost", pa.decimal128(15, 2)),
pa.field("ps_comment", pa.string()),
]),
"region": pa.schema([
pa.field("r_regionkey", pa.int64()),
pa.field("r_name", pa.string()),
pa.field("r_comment", pa.string()),
]),
"supplier": pa.schema([
pa.field("s_suppkey", pa.int64()),
pa.field("s_name", pa.string()),
pa.field("s_address", pa.string()),
pa.field("s_nationkey", pa.int64()),
pa.field("s_phone", pa.string()),
pa.field("s_acctbal", pa.decimal128(15, 2)),
pa.field("s_comment", pa.string()),
]),
}

for table in tables:
tbl = csv.read_csv(
f"benchmark_data/{table}.tbl",
parse_options=csv.ParseOptions(delimiter="|"),
convert_options=csv.ConvertOptions(column_types=schemas[table]),
)
orc.write_table(tbl, f"benchmark_data/{table}.orc", compression="zstd")
35 changes: 35 additions & 0 deletions scripts/generate-tpch.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#!/usr/bin/env bash

# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
BASE_DIR=$SCRIPT_DIR/..
DATA_DIR=$BASE_DIR/benchmark_data
VENV_BIN=$BASE_DIR/venv/bin

SCALE_FACTOR=${1:-1}

# Generate TBL data
mkdir -p $DATA_DIR
docker run --rm \
-v $DATA_DIR:/data \
ghcr.io/scalytics/tpch-docker:main -vf -s $SCALE_FACTOR
# Removing trailing |
sed -i 's/.$//' benchmark_data/*.tbl
$VENV_BIN/python $SCRIPT_DIR/convert_tpch.py
echo "Done"

0 comments on commit 62e4ebc

Please sign in to comment.