From 6e8953087d552f74488bd6f992d0f3daeaaf26a7 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Fri, 7 Oct 2022 11:57:51 +0200 Subject: [PATCH 01/52] Update for next development version --- docker/pom.xml | 2 +- flowman-archetype-quickstart/pom.xml | 2 +- flowman-client/pom.xml | 2 +- flowman-common/pom.xml | 2 +- flowman-core/pom.xml | 2 +- flowman-dist/pom.xml | 2 +- flowman-dsl/pom.xml | 2 +- flowman-hub/pom.xml | 2 +- flowman-parent/pom.xml | 2 +- flowman-plugins/aws/pom.xml | 2 +- flowman-plugins/azure/pom.xml | 2 +- flowman-plugins/delta/pom.xml | 2 +- flowman-plugins/impala/pom.xml | 2 +- flowman-plugins/json/pom.xml | 2 +- flowman-plugins/kafka/pom.xml | 2 +- flowman-plugins/mariadb/pom.xml | 2 +- flowman-plugins/mssqlserver/pom.xml | 2 +- flowman-plugins/mysql/pom.xml | 2 +- flowman-plugins/openapi/pom.xml | 2 +- flowman-plugins/oracle/pom.xml | 2 +- flowman-plugins/postgresql/pom.xml | 2 +- flowman-plugins/sftp/pom.xml | 2 +- flowman-plugins/swagger/pom.xml | 2 +- flowman-scalatest-compat/pom.xml | 2 +- flowman-server-ui/pom.xml | 2 +- flowman-server/pom.xml | 2 +- flowman-spark-extensions/pom.xml | 2 +- flowman-spark-testing/pom.xml | 2 +- flowman-spec/pom.xml | 2 +- flowman-studio-ui/pom.xml | 2 +- flowman-studio/pom.xml | 2 +- flowman-testing/pom.xml | 2 +- flowman-tools/pom.xml | 2 +- flowman-yaml-schema/pom.xml | 2 +- pom.xml | 2 +- 35 files changed, 35 insertions(+), 35 deletions(-) diff --git a/docker/pom.xml b/docker/pom.xml index 57b6af462..26a7b137d 100644 --- a/docker/pom.xml +++ b/docker/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-archetype-quickstart/pom.xml b/flowman-archetype-quickstart/pom.xml index a3c8b4227..79e61a2fa 100644 --- a/flowman-archetype-quickstart/pom.xml +++ b/flowman-archetype-quickstart/pom.xml @@ -7,7 +7,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-client/pom.xml b/flowman-client/pom.xml index 6717d9658..6e8f6bd49 100644 --- a/flowman-client/pom.xml +++ b/flowman-client/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-common/pom.xml b/flowman-common/pom.xml index b33f925fe..c0057e1e9 100644 --- a/flowman-common/pom.xml +++ b/flowman-common/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-core/pom.xml b/flowman-core/pom.xml index babe52b42..29152e204 100644 --- a/flowman-core/pom.xml +++ b/flowman-core/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-dist/pom.xml b/flowman-dist/pom.xml index 0bc487724..8caad26ed 100644 --- a/flowman-dist/pom.xml +++ b/flowman-dist/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-dsl/pom.xml b/flowman-dsl/pom.xml index 6b52a6a13..999835dd3 100644 --- a/flowman-dsl/pom.xml +++ b/flowman-dsl/pom.xml @@ -9,7 +9,7 @@ flowman-root com.dimajix.flowman - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-hub/pom.xml b/flowman-hub/pom.xml index ef2058582..2e0d01744 100644 --- a/flowman-hub/pom.xml +++ b/flowman-hub/pom.xml @@ -9,7 +9,7 @@ flowman-root com.dimajix.flowman - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml index 17f62c19c..c935002a5 100644 --- a/flowman-parent/pom.xml +++ b/flowman-parent/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 
0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-plugins/aws/pom.xml b/flowman-plugins/aws/pom.xml index 526b5e265..d3b0ccae1 100644 --- a/flowman-plugins/aws/pom.xml +++ b/flowman-plugins/aws/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/azure/pom.xml b/flowman-plugins/azure/pom.xml index 50942b907..4e31c6cbe 100644 --- a/flowman-plugins/azure/pom.xml +++ b/flowman-plugins/azure/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/delta/pom.xml b/flowman-plugins/delta/pom.xml index ad0dc435e..962e3148b 100644 --- a/flowman-plugins/delta/pom.xml +++ b/flowman-plugins/delta/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/impala/pom.xml b/flowman-plugins/impala/pom.xml index 6bb92fb50..57cf28905 100644 --- a/flowman-plugins/impala/pom.xml +++ b/flowman-plugins/impala/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/json/pom.xml b/flowman-plugins/json/pom.xml index 4e21590fe..900828fa8 100644 --- a/flowman-plugins/json/pom.xml +++ b/flowman-plugins/json/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/kafka/pom.xml b/flowman-plugins/kafka/pom.xml index 9245ae3c0..35f28a46b 100644 --- a/flowman-plugins/kafka/pom.xml +++ b/flowman-plugins/kafka/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/mariadb/pom.xml b/flowman-plugins/mariadb/pom.xml index ff55a3ce5..d861acd69 100644 --- a/flowman-plugins/mariadb/pom.xml +++ b/flowman-plugins/mariadb/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/mssqlserver/pom.xml b/flowman-plugins/mssqlserver/pom.xml index e07d79391..9d4221194 100644 --- a/flowman-plugins/mssqlserver/pom.xml +++ b/flowman-plugins/mssqlserver/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/mysql/pom.xml b/flowman-plugins/mysql/pom.xml index 0638a5aed..4b398eb0c 100644 --- a/flowman-plugins/mysql/pom.xml +++ b/flowman-plugins/mysql/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/openapi/pom.xml b/flowman-plugins/openapi/pom.xml index a1e1f884f..16e3a4223 100644 --- a/flowman-plugins/openapi/pom.xml +++ b/flowman-plugins/openapi/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/oracle/pom.xml b/flowman-plugins/oracle/pom.xml index 3c931abf8..758e481b6 100644 --- a/flowman-plugins/oracle/pom.xml +++ b/flowman-plugins/oracle/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/postgresql/pom.xml b/flowman-plugins/postgresql/pom.xml index 09876179c..f8fa9d40d 100644 --- a/flowman-plugins/postgresql/pom.xml +++ b/flowman-plugins/postgresql/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/sftp/pom.xml b/flowman-plugins/sftp/pom.xml index c52cc5d4c..b8254dab0 100644 --- a/flowman-plugins/sftp/pom.xml +++ b/flowman-plugins/sftp/pom.xml @@ -9,7 +9,7 @@ 
com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/swagger/pom.xml b/flowman-plugins/swagger/pom.xml index a328a55ec..e4f37e014 100644 --- a/flowman-plugins/swagger/pom.xml +++ b/flowman-plugins/swagger/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../../pom.xml diff --git a/flowman-scalatest-compat/pom.xml b/flowman-scalatest-compat/pom.xml index ae769a271..cc2c990e9 100644 --- a/flowman-scalatest-compat/pom.xml +++ b/flowman-scalatest-compat/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-server-ui/pom.xml b/flowman-server-ui/pom.xml index 56038ea09..a5e025baa 100644 --- a/flowman-server-ui/pom.xml +++ b/flowman-server-ui/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-server/pom.xml b/flowman-server/pom.xml index 09ff89ab9..4f3de222e 100644 --- a/flowman-server/pom.xml +++ b/flowman-server/pom.xml @@ -9,7 +9,7 @@ flowman-root com.dimajix.flowman - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-spark-extensions/pom.xml b/flowman-spark-extensions/pom.xml index f66811b75..44d641eff 100644 --- a/flowman-spark-extensions/pom.xml +++ b/flowman-spark-extensions/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-spark-testing/pom.xml b/flowman-spark-testing/pom.xml index 63017d89f..3ce93f5ae 100644 --- a/flowman-spark-testing/pom.xml +++ b/flowman-spark-testing/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-spec/pom.xml b/flowman-spec/pom.xml index a03ad718d..34282031a 100644 --- a/flowman-spec/pom.xml +++ b/flowman-spec/pom.xml @@ -9,7 +9,7 @@ flowman-root com.dimajix.flowman - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-studio-ui/pom.xml b/flowman-studio-ui/pom.xml index 482d421f6..7737a9d8f 100644 --- a/flowman-studio-ui/pom.xml +++ b/flowman-studio-ui/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-studio/pom.xml b/flowman-studio/pom.xml index 1a66514d1..2415cd6fa 100644 --- a/flowman-studio/pom.xml +++ b/flowman-studio/pom.xml @@ -9,7 +9,7 @@ flowman-root com.dimajix.flowman - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-testing/pom.xml b/flowman-testing/pom.xml index 826f670c0..e2065fb1a 100644 --- a/flowman-testing/pom.xml +++ b/flowman-testing/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-tools/pom.xml b/flowman-tools/pom.xml index 1f4cc0de0..4beee330b 100644 --- a/flowman-tools/pom.xml +++ b/flowman-tools/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/flowman-yaml-schema/pom.xml b/flowman-yaml-schema/pom.xml index b1cf09229..df0b99397 100644 --- a/flowman-yaml-schema/pom.xml +++ b/flowman-yaml-schema/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT ../pom.xml diff --git a/pom.xml b/pom.xml index f33168bd5..aee973f2d 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 com.dimajix.flowman flowman-root - 0.28.0 + 0.28.1-SNAPSHOT pom Flowman root pom A Spark based ETL tool From 078921c17a2dbc326f89a8bea5213b9bc5700171 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Fri, 7 Oct 2022 17:41:12 +0200 Subject: [PATCH 02/52] github-260 Remove hive-storage-api from 
several plugins and lib --- CHANGELOG.md | 7 ++++++- docs/releases.md | 7 ++++++- flowman-parent/pom.xml | 4 ---- flowman-plugins/kafka/pom.xml | 10 ---------- pom.xml | 36 ++++++++++++++++++++++++++++++----- 5 files changed, 43 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a1bfaa3b6..77e552852 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,9 @@ -# Version 0.28.0 +# Version 0.28.1 + +* github-260 Remove hive-storage-api from several plugins and lib + + +# Version 0.28.0 - 2022-10-07 * Improve support for MariaDB / MySQL as data sinks * github-245: Bump ejs, @vue/cli-plugin-babel, @vue/cli-plugin-eslint and @vue/cli-service in /flowman-studio-ui diff --git a/docs/releases.md b/docs/releases.md index 670b683ad..1f47d37c9 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -14,7 +14,12 @@ The following gives an (incomplete) list of past releases of the last 12 months. changes over time. -### Version 0.28.0 +### Version 0.28.1 + +* github-260 Remove hive-storage-api from several plugins and lib + + +### Version 0.28.0 - 2022-10-07 * Improve support for MariaDB / MySQL as data sinks * github-245: Bump ejs, @vue/cli-plugin-babel, @vue/cli-plugin-eslint and @vue/cli-service in /flowman-studio-ui diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml index c935002a5..dddd8bef4 100644 --- a/flowman-parent/pom.xml +++ b/flowman-parent/pom.xml @@ -503,10 +503,6 @@ org.apache.hive hive-llap-common - - org.apache.hive - hive-storage-api - org.apache.hadoop diff --git a/flowman-plugins/kafka/pom.xml b/flowman-plugins/kafka/pom.xml index 35f28a46b..793f9b8d8 100644 --- a/flowman-plugins/kafka/pom.xml +++ b/flowman-plugins/kafka/pom.xml @@ -77,16 +77,6 @@ org.apache.spark spark-sql-kafka-0-10_${scala.api_version} - - - org.apache.hadoop - hadoop-client-api - - - org.apache.hadoop - hadoop-client-runtime - - diff --git a/pom.xml b/pom.xml index aee973f2d..69ebc0090 100644 --- a/pom.xml +++ b/pom.xml @@ -51,7 +51,8 @@ oss - 2.3.3 + 2.3.9 + 2.7.2 1.2.0 2.33 2.4.0 @@ -175,6 +176,7 @@ 3.0.0-${cdh.version} 3.0 2.1.1-${cdh.version} + 2.1.1-${cdh.version} 2.1.0-${cdh.version} 2.2.1-${cdh.version} 10.14.2.0 @@ -236,6 +238,7 @@ 3.1.1.${cdp.version} 3.1 3.1.3000.${cdp.version} + 3.1.30000.${cdp.version} 2.2.3.${cdp.version} 2.4.1.7.1.5.0-257 10.14.2.0 @@ -279,6 +282,8 @@ 1.1.0 2.4.8 2.4 + 1.2.1 + 1.2.1 1.1.7.3 4.1.47.Final 4.7 @@ -318,6 +323,8 @@ 1.1.2 3.0.3 3.0 + 2.3.7 + 2.7.1 1.1.8.2 4.1.47.Final 4.7.1 @@ -357,6 +364,8 @@ 1.1.2 3.1.2 3.1 + 2.3.7 + 2.7.2 1.1.8.2 4.1.51.Final 4.8-1 @@ -396,6 +405,8 @@ 1.1.2 3.2.2 3.2 + 2.3.9 + 2.7.2 1.1.8.4 4.1.68.Final 4.8 @@ -436,6 +447,8 @@ 1.1.2 3.3.0 3.3 + 2.3.9 + 2.7.2 1.1.8.4 4.1.74.Final 4.8 @@ -1193,10 +1206,6 @@ org.apache.hive hive-llap-common - - org.apache.hive - hive-storage-api - org.apache.hadoop @@ -1289,6 +1298,16 @@ spark-sql-kafka-0-10_${scala.api_version} ${spark.version} compile + + + org.apache.hadoop + hadoop-client-api + + + org.apache.hadoop + hadoop-client-runtime + + org.apache.spark @@ -1298,6 +1317,13 @@ test + + org.apache.hive + hive-storage-api + ${hive-storage-api.version} + provided + + org.antlr antlr-runtime From af448758ab0414f1be3a54b0564ec91853eb726d Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Fri, 7 Oct 2022 17:51:57 +0200 Subject: [PATCH 03/52] github-261 Add descriptions to all pom.xml --- CHANGELOG.md | 3 ++- docker/pom.xml | 1 + flowman-archetype-quickstart/pom.xml | 1 + flowman-client/pom.xml | 1 + flowman-common/pom.xml | 1 + flowman-core/pom.xml | 1 + flowman-dist/pom.xml | 
1 + flowman-dsl/pom.xml | 1 + flowman-hub/pom.xml | 1 + flowman-parent/pom.xml | 1 + flowman-plugins/aws/pom.xml | 1 + flowman-plugins/azure/pom.xml | 1 + flowman-plugins/delta/pom.xml | 1 + flowman-plugins/impala/pom.xml | 1 + flowman-plugins/json/pom.xml | 1 + flowman-plugins/kafka/pom.xml | 1 + flowman-plugins/mariadb/pom.xml | 1 + flowman-plugins/mssqlserver/pom.xml | 1 + flowman-plugins/mysql/pom.xml | 1 + flowman-plugins/openapi/pom.xml | 1 + flowman-plugins/oracle/pom.xml | 1 + flowman-plugins/postgresql/pom.xml | 1 + flowman-plugins/sftp/pom.xml | 1 + flowman-plugins/swagger/pom.xml | 1 + flowman-scalatest-compat/pom.xml | 1 + flowman-server-ui/pom.xml | 1 + flowman-server/pom.xml | 1 + flowman-spark-extensions/pom.xml | 1 + flowman-spark-testing/pom.xml | 1 + flowman-spec/pom.xml | 1 + flowman-studio-ui/pom.xml | 1 + flowman-studio/pom.xml | 1 + flowman-testing/pom.xml | 1 + flowman-tools/pom.xml | 1 + flowman-yaml-schema/pom.xml | 1 + 35 files changed, 36 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77e552852..8ca2d3768 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,7 @@ # Version 0.28.1 -* github-260 Remove hive-storage-api from several plugins and lib +* github-260: Remove hive-storage-api from several plugins and lib +* github-261: Add descriptions to all pom.xml # Version 0.28.0 - 2022-10-07 diff --git a/docker/pom.xml b/docker/pom.xml index 26a7b137d..862be5341 100644 --- a/docker/pom.xml +++ b/docker/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-docker Flowman Docker image + Flowman Docker image pom diff --git a/flowman-archetype-quickstart/pom.xml b/flowman-archetype-quickstart/pom.xml index c6a487f23..ae3c45994 100644 --- a/flowman-archetype-quickstart/pom.xml +++ b/flowman-archetype-quickstart/pom.xml @@ -3,6 +3,7 @@ 4.0.0 flowman-archetype-quickstart Flowman quickstart Maven archetype + Maven archetype for a Flowman quickstart project maven-archetype diff --git a/flowman-client/pom.xml b/flowman-client/pom.xml index 6e8f6bd49..426c0c66c 100644 --- a/flowman-client/pom.xml +++ b/flowman-client/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-client Flowman console client + Flowman console client com.dimajix.flowman diff --git a/flowman-common/pom.xml b/flowman-common/pom.xml index c0057e1e9..f8386be60 100644 --- a/flowman-common/pom.xml +++ b/flowman-common/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-common Flowman common library + Flowman common library com.dimajix.flowman diff --git a/flowman-core/pom.xml b/flowman-core/pom.xml index 29152e204..3fdf0eabf 100644 --- a/flowman-core/pom.xml +++ b/flowman-core/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-core Flowman core library + Flowman core library com.dimajix.flowman diff --git a/flowman-dist/pom.xml b/flowman-dist/pom.xml index 8caad26ed..cd7f30746 100644 --- a/flowman-dist/pom.xml +++ b/flowman-dist/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-dist Flowman Distribution + Flowman installable binary distribution pom diff --git a/flowman-dsl/pom.xml b/flowman-dsl/pom.xml index 999835dd3..ce6ee2f96 100644 --- a/flowman-dsl/pom.xml +++ b/flowman-dsl/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-dsl Flowman Scala DSL + Flowman Scala DSL flowman-root diff --git a/flowman-hub/pom.xml b/flowman-hub/pom.xml index 2e0d01744..af86d5ac6 100644 --- a/flowman-hub/pom.xml +++ b/flowman-hub/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-hub Flowman Hub + Flowman Hub flowman-root diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml index dddd8bef4..98d962f30 100644 --- a/flowman-parent/pom.xml +++ b/flowman-parent/pom.xml @@ -5,6 +5,7 @@ 
4.0.0 flowman-parent Flowman Parent BOM + Flowman Parent BOM pom diff --git a/flowman-plugins/aws/pom.xml b/flowman-plugins/aws/pom.xml index d3b0ccae1..a5cbc587a 100644 --- a/flowman-plugins/aws/pom.xml +++ b/flowman-plugins/aws/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-aws Flowman AWS plugin + Flowman AWS plugin com.dimajix.flowman diff --git a/flowman-plugins/azure/pom.xml b/flowman-plugins/azure/pom.xml index 4e31c6cbe..d584599a4 100644 --- a/flowman-plugins/azure/pom.xml +++ b/flowman-plugins/azure/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-azure Flowman Azure plugin + Flowman Azure plugin pom diff --git a/flowman-plugins/delta/pom.xml b/flowman-plugins/delta/pom.xml index 962e3148b..c20068bbe 100644 --- a/flowman-plugins/delta/pom.xml +++ b/flowman-plugins/delta/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-delta Flowman Delta plugin + Flowman Delta plugin com.dimajix.flowman diff --git a/flowman-plugins/impala/pom.xml b/flowman-plugins/impala/pom.xml index 57cf28905..bd38bb073 100644 --- a/flowman-plugins/impala/pom.xml +++ b/flowman-plugins/impala/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-impala Flowman Impala plugin + Flowman Impala plugin com.dimajix.flowman diff --git a/flowman-plugins/json/pom.xml b/flowman-plugins/json/pom.xml index 900828fa8..eb269a29c 100644 --- a/flowman-plugins/json/pom.xml +++ b/flowman-plugins/json/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-json Flowman JSON Schema plugin + Flowman JSON Schema plugin com.dimajix.flowman diff --git a/flowman-plugins/kafka/pom.xml b/flowman-plugins/kafka/pom.xml index 793f9b8d8..36d1c78fc 100644 --- a/flowman-plugins/kafka/pom.xml +++ b/flowman-plugins/kafka/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-kafka Flowman Kafka plugin + Flowman Kafka plugin com.dimajix.flowman diff --git a/flowman-plugins/mariadb/pom.xml b/flowman-plugins/mariadb/pom.xml index d861acd69..b3b9b7eb0 100644 --- a/flowman-plugins/mariadb/pom.xml +++ b/flowman-plugins/mariadb/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-mariadb Flowman MariaDB plugin + Flowman MariaDB plugin pom diff --git a/flowman-plugins/mssqlserver/pom.xml b/flowman-plugins/mssqlserver/pom.xml index 9d4221194..f4767e5d4 100644 --- a/flowman-plugins/mssqlserver/pom.xml +++ b/flowman-plugins/mssqlserver/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-mssqlserver Flowman MS SQL Server plugin + Flowman MS SQL Server plugin com.dimajix.flowman diff --git a/flowman-plugins/mysql/pom.xml b/flowman-plugins/mysql/pom.xml index 4b398eb0c..4dc91d603 100644 --- a/flowman-plugins/mysql/pom.xml +++ b/flowman-plugins/mysql/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-mysql Flowman MySQL plugin + Flowman MySQL plugin pom diff --git a/flowman-plugins/openapi/pom.xml b/flowman-plugins/openapi/pom.xml index 16e3a4223..a0a3a6546 100644 --- a/flowman-plugins/openapi/pom.xml +++ b/flowman-plugins/openapi/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-openapi Flowman OpenAPI Schema plugin + Flowman OpenAPI Schema plugin com.dimajix.flowman diff --git a/flowman-plugins/oracle/pom.xml b/flowman-plugins/oracle/pom.xml index 758e481b6..cbdb17ec0 100644 --- a/flowman-plugins/oracle/pom.xml +++ b/flowman-plugins/oracle/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-oracle Flowman Oracle plugin + Flowman Oracle plugin pom diff --git a/flowman-plugins/postgresql/pom.xml b/flowman-plugins/postgresql/pom.xml index f8fa9d40d..6b552858f 100644 --- a/flowman-plugins/postgresql/pom.xml +++ b/flowman-plugins/postgresql/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-postgresql Flowman PostgreSQL plugin + Flowman 
PostgreSQL plugin pom diff --git a/flowman-plugins/sftp/pom.xml b/flowman-plugins/sftp/pom.xml index b8254dab0..3c4947699 100644 --- a/flowman-plugins/sftp/pom.xml +++ b/flowman-plugins/sftp/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-sftp Flowman SFTP plugin + Flowman SFTP plugin com.dimajix.flowman diff --git a/flowman-plugins/swagger/pom.xml b/flowman-plugins/swagger/pom.xml index e4f37e014..6147b60ce 100644 --- a/flowman-plugins/swagger/pom.xml +++ b/flowman-plugins/swagger/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-plugin-swagger Flowman Swagger Schema plugin + Flowman Swagger Schema plugin com.dimajix.flowman diff --git a/flowman-scalatest-compat/pom.xml b/flowman-scalatest-compat/pom.xml index cc2c990e9..d8968344e 100644 --- a/flowman-scalatest-compat/pom.xml +++ b/flowman-scalatest-compat/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-scalatest-compat Flowman ScalaTest compatibility library + Flowman ScalaTest compatibility library com.dimajix.flowman diff --git a/flowman-server-ui/pom.xml b/flowman-server-ui/pom.xml index a5e025baa..6aa64b64b 100644 --- a/flowman-server-ui/pom.xml +++ b/flowman-server-ui/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-server-ui Flowman Server UI + Flowman Server UI com.dimajix.flowman diff --git a/flowman-server/pom.xml b/flowman-server/pom.xml index 4f3de222e..bfebf8598 100644 --- a/flowman-server/pom.xml +++ b/flowman-server/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-server Flowman Server + Flowman Server flowman-root diff --git a/flowman-spark-extensions/pom.xml b/flowman-spark-extensions/pom.xml index 44d641eff..783dba12c 100644 --- a/flowman-spark-extensions/pom.xml +++ b/flowman-spark-extensions/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-spark-extensions Flowman Spark extensions + Flowman Spark extensions com.dimajix.flowman diff --git a/flowman-spark-testing/pom.xml b/flowman-spark-testing/pom.xml index 3ce93f5ae..e90681131 100644 --- a/flowman-spark-testing/pom.xml +++ b/flowman-spark-testing/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-spark-testing Flowman Spark testing utilities + Flowman Spark testing utilities com.dimajix.flowman diff --git a/flowman-spec/pom.xml b/flowman-spec/pom.xml index 34282031a..5db2dfd94 100644 --- a/flowman-spec/pom.xml +++ b/flowman-spec/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-spec Flowman YAML spec + Flowman YAML spec flowman-root diff --git a/flowman-studio-ui/pom.xml b/flowman-studio-ui/pom.xml index 7737a9d8f..2f5d1fd9d 100644 --- a/flowman-studio-ui/pom.xml +++ b/flowman-studio-ui/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-studio-ui Flowman Studio UI + Flowman Studio UI com.dimajix.flowman diff --git a/flowman-studio/pom.xml b/flowman-studio/pom.xml index 2415cd6fa..dbaf514f0 100644 --- a/flowman-studio/pom.xml +++ b/flowman-studio/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-studio Flowman Studio + Flowman Studio flowman-root diff --git a/flowman-testing/pom.xml b/flowman-testing/pom.xml index e2065fb1a..f2aaaa3ce 100644 --- a/flowman-testing/pom.xml +++ b/flowman-testing/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-testing Flowman testing utilities + Flowman testing utilities com.dimajix.flowman diff --git a/flowman-tools/pom.xml b/flowman-tools/pom.xml index 4beee330b..60bccab7d 100644 --- a/flowman-tools/pom.xml +++ b/flowman-tools/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-tools Flowman CLI tools + Flowman command line tools com.dimajix.flowman diff --git a/flowman-yaml-schema/pom.xml b/flowman-yaml-schema/pom.xml index df0b99397..4ea9423f8 100644 --- a/flowman-yaml-schema/pom.xml +++ b/flowman-yaml-schema/pom.xml @@ -5,6 +5,7 @@ 4.0.0 flowman-yaml-schema Flowman 
YAML schema + Flowman YAML schema to be used for syntax highlighting and auto-completion in capable editors pom From 390b4a95d7761901779c460f4bf58384590e7490 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 10 Oct 2022 12:30:02 +0200 Subject: [PATCH 04/52] Add more information to failed verifications --- .../dimajix/flowman/spec/target/CompareTarget.scala | 3 ++- .../flowman/spec/target/CopyFileTarget.scala | 6 ++++-- .../dimajix/flowman/spec/target/CopyTarget.scala | 10 +++++++--- .../flowman/spec/target/DeleteFileTarget.scala | 6 ++++-- .../dimajix/flowman/spec/target/DropTarget.scala | 6 ++++-- .../dimajix/flowman/spec/target/FileTarget.scala | 6 ++++-- .../dimajix/flowman/spec/target/GetFileTarget.scala | 6 ++++-- .../flowman/spec/target/HiveDatabaseTarget.scala | 6 ++++-- .../dimajix/flowman/spec/target/LocalTarget.scala | 6 ++++-- .../flowman/spec/target/MergeFilesTarget.scala | 6 ++++-- .../dimajix/flowman/spec/target/MergeTarget.scala | 6 ++++-- .../dimajix/flowman/spec/target/PutFileTarget.scala | 6 ++++-- .../flowman/spec/target/RelationTarget.scala | 11 +++++++---- .../dimajix/flowman/spec/target/SchemaTarget.scala | 6 ++++-- .../flowman/spec/target/TruncateTarget.scala | 13 ++++++++----- .../dimajix/flowman/spec/target/VerifyTarget.scala | 1 - 16 files changed, 68 insertions(+), 36 deletions(-) diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CompareTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CompareTarget.scala index dfc8ce1bb..2a0bbdf08 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CompareTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CompareTarget.scala @@ -25,6 +25,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.BaseTarget @@ -99,7 +100,7 @@ case class CompareTarget( if (diff.nonEmpty) { logger.error(s"Dataset '${actual.name}' does not equal the expected dataset '${expected.name}'") logger.error(s"Difference between datasets: \n${diff.get}") - throw new VerificationFailedException(identifier) + throw new VerificationFailedException(identifier, new ExecutionException(s"Dataset '${actual.name}' does not equal the expected dataset '${expected.name}'")) } else { logger.info(s"Dataset '${actual.name}' matches the expected dataset '${expected.name}'") diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CopyFileTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CopyFileTarget.scala index 69deb2ca8..73e043f22 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CopyFileTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CopyFileTarget.scala @@ -25,6 +25,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.BaseTarget @@ -117,8 +118,9 @@ case class CopyFileTarget( val file = executor.fs.file(target) if (!file.exists()) { - logger.error(s"Verification of target 
'$identifier' failed - location '$target' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - location '$target' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CopyTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CopyTarget.scala index 85af3416d..c3b4a37df 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CopyTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/CopyTarget.scala @@ -29,6 +29,7 @@ import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_PARALLELISM import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_REBALANCE import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.OutputMode import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException @@ -147,14 +148,17 @@ case class CopyTarget( require(executor != null) if (target.exists(executor) == No) { - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - target '${target.name}' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } schema.foreach { spec => val file = executor.fs.file(spec.file) if (!file.exists()) { - logger.error(s"Verification of target '$identifier' failed - schema file '${spec.file}' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - schema file '${spec.file}' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/DeleteFileTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/DeleteFileTarget.scala index ec6ac6645..91a1cc23c 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/DeleteFileTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/DeleteFileTarget.scala @@ -25,6 +25,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.BaseTarget @@ -85,8 +86,9 @@ case class DeleteFileTarget( val file = executor.fs.file(location) if (file.exists()) { - logger.error(s"Verification of target '$identifier' failed - location '$location' exists") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - location '$location' exists" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/DropTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/DropTarget.scala index 523ffa458..2ce3fa9b0 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/DropTarget.scala +++ 
b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/DropTarget.scala @@ -24,6 +24,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Operation import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException @@ -137,8 +138,9 @@ case class DropTarget( val rel = relation.value if (rel.exists(execution) == Yes) { - logger.error(s"Verification of target '$identifier' failed - relation '${relation.identifier}' still exists") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - relation '${relation.identifier}' still exists" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/FileTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/FileTarget.scala index 1fbeb1ddb..e997b9bd3 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/FileTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/FileTarget.scala @@ -28,6 +28,7 @@ import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_PARALLELISM import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_REBALANCE import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.MappingUtils import com.dimajix.flowman.execution.OutputMode import com.dimajix.flowman.execution.Phase @@ -189,8 +190,9 @@ case class FileTarget( val file = executor.fs.file(qualifiedLocation) if (!file.exists()) { - logger.error(s"Verification of target '$identifier' failed - location '$qualifiedLocation' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - location '$qualifiedLocation' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/GetFileTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/GetFileTarget.scala index 71e8d8a9b..f425fc135 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/GetFileTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/GetFileTarget.scala @@ -25,6 +25,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.BaseTarget @@ -117,8 +118,9 @@ case class GetFileTarget( val file = executor.fs.local(target) if (!file.exists()) { - logger.error(s"Verification of target '$identifier' failed - local file '$target' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - local file '$target' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git 
a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/HiveDatabaseTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/HiveDatabaseTarget.scala index 3831db6b3..8fbe8b817 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/HiveDatabaseTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/HiveDatabaseTarget.scala @@ -24,6 +24,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.BaseTarget @@ -93,8 +94,9 @@ case class HiveDatabaseTarget( require(executor != null) if (!executor.catalog.databaseExists(database)) { - logger.error(s"Database '$database' provided by target '$identifier' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Database '$database' provided by target '$identifier' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/LocalTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/LocalTarget.scala index 61f25d12c..e1b04c593 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/LocalTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/LocalTarget.scala @@ -33,6 +33,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.MappingUtils import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException @@ -171,8 +172,9 @@ case class LocalTarget( val file = executor.fs.local(path) if (!file.exists()) { - logger.error(s"Verification of target '$identifier' failed - local file '$path' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - local file '$path' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/MergeFilesTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/MergeFilesTarget.scala index 35413dba4..e483c76ca 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/MergeFilesTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/MergeFilesTarget.scala @@ -28,6 +28,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.BaseTarget @@ -142,8 +143,9 @@ case class MergeFilesTarget( val file = executor.fs.file(target) if (!file.exists()) { - logger.error(s"Verification of target '$identifier' failed - file file '$target' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - file 
'$target' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/MergeTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/MergeTarget.scala index 5a7d15b8c..d31024161 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/MergeTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/MergeTarget.scala @@ -36,6 +36,7 @@ import com.dimajix.flowman.execution.Execution import com.dimajix.flowman.execution.MappingUtils import com.dimajix.flowman.execution.MergeClause import com.dimajix.flowman.execution.DeleteClause +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.InsertClause import com.dimajix.flowman.execution.UpdateClause import com.dimajix.flowman.execution.MigrationPolicy @@ -234,8 +235,9 @@ case class MergeTarget( val rel = relation.value if (rel.loaded(executor) == No) { - logger.error(s"Verification of target '$identifier' failed - relation '${relation.identifier}' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - relation '${relation.identifier}' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/PutFileTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/PutFileTarget.scala index 730fc5839..23895a21c 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/PutFileTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/PutFileTarget.scala @@ -25,6 +25,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.BaseTarget @@ -109,8 +110,9 @@ case class PutFileTarget( val file = executor.fs.file(target) if (!file.exists()) { - logger.error(s"Verification of target '$identifier' failed - file '$target' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - file '$target' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/RelationTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/RelationTarget.scala index 9adc7ceae..7d85af96d 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/RelationTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/RelationTarget.scala @@ -39,6 +39,7 @@ import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_PARALLELISM import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_REBALANCE import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.MappingUtils import com.dimajix.flowman.execution.MigrationPolicy import com.dimajix.flowman.execution.MigrationStrategy @@ -304,8 +305,9 @@ case class RelationTarget( val
policy = VerifyPolicy.ofString(execution.flowmanConf.getConf(FlowmanConf.DEFAULT_TARGET_VERIFY_POLICY)) policy match { case VerifyPolicy.EMPTY_AS_FAILURE => - logger.error(s"Verification of target '$identifier' failed - partition $partition of relation '${relation.identifier}' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - partition $partition of relation '${relation.identifier}' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) case VerifyPolicy.EMPTY_AS_SUCCESS|VerifyPolicy.EMPTY_AS_SUCCESS_WITH_ERRORS => if (rel.exists(execution) != No) { logger.warn(s"Verification of target '$identifier' failed - partition $partition of relation '${relation.identifier}' does not exist. Ignoring.") @@ -315,8 +317,9 @@ case class RelationTarget( Status.SUCCESS } else { - logger.error(s"Verification of target '$identifier' failed - relation '${relation.identifier}' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - relation '${relation.identifier}' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/SchemaTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/SchemaTarget.scala index 786919126..741f0c7ba 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/SchemaTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/SchemaTarget.scala @@ -25,6 +25,7 @@ import com.dimajix.common.Trilean import com.dimajix.common.Yes import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.BaseTarget @@ -107,8 +108,9 @@ case class SchemaTarget( val outputFile = executor.fs.file(file) if (!outputFile.exists()) { - logger.error(s"Verification of target '$identifier' failed - schema file '$file' does not exist") - throw new VerificationFailedException(identifier) + val error = s"Verification of target '$identifier' failed - schema file '$file' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/TruncateTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/TruncateTarget.scala index 2399f529c..e329d2af3 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/TruncateTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/TruncateTarget.scala @@ -27,6 +27,7 @@ import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_PARALLELISM import com.dimajix.flowman.config.FlowmanConf.DEFAULT_TARGET_REBALANCE import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Operation import com.dimajix.flowman.execution.OutputMode import com.dimajix.flowman.execution.Phase @@ -177,11 +178,13 @@ case class TruncateTarget( resolvedPartitions(rel) .find(p => rel.loaded(execution, p) == Yes) .foreach { partition => - if (partition.isEmpty) - 
logger.error(s"Verification of target '$identifier' failed - relation '$relation' not empty") - else - logger.error(s"Verification of target '$identifier' failed - partition $partition of relation '$relation' exists") - throw new VerificationFailedException(identifier) + val error = + if (partition.isEmpty) + s"Verification of target '$identifier' failed - relation '$relation' not empty" + else + s"Verification of target '$identifier' failed - partition $partition of relation '$relation' exists" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/VerifyTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/VerifyTarget.scala index 23bf91eed..3938f864c 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/VerifyTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/VerifyTarget.scala @@ -33,7 +33,6 @@ import com.dimajix.flowman.execution.ErrorMode import com.dimajix.flowman.execution.Execution import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.Status -import com.dimajix.flowman.execution.ValidationFailedException import com.dimajix.flowman.execution.VerificationFailedException import com.dimajix.flowman.model.Assertion import com.dimajix.flowman.model.BaseTarget From 02470e11d09d95494c7916ee28741fb863c0b1e5 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 10 Oct 2022 15:04:01 +0200 Subject: [PATCH 05/52] Fix wrong dependency version for CDP 7.1 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 69ebc0090..d454873c8 100644 --- a/pom.xml +++ b/pom.xml @@ -238,7 +238,7 @@ 3.1.1.${cdp.version} 3.1 3.1.3000.${cdp.version} - 3.1.30000.${cdp.version} + 3.1.3000.${cdp.version} 2.2.3.${cdp.version} 2.4.1.7.1.5.0-257 10.14.2.0 From 9b2215334b6b7d679a22e546b77bc2791ece2df3 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 11 Oct 2022 19:13:04 +0200 Subject: [PATCH 06/52] github-262 Verification of 'relation' targets should only check existence --- CHANGELOG.md | 1 + docs/releases.md | 2 + .../flowman/spec/target/RelationTarget.scala | 26 ++++++- .../spec/target/RelationTargetTest.scala | 77 +++++++++++++++++++ 4 files changed, 103 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8ca2d3768..0cfa989f5 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ * github-260: Remove hive-storage-api from several plugins and lib * github-261: Add descriptions to all pom.xml +* github-262: Verification of "relation" targets should only check existence # Version 0.28.0 - 2022-10-07 diff --git a/docs/releases.md b/docs/releases.md index 1f47d37c9..51f68c4b8 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -17,6 +17,8 @@ changes over time. 
### Version 0.28.1 * github-260 Remove hive-storage-api from several plugins and lib +* github-261: Add descriptions to all pom.xml +* github-262: Verification of "relation" targets should only check existence ### Version 0.28.0 - 2022-10-07 diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/RelationTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/RelationTarget.scala index 7d85af96d..efe077a5e 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/RelationTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/RelationTarget.scala @@ -297,8 +297,7 @@ case class RelationTarget( override def verify2(execution: Execution) : TargetResult = { require(execution != null) - val startTime = Instant.now() - Try { + def verifyWithData() : Status = { val partition = this.partition.mapValues(v => SingleValue(v)) val rel = relation.value if (rel.loaded(execution, partition) == No) { @@ -308,7 +307,7 @@ case class RelationTarget( val error = s"Verification of target '$identifier' failed - partition $partition of relation '${relation.identifier}' does not exist" logger.error(error) throw new VerificationFailedException(identifier, new ExecutionException(error)) - case VerifyPolicy.EMPTY_AS_SUCCESS|VerifyPolicy.EMPTY_AS_SUCCESS_WITH_ERRORS => + case VerifyPolicy.EMPTY_AS_SUCCESS | VerifyPolicy.EMPTY_AS_SUCCESS_WITH_ERRORS => if (rel.exists(execution) != No) { logger.warn(s"Verification of target '$identifier' failed - partition $partition of relation '${relation.identifier}' does not exist. Ignoring.") if (policy == VerifyPolicy.EMPTY_AS_SUCCESS_WITH_ERRORS) @@ -327,6 +326,27 @@ case class RelationTarget( Status.SUCCESS } } + def verifyWithoutData() : Status = { + val rel = relation.value + if (rel.exists(execution) != No) { + Status.SUCCESS + } + else { + val error = s"Verification of target '$identifier' failed - relation '${relation.identifier}' does not exist" + logger.error(error) + throw new VerificationFailedException(identifier, new ExecutionException(error)) + } + } + + val startTime = Instant.now() + Try { + if (mapping.nonEmpty) { + verifyWithData() + } + else { + verifyWithoutData() + } + } match { case Success(status) => TargetResult(this, Phase.VERIFY, status, startTime) case Failure(ex) => TargetResult(this, Phase.VERIFY, ex, startTime) diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/target/RelationTargetTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/target/RelationTargetTest.scala index b21eff608..1b5c3319e 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/target/RelationTargetTest.scala +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/target/RelationTargetTest.scala @@ -272,6 +272,83 @@ class RelationTargetTest extends AnyFlatSpec with Matchers with MockFactory with target.dirty(execution, Phase.DESTROY) should be (No) } + it should "support the whole lifecycle without a mapping" in { + val outputPath = Paths.get(tempDir.toString, "test_" + UUID.randomUUID().toString) + val spec = + s""" + |relations: + | output: + | kind: file + | location: ${outputPath.toUri} + | format: csv + | schema: + | kind: inline + | fields: + | - name: int_col + | type: integer + | - name: dbl_col + | type: double + | - name: str_col + | type: string + | + |targets: + | out: + | kind: relation + | relation: output + """.stripMargin + val project = Module.read.string(spec).toProject("project") + val session = Session.builder() + .withSparkSession(spark) + 
.withProject(project) + .build() + val execution = session.execution + val context = session.getContext(project) + + val output = context.getRelation(RelationIdentifier("output")) + val target = context.getTarget(TargetIdentifier("out")) + + // == Create ================================================================================================= + output.exists(execution) should be(No) + output.loaded(execution) should be(No) + target.dirty(execution, Phase.CREATE) should be(Yes) + target.execute(execution, Phase.CREATE) + output.exists(execution) should be(Yes) + output.loaded(execution) should be(No) + target.dirty(execution, Phase.CREATE) should be(No) + output.read(execution).count() should be(0) + + // == Build ================================================================================================== + target.dirty(execution, Phase.BUILD) should be(No) + target.execute(execution, Phase.BUILD) + output.exists(execution) should be(Yes) + output.loaded(execution) should be(No) + target.dirty(execution, Phase.BUILD) should be(No) + output.read(execution).count() should be(0) + + // == Verify ================================================================================================= + target.dirty(execution, Phase.VERIFY) should be(Yes) + target.execute(execution, Phase.VERIFY) + output.exists(execution) should be(Yes) + output.loaded(execution) should be(No) + target.dirty(execution, Phase.VERIFY) should be(Yes) + output.read(execution).count() should be(0) + + // == Truncate =============================================================================================== + target.dirty(execution, Phase.TRUNCATE) should be(No) + target.execute(execution, Phase.TRUNCATE) + output.exists(execution) should be(Yes) + output.loaded(execution) should be(No) + target.dirty(execution, Phase.TRUNCATE) should be(No) + output.read(execution).count() should be(0) + + // == Destroy ================================================================================================ + target.dirty(execution, Phase.DESTROY) should be(Yes) + target.execute(execution, Phase.DESTROY) + output.exists(execution) should be(No) + output.loaded(execution) should be(No) + target.dirty(execution, Phase.DESTROY) should be(No) + } + it should "count the number of records" in { val spark = this.spark import spark.implicits._ From ee4f5d261e836adcd004ae12d7bd6340f269c4dd Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Thu, 13 Oct 2022 10:46:34 +0200 Subject: [PATCH 07/52] github-263 Add filter condition to data quality checks in documentation --- CHANGELOG.md | 1 + .../com/dimajix/flowman/documentation/ColumnCheck.scala | 4 ++++ .../com/dimajix/flowman/documentation/SchemaCheck.scala | 5 ++++- .../scala/com/dimajix/flowman/documentation/velocity.scala | 2 ++ .../com/dimajix/flowman/documentation/html+css/project.vtl | 2 +- .../com/dimajix/flowman/documentation/html/project.vtl | 2 +- 6 files changed, 13 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 0cfa989f5..7fdf3d3d7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,7 @@ * github-260: Remove hive-storage-api from several plugins and lib * github-261: Add descriptions to all pom.xml * github-262: Verification of "relation" targets should only check existence +* github-263: Add filter condition to data quality checks in documentation # Version 0.28.0 - 2022-10-07 diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/documentation/ColumnCheck.scala 
b/flowman-core/src/main/scala/com/dimajix/flowman/documentation/ColumnCheck.scala index 237e59f7b..f4dfae551 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/documentation/ColumnCheck.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/documentation/ColumnCheck.scala @@ -49,6 +49,10 @@ final case class ColumnCheckReference( abstract class ColumnCheck extends Fragment with Product with Serializable { def name : String + def text : String = filter match { + case Some(condition) => s"$name WHERE $condition" + case None => name + } def filter : Option[String] def result : Option[CheckResult] def withResult(result:CheckResult) : ColumnCheck diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/documentation/SchemaCheck.scala b/flowman-core/src/main/scala/com/dimajix/flowman/documentation/SchemaCheck.scala index 134aa7dc4..0b8674ef4 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/documentation/SchemaCheck.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/documentation/SchemaCheck.scala @@ -16,7 +16,6 @@ package com.dimajix.flowman.documentation -import java.lang import java.util.Locale import org.apache.spark.sql.Column @@ -53,6 +52,10 @@ final case class SchemaCheckReference( abstract class SchemaCheck extends Fragment with Product with Serializable { def name : String + def text: String = filter match { + case Some(condition) => s"$name WHERE $condition" + case None => name + } def filter : Option[String] def result : Option[CheckResult] def withResult(result:CheckResult) : SchemaCheck diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/documentation/velocity.scala b/flowman-core/src/main/scala/com/dimajix/flowman/documentation/velocity.scala index 1b1ea9823..d2e17548b 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/documentation/velocity.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/documentation/velocity.scala @@ -57,6 +57,7 @@ final case class ColumnCheckWrapper(check:ColumnCheck) extends FragmentWrapper(c override def toString: String = check.name def getName() : String = check.name + def getText() : String = check.text def getResult() : CheckResultWrapper = check.result.map(CheckResultWrapper).orNull def getStatus() : String = check.result.map(_.status.toString).getOrElse("NOT_RUN") def getSuccess() : Boolean = check.result.exists(_.success) @@ -85,6 +86,7 @@ final case class SchemaCheckWrapper(check:SchemaCheck) extends FragmentWrapper(c override def toString: String = check.name def getName() : String = check.name + def getText() : String = check.text def getResult() : CheckResultWrapper = check.result.map(CheckResultWrapper).orNull def getStatus() : String = check.result.map(_.status.toString).getOrElse("NOT_RUN") def getSuccess() : Boolean = check.result.exists(_.success) diff --git a/flowman-spec/src/main/resources/com/dimajix/flowman/documentation/html+css/project.vtl b/flowman-spec/src/main/resources/com/dimajix/flowman/documentation/html+css/project.vtl index 0859da139..6a50ec6e1 100644 --- a/flowman-spec/src/main/resources/com/dimajix/flowman/documentation/html+css/project.vtl +++ b/flowman-spec/src/main/resources/com/dimajix/flowman/documentation/html+css/project.vtl @@ -12,7 +12,7 @@ #end #macro(testTitle $check) - #if(${check.description})${check.description}
#end${check.name} + #if(${check.description})${check.description}
#end${check.text} #end diff --git a/flowman-spec/src/main/resources/com/dimajix/flowman/documentation/html/project.vtl b/flowman-spec/src/main/resources/com/dimajix/flowman/documentation/html/project.vtl index 69995034e..36cdc768e 100644 --- a/flowman-spec/src/main/resources/com/dimajix/flowman/documentation/html/project.vtl +++ b/flowman-spec/src/main/resources/com/dimajix/flowman/documentation/html/project.vtl @@ -176,7 +176,7 @@ #end #macro(testTitle $check) - #if(${check.description})${check.description}
#end${check.name} + #if(${check.description})${check.description}
#end${check.text} #end From 241de4d86e800652b5cdb2559e2851f924ceaa03 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Fri, 14 Oct 2022 19:24:22 +0200 Subject: [PATCH 08/52] Update base packages in Docker image --- docker/Dockerfile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docker/Dockerfile b/docker/Dockerfile index abe79a304..6a5b4806e 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -21,6 +21,8 @@ RUN curl -sL --retry 3 "https://archive.apache.org/dist/spark/spark-${BUILD_SPAR COPY bin/ /opt/docker/bin/ COPY libexec/ /opt/docker/libexec/ +# Update OS +RUN apt-get update && apt-get upgrade --yes && apt clean # Copy and install Repository COPY $DIST_FILE /tmp/repo/flowman-dist.tar.gz From 4d41dd91316c81a2f8685d08234089c9344852e2 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 19 Oct 2022 08:01:13 +0200 Subject: [PATCH 09/52] github-265 Make JDBC dialects pluggable --- CHANGELOG.md | 1 + docs/releases.md | 2 ++ .../com.dimajix.flowman.jdbc.SqlDialect | 8 +++++ .../dimajix/flowman/jdbc/DerbyDialect.scala | 3 +- .../com/dimajix/flowman/jdbc/H2Dialect.scala | 3 +- .../dimajix/flowman/jdbc/HiveDialect.scala | 3 +- .../dimajix/flowman/jdbc/NoopDialect.scala | 3 +- .../dimajix/flowman/jdbc/OracleDialect.scala | 3 +- .../flowman/jdbc/PostgresDialect.scala | 3 +- .../dimajix/flowman/jdbc/SqlDialects.scala | 33 +++---------------- .../flowman/jdbc/SqlServerDialect.scala | 4 ++- pom.xml | 10 ++++-- 12 files changed, 39 insertions(+), 37 deletions(-) create mode 100644 flowman-core/src/main/resources/META-INF/services/com.dimajix.flowman.jdbc.SqlDialect diff --git a/CHANGELOG.md b/CHANGELOG.md index 7fdf3d3d7..b35aba058 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ * github-261: Add descriptions to all pom.xml * github-262: Verification of "relation" targets should only check existence * github-263: Add filter condition to data quality checks in documentation +* github-265: Make JDBC dialects pluggable # Version 0.28.0 - 2022-10-07 diff --git a/docs/releases.md b/docs/releases.md index 51f68c4b8..dede9c848 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -19,6 +19,8 @@ changes over time. 
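As an illustrative aside on github-265, ahead of the diffs below (a hypothetical sketch, not part of the patch): java.util.ServiceLoader can only instantiate public classes with a no-argument constructor, so every built-in dialect is converted from a Scala object into a class, while a companion object extending that class is kept so that existing references to the old singleton keep compiling. Schematically, for a purely hypothetical FooDialect:

    // Sketch of the conversion pattern applied to every dialect in this patch
    class FooDialect extends BaseDialect {
        override def canHandle(url: String): Boolean = url.startsWith("jdbc:foo")
    }
    object FooDialect extends FooDialect  // keeps existing singleton references compiling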
* github-260 Remove hive-storage-api from several plugins and lib * github-261: Add descriptions to all pom.xml * github-262: Verification of "relation" targets should only check existence +* github-263: Add filter condition to data quality checks in documentation +* github-265: Make JDBC dialects pluggable ### Version 0.28.0 - 2022-10-07 diff --git a/flowman-core/src/main/resources/META-INF/services/com.dimajix.flowman.jdbc.SqlDialect b/flowman-core/src/main/resources/META-INF/services/com.dimajix.flowman.jdbc.SqlDialect new file mode 100644 index 000000000..512f6c3a8 --- /dev/null +++ b/flowman-core/src/main/resources/META-INF/services/com.dimajix.flowman.jdbc.SqlDialect @@ -0,0 +1,8 @@ +com.dimajix.flowman.jdbc.DerbyDialect +com.dimajix.flowman.jdbc.H2Dialect +com.dimajix.flowman.jdbc.HiveDialect +com.dimajix.flowman.jdbc.MariaDialect +com.dimajix.flowman.jdbc.MySQLDialect +com.dimajix.flowman.jdbc.OracleDialect +com.dimajix.flowman.jdbc.PostgresDialect +com.dimajix.flowman.jdbc.SqlServerDialect diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/DerbyDialect.scala b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/DerbyDialect.scala index 5ebe7db46..3b2001f6c 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/DerbyDialect.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/DerbyDialect.scala @@ -32,7 +32,7 @@ import com.dimajix.flowman.types.ShortType import com.dimajix.flowman.types.StringType -object DerbyDialect extends BaseDialect { +class DerbyDialect extends BaseDialect { private object Statements extends DerbyStatements(this) override def canHandle(url: String): Boolean = url.startsWith("jdbc:derby") @@ -88,6 +88,7 @@ object DerbyDialect extends BaseDialect { override def statement : SqlStatements = Statements } +object DerbyDialect extends DerbyDialect class DerbyStatements(dialect: BaseDialect) extends BaseStatements(dialect) { diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/H2Dialect.scala b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/H2Dialect.scala index 8c8a4340c..0ff4a38a7 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/H2Dialect.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/H2Dialect.scala @@ -20,7 +20,7 @@ import java.util.Locale import com.dimajix.flowman.catalog.TableIdentifier -object H2Dialect extends BaseDialect { +class H2Dialect extends BaseDialect { private object Statements extends H2Statements(this) override def canHandle(url: String): Boolean = url.startsWith("jdbc:h2") @@ -41,6 +41,7 @@ object H2Dialect extends BaseDialect { override def statement : SqlStatements = Statements } +object H2Dialect extends H2Dialect class H2Statements(dialect: BaseDialect) extends BaseStatements(dialect) { diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/HiveDialect.scala b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/HiveDialect.scala index acdac0d39..5875a1f78 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/HiveDialect.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/HiveDialect.scala @@ -17,7 +17,7 @@ package com.dimajix.flowman.jdbc -object HiveDialect extends BaseDialect { +class HiveDialect extends BaseDialect { override def canHandle(url : String): Boolean = url.startsWith("jdbc:hive") def quote(table:org.apache.spark.sql.catalyst.TableIdentifier): String = { @@ -40,3 +40,4 @@ object HiveDialect extends BaseDialect { */ override def supportsExactViewRetrieval: Boolean = true } +object HiveDialect extends 
HiveDialect diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/NoopDialect.scala b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/NoopDialect.scala index 4f649bb61..8ea947d8b 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/NoopDialect.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/NoopDialect.scala @@ -16,7 +16,7 @@ package com.dimajix.flowman.jdbc -object NoopDialect extends BaseDialect { +class NoopDialect extends BaseDialect { /** * Check if this dialect instance can handle a certain jdbc url. * @@ -26,3 +26,4 @@ object NoopDialect extends BaseDialect { */ override def canHandle(url: String): Boolean = true } +object NoopDialect extends NoopDialect diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/OracleDialect.scala b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/OracleDialect.scala index f0b610e21..013699379 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/OracleDialect.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/OracleDialect.scala @@ -48,7 +48,7 @@ import com.dimajix.flowman.types.VarcharType import com.dimajix.flowman.util.UtcTimestamp -object OracleDialect extends BaseDialect { +class OracleDialect extends BaseDialect { private object Statements extends OracleStatements(this) private object Commands extends OracleCommands(this) @@ -115,6 +115,7 @@ object OracleDialect extends BaseDialect { override def statement : SqlStatements = Statements override def command: SqlCommands = Commands } +object OracleDialect extends OracleDialect class OracleStatements(dialect: BaseDialect) extends BaseStatements(dialect) { diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/PostgresDialect.scala b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/PostgresDialect.scala index 66cbaf28a..1b2c72251 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/PostgresDialect.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/PostgresDialect.scala @@ -37,7 +37,7 @@ import com.dimajix.flowman.types.ShortType import com.dimajix.flowman.types.StringType -object PostgresDialect extends BaseDialect { +class PostgresDialect extends BaseDialect { private object Statements extends PostgresStatements(this) private object Expressions extends PostgresExpressions(this) private object Commands extends PostgresCommands(this) @@ -83,6 +83,7 @@ object PostgresDialect extends BaseDialect { override def expr : SqlExpressions = Expressions override def command : SqlCommands = Commands } +object PostgresDialect extends PostgresDialect class PostgresExpressions(dialect: BaseDialect) extends BaseExpressions(dialect) { diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/SqlDialects.scala b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/SqlDialects.scala index f657490d6..4dc89c3d3 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/SqlDialects.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/SqlDialects.scala @@ -16,38 +16,15 @@ package com.dimajix.flowman.jdbc +import java.util.ServiceLoader +import scala.collection.JavaConverters._ object SqlDialects { - /** - * Register a dialect for use on all new matching jdbc `org.apache.spark.sql.DataFrame`. - * Reading an existing dialect will cause a move-to-front. - * - * @param dialect The new dialect. - */ - def registerDialect(dialect: SqlDialect) : Unit = { - dialects = dialect :: dialects.filterNot(_ == dialect) - } - - /** - * Unregister a dialect. 
Does nothing if the dialect is not registered. - * - * @param dialect The jdbc dialect. - */ - def unregisterDialect(dialect : SqlDialect) : Unit = { - dialects = dialects.filterNot(_ == dialect) + private lazy val dialects = { + val loader = ServiceLoader.load(classOf[SqlDialect]) + loader.iterator().asScala.toSeq } - private[this] var dialects = List[SqlDialect]() - - registerDialect(HiveDialect) - registerDialect(DerbyDialect) - registerDialect(H2Dialect) - registerDialect(MySQLDialect) - registerDialect(MariaDialect) - registerDialect(SqlServerDialect) - registerDialect(PostgresDialect) - registerDialect(OracleDialect) - /** * Fetch the JdbcDialect class corresponding to a given database url. */ diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/SqlServerDialect.scala b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/SqlServerDialect.scala index 3d3f73766..8c04a43ae 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/SqlServerDialect.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/jdbc/SqlServerDialect.scala @@ -43,7 +43,7 @@ import com.dimajix.flowman.types.TimestampType import com.dimajix.flowman.types.VarcharType -object SqlServerDialect extends BaseDialect { +class SqlServerDialect extends BaseDialect { // Special JDBC types in Microsoft SQL Server. // https://github.com/microsoft/mssql-jdbc/blob/v8.2.2/src/main/java/microsoft/sql/Types.java private object SpecificTypes { @@ -114,6 +114,8 @@ object SqlServerDialect extends BaseDialect { override def expr : SqlExpressions = Expressions override def command : SqlCommands = Commands } +object SqlServerDialect extends SqlServerDialect + class MsSqlServerExpressions(dialect: BaseDialect) extends BaseExpressions(dialect) { override def primaryKey(columns: Seq[String], clustered:Boolean): String = { diff --git a/pom.xml b/pom.xml index d454873c8..356f45da7 100644 --- a/pom.xml +++ b/pom.xml @@ -7,8 +7,14 @@ flowman-root 0.28.1-SNAPSHOT pom + Flowman root pom - A Spark based ETL tool + + Flowman is a Big Data build tool powered by Apache Spark. It simplifies the development of complex data + transformations by employing a purely declarative workflow. Moreover, Flowman can also take over the complete + lifecycle management of outgoing data sinks, be they Hive tables, JDBC tables or simply files. This + lifecycle management includes table creation, automatic migration, populating and optional clean up.
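As an illustrative aside (hypothetical, not part of the patch): with the ServiceLoader-based SqlDialects shown above, a plugin jar can contribute additional dialects without touching flowman-core; it only needs to ship its own service descriptor META-INF/services/com.dimajix.flowman.jdbc.SqlDialect listing its implementation classes, mirroring the descriptor added for the built-in dialects. URL resolution then presumably reduces to scanning the loaded instances, along the lines of this hedged sketch:

    // Hypothetical lookup; NoopDialect.canHandle always returns true,
    // so it can serve as the fallback for URLs that no dialect claims.
    def get(url: String): SqlDialect =
        dialects.find(_.canHandle(url)).getOrElse(NoopDialect)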
+ https://flowman.io 2018 @@ -25,7 +31,7 @@ Kaya Kupferschmidt k.kupferschmidt@dimajix.de dimajix - https://www.dimajix.com + https://www.dimajix.de From 78f976ac815d2ce77b63e6018af50c7e8bfc460d Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 19 Oct 2022 08:32:40 +0200 Subject: [PATCH 10/52] github-264 Create jars again for all plugins --- CHANGELOG.md | 1 + docs/releases.md | 1 + flowman-dist/pom.xml | 2 +- flowman-dist/src/{main => }/assembly/assembly.xml | 0 flowman-parent/pom.xml | 2 +- .../aws/src/{main => }/assembly/assembly.xml | 0 flowman-plugins/azure/pom.xml | 15 --------------- .../azure/src/{main => }/assembly/assembly.xml | 0 .../delta/src/{main => }/assembly/assembly.xml | 0 .../hbase/src/{main => }/assembly/assembly.xml | 0 .../impala/src/{main => }/assembly/assembly.xml | 0 .../json/src/{main => }/assembly/assembly.xml | 0 .../kafka/src/{main => }/assembly/assembly.xml | 0 flowman-plugins/mariadb/pom.xml | 15 --------------- .../mariadb/src/{main => }/assembly/assembly.xml | 0 .../src/{main => }/assembly/assembly.xml | 0 flowman-plugins/mysql/pom.xml | 15 --------------- .../mysql/src/{main => }/assembly/assembly.xml | 0 .../openapi/src/{main => }/assembly/assembly.xml | 0 flowman-plugins/oracle/pom.xml | 15 --------------- .../oracle/src/{main => }/assembly/assembly.xml | 0 flowman-plugins/postgresql/pom.xml | 15 --------------- .../src/{main => }/assembly/assembly.xml | 0 .../sftp/src/{main => }/assembly/assembly.xml | 0 .../swagger/src/{main => }/assembly/assembly.xml | 0 .../src/{main => }/assembly/assembly.xml | 0 pom.xml | 2 +- 27 files changed, 5 insertions(+), 78 deletions(-) rename flowman-dist/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/aws/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/azure/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/delta/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/hbase/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/impala/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/json/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/kafka/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/mariadb/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/mssqlserver/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/mysql/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/openapi/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/oracle/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/postgresql/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/sftp/src/{main => }/assembly/assembly.xml (100%) rename flowman-plugins/swagger/src/{main => }/assembly/assembly.xml (100%) rename flowman-yaml-schema/src/{main => }/assembly/assembly.xml (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md index b35aba058..d777952ec 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ * github-262: Verification of "relation" targets should only check existence * github-263: Add filter condition to data quality checks in documentation * github-265: Make JDBC dialects pluggable +* github-264: Provide "jars" for all plugins # Version 0.28.0 - 2022-10-07 diff --git a/docs/releases.md b/docs/releases.md index dede9c848..ef188a63d 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -21,6 +21,7 @@ changes over time. 
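As an illustrative aside on github-264 (a rough sketch, not taken verbatim from the patch): the POM changes below drop the pom packaging so that each plugin produces a regular jar again, which also makes the explicit maven-resources-plugin execution unnecessary. The assembly descriptor moves from src/main/assembly to src/assembly so that it is no longer picked up as a resource, and the assembly plugin configuration points at the new location, roughly:

    <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
            <descriptors>
                <!-- moved out of src/main so it is not treated as a resource -->
                <descriptor>src/assembly/assembly.xml</descriptor>
            </descriptors>
        </configuration>
    </plugin>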
* github-262: Verification of "relation" targets should only check existence * github-263: Add filter condition to data quality checks in documentation * github-265: Make JDBC dialects pluggable +* github-264: Provide "jars" for all plugins ### Version 0.28.0 - 2022-10-07 diff --git a/flowman-dist/pom.xml b/flowman-dist/pom.xml index cd7f30746..0289c3d4d 100644 --- a/flowman-dist/pom.xml +++ b/flowman-dist/pom.xml @@ -265,7 +265,7 @@ ${project.artifactId}-${flowman.dist.label} - src/main/assembly/assembly.xml + src/assembly/assembly.xml diff --git a/flowman-dist/src/main/assembly/assembly.xml b/flowman-dist/src/assembly/assembly.xml similarity index 100% rename from flowman-dist/src/main/assembly/assembly.xml rename to flowman-dist/src/assembly/assembly.xml diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml index 98d962f30..39bfe0dab 100644 --- a/flowman-parent/pom.xml +++ b/flowman-parent/pom.xml @@ -225,7 +225,7 @@ ${project.artifactId}-${project.version} - src/main/assembly/assembly.xml + src/assembly/assembly.xml diff --git a/flowman-plugins/aws/src/main/assembly/assembly.xml b/flowman-plugins/aws/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/aws/src/main/assembly/assembly.xml rename to flowman-plugins/aws/src/assembly/assembly.xml diff --git a/flowman-plugins/azure/pom.xml b/flowman-plugins/azure/pom.xml index d584599a4..ce414ab54 100644 --- a/flowman-plugins/azure/pom.xml +++ b/flowman-plugins/azure/pom.xml @@ -6,7 +6,6 @@ flowman-plugin-azure Flowman Azure plugin Flowman Azure plugin - pom com.dimajix.flowman @@ -43,20 +42,6 @@ - - - org.apache.maven.plugins - maven-resources-plugin - - - default-resources - process-resources - - resources - - - - org.apache.maven.plugins maven-assembly-plugin diff --git a/flowman-plugins/azure/src/main/assembly/assembly.xml b/flowman-plugins/azure/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/azure/src/main/assembly/assembly.xml rename to flowman-plugins/azure/src/assembly/assembly.xml diff --git a/flowman-plugins/delta/src/main/assembly/assembly.xml b/flowman-plugins/delta/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/delta/src/main/assembly/assembly.xml rename to flowman-plugins/delta/src/assembly/assembly.xml diff --git a/flowman-plugins/hbase/src/main/assembly/assembly.xml b/flowman-plugins/hbase/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/hbase/src/main/assembly/assembly.xml rename to flowman-plugins/hbase/src/assembly/assembly.xml diff --git a/flowman-plugins/impala/src/main/assembly/assembly.xml b/flowman-plugins/impala/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/impala/src/main/assembly/assembly.xml rename to flowman-plugins/impala/src/assembly/assembly.xml diff --git a/flowman-plugins/json/src/main/assembly/assembly.xml b/flowman-plugins/json/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/json/src/main/assembly/assembly.xml rename to flowman-plugins/json/src/assembly/assembly.xml diff --git a/flowman-plugins/kafka/src/main/assembly/assembly.xml b/flowman-plugins/kafka/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/kafka/src/main/assembly/assembly.xml rename to flowman-plugins/kafka/src/assembly/assembly.xml diff --git a/flowman-plugins/mariadb/pom.xml b/flowman-plugins/mariadb/pom.xml index b3b9b7eb0..f605c4b9d 100644 --- a/flowman-plugins/mariadb/pom.xml +++ b/flowman-plugins/mariadb/pom.xml @@ -6,7 +6,6 @@ 
flowman-plugin-mariadb Flowman MariaDB plugin Flowman MariaDB plugin - pom com.dimajix.flowman @@ -30,20 +29,6 @@ - - - org.apache.maven.plugins - maven-resources-plugin - - - default-resources - process-resources - - resources - - - - org.apache.maven.plugins maven-assembly-plugin diff --git a/flowman-plugins/mariadb/src/main/assembly/assembly.xml b/flowman-plugins/mariadb/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/mariadb/src/main/assembly/assembly.xml rename to flowman-plugins/mariadb/src/assembly/assembly.xml diff --git a/flowman-plugins/mssqlserver/src/main/assembly/assembly.xml b/flowman-plugins/mssqlserver/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/mssqlserver/src/main/assembly/assembly.xml rename to flowman-plugins/mssqlserver/src/assembly/assembly.xml diff --git a/flowman-plugins/mysql/pom.xml b/flowman-plugins/mysql/pom.xml index 4dc91d603..30f27cead 100644 --- a/flowman-plugins/mysql/pom.xml +++ b/flowman-plugins/mysql/pom.xml @@ -6,7 +6,6 @@ flowman-plugin-mysql Flowman MySQL plugin Flowman MySQL plugin - pom com.dimajix.flowman @@ -30,20 +29,6 @@ - - - org.apache.maven.plugins - maven-resources-plugin - - - default-resources - process-resources - - resources - - - - org.apache.maven.plugins maven-assembly-plugin diff --git a/flowman-plugins/mysql/src/main/assembly/assembly.xml b/flowman-plugins/mysql/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/mysql/src/main/assembly/assembly.xml rename to flowman-plugins/mysql/src/assembly/assembly.xml diff --git a/flowman-plugins/openapi/src/main/assembly/assembly.xml b/flowman-plugins/openapi/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/openapi/src/main/assembly/assembly.xml rename to flowman-plugins/openapi/src/assembly/assembly.xml diff --git a/flowman-plugins/oracle/pom.xml b/flowman-plugins/oracle/pom.xml index cbdb17ec0..2686cc762 100644 --- a/flowman-plugins/oracle/pom.xml +++ b/flowman-plugins/oracle/pom.xml @@ -6,7 +6,6 @@ flowman-plugin-oracle Flowman Oracle plugin Flowman Oracle plugin - pom com.dimajix.flowman @@ -30,20 +29,6 @@ - - - org.apache.maven.plugins - maven-resources-plugin - - - default-resources - process-resources - - resources - - - - org.apache.maven.plugins maven-assembly-plugin diff --git a/flowman-plugins/oracle/src/main/assembly/assembly.xml b/flowman-plugins/oracle/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/oracle/src/main/assembly/assembly.xml rename to flowman-plugins/oracle/src/assembly/assembly.xml diff --git a/flowman-plugins/postgresql/pom.xml b/flowman-plugins/postgresql/pom.xml index 6b552858f..9a85cc99b 100644 --- a/flowman-plugins/postgresql/pom.xml +++ b/flowman-plugins/postgresql/pom.xml @@ -6,7 +6,6 @@ flowman-plugin-postgresql Flowman PostgreSQL plugin Flowman PostgreSQL plugin - pom com.dimajix.flowman @@ -30,20 +29,6 @@ - - - org.apache.maven.plugins - maven-resources-plugin - - - default-resources - process-resources - - resources - - - - org.apache.maven.plugins maven-assembly-plugin diff --git a/flowman-plugins/postgresql/src/main/assembly/assembly.xml b/flowman-plugins/postgresql/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/postgresql/src/main/assembly/assembly.xml rename to flowman-plugins/postgresql/src/assembly/assembly.xml diff --git a/flowman-plugins/sftp/src/main/assembly/assembly.xml b/flowman-plugins/sftp/src/assembly/assembly.xml similarity index 100% rename from 
flowman-plugins/sftp/src/main/assembly/assembly.xml rename to flowman-plugins/sftp/src/assembly/assembly.xml diff --git a/flowman-plugins/swagger/src/main/assembly/assembly.xml b/flowman-plugins/swagger/src/assembly/assembly.xml similarity index 100% rename from flowman-plugins/swagger/src/main/assembly/assembly.xml rename to flowman-plugins/swagger/src/assembly/assembly.xml diff --git a/flowman-yaml-schema/src/main/assembly/assembly.xml b/flowman-yaml-schema/src/assembly/assembly.xml similarity index 100% rename from flowman-yaml-schema/src/main/assembly/assembly.xml rename to flowman-yaml-schema/src/assembly/assembly.xml diff --git a/pom.xml b/pom.xml index 356f45da7..50fcbd159 100644 --- a/pom.xml +++ b/pom.xml @@ -899,7 +899,7 @@ ${project.artifactId}-${project.version} - src/main/assembly/assembly.xml + src/assembly/assembly.xml From 998690628aaa558aa04c72ff69fce2db23c0f333 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Thu, 20 Oct 2022 15:58:27 +0200 Subject: [PATCH 11/52] github-267 Add new flowman-spark-dependencies module to simplify dependency management --- CHANGELOG.md | 3 +- docker/pom.xml | 2 +- docs/releases.md | 3 +- flowman-archetype-quickstart/pom.xml | 2 +- .../resources/archetype-resources/pom.xml | 33 +--- flowman-client/pom.xml | 2 +- flowman-common/pom.xml | 2 +- flowman-core/pom.xml | 66 ++------ flowman-dist/pom.xml | 2 +- flowman-dist/src/assembly/assembly.xml | 1 + flowman-dsl/pom.xml | 23 +-- flowman-hub/pom.xml | 20 +-- flowman-parent/pom.xml | 28 +++- flowman-plugins/aws/pom.xml | 10 +- flowman-plugins/azure/pom.xml | 2 +- flowman-plugins/delta/pom.xml | 35 +---- flowman-plugins/impala/pom.xml | 24 +-- flowman-plugins/json/pom.xml | 29 +--- flowman-plugins/kafka/pom.xml | 35 +---- flowman-plugins/mariadb/pom.xml | 2 +- flowman-plugins/mssqlserver/pom.xml | 35 +---- flowman-plugins/mysql/pom.xml | 2 +- flowman-plugins/openapi/pom.xml | 25 +-- flowman-plugins/oracle/pom.xml | 2 +- flowman-plugins/postgresql/pom.xml | 2 +- flowman-plugins/sftp/pom.xml | 35 +---- flowman-plugins/swagger/pom.xml | 25 +-- flowman-scalatest-compat/pom.xml | 2 +- flowman-server-ui/pom.xml | 2 +- flowman-server/pom.xml | 31 +--- flowman-spark-dependencies/pom.xml | 142 ++++++++++++++++++ flowman-spark-extensions/pom.xml | 28 +--- flowman-spark-testing/pom.xml | 23 +-- flowman-spec/pom.xml | 28 +--- flowman-studio-ui/pom.xml | 2 +- flowman-studio/pom.xml | 31 +--- flowman-testing/pom.xml | 31 +--- flowman-tools/pom.xml | 32 +--- flowman-yaml-schema/pom.xml | 20 ++- pom.xml | 17 ++- 40 files changed, 353 insertions(+), 486 deletions(-) create mode 100644 flowman-spark-dependencies/pom.xml diff --git a/CHANGELOG.md b/CHANGELOG.md index d777952ec..13f4cdc75 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -# Version 0.28.1 +# Version 0.29.0 * github-260: Remove hive-storage-api from several plugins and lib * github-261: Add descriptions to all pom.xml @@ -6,6 +6,7 @@ * github-263: Add filter condition to data quality checks in documentation * github-265: Make JDBC dialects pluggable * github-264: Provide "jars" for all plugins +* github-267: Add new flowman-spark-dependencies module to simplify dependency management # Version 0.28.0 - 2022-10-07 diff --git a/docker/pom.xml b/docker/pom.xml index 862be5341..6c5dd8add 100644 --- a/docker/pom.xml +++ b/docker/pom.xml @@ -11,7 +11,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml diff --git a/docs/releases.md b/docs/releases.md index ef188a63d..c29f3d613 100644 --- a/docs/releases.md +++ 
b/docs/releases.md @@ -14,7 +14,7 @@ The following gives an (incomplete) list of past releases of the last 12 months. changes over time. -### Version 0.28.1 +### Version 0.29.0 * github-260 Remove hive-storage-api from several plugins and lib * github-261: Add descriptions to all pom.xml @@ -22,6 +22,7 @@ changes over time. * github-263: Add filter condition to data quality checks in documentation * github-265: Make JDBC dialects pluggable * github-264: Provide "jars" for all plugins +* github-267: Add new flowman-spark-dependencies module to simplify dependency management ### Version 0.28.0 - 2022-10-07 diff --git a/flowman-archetype-quickstart/pom.xml b/flowman-archetype-quickstart/pom.xml index ae3c45994..91cff224d 100644 --- a/flowman-archetype-quickstart/pom.xml +++ b/flowman-archetype-quickstart/pom.xml @@ -9,7 +9,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml diff --git a/flowman-archetype-quickstart/src/main/resources/archetype-resources/pom.xml b/flowman-archetype-quickstart/src/main/resources/archetype-resources/pom.xml index 026b2b1f9..258d546bd 100644 --- a/flowman-archetype-quickstart/src/main/resources/archetype-resources/pom.xml +++ b/flowman-archetype-quickstart/src/main/resources/archetype-resources/pom.xml @@ -138,37 +138,12 @@ com.dimajix.flowman - flowman-tools - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - - - - org.apache.hadoop - hadoop-client - - - org.apache.spark - spark-sql_${scala.api_version} + flowman-spark-dependencies + pom - org.apache.spark - spark-hive_${scala.api_version} - - - commons-beanutils - commons-beanutils - - - commons-collections - commons-collections - - - commons-logging - commons-logging + com.dimajix.flowman + flowman-tools diff --git a/flowman-client/pom.xml b/flowman-client/pom.xml index 426c0c66c..0c6599a04 100644 --- a/flowman-client/pom.xml +++ b/flowman-client/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml diff --git a/flowman-common/pom.xml b/flowman-common/pom.xml index f8386be60..f4dfe0349 100644 --- a/flowman-common/pom.xml +++ b/flowman-common/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml diff --git a/flowman-core/pom.xml b/flowman-core/pom.xml index 3fdf0eabf..220266e86 100644 --- a/flowman-core/pom.xml +++ b/flowman-core/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -71,71 +71,29 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + com.dimajix.flowman flowman-spark-extensions - com.dimajix.flowman flowman-spark-testing - com.dimajix.flowman flowman-common - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} - - org.apache.spark spark-avro_${scala.api_version} - - com.fasterxml.jackson.core - jackson-core - - - - com.fasterxml.jackson.core - jackson-databind - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - - - com.fasterxml.jackson.module - jackson-module-paranamer - - - - com.fasterxml.jackson.module - jackson-module-scala_${scala.api_version} - - com.kjetland mbknor-jackson-jsonschema_${scala.api_version} @@ -162,7 +120,8 @@ com.jayway.jsonpath json-path - 2.6.0 + 2.7.0 + compile org.apache.tapestry @@ -190,6 +149,7 @@ com.jsuereth scala-arm_${scala.api_version} + compile @@ -202,15 +162,10 @@ 
io.github.classgraph classgraph - 4.8.138 + 4.8.149 compile - - org.apache.derby - derby - - com.h2database h2 @@ -221,7 +176,6 @@ org.scalatest scalatest_${scala.api_version} - org.scalamock scalamock_${scala.api_version} diff --git a/flowman-dist/pom.xml b/flowman-dist/pom.xml index 0289c3d4d..e01bb1fd8 100644 --- a/flowman-dist/pom.xml +++ b/flowman-dist/pom.xml @@ -11,7 +11,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml diff --git a/flowman-dist/src/assembly/assembly.xml b/flowman-dist/src/assembly/assembly.xml index 2b68a7280..99ec18b62 100644 --- a/flowman-dist/src/assembly/assembly.xml +++ b/flowman-dist/src/assembly/assembly.xml @@ -46,6 +46,7 @@ org.apache.velocity:velocity-engine-core + com.dimajix.flowman:flowman-spark-dependencies lib true diff --git a/flowman-dsl/pom.xml b/flowman-dsl/pom.xml index ce6ee2f96..714a87c99 100644 --- a/flowman-dsl/pom.xml +++ b/flowman-dsl/pom.xml @@ -10,7 +10,7 @@ flowman-root com.dimajix.flowman - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -30,27 +30,16 @@ com.dimajix.flowman - flowman-spec + flowman-spark-dependencies + pom - com.dimajix.flowman - flowman-spark-testing - - - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-sql_${scala.api_version} + flowman-spec - - org.apache.spark - spark-hive_${scala.api_version} + com.dimajix.flowman + flowman-spark-testing diff --git a/flowman-hub/pom.xml b/flowman-hub/pom.xml index af86d5ac6..b5721eef0 100644 --- a/flowman-hub/pom.xml +++ b/flowman-hub/pom.xml @@ -10,7 +10,7 @@ flowman-root com.dimajix.flowman - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -102,27 +102,11 @@ com.dimajix.flowman flowman-common - com.dimajix.flowman flowman-scalatest-compat - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - args4j args4j @@ -132,7 +116,6 @@ com.typesafe.akka akka-http_${scala.api_version} - com.typesafe.akka akka-http-spray-json_${scala.api_version} @@ -142,7 +125,6 @@ com.github.swagger-akka-http swagger-akka-http_${scala.api_version} - org.webjars swagger-ui diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml index 39bfe0dab..b9c49dfe4 100644 --- a/flowman-parent/pom.xml +++ b/flowman-parent/pom.xml @@ -11,7 +11,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -268,6 +268,13 @@ + + com.dimajix.flowman + flowman-spark-dependencies + ${flowman.version} + pom + provided + com.dimajix.flowman flowman-spark-extensions @@ -964,6 +971,25 @@ provided + + io.netty + netty + ${netty.version} + provided + + + io.netty + netty-all + ${netty-all.version} + provided + + + io.netty + netty-handler + ${netty-all.version} + provided + + com.google.re2j re2j diff --git a/flowman-plugins/aws/pom.xml b/flowman-plugins/aws/pom.xml index a5cbc587a..0e388243b 100644 --- a/flowman-plugins/aws/pom.xml +++ b/flowman-plugins/aws/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -259,15 +259,21 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-core provided - com.dimajix.flowman flowman-scalatest-compat + test diff --git a/flowman-plugins/azure/pom.xml b/flowman-plugins/azure/pom.xml index ce414ab54..7edfbf405 100644 --- a/flowman-plugins/azure/pom.xml +++ b/flowman-plugins/azure/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml diff 
--git a/flowman-plugins/delta/pom.xml b/flowman-plugins/delta/pom.xml index c20068bbe..d0e2b9f03 100644 --- a/flowman-plugins/delta/pom.xml +++ b/flowman-plugins/delta/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -166,42 +166,23 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-spec + provided - com.dimajix.flowman flowman-spark-testing test - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - io.delta delta-core_${scala.api_version} diff --git a/flowman-plugins/impala/pom.xml b/flowman-plugins/impala/pom.xml index bd38bb073..b096d6eac 100644 --- a/flowman-plugins/impala/pom.xml +++ b/flowman-plugins/impala/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -52,33 +52,23 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-spec provided - com.dimajix.flowman flowman-spark-testing test - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - org.apache.hive diff --git a/flowman-plugins/json/pom.xml b/flowman-plugins/json/pom.xml index eb269a29c..b03792a45 100644 --- a/flowman-plugins/json/pom.xml +++ b/flowman-plugins/json/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -79,38 +79,23 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-spec provided - com.dimajix.flowman flowman-spark-testing test - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - org.json json diff --git a/flowman-plugins/kafka/pom.xml b/flowman-plugins/kafka/pom.xml index 36d1c78fc..57ec5ce51 100644 --- a/flowman-plugins/kafka/pom.xml +++ b/flowman-plugins/kafka/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -44,47 +44,28 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-spec + provided - com.dimajix.flowman flowman-spark-testing test - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-hive_${scala.api_version} - - org.apache.spark spark-sql-kafka-0-10_${scala.api_version} - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - org.apache.kafka kafka_${scala.api_version} diff --git a/flowman-plugins/mariadb/pom.xml b/flowman-plugins/mariadb/pom.xml index f605c4b9d..cae9532ca 100644 --- a/flowman-plugins/mariadb/pom.xml +++ b/flowman-plugins/mariadb/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/mssqlserver/pom.xml b/flowman-plugins/mssqlserver/pom.xml index f4767e5d4..581d374ef 100644 --- a/flowman-plugins/mssqlserver/pom.xml +++ b/flowman-plugins/mssqlserver/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 
0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -118,42 +118,23 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-spec + provided - com.dimajix.flowman flowman-spark-testing test - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - com.microsoft.sqlserver mssql-jdbc diff --git a/flowman-plugins/mysql/pom.xml b/flowman-plugins/mysql/pom.xml index 30f27cead..765fd3f84 100644 --- a/flowman-plugins/mysql/pom.xml +++ b/flowman-plugins/mysql/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/openapi/pom.xml b/flowman-plugins/openapi/pom.xml index a0a3a6546..8194d62ee 100644 --- a/flowman-plugins/openapi/pom.xml +++ b/flowman-plugins/openapi/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -44,39 +44,28 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-spec provided - com.dimajix.flowman flowman-dsl provided - com.dimajix.flowman flowman-spark-testing test - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} - - io.swagger.parser.v3 swagger-parser diff --git a/flowman-plugins/oracle/pom.xml b/flowman-plugins/oracle/pom.xml index 2686cc762..0476d1a7d 100644 --- a/flowman-plugins/oracle/pom.xml +++ b/flowman-plugins/oracle/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/postgresql/pom.xml b/flowman-plugins/postgresql/pom.xml index 9a85cc99b..038b80e32 100644 --- a/flowman-plugins/postgresql/pom.xml +++ b/flowman-plugins/postgresql/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml diff --git a/flowman-plugins/sftp/pom.xml b/flowman-plugins/sftp/pom.xml index 3c4947699..ab0383d0d 100644 --- a/flowman-plugins/sftp/pom.xml +++ b/flowman-plugins/sftp/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -44,11 +44,17 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-spec + provided - com.dimajix.flowman flowman-spark-testing @@ -61,31 +67,6 @@ 262 - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-hive_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - org.scalatest scalatest_${scala.api_version} diff --git a/flowman-plugins/swagger/pom.xml b/flowman-plugins/swagger/pom.xml index 6147b60ce..5ca38c4cf 100644 --- a/flowman-plugins/swagger/pom.xml +++ b/flowman-plugins/swagger/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../../pom.xml @@ -44,39 +44,28 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + provided + com.dimajix.flowman flowman-spec provided - com.dimajix.flowman flowman-dsl provided - com.dimajix.flowman flowman-spark-testing test - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - 
spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} - - io.swagger swagger-parser diff --git a/flowman-scalatest-compat/pom.xml b/flowman-scalatest-compat/pom.xml index d8968344e..3aabc86ee 100644 --- a/flowman-scalatest-compat/pom.xml +++ b/flowman-scalatest-compat/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml diff --git a/flowman-server-ui/pom.xml b/flowman-server-ui/pom.xml index 6aa64b64b..37fb5cd0f 100644 --- a/flowman-server-ui/pom.xml +++ b/flowman-server-ui/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml diff --git a/flowman-server/pom.xml b/flowman-server/pom.xml index bfebf8598..e488d0fee 100644 --- a/flowman-server/pom.xml +++ b/flowman-server/pom.xml @@ -10,7 +10,7 @@ flowman-root com.dimajix.flowman - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -42,7 +42,7 @@ , lib - json,org.everit.json.schema,velocity-engine-core + json,org.everit.json.schema,velocity-engine-core,flowman-spark-dependencies @@ -98,46 +98,29 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + com.dimajix.flowman flowman-tools - com.dimajix.flowman flowman-server-ui - com.dimajix.flowman flowman-scalatest-compat - - org.apache.hadoop - hadoop-client - - org.apache.derby derby - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - args4j args4j diff --git a/flowman-spark-dependencies/pom.xml b/flowman-spark-dependencies/pom.xml new file mode 100644 index 000000000..ba1dc7414 --- /dev/null +++ b/flowman-spark-dependencies/pom.xml @@ -0,0 +1,142 @@ + + + 4.0.0 + flowman-spark-dependencies + Flowman Spark dependencies + + This package contains all Spark dependencies required by Flowman. You should add this dependency to your + projects when you want to link against Flowman. Otherwise, the Spark dependencies will not be added + automatically, since Flowman has marked them as "provided" dependencies. 
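For illustration (not part of the patch): a downstream project would consume this module roughly as follows, matching how the quickstart archetype above was changed, and assuming the version is not already managed by a Flowman parent POM:

    <dependency>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-spark-dependencies</artifactId>
        <version>0.29.0-SNAPSHOT</version>
        <type>pom</type>
    </dependency>

Declaring it with type pom pulls in the Spark, Hadoop and Jackson stack transitively, which Flowman itself only references with provided scope.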
+ + pom + + + com.dimajix.flowman + flowman-root + 0.29.0-SNAPSHOT + ../pom.xml + + + + + org.apache.hadoop + hadoop-client + compile + + + org.apache.hadoop + hadoop-common + compile + + + org.apache.spark + spark-core_${scala.api_version} + compile + + + org.apache.spark + spark-sql_${scala.api_version} + compile + + + org.apache.spark + spark-hive_${scala.api_version} + compile + + + + com.fasterxml.jackson.core + jackson-core + compile + + + com.fasterxml.jackson.core + jackson-annotations + compile + + + com.fasterxml.jackson.core + jackson-databind + compile + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + compile + + + com.fasterxml.jackson.module + jackson-module-paranamer + compile + + + com.fasterxml.jackson.module + jackson-module-scala_${scala.api_version} + compile + + + org.yaml + snakeyaml + compile + + + + org.apache.commons + commons-lang3 + compile + + + org.apache.commons + commons-compress + compile + + + commons-codec + commons-codec + compile + + + commons-httpclient + commons-httpclient + compile + + + commons-cli + commons-cli + compile + + + commons-io + commons-io + compile + + + commons-beanutils + commons-beanutils + compile + + + commons-collections + commons-collections + compile + + + commons-lang + commons-lang + compile + + + commons-logging + commons-logging + compile + + + + org.apache.derby + derby + compile + + + diff --git a/flowman-spark-extensions/pom.xml b/flowman-spark-extensions/pom.xml index 783dba12c..86627a0a9 100644 --- a/flowman-spark-extensions/pom.xml +++ b/flowman-spark-extensions/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -63,32 +63,16 @@ com.dimajix.flowman - flowman-scalatest-compat + flowman-spark-dependencies + pom - com.dimajix.flowman - flowman-spark-testing - - - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} + flowman-scalatest-compat - - org.apache.spark - spark-hive_${scala.api_version} + com.dimajix.flowman + flowman-spark-testing diff --git a/flowman-spark-testing/pom.xml b/flowman-spark-testing/pom.xml index e90681131..4f26d6c08 100644 --- a/flowman-spark-testing/pom.xml +++ b/flowman-spark-testing/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -29,25 +29,10 @@ - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-core_${scala.api_version} - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} + com.dimajix.flowman + flowman-spark-dependencies + pom - com.dimajix.flowman flowman-scalatest-compat diff --git a/flowman-spec/pom.xml b/flowman-spec/pom.xml index 5db2dfd94..cc5417ba7 100644 --- a/flowman-spec/pom.xml +++ b/flowman-spec/pom.xml @@ -10,7 +10,7 @@ flowman-root com.dimajix.flowman - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -61,32 +61,16 @@ com.dimajix.flowman - flowman-core + flowman-spark-dependencies + pom - com.dimajix.flowman - flowman-spark-testing - - - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} + flowman-core - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml + com.dimajix.flowman + flowman-spark-testing diff --git a/flowman-studio-ui/pom.xml b/flowman-studio-ui/pom.xml index 2f5d1fd9d..f541c1132 100644 --- a/flowman-studio-ui/pom.xml +++ 
b/flowman-studio-ui/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml diff --git a/flowman-studio/pom.xml b/flowman-studio/pom.xml index dbaf514f0..d2b7cf0ec 100644 --- a/flowman-studio/pom.xml +++ b/flowman-studio/pom.xml @@ -10,7 +10,7 @@ flowman-root com.dimajix.flowman - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -42,7 +42,7 @@ , lib - json,org.everit.json.schema,velocity-engine-core + json,org.everit.json.schema,velocity-engine-core,flowman-spark-dependencies @@ -98,41 +98,24 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + com.dimajix.flowman flowman-tools - com.dimajix.flowman flowman-studio-ui - com.dimajix.flowman flowman-scalatest-compat - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - args4j args4j diff --git a/flowman-testing/pom.xml b/flowman-testing/pom.xml index f2aaaa3ce..cceedf4e6 100644 --- a/flowman-testing/pom.xml +++ b/flowman-testing/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -38,52 +38,33 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + com.dimajix.flowman flowman-scalatest-compat compile - com.dimajix.flowman flowman-spec - com.dimajix.flowman flowman-dsl - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} - - org.scalatest scalatest_${scala.api_version} - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - org.junit.jupiter junit-jupiter-api - org.assertj assertj-core diff --git a/flowman-tools/pom.xml b/flowman-tools/pom.xml index 60bccab7d..4a23156aa 100644 --- a/flowman-tools/pom.xml +++ b/flowman-tools/pom.xml @@ -10,7 +10,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -42,7 +42,7 @@ , lib - json,org.everit.json.schema,velocity-engine-core + json,org.everit.json.schema,velocity-engine-core,flowman-spark-dependencies @@ -98,47 +98,29 @@ + + com.dimajix.flowman + flowman-spark-dependencies + pom + com.dimajix.flowman flowman-spec - com.dimajix.flowman flowman-dsl - com.dimajix.flowman flowman-scalatest-compat - - org.apache.hadoop - hadoop-client - - - - org.apache.spark - spark-sql_${scala.api_version} - - - - org.apache.spark - spark-hive_${scala.api_version} - - - - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml - - org.jline jline-terminal 3.21.0 - org.jline jline-reader diff --git a/flowman-yaml-schema/pom.xml b/flowman-yaml-schema/pom.xml index 4ea9423f8..7a404c2a4 100644 --- a/flowman-yaml-schema/pom.xml +++ b/flowman-yaml-schema/pom.xml @@ -11,7 +11,7 @@ com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT ../pom.xml @@ -64,6 +64,11 @@ jackson-core ${jackson.version} + + com.fasterxml.jackson.module + jackson-module-scala_${scala.api_version} + ${jackson.version} + jakarta.validation jakarta.validation-api @@ -112,6 +117,19 @@ org.apache.spark spark-sql_${scala.api_version} + + org.apache.spark + spark-hive_${scala.api_version} + + + + org.slf4j + slf4j-api + + + org.apache.logging.log4j + log4j-slf4j-impl + diff --git a/pom.xml b/pom.xml index 50fcbd159..2328f1702 100644 --- a/pom.xml +++ b/pom.xml @@ -5,7 +5,7 @@ 4.0.0 com.dimajix.flowman flowman-root - 0.28.1-SNAPSHOT + 0.29.0-SNAPSHOT pom Flowman root pom @@ -552,6 +552,7 @@ 
flowman-scalatest-compat + flowman-spark-dependencies flowman-spark-testing flowman-spark-extensions flowman-common @@ -999,6 +1000,13 @@ + + com.dimajix.flowman + flowman-spark-dependencies + ${project.version} + pom + provided + com.dimajix.flowman flowman-scalatest-compat @@ -1512,13 +1520,18 @@ ${netty.version} provided - io.netty netty-all ${netty-all.version} provided + + io.netty + netty-handler + ${netty-all.version} + provided + com.google.protobuf From 5cccdd0fd759a7d0375e590ff71081730dd405db Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Sat, 22 Oct 2022 19:17:27 +0200 Subject: [PATCH 12/52] Refactor File interface --- .../com/dimajix/flowman/hadoop/File.scala | 190 ++----------- .../dimajix/flowman/hadoop/FileSystem.scala | 4 +- .../dimajix/flowman/hadoop/HadoopFile.scala | 263 ++++++++++++++++++ .../com/dimajix/flowman/hadoop/FileTest.scala | 2 +- 4 files changed, 291 insertions(+), 168 deletions(-) create mode 100644 flowman-core/src/main/scala/com/dimajix/flowman/hadoop/HadoopFile.scala diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/File.scala b/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/File.scala index 388d03c9d..0f064042b 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/File.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/File.scala @@ -16,25 +16,15 @@ package com.dimajix.flowman.hadoop -import java.io.FileNotFoundException -import java.io.IOException +import java.io.InputStream +import java.io.OutputStream import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.FSDataInputStream -import org.apache.hadoop.fs.FSDataOutputStream -import org.apache.hadoop.fs.LocalFileSystem import org.apache.hadoop.fs.Path -import org.apache.hadoop.io.IOUtils object File { - def empty = new File(null, null) - def apply(conf:Configuration, path:Path) : File = { - File(path.getFileSystem(conf), path) - } - def apply(conf:Configuration, path:String) : File = { - apply(conf, new Path(path)) - } + def empty = HadoopFile(null, null) } /** @@ -43,17 +33,17 @@ object File { * @param fs * @param path */ -case class File(fs:org.apache.hadoop.fs.FileSystem, path:Path) { +abstract class File { override def toString: String = if (path != null) path.toString else "" + def path : Path + /** * Creates a new File object by attaching a child entry * @param sub * @return */ - def /(sub:String) : File = { - File(fs, new Path(path, sub)) - } + def /(sub:String) : File /** * Returns the file name of the File @@ -67,173 +57,87 @@ case class File(fs:org.apache.hadoop.fs.FileSystem, path:Path) { * Returns the parent directory of the File * @return */ - def parent : File = { - File(fs, path.getParent) - } + def parent : File /** * Returns the absolute path * @return */ - def absolute : File = { - File(fs, path.makeQualified(fs.getUri, fs.getWorkingDirectory)) - } + def absolute : File /** * Returns the size of the file. Will throw an exception if the file does not exist * @return */ - def length : Long = { - fs.getFileStatus(path).getLen - } + def length : Long - def resolve(name:String) : File = { - File(fs, new Path(path.toUri.resolve(name))) - } + def resolve(name:String) : File /** * Lists all directory entries. 
Will throw an exception if the File is not a directory * @return */ - def list() : Seq[File] = { - if (!isDirectory()) - throw new IOException(s"File '$path' is not a directory - cannot list files") - fs.listStatus(path) - .map(item => (item.getPath.toString, File(fs, item.getPath))) - .sortBy(_._1) - .map(_._2) - } + def list() : Seq[File] - def glob(pattern:Path) : Seq[File] = { - if (!isDirectory()) - throw new IOException(s"File '$path' is not a directory - cannot list files") - fs.globStatus(new Path(path, pattern)) - .map(item => (item.getPath.toString, File(fs, item.getPath))) - .sortBy(_._1) - .map(_._2) - } - - /** - * Renames the file to a different name. The destination has to be on the same FileSystem, otherwise an - * exception will be thrown - * @param dst - */ - def rename(dst:File) : Unit = { - if (!dst.fs.eq(fs)) { - throw new IOException(s"Target of rename needs to be on the same filesystem") - } - rename(dst.path) - } + def glob(pattern:Path) : Seq[File] /** * Renames the file to a different name * @param dst */ - def rename(dst:Path) : Unit = { - if (fs.exists(dst) && !fs.delete(dst, false)) { - throw new IOException(s"Cannot rename '$path' to '$dst', because '$dst' already exists") - } - - if (!fs.rename(path, dst)) { - throw new IOException(s"Cannot rename '$path' to '$dst'") - } - } + def rename(dst:Path) : Unit /** * Copies the file to a different file. The destination file may reside on a different file system * @param dst * @param overwrite */ - def copy(dst:File, overwrite:Boolean) : Unit = { - if (!overwrite && dst.isFile()) - throw new IOException("Target $dst already exists") - - // Append file name if relation is a directory - val dstFile = if (dst.isDirectory()) - dst / path.getName - else - dst - - // Perform copy - if (dstFile.fs.isInstanceOf[LocalFileSystem]) - copyToLocal(dstFile, overwrite) - else - copyToRemote(dstFile, overwrite) - } + def copy(dst:File, overwrite:Boolean) : Unit /** * Creates a file and returns the corresponding output stream * @param overwrite * @return */ - def create(overwrite:Boolean = false) : FSDataOutputStream = { - fs.create(path, overwrite) - } + def create(overwrite:Boolean = false) : OutputStream /** * Opens an existing file and returns the corresponding input stream * @return */ - def open() : FSDataInputStream = { - fs.open(path) - } + def open() : InputStream /** * Deletes the file and/or directory * @param recursive */ - def delete(recursive:Boolean = false) : Unit = { - if (fs.exists(path) && !fs.delete(path, recursive)) { - throw new IOException(s"Cannot delete '$path'") - } - } + def delete(recursive:Boolean = false) : Unit /** * Returns true if the file exists.
It can either be a file or a directory * @return */ - def exists() : Boolean = { - fs.exists(path) - } + def exists() : Boolean - def mkdirs() : Unit = { - if (!fs.mkdirs(path)) - throw new IOException(s"Cannot create directory '$path'") - } + def mkdirs() : Unit /** * Returns true if the file exists as a directory * @return */ - def isDirectory() : Boolean = { - try { - fs.getFileStatus(path).isDirectory - } - catch { - case _: FileNotFoundException => false - } - } + def isDirectory() : Boolean /** * Returns true if the file exists as a normal file * @return */ - def isFile() : Boolean = { - try { - fs.getFileStatus(path).isFile - } - catch { - case _: FileNotFoundException => false - } - } + def isFile() : Boolean /** * Returns true if the File is an absolute path * @return */ - def isAbsolute() : Boolean = { - path.isAbsolute - } + def isAbsolute() : Boolean /** * Creates a new File instance with an additional suffix attached. This will not physically create the file @@ -241,51 +145,7 @@ case class File(fs:org.apache.hadoop.fs.FileSystem, path:Path) { * @param suffix * @return */ - def withSuffix(suffix:String) : File = { - File(fs, path.suffix(suffix)) - } - - def withName(name:String) : File = { - File(fs, new Path(path.getParent, name)) - } - - private def copyToLocal(dst:File, overwrite:Boolean) : Unit = { - val output = dst.create(overwrite) - try { - val input = open() - try { - IOUtils.copyBytes(input, output, 16384, true) - } - finally { - input.close() - } - } - catch { - case ex:Throwable => - output.close() - dst.delete() - throw ex - } - } + def withSuffix(suffix:String) : File - private def copyToRemote(dst:File, overwrite:Boolean) : Unit = { - val tmp = dst.withSuffix("._COPYING_") - val output = tmp.create(overwrite) - try { - val input = open() - try { - IOUtils.copyBytes(input, output, 16384, true) - tmp.rename(dst) - } - finally { - input.close() - } - } - catch { - case ex:Throwable => - output.close() - tmp.delete() - throw ex - } - } + def withName(name:String) : File } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileSystem.scala index 5eb818fe7..a652c1570 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileSystem.scala @@ -32,12 +32,12 @@ case class FileSystem(conf:Configuration) { def file(path:Path) : File = { val fs = path.getFileSystem(conf) - File(fs, path) + HadoopFile(fs, path) } def file(path:String) : File = file(new Path(path)) def file(path:URI) : File = file(new Path(path)) - def local(path:Path) : File = File(localFs, path) + def local(path:Path) : File = HadoopFile(localFs, path) def local(path:String) : File = local(new Path(path)) def local(path:java.io.File) : File = local(new Path(path.toString)) def local(path:URI) : File = local(new Path(path)) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/HadoopFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/HadoopFile.scala new file mode 100644 index 000000000..4331b9252 --- /dev/null +++ b/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/HadoopFile.scala @@ -0,0 +1,263 @@ +/* + * Copyright 2018-2022 Kaya Kupferschmidt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dimajix.flowman.hadoop + +import java.io.FileNotFoundException +import java.io.IOException + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs.FSDataInputStream +import org.apache.hadoop.fs.FSDataOutputStream +import org.apache.hadoop.fs.LocalFileSystem +import org.apache.hadoop.fs.Path +import org.apache.hadoop.io.IOUtils + + +/** + * The File class represents a file on a Hadoop filesystem. It contains a path and a filesystem and provides + * convenience methods for working with files. + * @param fs + * @param path + */ +case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends File { + override def toString: String = if (path != null) path.toString else "" + + /** + * Creates a new File object by attaching a child entry + * @param sub + * @return + */ + def /(sub:String) : File = { + HadoopFile(fs, new Path(path, sub)) + } + + /** + * Returns the parent directory of the File + * @return + */ + def parent : File = { + HadoopFile(fs, path.getParent) + } + + /** + * Returns the absolute path + * @return + */ + def absolute : File = { + HadoopFile(fs, path.makeQualified(fs.getUri, fs.getWorkingDirectory)) + } + + /** + * Returns the size of the file. Will throw an exception if the file does not exist + * @return + */ + def length : Long = { + fs.getFileStatus(path).getLen + } + + def resolve(name:String) : File = { + HadoopFile(fs, new Path(path.toUri.resolve(name))) + } + + /** + * Lists all directory entries. Will throw an exception if the File is not a directory + * @return + */ + def list() : Seq[File] = { + if (!isDirectory()) + throw new IOException(s"File '$path' is not a directory - cannot list files") + fs.listStatus(path) + .map(item => (item.getPath.toString, HadoopFile(fs, item.getPath))) + .sortBy(_._1) + .map(_._2) + } + + def glob(pattern:Path) : Seq[File] = { + if (!isDirectory()) + throw new IOException(s"File '$path' is not a directory - cannot list files") + fs.globStatus(new Path(path, pattern)) + .map(item => (item.getPath.toString, HadoopFile(fs, item.getPath))) + .sortBy(_._1) + .map(_._2) + } + + /** + * Renames the file to a different name + * @param dst + */ + def rename(dst:Path) : Unit = { + if (fs.exists(dst) && !fs.delete(dst, false)) { + throw new IOException(s"Cannot rename '$path' to '$dst', because '$dst' already exists") + } + + if (!fs.rename(path, dst)) { + throw new IOException(s"Cannot rename '$path' to '$dst'") + } + } + + /** + * Copies the file to a different file.
The destination file may reside on a different file system + * @param dst + * @param overwrite + */ + def copy(dst:File, overwrite:Boolean) : Unit = { + if (!overwrite && dst.isFile()) + throw new IOException(s"Target $dst already exists") + + // Append file name if destination is a directory + val dstFile = if (dst.isDirectory()) + dst / path.getName + else + dst + + // Perform copy + dstFile match { + case HadoopFile(fs, _) if fs.isInstanceOf[LocalFileSystem] => + copyToLocal(dstFile, overwrite) + case _ => + copyToRemote(dstFile, overwrite) + } + } + + /** + * Creates a file and returns the corresponding output stream + * @param overwrite + * @return + */ + def create(overwrite:Boolean = false) : FSDataOutputStream = { + fs.create(path, overwrite) + } + + /** + * Opens an existing file and returns the corresponding input stream + * @return + */ + def open() : FSDataInputStream = { + fs.open(path) + } + + /** + * Deletes the file and/or directory + * @param recursive + */ + def delete(recursive:Boolean = false) : Unit = { + if (fs.exists(path) && !fs.delete(path, recursive)) { + throw new IOException(s"Cannot delete '$path'") + } + } + + /** + * Returns true if the file exists. It can either be a file or a directory + * @return + */ + def exists() : Boolean = { + fs.exists(path) + } + + def mkdirs() : Unit = { + if (!fs.mkdirs(path)) + throw new IOException(s"Cannot create directory '$path'") + } + + /** + * Returns true if the file exists as a directory + * @return + */ + def isDirectory() : Boolean = { + try { + fs.getFileStatus(path).isDirectory + } + catch { + case _: FileNotFoundException => false + } + } + + /** + * Returns true if the file exists as a normal file + * @return + */ + def isFile() : Boolean = { + try { + fs.getFileStatus(path).isFile + } + catch { + case _: FileNotFoundException => false + } + } + + /** + * Returns true if the File is an absolute path + * @return + */ + def isAbsolute() : Boolean = { + path.isAbsolute + } + + /** + * Creates a new File instance with an additional suffix attached.
This will not physically create the file + * on the FileSystem, but will return a File which then can be used for creation + * @param suffix + * @return + */ + def withSuffix(suffix:String) : File = { + HadoopFile(fs, path.suffix(suffix)) + } + + def withName(name:String) : File = { + HadoopFile(fs, new Path(path.getParent, name)) + } + + private def copyToLocal(dst:File, overwrite:Boolean) : Unit = { + val output = dst.create(overwrite) + try { + val input = open() + try { + IOUtils.copyBytes(input, output, 16384, true) + } + finally { + input.close() + } + } + catch { + case ex:Throwable => + output.close() + dst.delete() + throw ex + } + } + + private def copyToRemote(dst:File, overwrite:Boolean) : Unit = { + val tmp = dst.withSuffix("._COPYING_") + val output = tmp.create(overwrite) + try { + val input = open() + try { + IOUtils.copyBytes(input, output, 16384, true) + tmp.rename(dst.path) + } + finally { + input.close() + } + } + catch { + case ex:Throwable => + output.close() + tmp.delete() + throw ex + } + } +} diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileTest.scala index 4dbb79fce..b89c58564 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileTest.scala @@ -85,7 +85,7 @@ class FileTest extends AnyFlatSpec with Matchers with LocalSparkSession { file.isDirectory() should be (false) val newName = file.withName("lolo-" + System.currentTimeMillis().toString + ".tmp") - file.rename(newName) + file.rename(newName.path) file.exists() should be (false) file.isFile() should be (false) file.isDirectory() should be (false) From e4fb5aa65bd7b3421e63aba7f7b605a168d3f87f Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Sat, 22 Oct 2022 19:19:15 +0200 Subject: [PATCH 13/52] Improved Maven POMs --- flowman-parent/pom.xml | 53 +++++++++++++++--------------- flowman-plugins/aws/pom.xml | 22 +++++++++++++ flowman-spark-dependencies/pom.xml | 5 +++ pom.xml | 12 +++++++ 4 files changed, 66 insertions(+), 26 deletions(-) diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml index b9c49dfe4..304e457a4 100644 --- a/flowman-parent/pom.xml +++ b/flowman-parent/pom.xml @@ -273,25 +273,25 @@ flowman-spark-dependencies ${flowman.version} pom - provided + compile com.dimajix.flowman flowman-spark-extensions ${flowman.version} - provided + compile com.dimajix.flowman flowman-common ${flowman.version} - provided + compile com.dimajix.flowman flowman-core ${flowman.version} - provided + compile org.apache.velocity @@ -303,132 +303,133 @@ com.dimajix.flowman flowman-spec ${flowman.version} - provided + compile com.dimajix.flowman flowman-dsl ${flowman.version} - provided + compile com.dimajix.flowman flowman-tools ${flowman.version} - provided + compile com.dimajix.flowman flowman-server ${flowman.version} - provided + compile com.dimajix.flowman flowman-server-ui ${flowman.version} - provided + compile com.dimajix.flowman flowman-hub ${flowman.version} - provided + compile com.dimajix.flowman flowman-studio ${flowman.version} - provided + compile com.dimajix.flowman flowman-studio-ui ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-aws ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-azure ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-delta ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-kafka 
${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-impala ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-postgresql ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-mariadb ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-mysql ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-mssqlserver ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-sftp ${project.version} + compile com.dimajix.flowman flowman-plugin-swagger ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-openapi ${flowman.version} - provided + compile com.dimajix.flowman flowman-plugin-json ${flowman.version} - provided + compile com.dimajix.flowman flowman-client ${flowman.version} - provided + compile com.dimajix.flowman @@ -436,7 +437,7 @@ ${flowman.version} tar.gz bin - provided + compile com.dimajix.flowman diff --git a/flowman-plugins/aws/pom.xml b/flowman-plugins/aws/pom.xml index 0e388243b..e98638ef6 100644 --- a/flowman-plugins/aws/pom.xml +++ b/flowman-plugins/aws/pom.xml @@ -280,6 +280,12 @@ org.apache.hadoop hadoop-aws ${hadoop.version} + + + com.amazonaws + aws-java-sdk-bundle + + @@ -287,4 +293,20 @@ scalatest_${scala.api_version} + + + + + com.amazonaws + aws-java-sdk-s3 + ${aws.version} + + + com.amazonaws + aws-java-sdk-bundle + + + + + diff --git a/flowman-spark-dependencies/pom.xml b/flowman-spark-dependencies/pom.xml index ba1dc7414..d4456a267 100644 --- a/flowman-spark-dependencies/pom.xml +++ b/flowman-spark-dependencies/pom.xml @@ -35,6 +35,11 @@ spark-core_${scala.api_version} compile + + org.apache.spark + spark-tags_${scala.api_version} + compile + org.apache.spark spark-sql_${scala.api_version} diff --git a/pom.xml b/pom.xml index 2328f1702..b9549e88c 100644 --- a/pom.xml +++ b/pom.xml @@ -1033,6 +1033,12 @@ com.dimajix.flowman flowman-core ${project.version} + + + org.apache.velocity + velocity-engine-core + + com.dimajix.flowman @@ -1404,6 +1410,12 @@ hadoop-client ${hadoop.version} provided + + + log4j + log4j + + org.apache.hadoop From ff5c6069f68d7cad1754fd76d8065fd78392563f Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Sun, 23 Oct 2022 10:53:32 +0200 Subject: [PATCH 14/52] Fix dependency to Velocity --- flowman-core/pom.xml | 3 ++- pom.xml | 6 ------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/flowman-core/pom.xml b/flowman-core/pom.xml index 220266e86..cfe801042 100644 --- a/flowman-core/pom.xml +++ b/flowman-core/pom.xml @@ -111,10 +111,11 @@ + org.apache.velocity velocity-engine-core 2.3 - compile + provided diff --git a/pom.xml b/pom.xml index b9549e88c..bbf7d0aaa 100644 --- a/pom.xml +++ b/pom.xml @@ -1033,12 +1033,6 @@ com.dimajix.flowman flowman-core ${project.version} - - - org.apache.velocity - velocity-engine-core - - com.dimajix.flowman From 1c21a0bd69f006c14f22af7a2f12648a22687fb5 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Sun, 23 Oct 2022 12:41:20 +0200 Subject: [PATCH 15/52] Fix dependency to Velocity --- flowman-core/pom.xml | 1 - flowman-spark-dependencies/.gitignore | 1 + pom.xml | 6 +++--- 3 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 flowman-spark-dependencies/.gitignore diff --git a/flowman-core/pom.xml b/flowman-core/pom.xml index cfe801042..7b75b2b38 100644 --- a/flowman-core/pom.xml +++ b/flowman-core/pom.xml @@ -115,7 +115,6 @@ org.apache.velocity velocity-engine-core 2.3 - provided diff --git a/flowman-spark-dependencies/.gitignore 
b/flowman-spark-dependencies/.gitignore new file mode 100644 index 000000000..b83d22266 --- /dev/null +++ b/flowman-spark-dependencies/.gitignore @@ -0,0 +1 @@ +/target/ diff --git a/pom.xml b/pom.xml index bbf7d0aaa..ac116c054 100644 --- a/pom.xml +++ b/pom.xml @@ -859,11 +859,11 @@ true org.apache.maven.plugins maven-shade-plugin - 3.3.0 + 3.4.0 false - true - false + false + true false true true From cb5564466449a2963fb59cb80338a6f0e5c49cd2 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Sun, 23 Oct 2022 18:50:48 +0200 Subject: [PATCH 16/52] Fix Cloudera dependency hell --- flowman-spark-dependencies/pom.xml | 106 ++++++++++++++++------------- pom.xml | 8 ++- 2 files changed, 65 insertions(+), 49 deletions(-) diff --git a/flowman-spark-dependencies/pom.xml b/flowman-spark-dependencies/pom.xml index d4456a267..40bd03fed 100644 --- a/flowman-spark-dependencies/pom.xml +++ b/flowman-spark-dependencies/pom.xml @@ -20,121 +20,131 @@ + - org.apache.hadoop - hadoop-client + org.apache.commons + commons-lang3 compile - org.apache.hadoop - hadoop-common + org.apache.commons + commons-compress compile - org.apache.spark - spark-core_${scala.api_version} + commons-codec + commons-codec compile - org.apache.spark - spark-tags_${scala.api_version} + commons-httpclient + commons-httpclient compile - org.apache.spark - spark-sql_${scala.api_version} + commons-cli + commons-cli compile - org.apache.spark - spark-hive_${scala.api_version} + commons-io + commons-io compile - - com.fasterxml.jackson.core - jackson-core + commons-beanutils + commons-beanutils compile - com.fasterxml.jackson.core - jackson-annotations + commons-collections + commons-collections compile - com.fasterxml.jackson.core - jackson-databind + commons-lang + commons-lang compile - com.fasterxml.jackson.dataformat - jackson-dataformat-yaml + commons-logging + commons-logging compile + - com.fasterxml.jackson.module - jackson-module-paranamer + org.apache.hadoop + hadoop-client compile - com.fasterxml.jackson.module - jackson-module-scala_${scala.api_version} + org.apache.hadoop + hadoop-common compile - org.yaml - snakeyaml + org.apache.spark + spark-core_${scala.api_version} compile - - org.apache.commons - commons-lang3 + org.apache.spark + spark-tags_${scala.api_version} compile - org.apache.commons - commons-compress + org.apache.spark + spark-sql_${scala.api_version} compile - commons-codec - commons-codec + org.apache.spark + spark-hive_${scala.api_version} compile + - commons-httpclient - commons-httpclient + org.apache.hive + hive-common compile + - commons-cli - commons-cli + com.fasterxml.jackson.core + jackson-core compile - commons-io - commons-io + com.fasterxml.jackson.core + jackson-annotations compile - commons-beanutils - commons-beanutils + com.fasterxml.jackson.core + jackson-databind compile - commons-collections - commons-collections + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml compile - commons-lang - commons-lang + com.fasterxml.jackson.module + jackson-module-paranamer compile - commons-logging - commons-logging + com.fasterxml.jackson.module + jackson-module-scala_${scala.api_version} + compile + + + org.yaml + snakeyaml compile diff --git a/pom.xml b/pom.xml index ac116c054..6f90ad25c 100644 --- a/pom.xml +++ b/pom.xml @@ -809,7 +809,7 @@ true org.scalatest scalatest-maven-plugin - 2.1.0 + 2.2.0 ${project.build.directory}/surefire-reports . @@ -1331,6 +1331,12 @@ test
+ + org.apache.hive + hive-common + ${hive.version} + provided + org.apache.hive hive-storage-api From ac1fc1fa14732ec8e6c8622e0e140f3b387f4ca0 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Sun, 23 Oct 2022 18:53:00 +0200 Subject: [PATCH 17/52] Remove hive-common from flowman-spark-dependencies --- flowman-spark-dependencies/pom.xml | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/flowman-spark-dependencies/pom.xml b/flowman-spark-dependencies/pom.xml index 40bd03fed..1ca5c0c56 100644 --- a/flowman-spark-dependencies/pom.xml +++ b/flowman-spark-dependencies/pom.xml @@ -105,13 +105,7 @@ spark-hive_${scala.api_version} compile - - - org.apache.hive - hive-common - compile - - + com.fasterxml.jackson.core jackson-core From 9e3b5e2f4eb7787ab6f5adcbf015eff82cac6ca9 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Sun, 23 Oct 2022 22:07:49 +0200 Subject: [PATCH 18/52] Try to fix build for Spark 3.3 with Hadoop 2.7 --- pom.xml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 6f90ad25c..67bcd13f3 100644 --- a/pom.xml +++ b/pom.xml @@ -1411,9 +1411,13 @@ ${hadoop.version} provided + + org.slf4j + * + log4j - log4j + * From 97bc06ae18bb2aa674c3dda96ef83bf683e04e3f Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 24 Oct 2022 19:44:27 +0200 Subject: [PATCH 19/52] Fallback to load default-namespace.yml and system.yml from classpath --- .../com/dimajix/flowman/model/Namespace.scala | 5 ++++- flowman-parent/pom.xml | 5 +++++ .../com/dimajix/flowman/server/Application.scala | 1 + .../flowman/conf/default-namespace.yml.template | 2 ++ .../META-INF/flowman/conf/system.yml.template | 1 + .../scala/com/dimajix/flowman/tools/Tool.scala | 16 ++++++++++++++++ pom.xml | 5 +++++ 7 files changed, 34 insertions(+), 1 deletion(-) create mode 100644 flowman-tools/src/main/resources/META-INF/flowman/conf/default-namespace.yml.template create mode 100644 flowman-tools/src/main/resources/META-INF/flowman/conf/system.yml.template diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/model/Namespace.scala b/flowman-core/src/main/scala/com/dimajix/flowman/model/Namespace.scala index e824b1cb7..c6f3d0351 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/model/Namespace.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/model/Namespace.scala @@ -72,7 +72,10 @@ object Namespace { def string(text:String) : Namespace = { reader.string(text) } - def default() : Namespace = defaultNamespace + def default() : Namespace = { + logger.info(s"Using Flowman default namespace settings") + defaultNamespace + } private def reader : NamespaceReader = { loader.find(_.supports(format)) diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml index 304e457a4..694bff8e0 100644 --- a/flowman-parent/pom.xml +++ b/flowman-parent/pom.xml @@ -77,6 +77,11 @@ org.apache.maven.plugins maven-jar-plugin 3.2.2 + + + plugin.yml + + true diff --git a/flowman-server/src/main/scala/com/dimajix/flowman/server/Application.scala b/flowman-server/src/main/scala/com/dimajix/flowman/server/Application.scala index 9cc1cfe68..68ab530ee 100644 --- a/flowman-server/src/main/scala/com/dimajix/flowman/server/Application.scala +++ b/flowman-server/src/main/scala/com/dimajix/flowman/server/Application.scala @@ -42,6 +42,7 @@ object Application { class Application extends Tool { override protected def loadNamespace() : Namespace = { + // TODO: Fall back to resources if nothing is found val ns = ToolConfig.confDirectory .map(confDir => new 
File(confDir, "history-server.yml")) .filter(_.isFile) diff --git a/flowman-tools/src/main/resources/META-INF/flowman/conf/default-namespace.yml.template b/flowman-tools/src/main/resources/META-INF/flowman/conf/default-namespace.yml.template new file mode 100644 index 000000000..cf8f2a52d --- /dev/null +++ b/flowman-tools/src/main/resources/META-INF/flowman/conf/default-namespace.yml.template @@ -0,0 +1,2 @@ +metrics: + - kind: console diff --git a/flowman-tools/src/main/resources/META-INF/flowman/conf/system.yml.template b/flowman-tools/src/main/resources/META-INF/flowman/conf/system.yml.template new file mode 100644 index 000000000..b7db25411 --- /dev/null +++ b/flowman-tools/src/main/resources/META-INF/flowman/conf/system.yml.template @@ -0,0 +1 @@ +# Empty diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala index 2f4beb84d..bbbdeea71 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala @@ -17,10 +17,12 @@ package com.dimajix.flowman.tools import java.io.File +import java.net.URL import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path +import com.dimajix.common.Resources import com.dimajix.flowman.common.ToolConfig import com.dimajix.flowman.config.FlowmanConf import com.dimajix.flowman.execution.Session @@ -50,6 +52,10 @@ class Tool { .map(confDir => new File(confDir, "system.yml")) .filter(_.isFile) .map(file => SystemSettings.read.file(file)) + .orElse( + Option(getResource("META-INF/flowman/conf/system.yml")) + .map(SystemSettings.read.url) + ) .getOrElse(SystemSettings.read.default()) // Load all global plugins from System settings @@ -62,6 +68,10 @@ class Tool { .map(confDir => new File(confDir, "default-namespace.yml")) .filter(_.isFile) .map(file => Namespace.read.file(file)) + .orElse( + Option(getResource("META-INF/flowman/conf/default-namespace.yml")) + .map(Namespace.read.url) + ) .getOrElse(Namespace.read.default()) // Load all plugins from Namespace @@ -69,12 +79,18 @@ class Tool { ns } + private def getResource(name:String) : URL = { + val loader = Thread.currentThread.getContextClassLoader + loader.getResource(name) + } + def loadProject(projectPath:Path) : Project = { // Create Hadoop FileSystem instance val hadoopConfig = new Configuration() val fs = FileSystem(hadoopConfig) // Load Project. 
If no schema is specified, load from local file system + // TODO: Support resources in jar files val projectUri = projectPath.toUri if (projectUri.getAuthority == null && projectUri.getScheme == null) Project.read.file(fs.local(projectPath)) diff --git a/pom.xml b/pom.xml index 67bcd13f3..449334f3a 100644 --- a/pom.xml +++ b/pom.xml @@ -734,6 +734,11 @@ org.apache.maven.plugins maven-jar-plugin 3.2.2 + + + plugin.yml + + true From 954ec5e55761fbd9e16a6205278bf32d689ba1a2 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 25 Oct 2022 19:15:00 +0200 Subject: [PATCH 20/52] github-269 Implement new 'iterativeSql' mapping --- CHANGELOG.md | 1 + docs/index.md | 2 +- docs/releases.md | 1 + docs/spec/mapping/iterative-sql.md | 73 ++++++ docs/spec/mapping/recursive-sql.md | 13 +- docs/{workflow.md => workflow/index.md} | 8 +- .../spec/mapping/IterativeSqlMapping.scala | 214 ++++++++++++++++++ .../flowman/spec/mapping/MappingSpec.scala | 1 + .../spec/mapping/RecursiveSqlMapping.scala | 21 +- .../mapping/IterativeSqlMappingTest.scala | 123 ++++++++++ .../mapping/RecursiveSqlMappingTest.scala | 64 ++++++ 11 files changed, 509 insertions(+), 12 deletions(-) create mode 100644 docs/spec/mapping/iterative-sql.md rename docs/{workflow.md => workflow/index.md} (96%) create mode 100644 flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMapping.scala create mode 100644 flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala diff --git a/CHANGELOG.md b/CHANGELOG.md index 13f4cdc75..38e129943 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ * github-265: Make JDBC dialects pluggable * github-264: Provide "jars" for all plugins * github-267: Add new flowman-spark-dependencies module to simplify dependency management +* github-269: Implement new 'iterativeSql' mapping # Version 0.28.0 - 2022-10-07 diff --git a/docs/index.md b/docs/index.md index 72730cc28..082a8c412 100644 --- a/docs/index.md +++ b/docs/index.md @@ -109,7 +109,7 @@ Flowman also provides optional plugins which extend functionality. You can find spec/index testing/index documenting/index - workflow + workflow/index setup/index connectors/index plugins/index diff --git a/docs/releases.md b/docs/releases.md index c29f3d613..5b631ca30 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -23,6 +23,7 @@ changes over time. * github-265: Make JDBC dialects pluggable * github-264: Provide "jars" for all plugins * github-267: Add new flowman-spark-dependencies module to simplify dependency management +* github-269: Create 'iterativeSql' mapping ### Version 0.28.0 - 2022-10-07 diff --git a/docs/spec/mapping/iterative-sql.md b/docs/spec/mapping/iterative-sql.md new file mode 100644 index 000000000..8023b9e23 --- /dev/null +++ b/docs/spec/mapping/iterative-sql.md @@ -0,0 +1,73 @@ +# Iterative SQL Mapping +The `iterativeSql` mapping iteratively executes an SQL transformation containing Spark SQL code. The +iteration will stop when the data does not change anymore. + +## Example +The following example will detect trees within a company hierarchy table, which provides simple parent-child +relations. The objective of the query is to assign a separate ID to each company tree. The query will essentially +propagate the `tree_id` from each parent down to its direct children. This step is performed over and over again +until the `tree_id` of the root companies without a parent is propagated down to the leaf companies without any +children.
+``` +mappings: + organization_hierarchy: + kind: iterativeSql + input: companies + sql: | + SELECT + COALESCE(parent.tree_id, c.tree_id) AS tree_id, + c.parent_company_number, + c.company_number + FROM companies c + LEFT JOIN __this__ parent + ON c.parent_company_number = parent.company_number +``` +Within the first step, the output of the input mapping `companies` is assigned to the identifier `__this__`. Then the +SQL query is executed for the first time, which will provide the start value of the forthcoming iteration. In each +iteration, the result of the previous iteration is assigned to `__this__` and the query is executed. +Then the result is compared to the result of the previous iteration. If the results are the same, a fixed point is +reached and the execution stops. Otherwise, the iteration will continue. + +## Fields +* `kind` **(mandatory)** *(type: string)*: `iterativeSql` + +* `broadcast` **(optional)** *(type: boolean)* *(default: false)*: +Hint for broadcasting the result of this mapping for map-side joins. + +* `cache` **(optional)** *(type: string)* *(default: NONE)*: +Cache mode for the results of this mapping. Supported values are + * `NONE` - Disables caching of the results of this mapping + * `DISK_ONLY` - Caches the results on disk + * `MEMORY_ONLY` - Caches the results in memory. If not enough memory is available, records will be uncached. + * `MEMORY_ONLY_SER` - Caches the results in memory in a serialized format. If not enough memory is available, records will be uncached. + * `MEMORY_AND_DISK` - Caches the results first in memory and then spills to disk. + * `MEMORY_AND_DISK_SER` - Caches the results first in memory in a serialized format and then spills to disk. + +* `input` **(required)** *(type: string)*: +The input mapping which serves as the starting point of the iteration. This means that for the first execution, +the identifier `__this__` will simply refer to the output of this mapping. Within the next iterations, `__this__` will +refer to the result of the previous iteration. + +* `sql` **(optional)** *(type: string)* *(default: empty)*: +The SQL statement to execute. + +* `file` **(optional)** *(type: string)* *(default: empty)*: +The name of a file containing the SQL to execute. + +* `uri` **(optional)** *(type: string)* *(default: empty)*: +A URL pointing to a resource containing the SQL to execute. + +* `maxIterations` **(optional)** *(type: int)* *(default: 99)*: +The maximum number of iterations. The mapping will fail if the number of actual iterations required to find the +fixed point exceeds this number. + + +## Outputs +* `main` - the only output of the mapping + + +## Description +The `iterativeSql` mapping allows executing SQL statements iteratively, feeding each intermediate result back into +the query as `__this__` until a fixed point is reached. + +Flowman also supports [`recursiveSql` mappings](recursive-sql.md), which provide similar functionality more along +the lines of classical recursive SQL statements. diff --git a/docs/spec/mapping/recursive-sql.md b/docs/spec/mapping/recursive-sql.md index 06afd2ea3..d662c73f1 100644 --- a/docs/spec/mapping/recursive-sql.md +++ b/docs/spec/mapping/recursive-sql.md @@ -20,6 +20,11 @@ mappings: WHERE n < 6 " ```
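For intuition, the fixed-point semantics described next can be traced with a small Scala analogue of the factorial example whose tail is visible in the hunk context above (an illustrative sketch added here, not Flowman code; the full factorial query appears again in the tests later in this patch):

```scala
// Hypothetical Scala analogue of the recursive factorial query: iterate the
// step function until the row set no longer changes, i.e. a fixed point.
@annotation.tailrec
def fixpoint(rows: Set[(Int, Int)]): Set[(Int, Int)] = {
  val next = Set((0, 1)) ++
    rows.collect { case (n, fact) if n < 6 => (n + 1, (n + 1) * fact) }
  if (next == rows) rows else fixpoint(next)
}
// fixpoint(Set.empty) == Set((0,1), (1,1), (2,2), (3,6), (4,24), (5,120), (6,720))
```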
+Within the first step, `__this__` is assigned an empty table. Then the SQL query is executed for the first time, +which will provide the next value of the forthcoming iterations. In each iteration, the result of the previous +iteration is assigned to `__this__` and the query is executed again. Then the result is compared to the result of +the previous iteration. If the results are the same, a fixed point is reached and the +execution stops. Otherwise, the iteration will continue. ## Fields * `kind` **(mandatory)** *(type: string)*: `recursiveSql` @@ -45,6 +50,10 @@ The name of a file containing the SQL to execute. * `uri` **(optional)** *(type: string)* *(default: empty)*: A URL pointing to a resource containing the SQL to execute. +* `maxIterations` **(optional)** *(type: int)* *(default: 99)*: +The maximum number of iterations. The mapping will fail if the number of actual iterations required to find the +fixed point exceeds this number. + ## Outputs * `main` - the only output of the mapping @@ -52,6 +61,8 @@ A URL pointing to a resource containing the SQL to execute. ## Description The `recursiveSql` mapping allows executing recursive SQL statements, which refer to themselves. The result of each -step is made available as a temporary table `__this__`. Currently the query has to be a `UNION` where the first part +step is made available as a temporary table `__this__`. Currently, the query has to be a `UNION` where the first part may not contain a reference to `__this__`. The first part of the `UNION` will be used to determine the schema of the result. + +Flowman also supports [`iterativeSql` mappings](iterative-sql.md), which provide similar functionality. diff --git a/docs/workflow.md b/docs/workflow/index.md similarity index 96% rename from docs/workflow.md rename to docs/workflow/index.md index c26961721..309fe6958 100644 --- a/docs/workflow.md +++ b/docs/workflow/index.md @@ -6,7 +6,7 @@ production deployment. The workflow starts from creating a new Flowman project, describes how to run the project locally, build a self-contained redistributable package and then deploy it to a remote repository manager like Nexus. -![Flowman Development Workflow](images/flowman-workflow.png) +![Flowman Development Workflow](../images/flowman-workflow.png) The whole workflow is implemented with [Apache Maven](https://maven.apache.org/), but you could of course also choose a different build tool. Maven was chosen simply because one can assume that this is present in a Big Data environment, so @@ -90,7 +90,7 @@ like Linux, Windows and Mac OS). #### 1. Running with installed Flowman In order to run tests with a local Flowman installation, you first need to set up Flowman on your local machine -[as described in the documentation](setup/installation.md). +[as described in the documentation](../setup/installation.md). #### 2. Running with Docker A much simpler option than setting up a local Flowman development installation is to use the pre-built Docker @@ -106,7 +106,7 @@ Flowman shell via ```shell bin/flowshell -f ``` -Please read more about using the Flowman Shell in the [corresponding documentation](cli/flowshell/index.md). +Please read more about using the Flowman Shell in the [corresponding documentation](../cli/flowshell/index.md). Whenever you change something in your project, you can easily reload the project in the shell via ```shell @@ -126,7 +126,7 @@ need to be provided by your environment. Note for Windows users: Maven will also execute all tests in your Flowman project. The Hadoop dependency will require the so-called *WinUtils* to be installed on your machine, please read more about -[setting up your Windows environment](setup/windows.md). +[setting up your Windows environment](../setup/windows.md).
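The IterativeSqlMapping.scala file added next implements this fixed-point execution for DataFrames. As a rough standalone sketch of its control flow (simplified, with hypothetical helper names standing in for the real methods that follow):

```scala
import scala.annotation.tailrec
import org.apache.spark.sql.DataFrame

// Sketch of the iteration driver: `runStatement` stands for executing the SQL with
// the previous result registered as `__this__`; `sameRecords` for the record-level
// comparison that detects the fixed point.
@tailrec
def iterate(prev: DataFrame, runStatement: DataFrame => DataFrame,
            sameRecords: (DataFrame, DataFrame) => Boolean,
            maxIterations: Int, iteration: Int = 1): DataFrame = {
  if (iteration > maxIterations)
    sys.error(s"No fixed point reached after $maxIterations iterations")
  val next = runStatement(prev)
  if (sameRecords(prev, next)) next
  else iterate(next, runStatement, sameRecords, maxIterations, iteration + 1)
}
```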
diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMapping.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMapping.scala new file mode 100644 index 000000000..147b2688f --- /dev/null +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMapping.scala @@ -0,0 +1,214 @@ +/* + * Copyright 2022 Kaya Kupferschmidt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dimajix.flowman.spec.mapping + +import java.io.StringWriter +import java.lang +import java.net.URL +import java.nio.charset.Charset +import java.util.Locale + +import scala.annotation.tailrec + +import com.fasterxml.jackson.annotation.JsonProperty +import org.apache.commons.io.IOUtils +import org.apache.hadoop.fs.Path +import org.apache.spark.sql.DataFrame +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.Row +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.encoders.RowEncoder +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.catalyst.plans.logical.UnaryNode +import org.apache.spark.sql.catalyst.plans.logical.Union +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.functions.count +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.functions.not + +import com.dimajix.flowman.execution.Context +import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException +import com.dimajix.flowman.model.BaseMapping +import com.dimajix.flowman.model.Mapping +import com.dimajix.flowman.model.MappingOutputIdentifier +import com.dimajix.flowman.types.StructType +import com.dimajix.spark.sql.DataFrameBuilder +import com.dimajix.spark.sql.DataFrameUtils +import com.dimajix.spark.sql.DataFrameUtils.withTempView +import com.dimajix.spark.sql.DataFrameUtils.withTempViews +import com.dimajix.spark.sql.SqlParser + + +case class IterativeSqlMapping( + instanceProperties:Mapping.Properties, + input:MappingOutputIdentifier, + sql:Option[String], + file:Option[Path] = None, + url:Option[URL] = None, + maxIterations:Int = 99 +) +extends BaseMapping { + /** + * Resolves all dependencies required to build the SQL + * + * @return + */ + override def inputs : Set[MappingOutputIdentifier] = dependencies + + /** + * Executes this MappingType and returns a corresponding DataFrame + * + * @param execution + * @param input + * @return + */ + override def execute(execution:Execution, input:Map[MappingOutputIdentifier,DataFrame]) : Map[String,DataFrame] = { + require(execution != null) + require(input != null) + + val statement = this.statement + + @tailrec + def fix(in:DataFrame, iteration:Int=1) : DataFrame = { + if (iteration > maxIterations) + throw new ExecutionException(s"Iterative mapping '$identifier' exceeded maximum iterations $maxIterations") + val result = nextDf(statement, in) + if (!checkDataFramesEquals(in, result)) + fix(result, iteration+1) + else + result + } + + //
Register all input DataFrames as temp views + val result = withTempViews(input.map(kv => kv._1.name -> kv._2)) { + val first = nextDf(statement, input(this.input)) + fix(first) + } + + Map("main" -> result) + } + + + private def nextDf(statement:String, prev:DataFrame) : DataFrame = { + val spark = prev.sparkSession + withTempView("__this__", prev) { + spark.sql(statement).localCheckpoint(false) + } + } + + private def checkDataFramesEquals(expected:DataFrame, result:DataFrame) : Boolean = { + val expectedCol = "assertDataFrameNoOrderEquals_expected" + val actualCol = "assertDataFrameNoOrderEquals_actual" + val expectedColumns = expected.columns.map(s => expected(s)) + val expectedElementsCount = expected + .groupBy(expectedColumns: _*) + .agg(count(lit(1)).as(expectedCol)) + val resultColumns = result.columns.map(s => result(s)) + val resultElementsCount = result + .groupBy(resultColumns: _*) + .agg(count(lit(1)).as(actualCol)) + + val joinExprs = expected.columns + .map(s => expected.col(s) <=> result.col(s)).reduce(_.and(_)) + val diff = expectedElementsCount + .join(resultElementsCount, joinExprs, "full_outer") + .filter(not(col(expectedCol) <=> col(actualCol))) + diff.take(1).length == 0 + } + + /** + * Returns the schema as produced by this mapping, relative to the given input schema. The map might not contain + * schema information for all outputs, if the schema cannot be inferred. + * @param input + * @return + */ + override def describe(execution: Execution, input: Map[MappingOutputIdentifier, StructType]): Map[String, StructType] = { + require(execution != null) + require(input != null) + + val spark = execution.spark + + // Create dummy data frames + val replacements = input.map { case (id,schema) => + id.name -> DataFrameBuilder.singleRow(spark, schema.sparkType) + } + val firstDf = replacements(this.input.name) + + val result = withTempViews(replacements) { + nextDf(statement, firstDf) + } + + // Apply documentation + val schemas = Map("main" -> StructType.of(result.schema)) + applyDocumentation(schemas) + } + + private lazy val dependencies = { + SqlParser.resolveDependencies(statement) + .filter(_.toLowerCase(Locale.ROOT) != "__this__") + .map(MappingOutputIdentifier.parse) + input + } + private lazy val statement : String = { + if (sql.exists(_.nonEmpty)) { + sql.get + } + else if (file.nonEmpty) { + val fs = context.fs + val input = fs.file(file.get).open() + try { + val writer = new StringWriter() + IOUtils.copy(input, writer, Charset.forName("UTF-8")) + writer.toString + } + finally { + input.close() + } + } + else if (url.nonEmpty) { + IOUtils.toString(url.get, "UTF-8") + } + else { + throw new IllegalArgumentException("SQL mapping needs either 'sql', 'file' or 'url'") + } + } +} + + +class IterativeSqlMappingSpec extends MappingSpec { + @JsonProperty(value="input", required = true) private var input: String = _ + @JsonProperty(value="sql", required=false) private var sql:Option[String] = None + @JsonProperty(value="file", required=false) private var file:Option[String] = None + @JsonProperty(value="url", required=false) private var url: Option[String] = None + @JsonProperty(value="maxIterations", required=false) private var maxIterations: String = "99" + + /** + * Creates the instance of the specified Mapping with all variable interpolation being performed + * @param context + * @return + */ + override def instantiate(context: Context, properties:Option[Mapping.Properties] = None): IterativeSqlMapping = { + IterativeSqlMapping( + instanceProperties(context, 
properties), + MappingOutputIdentifier(context.evaluate(input)), + context.evaluate(sql), + file.map(context.evaluate).filter(_.nonEmpty).map(p => new Path(p)), + url.map(context.evaluate).filter(_.nonEmpty).map(u => new URL(u)), + context.evaluate(maxIterations).toInt + ) + } +} diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/MappingSpec.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/MappingSpec.scala index ae50556de..b7d2e1ca8 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/MappingSpec.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/MappingSpec.scala @@ -66,6 +66,7 @@ object MappingSpec extends TypeRegistry[MappingSpec] { new JsonSubTypes.Type(name = "flatten", value = classOf[FlattenMappingSpec]), new JsonSubTypes.Type(name = "groupedAggregate", value = classOf[GroupedAggregateMappingSpec]), new JsonSubTypes.Type(name = "historize", value = classOf[HistorizeMappingSpec]), + new JsonSubTypes.Type(name = "iterativeSql", value = classOf[IterativeSqlMappingSpec]), new JsonSubTypes.Type(name = "join", value = classOf[JoinMappingSpec]), new JsonSubTypes.Type(name = "latest", value = classOf[LatestMappingSpec]), new JsonSubTypes.Type(name = "mock", value = classOf[MockMappingSpec]), diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/RecursiveSqlMapping.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/RecursiveSqlMapping.scala index b1900c34c..1de7e7f5c 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/RecursiveSqlMapping.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/RecursiveSqlMapping.scala @@ -17,6 +17,7 @@ package com.dimajix.flowman.spec.mapping import java.io.StringWriter +import java.lang import java.net.URL import java.nio.charset.Charset import java.util.Locale @@ -37,6 +38,7 @@ import org.apache.spark.sql.catalyst.plans.logical.Union import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.model.BaseMapping import com.dimajix.flowman.model.Mapping import com.dimajix.flowman.model.MappingOutputIdentifier @@ -47,11 +49,13 @@ import com.dimajix.spark.sql.DataFrameUtils.withTempView import com.dimajix.spark.sql.DataFrameUtils.withTempViews import com.dimajix.spark.sql.SqlParser + case class RecursiveSqlMapping( instanceProperties:Mapping.Properties, sql:Option[String], - file:Option[Path], - url:Option[URL] + file:Option[Path] = None, + url:Option[URL] = None, + maxIterations: Int = 99 ) extends BaseMapping { /** @@ -75,11 +79,13 @@ extends BaseMapping { val statement = this.statement @tailrec - def fix(in:DataFrame, inCount:Long) : DataFrame = { + def fix(in:DataFrame, inCount:Long, iteration:Int=1) : DataFrame = { + if (iteration > maxIterations) + throw new ExecutionException(s"Recursive mapping '$identifier' exceeded maximum iterations $maxIterations") val result = nextDf(statement, in) val resultCount = result.count() if (resultCount != inCount) - fix(result, resultCount) + fix(result, resultCount, iteration+1) else result } @@ -98,7 +104,8 @@ extends BaseMapping { def findUnion(plan:LogicalPlan) : LogicalPlan = { plan match { case union:Union => union - case node:UnaryNode =>findUnion(node.child) + case node:UnaryNode => findUnion(node.child) + case _ => throw new IllegalArgumentException(s"SQL provided in recursiveSql mapping '$identifier' is not supported. 
Please use a structure like 'SELECT starting_point UNION ALL recursion', where starting_point does not reference __this__.") } } @@ -176,6 +183,7 @@ class RecursiveSqlMappingSpec extends MappingSpec { @JsonProperty(value="sql", required=false) private var sql:Option[String] = None @JsonProperty(value="file", required=false) private var file:Option[String] = None @JsonProperty(value="url", required=false) private var url: Option[String] = None + @JsonProperty(value="maxIterations", required=false) private var maxIterations: String = "99" /** * Creates the instance of the specified Mapping with all variable interpolation being performed @@ -187,7 +195,8 @@ class RecursiveSqlMappingSpec extends MappingSpec { instanceProperties(context, properties), context.evaluate(sql), file.map(context.evaluate).filter(_.nonEmpty).map(p => new Path(p)), - url.map(context.evaluate).filter(_.nonEmpty).map(u => new URL(u)) + url.map(context.evaluate).filter(_.nonEmpty).map(u => new URL(u)), + context.evaluate(maxIterations).toInt ) } } diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala new file mode 100644 index 000000000..d0778fba1 --- /dev/null +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala @@ -0,0 +1,123 @@ +/* + * Copyright 2020 Kaya Kupferschmidt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.dimajix.flowman.spec.mapping + +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +import com.dimajix.flowman.execution.ExecutionException +import com.dimajix.flowman.execution.Session +import com.dimajix.flowman.model.Mapping +import com.dimajix.flowman.model.MappingIdentifier +import com.dimajix.flowman.model.MappingOutputIdentifier +import com.dimajix.flowman.model.Module +import com.dimajix.flowman.types.Field +import com.dimajix.flowman.types.IntegerType +import com.dimajix.flowman.types.StructType +import com.dimajix.spark.testing.LocalSparkSession + + +class IterativeSqlMappingTest extends AnyFlatSpec with Matchers with LocalSparkSession{ + "The IterativeSqlMapping" should "be parseable" in { + val spec = + """ + |mappings: + | t1: + | kind: iterativeSql + | input: some_input + | maxIterations: 12 + | sql: " + | SELECT x,y + | FROM t0 + | " + """.stripMargin + + val project = Module.read.string(spec).toProject("project") + project.mappings.size should be (1) + project.mappings.contains("t1") should be (true) + project.mappings("t1") shouldBe a[IterativeSqlMappingSpec] + + val session = Session.builder().disableSpark().build() + val context = session.getContext(project) + val mapping = context.getMapping(MappingIdentifier("t1")) + mapping shouldBe a[IterativeSqlMapping] + } + + it should "calculate factorials" in { + val spark = this.spark + import spark.implicits._ + + val session = Session.builder().withSparkSession(spark).build() + val context = session.context + val executor = session.execution + + val mapping = IterativeSqlMapping( + Mapping.Properties(context), + MappingOutputIdentifier("input"), + Some(""" + |SELECT + | IF(n < 6, n+1, n) AS n + |FROM __this__ + |""".stripMargin), + None, + None + ) + + val inputDf = spark.createDataFrame(Seq((1,1))).withColumnRenamed("_1", "n") + val resultDf = mapping.execute(executor, Map(MappingOutputIdentifier("input") -> inputDf))("main") + val resultRecords = resultDf.as[Int].collect() + resultRecords should be (Array(6)) + + val resultSchema = mapping.describe(executor, Map(MappingOutputIdentifier("input") -> StructType.of(inputDf.schema))) + resultSchema should be (Map( + "main" -> StructType(Seq( + Field("n", IntegerType, false) + )) + )) + } + + it should "throw an exception on too many iterations" in { + val spark = this.spark + + val session = Session.builder().withSparkSession(spark).build() + val context = session.context + val executor = session.execution + + val mapping = IterativeSqlMapping( + Mapping.Properties(context), + MappingOutputIdentifier("input"), + Some( + """ + |SELECT + | n+1 AS n + |FROM __this__ + |""".stripMargin), + None, + None, + maxIterations = 2 + ) + + val inputDf = spark.createDataFrame(Seq((1,1))).withColumnRenamed("_1", "n") + an[ExecutionException] should be thrownBy(mapping.execute(executor, Map(MappingOutputIdentifier("input") -> inputDf))("main")) + val resultSchema = mapping.describe(executor, Map(MappingOutputIdentifier("input") -> StructType.of(inputDf.schema))) + resultSchema should be(Map( + "main" -> StructType(Seq( + Field("n", IntegerType, false) + )) + )) + } +} diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/RecursiveSqlMappingTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/RecursiveSqlMappingTest.scala index d66489229..3b505ffed 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/RecursiveSqlMappingTest.scala +++ 
b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/RecursiveSqlMappingTest.scala @@ -19,6 +19,7 @@ package com.dimajix.flowman.spec.mapping import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers +import com.dimajix.flowman.execution.ExecutionException import com.dimajix.flowman.execution.Session import com.dimajix.flowman.model.Mapping import com.dimajix.flowman.model.MappingIdentifier @@ -148,4 +149,67 @@ class RecursiveSqlMappingTest extends AnyFlatSpec with Matchers with LocalSparkS )) )) } + + it should "throw an exception on unsupported query structure" in { + val spark = this.spark + + val session = Session.builder().withSparkSession(spark).build() + val context = session.context + val executor = session.execution + + val mapping = RecursiveSqlMapping( + Mapping.Properties(context), + Some( + """ + |SELECT + | n+1 AS n, + | (n+1)*fact AS fact + |FROM __this__ + |WHERE n < 6 + |""".stripMargin), + None, + None + ) + + an[IllegalArgumentException] should be thrownBy (mapping.execute(executor, Map())("main")) + an[IllegalArgumentException] should be thrownBy (mapping.describe(executor, Map())) + } + + it should "throw an exception on too many iterations" in { + val spark = this.spark + + val session = Session.builder().withSparkSession(spark).build() + val context = session.context + val executor = session.execution + + val mapping = RecursiveSqlMapping( + Mapping.Properties(context), + Some( + """ + |SELECT + | 0 AS n, + | 1 AS fact + | + |UNION DISTINCT + | + |SELECT + | n+1 AS n, + | (n+1)*fact AS fact + |FROM __this__ + |WHERE n < 6 + |""".stripMargin), + None, + None, + maxIterations = 2 + ) + + an[ExecutionException] should be thrownBy(mapping.execute(executor, Map())("main")) + val resultSchema = mapping.describe(executor, Map()) + resultSchema should be(Map( + "main" -> StructType(Seq( + Field("n", IntegerType, false), + Field("fact", IntegerType, false) + )) + )) + } } From 7249dddcb20a03ffaf92a89c6f3c4fb1cb7dce84 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 26 Oct 2022 19:08:26 +0200 Subject: [PATCH 21/52] Reduce Spark logging in unittests --- .../dimajix/flowman/execution/Runner.scala | 10 ++-- .../dimajix/spark/testing/log4j.properties | 53 +++++++++++++++++++ .../spark/testing/LocalSparkSession.scala | 30 +++++++++++ 3 files changed, 86 insertions(+), 7 deletions(-) create mode 100644 flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Runner.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Runner.scala index 7daf5d214..94be13783 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Runner.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Runner.scala @@ -1,5 +1,5 @@ /* - * Copyright 2018-2021 Kaya Kupferschmidt + * Copyright 2018-2022 Kaya Kupferschmidt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -16,7 +16,6 @@ package com.dimajix.flowman.execution -import java.time.Clock import java.time.Duration import java.time.Instant import java.time.ZoneId @@ -37,7 +36,6 @@ import com.dimajix.common.Trilean import com.dimajix.common.Unknown import com.dimajix.common.text.TimeFormatter import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.execution.AbstractContext.Builder import com.dimajix.flowman.history.StateStore import com.dimajix.flowman.history.StateStoreAdaptorListener import com.dimajix.flowman.history.TargetState @@ -58,8 +56,6 @@ import com.dimajix.flowman.model.TargetResult import com.dimajix.flowman.model.Test import com.dimajix.flowman.model.TestWrapper import com.dimajix.flowman.spi.LogFilter -import com.dimajix.flowman.types.FieldType -import com.dimajix.flowman.types.LongType import com.dimajix.flowman.util.ConsoleColors._ import com.dimajix.spark.SparkUtils.withJobGroup @@ -432,7 +428,7 @@ private[execution] final class JobRunnerImpl(runner:Runner) extends RunnerImpl { } catch { case NonFatal(ex) => - logger.error("Cannot retrieve status from history database.", ex) + logger.error(s"Cannot retrieve status from history database. Exception:\n ${ExceptionUtils.reasons(ex)}") false } } @@ -535,7 +531,7 @@ private[execution] final class TestRunnerImpl(runner:Runner) extends RunnerImpl catch { // Catch all exceptions case NonFatal(ex) => - logger.error(s"Caught exception during $title:", ex) + logger.error(s"Caught exception during $title: ${reasons(ex)}") Status.FAILED } } diff --git a/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties b/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties new file mode 100644 index 000000000..db49c46ea --- /dev/null +++ b/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties @@ -0,0 +1,53 @@ +# +# Copyright 2018-2022 Kaya Kupferschmidt +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +# Set everything to be logged to the console +log4j.rootCategory=INFO, console +log4j.appender.console=org.apache.log4j.ConsoleAppender +log4j.appender.console.target=System.err +log4j.appender.console.layout=org.apache.log4j.PatternLayout +log4j.appender.console.layout.ConversionPattern=[%p] %m%n +#log4j.appender.console.layout.ConversionPattern=[%p] %c %x - %m%n + + +# Settings to quiet third party logs that are too verbose +log4j.logger.org.sparkproject.jetty=WARN +log4j.logger.org.sparkproject.jetty.util.component.AbstractLifeCycle=ERROR +log4j.logger.org.spark_project.jetty=WARN +log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR + +# Set Spark general logging to WARN +log4j.logger.org.apache.spark=WARN + +# Set Hadoop related logging to WARN +log4j.logger.org.apache.hadoop=WARN +log4j.logger.org.apache.hadoop.hive=WARN + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL +log4j.logger.org.apache.hadoop.hive.ql.exec.FunctionRegistry=ERROR + +# Parquet related logging +log4j.logger.org.apache.parquet=WARN +log4j.logger.parquet=WARN +log4j.logger.org.apache.parquet.CorruptStatistics=ERROR +log4j.logger.parquet.CorruptStatistics=ERROR + +# Slick logging +log4j.logger.slick.jdbc.JdbcBackend.statement=WARN + +# Flowman logging +log4j.logger.com.dimajix.flowman=INFO diff --git a/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala b/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala index bdcc70caa..bc5652097 100644 --- a/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala +++ b/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala @@ -17,11 +17,15 @@ package com.dimajix.spark.testing import java.io.File +import java.io.IOException +import java.net.URL +import java.util.Properties import scala.util.control.NonFatal import org.apache.hadoop.hive.conf.HiveConf import org.apache.hive.common.util.HiveVersionInfo +import org.apache.log4j.PropertyConfigurator import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.SparkSession @@ -54,6 +58,8 @@ trait LocalSparkSession extends LocalTempDir { this:Suite => override def beforeAll() : Unit = { super.beforeAll() + configureLogging() + val builder = SparkSession.builder() .master("local[4]") .config("spark.ui.enabled", "false") @@ -142,4 +148,28 @@ trait LocalSparkSession extends LocalTempDir { this:Suite => super.afterAll() } + + private def configureLogging(): Unit = { + val loader = Thread.currentThread.getContextClassLoader + val configUrl = loader.getResource("com/dimajix/spark/testing/log4j.properties") + val props = loadProperties(configUrl) + PropertyConfigurator.configure(props) + } + + private def loadProperties(url: URL): Properties = { + try { + val urlConnection = url.openConnection + urlConnection.setUseCaches(false) + val inputStream = urlConnection.getInputStream + try { + val loaded = new Properties + loaded.load(inputStream) + loaded + } finally { + inputStream.close() + } + } catch { + case e: IOException => null + } + } } From 560d25a1c59269563d3a12de6eecdce9f60df32e Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 26 Oct 2022 19:36:41 +0200 Subject: [PATCH 22/52] Improve IterativeMapping --- .../dimajix/spark/sql/DataFrameUtils.scala | 62 ++++++++++++++++--- 
.../spark/sql/DataFrameUtilsTest.scala | 36 +++++++++-- .../spec/mapping/IterativeSqlMapping.scala | 27 ++------ .../mapping/IterativeSqlMappingTest.scala | 59 +++++++++++++++++- 4 files changed, 146 insertions(+), 38 deletions(-) diff --git a/flowman-spark-extensions/src/main/scala/com/dimajix/spark/sql/DataFrameUtils.scala b/flowman-spark-extensions/src/main/scala/com/dimajix/spark/sql/DataFrameUtils.scala index 04a3d7902..a484e6a18 100644 --- a/flowman-spark-extensions/src/main/scala/com/dimajix/spark/sql/DataFrameUtils.scala +++ b/flowman-spark-extensions/src/main/scala/com/dimajix/spark/sql/DataFrameUtils.scala @@ -1,5 +1,5 @@ /* - * Copyright 2021 Kaya Kupferschmidt + * Copyright 2021-2022 Kaya Kupferschmidt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,6 +28,7 @@ import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.encoders.RowEncoder import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.catalyst.util.BadRecordException +import org.apache.spark.sql.functions.lit import org.apache.spark.sql.types.BinaryType import org.apache.spark.sql.types.StringType import org.apache.spark.sql.types.StructType @@ -35,6 +36,9 @@ import org.apache.spark.storage.StorageLevel import com.dimajix.spark.sql.catalyst.PlanUtils import com.dimajix.spark.sql.local.csv.CsvOptions +import org.apache.spark.sql.functions.count +import org.apache.spark.sql.functions.col +import org.apache.spark.sql.functions.not object DataFrameUtils { @@ -122,20 +126,64 @@ object DataFrameUtils { } } - def compare(left:DataFrame, right:DataFrame) : Boolean = { + /** + * Compare two DataFrames. They are considered to be equal if their schema and all records match. The order of + * the records may be different, though. The comparison is conducted distributed and works with DataFrames of + * arbitrary sizes, but is much slower than [[quickCompare]] + * + * @param expected + * @param result + * @return + */ + def compare(expected: DataFrame, result: DataFrame): Boolean = { + if (SchemaUtils.dropMetadata(expected.schema) != SchemaUtils.dropMetadata(result.schema)) { + false + } + else { + val expectedCol = "flowman_compareDataFrames_expected" + val actualCol = "flowman_compareDataFrames_actual" + val expectedColumns = expected.columns.map(s => expected(s)) + val expectedElementsCount = expected.as("l") + .groupBy(expectedColumns: _*) + .agg(count(lit(1)).as(expectedCol)) + val resultColumns = result.columns.map(s => result(s)) + val resultElementsCount = result.as("r") + .groupBy(resultColumns: _*) + .agg(count(lit(1)).as(actualCol)) + + val joinExprs = expected.columns + .map(s => resultElementsCount(s) <=> expectedElementsCount(s)).reduce(_.and(_)) + val diff = expectedElementsCount + .join(resultElementsCount, joinExprs, "full_outer") + .filter(not(col(expectedCol) <=> col(actualCol))) + diff.take(1).length == 0 + } + } + + /** + * Compare two DataFrames. They are considered to be equal if their schema and all records match. The order of + * the records may be different, though. The comparison is conducted locally, so it should only be used on small + * DataFrames + * + * @param left + * @param right + * @return + */ + def quickCompare(left:DataFrame, right:DataFrame) : Boolean = { val leftRows = left.collect().toSeq val rightRows = right.collect().toSeq - compare(leftRows, rightRows) + quickCompare(leftRows, rightRows) } /** * Compare two DataFrames. 
They are considered to be equal if their schema and all records match. The order of - * the records may be different, though. + * the records may be different, though. The comparison is conducted locally, so it should only be used on small + * DataFrames * @param left * @param right * @return */ - def compare(left:Seq[Row], right:Seq[Row]) : Boolean = { + def quickCompare(left:Seq[Row], right:Seq[Row]) : Boolean = { normalizeRows(left) == normalizeRows(right) } @@ -149,13 +197,13 @@ object DataFrameUtils { val expectedRows = expected.collect().toSeq val actualRows = actual.collect().toSeq - if (!compare(expectedRows, actualRows)) + if (!quickCompare(expectedRows, actualRows)) Some(genError(expectedRows, actualRows)) else None } def diff(expected:Seq[Row], actual:Seq[Row]) : Option[String] = { - if (!compare(expected, actual)) + if (!quickCompare(expected, actual)) Some(genError(expected, actual)) else None diff --git a/flowman-spark-extensions/src/test/scala/com/dimajix/spark/sql/DataFrameUtilsTest.scala b/flowman-spark-extensions/src/test/scala/com/dimajix/spark/sql/DataFrameUtilsTest.scala index b5f0533e0..53f50458d 100644 --- a/flowman-spark-extensions/src/test/scala/com/dimajix/spark/sql/DataFrameUtilsTest.scala +++ b/flowman-spark-extensions/src/test/scala/com/dimajix/spark/sql/DataFrameUtilsTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2021 Kaya Kupferschmidt + * Copyright 2021-2022 Kaya Kupferschmidt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,6 +46,29 @@ class DataFrameUtilsTest extends AnyFlatSpec with Matchers with LocalSparkSessio } "DataFrameUtils.compare" should "work" in { + val schema = StructType(Seq( + StructField("c1", IntegerType), + StructField("c2", StringType), + StructField("c3", DoubleType), + StructField("c4", DecimalType(30, 6)), + StructField("c5", DateType) + )) + val lines = Seq( + Array("1", "lala", "2.3", "2.5", "2019-02-01"), + Array("2", "", "3.4", "", ""), + Array("", null, "", null, null) + ) + val df1 = DataFrameBuilder.ofStringValues(spark, lines, schema) + val df2 = DataFrameBuilder.ofStringValues(spark, lines, schema) + + DataFrameUtils.compare(df1, df2) should be(true) + DataFrameUtils.compare(df1.limit(2), df2) should be(false) + DataFrameUtils.compare(df1, df2.limit(2)) should be(false) + DataFrameUtils.compare(df1.drop("c1"), df2) should be(false) + DataFrameUtils.compare(df1, df2.drop("c1")) should be(false) + } + + "DataFrameUtils.quickCompare" should "work" in { val schema = StructType(Seq( StructField("c1", IntegerType), StructField("c2", StringType), @@ -61,11 +84,11 @@ class DataFrameUtilsTest extends AnyFlatSpec with Matchers with LocalSparkSessio val df1 = DataFrameBuilder.ofStringValues(spark, lines, schema) val df2 = DataFrameBuilder.ofStringValues(spark, lines, schema) - DataFrameUtils.compare(df1, df2) should be (true) - DataFrameUtils.compare(df1.limit(2), df2) should be (false) - DataFrameUtils.compare(df1, df2.limit(2)) should be (false) - DataFrameUtils.compare(df1.drop("c1"), df2) should be (false) - DataFrameUtils.compare(df1, df2.drop("c1")) should be (false) + DataFrameUtils.quickCompare(df1, df2) should be (true) + DataFrameUtils.quickCompare(df1.limit(2), df2) should be (false) + DataFrameUtils.quickCompare(df1, df2.limit(2)) should be (false) + DataFrameUtils.quickCompare(df1.drop("c1"), df2) should be (false) + DataFrameUtils.quickCompare(df1, df2.drop("c1")) should be (false) } "DataFrameUtils.diff" should "work" in { @@ -161,4 
+184,5 @@ class DataFrameUtilsTest extends AnyFlatSpec with Matchers with LocalSparkSessio spark.sessionState.catalog.getTempView("temp") should be (None) } + } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMapping.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMapping.scala index 147b2688f..1c5ee9c93 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMapping.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMapping.scala @@ -87,9 +87,10 @@ extends BaseMapping { def fix(in:DataFrame, iteration:Int=1) : DataFrame = { if (iteration > maxIterations) throw new ExecutionException(s"Recursive mapping '$identifier' exceeded maximum iterations $maxIterations") + val result = nextDf(statement, in) - if (!checkDataFramesEquals(in, result)) - fix(result, iteration+1) + if (!DataFrameUtils.compare(in, result)) + fix(result.localCheckpoint(false), iteration+1) else result } @@ -103,33 +104,13 @@ extends BaseMapping { Map("main" -> result) } - private def nextDf(statement:String, prev:DataFrame) : DataFrame = { val spark = prev.sparkSession withTempView("__this__", prev) { - spark.sql(statement).localCheckpoint(false) + spark.sql(statement) } } - private def checkDataFramesEquals(expected:DataFrame, result:DataFrame) : Boolean = { - val expectedCol = "assertDataFrameNoOrderEquals_expected" - val actualCol = "assertDataFrameNoOrderEquals_actual" - val expectedColumns = expected.columns.map(s => expected(s)) - val expectedElementsCount = expected - .groupBy(expectedColumns: _*) - .agg(count(lit(1)).as(expectedCol)) - val resultColumns = result.columns.map(s => result(s)) - val resultElementsCount = result - .groupBy(resultColumns: _*) - .agg(count(lit(1)).as(actualCol)) - - val joinExprs = expected.columns - .map(s => expected.col(s) <=> result.col(s)).reduce(_.and(_)) - val diff = expectedElementsCount - .join(resultElementsCount, joinExprs, "full_outer") - .filter(not(col(expectedCol) <=> col(actualCol))) - diff.take(1).length == 0 - } /** * Returns the schema as produced by this mapping, relative to the given input schema. The map might not contain diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala index d0778fba1..fc2d7cbc2 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2020 Kaya Kupferschmidt + * Copyright 2020-2022 Kaya Kupferschmidt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
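The reworked fix() above is a textbook fixpoint iteration: the SQL statement is re-applied until the output no longer changes, and each intermediate result is localCheckpoint-ed so the logical plan does not grow with every round, while the distributed DataFrameUtils.compare keeps the equality check scalable. Stripped of the mapping plumbing, the pattern looks roughly like this (names simplified; the real IterativeSqlMapping additionally registers the previous result as the __this__ temp view):

    import scala.annotation.tailrec

    import org.apache.spark.sql.DataFrame

    import com.dimajix.spark.sql.DataFrameUtils

    object FixpointSketch {
      // Iterate 'step' until the result stabilizes, truncating lineage after
      // each round so the logical plan stays small.
      def fixpoint(start: DataFrame, maxIterations: Int)(step: DataFrame => DataFrame): DataFrame = {
        @tailrec
        def loop(current: DataFrame, iteration: Int): DataFrame = {
          if (iteration > maxIterations)
            throw new IllegalStateException(s"No fixpoint reached after $maxIterations iterations")
          val next = step(current)
          if (DataFrameUtils.compare(current, next)) next
          else loop(next.localCheckpoint(false), iteration + 1)
        }
        loop(start, 1)
      }
    }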
@@ -16,6 +16,8 @@ package com.dimajix.flowman.spec.mapping +import org.apache.spark.sql.Row +import org.apache.spark.sql.functions.col import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers @@ -28,10 +30,11 @@ import com.dimajix.flowman.model.Module import com.dimajix.flowman.types.Field import com.dimajix.flowman.types.IntegerType import com.dimajix.flowman.types.StructType +import com.dimajix.spark.sql.DataFrameUtils import com.dimajix.spark.testing.LocalSparkSession -class IterativeSqlMappingTest extends AnyFlatSpec with Matchers with LocalSparkSession{ +class IterativeSqlMappingTest extends AnyFlatSpec with Matchers with LocalSparkSession { "The IterativeSqlMapping" should "be parseable" in { val spec = """ @@ -120,4 +123,56 @@ class IterativeSqlMappingTest extends AnyFlatSpec with Matchers with LocalSparkS )) )) } + + it should "support complex hierarchical lookups" in { + val spark = this.spark + + val session = Session.builder().withSparkSession(spark).build() + val context = session.context + val executor = session.execution + + val mapping = IterativeSqlMapping( + Mapping.Properties(context), + MappingOutputIdentifier("organization_hierarchy_start"), + Some( + """ + |SELECT + | COALESCE(parent.tree_id, t.tree_id) AS tree_id, + | t.account_number, + | t.parent_account_number, + | t.company_name + |FROM organization_hierarchy_start t + |LEFT JOIN __this__ parent + | ON t.parent_account_number = parent.account_number + |""".stripMargin), + None, + None, + maxIterations = 99 + ) + + val inputDf = spark.createDataFrame(Seq( + ("1000", null, "Company 1000"), + ("1100", "1000", "Company 1100"), + ("1110", "1100", "Company 1110"), + ("1111", "1110", "Company 1111"), + ("11111", "1111", "Company 11111"), + ("1200", "1000", "Company 1200"), + ("2000", null, "Company 2000") + )) + .withColumnRenamed("_1", "account_number") + .withColumnRenamed("_2", "parent_account_number") + .withColumnRenamed("_3", "company_name") + .withColumn("tree_id", col("account_number")) + val result = mapping.execute(executor, Map(MappingOutputIdentifier("organization_hierarchy_start") -> inputDf))("main") + val expected = Seq( + Row("1000", "1000", null, "Company 1000"), + Row("1000", "1100", "1000", "Company 1100"), + Row("1000", "1110", "1100", "Company 1110"), + Row("1000", "1111", "1110", "Company 1111"), + Row("1000", "11111", "1111", "Company 11111"), + Row("1000", "1200", "1000", "Company 1200"), + Row("2000", "2000", null, "Company 2000"), + ) + DataFrameUtils.quickCompare(result.collect(), expected) should be (true) + } } From 267d84f56c84134b6abb7c9a839dbed47ac98ed1 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Thu, 27 Oct 2022 07:18:18 +0200 Subject: [PATCH 23/52] Fix build for Scala 2.11 --- .../dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala index fc2d7cbc2..46115fc69 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/IterativeSqlMappingTest.scala @@ -171,7 +171,7 @@ class IterativeSqlMappingTest extends AnyFlatSpec with Matchers with LocalSparkS Row("1000", "1111", "1110", "Company 1111"), Row("1000", "11111", "1111", "Company 11111"), Row("1000", "1200", "1000", "Company 1200"), - 
Row("2000", "2000", null, "Company 2000"), + Row("2000", "2000", null, "Company 2000") ) DataFrameUtils.quickCompare(result.collect(), expected) should be (true) } From 95180e419e664729820c4735820b882d9ee3c358 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Thu, 27 Oct 2022 11:41:58 +0200 Subject: [PATCH 24/52] Add jackson-datatypes-jdk8 to root and parent POM --- flowman-parent/pom.xml | 6 ++++++ pom.xml | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml index 694bff8e0..45b2f27ae 100644 --- a/flowman-parent/pom.xml +++ b/flowman-parent/pom.xml @@ -851,6 +851,12 @@ ${jackson.version} compile + + com.fasterxml.jackson.datatype + jackson-datatype-jdk8 + ${jackson.version} + compile + com.thoughtworks.paranamer diff --git a/pom.xml b/pom.xml index 449334f3a..059de96dc 100644 --- a/pom.xml +++ b/pom.xml @@ -1729,6 +1729,12 @@ ${jackson.version} compile + + com.fasterxml.jackson.datatype + jackson-datatype-jdk8 + ${jackson.version} + compile + com.thoughtworks.paranamer From 5ac7b841b81fc9529543baba3ec3a3546538b0f0 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Thu, 27 Oct 2022 13:43:16 +0200 Subject: [PATCH 25/52] github-270 Upgrade Spark to 3.3.1 --- CHANGELOG.md | 1 + docs/releases.md | 1 + .../spark/sql/catalyst/SqlBuilder.scala | 16 +++++- .../dimajix/spark/testing/log4j.properties | 5 +- .../spark/testing/LocalSparkSession.scala | 26 +++------- .../mapping/GroupedAggregateMapping.scala | 4 +- .../mapping/GroupedAggregateMappingTest.scala | 2 +- .../flowman/testing/log4j-defaults.properties | 2 +- pom.xml | 52 ++++++++++++++----- 9 files changed, 70 insertions(+), 39 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 38e129943..9bccc908d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ * github-264: Provide "jars" for all plugins * github-267: Add new flowman-spark-dependencies module to simplify dependency management * github-269: Implement new 'iterativeSql' mapping +* github-270: Upgrade Spark to 3.3.1 # Version 0.28.0 - 2022-10-07 diff --git a/docs/releases.md b/docs/releases.md index 5b631ca30..e30fdee73 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -24,6 +24,7 @@ changes over time. 
* github-264: Provide "jars" for all plugins * github-267: Add new flowman-spark-dependencies module to simplify dependency management * github-269: Create 'iterativeSql' mapping +* github-270: Upgrade Spark to 3.3.1 ### Version 0.28.0 - 2022-10-07 diff --git a/flowman-spark-extensions/src/main/spark-3.3/com/dimajix/spark/sql/catalyst/SqlBuilder.scala b/flowman-spark-extensions/src/main/spark-3.3/com/dimajix/spark/sql/catalyst/SqlBuilder.scala index f19ab3fbe..8723eb8a8 100644 --- a/flowman-spark-extensions/src/main/spark-3.3/com/dimajix/spark/sql/catalyst/SqlBuilder.scala +++ b/flowman-spark-extensions/src/main/spark-3.3/com/dimajix/spark/sql/catalyst/SqlBuilder.scala @@ -703,7 +703,9 @@ class SqlBuilder private( // Remove aliases RemoveRedundantAliases, // Remove redundant casts - SimplifyCasts + SimplifyCasts, + // Remove redundant aliases + EliminateAlias ) ) @@ -748,6 +750,18 @@ class SqlBuilder private( } } + object EliminateAlias extends Rule[LogicalPlan] { + override def apply(tree: LogicalPlan): LogicalPlan = { + tree transformAllExpressions { + case a: Alias => + a.child match { + case n:NamedExpression if n.name == a.name => n + case _ => a + } + } + } + } + case class SQLTable( database: String, table: String, diff --git a/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties b/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties index db49c46ea..de2a12461 100644 --- a/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties +++ b/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties @@ -15,7 +15,7 @@ # # Set everything to be logged to the console -log4j.rootCategory=INFO, console +log4j.rootCategory=WARN, console log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.target=System.err log4j.appender.console.layout=org.apache.log4j.PatternLayout @@ -48,6 +48,3 @@ log4j.logger.parquet.CorruptStatistics=ERROR # Slick logging log4j.logger.slick.jdbc.JdbcBackend.statement=WARN - -# Flowman logging -log4j.logger.com.dimajix.flowman=INFO diff --git a/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala b/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala index bc5652097..5d97752f9 100644 --- a/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala +++ b/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala @@ -58,7 +58,7 @@ trait LocalSparkSession extends LocalTempDir { this:Suite => override def beforeAll() : Unit = { super.beforeAll() - configureLogging() + setupLogging() val builder = SparkSession.builder() .master("local[4]") @@ -149,27 +149,17 @@ trait LocalSparkSession extends LocalTempDir { this:Suite => super.afterAll() } - private def configureLogging(): Unit = { + protected def setupLogging(): Unit = { val loader = Thread.currentThread.getContextClassLoader val configUrl = loader.getResource("com/dimajix/spark/testing/log4j.properties") - val props = loadProperties(configUrl) - PropertyConfigurator.configure(props) + setupLogging(configUrl) } - private def loadProperties(url: URL): Properties = { - try { - val urlConnection = url.openConnection - urlConnection.setUseCaches(false) - val inputStream = urlConnection.getInputStream - try { - val loaded = new Properties - loaded.load(inputStream) - loaded - } finally { - inputStream.close() - } - } catch { - case e: IOException => null + protected def 
setupLogging(url: URL): Unit = { + val log4j = System.getProperty("log4j.configuration") + if (log4j == null || log4j.isEmpty) { + val loader = Thread.currentThread.getContextClassLoader + PropertyConfigurator.configure(url) } } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/GroupedAggregateMapping.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/GroupedAggregateMapping.scala index bb7a70356..ac5b00dd7 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/GroupedAggregateMapping.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/mapping/GroupedAggregateMapping.scala @@ -117,9 +117,9 @@ case class GroupedAggregateMapping( val allGroupings = performGroupedAggregation(filteredInput, dimensionColumns, groupingColumns) // This is a workaround for newer Spark version, which apparently use a different mechanism to derive - // grouping-ids than Spark up until 3.1.x + // grouping-ids than Spark up until 3.1.x. It was changed back in Spark 3.3.1 val dimensionIndices2 = { - if (org.apache.spark.SPARK_VERSION >= "3.2") + if (org.apache.spark.SPARK_VERSION >= "3.2" && org.apache.spark.SPARK_VERSION < "3.3.1") groups.values.flatMap(g => g.dimensions ++ g.filter.map(f => filterNames(filterIndices(f)))).toSeq.distinct.zipWithIndex.toMap else dimensionIndices diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/GroupedAggregateMappingTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/GroupedAggregateMappingTest.scala index 3922fb35d..b1054358e 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/GroupedAggregateMappingTest.scala +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/mapping/GroupedAggregateMappingTest.scala @@ -173,7 +173,7 @@ class GroupedAggregateMappingTest extends AnyFlatSpec with Matchers with LocalSp ) mapping.input should be (MappingOutputIdentifier("data")) - mapping.outputs.toSet should be (Set("g1", "g2", "g3", "cache")) + mapping.outputs should be (Set("g1", "g2", "g3", "cache")) val data = execution.spark.createDataFrame(Seq( ("c1_v1", "c2_v1", "c3_v1", 23.0), diff --git a/flowman-testing/src/main/resources/com/dimajix/flowman/testing/log4j-defaults.properties b/flowman-testing/src/main/resources/com/dimajix/flowman/testing/log4j-defaults.properties index 62f7dd334..4a276af9e 100644 --- a/flowman-testing/src/main/resources/com/dimajix/flowman/testing/log4j-defaults.properties +++ b/flowman-testing/src/main/resources/com/dimajix/flowman/testing/log4j-defaults.properties @@ -20,7 +20,7 @@ log4j.appender.console=org.apache.log4j.ConsoleAppender log4j.appender.console.target=System.err log4j.appender.console.layout=org.apache.log4j.PatternLayout #log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n -log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} [%p] %m%n +log4j.appender.console.layout.ConversionPattern=[%p] %m%n # Settings to quiet third party logs that are too verbose diff --git a/pom.xml b/pom.xml index 059de96dc..fc6f44bc8 100644 --- a/pom.xml +++ b/pom.xml @@ -74,18 +74,14 @@ 4.0.4 5.8.2 3.21.0 - 1.2 1.9.4 - 3.0.9 1.4 1.8.1 3.1 2.6 1.1.3 - 3.4.1 3.1 1.5.4 - 1.6 1.6 3.9.9.Final @@ -102,15 +98,15 @@ 3.2 1.2.0 1.1.2 - 3.3.0 + 3.3.1 3.3 1.1.8.4 4.1.74.Final 4.8 - 1.30 - 2.13.3 + 1.31 + 2.13.4 2.13 - 2.13.3 + 2.13.4.1 2.8 2.8.0 10.14.2.0 @@ -126,10 +122,14 @@ 2.10.13 2.13.0 2.13.0 + 1.5.0 3.2.2 + 3.0.16 1.21 2.11.0 3.12.0 + 3.6.1 + 1.9 ${project.version} @@ -204,10 +204,14 @@ 
2.9.9 2.7.1 2.12.0 + 1.2 3.2.1 + 3.0.9 1.19 2.4 3.7 + 3.4.1 + 1.6 2.8.2 1.7.25 @@ -268,10 +272,14 @@ 2.9.3 4.3.0.${cdp.version} 4.3.0.${cdp.version} + 1.2 3.2.2 + 3.0.9 1.19 2.4 3.8.1 + 3.4.1 + 1.6 @@ -312,10 +320,14 @@ 2.9.3 2.7.1 2.7.1 + 1.2 3.2.2 + 3.0.9 1.8.1 2.4 3.5 + 3.4.1 + 1.6 @@ -353,10 +365,14 @@ 2.10.5 2.13.0 2.13.0 + 1.2 3.2.2 + 3.0.9 1.20 2.4 3.9 + 3.4.1 + 1.6 @@ -394,10 +410,14 @@ 2.10.5 2.13.0 2.13.0 + 1.2 3.2.2 + 3.0.9 1.20 2.4 3.9 + 3.4.1 + 1.6 @@ -435,11 +455,15 @@ 2.10.10 2.13.0 2.13.0 + 1.2 1.10 3.2.2 + 3.0.16 1.21 2.8.0 3.12.0 + 3.4.1 + 1.6 @@ -451,17 +475,17 @@ 3.2 1.2.0 1.1.2 - 3.3.0 + 3.3.1 3.3 2.3.9 2.7.2 1.1.8.4 4.1.74.Final 4.8 - 1.30 - 2.13.3 + 1.31 + 2.13.4 2.13 - 2.13.3 + 2.13.4.1 2.8 2.8.0 10.14.2.0 @@ -477,11 +501,15 @@ 2.10.13 2.13.0 2.13.0 + 1.5.0 1.10 3.2.2 + 3.0.16 1.21 2.11.0 3.12.0 + 3.6.1 + 1.9 From bba2817c2d66c423c6418ef0c2cd40a8a5ffc52c Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Thu, 27 Oct 2022 13:49:32 +0200 Subject: [PATCH 26/52] github-271 Upgrade DeltaLake to 2.1.1 --- CHANGELOG.md | 1 + docs/releases.md | 1 + flowman-plugins/delta/pom.xml | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9bccc908d..9956b3329 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ * github-267: Add new flowman-spark-dependencies module to simplify dependency management * github-269: Implement new 'iterativeSql' mapping * github-270: Upgrade Spark to 3.3.1 +* github-271: Upgrade Delta to 2.1.1 # Version 0.28.0 - 2022-10-07 diff --git a/docs/releases.md b/docs/releases.md index e30fdee73..0dbf8578e 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -25,6 +25,7 @@ changes over time. * github-267: Add new flowman-spark-dependencies module to simplify dependency management * github-269: Create 'iterativeSql' mapping * github-270: Upgrade Spark to 3.3.1 +* github-271: Upgrade Delta to 2.1.1 ### Version 0.28.0 - 2022-10-07 diff --git a/flowman-plugins/delta/pom.xml b/flowman-plugins/delta/pom.xml index d0e2b9f03..400cd3de9 100644 --- a/flowman-plugins/delta/pom.xml +++ b/flowman-plugins/delta/pom.xml @@ -18,7 +18,7 @@ flowman-delta ${project.version} ${project.build.finalName}.jar - 2.1.0 + 2.1.1 @@ -106,7 +106,7 @@ spark-3.3 - 2.1.0 + 2.1.1 From 31e8c4e714ff2662f1aa47178e3a3493ce130b47 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 31 Oct 2022 18:03:06 +0100 Subject: [PATCH 27/52] github-273 Support resource files and jar files in FileSystem interface --- .../scala/com/dimajix/common/Resources.scala | 4 + .../dimajix/flowman/catalog/HiveCatalog.scala | 2 +- .../flowman/documentation/Documenter.scala | 2 +- .../flowman/execution/AbstractContext.scala | 2 +- .../execution/AnalyzingExecution.scala | 2 +- .../dimajix/flowman/execution/Context.scala | 2 +- .../dimajix/flowman/execution/Execution.scala | 2 +- .../flowman/execution/MonitorExecution.scala | 2 +- .../flowman/execution/RootContext.scala | 2 +- .../flowman/execution/RootExecution.scala | 2 +- .../flowman/execution/ScopedExecution.scala | 2 +- .../dimajix/flowman/execution/Session.scala | 2 +- .../dimajix/flowman/{hadoop => fs}/File.scala | 13 +- .../{hadoop => fs}/FileCollector.scala | 2 +- .../flowman/{hadoop => fs}/FileSystem.scala | 35 ++- .../flowman/{hadoop => fs}/FileUtils.scala | 2 +- .../flowman/{hadoop => fs}/GlobPattern.scala | 2 +- .../flowman/{hadoop => fs}/HadoopFile.scala | 29 ++- .../com/dimajix/flowman/fs/JavaFile.scala | 201 ++++++++++++++++++ .../com/dimajix/flowman/model/Module.scala | 8 +- 
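The package move from com.dimajix.flowman.hadoop to com.dimajix.flowman.fs in the patch below reflects that File is no longer Hadoop-only: the rewritten FileSystem hands out HadoopFile wrappers for Hadoop paths, java.nio-backed JavaFile instances for local paths, and resource files resolved from the classpath, including entries inside JARs. Based on the signatures introduced below, usage would look roughly like this (the concrete paths are illustrative):

    import org.apache.hadoop.conf.Configuration

    import com.dimajix.flowman.fs.FileSystem

    // Sketch of the three entry points of the unified FileSystem facade.
    object FileSystemSketch {
      def main(args: Array[String]): Unit = {
        val fs = FileSystem(new Configuration())

        val remote  = fs.file("hdfs://namenode/data/input.csv")              // HadoopFile
        val local   = fs.local("/tmp/flowman")                               // JavaFile (java.nio)
        val bundled = fs.resource("com/dimajix/flowman/flowman.properties")  // works from JARs, too

        println(s"${local.name} exists: ${local.exists()}")
      }
    }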
.../com/dimajix/flowman/model/Project.scala | 2 +- .../flowman/model/ResourceIdentifier.scala | 2 +- .../com/dimajix/flowman/model/velocity.scala | 2 +- .../flowman/spi/DocumenterReader.scala | 2 +- .../dimajix/flowman/spi/ModuleReader.scala | 2 +- .../dimajix/flowman/spi/ProjectReader.scala | 2 +- .../com/dimajix/flowman/storage/Parcel.scala | 2 +- .../dimajix/flowman/storage/Workspace.scala | 2 +- .../dimajix/flowman/templating/wrapper.scala | 4 +- .../dimajix/flowman/types/SchemaWriter.scala | 2 +- .../dimajix/flowman/util/ObjectMapper.scala | 2 +- .../execution/ProjectContextTest.scala | 2 +- .../{hadoop => fs}/FileCollectorTest.scala | 2 +- .../FileSystemTest.scala} | 42 +++- .../dimajix/flowman/fs/HadoopFileTest.scala | 86 ++++++++ .../com/dimajix/flowman/fs/JavaFileTest.scala | 122 +++++++++++ .../com/dimajix/flowman/dsl/module.scala | 2 +- .../spec/relation/DeltaFileRelation.scala | 2 +- .../spec/target/SftpUploadTarget.scala | 8 +- .../flowman/spec/YamlDocumenterReader.scala | 2 +- .../flowman/spec/YamlModuleReader.scala | 2 +- .../flowman/spec/YamlProjectReader.scala | 2 +- .../spec/documentation/FileGenerator.scala | 2 +- .../flowman/spec/relation/FileRelation.scala | 4 +- .../spec/relation/HiveTableRelation.scala | 2 +- .../relation/HiveUnionTableRelation.scala | 2 +- .../flowman/spec/relation/LocalRelation.scala | 2 +- .../flowman/spec/storage/FileStore.scala | 4 +- .../flowman/spec/storage/LocalParcel.scala | 4 +- .../flowman/spec/storage/LocalWorkspace.scala | 4 +- .../flowman/spec/storage/ParcelSpec.scala | 2 +- .../flowman/spec/target/FileTarget.scala | 2 +- .../dimajix/flowman/spec/ProjectTest.scala | 2 +- .../spec/storage/LocalWorkspaceTest.scala | 2 +- .../spec/target/MergeFilesTargetTest.scala | 4 +- .../rest/workspace/ParcelEndpoint.scala | 2 +- .../studio/service/WorkspaceManager.scala | 2 +- .../com/dimajix/flowman/testing/Runner.scala | 2 +- .../com/dimajix/flowman/tools/Tool.scala | 2 +- 59 files changed, 566 insertions(+), 92 deletions(-) rename flowman-core/src/main/scala/com/dimajix/flowman/{hadoop => fs}/File.scala (93%) rename flowman-core/src/main/scala/com/dimajix/flowman/{hadoop => fs}/FileCollector.scala (99%) rename flowman-core/src/main/scala/com/dimajix/flowman/{hadoop => fs}/FileSystem.scala (54%) rename flowman-core/src/main/scala/com/dimajix/flowman/{hadoop => fs}/FileUtils.scala (99%) rename flowman-core/src/main/scala/com/dimajix/flowman/{hadoop => fs}/GlobPattern.scala (99%) rename flowman-core/src/main/scala/com/dimajix/flowman/{hadoop => fs}/HadoopFile.scala (91%) create mode 100644 flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala rename flowman-core/src/test/scala/com/dimajix/flowman/{hadoop => fs}/FileCollectorTest.scala (99%) rename flowman-core/src/test/scala/com/dimajix/flowman/{hadoop/FileTest.scala => fs/FileSystemTest.scala} (70%) create mode 100644 flowman-core/src/test/scala/com/dimajix/flowman/fs/HadoopFileTest.scala create mode 100644 flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala diff --git a/flowman-common/src/main/scala/com/dimajix/common/Resources.scala b/flowman-common/src/main/scala/com/dimajix/common/Resources.scala index 2047996ad..3fbebf7f2 100644 --- a/flowman-common/src/main/scala/com/dimajix/common/Resources.scala +++ b/flowman-common/src/main/scala/com/dimajix/common/Resources.scala @@ -22,6 +22,10 @@ import java.util.Properties class Resources object Resources { + def getURL(resourceName:String) : URL = { + classOf[Resources].getClassLoader.getResource(resourceName) + } + def 
loadProperties(resourceName:String) : Properties = { val url = classOf[Resources].getClassLoader.getResource(resourceName) loadProperties(url) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/catalog/HiveCatalog.scala b/flowman-core/src/main/scala/com/dimajix/flowman/catalog/HiveCatalog.scala index c2b202f9f..832457a94 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/catalog/HiveCatalog.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/catalog/HiveCatalog.scala @@ -57,7 +57,7 @@ import com.dimajix.flowman.catalog.TableChange.UpdateColumnComment import com.dimajix.flowman.catalog.TableChange.UpdateColumnNullability import com.dimajix.flowman.config.Configuration import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileUtils +import com.dimajix.flowman.fs.FileUtils import com.dimajix.flowman.model.PartitionField import com.dimajix.flowman.model.PartitionSchema import com.dimajix.spark.features.hiveVarcharSupported diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/documentation/Documenter.scala b/flowman-core/src/main/scala/com/dimajix/flowman/documentation/Documenter.scala index 1ab246fb5..f6410d3fd 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/documentation/Documenter.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/documentation/Documenter.scala @@ -28,7 +28,7 @@ import com.dimajix.flowman.execution.Execution import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.Session import com.dimajix.flowman.graph.Graph -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model import com.dimajix.flowman.model.AbstractInstance import com.dimajix.flowman.model.Job diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/AbstractContext.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/AbstractContext.scala index 92abc2bed..71c3ff1ab 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/AbstractContext.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/AbstractContext.scala @@ -24,7 +24,7 @@ import org.slf4j.Logger import com.dimajix.flowman.config.Configuration import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.model.Connection import com.dimajix.flowman.model.Profile import com.dimajix.flowman.model.Prototype diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/AnalyzingExecution.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/AnalyzingExecution.scala index cff3138bb..b1de39824 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/AnalyzingExecution.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/AnalyzingExecution.scala @@ -23,7 +23,7 @@ import org.slf4j.LoggerFactory import com.dimajix.flowman.catalog.HiveCatalog import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.metric.MetricBoard import com.dimajix.flowman.metric.MetricSystem import com.dimajix.flowman.model.Assertion diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Context.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Context.scala index 000620e28..d3a274fa1 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Context.scala +++ 
b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Context.scala @@ -21,7 +21,7 @@ import org.apache.spark.SparkConf import com.dimajix.flowman.config.Configuration import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.model.Connection import com.dimajix.flowman.model.ConnectionIdentifier import com.dimajix.flowman.model.Job diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Execution.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Execution.scala index d8871068e..98dace6b6 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Execution.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Execution.scala @@ -23,7 +23,7 @@ import org.apache.spark.sql.SparkSession import com.dimajix.flowman.catalog.HiveCatalog import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.metric.MetricBoard import com.dimajix.flowman.metric.MetricSystem import com.dimajix.flowman.model.Assertion diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/MonitorExecution.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/MonitorExecution.scala index a849941d4..09cce5c87 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/MonitorExecution.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/MonitorExecution.scala @@ -21,7 +21,7 @@ import org.apache.spark.sql.SparkSession import com.dimajix.flowman.catalog.HiveCatalog import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.metric.MetricBoard import com.dimajix.flowman.metric.MetricSystem import com.dimajix.flowman.model.Mapping diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootContext.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootContext.scala index 9d290fd97..f1ac674d0 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootContext.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootContext.scala @@ -25,7 +25,7 @@ import org.apache.spark.SparkConf import org.slf4j.LoggerFactory import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.model.Connection import com.dimajix.flowman.model.ConnectionIdentifier import com.dimajix.flowman.model.Job diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootExecution.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootExecution.scala index 896b895f1..21cf648fb 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootExecution.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootExecution.scala @@ -21,7 +21,7 @@ import org.slf4j.LoggerFactory import com.dimajix.flowman.catalog.HiveCatalog import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.metric.MetricBoard import com.dimajix.flowman.metric.MetricSystem import com.dimajix.flowman.model.Assertion diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/ScopedExecution.scala 
b/flowman-core/src/main/scala/com/dimajix/flowman/execution/ScopedExecution.scala index 31cd3df32..5fd9e4c40 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/ScopedExecution.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/ScopedExecution.scala @@ -21,7 +21,7 @@ import org.slf4j.LoggerFactory import com.dimajix.flowman.catalog.HiveCatalog import com.dimajix.flowman.config.FlowmanConf -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.metric.MetricBoard import com.dimajix.flowman.metric.MetricSystem diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala index e7bbef7b2..a2540530d 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala @@ -29,7 +29,7 @@ import com.dimajix.flowman.config.Configuration import com.dimajix.flowman.config.FlowmanConf import com.dimajix.flowman.documentation.Documenter import com.dimajix.flowman.execution.Session.builder -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.history.NullStateStore import com.dimajix.flowman.history.StateStore import com.dimajix.flowman.metric.MetricSystem diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/File.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala similarity index 93% rename from flowman-core/src/main/scala/com/dimajix/flowman/hadoop/File.scala rename to flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala index 0f064042b..102a4f58f 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/File.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.dimajix.flowman.hadoop +package com.dimajix.flowman.fs import java.io.InputStream import java.io.OutputStream @@ -33,7 +33,7 @@ object File { * @param fs * @param path */ -abstract class File { +abstract class File { override def toString: String = if (path != null) path.toString else "" def path : Path @@ -49,7 +49,7 @@ abstract class File { * Returns the file name of the File * @return */ - def filename : String = { + def name : String = { path.getName } @@ -71,15 +71,13 @@ abstract class File { */ def length : Long - def resolve(name:String) : File - /** * Lists all directory entries. Will throw an exception if the File is not a directory * @return */ def list() : Seq[File] - def glob(pattern:Path) : Seq[File] + def glob(pattern:String) : Seq[File] /** * Renamed the file to a different name @@ -95,7 +93,7 @@ abstract class File { def copy(dst:File, overwrite:Boolean) : Unit /** - * Creates a file and returns the correspondiong output stream + * Creates a file and returns the corresponding output stream. Intermediate directories will be created as required. 
* @param overwrite * @return */ @@ -148,4 +146,5 @@ abstract class File { def withSuffix(suffix:String) : File def withName(name:String) : File + } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileCollector.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileCollector.scala similarity index 99% rename from flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileCollector.scala rename to flowman-core/src/main/scala/com/dimajix/flowman/fs/FileCollector.scala index e715709c7..76966d207 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileCollector.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileCollector.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.dimajix.flowman.hadoop +package com.dimajix.flowman.fs import java.io.{FileNotFoundException, StringWriter} diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala similarity index 54% rename from flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileSystem.scala rename to flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala index a652c1570..5856dab9d 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala @@ -14,22 +14,24 @@ * limitations under the License. */ -package com.dimajix.flowman.hadoop +package com.dimajix.flowman.fs import java.net.URI +import java.nio.file.FileSystemNotFoundException +import java.nio.file.Paths +import java.util.Collections import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.LocalFileSystem import org.apache.hadoop.fs.Path +import com.dimajix.common.Resources + /** * This is a super thin wrapper around Hadoop FileSystems which is used to create Flowman File instances * @param conf */ case class FileSystem(conf:Configuration) { - private val localFs = org.apache.hadoop.fs.FileSystem.getLocal(conf) - def file(path:Path) : File = { val fs = path.getFileSystem(conf) HadoopFile(fs, path) @@ -37,8 +39,25 @@ case class FileSystem(conf:Configuration) { def file(path:String) : File = file(new Path(path)) def file(path:URI) : File = file(new Path(path)) - def local(path:Path) : File = HadoopFile(localFs, path) - def local(path:String) : File = local(new Path(path)) - def local(path:java.io.File) : File = local(new Path(path.toString)) - def local(path:URI) : File = local(new Path(path)) + def local(path:Path) : File = local(path.toUri) + def local(path:String) : File = JavaFile(Paths.get(path)) + def local(path:java.io.File) : File = JavaFile(path.toPath) + def local(path:URI) : File = JavaFile(Paths.get(path)) + + def resource(path:String) : File = { + val uri = Resources.getURL(path).toURI + if (uri.getScheme == "jar") { + try { + java.nio.file.FileSystems.getFileSystem(uri) + } + catch { + case _: FileSystemNotFoundException => + java.nio.file.FileSystems.newFileSystem(uri, Collections.emptyMap[String, String]()) + } + JavaFile(Paths.get(uri)) + } + else { + JavaFile(Paths.get(uri)) + } + } } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileUtils.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileUtils.scala similarity index 99% rename from flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileUtils.scala rename to flowman-core/src/main/scala/com/dimajix/flowman/fs/FileUtils.scala index d11d93f48..bb3d2bfe8 100644 --- 
a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/FileUtils.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileUtils.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.dimajix.flowman.hadoop +package com.dimajix.flowman.fs import java.io.FileNotFoundException diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/GlobPattern.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/GlobPattern.scala similarity index 99% rename from flowman-core/src/main/scala/com/dimajix/flowman/hadoop/GlobPattern.scala rename to flowman-core/src/main/scala/com/dimajix/flowman/fs/GlobPattern.scala index 3a237dfa6..c6f1cbe56 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/GlobPattern.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/GlobPattern.scala @@ -16,7 +16,7 @@ * limitations under the License. */ -package com.dimajix.flowman.hadoop +package com.dimajix.flowman.fs import java.util.regex.Pattern import java.util.regex.PatternSyntaxException diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/HadoopFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala similarity index 91% rename from flowman-core/src/main/scala/com/dimajix/flowman/hadoop/HadoopFile.scala rename to flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala index 4331b9252..032d14c23 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/hadoop/HadoopFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala @@ -14,10 +14,11 @@ * limitations under the License. */ -package com.dimajix.flowman.hadoop +package com.dimajix.flowman.fs import java.io.FileNotFoundException import java.io.IOException +import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.FSDataInputStream @@ -42,7 +43,11 @@ case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends Fil * @return */ def /(sub:String) : File = { - HadoopFile(fs, new Path(path, sub)) + val rel = new Path(new URI(sub)) + if (rel.isAbsolute) + HadoopFile(fs, rel) + else + HadoopFile(fs, new Path(path, sub)) } /** @@ -50,7 +55,17 @@ case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends Fil * @return */ def parent : File = { - HadoopFile(fs, path.getParent) + val p = path.getParent + if (p == null) { + this + } + else if (p.getName.isEmpty) { + HadoopFile(fs, p) + } + else { + val uri = new URI(p.toUri.toString + "/") + HadoopFile(fs, new Path(uri)) + } } /** @@ -69,10 +84,6 @@ case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends Fil fs.getFileStatus(path).getLen } - def resolve(name:String) : File = { - HadoopFile(fs, new Path(path.toUri.resolve(name))) - } - /** * Lists all directory entries. Will throw an exception if the File is not a directory * @return @@ -86,7 +97,7 @@ case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends Fil .map(_._2) } - def glob(pattern:Path) : Seq[File] = { + def glob(pattern:String) : Seq[File] = { if (!isDirectory()) throw new IOException(s"File '$path' is not a directory - cannot list files") fs.globStatus(new Path(path, pattern)) @@ -134,7 +145,7 @@ case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends Fil } /** - * Creates a file and returns the correspondiong output stream + * Creates a file and returns the corresponding output stream. Intermediate directories will be created as required. 
* @param overwrite * @return */ diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala new file mode 100644 index 000000000..391149f7f --- /dev/null +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala @@ -0,0 +1,201 @@ +/* + * Copyright 2022 Kaya Kupferschmidt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dimajix.flowman.fs + +import java.io.InputStream +import java.io.OutputStream +import java.net.URI +import java.nio.file.Files +import java.nio.file.Path +import java.nio.file.Paths +import java.nio.file.StandardOpenOption +import java.util.Comparator +import java.util.stream.Collectors + +import scala.collection.JavaConverters._ + +import org.apache.hadoop.fs + + +case class JavaFile(jpath:Path) extends File { + override def path: fs.Path = new fs.Path(jpath.toUri) + + /** + * Creates a new File object by attaching a child entry + * + * @param sub + * @return + */ + override def /(sub: String): File = { + val uri = new URI(sub) + if (uri.isAbsolute) + JavaFile(Paths.get(uri)) + else + JavaFile(jpath.resolve(sub)) + } + + /** + * Returns the parent directory of the File + * + * @return + */ + override def parent: File = { + val p = jpath.getParent + if (p != null) + JavaFile(p) + else + this + } + + /** + * Returns the absolute path + * + * @return + */ + override def absolute: File = JavaFile(jpath.toAbsolutePath) + + /** + * Returns the size of the file. Will throw an exception if the file does not exist + * + * @return + */ + override def length: Long = Files.size(jpath) + + /** + * Lists all directory entries. Will throw an exception if the File is not a directory + * + * @return + */ + override def list(): Seq[File] = Files.list(jpath) + .collect(Collectors.toList[Path]) + .asScala + .sortBy(_.toString) + .map(JavaFile) + + override def glob(pattern: String): Seq[File] = { + val stream = Files.newDirectoryStream(jpath, pattern) + stream.asScala + .toSeq + .sortBy(_.toString) + .map(x => JavaFile(x)) + } + + /** + * Renamed the file to a different name + * + * @param dst + */ + override def rename(dst: fs.Path): Unit = { + Files.move(jpath, Paths.get(dst.toUri)) + } + + /** + * Copies the file to a different file. The relation file may reside on a different file system + * + * @param dst + * @param overwrite + */ + override def copy(dst: File, overwrite: Boolean): Unit = { + val out = dst.create(overwrite) + try { + Files.copy(jpath, out) + } + finally { + out.close() + } + } + + /** + * Creates a file and returns the corresponding output stream. Intermediate directories will be created as required. 
+ * + * @param overwrite + * @return + */ + override def create(overwrite: Boolean): OutputStream = { + Files.createDirectories(jpath.getParent) + if (overwrite) + Files.newOutputStream(jpath, StandardOpenOption.CREATE, StandardOpenOption.TRUNCATE_EXISTING, StandardOpenOption.WRITE) + else + Files.newOutputStream(jpath, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE) + } + + /** + * Opens an existing file and returns the corresponding input stream + * + * @return + */ + override def open(): InputStream = { + Files.newInputStream(jpath) + } + + /** + * Deletes the file and/or directory + * + * @param recursive + */ + override def delete(recursive: Boolean): Unit = { + if (recursive) { + Files.walk(jpath) + .sorted(Comparator.reverseOrder[Path]()) + .forEach(p => p.toFile.delete()) + } + else { + Files.delete(jpath) + } + } + + /** + * Returns true if the file exists. It can either be a file or a directory + * + * @return + */ + override def exists(): Boolean = Files.exists(jpath) + + override def mkdirs(): Unit = Files.createDirectories(jpath) + + /** + * Returns true if the file exists as a directory + * + * @return + */ + override def isDirectory(): Boolean = Files.isDirectory(jpath) + + /** + * Returns true if the file exists as a normal file + * + * @return + */ + override def isFile(): Boolean = Files.isRegularFile(jpath) + + /** + * Returns true if the File is an absolute path + * + * @return + */ + override def isAbsolute(): Boolean = jpath.isAbsolute + + /** + * Creates a new File instance with an additional suffix attached. This will not physically create the file + * on the FileSystem, but will return a File which then can be used for creation + * + * @param suffix + * @return + */ + override def withSuffix(suffix: String): File = JavaFile(jpath.getParent.resolve(jpath.getFileName + suffix)) + + override def withName(name: String): File = JavaFile(jpath.getParent.resolve(name)) +} diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/model/Module.scala b/flowman-core/src/main/scala/com/dimajix/flowman/model/Module.scala index d8c3e416b..ded781f77 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/model/Module.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/model/Module.scala @@ -24,8 +24,8 @@ import scala.util.control.NonFatal import org.slf4j.LoggerFactory -import com.dimajix.flowman.hadoop.File -import com.dimajix.flowman.hadoop.GlobPattern +import com.dimajix.flowman.fs.File +import com.dimajix.flowman.fs.GlobPattern import com.dimajix.flowman.spi.ModuleReader @@ -96,7 +96,7 @@ object Module { val patterns = reader.globPatterns.map(GlobPattern(_)) file.list() .par - .filter(f => f.isFile() && patterns.exists(_.matches(f.filename))) + .filter(f => f.isFile() && patterns.exists(_.matches(f.name))) .map(f => loadFile(f)) .foldLeft(Module())((l, r) => l.merge(r)) } @@ -114,7 +114,7 @@ object Module { val patterns = reader.globPatterns.map(GlobPattern(_)) file.list() .par - .filter(f => f.isFile() && patterns.exists(_.matches(f.filename))) + .filter(f => f.isFile() && patterns.exists(_.matches(f.name))) .map(f => f -> loadFile(f)) .seq } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/model/Project.scala b/flowman-core/src/main/scala/com/dimajix/flowman/model/Project.scala index f66d9a385..30c887cd7 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/model/Project.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/model/Project.scala @@ -23,7 +23,7 @@ import scala.util.control.NonFatal import 
org.slf4j.LoggerFactory -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.spi.ProjectReader diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/model/ResourceIdentifier.scala b/flowman-core/src/main/scala/com/dimajix/flowman/model/ResourceIdentifier.scala index 009a30a5e..6d86c3851 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/model/ResourceIdentifier.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/model/ResourceIdentifier.scala @@ -26,7 +26,7 @@ import scala.annotation.tailrec import org.apache.hadoop.fs.Path import com.dimajix.flowman.catalog.TableIdentifier -import com.dimajix.flowman.hadoop.GlobPattern +import com.dimajix.flowman.fs.GlobPattern object ResourceIdentifier { diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/model/velocity.scala b/flowman-core/src/main/scala/com/dimajix/flowman/model/velocity.scala index 5bcc6a129..fdf91b3eb 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/model/velocity.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/model/velocity.scala @@ -18,7 +18,7 @@ package com.dimajix.flowman.model import scala.collection.JavaConverters._ -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.templating.FileWrapper diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/spi/DocumenterReader.scala b/flowman-core/src/main/scala/com/dimajix/flowman/spi/DocumenterReader.scala index 94ff08658..b5f3028b6 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/spi/DocumenterReader.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/spi/DocumenterReader.scala @@ -19,7 +19,7 @@ package com.dimajix.flowman.spi import java.io.IOException import com.dimajix.flowman.documentation.Documenter -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Prototype diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/spi/ModuleReader.scala b/flowman-core/src/main/scala/com/dimajix/flowman/spi/ModuleReader.scala index 93310ebc5..cb747c247 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/spi/ModuleReader.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/spi/ModuleReader.scala @@ -19,7 +19,7 @@ package com.dimajix.flowman.spi import java.io.IOException import java.io.InputStream -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Module diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/spi/ProjectReader.scala b/flowman-core/src/main/scala/com/dimajix/flowman/spi/ProjectReader.scala index 757b22de5..250bf0125 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/spi/ProjectReader.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/spi/ProjectReader.scala @@ -18,7 +18,7 @@ package com.dimajix.flowman.spi import java.io.IOException -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Project diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/storage/Parcel.scala b/flowman-core/src/main/scala/com/dimajix/flowman/storage/Parcel.scala index e97542e49..994839461 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/storage/Parcel.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/storage/Parcel.scala @@ -16,7 +16,7 @@ package com.dimajix.flowman.storage -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import 
com.dimajix.flowman.model.AbstractInstance diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/storage/Workspace.scala b/flowman-core/src/main/scala/com/dimajix/flowman/storage/Workspace.scala index 0ef882a3f..bf744223f 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/storage/Workspace.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/storage/Workspace.scala @@ -16,7 +16,7 @@ package com.dimajix.flowman.storage -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Project diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/templating/wrapper.scala b/flowman-core/src/main/scala/com/dimajix/flowman/templating/wrapper.scala index 8cce88d65..fd416cde0 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/templating/wrapper.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/templating/wrapper.scala @@ -36,7 +36,7 @@ import org.apache.velocity.VelocityContext import org.apache.velocity.app.VelocityEngine import org.slf4j.LoggerFactory -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.templating.FileWrapper.logger import com.dimajix.flowman.util.UtcTimestamp @@ -82,7 +82,7 @@ case class FileWrapper(file:File) { def getParent() : FileWrapper = FileWrapper(file.parent) def getAbsPath() : FileWrapper = FileWrapper(file.absolute) def getPath() : String = Path.getPathWithoutSchemeAndAuthority(file.path).toString - def getFilename() : String = file.filename + def getFilename() : String = file.name def withSuffix(suffix:String) : FileWrapper = FileWrapper(file.withSuffix(suffix)) def withName(name:String) : FileWrapper = FileWrapper(file.withName(name)) } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/types/SchemaWriter.scala b/flowman-core/src/main/scala/com/dimajix/flowman/types/SchemaWriter.scala index f21a98a63..f7f1a3e33 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/types/SchemaWriter.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/types/SchemaWriter.scala @@ -19,7 +19,7 @@ package com.dimajix.flowman.types import java.nio.charset.Charset import java.util.Locale -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File /** diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/util/ObjectMapper.scala b/flowman-core/src/main/scala/com/dimajix/flowman/util/ObjectMapper.scala index 6111bfba4..0b64fc5b9 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/util/ObjectMapper.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/util/ObjectMapper.scala @@ -33,7 +33,7 @@ import com.fasterxml.jackson.databind.{ObjectMapper => JacksonMapper} import com.fasterxml.jackson.dataformat.yaml.YAMLFactory import com.fasterxml.jackson.module.scala.DefaultScalaModule -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File /** diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/execution/ProjectContextTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/execution/ProjectContextTest.scala index 9a2730163..4be22a73a 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/execution/ProjectContextTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/execution/ProjectContextTest.scala @@ -20,7 +20,7 @@ import org.apache.hadoop.conf.Configuration import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import 
com.dimajix.flowman.model.Project diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileCollectorTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileCollectorTest.scala similarity index 99% rename from flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileCollectorTest.scala rename to flowman-core/src/test/scala/com/dimajix/flowman/fs/FileCollectorTest.scala index 085e23e58..0628ebdf2 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileCollectorTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileCollectorTest.scala @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.dimajix.flowman.hadoop +package com.dimajix.flowman.fs import java.time.Month diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala similarity index 70% rename from flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileTest.scala rename to flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index b89c58564..6fed6900a 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/hadoop/FileTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2018 Kaya Kupferschmidt + * Copyright 2018-2022 Kaya Kupferschmidt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,7 +14,7 @@ * limitations under the License. */ -package com.dimajix.flowman.hadoop +package com.dimajix.flowman.fs import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers @@ -22,7 +22,7 @@ import org.scalatest.matchers.should.Matchers import com.dimajix.spark.testing.LocalSparkSession -class FileTest extends AnyFlatSpec with Matchers with LocalSparkSession { +class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { "A local File" should "be useable with simple strings" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) @@ -32,7 +32,7 @@ class FileTest extends AnyFlatSpec with Matchers with LocalSparkSession { tmpFromString.isDirectory() should be (true) } - it should "be useable with Files" in { + it should "be usable with Files" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val tmpFromUri = fs.local(tempDir) @@ -41,7 +41,7 @@ class FileTest extends AnyFlatSpec with Matchers with LocalSparkSession { tmpFromUri.isDirectory() should be (true) } - it should "be useable with URIs" in { + it should "be usable with URIs" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val tmpFromUri = fs.local(tempDir.toURI) @@ -98,4 +98,36 @@ class FileTest extends AnyFlatSpec with Matchers with LocalSparkSession { newName.isFile() should be (false) newName.isDirectory() should be (false) } + + it should "support resources somewhere" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.resource("com/dimajix/flowman/flowman.properties") + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val dir = fs.resource("com/dimajix/flowman") + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + } + + it should "support resources in JARs" in { + val conf = 
spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.resource("org/apache/spark/log4j2-defaults.properties") + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val dir = fs.resource("org/apache/spark") + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + } } diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/HadoopFileTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/HadoopFileTest.scala new file mode 100644 index 000000000..1d799a804 --- /dev/null +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/HadoopFileTest.scala @@ -0,0 +1,86 @@ +/* + * Copyright 2022 Kaya Kupferschmidt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dimajix.flowman.fs + +import org.apache.hadoop.conf.Configuration +import org.apache.hadoop.fs +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers + +import com.dimajix.spark.testing.LocalTempDir + + +class HadoopFileTest extends AnyFlatSpec with Matchers with LocalTempDir { + private val localFs = org.apache.hadoop.fs.FileSystem.getLocal(new Configuration()) + + "The HadoopFile" should "work" in { + val dir = HadoopFile(localFs, new fs.Path(tempDir.toURI)) + dir.path should be (new fs.Path(tempDir.toURI)) + dir.exists() should be (true) + dir.isFile() should be (false) + dir.isDirectory() should be (true) + dir.isAbsolute() should be (true) + + (dir / dir.toString) should be (dir) + + val file = dir / "lala" + file.exists() should be (false) + file.isFile() should be (false) + file.isDirectory() should be (false) + file.isAbsolute() should be(true) + file.parent should be (dir) + file.name should be ("lala") + file.withName("lolo") should be (dir / "lolo") + } + + it should "work at root level" in { + val dir = HadoopFile(localFs, new fs.Path("file:/")) + dir.parent should be (dir) + dir.path should be(new fs.Path("file:/")) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isDirectory() should be(true) + dir.isAbsolute() should be(true) + + val file = dir / "lala" + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + file.isAbsolute() should be(true) + file.parent should be(dir) + file.name should be("lala") + file.withName("lolo") should be(dir / "lolo") + } + + it should "support creating entries" in { + val tmp = HadoopFile(localFs, new fs.Path(tempDir.toURI)) + val file = tmp / ("lala-" + System.currentTimeMillis().toString + ".tmp") + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + + file.create().close() + file.exists() should be(true) + file.isFile() should be(true) + file.isDirectory() should be(false) + + file.delete(false) + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + } +} diff 
--git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala new file mode 100644 index 000000000..618d0dc62 --- /dev/null +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala @@ -0,0 +1,122 @@ +/* + * Copyright 2022 Kaya Kupferschmidt + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.dimajix.flowman.fs + +import java.nio.file.Paths +import java.util.Collections + +import org.scalatest.flatspec.AnyFlatSpec +import org.scalatest.matchers.should.Matchers +import org.apache.hadoop.fs + +import com.dimajix.common.Resources +import com.dimajix.spark.testing.LocalTempDir + + +class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { + "The JavaFile" should "work" in { + val dir = JavaFile(tempDir.toPath) + dir.path should be (new fs.Path(tempDir.toURI)) + dir.exists() should be (true) + dir.isFile() should be (false) + dir.isDirectory() should be (true) + dir.isAbsolute() should be (true) + + (dir / dir.toString) should be (dir) + + val file = dir / "lala" + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + file.isAbsolute() should be (true) + file.parent should be(dir) + file.name should be("lala") + file.withName("lolo") should be(dir / "lolo") + } + + it should "work at root level" in { + val dir = JavaFile(tempDir.toPath.getRoot) + dir.parent should be (dir) + dir.path should be(new fs.Path("file:/")) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isDirectory() should be(true) + dir.isAbsolute() should be(true) + + val file = dir / "lala" + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + file.isAbsolute() should be(true) + file.parent should be(dir) + file.name should be("lala") + file.withName("lolo") should be(dir / "lolo") + } + + it should "support creating entries" in { + val tmp = JavaFile(tempDir.toPath) + val file = tmp / ("lala-" + System.currentTimeMillis().toString + ".tmp") + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + + file.create().close() + file.exists() should be(true) + file.isFile() should be(true) + file.isDirectory() should be(false) + + file.delete(false) + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + } + + it should "support resources somewhere" in { + val res = Resources.getURL("com/dimajix/flowman/flowman.properties") + val file = JavaFile(Paths.get(res.toURI)) + file.exists() should be (true) + file.isFile() should be (true) + file.isAbsolute() should be (true) + file.isDirectory() should be(false) + + val res1 = Resources.getURL("com/dimajix/flowman") + val dir = JavaFile(Paths.get(res1.toURI)) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + } + + it should "support resources in 
JARs" in { + val res = Resources.getURL("org/apache/spark/log4j2-defaults.properties") + val xyz = java.nio.file.FileSystems.newFileSystem(res.toURI, Collections.emptyMap[String,String]()) + val file = JavaFile(Paths.get(res.toURI)) + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val res1 = Resources.getURL("org/apache/spark") + val dir = JavaFile(Paths.get(res1.toURI)) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + + xyz.close() + } +} diff --git a/flowman-dsl/src/main/scala/com/dimajix/flowman/dsl/module.scala b/flowman-dsl/src/main/scala/com/dimajix/flowman/dsl/module.scala index 3ad3f6445..7b674086b 100644 --- a/flowman-dsl/src/main/scala/com/dimajix/flowman/dsl/module.scala +++ b/flowman-dsl/src/main/scala/com/dimajix/flowman/dsl/module.scala @@ -22,7 +22,7 @@ import org.apache.hadoop.{fs => hdfs} import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Environment -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model import com.dimajix.flowman.model.Identifier import com.dimajix.flowman.model.Instance diff --git a/flowman-plugins/delta/src/main/scala/com/dimajix/flowman/spec/relation/DeltaFileRelation.scala b/flowman-plugins/delta/src/main/scala/com/dimajix/flowman/spec/relation/DeltaFileRelation.scala index 546fe14cc..e190e4c21 100644 --- a/flowman-plugins/delta/src/main/scala/com/dimajix/flowman/spec/relation/DeltaFileRelation.scala +++ b/flowman-plugins/delta/src/main/scala/com/dimajix/flowman/spec/relation/DeltaFileRelation.scala @@ -45,7 +45,7 @@ import com.dimajix.flowman.execution.MigrationStrategy import com.dimajix.flowman.execution.Operation import com.dimajix.flowman.execution.OutputMode import com.dimajix.flowman.execution.UnspecifiedSchemaException -import com.dimajix.flowman.hadoop.FileUtils +import com.dimajix.flowman.fs.FileUtils import com.dimajix.flowman.jdbc.HiveDialect import com.dimajix.flowman.model.PartitionField import com.dimajix.flowman.model.PartitionSchema diff --git a/flowman-plugins/sftp/src/main/scala/com/dimajix/flowman/spec/target/SftpUploadTarget.scala b/flowman-plugins/sftp/src/main/scala/com/dimajix/flowman/spec/target/SftpUploadTarget.scala index 46d109359..87ad4663a 100644 --- a/flowman-plugins/sftp/src/main/scala/com/dimajix/flowman/spec/target/SftpUploadTarget.scala +++ b/flowman-plugins/sftp/src/main/scala/com/dimajix/flowman/spec/target/SftpUploadTarget.scala @@ -174,7 +174,7 @@ case class SftpUploadTarget( } } - private def uploadSingleFile(client:SFTPv3Client, src:com.dimajix.flowman.hadoop.File, dst:Path) : Unit = { + private def uploadSingleFile(client:SFTPv3Client, src:com.dimajix.flowman.fs.File, dst:Path) : Unit = { logger.info(s"Uploading file '$src' to sftp remote destination '$dst'") ensureDirectory(client, dst.getParent) tryWith(src.open()) { input => @@ -186,7 +186,7 @@ case class SftpUploadTarget( } } - private def uploadMergedFile(client:SFTPv3Client, src:com.dimajix.flowman.hadoop.File, dst:Path, delimiter:Option[Array[Byte]]) : Unit = { + private def uploadMergedFile(client:SFTPv3Client, src:com.dimajix.flowman.fs.File, dst:Path, delimiter:Option[Array[Byte]]) : Unit = { logger.info(s"Uploading merged directory '$src' to sftp remote destination '$dst'") ensureDirectory(client, dst.getParent) val handle = client.createFile(dst.toString) @@ -204,13 +204,13 @@ case class 
SftpUploadTarget( client.closeFile(handle) } - private def uploadDirectory(client:SFTPv3Client, src:com.dimajix.flowman.hadoop.File, dst:Path) : Unit = { + private def uploadDirectory(client:SFTPv3Client, src:com.dimajix.flowman.fs.File, dst:Path) : Unit = { logger.info(s"Uploading directory '$src' to sftp remote destination '$dst'") ensureDirectory(client, dst) src.list() .filter(_.isFile()) .foreach(file => { - uploadSingleFile(client, file, new Path(dst, file.filename)) + uploadSingleFile(client, file, new Path(dst, file.name)) }) } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlDocumenterReader.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlDocumenterReader.scala index a74e1e108..6bf8fcf76 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlDocumenterReader.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlDocumenterReader.scala @@ -17,7 +17,7 @@ package com.dimajix.flowman.spec import com.dimajix.flowman.documentation.Documenter -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Prototype import com.dimajix.flowman.spec.documentation.DocumenterSpec import com.dimajix.flowman.spi.DocumenterReader diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlModuleReader.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlModuleReader.scala index 75eb2f357..b66d0cffb 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlModuleReader.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlModuleReader.scala @@ -22,7 +22,7 @@ import java.io.InputStream import com.fasterxml.jackson.core.JsonProcessingException import com.fasterxml.jackson.databind.JsonMappingException -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Module import com.dimajix.flowman.spi.ModuleReader diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlProjectReader.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlProjectReader.scala index 3e729f9cc..b144e3022 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlProjectReader.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/YamlProjectReader.scala @@ -21,7 +21,7 @@ import java.io.IOException import com.fasterxml.jackson.core.JsonProcessingException import com.fasterxml.jackson.databind.JsonMappingException -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Project import com.dimajix.flowman.spi.ProjectReader diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/documentation/FileGenerator.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/documentation/FileGenerator.scala index f99694907..ca0b617bc 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/documentation/FileGenerator.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/documentation/FileGenerator.scala @@ -33,7 +33,7 @@ import com.dimajix.flowman.documentation.Generator import com.dimajix.flowman.documentation.ProjectDoc import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Execution -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File object FileGenerator { diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/FileRelation.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/FileRelation.scala index 
681446523..68ebfd34d 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/FileRelation.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/FileRelation.scala @@ -42,8 +42,8 @@ import com.dimajix.flowman.execution.MigrationPolicy import com.dimajix.flowman.execution.MigrationStrategy import com.dimajix.flowman.execution.Operation import com.dimajix.flowman.execution.OutputMode -import com.dimajix.flowman.hadoop.FileCollector -import com.dimajix.flowman.hadoop.FileUtils +import com.dimajix.flowman.fs.FileCollector +import com.dimajix.flowman.fs.FileUtils import com.dimajix.flowman.jdbc.HiveDialect import com.dimajix.flowman.model.BaseRelation import com.dimajix.flowman.model.PartitionField diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/HiveTableRelation.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/HiveTableRelation.scala index 595cdfa1f..cbbea2a77 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/HiveTableRelation.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/HiveTableRelation.scala @@ -60,7 +60,7 @@ import com.dimajix.flowman.execution.MigrationStrategy import com.dimajix.flowman.execution.Operation import com.dimajix.flowman.execution.OutputMode import com.dimajix.flowman.execution.UnspecifiedSchemaException -import com.dimajix.flowman.hadoop.FileUtils +import com.dimajix.flowman.fs.FileUtils import com.dimajix.flowman.jdbc.HiveDialect import com.dimajix.flowman.model.PartitionField import com.dimajix.flowman.model.PartitionSchema diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/HiveUnionTableRelation.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/HiveUnionTableRelation.scala index 60bc07f5e..89c6c43f1 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/HiveUnionTableRelation.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/HiveUnionTableRelation.scala @@ -39,7 +39,7 @@ import com.dimajix.flowman.execution.MigrationStrategy import com.dimajix.flowman.execution.Operation import com.dimajix.flowman.execution.OutputMode import com.dimajix.flowman.execution.UnspecifiedSchemaException -import com.dimajix.flowman.hadoop.FileUtils +import com.dimajix.flowman.fs.FileUtils import com.dimajix.flowman.jdbc.HiveDialect import com.dimajix.flowman.model.BaseRelation import com.dimajix.flowman.model.PartitionField diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/LocalRelation.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/LocalRelation.scala index 595b7a007..bd2ebd3b5 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/LocalRelation.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/relation/LocalRelation.scala @@ -34,7 +34,7 @@ import com.dimajix.flowman.execution.MigrationPolicy import com.dimajix.flowman.execution.MigrationStrategy import com.dimajix.flowman.execution.Operation import com.dimajix.flowman.execution.OutputMode -import com.dimajix.flowman.hadoop.FileCollector +import com.dimajix.flowman.fs.FileCollector import com.dimajix.flowman.model.BaseRelation import com.dimajix.flowman.model.PartitionField import com.dimajix.flowman.model.PartitionSchema diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/FileStore.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/FileStore.scala index 80b948cf4..7c34d5a6f 100644 --- 
a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/FileStore.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/FileStore.scala @@ -22,7 +22,7 @@ import org.slf4j.LoggerFactory import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.NoSuchProjectException -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Project import com.dimajix.flowman.storage.AbstractStore import com.dimajix.flowman.storage.Store @@ -31,7 +31,7 @@ import com.dimajix.flowman.storage.Store case class FileStore(root:File) extends AbstractStore { private val logger = LoggerFactory.getLogger(classOf[FileStore]) - private val globPattern = new Path("*/project.{yml,yaml}") + private val globPattern = "*/project.{yml,yaml}" /** * Loads a project via its name (not its filename or directory) diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/LocalParcel.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/LocalParcel.scala index b6a35aa2d..c95a467ae 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/LocalParcel.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/LocalParcel.scala @@ -28,7 +28,7 @@ import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream import org.apache.commons.io.IOUtils import com.dimajix.common.tryWith -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Project import com.dimajix.flowman.spec.ToSpec import com.dimajix.flowman.storage.AbstractParcel @@ -59,7 +59,7 @@ case class LocalParcel(override val name:String, override val root:File) extends if (!targz.isFile()) throw new IOException(s"Source file '$targz' doesn't exists!") - root.glob(new org.apache.hadoop.fs.Path("*")).foreach(_.delete(true)) + root.glob("*").foreach(_.delete(true)) decompressTarGzipFile(targz, root) } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/LocalWorkspace.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/LocalWorkspace.scala index b84e80399..fa28befc4 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/LocalWorkspace.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/LocalWorkspace.scala @@ -23,7 +23,7 @@ import com.fasterxml.jackson.annotation.JsonProperty import org.apache.hadoop.fs.Path import com.dimajix.flowman.execution.NoSuchProjectException -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Project import com.dimajix.flowman.spec.ObjectMapper import com.dimajix.flowman.spec.ToSpec @@ -36,7 +36,7 @@ object LocalWorkspace { def load(file:File) : LocalWorkspace = new LocalWorkspace(file) def list(root:File) : Seq[Workspace] = { - val globPattern = new Path("*/.flowman-workspace.yaml") + val globPattern = "*/.flowman-workspace.yaml" root.glob(globPattern) .flatMap(file => Try(load(file.parent)).toOption) } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/ParcelSpec.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/ParcelSpec.scala index 2928064aa..a8a7b281d 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/ParcelSpec.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/storage/ParcelSpec.scala @@ -21,7 +21,7 @@ import com.fasterxml.jackson.annotation.JsonSubTypes import com.fasterxml.jackson.annotation.JsonTypeInfo import 
com.dimajix.common.TypeRegistry -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.spec.annotation.ParcelType import com.dimajix.flowman.spi.ClassAnnotationHandler import com.dimajix.flowman.storage.Parcel diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/FileTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/FileTarget.scala index e997b9bd3..b8d823269 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/FileTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/FileTarget.scala @@ -33,7 +33,7 @@ import com.dimajix.flowman.execution.MappingUtils import com.dimajix.flowman.execution.OutputMode import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.VerificationFailedException -import com.dimajix.flowman.hadoop.FileUtils +import com.dimajix.flowman.fs.FileUtils import com.dimajix.flowman.model.BaseTarget import com.dimajix.flowman.model.MappingOutputIdentifier import com.dimajix.flowman.model.ResourceIdentifier diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/ProjectTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/ProjectTest.scala index e5470a698..2d88d3f59 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/ProjectTest.scala +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/ProjectTest.scala @@ -22,7 +22,7 @@ import org.apache.hadoop.fs.Path import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.model.DuplicateEntityException import com.dimajix.flowman.model.Project diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/storage/LocalWorkspaceTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/storage/LocalWorkspaceTest.scala index 733e40698..f5aef65b5 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/storage/LocalWorkspaceTest.scala +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/storage/LocalWorkspaceTest.scala @@ -20,7 +20,7 @@ import org.apache.hadoop.conf.Configuration import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.spark.testing.LocalTempDir diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/target/MergeFilesTargetTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/target/MergeFilesTargetTest.scala index 4813536a3..b56dd25e9 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/target/MergeFilesTargetTest.scala +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/target/MergeFilesTargetTest.scala @@ -43,11 +43,11 @@ class MergeFilesTargetTest extends AnyFlatSpec with Matchers with LocalSparkSess dest.isFile() should be (false) dest.isDirectory() should be (false) - val file1 = (source / "file_1.txt").create() + val file1 = (source / "file_1.txt").create(true) file1.write("This is a test".getBytes(Charset.forName("UTF-8"))) file1.close() - val file2 = (source / "file_2.txt").create() + val file2 = (source / "file_2.txt").create(false) file2.write("The second line".getBytes(Charset.forName("UTF-8"))) file2.close() diff --git a/flowman-studio/src/main/scala/com/dimajix/flowman/studio/rest/workspace/ParcelEndpoint.scala 
b/flowman-studio/src/main/scala/com/dimajix/flowman/studio/rest/workspace/ParcelEndpoint.scala index 4eef7bd08..655cec0d9 100644 --- a/flowman-studio/src/main/scala/com/dimajix/flowman/studio/rest/workspace/ParcelEndpoint.scala +++ b/flowman-studio/src/main/scala/com/dimajix/flowman/studio/rest/workspace/ParcelEndpoint.scala @@ -38,7 +38,7 @@ import javax.ws.rs.PUT import javax.ws.rs.Path import org.apache.hadoop.conf.Configuration -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.spec.storage.LocalParcel import com.dimajix.flowman.storage.Parcel import com.dimajix.flowman.storage.Workspace diff --git a/flowman-studio/src/main/scala/com/dimajix/flowman/studio/service/WorkspaceManager.scala b/flowman-studio/src/main/scala/com/dimajix/flowman/studio/service/WorkspaceManager.scala index 40d58bbed..147563b05 100644 --- a/flowman-studio/src/main/scala/com/dimajix/flowman/studio/service/WorkspaceManager.scala +++ b/flowman-studio/src/main/scala/com/dimajix/flowman/studio/service/WorkspaceManager.scala @@ -20,7 +20,7 @@ import scala.collection.mutable import org.slf4j.LoggerFactory -import com.dimajix.flowman.hadoop.File +import com.dimajix.flowman.fs.File import com.dimajix.flowman.spec.storage.LocalWorkspace import com.dimajix.flowman.storage.Workspace diff --git a/flowman-testing/src/main/scala/com/dimajix/flowman/testing/Runner.scala b/flowman-testing/src/main/scala/com/dimajix/flowman/testing/Runner.scala index 907b6c6f0..69c6d1928 100644 --- a/flowman-testing/src/main/scala/com/dimajix/flowman/testing/Runner.scala +++ b/flowman-testing/src/main/scala/com/dimajix/flowman/testing/Runner.scala @@ -33,7 +33,7 @@ import org.apache.spark.sql.internal.SQLConf import com.dimajix.flowman.common.Logging import com.dimajix.flowman.execution.Phase import com.dimajix.flowman.execution.Session -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.model.Job import com.dimajix.flowman.model.JobIdentifier import com.dimajix.flowman.model.Namespace diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala index bbbdeea71..86133b1b9 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala @@ -26,7 +26,7 @@ import com.dimajix.common.Resources import com.dimajix.flowman.common.ToolConfig import com.dimajix.flowman.config.FlowmanConf import com.dimajix.flowman.execution.Session -import com.dimajix.flowman.hadoop.FileSystem +import com.dimajix.flowman.fs.FileSystem import com.dimajix.flowman.model.Namespace import com.dimajix.flowman.model.Project import com.dimajix.flowman.model.SystemSettings From 6e44680ebff39f42541b329fee69e697401d2da9 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 1 Nov 2022 13:47:22 +0100 Subject: [PATCH 28/52] github-273 More work on supporting projects inside jar files --- .../scala/com/dimajix/common/Resources.scala | 6 +- .../com/dimajix/flowman/fs/FileSystem.scala | 30 +++- .../dimajix/flowman/fs/FileSystemTest.scala | 146 +++++++++++++++++- .../com/dimajix/flowman/tools/Tool.scala | 9 +- 4 files changed, 176 insertions(+), 15 deletions(-) diff --git a/flowman-common/src/main/scala/com/dimajix/common/Resources.scala b/flowman-common/src/main/scala/com/dimajix/common/Resources.scala index 3fbebf7f2..5f2a2f38f 100644 --- 
a/flowman-common/src/main/scala/com/dimajix/common/Resources.scala +++ b/flowman-common/src/main/scala/com/dimajix/common/Resources.scala @@ -23,11 +23,13 @@ import java.util.Properties class Resources object Resources { def getURL(resourceName:String) : URL = { - classOf[Resources].getClassLoader.getResource(resourceName) + val loader = Thread.currentThread.getContextClassLoader + loader.getResource(resourceName) } def loadProperties(resourceName:String) : Properties = { - val url = classOf[Resources].getClassLoader.getResource(resourceName) + val loader = Thread.currentThread.getContextClassLoader + val url = loader.getResource(resourceName) loadProperties(url) } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala index 5856dab9d..35e93ab54 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala @@ -33,19 +33,41 @@ import com.dimajix.common.Resources */ case class FileSystem(conf:Configuration) { def file(path:Path) : File = { - val fs = path.getFileSystem(conf) - HadoopFile(fs, path) + if (path.toUri.getScheme == "jar") { + resource(path.toUri) + } + else { + val fs = path.getFileSystem(conf) + HadoopFile(fs, path) + } + } + def file(path:String) : File = { + val uri = new URI(path) + if (uri.getScheme == "jar") + resource(uri) + else + file(new Path(path)) } - def file(path:String) : File = file(new Path(path)) def file(path:URI) : File = file(new Path(path)) def local(path:Path) : File = local(path.toUri) def local(path:String) : File = JavaFile(Paths.get(path)) def local(path:java.io.File) : File = JavaFile(path.toPath) - def local(path:URI) : File = JavaFile(Paths.get(path)) + def local(path:URI) : File = { + if (path.getScheme == null) { + val uri = new URI("file", path.getUserInfo, path.getHost, path.getPort, path.getPath, path.getQuery, path.getFragment) + JavaFile(Paths.get(uri)) + } + else { + JavaFile(Paths.get(path)) + } + } def resource(path:String) : File = { val uri = Resources.getURL(path).toURI + resource(uri) + } + def resource(uri:URI) : File = { if (uri.getScheme == "jar") { try { java.nio.file.FileSystems.getFileSystem(uri) diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index 6fed6900a..ab00edba0 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -16,14 +16,16 @@ package com.dimajix.flowman.fs +import org.apache.hadoop.fs.Path import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers +import com.dimajix.common.Resources import com.dimajix.spark.testing.LocalSparkSession class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { - "A local File" should "be useable with simple strings" in { + "FileSystem.local" should "be usable with simple strings" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val tmpFromString = fs.local(tempDir.toString) @@ -41,6 +43,15 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { tmpFromUri.isDirectory() should be (true) } + it should "be usable with Paths" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val tmpFromUri = fs.local(new Path(tempDir.toString)) + 
tmpFromUri.exists() should be(true) + tmpFromUri.isFile() should be(false) + tmpFromUri.isDirectory() should be(true) + } + it should "be usable with URIs" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) @@ -99,7 +110,74 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { newName.isDirectory() should be (false) } - it should "support resources somewhere" in { + "FileSystem.file" should "be usable with simple strings" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val tmpFromString = fs.file(tempDir.toString) + tmpFromString.exists() should be(true) + tmpFromString.isFile() should be(false) + tmpFromString.isDirectory() should be(true) + } + + it should "be usable with URIs" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val tmpFromUri = fs.file(tempDir.toURI) + tmpFromUri.exists() should be(true) + tmpFromUri.isFile() should be(false) + tmpFromUri.isDirectory() should be(true) + } + + it should "support creating entries" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val tmp = fs.file(tempDir.toURI) + val file = tmp / ("lala-" + System.currentTimeMillis().toString + ".tmp") + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + + file.create().close() + file.exists() should be(true) + file.isFile() should be(true) + file.isDirectory() should be(false) + + file.delete(false) + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + } + + it should "support renaming entries" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val tmp = fs.file(tempDir.toURI) + val file = tmp / ("lala-" + System.currentTimeMillis().toString + ".tmp") + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + + file.create().close() + file.exists() should be(true) + file.isFile() should be(true) + file.isDirectory() should be(false) + + val newName = file.withName("lolo-" + System.currentTimeMillis().toString + ".tmp") + file.rename(newName.path) + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + newName.exists() should be(true) + newName.isFile() should be(true) + newName.isDirectory() should be(false) + + newName.delete(false) + newName.exists() should be(false) + newName.isFile() should be(false) + newName.isDirectory() should be(false) + } + + "FileSystem.resource" should "support resources somewhere" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val file = fs.resource("com/dimajix/flowman/flowman.properties") @@ -130,4 +208,68 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { dir.isAbsolute() should be(true) dir.isDirectory() should be(true) } + + "FileSystem.file" should "support resources somewhere via 'file(URI)'" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI) + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val dir = fs.file(Resources.getURL("com/dimajix/flowman").toURI) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + } + 
+ it should "support resources somewhere via 'file(String)'" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(Resources.getURL("com/dimajix/flowman/flowman.properties").toString) + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val dir = fs.file(Resources.getURL("com/dimajix/flowman").toString) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + } + + it should "support resources in JARs via 'file(URI)'" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(Resources.getURL("org/apache/spark/log4j2-defaults.properties").toURI) + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val dir = fs.file(Resources.getURL("org/apache/spark").toURI) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + } + + it should "support resources in JARs via 'file(String)'" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(Resources.getURL("org/apache/spark/log4j2-defaults.properties").toString) + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val dir = fs.file(Resources.getURL("org/apache/spark").toString) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + } } diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala index 86133b1b9..d7ea6a0c3 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala @@ -53,7 +53,7 @@ class Tool { .filter(_.isFile) .map(file => SystemSettings.read.file(file)) .orElse( - Option(getResource("META-INF/flowman/conf/system.yml")) + Option(Resources.getURL("META-INF/flowman/conf/system.yml")) .map(SystemSettings.read.url) ) .getOrElse(SystemSettings.read.default()) @@ -69,7 +69,7 @@ class Tool { .filter(_.isFile) .map(file => Namespace.read.file(file)) .orElse( - Option(getResource("META-INF/flowman/conf/default-namespace.yml")) + Option(Resources.getURL("META-INF/flowman/conf/default-namespace.yml")) .map(Namespace.read.url) ) .getOrElse(Namespace.read.default()) @@ -79,11 +79,6 @@ class Tool { ns } - private def getResource(name:String) : URL = { - val loader = Thread.currentThread.getContextClassLoader - loader.getResource(name) - } - def loadProject(projectPath:Path) : Project = { // Create Hadoop FileSystem instance val hadoopConfig = new Configuration() From 3871593f75d02b76912be33735203573037223bb Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 1 Nov 2022 13:59:13 +0100 Subject: [PATCH 29/52] Fix build for Scala 2.11 --- .../src/main/scala/com/dimajix/flowman/fs/JavaFile.scala | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala index 391149f7f..315912210 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala +++ 
b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala @@ -24,6 +24,7 @@ import java.nio.file.Path import java.nio.file.Paths import java.nio.file.StandardOpenOption import java.util.Comparator +import java.util.function.Consumer import java.util.stream.Collectors import scala.collection.JavaConverters._ @@ -151,7 +152,9 @@ case class JavaFile(jpath:Path) extends File { if (recursive) { Files.walk(jpath) .sorted(Comparator.reverseOrder[Path]()) - .forEach(p => p.toFile.delete()) + .forEach(new Consumer[Path] { + override def accept(t: Path): Unit = t.toFile.delete() + }) } else { Files.delete(jpath) From 8c1020309b77808d842af3dd6342f31a7907367e Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 1 Nov 2022 17:55:32 +0100 Subject: [PATCH 30/52] Fix build for Spark < 3.3 --- .../main/scala/com/dimajix/flowman/fs/FileSystem.scala | 6 +++++- .../scala/com/dimajix/flowman/fs/FileSystemTest.scala | 10 ++++++++-- .../scala/com/dimajix/flowman/fs/JavaFileTest.scala | 4 ++-- 3 files changed, 15 insertions(+), 5 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala index 35e93ab54..50ce98a8a 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala @@ -18,6 +18,7 @@ package com.dimajix.flowman.fs import java.net.URI import java.nio.file.FileSystemNotFoundException +import java.nio.file.NoSuchFileException import java.nio.file.Paths import java.util.Collections @@ -64,7 +65,10 @@ case class FileSystem(conf:Configuration) { } def resource(path:String) : File = { - val uri = Resources.getURL(path).toURI + val url = Resources.getURL(path) + if (url == null) + throw new NoSuchFileException(s"Resource '$path' not found") + val uri = url.toURI resource(uri) } def resource(uri:URI) : File = { diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index ab00edba0..a186587e8 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -16,6 +16,8 @@ package com.dimajix.flowman.fs +import java.nio.file.NoSuchFileException + import org.apache.hadoop.fs.Path import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers @@ -191,6 +193,8 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { dir.isFile() should be(false) dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + + a[NoSuchFileException] should be thrownBy (fs.resource("com/dimajix/flowman/no-such-file")) } it should "support resources in JARs" in { @@ -207,6 +211,8 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { dir.isFile() should be(false) dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + + a[NoSuchFileException] should be thrownBy (fs.resource("org/apache/spark/no-such-file")) } "FileSystem.file" should "support resources somewhere via 'file(URI)'" in { @@ -244,7 +250,7 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { it should "support resources in JARs via 'file(URI)'" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) - val file = fs.file(Resources.getURL("org/apache/spark/log4j2-defaults.properties").toURI) + val file = 
fs.file(Resources.getURL("org/apache/spark/SparkContext.class").toURI) file.exists() should be(true) file.isFile() should be(true) file.isAbsolute() should be(true) @@ -260,7 +266,7 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { it should "support resources in JARs via 'file(String)'" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) - val file = fs.file(Resources.getURL("org/apache/spark/log4j2-defaults.properties").toString) + val file = fs.file(Resources.getURL("org/apache/spark/SparkContext.class").toString) file.exists() should be(true) file.isFile() should be(true) file.isAbsolute() should be(true) diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala index 618d0dc62..a8db5f923 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala @@ -19,9 +19,9 @@ package com.dimajix.flowman.fs import java.nio.file.Paths import java.util.Collections +import org.apache.hadoop.fs import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers -import org.apache.hadoop.fs import com.dimajix.common.Resources import com.dimajix.spark.testing.LocalTempDir @@ -102,7 +102,7 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { } it should "support resources in JARs" in { - val res = Resources.getURL("org/apache/spark/log4j2-defaults.properties") + val res = Resources.getURL("org/apache/spark/SparkContext.class") val xyz = java.nio.file.FileSystems.newFileSystem(res.toURI, Collections.emptyMap[String,String]()) val file = JavaFile(Paths.get(res.toURI)) file.exists() should be(true) From 5af0b0718058dcd52e4c7fc50a11343d67b82f0f Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 1 Nov 2022 19:01:49 +0100 Subject: [PATCH 31/52] Fix for Spark < 3.3 --- .../src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index a186587e8..5136e4f29 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -200,7 +200,7 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { it should "support resources in JARs" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) - val file = fs.resource("org/apache/spark/log4j2-defaults.properties") + val file = fs.resource("org/apache/spark/SparkContext.class") file.exists() should be(true) file.isFile() should be(true) file.isAbsolute() should be(true) From 7dd3eef6c52c7ac1321d4228bdd181864c45f36f Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 1 Nov 2022 19:53:17 +0100 Subject: [PATCH 32/52] github-272 Create build profile for AWS EMR 6.8.0 --- .gitlab-ci.yml | 9 +++ CHANGELOG.md | 1 + build-release.sh | 2 + docs/releases.md | 1 + flowman-plugins/aws/pom.xml | 29 +++++++ flowman-plugins/delta/pom.xml | 6 ++ flowman-plugins/mssqlserver/pom.xml | 8 ++ .../spark/testing/LocalSparkSession.scala | 1 - pom.xml | 75 +++++++++++++++++++ 9 files changed, 131 insertions(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index b41528822..dfd8ede40 100644 --- a/.gitlab-ci.yml +++ 
b/.gitlab-ci.yml @@ -181,3 +181,12 @@ build-cdp7.1: paths: - flowman-dist/target/flowman-dist-*-bin.tar.gz expire_in: 5 days + +build-emr6.8: + stage: build + script: 'mvn ${MAVEN_CLI_OPTS} clean package -PEMR-6.8 -Ddockerfile.skip' + artifacts: + name: "flowman-dist-emr6.8" + paths: + - flowman-dist/target/flowman-dist-*-bin.tar.gz + expire_in: 5 days diff --git a/CHANGELOG.md b/CHANGELOG.md index 9956b3329..bd88f0997 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,6 +10,7 @@ * github-269: Implement new 'iterativeSql' mapping * github-270: Upgrade Spark to 3.3.1 * github-271: Upgrade Delta to 2.1.1 +* github-272: Create build profile for AWS EMR 6.8.0 # Version 0.28.0 - 2022-10-07 diff --git a/build-release.sh b/build-release.sh index e5fb69f9c..9937d1940 100755 --- a/build-release.sh +++ b/build-release.sh @@ -44,6 +44,8 @@ build_profile -Phadoop-3.3 -Pspark-3.2 -Dhadoop.version=3.3.1 build_profile -Phadoop-2.7 -Pspark-3.3 build_profile -Phadoop-3.3 -Pspark-3.3 -Dhadoop.version=3.3.2 +build_profile -PEMR-6.8 + export JAVA_HOME=/usr/lib/jvm/java-1.8.0 build_profile -PCDH-6.3 build_profile -PCDP-7.1 diff --git a/docs/releases.md b/docs/releases.md index 0dbf8578e..938b713ff 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -26,6 +26,7 @@ changes over time. * github-269: Create 'iterativeSql' mapping * github-270: Upgrade Spark to 3.3.1 * github-271: Upgrade Delta to 2.1.1 +* github-272: Create build profile for AWS EMR 6.8.0 ### Version 0.28.0 - 2022-10-07 diff --git a/flowman-plugins/aws/pom.xml b/flowman-plugins/aws/pom.xml index e98638ef6..481eab374 100644 --- a/flowman-plugins/aws/pom.xml +++ b/flowman-plugins/aws/pom.xml @@ -229,6 +229,35 @@ + + + EMR-6.8 + + 1.12.170 + + + + com.amazonaws + aws-java-sdk-core + ${aws.version} + + + com.amazonaws + aws-java-sdk-dynamodb + ${aws.version} + + + com.amazonaws + aws-java-sdk-kms + ${aws.version} + + + com.amazonaws + aws-java-sdk-s3 + ${aws.version} + + + diff --git a/flowman-plugins/delta/pom.xml b/flowman-plugins/delta/pom.xml index 400cd3de9..12ba400cb 100644 --- a/flowman-plugins/delta/pom.xml +++ b/flowman-plugins/delta/pom.xml @@ -64,6 +64,12 @@ + + EMR-6.8 + + 2.1.1 + + spark-2.4 diff --git a/flowman-plugins/mssqlserver/pom.xml b/flowman-plugins/mssqlserver/pom.xml index 581d374ef..6ea85ec22 100644 --- a/flowman-plugins/mssqlserver/pom.xml +++ b/flowman-plugins/mssqlserver/pom.xml @@ -52,6 +52,14 @@ + + EMR-6.8 + + com.solytic + 1.4.0 + _${scala.api_version} + + spark-2.4 diff --git a/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala b/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala index 5d97752f9..f7ca7cd71 100644 --- a/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala +++ b/flowman-spark-testing/src/main/scala/com/dimajix/spark/testing/LocalSparkSession.scala @@ -158,7 +158,6 @@ trait LocalSparkSession extends LocalTempDir { this:Suite => protected def setupLogging(url: URL): Unit = { val log4j = System.getProperty("log4j.configuration") if (log4j == null || log4j.isEmpty) { - val loader = Thread.currentThread.getContextClassLoader PropertyConfigurator.configure(url) } } diff --git a/pom.xml b/pom.xml index fc6f44bc8..b73310694 100644 --- a/pom.xml +++ b/pom.xml @@ -164,6 +164,12 @@ cloudera + + true + + + false + https://repository.cloudera.com/artifactory/cloudera-repos/ @@ -230,6 +236,12 @@ cloudera + + true + + + false + https://repository.cloudera.com/artifactory/cloudera-repos/ @@ -282,6 +294,69 @@ 1.6 
+ + EMR-6.8 + + + emr-6.8.0 + + true + + + false + + https://s3.eu-central-1.amazonaws.com/eu-central-1-emr-artifacts/emr-6.8.0/repos/maven/ + + + + emr6.8 + 6.8.0 + 11 + 2.12.15 + 2.12 + 3.2.9 + 3.2 + 1.2.0 + 1.1.2 + 3.3.0 + 3.3 + 3.2.1-amzn-8 + 3.2 + 2.3.9 + 2.7.2 + 1.1.8.4 + 4.1.74.Final + 4.8 + 1.30 + 2.13.3 + 2.13 + 2.13.3 + 2.8 + 2.8.0 + 10.14.2.0 + 3.6.2 + 1.11.0 + 3.7.0-M11 + 14.0.1 + 2.17.2 + 1.7.32 + 4.5.9 + 4.4.11 + 4.2.0 + 2.10.13 + 2.13.0 + 2.13.0 + 1.5.0 + 1.15 + 3.2.2 + 3.0.16 + 1.21 + 2.11.0 + 3.12.0 + 3.6.1 + 1.9 + + From 0206d46961733a2dde9b0b18a5ceaaae66864d5b Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 2 Nov 2022 08:21:18 +0100 Subject: [PATCH 33/52] Add documentation for performance tweaking --- docs/cookbook/performance.md | 55 ++++++++++++++++++++++++++++++++++++ docs/setup/config.md | 2 +- 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 docs/cookbook/performance.md diff --git a/docs/cookbook/performance.md b/docs/cookbook/performance.md new file mode 100644 index 000000000..0f94c9a78 --- /dev/null +++ b/docs/cookbook/performance.md @@ -0,0 +1,55 @@ +# Performance Tuning + +Processing performance always is an important topic for data transformation, and so is the case with Flowman. In order +to improve overall performance, there are different configurations, some of them being well known configuration +parameters for Apache Spark, while others are specific to Flowman. + + +## Spark Parameters + +Since Flowman is based on Apache Spark, you can apply all the performance tuning strategies that apply to Spark. +You can specify almost all settings either in the [`default-namespace.yml` file](../setup/config.md) or in any other +project file in a `config` section. The most important settings probably are as follows: + +```yaml +config: + # Use 8 CPU cores per Spark executor + - spark.executor.cores=8 + # Allocate 54 GB RAM per Spark executor + - spark.executor.memory=54g + # Only keep up to 200 jobs in the Spark web UI + - spark.ui.retainedJobs=200 + # Use 400 partitions in shuffle operations + - spark.sql.shuffle.partitions=400 + # Number of executors to allocate + - spark.executor.instances=2 + # Memory overhead as safety margin + - spark.executor.memoryOverhead=1G +``` + +Often it is a good idea to make these properties easily configurable via system environment variables as follows: +```yaml +config: + - spark.executor.cores=$System.getenv('SPARK_EXECUTOR_CORES', '8') + - spark.executor.memory=$System.getenv('SPARK_EXECUTOR_MEMORY', '54g') + - spark.ui.retainedJobs=$System.getenv('RETAINED_JOBS', 200) + - spark.sql.shuffle.partitions=$System.getenv('SPARK_PARTITIONS', 400) +``` + + +## Flowman Parameters + +In addition to classical Spark tuning parameters, Flowman also offers some advanced functionality which may help to +cut down processing overhead cost by parallelizing target execution and mapping instantiation. This will not speed +up the processing itself, but it will help to hide some expensive Spark planning costs, which may involve querying +the Hive metastore or remote file systems, which are known to be slow. 
+ +```yaml +config: + # Enable building multiple targets in parallel + - flowman.execution.executor.class=com.dimajix.flowman.execution.ParallelExecutor + # Build up to 4 targets in parallel + - flowman.execution.executor.parallelism=4 + # Instantiate up to 16 mappings in parallel + - flowman.execution.mapping.parallelism=16 +``` diff --git a/docs/setup/config.md b/docs/setup/config.md index 89d7c9a72..caaddc72c 100644 --- a/docs/setup/config.md +++ b/docs/setup/config.md @@ -51,7 +51,7 @@ the existence of targets and/or the history to decide if a rebuild is required. - `flowman.execution.executor.class` *(type: class)* *(default: `com.dimajix.flowman.execution.SimpleExecutor`)* (since Flowman 0.16.0) Configure the executor to use. The default `SimpleExecutor` will process all targets in the correct order sequentially. The alternative implementation `com.dimajix.flowman.execution.ParallelExecutor` will run multiple - targets in parallel (if they are not depending on each other) + targets in parallel (if they do not depend on each other) - `flowman.execution.executor.parallelism` *(type: int)* *(default: 4)* (since Flowman 0.16.0) The number of targets to be executed in parallel, when the `ParallelExecutor` is used. From 8051df9c8131b87fe8c40795216a9034af940a8d Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 2 Nov 2022 08:23:15 +0100 Subject: [PATCH 34/52] Improve documentation --- docs/cookbook/syntax-highlighting.md | 36 ++++++++++++++-------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/cookbook/syntax-highlighting.md b/docs/cookbook/syntax-highlighting.md index 44468d945..642bcddd0 100644 --- a/docs/cookbook/syntax-highlighting.md +++ b/docs/cookbook/syntax-highlighting.md @@ -4,24 +4,6 @@ In order to support the development of Flowman projects, Flowman provides the ca which can be used by editors to perform syntax validation and auto complete. You will find a set of pre-generated files in the `yaml-schema` directory, which contain syntax information for all core entities and all plugins. -Since you might not use all plugins (or have your own plugins), Flowman also offers a small utility to generate -the YAML schema files yourself. Using the provided schema generator will ensure that the schema perfectly matches -to your setup with the right plugins. The schema files can be created with the [`flowman-schema`](../cli/schema.md) -command, such that the schema files will include all entities from any plugin loaded via the -[`default-namespace`](../spec/namespace.md). - - -## Creating YAML schemas - -```shell -flowman-schema -o my-schema-directory -``` - -This command will create multiple different YAML schema files: -* `module.json` - This is the YAML schema for all modules, i.e. defining relations, mapping, etc. -* `project.json` - This YAML schema file contains all entities of the [`project.yml`](../spec/project.md) file. -* `namespace.sjon` - This YAML schema file contains all entities of [namespace definitions](../spec/namespace.md). -* `documentation.sjon` - This YAML schema file contains all entities of [`documentation.yml`](../documenting/config.md). ## Supported Editors @@ -64,3 +46,21 @@ In order to benefit from a really excellent autocompletion in Visual Studio Code } } ``` + +## Creating YAML schemas + +Since you might not use all plugins (or have your own plugins), Flowman also offers a small utility to generate +the YAML schema files yourself. 
Using the provided schema generator will ensure that the schema perfectly matches
+to your setup with the right plugins. The schema files can be created with the [`flowman-schema`](../cli/schema.md)
+command, such that the schema files will include all entities from any plugin loaded via the
+[`default-namespace`](../spec/namespace.md).
+
+```shell
+flowman-schema -o my-schema-directory
+```
+
+This command will create multiple different YAML schema files:
+* `module.json` - This is the YAML schema for all modules, i.e. defining relations, mappings, etc.
+* `project.json` - This YAML schema file contains all entities of the [`project.yml`](../spec/project.md) file.
+* `namespace.json` - This YAML schema file contains all entities of [namespace definitions](../spec/namespace.md).
+* `documentation.json` - This YAML schema file contains all entities of [`documentation.yml`](../documenting/config.md).

From 41e75693941c0225e547eed2b4e563033f318451 Mon Sep 17 00:00:00 2001
From: Kaya Kupferschmidt
Date: Wed, 2 Nov 2022 08:32:50 +0100
Subject: [PATCH 35/52] Improve documentation

---
 docs/cookbook/advanced-jdbc.md                      | 2 +-
 docs/cookbook/data-quality.md                       | 2 +-
 docs/cookbook/{validation.md => data-validation.md} | 2 +-
 docs/cookbook/{metrics.md => execution-metrics.md}  | 0
 docs/cookbook/hadoop-dependencies.md                | 2 +-
 docs/cookbook/impala.md                             | 2 +-
 docs/cookbook/kerberos.md                           | 2 +-
 docs/cookbook/override-jars.md                      | 2 +-
 docs/cookbook/target-ordering.md                    | 3 ++-
 docs/index.md                                       | 2 +-
 docs/spec/measure/index.md                          | 2 +-
 docs/spec/metric/console.md                         | 2 +-
 docs/spec/target/file.md                            | 2 +-
 docs/spec/target/local.md                           | 2 +-
 docs/spec/target/merge.md                           | 2 +-
 docs/spec/target/relation.md                        | 2 +-
 16 files changed, 16 insertions(+), 15 deletions(-)
 rename docs/cookbook/{validation.md => data-validation.md} (98%)
 rename docs/cookbook/{metrics.md => execution-metrics.md} (100%)

diff --git a/docs/cookbook/advanced-jdbc.md b/docs/cookbook/advanced-jdbc.md
index 7a745506d..f7598824a 100644
--- a/docs/cookbook/advanced-jdbc.md
+++ b/docs/cookbook/advanced-jdbc.md
@@ -1,4 +1,4 @@
-# Using advanced Features of JDBC Databases
+# Advanced JDBC Database Features
 
 Flowman already provides very robust support for dealing with relational databases, both as data sources and as data sinks.
 But when writing into a relational database, you eventually might find yourself in a situation where Flowman does
diff --git a/docs/cookbook/data-quality.md b/docs/cookbook/data-quality.md
index 367442f91..f4b8ceb45 100644
--- a/docs/cookbook/data-quality.md
+++ b/docs/cookbook/data-quality.md
@@ -46,7 +46,7 @@ generated with an independent command with [`flowexec`](../cli/flowexec/index.md
 ## Data Quality Metrics
 In addition to the `validate` and `verify` targets, Flowman also offers a special [measure target](../spec/target/measure.md).
 This target provides some means to collect some important metrics from data and provide the results as metrics. These
-in turn can be [published to Prometheus](metrics.md) or other metric collectors.
+in turn can be [published to Prometheus](execution-metrics.md) or other metric collectors.
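As an illustration of the wiring described above, the metric sinks that receive these measures are declared in the namespace. The following is a minimal sketch, not part of this patch: it assumes the `console` sink from `docs/spec/metric/console.md` and a `prometheus` sink with a `url` property; the push gateway URL is a placeholder.

```yaml
metrics:
  # Log all collected metrics together with the other console output
  - kind: console
  # Additionally push metrics to a Prometheus push gateway (placeholder URL)
  - kind: prometheus
    url: $System.getenv('URL_PROMETHEUS_PUSHGW', 'http://localhost:9091')
```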
### Example
diff --git a/docs/cookbook/validation.md b/docs/cookbook/data-validation.md
similarity index 98%
rename from docs/cookbook/validation.md
rename to docs/cookbook/data-validation.md
index 170aebbeb..f644e2afb 100644
--- a/docs/cookbook/validation.md
+++ b/docs/cookbook/data-validation.md
@@ -1,4 +1,4 @@
-# Pre-build Validations
+# Data Validation before Build
 
 In many cases, you'd like to perform some validation of input data before you start processing. For example, when
 joining data, you often assume some uniqueness constraint on the join key in some tables or mappings. If that
diff --git a/docs/cookbook/metrics.md b/docs/cookbook/execution-metrics.md
similarity index 100%
rename from docs/cookbook/metrics.md
rename to docs/cookbook/execution-metrics.md
diff --git a/docs/cookbook/hadoop-dependencies.md b/docs/cookbook/hadoop-dependencies.md
index bddfa8d91..dd171668f 100644
--- a/docs/cookbook/hadoop-dependencies.md
+++ b/docs/cookbook/hadoop-dependencies.md
@@ -1,4 +1,4 @@
-# Installing additional Hadoop Dependencies
+# Hadoop Dependencies Installation
 
 Starting with version 3.2, Spark has reduced the number of Hadoop libraries which are part of the downloadable Spark
 distribution. Unfortunately, some of the libraries which have been removed are required by some Flowman plugins (for
diff --git a/docs/cookbook/impala.md b/docs/cookbook/impala.md
index a2f710ee5..b4a6cdd7c 100644
--- a/docs/cookbook/impala.md
+++ b/docs/cookbook/impala.md
@@ -1,4 +1,4 @@
-# Updating Impala Metadata
+# Impala Metadata
 
 Impala is another "SQL on Hadoop" execution engine, mainly developed and backed by Cloudera. Impala allows you to
 access data stored in Hadoop and registered in the Hive metastore, just like Hive itself, but often at a significantly
diff --git a/docs/cookbook/kerberos.md b/docs/cookbook/kerberos.md
index 8df93045e..ca651318d 100644
--- a/docs/cookbook/kerberos.md
+++ b/docs/cookbook/kerberos.md
@@ -1,4 +1,4 @@
-# Using Kerberos Authentication
+# Kerberos Authentication
 
 Of course, you can also run Flowman in a Kerberos environment, as long as the components you use actually support
 Kerberos. This includes Spark, Hadoop and Kafka.
diff --git a/docs/cookbook/override-jars.md b/docs/cookbook/override-jars.md
index 72adaee5b..1dc46e94e 100644
--- a/docs/cookbook/override-jars.md
+++ b/docs/cookbook/override-jars.md
@@ -1,4 +1,4 @@
-# Force Spark to specific jar version
+# Override jar versions
 
 A common problem with Spark, and specifically with many Hadoop environments (like Cloudera), is a mismatch between
 application jar versions and the jars provided by the runtime environment. Flowman is built with carefully set dependency
diff --git a/docs/cookbook/target-ordering.md b/docs/cookbook/target-ordering.md
index 905ca1650..629803b4d 100644
--- a/docs/cookbook/target-ordering.md
+++ b/docs/cookbook/target-ordering.md
@@ -1,10 +1,11 @@
-# Manual Target Execution Order
+# Target Execution Ordering
 
 When executing a [job](../spec/job/index.md), Flowman normally figures out the correct execution order of all
 [targets](../spec/target/index.md) automatically. This is implemented by looking at the different targets' inputs
 and outputs, such that Flowman ensures that all the inputs of a target are built before the target itself is
 executed.
 
+
 ## Cyclic Dependencies
 But sometimes, this does not give you the desired result, or Flowman might even detect a cyclic dependency between
 your targets.
 Although this might indicate an issue at the conceptual level, there are completely valid use cases
diff --git a/docs/index.md b/docs/index.md
index 082a8c412..baedb18fa 100644
--- a/docs/index.md
+++ b/docs/index.md
@@ -41,7 +41,7 @@ central place in your value chain for data preparations for the next steps.
 * Powerful yet simple [command line tool for batch execution](cli/flowexec/index.md)
 * Powerful [Command line tool for interactive data flow analysis](cli/flowshell/index.md)
 * [History server](cli/history-server.md) that provides an overview of past jobs and targets including lineage
-* [Metric system](cookbook/metrics.md) with the ability to publish these to servers like Prometheus
+* [Metric system](cookbook/execution-metrics.md) with the ability to publish these to servers like Prometheus
 * Extendable via Plugins
diff --git a/docs/spec/measure/index.md b/docs/spec/measure/index.md
index 736963879..88bffb29b 100644
--- a/docs/spec/measure/index.md
+++ b/docs/spec/measure/index.md
@@ -1,7 +1,7 @@
 # Measures
 
 Flowman provides capabilities to assess data quality by taking *measures* from mappings and provide the result as
-[metrics](../../cookbook/metrics.md). This enables developer to build data quality dashboards using well known tools
+[metrics](../../cookbook/execution-metrics.md). This enables developers to build data quality dashboards using well known tools
 like Prometheus and Grafana.
diff --git a/docs/spec/metric/console.md b/docs/spec/metric/console.md
index 989fef74a..5d0df35a6 100644
--- a/docs/spec/metric/console.md
+++ b/docs/spec/metric/console.md
@@ -1,6 +1,6 @@
 # Console Metric Sink
 
-The `console` metric sink is the simplest possible way to publish [execution metrics](../../cookbook/metrics.md) and
+The `console` metric sink is the simplest possible way to publish [execution metrics](../../cookbook/execution-metrics.md) and
 [data quality measures](../../cookbook/data-quality.md) as simple logging output on the console.
 Even if you publish your metrics to a metric collector like [Prometheus](prometheus.md), it is a good idea to also add
 the `console` metric sink, so you can see all metrics together with other log output
diff --git a/docs/spec/target/file.md b/docs/spec/target/file.md
index b15acaeeb..708c2f900 100644
--- a/docs/spec/target/file.md
+++ b/docs/spec/target/file.md
@@ -76,4 +76,4 @@ The relation target also provides some metric containing the number of records w
 - `project` - Name of the project
 - `version` - Version of the project
 
-See [Execution Metrics](../../cookbook/metrics.md) for more information how to use these metrics.
+See [Execution Metrics](../../cookbook/execution-metrics.md) for more information on how to use these metrics.
diff --git a/docs/spec/target/local.md b/docs/spec/target/local.md
index 151e795ca..654875c88 100644
--- a/docs/spec/target/local.md
+++ b/docs/spec/target/local.md
@@ -52,4 +52,4 @@ The relation target also provides some metric containing the number of records w
 - `project` - Name of the project
 - `version` - Version of the project
 
-See [Execution Metrics](../../cookbook/metrics.md) for more information how to use these metrics.
+See [Execution Metrics](../../cookbook/execution-metrics.md) for more information on how to use these metrics.
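To tie the measure and metric documentation above to a concrete spec, here is a minimal sketch of a `measure` target, not part of this patch: the target name, measure name, mapping `some_mapping`, and the SQL are placeholders, following the pattern described in `docs/spec/target/measure.md`. Each column of the query result should surface as one observable metric.

```yaml
targets:
  data_quality:
    kind: measure
    measures:
      record_stats:
        kind: sql
        query: |
          SELECT
            COUNT(*) AS record_count,
            SUM(CASE WHEN customer_id IS NULL THEN 1 ELSE 0 END) AS missing_customer_ids
          FROM some_mapping
```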
diff --git a/docs/spec/target/merge.md b/docs/spec/target/merge.md index 31ebbb738..80d15a348 100644 --- a/docs/spec/target/merge.md +++ b/docs/spec/target/merge.md @@ -125,4 +125,4 @@ The relation target also provides some metric containing the number of records w - `project` - Name of the project - `version` - Version of the project -See [Execution Metrics](../../cookbook/metrics.md) for more information how to use these metrics. +See [Execution Metrics](../../cookbook/execution-metrics.md) for more information how to use these metrics. diff --git a/docs/spec/target/relation.md b/docs/spec/target/relation.md index 5a2d52cbf..49a99e031 100644 --- a/docs/spec/target/relation.md +++ b/docs/spec/target/relation.md @@ -137,4 +137,4 @@ The relation target also provides some metric containing the number of records w - `project` - Name of the project - `version` - Version of the project -See [Execution Metrics](../../cookbook/metrics.md) for more information how to use these metrics. +See [Execution Metrics](../../cookbook/execution-metrics.md) for more information how to use these metrics. From 280d54767f0c169987f6707c2eb4f852ce1d035c Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 2 Nov 2022 16:46:51 +0100 Subject: [PATCH 36/52] Minor code improvements --- .../flowman/execution/CachingExecution.scala | 18 ++--- .../spec/template/MappingTemplateTest.scala | 75 +++++++++++++++++++ 2 files changed, 84 insertions(+), 9 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala index 78c5ab5a1..d7a42217b 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala @@ -376,21 +376,21 @@ abstract class CachingExecution(parent:Option[Execution], isolated:Boolean) exte else instances + // Optionally cache the DataFrames, before potentially marking them as broadcast candidates + if (cacheLevel != null && cacheLevel != StorageLevel.NONE) { + // If one of the DataFrame is called 'cache', then only cache that one, otherwise all will be cached + if (df1.keySet.contains("cache")) + df1("cache").persist(cacheLevel) + else + df1.values.foreach(_.persist(cacheLevel)) + } + // Optionally mark DataFrame to be broadcasted val df2 = if (doBroadcast) df1.map { case (name,df) => (name, broadcast(df)) } else df1 - // Optionally cache the DataFrames - if (cacheLevel != null && cacheLevel != StorageLevel.NONE) { - // If one of the DataFrame is called 'cache', then only cache that one, otherwise all will be cached - if (df2.keySet.contains("cache")) - df2("cache").persist(cacheLevel) - else - df2.values.foreach(_.persist(cacheLevel)) - } - df2 } } diff --git a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/template/MappingTemplateTest.scala b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/template/MappingTemplateTest.scala index 1a75b5c8e..3ddc62710 100644 --- a/flowman-spec/src/test/scala/com/dimajix/flowman/spec/template/MappingTemplateTest.scala +++ b/flowman-spec/src/test/scala/com/dimajix/flowman/spec/template/MappingTemplateTest.scala @@ -116,6 +116,81 @@ class MappingTemplateTest extends AnyFlatSpec with Matchers { an[InstantiateMappingFailedException] should be thrownBy(context.getMapping(MappingIdentifier("rel_4"))) } + it should "respect broadcast & cache" in { + val spec = + """ + |templates: + | user: + | kind: mapping + | parameters: + | 
- name: p0 + | type: string + | - name: p1 + | type: int + | default: 12 + | template: + | kind: values + | records: + | - ["$p0",$p1] + | schema: + | kind: inline + | fields: + | - name: str_col + | type: string + | - name: int_col + | type: integer + | + |mappings: + | rel_1: + | kind: template/user + | p0: some_value + | rel_2: + | kind: template/user + | broadcast: true + | p0: some_value + | rel_3: + | kind: template/user + | p0: some_value + | p1: 27 + | cache: MEMORY_AND_DISK + | rel_4: + | kind: template/user + | p0: some_value + | p3: no_such_param + |""".stripMargin + + val project = Module.read.string(spec).toProject("project") + val session = Session.builder().disableSpark().build() + val context = session.getContext(project) + + val map_1 = context.getMapping(MappingIdentifier("rel_1")) + map_1 shouldBe a[ValuesMapping] + map_1.name should be("rel_1") + map_1.identifier should be(MappingIdentifier("project/rel_1")) + map_1.kind should be("values") + map_1.broadcast should be(false) + map_1.checkpoint should be(false) + map_1.cache should be(StorageLevel.NONE) + + val map_2 = context.getMapping(MappingIdentifier("rel_2")) + map_2 shouldBe a[ValuesMapping] + map_2.name should be("rel_2") + map_2.identifier should be(MappingIdentifier("project/rel_2")) + map_2.kind should be("values") + map_2.broadcast should be(true) + map_2.checkpoint should be(false) + map_2.cache should be(StorageLevel.NONE) + + val map_3 = context.getMapping(MappingIdentifier("rel_3")) + map_3 shouldBe a[ValuesMapping] + map_3.name should be("rel_3") + map_3.identifier should be(MappingIdentifier("project/rel_3")) + map_3.kind should be("values") + map_3.broadcast should be(false) + map_3.checkpoint should be(false) + map_3.cache should be(StorageLevel.MEMORY_AND_DISK) + } + it should "throw an error on unknown templates" in { val spec = """ From 14c5324b77281e4667f77d0f78e38d1779219a6f Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 2 Nov 2022 17:30:21 +0100 Subject: [PATCH 37/52] Fix handling of relative local paths --- .../scala/com/dimajix/flowman/fs/File.scala | 2 ++ .../com/dimajix/flowman/fs/FileSystem.scala | 10 +++++++-- .../com/dimajix/flowman/fs/HadoopFile.scala | 4 ++++ .../com/dimajix/flowman/fs/JavaFile.scala | 5 +++++ .../dimajix/flowman/fs/FileSystemTest.scala | 9 ++++++++ .../spec/documentation/FileGenerator.scala | 2 +- .../flowman/spec/hook/SimpleReportHook.scala | 21 ++++++++++--------- .../com/dimajix/flowman/tools/Tool.scala | 1 - 8 files changed, 40 insertions(+), 14 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala index 102a4f58f..dcb7e774d 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala @@ -99,6 +99,8 @@ abstract class File { */ def create(overwrite:Boolean = false) : OutputStream + def append() : OutputStream + /** * Opens an existing file and returns the corresponding input stream * @return diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala index 50ce98a8a..2160a37a6 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala @@ -56,8 +56,14 @@ case class FileSystem(conf:Configuration) { def local(path:java.io.File) : File = JavaFile(path.toPath) def local(path:URI) : File = { if 
(path.getScheme == null) { - val uri = new URI("file", path.getUserInfo, path.getHost, path.getPort, path.getPath, path.getQuery, path.getFragment) - JavaFile(Paths.get(uri)) + val file = Paths.get(path.getPath) + if (!file.isAbsolute) { + JavaFile(file.toAbsolutePath) + } + else { + val uri = new URI("file", path.getUserInfo, path.getHost, path.getPort, path.getPath, path.getQuery, path.getFragment) + JavaFile(Paths.get(uri)) + } } else { JavaFile(Paths.get(path)) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala index 032d14c23..77e5a508e 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala @@ -153,6 +153,10 @@ case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends Fil fs.create(path, overwrite) } + def append(): FSDataOutputStream = { + fs.append(path) + } + /** * Opens an existing file and returns the corresponding input stream * @return diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala index 315912210..73beb2623 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala @@ -134,6 +134,11 @@ case class JavaFile(jpath:Path) extends File { Files.newOutputStream(jpath, StandardOpenOption.CREATE_NEW, StandardOpenOption.WRITE) } + override def append(): OutputStream = { + Files.createDirectories(jpath.getParent) + Files.newOutputStream(jpath, StandardOpenOption.CREATE, StandardOpenOption.APPEND, StandardOpenOption.WRITE) + } + /** * Opens an existing file and returns the corresponding input stream * diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index 5136e4f29..8336506cb 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -45,6 +45,15 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { tmpFromUri.isDirectory() should be (true) } + it should "be usable relative paths" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.local("target/classes") + file.exists() should be(true) + file.isFile() should be(false) + file.isDirectory() should be(true) + } + it should "be usable with Paths" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/documentation/FileGenerator.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/documentation/FileGenerator.scala index ca0b617bc..5757b185e 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/documentation/FileGenerator.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/documentation/FileGenerator.scala @@ -68,7 +68,7 @@ case class FileGenerator( outputDir.list().foreach(_.delete(true)) } else if (outputDir.isFile()) { - outputDir.isFile() + outputDir.delete(false) } outputDir.mkdirs() diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/hook/SimpleReportHook.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/hook/SimpleReportHook.scala index fd4522489..a793db2a1 100644 --- 
a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/hook/SimpleReportHook.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/hook/SimpleReportHook.scala @@ -70,7 +70,7 @@ case class SimpleReportHook( ) extends BaseHook { private val logger = LoggerFactory.getLogger(classOf[ReportHook]) - private def newOutput():Option[PrintStream] = { + private def newOutput(execution:Execution):Option[PrintStream] = { if (location.toString == "stdout") { Some(System.out) } @@ -78,17 +78,18 @@ case class SimpleReportHook( Some(System.err) } else { - val fs = location.getFileSystem(context.hadoopConf) + val fs = execution.fs + val file = fs.file(location) val out = mode match { - case OutputMode.OVERWRITE => fs.create(location) - case OutputMode.APPEND => fs.append(location) + case OutputMode.OVERWRITE => file.create(true) + case OutputMode.APPEND => file.append() case OutputMode.ERROR_IF_EXISTS => - if (fs.exists(location)) + if (file.exists()) throw new FileAlreadyExistsException(s"Cannot open report output, file $location already exists") - fs.create(location) + file.create(false) case OutputMode.IGNORE_IF_EXISTS => - if (!fs.exists(location)) { - fs.create(location) + if (!file.exists()) { + file.create(false) } else { null @@ -172,7 +173,7 @@ case class SimpleReportHook( override def startLifecycle(execution:Execution, job:Job, instance:JobLifecycle) : LifecycleToken = { val now = Instant.now() logger.info(s"Creating new report to $location") - val output = newOutput() + val output = newOutput(execution) output.foreach { p => printBigTitle(p, s"Processing job ${job.identifier} at $now") printEnvironment(p, job.context) @@ -207,7 +208,7 @@ case class SimpleReportHook( val now = Instant.now() val output = parent.flatMap { case ReporterLifecycleToken(output) => output - case _ => newOutput() + case _ => newOutput(execution) } output.foreach { p => printTitle(p, s"${instance.phase} job ${job.identifier} at $now") diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala index d7ea6a0c3..525435e73 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala @@ -85,7 +85,6 @@ class Tool { val fs = FileSystem(hadoopConfig) // Load Project. 
If no schema is specified, load from local file system - // TODO: Support resources in jar files val projectUri = projectPath.toUri if (projectUri.getAuthority == null && projectUri.getScheme == null) Project.read.file(fs.local(projectPath)) From 4e9fa3bc73707793c260bbad01522bdde013efa9 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Wed, 2 Nov 2022 20:16:42 +0100 Subject: [PATCH 38/52] github-273 Improve support for projects in jar files --- .../scala/com/dimajix/flowman/fs/File.scala | 7 +- .../com/dimajix/flowman/fs/FileSystem.scala | 10 ++- .../com/dimajix/flowman/fs/HadoopFile.scala | 15 +++- .../com/dimajix/flowman/fs/JavaFile.scala | 16 ++++- .../dimajix/flowman/fs/FileSystemTest.scala | 9 ++- .../dimajix/flowman/fs/HadoopFileTest.scala | 9 +++ .../com/dimajix/flowman/fs/JavaFileTest.scala | 70 ++++++++++++++++--- .../dimajix/flowman/tools/StatefulTool.scala | 7 +- .../com/dimajix/flowman/tools/Tool.scala | 21 +++--- .../dimajix/flowman/tools/exec/Driver.scala | 11 +-- .../dimajix/flowman/tools/main/Driver.scala | 2 +- .../dimajix/flowman/tools/shell/Shell.scala | 3 +- .../tools/shell/project/LoadCommand.scala | 2 +- .../tools/shell/project/ReloadCommand.scala | 2 +- 14 files changed, 142 insertions(+), 42 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala index dcb7e774d..719b3bc20 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/File.scala @@ -18,6 +18,7 @@ package com.dimajix.flowman.fs import java.io.InputStream import java.io.OutputStream +import java.net.URI import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path @@ -38,6 +39,8 @@ abstract class File { def path : Path + def uri : URI + /** * Creates a new File object by attaching a child entry * @param sub @@ -49,9 +52,7 @@ abstract class File { * Returns the file name of the File * @return */ - def name : String = { - path.getName - } + def name : String /** * Returns the parent directory of the File diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala index 2160a37a6..a1f75b1ee 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala @@ -44,10 +44,14 @@ case class FileSystem(conf:Configuration) { } def file(path:String) : File = { val uri = new URI(path) - if (uri.getScheme == "jar") + if (uri.getScheme == "jar") { resource(uri) - else - file(new Path(path)) + } + else { + val p = new Path(path) + val fs = p.getFileSystem(conf) + HadoopFile(fs, p) + } } def file(path:URI) : File = file(new Path(path)) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala index 77e5a508e..c0e0bf02c 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala @@ -35,14 +35,14 @@ import org.apache.hadoop.io.IOUtils * @param path */ case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends File { - override def toString: String = if (path != null) path.toString else "" + override def uri : URI = path.toUri /** * Creates a new File object by attaching a child entry * @param sub * @return */ - def /(sub:String) : File = { + override def 
/(sub:String) : File = { val rel = new Path(new URI(sub)) if (rel.isAbsolute) HadoopFile(fs, rel) @@ -50,11 +50,20 @@ case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends Fil HadoopFile(fs, new Path(path, sub)) } + /** + * Returns the file name of the File + * + * @return + */ + override def name: String = { + path.getName + } + /** * Returns the parent directory of the File * @return */ - def parent : File = { + override def parent : File = { val p = path.getParent if (p == null) { this diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala index 73beb2623..acdb97775 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala @@ -33,7 +33,11 @@ import org.apache.hadoop.fs case class JavaFile(jpath:Path) extends File { - override def path: fs.Path = new fs.Path(jpath.toUri) + override def toString: String = uri.toString + + override def path: fs.Path = new fs.Path(uri) + + override def uri : URI = new URI(jpath.toUri.toString.replace("file:///", "file:/")) /** * Creates a new File object by attaching a child entry @@ -49,6 +53,16 @@ case class JavaFile(jpath:Path) extends File { JavaFile(jpath.resolve(sub)) } + /** + * Returns the file name of the File + * + * @return + */ + override def name : String = { + val n = jpath.getFileName + if (n != null) n.toString else "" + } + /** * Returns the parent directory of the File * diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index 8336506cb..ca3f4d54a 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -49,6 +49,7 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val file = fs.local("target/classes") + file.name should be ("classes") file.exists() should be(true) file.isFile() should be(false) file.isDirectory() should be(true) @@ -58,6 +59,8 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val tmpFromUri = fs.local(new Path(tempDir.toString)) + //tmpFromUri.path should be (new Path("file:" + tempDir.toString + "/")) + tmpFromUri.path should be (new Path(tempDir.toURI)) tmpFromUri.exists() should be(true) tmpFromUri.isFile() should be(false) tmpFromUri.isDirectory() should be(true) @@ -67,6 +70,8 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val tmpFromUri = fs.local(tempDir.toURI) + //tmpFromUri.path should be(new Path("file:" + tempDir.toString + "/")) + tmpFromUri.path should be(new Path(tempDir.toURI)) tmpFromUri.exists() should be (true) tmpFromUri.isFile() should be (false) tmpFromUri.isDirectory() should be (true) @@ -76,7 +81,9 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val tmp = fs.local(tempDir) - val file = tmp / ("lala-" + System.currentTimeMillis().toString + ".tmp") + val name = "lala-" + System.currentTimeMillis().toString + ".tmp" + val file = tmp / name + file.name 
should be (name) file.exists() should be (false) file.isFile() should be (false) file.isDirectory() should be (false) diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/HadoopFileTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/HadoopFileTest.scala index 1d799a804..5e2a5289d 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/HadoopFileTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/HadoopFileTest.scala @@ -16,8 +16,11 @@ package com.dimajix.flowman.fs +import java.net.URI + import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs +import org.apache.hadoop.fs.Path import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers @@ -29,6 +32,7 @@ class HadoopFileTest extends AnyFlatSpec with Matchers with LocalTempDir { "The HadoopFile" should "work" in { val dir = HadoopFile(localFs, new fs.Path(tempDir.toURI)) + dir.uri should be (tempDir.toURI) dir.path should be (new fs.Path(tempDir.toURI)) dir.exists() should be (true) dir.isFile() should be (false) @@ -38,6 +42,9 @@ class HadoopFileTest extends AnyFlatSpec with Matchers with LocalTempDir { (dir / dir.toString) should be (dir) val file = dir / "lala" + file.uri should be(tempDir.toURI.resolve("lala")) + file.path should be(new Path(tempDir.toURI.resolve("lala"))) + file.path should be(new Path(tempDir.toURI.toString, "lala")) file.exists() should be (false) file.isFile() should be (false) file.isDirectory() should be (false) @@ -50,6 +57,8 @@ class HadoopFileTest extends AnyFlatSpec with Matchers with LocalTempDir { it should "work at root level" in { val dir = HadoopFile(localFs, new fs.Path("file:/")) dir.parent should be (dir) + dir.uri should be (new URI("file:/")) + dir.name should be ("") dir.path should be(new fs.Path("file:/")) dir.exists() should be(true) dir.isFile() should be(false) diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala index a8db5f923..b21097131 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala @@ -16,10 +16,12 @@ package com.dimajix.flowman.fs +import java.net.URI import java.nio.file.Paths import java.util.Collections import org.apache.hadoop.fs +import org.apache.hadoop.fs.Path import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers @@ -30,6 +32,7 @@ import com.dimajix.spark.testing.LocalTempDir class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { "The JavaFile" should "work" in { val dir = JavaFile(tempDir.toPath) + dir.uri should be (tempDir.toURI) dir.path should be (new fs.Path(tempDir.toURI)) dir.exists() should be (true) dir.isFile() should be (false) @@ -39,6 +42,8 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { (dir / dir.toString) should be (dir) val file = dir / "lala" + file.name should be ("lala") + file.uri should be (tempDir.toURI.resolve("lala")) file.exists() should be(false) file.isFile() should be(false) file.isDirectory() should be(false) @@ -51,6 +56,8 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { it should "work at root level" in { val dir = JavaFile(tempDir.toPath.getRoot) dir.parent should be (dir) + dir.name should be ("") + dir.uri should be(new URI("file:/")) dir.path should be(new fs.Path("file:/")) dir.exists() should be(true) dir.isFile() should be(false) @@ -58,6 
+65,7 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { dir.isAbsolute() should be(true) val file = dir / "lala" + file.name should be ("lala") file.exists() should be(false) file.isFile() should be(false) file.isDirectory() should be(false) @@ -69,7 +77,12 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { it should "support creating entries" in { val tmp = JavaFile(tempDir.toPath) - val file = tmp / ("lala-" + System.currentTimeMillis().toString + ".tmp") + val name = "lala-" + System.currentTimeMillis().toString + ".tmp" + val file = tmp / name + file.uri should be (tempDir.toURI.resolve(name)) + file.path should be (new Path(tempDir.toURI.resolve(name))) + file.path should be (new Path(tempDir.toURI.toString ,name)) + file.name should be (name) file.exists() should be(false) file.isFile() should be(false) file.isDirectory() should be(false) @@ -88,34 +101,69 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { it should "support resources somewhere" in { val res = Resources.getURL("com/dimajix/flowman/flowman.properties") val file = JavaFile(Paths.get(res.toURI)) + file.name should be ("flowman.properties") + file.uri should be (res.toURI) + file.path.toUri should be (res.toURI) + file.path should be (new Path(res.toURI)) file.exists() should be (true) file.isFile() should be (true) file.isAbsolute() should be (true) file.isDirectory() should be(false) val res1 = Resources.getURL("com/dimajix/flowman") - val dir = JavaFile(Paths.get(res1.toURI)) - dir.exists() should be(true) - dir.isFile() should be(false) - dir.isAbsolute() should be(true) - dir.isDirectory() should be(true) + val dir1 = JavaFile(Paths.get(res1.toURI)) + dir1.name should be ("flowman") + //dir1.uri should be (res1.toURI) + dir1.path.toString should be (res1.toURI.toString + "/") + dir1.exists() should be(true) + dir1.isFile() should be(false) + dir1.isAbsolute() should be(true) + dir1.isDirectory() should be(true) + + val res2 = Resources.getURL("com/dimajix/flowman/") + val dir2 = JavaFile(Paths.get(res2.toURI)) + dir2.name should be ("flowman") + dir2.uri should be (res2.toURI) + dir2.path should be(new Path(res2.toURI)) + dir2.path.toUri should be(res2.toURI) + dir2.exists() should be(true) + dir2.isFile() should be(false) + dir2.isAbsolute() should be(true) + dir2.isDirectory() should be(true) } it should "support resources in JARs" in { val res = Resources.getURL("org/apache/spark/SparkContext.class") val xyz = java.nio.file.FileSystems.newFileSystem(res.toURI, Collections.emptyMap[String,String]()) val file = JavaFile(Paths.get(res.toURI)) + file.uri should be (res.toURI) + file.path.toUri should be (res.toURI) + file.path should be (new Path(res.toURI)) + file.name should be ("SparkContext.class") file.exists() should be(true) file.isFile() should be(true) file.isAbsolute() should be(true) file.isDirectory() should be(false) val res1 = Resources.getURL("org/apache/spark") - val dir = JavaFile(Paths.get(res1.toURI)) - dir.exists() should be(true) - dir.isFile() should be(false) - dir.isAbsolute() should be(true) - dir.isDirectory() should be(true) + val dir1 = JavaFile(Paths.get(res1.toURI)) + dir1.uri should be (res1.toURI) + dir1.path should be (new Path(res1.toURI)) + dir1.name should be ("spark") + dir1.exists() should be(true) + dir1.isFile() should be(false) + dir1.isAbsolute() should be(true) + dir1.isDirectory() should be(true) + + val res2 = Resources.getURL("org/apache/spark/") + val dir2 = JavaFile(Paths.get(res2.toURI)) + 
//dir2.uri should be(res2.toURI) + //dir2.path should be(new Path(res2.toURI)) + dir2.name should be("spark") + dir2.exists() should be(true) + dir2.isFile() should be(false) + dir2.isAbsolute() should be(true) + dir2.isDirectory() should be(true) xyz.close() } diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/StatefulTool.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/StatefulTool.scala index d978b61f9..0d7d20f53 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/StatefulTool.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/StatefulTool.scala @@ -1,9 +1,10 @@ package com.dimajix.flowman.tools -import org.apache.hadoop.fs.Path +import java.net.URI import com.dimajix.flowman.execution.Context import com.dimajix.flowman.execution.Session +import com.dimajix.flowman.fs.File import com.dimajix.flowman.model.Job import com.dimajix.flowman.model.Project import com.dimajix.flowman.model.Test @@ -56,9 +57,9 @@ class StatefulTool( _session } - override def loadProject(path: Path): Project = { + override def loadProject(file:File): Project = { // First try to load new project - _project = super.loadProject(path) + _project = super.loadProject(file) // Then create new session. If project loading fails, the old session will remain newSession() diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala index 525435e73..bfb6a1652 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/Tool.scala @@ -17,10 +17,9 @@ package com.dimajix.flowman.tools import java.io.File -import java.net.URL +import java.net.URI import org.apache.hadoop.conf.Configuration -import org.apache.hadoop.fs.Path import com.dimajix.common.Resources import com.dimajix.flowman.common.ToolConfig @@ -79,17 +78,23 @@ class Tool { ns } - def loadProject(projectPath:Path) : Project = { + def loadProject(projectPath:String) : Project = { // Create Hadoop FileSystem instance val hadoopConfig = new Configuration() val fs = FileSystem(hadoopConfig) // Load Project. 
If no schema is specified, load from local file system - val projectUri = projectPath.toUri - if (projectUri.getAuthority == null && projectUri.getScheme == null) - Project.read.file(fs.local(projectPath)) - else - Project.read.file(fs.file(projectPath)) + val uri = new URI(projectPath) + val file = + if (uri.getAuthority == null && uri.getScheme == null) + fs.local(projectPath) + else + fs.file(projectPath) + loadProject(file) + } + + def loadProject(file:com.dimajix.flowman.fs.File) : Project = { + Project.read.file(file) } def createSession( diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/exec/Driver.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/exec/Driver.scala index d4a4cca70..424bca62b 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/exec/Driver.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/exec/Driver.scala @@ -16,24 +16,25 @@ package com.dimajix.flowman.tools.exec +import java.net.URI + import scala.util.Failure import scala.util.Success import scala.util.Try -import org.apache.hadoop.fs.Path import org.kohsuke.args4j.CmdLineException import org.slf4j.LoggerFactory import com.dimajix.flowman.FLOWMAN_VERSION import com.dimajix.flowman.HADOOP_BUILD_VERSION -import com.dimajix.flowman.JAVA_VERSION -import com.dimajix.flowman.SPARK_VERSION import com.dimajix.flowman.HADOOP_VERSION +import com.dimajix.flowman.JAVA_VERSION import com.dimajix.flowman.SCALA_VERSION import com.dimajix.flowman.SPARK_BUILD_VERSION +import com.dimajix.flowman.SPARK_VERSION import com.dimajix.flowman.common.Logging -import com.dimajix.flowman.common.ToolConfig import com.dimajix.flowman.common.ParserUtils.splitSettings +import com.dimajix.flowman.common.ToolConfig import com.dimajix.flowman.execution.Status import com.dimajix.flowman.tools.Tool import com.dimajix.flowman.util.ConsoleColors @@ -117,7 +118,7 @@ class Driver(options:Arguments) extends Tool { } else { // Create Flowman Session, which also includes a Spark Session - val project = loadProject(new Path(options.projectFile)) + val project = loadProject(options.projectFile) val config = splitSettings(options.config) val environment = splitSettings(options.environment) diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/main/Driver.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/main/Driver.scala index 45c385828..648641790 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/main/Driver.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/main/Driver.scala @@ -68,7 +68,7 @@ class Driver(options:Arguments) extends Tool { * @return */ def run() : Boolean = { - val project = loadProject(new Path(options.projectFile)) + val project = loadProject(options.projectFile) // Create Flowman Session, which also includes a Spark Session val config = splitSettings(options.config) diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/Shell.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/Shell.scala index c5ff784ec..867b95718 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/Shell.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/Shell.scala @@ -17,6 +17,7 @@ package com.dimajix.flowman.tools.shell import java.io.File +import java.net.URI import scala.collection.JavaConverters._ import scala.util.Failure @@ -96,7 +97,7 @@ object Shell { Logging.setSparkLogging(options.sparkLogging) _instance = new Shell(options) - _instance.loadProject(new 
Path(options.projectFile)) + _instance.loadProject(options.projectFile) _instance.run() } } diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/project/LoadCommand.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/project/LoadCommand.scala index b8ea5c0ca..dad0527d5 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/project/LoadCommand.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/project/LoadCommand.scala @@ -39,7 +39,7 @@ class LoadCommand extends Command { override def execute(session: Session, project:Project, context:Context): Status = { try { - Shell.instance.loadProject(new Path(this.project)) + Shell.instance.loadProject(this.project) Status.SUCCESS } catch { diff --git a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/project/ReloadCommand.scala b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/project/ReloadCommand.scala index f2d6200ce..d4d7d8cc1 100644 --- a/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/project/ReloadCommand.scala +++ b/flowman-tools/src/main/scala/com/dimajix/flowman/tools/shell/project/ReloadCommand.scala @@ -36,7 +36,7 @@ class ReloadCommand extends Command { override def execute(session: Session, project:Project, context:Context): Status = { project.filename.map { fn => try { - Shell.instance.loadProject(fn.path) + Shell.instance.loadProject(fn) Status.SUCCESS } catch { From bbf55a496bcc266f664a733544927f722c42d59a Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Thu, 3 Nov 2022 07:22:15 +0100 Subject: [PATCH 39/52] Reduce log level for Hive --- .../resources/com/dimajix/spark/testing/log4j.properties | 6 +++--- .../com/dimajix/flowman/testing/log4j-defaults.properties | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties b/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties index de2a12461..07921ee81 100644 --- a/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties +++ b/flowman-spark-testing/src/main/resources/com/dimajix/spark/testing/log4j.properties @@ -32,9 +32,9 @@ log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR # Set Spark general logging to WARN log4j.logger.org.apache.spark=WARN -# Set Hadoop related logging to WARN -log4j.logger.org.apache.hadoop=WARN -log4j.logger.org.apache.hadoop.hive=WARN +# Set Hadoop related logging to WARN/ERROR +log4j.logger.org.apache.hadoop=ERROR +log4j.logger.org.apache.hadoop.hive=ERROR # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL diff --git a/flowman-testing/src/main/resources/com/dimajix/flowman/testing/log4j-defaults.properties b/flowman-testing/src/main/resources/com/dimajix/flowman/testing/log4j-defaults.properties index 4a276af9e..175c7682d 100644 --- a/flowman-testing/src/main/resources/com/dimajix/flowman/testing/log4j-defaults.properties +++ b/flowman-testing/src/main/resources/com/dimajix/flowman/testing/log4j-defaults.properties @@ -32,9 +32,9 @@ log4j.logger.org.spark_project.jetty.util.component.AbstractLifeCycle=ERROR # Set Spark general logging to WARN log4j.logger.org.apache.spark=WARN -# Set Hadoop related logging to WARN +# Set Hadoop related logging to WARN/ERROR log4j.logger.org.apache.hadoop=WARN 
-log4j.logger.org.apache.hadoop.hive=WARN
+log4j.logger.org.apache.hadoop.hive=ERROR
 
 # SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support
 log4j.logger.org.apache.hadoop.hive.metastore.RetryingHMSHandler=FATAL

From c0db29b4ca1d1331edce406896e49c834738bd8a Mon Sep 17 00:00:00 2001
From: Kaya Kupferschmidt
Date: Thu, 3 Nov 2022 07:27:58 +0100
Subject: [PATCH 40/52] github-273 Fix build for Java 1.8

---
 .../main/scala/com/dimajix/flowman/fs/JavaFile.scala | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala
index acdb97775..6041615d8 100644
--- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala
+++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala
@@ -31,6 +31,8 @@ import scala.collection.JavaConverters._
 
 import org.apache.hadoop.fs
 
+import com.dimajix.common.text.StringUtils
+
 
 case class JavaFile(jpath:Path) extends File {
     override def toString: String = uri.toString
@@ -60,7 +62,13 @@ case class JavaFile(jpath:Path) extends File {
      */
     override def name : String = {
         val n = jpath.getFileName
-        if (n != null) n.toString else ""
+        if (n != null) {
+            // Remove trailing "/". Required for Java 1.8 (not Java 11)
+            val sep = jpath.getFileSystem.getSeparator.head
+            n.toString.takeWhile(_ != sep)
+        } else {
+            ""
+        }
     }

From f880eef8ddb2b4862609ad86461fed7abd03f039 Mon Sep 17 00:00:00 2001
From: Kaya Kupferschmidt
Date: Thu, 3 Nov 2022 12:00:31 +0100
Subject: [PATCH 41/52] github-273 Fix some issues with new File API

---
 .../com/dimajix/flowman/fs/FileSystem.scala   |  24 +-
 .../com/dimajix/flowman/fs/JavaFile.scala     |   2 +-
 .../dimajix/flowman/fs/FileSystemTest.scala   | 431 +++++++++++++++++-
 .../flowman/spec/hook/ReportHook.scala        |  21 +-
 .../com/dimajix/flowman/testing/Runner.scala  |   4 -
 5 files changed, 440 insertions(+), 42 deletions(-)

diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala
index a1f75b1ee..f55da7be1 100644
--- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala
+++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala
@@ -39,7 +39,15 @@ case class FileSystem(conf:Configuration) {
         }
         else {
             val fs = path.getFileSystem(conf)
-            HadoopFile(fs, path)
+            val uri = path.toUri
+            if (uri.getScheme == null && path.isAbsolute) {
+                val p = new Path(fs.getScheme, uri.getAuthority, uri.getPath)
+                HadoopFile(fs, p)
+
+            }
+            else {
+                HadoopFile(fs, path)
+            }
         }
     }
     def file(path:String) : File = {
@@ -48,29 +56,27 @@ case class FileSystem(conf:Configuration) {
             resource(uri)
         }
         else {
-            val p = new Path(path)
-            val fs = p.getFileSystem(conf)
-            HadoopFile(fs, p)
+            file(new Path(path))
         }
     }
     def file(path:URI) : File = file(new Path(path))
 
     def local(path:Path) : File = local(path.toUri)
-    def local(path:String) : File = JavaFile(Paths.get(path))
-    def local(path:java.io.File) : File = JavaFile(path.toPath)
+    def local(path:String) : File = JavaFile(Paths.get(path).normalize())
+    def local(path:java.io.File) : File = JavaFile(path.toPath.normalize())
     def local(path:URI) : File = {
         if (path.getScheme == null) {
             val file = Paths.get(path.getPath)
             if (!file.isAbsolute) {
-                JavaFile(file.toAbsolutePath)
+                JavaFile(file.normalize())
             }
             else {
                 val uri = new URI("file", path.getUserInfo, path.getHost, path.getPort, path.getPath, path.getQuery,
path.getFragment) - JavaFile(Paths.get(uri)) + JavaFile(Paths.get(uri).normalize()) } } else { - JavaFile(Paths.get(path)) + JavaFile(Paths.get(path).normalize()) } } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala index 6041615d8..7c92e2e43 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala @@ -50,7 +50,7 @@ case class JavaFile(jpath:Path) extends File { override def /(sub: String): File = { val uri = new URI(sub) if (uri.isAbsolute) - JavaFile(Paths.get(uri)) + JavaFile(Paths.get(uri).normalize().toAbsolutePath) else JavaFile(jpath.resolve(sub)) } diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index ca3f4d54a..151f5b1a3 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -16,6 +16,7 @@ package com.dimajix.flowman.fs +import java.net.URI import java.nio.file.NoSuchFileException import org.apache.hadoop.fs.Path @@ -30,51 +31,204 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { "FileSystem.local" should "be usable with simple strings" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) - val tmpFromString = fs.local(tempDir.toString) - tmpFromString.exists() should be (true) - tmpFromString.isFile() should be (false) - tmpFromString.isDirectory() should be (true) + val dir1 = fs.local(tempDir.toString) + dir1.uri should be (tempDir.toURI) + dir1.path should be (new Path(tempDir.toURI)) + //dir1.path should be (new Path(tempDir.toURI.toString)) + dir1.exists() should be (true) + dir1.isFile() should be (false) + dir1.isDirectory() should be (true) + + val dir2 = fs.local(tempDir.toString + "/lolo") + dir2.uri should be(tempDir.toURI.resolve("lolo")) + dir2.path should be(new Path(tempDir.toURI.resolve("lolo"))) + dir2.path should be (new Path(tempDir.toURI.resolve("lolo").toString)) + dir2.name should be ("lolo") + dir2.exists() should be(false) + dir2.isFile() should be(false) + dir2.isDirectory() should be(false) + + val dir3 = fs.local(tempDir.toString + "/lolo/") + dir3.uri should be(tempDir.toURI.resolve("lolo")) + dir3.path should be(new Path(tempDir.toURI.resolve("lolo"))) + dir3.path should be(new Path(tempDir.toURI.resolve("lolo").toString)) + dir3.name should be("lolo") + dir3.exists() should be(false) + dir3.isFile() should be(false) + dir3.isDirectory() should be(false) + } + + it should "be usable with URIs" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val dir = fs.local(tempDir.toURI) + dir.uri should be(tempDir.toURI) + dir.path should be(new Path(tempDir.toURI)) + //tmpFromUri.path should be(new Path(tempDir.toURI.toString)) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isDirectory() should be(true) + + val dir2 = fs.local(tempDir.toURI.resolve("lolo")) + dir2.uri should be(tempDir.toURI.resolve("lolo")) + dir2.path should be(new Path(tempDir.toURI.resolve("lolo"))) + dir2.path should be(new Path(tempDir.toURI.resolve("lolo").toString)) + dir2.name should be("lolo") + dir2.exists() should be(false) + dir2.isFile() should be(false) + dir2.isDirectory() should be(false) + + val dir3 = fs.local(tempDir.toURI.resolve("lolo/")) + dir3.uri 
should be(tempDir.toURI.resolve("lolo")) + dir3.path should be(new Path(tempDir.toURI.resolve("lolo"))) + dir3.path should be(new Path(tempDir.toURI.resolve("lolo").toString)) + dir3.name should be("lolo") + dir3.exists() should be(false) + dir3.isFile() should be(false) + dir3.isDirectory() should be(false) + } + + it should "be usable with Paths" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val tmpFromUri = fs.local(new Path(tempDir.toString)) + //tmpFromUri.path should be (new Path("file:" + tempDir.toString + "/")) + tmpFromUri.path should be(new Path(tempDir.toURI)) + tmpFromUri.uri should be(tempDir.toURI) + tmpFromUri.exists() should be(true) + tmpFromUri.isFile() should be(false) + tmpFromUri.isDirectory() should be(true) + + val dir = tmpFromUri / "lala" + dir.uri should be(new Path(new Path(tempDir.toURI), "lala").toUri) + dir.path should be(new Path(new Path(tempDir.toURI), "lala")) + dir.name should be("lala") + val file = dir / "lolo.tmp" + file.uri should be(new Path(dir.path, "lolo.tmp").toUri) + file.path should be(new Path(dir.path, "lolo.tmp")) + file.name should be("lolo.tmp") + + val dir2 = fs.local(new Path(tempDir.toURI.resolve("lolo"))) + dir2.uri should be(tempDir.toURI.resolve("lolo")) + dir2.path should be(new Path(tempDir.toURI.resolve("lolo"))) + dir2.path should be(new Path(tempDir.toURI.resolve("lolo").toString)) + dir2.name should be("lolo") + dir2.exists() should be(false) + dir2.isFile() should be(false) + dir2.isDirectory() should be(false) + + val dir3 = fs.local(new Path(tempDir.toURI.resolve("lolo/"))) + dir3.uri should be(tempDir.toURI.resolve("lolo")) + dir3.path should be(new Path(tempDir.toURI.resolve("lolo"))) + dir3.path should be(new Path(tempDir.toURI.resolve("lolo").toString)) + dir3.name should be("lolo") + dir3.exists() should be(false) + dir3.isFile() should be(false) + dir3.isDirectory() should be(false) } it should "be usable with Files" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val tmpFromUri = fs.local(tempDir) + tmpFromUri.uri should be (tempDir.toURI) + tmpFromUri.path should be(new Path(tempDir.toURI)) + //tmpFromUri.path should be(new Path(tempDir.toURI.toString)) tmpFromUri.exists() should be (true) tmpFromUri.isFile() should be (false) tmpFromUri.isDirectory() should be (true) } - it should "be usable relative paths" in { + it should "be usable relative paths (String)" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val file = fs.local("target/classes") file.name should be ("classes") + file.path.isAbsolute should be (true) + file.uri.isAbsolute should be (true) file.exists() should be(true) file.isFile() should be(false) file.isDirectory() should be(true) + + val abs = file.absolute + abs.name should be("classes") + abs.path.isAbsolute should be(true) + abs.uri.isAbsolute should be(true) + abs.exists() should be(true) + abs.isFile() should be(false) + abs.isDirectory() should be(true) } - it should "be usable with Paths" in { + it should "be usable relative paths (Path)" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) - val tmpFromUri = fs.local(new Path(tempDir.toString)) + val file = fs.local(new Path("target/classes")) + file.name should be("classes") + file.path.isAbsolute should be(true) + file.uri.isAbsolute should be(true) + file.exists() should be(true) + file.isFile() should be(false) + file.isDirectory() should be(true) + + val abs = file.absolute + abs.name should 
be("classes") + abs.path.isAbsolute should be(true) + abs.uri.isAbsolute should be(true) + abs.exists() should be(true) + abs.isFile() should be(false) + abs.isDirectory() should be(true) + } + + it should "resolve relative Paths in local(String)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.local(tempDir.toString + "/lala/../lolo") //tmpFromUri.path should be (new Path("file:" + tempDir.toString + "/")) - tmpFromUri.path should be (new Path(tempDir.toURI)) - tmpFromUri.exists() should be(true) - tmpFromUri.isFile() should be(false) - tmpFromUri.isDirectory() should be(true) + file.path should be(new Path(tempDir.toURI.toString + "/lolo")) + file.uri should be(tempDir.toURI.resolve("lolo")) + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + + file.create(true).close + file.exists() should be(true) + file.isFile() should be(true) + file.isDirectory() should be(false) + + file.parent.exists() should be(true) + file.parent.isFile() should be(false) + file.parent.isDirectory() should be(true) + + file.delete() + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) } - it should "be usable with URIs" in { + it should "resolve relative Paths in local(URI)" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) - val tmpFromUri = fs.local(tempDir.toURI) - //tmpFromUri.path should be(new Path("file:" + tempDir.toString + "/")) - tmpFromUri.path should be(new Path(tempDir.toURI)) - tmpFromUri.exists() should be (true) - tmpFromUri.isFile() should be (false) - tmpFromUri.isDirectory() should be (true) + val file = fs.local(new URI(tempDir.toString + "/lala2/../lolo2")) + //tmpFromUri.path should be (new Path("file:" + tempDir.toString + "/")) + file.path should be(new Path(tempDir.toURI.toString + "/lolo2")) + file.uri should be(tempDir.toURI.resolve("lolo2")) + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + + file.create(true).close + file.exists() should be(true) + file.isFile() should be(true) + file.isDirectory() should be(false) + + file.parent.exists() should be(true) + file.parent.isFile() should be(false) + file.parent.isDirectory() should be(true) + + file.delete() + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) } it should "support creating entries" in { @@ -146,6 +300,98 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { tmpFromUri.isDirectory() should be(true) } + it should "be usable relative paths (String)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file("target/classes") + file.name should be("classes") + file.path.isAbsolute should be(false) + file.uri.isAbsolute should be(false) + file.exists() should be(true) + file.isFile() should be(false) + file.isDirectory() should be(true) + + val abs = file.absolute + abs.name should be("classes") + abs.path.isAbsolute should be(true) + abs.uri.isAbsolute should be(true) + abs.exists() should be(true) + abs.isFile() should be(false) + abs.isDirectory() should be(true) + } + + it should "be usable relative paths (Path)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(new Path("target/classes")) + file.name should be("classes") + file.path.isAbsolute should be(false) + file.uri.isAbsolute should be(false) 
+ file.exists() should be(true) + file.isFile() should be(false) + file.isDirectory() should be(true) + + val abs = file.absolute + abs.name should be("classes") + abs.path.isAbsolute should be(true) + abs.uri.isAbsolute should be(true) + abs.exists() should be(true) + abs.isFile() should be(false) + abs.isDirectory() should be(true) + } + + it should "resolve relative Paths in file(String)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(tempDir.toString + "/lala/../lolo") + //tmpFromUri.path should be (new Path("file:" + tempDir.toString + "/")) + file.path should be(new Path(tempDir.toURI.toString + "/lolo")) + file.uri should be(tempDir.toURI.resolve("lolo")) + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + + file.create(true).close + file.exists() should be(true) + file.isFile() should be(true) + file.isDirectory() should be(false) + + file.parent.exists() should be(true) + file.parent.isFile() should be(false) + file.parent.isDirectory() should be(true) + + file.delete() + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + } + + it should "resolve relative Paths in file(URI)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(new URI(tempDir.toString + "/lala/../lolo")) + //tmpFromUri.path should be (new Path("file:" + tempDir.toString + "/")) + file.path should be(new Path(tempDir.toURI.toString + "/lolo")) + file.uri should be(tempDir.toURI.resolve("lolo")) + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + + file.create(true).close + file.exists() should be(true) + file.isFile() should be(true) + file.isDirectory() should be(false) + + file.parent.exists() should be(true) + file.parent.isFile() should be(false) + file.parent.isDirectory() should be(true) + + file.delete() + file.exists() should be(false) + file.isFile() should be(false) + file.isDirectory() should be(false) + } + it should "support creating entries" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) @@ -235,63 +481,212 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val file = fs.file(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI) + file.uri should be (Resources.getURL("com/dimajix/flowman/flowman.properties").toURI) + file.path should be (new Path(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI)) + file.name should be ("flowman.properties") file.exists() should be(true) file.isFile() should be(true) file.isAbsolute() should be(true) file.isDirectory() should be(false) + val file2 = fs.file(Resources.getURL("com/dimajix/flowman/../flowman/flowman.properties").toURI) + file2.uri should be(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI) + file2.path should be(new Path(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI)) + file2.name should be("flowman.properties") + file2.exists() should be(true) + file2.isFile() should be(true) + file2.isAbsolute() should be(true) + file2.isDirectory() should be(false) + val dir = fs.file(Resources.getURL("com/dimajix/flowman").toURI) + dir.uri should be(Resources.getURL("com/dimajix/flowman").toURI) + dir.path should be(new Path(Resources.getURL("com/dimajix/flowman").toURI)) + dir.name should 
be("flowman") dir.exists() should be(true) dir.isFile() should be(false) dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + + val dir2 = fs.file(Resources.getURL("com/dimajix/flowman/").toURI) + dir2.uri should be(Resources.getURL("com/dimajix/flowman/").toURI) + dir2.path should be(new Path(Resources.getURL("com/dimajix/flowman/").toURI)) + //dir2.name should be("flowman") + dir2.exists() should be(true) + dir2.isFile() should be(false) + dir2.isAbsolute() should be(true) + dir2.isDirectory() should be(true) + } + + it should "support resources somewhere via 'file(Path)'" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(new Path(Resources.getURL("com/dimajix/flowman/flowman.properties").toString)) + file.uri should be(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI) + file.path should be(new Path(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI)) + file.name should be("flowman.properties") + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val file2 = fs.file(new Path(Resources.getURL("com/dimajix/flowman/../flowman/flowman.properties").toString)) + file2.uri should be(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI) + file2.path should be(new Path(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI)) + file2.name should be("flowman.properties") + file2.exists() should be(true) + file2.isFile() should be(true) + file2.isAbsolute() should be(true) + file2.isDirectory() should be(false) + + val dir = fs.file(new Path(Resources.getURL("com/dimajix/flowman").toString)) + dir.uri should be(Resources.getURL("com/dimajix/flowman").toURI) + dir.path should be(new Path(Resources.getURL("com/dimajix/flowman").toURI)) + dir.name should be("flowman") + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + + val dir2 = fs.file(new Path(Resources.getURL("com/dimajix/flowman/").toString)) + dir2.uri should be(Resources.getURL("com/dimajix/flowman").toURI) + dir2.path should be(new Path(Resources.getURL("com/dimajix/flowman").toURI)) + dir2.name should be("flowman") + dir2.exists() should be(true) + dir2.isFile() should be(false) + dir2.isAbsolute() should be(true) + dir2.isDirectory() should be(true) } it should "support resources somewhere via 'file(String)'" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val file = fs.file(Resources.getURL("com/dimajix/flowman/flowman.properties").toString) + file.uri should be(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI) + file.path should be(new Path(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI)) + file.name should be("flowman.properties") file.exists() should be(true) file.isFile() should be(true) file.isAbsolute() should be(true) file.isDirectory() should be(false) + val file2 = fs.file(Resources.getURL("com/dimajix/flowman/../flowman/flowman.properties").toString) + file2.uri should be(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI) + file2.path should be(new Path(Resources.getURL("com/dimajix/flowman/flowman.properties").toURI)) + file2.name should be("flowman.properties") + file2.exists() should be(true) + file2.isFile() should be(true) + file2.isAbsolute() should be(true) + file2.isDirectory() should be(false) + val dir = 
fs.file(Resources.getURL("com/dimajix/flowman").toString) + dir.uri should be(Resources.getURL("com/dimajix/flowman").toURI) + dir.path should be(new Path(Resources.getURL("com/dimajix/flowman").toURI)) + dir.name should be("flowman") dir.exists() should be(true) dir.isFile() should be(false) dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + + val dir2 = fs.file(Resources.getURL("com/dimajix/flowman/").toString) + dir2.uri should be(Resources.getURL("com/dimajix/flowman").toURI) + dir2.path should be(new Path(Resources.getURL("com/dimajix/flowman").toURI)) + dir2.name should be("flowman") + dir2.exists() should be(true) + dir2.isFile() should be(false) + dir2.isAbsolute() should be(true) + dir2.isDirectory() should be(true) } it should "support resources in JARs via 'file(URI)'" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) val file = fs.file(Resources.getURL("org/apache/spark/SparkContext.class").toURI) + file.uri should be(Resources.getURL("org/apache/spark/SparkContext.class").toURI) + file.path should be(new Path(Resources.getURL("org/apache/spark/SparkContext.class").toURI)) + file.name should be("SparkContext.class") file.exists() should be(true) file.isFile() should be(true) file.isAbsolute() should be(true) file.isDirectory() should be(false) val dir = fs.file(Resources.getURL("org/apache/spark").toURI) + dir.uri should be(Resources.getURL("org/apache/spark").toURI) + dir.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) + dir.name should be("spark") + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isAbsolute() should be(true) + dir.isDirectory() should be(true) + + val dir2 = fs.file(Resources.getURL("org/apache/spark/").toURI) + dir2.uri should be(Resources.getURL("org/apache/spark").toURI) + dir2.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) + dir2.name should be("spark") + dir2.exists() should be(true) + dir2.isFile() should be(false) + dir2.isAbsolute() should be(true) + dir2.isDirectory() should be(true) + } + + it should "support resources in JARs via 'file(Path)'" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(new Path(Resources.getURL("org/apache/spark/SparkContext.class").toURI)) + file.uri should be(Resources.getURL("org/apache/spark/SparkContext.class").toURI) + file.path should be(new Path(Resources.getURL("org/apache/spark/SparkContext.class").toURI)) + file.name should be("SparkContext.class") + file.exists() should be(true) + file.isFile() should be(true) + file.isAbsolute() should be(true) + file.isDirectory() should be(false) + + val dir = fs.file(new Path(Resources.getURL("org/apache/spark").toURI)) + dir.uri should be(Resources.getURL("org/apache/spark").toURI) + dir.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) + dir.name should be("spark") dir.exists() should be(true) dir.isFile() should be(false) dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + + val dir2 = fs.file(new Path(Resources.getURL("org/apache/spark/").toURI)) + dir2.uri should be(Resources.getURL("org/apache/spark").toURI) + dir2.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) + dir2.name should be("spark") + dir2.exists() should be(true) + dir2.isFile() should be(false) + dir2.isAbsolute() should be(true) + dir2.isDirectory() should be(true) } it should "support resources in JARs via 'file(String)'" in { val conf = spark.sparkContext.hadoopConfiguration val fs = 
FileSystem(conf) val file = fs.file(Resources.getURL("org/apache/spark/SparkContext.class").toString) + file.uri should be(Resources.getURL("org/apache/spark/SparkContext.class").toURI) + file.path should be(new Path(Resources.getURL("org/apache/spark/SparkContext.class").toURI)) + file.name should be("SparkContext.class") file.exists() should be(true) file.isFile() should be(true) file.isAbsolute() should be(true) file.isDirectory() should be(false) val dir = fs.file(Resources.getURL("org/apache/spark").toString) + dir.uri should be(Resources.getURL("org/apache/spark").toURI) + dir.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) + dir.name should be("spark") dir.exists() should be(true) dir.isFile() should be(false) dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + + val dir2 = fs.file(Resources.getURL("org/apache/spark/").toString) + dir2.uri should be(Resources.getURL("org/apache/spark").toURI) + dir2.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) + dir2.name should be("spark") + dir2.exists() should be(true) + dir2.isFile() should be(false) + dir2.isAbsolute() should be(true) + dir2.isDirectory() should be(true) } } diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/hook/ReportHook.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/hook/ReportHook.scala index ed59bf475..302233ef0 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/hook/ReportHook.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/hook/ReportHook.scala @@ -87,7 +87,7 @@ case class ReportHook( ) extends BaseHook { private val logger = LoggerFactory.getLogger(classOf[ReportHook]) - private def newOutput():Option[PrintStream] = { + private def newOutput(execution:Execution):Option[PrintStream] = { if (location.toString == "stdout") { Some(System.out) } @@ -95,17 +95,18 @@ case class ReportHook( Some(System.err) } else { - val fs = location.getFileSystem(context.hadoopConf) + val fs = execution.fs + val file = fs.file(location) val out = mode match { - case OutputMode.OVERWRITE => fs.create(location) - case OutputMode.APPEND => fs.append(location) + case OutputMode.OVERWRITE => file.create(true) + case OutputMode.APPEND => file.append() case OutputMode.ERROR_IF_EXISTS => - if (fs.exists(location)) + if (file.exists()) throw new FileAlreadyExistsException(s"Cannot open report output, file $location already exists") - fs.create(location) + file.create(false) case OutputMode.IGNORE_IF_EXISTS => - if (!fs.exists(location)) { - fs.create(location) + if (!file.exists()) { + file.create(false) } else { null @@ -145,7 +146,7 @@ case class ReportHook( */ override def startLifecycle(execution:Execution, job:Job, instance:JobLifecycle) : LifecycleToken = { logger.info(s"Creating new report to $location") - val output = newOutput() + val output = newOutput(execution) output.foreach { p => val vars = Map( "job" -> JobWrapper(job), @@ -205,7 +206,7 @@ case class ReportHook( val output = parent.flatMap { case ReporterLifecycleToken(output) => output - case _ => newOutput() + case _ => newOutput(execution) } output.foreach { p => val vars = Map( diff --git a/flowman-testing/src/main/scala/com/dimajix/flowman/testing/Runner.scala b/flowman-testing/src/main/scala/com/dimajix/flowman/testing/Runner.scala index 69c6d1928..f6e630b96 100644 --- a/flowman-testing/src/main/scala/com/dimajix/flowman/testing/Runner.scala +++ b/flowman-testing/src/main/scala/com/dimajix/flowman/testing/Runner.scala @@ -89,10 +89,6 @@ object Runner { 
this.project = Project.read.file(fs.local(project.toURI)) this } - def withProject(project:URI) : Builder = { - this.project = Project.read.file(fs.local(project)) - this - } def withProject(project:Project) : Builder = { this.project = project this From 378784e0921947207e8e422545fa8ed744152ec1 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Thu, 3 Nov 2022 16:29:26 +0100 Subject: [PATCH 42/52] github-273 Fix some issues with new File API in Java 8 --- .../com/dimajix/flowman/fs/FileSystem.scala | 25 ++++++++++++++++--- .../com/dimajix/flowman/fs/JavaFile.scala | 2 +- .../dimajix/flowman/fs/FileSystemTest.scala | 3 +++ 3 files changed, 25 insertions(+), 5 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala index f55da7be1..17defd77f 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala @@ -26,6 +26,12 @@ import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path import com.dimajix.common.Resources +import com.dimajix.flowman.fs.FileSystem.SEPARATOR + + +object FileSystem { + val SEPARATOR = "/" +} /** @@ -33,6 +39,7 @@ import com.dimajix.common.Resources * @param conf */ case class FileSystem(conf:Configuration) { + def file(path:Path) : File = { if (path.toUri.getScheme == "jar") { resource(path.toUri) @@ -43,7 +50,6 @@ case class FileSystem(conf:Configuration) { if (uri.getScheme == null && path.isAbsolute) { val p = new Path(fs.getScheme, uri.getAuthority, uri.getPath) HadoopFile(fs, p) - } else { HadoopFile(fs, path) @@ -84,11 +90,11 @@ case class FileSystem(conf:Configuration) { val url = Resources.getURL(path) if (url == null) throw new NoSuchFileException(s"Resource '$path' not found") - val uri = url.toURI - resource(uri) + resource(url.toURI) } def resource(uri:URI) : File = { if (uri.getScheme == "jar") { + // Ensure JAR is opened as a file system try { java.nio.file.FileSystems.getFileSystem(uri) } @@ -96,10 +102,21 @@ case class FileSystem(conf:Configuration) { case _: FileSystemNotFoundException => java.nio.file.FileSystems.newFileSystem(uri, Collections.emptyMap[String, String]()) } - JavaFile(Paths.get(uri)) + + // Remove trailing "/", this is only present in Java 1.8 + val str = uri.toString + val lastEx = str.lastIndexOf("!") + val lastSep = str.lastIndexOf(SEPARATOR) + if (lastSep == str.length - 1 && lastSep > lastEx + 1 && lastEx > 0) { + JavaFile(Paths.get(new URI(str.dropRight(1)))) + } + else { + JavaFile(Paths.get(uri)) + } } else { JavaFile(Paths.get(uri)) } + } } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala index 7c92e2e43..b12df068c 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala @@ -63,7 +63,7 @@ case class JavaFile(jpath:Path) extends File { override def name : String = { val n = jpath.getFileName if (n != null) { - // Remove trailing "/". Required for Java.18 (not Java 11) + // Remove trailing "/". 
Required for Java 1.8 (not Java 11) val sep = jpath.getFileSystem.getSeparator.head n.toString.takeWhile(_ != sep) } else { diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index 151f5b1a3..30dc76760 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -618,6 +618,7 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + // TODO val dir2 = fs.file(Resources.getURL("org/apache/spark/").toURI) dir2.uri should be(Resources.getURL("org/apache/spark").toURI) dir2.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) @@ -649,6 +650,7 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + // TODO val dir2 = fs.file(new Path(Resources.getURL("org/apache/spark/").toURI)) dir2.uri should be(Resources.getURL("org/apache/spark").toURI) dir2.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) @@ -680,6 +682,7 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { dir.isAbsolute() should be(true) dir.isDirectory() should be(true) + // TODO val dir2 = fs.file(Resources.getURL("org/apache/spark/").toString) dir2.uri should be(Resources.getURL("org/apache/spark").toURI) dir2.path should be(new Path(Resources.getURL("org/apache/spark").toURI)) From d00f4d1ea15225231fdb5418b13e288e05ced5a0 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 7 Nov 2022 08:39:03 +0100 Subject: [PATCH 43/52] github-273 Add some more tests for special characters in paths --- .../com/dimajix/flowman/fs/FileSystem.scala | 13 ++- .../com/dimajix/flowman/fs/JavaFile.scala | 4 +- .../dimajix/flowman/fs/FileSystemTest.scala | 87 ++++++++++++++----- 3 files changed, 76 insertions(+), 28 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala index 17defd77f..9d26feb54 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala @@ -57,9 +57,16 @@ case class FileSystem(conf:Configuration) { } } def file(path:String) : File = { - val uri = new URI(path) - if (uri.getScheme == "jar") { - resource(uri) + // parse uri scheme, if any + var scheme:String = null + val colon = path.indexOf(':') + val slash = path.indexOf('/') + if ((colon != -1) && ((slash == -1) || (colon < slash))) { // has a scheme + scheme = path.substring(0, colon) + } + + if (scheme == "jar") { + resource(new URI(path)) } else { file(new Path(path)) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala index b12df068c..bd3ad459c 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala @@ -31,11 +31,9 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs -import com.dimajix.common.text.StringUtils - case class JavaFile(jpath:Path) extends File { - override def toString: String = uri.toString + override def toString: String = "file:" + jpath.toString override def path: fs.Path = new fs.Path(uri) 
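A side note on the hand-rolled scheme detection in FileSystem.file(path:String) above: the previous implementation constructed a java.net.URI from the raw path first, which throws URISyntaxException for unencoded whitespace, exactly the kind of path the new "hour=2022-03-10 20:00:00" tests in this commit exercise. Below is a minimal, self-contained sketch of the same colon/slash scan (illustrative only, not part of the patch; the object and method names are invented for the example):

    object SchemeSniffExample {
      // A scheme is assumed to be present only if a ':' occurs before the first '/'.
      def scheme(path: String): Option[String] = {
        val colon = path.indexOf(':')
        val slash = path.indexOf('/')
        if (colon != -1 && (slash == -1 || colon < slash)) Some(path.substring(0, colon))
        else None
      }

      def main(args: Array[String]): Unit = {
        println(scheme("jar:file:/tmp/app.jar!/flowman.properties")) // Some(jar)
        println(scheme("hdfs://namenode/data"))                      // Some(hdfs)
        println(scheme("/tmp/hourly/hour=2022-03-10 20:00:00"))      // None => plain local path
        // By contrast, new java.net.URI("/tmp/hourly/hour=2022-03-10 20:00:00")
        // would throw URISyntaxException because of the unencoded space.
      }
    }
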
diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index 30dc76760..e2d3e2f89 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -34,6 +34,7 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val dir1 = fs.local(tempDir.toString) dir1.uri should be (tempDir.toURI) dir1.path should be (new Path(tempDir.toURI)) + dir1.toString should be ("file:" + tempDir.toString) //dir1.path should be (new Path(tempDir.toURI.toString)) dir1.exists() should be (true) dir1.isFile() should be (false) @@ -64,7 +65,8 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val dir = fs.local(tempDir.toURI) dir.uri should be(tempDir.toURI) dir.path should be(new Path(tempDir.toURI)) - //tmpFromUri.path should be(new Path(tempDir.toURI.toString)) + dir.toString + "/" should be (tempDir.toURI.toString) + //dir.path should be(new Path(tempDir.toURI.toString)) dir.exists() should be(true) dir.isFile() should be(false) dir.isDirectory() should be(true) @@ -91,21 +93,21 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { it should "be usable with Paths" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) - val tmpFromUri = fs.local(new Path(tempDir.toString)) - //tmpFromUri.path should be (new Path("file:" + tempDir.toString + "/")) - tmpFromUri.path should be(new Path(tempDir.toURI)) - tmpFromUri.uri should be(tempDir.toURI) - tmpFromUri.exists() should be(true) - tmpFromUri.isFile() should be(false) - tmpFromUri.isDirectory() should be(true) + val dir = fs.local(new Path(tempDir.toString)) + dir.path should be(new Path(tempDir.toURI)) + dir.uri should be(tempDir.toURI) + dir.toString should be ("file:" + tempDir.toString) + dir.exists() should be(true) + dir.isFile() should be(false) + dir.isDirectory() should be(true) - val dir = tmpFromUri / "lala" - dir.uri should be(new Path(new Path(tempDir.toURI), "lala").toUri) - dir.path should be(new Path(new Path(tempDir.toURI), "lala")) - dir.name should be("lala") - val file = dir / "lolo.tmp" - file.uri should be(new Path(dir.path, "lolo.tmp").toUri) - file.path should be(new Path(dir.path, "lolo.tmp")) + val dir1 = dir / "lala" + dir1.uri should be(new Path(new Path(tempDir.toURI), "lala").toUri) + dir1.path should be(new Path(new Path(tempDir.toURI), "lala")) + dir1.name should be("lala") + val file = dir1 / "lolo.tmp" + file.uri should be(new Path(dir1.path, "lolo.tmp").toUri) + file.path should be(new Path(dir1.path, "lolo.tmp")) file.name should be("lolo.tmp") val dir2 = fs.local(new Path(tempDir.toURI.resolve("lolo"))) @@ -130,13 +132,38 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { it should "be usable with Files" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) - val tmpFromUri = fs.local(tempDir) - tmpFromUri.uri should be (tempDir.toURI) - tmpFromUri.path should be(new Path(tempDir.toURI)) - //tmpFromUri.path should be(new Path(tempDir.toURI.toString)) - tmpFromUri.exists() should be (true) - tmpFromUri.isFile() should be (false) - tmpFromUri.isDirectory() should be (true) + val dir = fs.local(tempDir) + dir.uri should be (tempDir.toURI) + dir.path should be(new Path(tempDir.toURI)) + dir.toString should be ("file:" + tempDir.toString) + //dir.path should 
be(new Path(tempDir.toURI.toString)) + dir.exists() should be (true) + dir.isFile() should be (false) + dir.isDirectory() should be (true) + } + + it should "be usable with special characters and whitespaces (String)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.local("/tmp/hourly/hour=2022-03-10 20:00:00") + file.uri should be (new URI("file:/tmp/hourly/hour=2022-03-10%2020:00:00")) + file.toString should be ("file:/tmp/hourly/hour=2022-03-10 20:00:00") + } + + it should "be usable with special characters and whitespaces (Path)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.local(new Path("/tmp/hourly/hour=2022-03-10 20:00:00")) + file.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2020:00:00")) + file.toString should be("file:/tmp/hourly/hour=2022-03-10 20:00:00") + } + + it should "be usable with special characters and whitespaces (File)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.local(new java.io.File("/tmp/hourly/hour=2022-03-10 20:00:00")) + file.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2020:00:00")) + file.toString should be("file:/tmp/hourly/hour=2022-03-10 20:00:00") } it should "be usable relative paths (String)" in { @@ -300,6 +327,22 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { tmpFromUri.isDirectory() should be(true) } + it should "be usable with special characters and whitespaces (String)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file("/tmp/hourly/hour=2022-03-10 20:00:00") + file.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2020:00:00")) + file.toString should be("file:/tmp/hourly/hour=2022-03-10 20:00:00") + } + + it should "be usable with special characters and whitespaces (Path)" in { + val conf = spark.sparkContext.hadoopConfiguration + val fs = FileSystem(conf) + val file = fs.file(new Path("/tmp/hourly/hour=2022-03-10 20:00:00")) + file.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2020:00:00")) + file.toString should be("file:/tmp/hourly/hour=2022-03-10 20:00:00") + } + it should "be usable relative paths (String)" in { val conf = spark.sparkContext.hadoopConfiguration val fs = FileSystem(conf) From 724ebfd2cddf31b7fed9f07c135d72ba6dee253d Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 7 Nov 2022 11:36:46 +0100 Subject: [PATCH 44/52] github-274 Print Flowman configuration to console --- CHANGELOG.md | 2 ++ docs/releases.md | 2 ++ .../flowman/config/Configuration.scala | 4 +-- .../dimajix/flowman/config/FlowmanConf.scala | 2 +- .../flowman/execution/AbstractContext.scala | 6 ++-- .../dimajix/flowman/execution/Context.scala | 6 ---- .../flowman/execution/ProjectContext.scala | 7 ---- .../flowman/execution/RootContext.scala | 10 ++---- .../flowman/execution/ScopeContext.scala | 7 ---- .../dimajix/flowman/execution/Session.scala | 14 +++++--- .../flowman/execution/RootContextTest.scala | 32 ++----------------- 11 files changed, 24 insertions(+), 68 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index bd88f0997..8f06db9bc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,8 @@ * github-270: Upgrade Spark to 3.3.1 * github-271: Upgrade Delta to 2.1.1 * github-272: Create build profile for AWS EMR 6.8.0 +* github-273: Refactor file abstraction +* github-274: Print Flowman configuration to console # Version 0.28.0 - 2022-10-07 diff --git 
a/docs/releases.md b/docs/releases.md index 938b713ff..bc2188f4e 100644 --- a/docs/releases.md +++ b/docs/releases.md @@ -27,6 +27,8 @@ changes over time. * github-270: Upgrade Spark to 3.3.1 * github-271: Upgrade Delta to 2.1.1 * github-272: Create build profile for AWS EMR 6.8.0 +* github-273: Refactor file abstraction +* github-274: Print Flowman configuration to console ### Version 0.28.0 - 2022-10-07 diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/config/Configuration.scala b/flowman-core/src/main/scala/com/dimajix/flowman/config/Configuration.scala index 65c481f73..a9c1b76ac 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/config/Configuration.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/config/Configuration.scala @@ -22,7 +22,7 @@ import org.apache.spark.SparkConf import org.apache.spark.sql.SparkShim -class Configuration(userSettings:Map[String,String]) { +final class Configuration(userSettings:Map[String,String]) { private val systemSettings = System.getProperties.stringPropertyNames().asScala.filter(_.startsWith("flowman.")) .map(key => (key, System.getProperty(key))) @@ -37,7 +37,7 @@ class Configuration(userSettings:Map[String,String]) { /** * This variable contains Flowman configuration object */ - val flowmanConf:FlowmanConf = new FlowmanConf(flowmanSettings) + val flowmanConf:FlowmanConf = FlowmanConf(flowmanSettings) /** * Spark configuration also derived from all global settings */ diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/config/FlowmanConf.scala b/flowman-core/src/main/scala/com/dimajix/flowman/config/FlowmanConf.scala index dba973909..d991ecc18 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/config/FlowmanConf.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/config/FlowmanConf.scala @@ -175,7 +175,7 @@ object FlowmanConf { } -class FlowmanConf(settings:Map[String,String]) { +final case class FlowmanConf(settings:Map[String,String]) { import FlowmanConf._ settings.foreach{ case (key,value) => validateSetting(key, value) } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/AbstractContext.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/AbstractContext.scala index 71c3ff1ab..e839c2771 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/AbstractContext.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/AbstractContext.scala @@ -59,7 +59,7 @@ object AbstractContext { rawConfig.update(key, (value, settingLevel.level)) } else { - logger.info(s"Ignoring changing final config variable '$key=${currentValue._1}' to '$value'") + logger.debug(s"Ignoring changing final config variable $key='${currentValue._1}' to '$value'") } } @@ -69,7 +69,7 @@ object AbstractContext { rawEnvironment.update(key, (value, settingLevel.level)) } else { - logger.info(s"Ignoring changing final environment variable '$key=${currentValue._1}' to '$value'") + logger.debug(s"Ignoring changing final environment variable $key='${currentValue._1}' to '$value'") } } @@ -79,7 +79,7 @@ object AbstractContext { rawConnections.update(name, (connection, settingLevel.level)) } else { - logger.info(s"Ignoring changing final database $name") + logger.debug(s"Ignoring changing final connection '$name'") } } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Context.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Context.scala index d3a274fa1..756589915 100644 --- 
a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Context.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Context.scala @@ -200,12 +200,6 @@ abstract class Context { @throws[NoSuchTemplateException] def getTemplate(identifier: TemplateIdentifier): Template[_] - /** - * Returns the list of active profile names - * @return - */ - def profiles : Set[String] - /** * Returns all configuration options as a key-value map * diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/ProjectContext.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/ProjectContext.scala index f72ea0fff..49b284189 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/ProjectContext.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/ProjectContext.scala @@ -133,13 +133,6 @@ final class ProjectContext private[execution]( */ override def root : RootContext = parent.root - /** - * Returns the list of active profile names - * - * @return - */ - override def profiles: Set[String] = parent.profiles - /** * Returns a specific named Transform. The Transform can either be inside this Contexts project or in a different * project within the same namespace diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootContext.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootContext.scala index f1ac674d0..0bc5e60f8 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootContext.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/RootContext.scala @@ -154,13 +154,6 @@ final class RootContext private[execution]( */ override def root : RootContext = this - /** - * Returns the list of active profile names - * - * @return - */ - override def profiles: Set[String] = _profiles - /** * Returns a fully qualified mapping from a project belonging to the namespace of this execution * @@ -335,7 +328,8 @@ final class RootContext private[execution]( private def createProjectContext(project: Project) : Context = { val builder = ProjectContext.builder(this, project) - profiles.foreach { prof => + // Apply all selected profiles defined in the project + _profiles.foreach { prof => project.profiles.get(prof).foreach { profile => builder.withProfile(profile) } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/ScopeContext.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/ScopeContext.scala index d9a59ac45..6498807de 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/ScopeContext.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/ScopeContext.scala @@ -134,13 +134,6 @@ final class ScopeContext( */ override def root: RootContext = parent.root - /** - * Returns the list of active profile names - * - * @return - */ - override def profiles: Set[String] = parent.profiles - @throws[InstantiateConnectionFailedException] @throws[NoSuchConnectionException] override def getConnection(identifier: ConnectionIdentifier): Connection = { diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala index a2540530d..71e4635b6 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala @@ -390,7 +390,7 @@ class Session private[execution]( val logFilters = LogFilter.filters spark.conf.getAll.toSeq.sortBy(_._1).foreach { keyValue => 
logFilters.foldLeft(Option(keyValue))((kv, f) => kv.flatMap(kv => f.filterConfig(kv._1,kv._2))) - .foreach { case (key,value) => logger.info("Config: {} = {}", key: Any, value: Any) } + .foreach { case (key,value) => logger.info("Spark Config: {} = {}", key: Any, value: Any) } } // Copy all Spark configs over to SparkConf inside the Context @@ -419,7 +419,7 @@ class Session private[execution]( .withProjectResolver(loadProject) _namespace.foreach { ns => _profiles.foreach(p => ns.profiles.get(p).foreach { profile => - logger.info(s"Applying namespace profile $p") + logger.info(s"Activating namespace profile '$p'") builder.withProfile(profile) }) builder.withEnvironment(ns.environment) @@ -428,7 +428,7 @@ class Session private[execution]( _project.foreach { prj => // github-155: Apply project configuration to session _profiles.foreach(p => prj.profiles.get(p).foreach { profile => - logger.info(s"Applying project profile $p") + logger.info(s"Activating project profile '$p'") builder.withConfig(profile.config, SettingLevel.PROJECT_PROFILE) }) builder.withConfig(prj.config, SettingLevel.PROJECT_SETTING) @@ -437,7 +437,7 @@ class Session private[execution]( } private lazy val _configuration : Configuration = { - if (_project.nonEmpty) { + val conf = if (_project.nonEmpty) { logger.info("Using project specific configuration settings") getContext(_project.get).config } @@ -445,6 +445,12 @@ class Session private[execution]( logger.info("Using global configuration settings") context.config } + + conf.flowmanConf.getAll.foreach { case(key,value) => + logger.info("Flowman Config: {} = {}", key: Any, value: Any) + } + + conf } private lazy val _catalog = { diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/execution/RootContextTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/execution/RootContextTest.scala index 518c82c61..918126aa3 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/execution/RootContextTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/execution/RootContextTest.scala @@ -1,5 +1,5 @@ /* - * Copyright 2021 Kaya Kupferschmidt + * Copyright 2021-2022 Kaya Kupferschmidt * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -35,35 +35,7 @@ import com.dimajix.flowman.types.StringType class RootContextTest extends AnyFlatSpec with Matchers with MockFactory { - "The RootContext" should "apply profiles" in { - val namespace = Namespace( - name = "default", - profiles = Map( - "profile" -> Profile(name="profile") - ) - ) - val project = Project( - name = "my_project", - profiles = Map( - "profile" -> Profile(name="profile") - ) - ) - - val session = Session.builder() - .withNamespace(namespace) - .withProfile("profile") - .withProfile("profile2") - .disableSpark() - .build() - - val rootContext = session.context - rootContext.profiles should be (Set("profile", "profile2")) - - val projectContext = session.getContext(project) - projectContext.profiles should be (Set("profile", "profile2")) - } - - it should "correctly lookup connections in profiles" in { + "The RootContext" should "correctly lookup connections in profiles" in { val namespaceConnectionTemplate = mock[Prototype[Connection]] val namespaceConnection = mock[Connection] val namespaceProfileConnectionTemplate = mock[Prototype[Connection]] From 191cd78d0399249a17c94922c44867ac45a7c0f6 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 7 Nov 2022 13:47:23 +0100 Subject: [PATCH 45/52] github-274 Sort Flowman configuration before printing --- .../dimajix/flowman/execution/Session.scala | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala index 71e4635b6..fc11ed022 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala @@ -388,10 +388,12 @@ class Session private[execution]( // Log all config properties val logFilters = LogFilter.filters - spark.conf.getAll.toSeq.sortBy(_._1).foreach { keyValue => - logFilters.foldLeft(Option(keyValue))((kv, f) => kv.flatMap(kv => f.filterConfig(kv._1,kv._2))) - .foreach { case (key,value) => logger.info("Spark Config: {} = {}", key: Any, value: Any) } - } + spark.conf.getAll.toSeq + .sortBy(_._1) + .foreach { keyValue => + logFilters.foldLeft(Option(keyValue))((kv, f) => kv.flatMap(kv => f.filterConfig(kv._1,kv._2))) + .foreach { case (key,value) => logger.info("Spark Config: {} = {}", key: Any, value: Any) } + } // Copy all Spark configs over to SparkConf inside the Context sparkConf.setAll(spark.conf.getAll) @@ -446,9 +448,11 @@ class Session private[execution]( context.config } - conf.flowmanConf.getAll.foreach { case(key,value) => - logger.info("Flowman Config: {} = {}", key: Any, value: Any) - } + conf.flowmanConf.getAll.toSeq + .sortBy(_._1) + .foreach { case(key,value) => + logger.info("Flowman Config: {} = {}", key: Any, value: Any) + } conf } From 53c0345283524976bffc88f4c9b22f3818b96124 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 7 Nov 2022 13:55:41 +0100 Subject: [PATCH 46/52] github-274 Beautified dump of Flowman/Spark configuration --- .../main/scala/com/dimajix/flowman/execution/Runner.scala | 2 +- .../main/scala/com/dimajix/flowman/execution/Session.scala | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Runner.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Runner.scala index 94be13783..0860d3f81 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Runner.scala +++ 
b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Runner.scala @@ -131,7 +131,7 @@ private[execution] sealed class RunnerImpl { logger.info("Environment:") context.environment.toSeq.sortBy(_._1).foreach { case (key,value) => LogFilter.filter(logFilters, key, value.toString) - .foreach { case (key,value) => logger.info(s" $key=$value") } + .foreach { case (key,value) => logger.info(s" $key = $value") } } logger.info("") } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala index fc11ed022..0f34f51df 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/Session.scala @@ -387,12 +387,13 @@ class Session private[execution]( sparkJars.foreach(spark.sparkContext.addJar) // Log all config properties + logger.info("Spark configuration:") val logFilters = LogFilter.filters spark.conf.getAll.toSeq .sortBy(_._1) .foreach { keyValue => logFilters.foldLeft(Option(keyValue))((kv, f) => kv.flatMap(kv => f.filterConfig(kv._1,kv._2))) - .foreach { case (key,value) => logger.info("Spark Config: {} = {}", key: Any, value: Any) } + .foreach { case (key,value) => logger.info(" {} = {}", key: Any, value: Any) } } // Copy all Spark configs over to SparkConf inside the Context @@ -448,10 +449,12 @@ class Session private[execution]( context.config } + // Log Flowman configuration + logger.info("Flowman configuration:") conf.flowmanConf.getAll.toSeq .sortBy(_._1) .foreach { case(key,value) => - logger.info("Flowman Config: {} = {}", key: Any, value: Any) + logger.info(" {} = {}", key: Any, value: Any) } conf From ce5b95995fe02412ed9acd3e3f74a682e3413210 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Mon, 7 Nov 2022 19:40:16 +0100 Subject: [PATCH 47/52] Fix stupid typo in 'putFile' target --- .../scala/com/dimajix/flowman/spec/target/PutFileTarget.scala | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/PutFileTarget.scala b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/PutFileTarget.scala index 23895a21c..bcb6f73b7 100644 --- a/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/PutFileTarget.scala +++ b/flowman-spec/src/main/scala/com/dimajix/flowman/spec/target/PutFileTarget.scala @@ -146,14 +146,14 @@ case class PutFileTarget( class PutFileTargetSpec extends TargetSpec { @JsonProperty(value = "source", required = true) private var source: String = "" - @JsonProperty(value = "tarPut", required = true) private var tarPut: String = "" + @JsonProperty(value = "target", required = true) private var target: String = "" @JsonProperty(value = "overwrite", required = false) private var overwrite: String = "true" override def instantiate(context: Context, properties:Option[Target.Properties] = None): PutFileTarget = { PutFileTarget( instanceProperties(context, properties), new Path(context.evaluate(source)), - new Path(context.evaluate(tarPut)), + new Path(context.evaluate(target)), context.evaluate(overwrite).toBoolean ) } From acf0cefeaa3701a716a22d5097091d094179a28e Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 8 Nov 2022 08:29:39 +0100 Subject: [PATCH 48/52] Fix parallel mapping processing --- .../flowman/execution/CachingExecution.scala | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git 
a/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala index d7a42217b..b7c2a6aac 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala @@ -16,7 +16,6 @@ package com.dimajix.flowman.execution -import scala.collection.concurrent.TrieMap import scala.collection.mutable import scala.collection.parallel.ForkJoinTaskSupport import scala.collection.parallel.TaskSupport @@ -24,7 +23,6 @@ import scala.concurrent.Await import scala.concurrent.Future import scala.concurrent.Promise import scala.concurrent.duration.Duration -import scala.concurrent.forkjoin.ForkJoinPool import scala.util.Try import scala.util.control.NonFatal @@ -184,10 +182,11 @@ abstract class CachingExecution(parent:Option[Execution], isolated:Boolean) exte private def describeMapping(mapping:Mapping) : Map[String,StructType] = { val context = mapping.context - val deps = if (parallelism > 1 ) { - val inputs = mapping.inputs.par - inputs.tasksupport = taskSupport - inputs.map(id => id -> describe(context.getMapping(id.mapping), id.output)) + val inputs = mapping.inputs.toSeq + val deps = if (inputs.size > 1 && parallelism > 1 ) { + val parInputs = inputs.par + parInputs.tasksupport = taskSupport + parInputs.map(id => id -> describe(context.getMapping(id.mapping), id.output)) .seq .toMap } @@ -333,13 +332,14 @@ abstract class CachingExecution(parent:Option[Execution], isolated:Boolean) exte } val dependencies = { - if (parallelism > 1) { - val inputs = mapping.inputs.par - inputs.tasksupport = taskSupport - inputs.map(dep).seq.toMap + val inputs = mapping.inputs.toSeq + if (inputs.size > 1 && parallelism > 1) { + val parInputs = inputs.par + parInputs.tasksupport = taskSupport + parInputs.map(dep).seq.toMap } else { - mapping.inputs.map(dep).toMap + inputs.map(dep).toMap } } From d85733d65249baf4728806f405ca996ccf5cb1d4 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 8 Nov 2022 09:21:58 +0100 Subject: [PATCH 49/52] Fix unittests --- .../scala/com/dimajix/flowman/execution/CachingExecution.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala b/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala index b7c2a6aac..6e31db8ea 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/execution/CachingExecution.scala @@ -191,7 +191,7 @@ abstract class CachingExecution(parent:Option[Execution], isolated:Boolean) exte .toMap } else { - mapping.inputs + inputs .map(id => id -> describe(context.getMapping(id.mapping), id.output)) .toMap } From 33a2a79e784c325740bd9e35ba65d4397a32a5b3 Mon Sep 17 00:00:00 2001 From: Kaya Kupferschmidt Date: Tue, 8 Nov 2022 18:12:30 +0100 Subject: [PATCH 50/52] github-273 Fix filesystem abstraction on Windows systems --- .../com/dimajix/flowman/fs/FileSystem.scala | 1 + .../com/dimajix/flowman/fs/HadoopFile.scala | 2 +- .../com/dimajix/flowman/fs/JavaFile.scala | 18 +++++++++++++++--- .../dimajix/flowman/fs/FileSystemTest.scala | 19 ++++++++++++++++++- .../com/dimajix/flowman/fs/JavaFileTest.scala | 8 +++++++- 5 files changed, 42 insertions(+), 6 deletions(-) diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala 
b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala index 9d26feb54..bea29ee7e 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/FileSystem.scala @@ -31,6 +31,7 @@ import com.dimajix.flowman.fs.FileSystem.SEPARATOR object FileSystem { val SEPARATOR = "/" + val WINDOWS: Boolean = System.getProperty("os.name").startsWith("Windows") } diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala index c0e0bf02c..7c58920d2 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/HadoopFile.scala @@ -34,7 +34,7 @@ import org.apache.hadoop.io.IOUtils * @param fs * @param path */ -case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends File { +final case class HadoopFile(fs:org.apache.hadoop.fs.FileSystem, path:Path) extends File { override def uri : URI = path.toUri /** diff --git a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala index bd3ad459c..e88993b52 100644 --- a/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala +++ b/flowman-core/src/main/scala/com/dimajix/flowman/fs/JavaFile.scala @@ -25,6 +25,7 @@ import java.nio.file.Paths import java.nio.file.StandardOpenOption import java.util.Comparator import java.util.function.Consumer +import java.util.regex.Pattern import java.util.stream.Collectors import scala.collection.JavaConverters._ @@ -32,8 +33,19 @@ import scala.collection.JavaConverters._ import org.apache.hadoop.fs -case class JavaFile(jpath:Path) extends File { - override def toString: String = "file:" + jpath.toString +object JavaFile { + private val HAS_DRIVE_LETTER_SPECIFIER = Pattern.compile("^/?[a-zA-Z]:") + private def hasWindowsDrive(path: String) = FileSystem.WINDOWS && HAS_DRIVE_LETTER_SPECIFIER.matcher(path).find +} + +final case class JavaFile(jpath:Path) extends File { + override def toString: String = { + val rawPath = jpath.toString + if (JavaFile.hasWindowsDrive(rawPath)) + "file:/" + rawPath + else + "file:" + rawPath + } override def path: fs.Path = new fs.Path(uri) @@ -105,7 +117,7 @@ case class JavaFile(jpath:Path) extends File { .collect(Collectors.toList[Path]) .asScala .sortBy(_.toString) - .map(JavaFile) + .map(JavaFile.apply) override def glob(pattern: String): Seq[File] = { val stream = Files.newDirectoryStream(jpath, pattern) diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala index e2d3e2f89..4d07d9c8c 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/FileSystemTest.scala @@ -35,7 +35,6 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { dir1.uri should be (tempDir.toURI) dir1.path should be (new Path(tempDir.toURI)) dir1.toString should be ("file:" + tempDir.toString) - //dir1.path should be (new Path(tempDir.toURI.toString)) dir1.exists() should be (true) dir1.isFile() should be (false) dir1.isDirectory() should be (true) @@ -147,7 +146,13 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val fs = FileSystem(conf) val file = fs.local("/tmp/hourly/hour=2022-03-10 20:00:00") file.uri should be (new 
URI("file:/tmp/hourly/hour=2022-03-10%2020:00:00")) + file.path should be (new Path("file:/tmp/hourly/hour=2022-03-10 20:00:00")) file.toString should be ("file:/tmp/hourly/hour=2022-03-10 20:00:00") + + val file2 = fs.local("/tmp/hourly/hour=2022-03-10%20:00:00") + file2.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2520:00:00")) + file2.path should be (new Path("file:/tmp/hourly/hour=2022-03-10%20:00:00")) + file2.toString should be("file:/tmp/hourly/hour=2022-03-10%20:00:00") } it should "be usable with special characters and whitespaces (Path)" in { @@ -155,7 +160,13 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val fs = FileSystem(conf) val file = fs.local(new Path("/tmp/hourly/hour=2022-03-10 20:00:00")) file.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2020:00:00")) + file.path should be (new Path("file:/tmp/hourly/hour=2022-03-10 20:00:00")) file.toString should be("file:/tmp/hourly/hour=2022-03-10 20:00:00") + + val file2 = fs.local(new Path("/tmp/hourly/hour=2022-03-10%20:00:00")) + file2.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2520:00:00")) + file2.path should be (new Path("file:/tmp/hourly/hour=2022-03-10%20:00:00")) + file2.toString should be("file:/tmp/hourly/hour=2022-03-10%20:00:00") } it should "be usable with special characters and whitespaces (File)" in { @@ -163,7 +174,13 @@ class FileSystemTest extends AnyFlatSpec with Matchers with LocalSparkSession { val fs = FileSystem(conf) val file = fs.local(new java.io.File("/tmp/hourly/hour=2022-03-10 20:00:00")) file.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2020:00:00")) + file.path should be (new Path("file:/tmp/hourly/hour=2022-03-10 20:00:00")) file.toString should be("file:/tmp/hourly/hour=2022-03-10 20:00:00") + + val file2 = fs.local(new java.io.File("/tmp/hourly/hour=2022-03-10%20:00:00")) + file2.uri should be(new URI("file:/tmp/hourly/hour=2022-03-10%2520:00:00")) + file2.path should be (new Path("file:/tmp/hourly/hour=2022-03-10%20:00:00")) + file2.toString should be("file:/tmp/hourly/hour=2022-03-10%20:00:00") } it should "be usable relative paths (String)" in { diff --git a/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala b/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala index b21097131..a64db4070 100644 --- a/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala +++ b/flowman-core/src/test/scala/com/dimajix/flowman/fs/JavaFileTest.scala @@ -34,6 +34,7 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { val dir = JavaFile(tempDir.toPath) dir.uri should be (tempDir.toURI) dir.path should be (new fs.Path(tempDir.toURI)) + dir.toString should be ("file:" + tempDir.toString) dir.exists() should be (true) dir.isFile() should be (false) dir.isDirectory() should be (true) @@ -44,6 +45,8 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { val file = dir / "lala" file.name should be ("lala") file.uri should be (tempDir.toURI.resolve("lala")) + file.path should be (new Path(tempDir.toURI.resolve("lala"))) + file.toString should be ("file:" + new java.io.File(tempDir, "lala").toString) file.exists() should be(false) file.isFile() should be(false) file.isDirectory() should be(false) @@ -59,19 +62,22 @@ class JavaFileTest extends AnyFlatSpec with Matchers with LocalTempDir { dir.name should be ("") dir.uri should be(new URI("file:/")) dir.path should be(new fs.Path("file:/")) + dir.toString should be ("file:/") dir.exists() should 
         dir.isFile() should be(false)
         dir.isDirectory() should be(true)
         dir.isAbsolute() should be(true)
 
         val file = dir / "lala"
+        file.parent should be (dir)
         file.name should be ("lala")
+        file.uri should be(new URI("file:/lala"))
+        file.path should be(new fs.Path("file:/lala"))
         file.exists() should be(false)
         file.isFile() should be(false)
         file.isDirectory() should be(false)
         file.isAbsolute() should be(true)
         file.parent should be(dir)
-        file.name should be("lala")
         file.withName("lolo") should be(dir / "lolo")
     }
 

From 6aa6c772951a9a5c3f020f8f369d73acef793d21 Mon Sep 17 00:00:00 2001
From: Kaya Kupferschmidt
Date: Tue, 8 Nov 2022 18:30:36 +0100
Subject: [PATCH 51/52] Prepare release

---
 CHANGELOG.md     | 2 +-
 docs/releases.md | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 8f06db9bc..96509e40d 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-# Version 0.29.0
+# Version 0.29.0 - 2022-11-08
 
 * github-260: Remove hive-storage-api from several plugins and lib
 * github-261: Add descriptions to all pom.xml
diff --git a/docs/releases.md b/docs/releases.md
index bc2188f4e..d2882293f 100644
--- a/docs/releases.md
+++ b/docs/releases.md
@@ -14,7 +14,7 @@ The following gives an (incomplete) list of past releases of the last 12 months
 changes over time.
 
 
-### Version 0.29.0
+### Version 0.29.0 - 2022-11-08
 
 * github-260: Remove hive-storage-api from several plugins and lib
 * github-261: Add descriptions to all pom.xml

From 3024e78a3f7a106ea5685035490760ab948255b6 Mon Sep 17 00:00:00 2001
From: Kaya Kupferschmidt
Date: Tue, 8 Nov 2022 19:04:53 +0100
Subject: [PATCH 52/52] Update versions for release

---
 docker/pom.xml                       | 2 +-
 flowman-archetype-quickstart/pom.xml | 2 +-
 flowman-client/pom.xml               | 2 +-
 flowman-common/pom.xml               | 2 +-
 flowman-core/pom.xml                 | 2 +-
 flowman-dist/pom.xml                 | 2 +-
 flowman-dsl/pom.xml                  | 2 +-
 flowman-hub/pom.xml                  | 2 +-
 flowman-parent/pom.xml               | 2 +-
 flowman-plugins/aws/pom.xml          | 2 +-
 flowman-plugins/azure/pom.xml        | 2 +-
 flowman-plugins/delta/pom.xml        | 2 +-
 flowman-plugins/impala/pom.xml       | 2 +-
 flowman-plugins/json/pom.xml         | 2 +-
 flowman-plugins/kafka/pom.xml        | 2 +-
 flowman-plugins/mariadb/pom.xml      | 2 +-
 flowman-plugins/mssqlserver/pom.xml  | 2 +-
 flowman-plugins/mysql/pom.xml        | 2 +-
 flowman-plugins/openapi/pom.xml      | 2 +-
 flowman-plugins/oracle/pom.xml       | 2 +-
 flowman-plugins/postgresql/pom.xml   | 2 +-
 flowman-plugins/sftp/pom.xml         | 2 +-
 flowman-plugins/swagger/pom.xml      | 2 +-
 flowman-scalatest-compat/pom.xml     | 2 +-
 flowman-server-ui/pom.xml            | 2 +-
 flowman-server/pom.xml               | 2 +-
 flowman-spark-dependencies/pom.xml   | 2 +-
 flowman-spark-extensions/pom.xml     | 2 +-
 flowman-spark-testing/pom.xml        | 2 +-
 flowman-spec/pom.xml                 | 2 +-
 flowman-studio-ui/pom.xml            | 2 +-
 flowman-studio/pom.xml               | 2 +-
 flowman-testing/pom.xml              | 2 +-
 flowman-tools/pom.xml                | 2 +-
 flowman-yaml-schema/pom.xml          | 2 +-
 pom.xml                              | 2 +-
 36 files changed, 36 insertions(+), 36 deletions(-)

diff --git a/docker/pom.xml b/docker/pom.xml
index 6c5dd8add..ae0f4fda8 100644
--- a/docker/pom.xml
+++ b/docker/pom.xml
@@ -11,7 +11,7 @@
 
     <parent>
         <groupId>com.dimajix.flowman</groupId>
         <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
         <relativePath>../pom.xml</relativePath>
     </parent>
diff --git a/flowman-archetype-quickstart/pom.xml b/flowman-archetype-quickstart/pom.xml
index 91cff224d..76827a3c5 100644
--- a/flowman-archetype-quickstart/pom.xml
+++ b/flowman-archetype-quickstart/pom.xml
@@ -9,7 +9,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-client/pom.xml b/flowman-client/pom.xml
index 0c6599a04..ae55c86a6 100644
--- a/flowman-client/pom.xml
+++ b/flowman-client/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-common/pom.xml b/flowman-common/pom.xml
index f4dfe0349..578b13c5a 100644
--- a/flowman-common/pom.xml
+++ b/flowman-common/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-core/pom.xml b/flowman-core/pom.xml
index 7b75b2b38..e13898764 100644
--- a/flowman-core/pom.xml
+++ b/flowman-core/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-dist/pom.xml b/flowman-dist/pom.xml
index e01bb1fd8..709d43797 100644
--- a/flowman-dist/pom.xml
+++ b/flowman-dist/pom.xml
@@ -11,7 +11,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-dsl/pom.xml b/flowman-dsl/pom.xml
index 714a87c99..39737e183 100644
--- a/flowman-dsl/pom.xml
+++ b/flowman-dsl/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <artifactId>flowman-root</artifactId>
        <groupId>com.dimajix.flowman</groupId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-hub/pom.xml b/flowman-hub/pom.xml
index b5721eef0..804445e62 100644
--- a/flowman-hub/pom.xml
+++ b/flowman-hub/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <artifactId>flowman-root</artifactId>
        <groupId>com.dimajix.flowman</groupId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-parent/pom.xml b/flowman-parent/pom.xml
index 45b2f27ae..7caa07c0e 100644
--- a/flowman-parent/pom.xml
+++ b/flowman-parent/pom.xml
@@ -11,7 +11,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/aws/pom.xml b/flowman-plugins/aws/pom.xml
index 481eab374..e2a168558 100644
--- a/flowman-plugins/aws/pom.xml
+++ b/flowman-plugins/aws/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/azure/pom.xml b/flowman-plugins/azure/pom.xml
index 7edfbf405..7d4bdab76 100644
--- a/flowman-plugins/azure/pom.xml
+++ b/flowman-plugins/azure/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/delta/pom.xml b/flowman-plugins/delta/pom.xml
index 12ba400cb..d3cd7ef1f 100644
--- a/flowman-plugins/delta/pom.xml
+++ b/flowman-plugins/delta/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/impala/pom.xml b/flowman-plugins/impala/pom.xml
index b096d6eac..e54330e2d 100644
--- a/flowman-plugins/impala/pom.xml
+++ b/flowman-plugins/impala/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/json/pom.xml b/flowman-plugins/json/pom.xml
index b03792a45..214d85003 100644
--- a/flowman-plugins/json/pom.xml
+++ b/flowman-plugins/json/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/kafka/pom.xml b/flowman-plugins/kafka/pom.xml
index 57ec5ce51..977eda0d5 100644
--- a/flowman-plugins/kafka/pom.xml
+++ b/flowman-plugins/kafka/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/mariadb/pom.xml b/flowman-plugins/mariadb/pom.xml
index cae9532ca..e7bcb84e2 100644
--- a/flowman-plugins/mariadb/pom.xml
+++ b/flowman-plugins/mariadb/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/mssqlserver/pom.xml b/flowman-plugins/mssqlserver/pom.xml
index 6ea85ec22..7e078fe07 100644
--- a/flowman-plugins/mssqlserver/pom.xml
+++ b/flowman-plugins/mssqlserver/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/mysql/pom.xml b/flowman-plugins/mysql/pom.xml
index 765fd3f84..04c0dd26e 100644
--- a/flowman-plugins/mysql/pom.xml
+++ b/flowman-plugins/mysql/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/openapi/pom.xml b/flowman-plugins/openapi/pom.xml
index 8194d62ee..48a9787f9 100644
--- a/flowman-plugins/openapi/pom.xml
+++ b/flowman-plugins/openapi/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/oracle/pom.xml b/flowman-plugins/oracle/pom.xml
index 0476d1a7d..9fb72cab7 100644
--- a/flowman-plugins/oracle/pom.xml
+++ b/flowman-plugins/oracle/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/postgresql/pom.xml b/flowman-plugins/postgresql/pom.xml
index 038b80e32..890234f7d 100644
--- a/flowman-plugins/postgresql/pom.xml
+++ b/flowman-plugins/postgresql/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/sftp/pom.xml b/flowman-plugins/sftp/pom.xml
index ab0383d0d..363655989 100644
--- a/flowman-plugins/sftp/pom.xml
+++ b/flowman-plugins/sftp/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-plugins/swagger/pom.xml b/flowman-plugins/swagger/pom.xml
index 5ca38c4cf..56a654122 100644
--- a/flowman-plugins/swagger/pom.xml
+++ b/flowman-plugins/swagger/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../../pom.xml</relativePath>
    </parent>
diff --git a/flowman-scalatest-compat/pom.xml b/flowman-scalatest-compat/pom.xml
index 3aabc86ee..fec21902a 100644
--- a/flowman-scalatest-compat/pom.xml
+++ b/flowman-scalatest-compat/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-server-ui/pom.xml b/flowman-server-ui/pom.xml
index 37fb5cd0f..351cfab04 100644
--- a/flowman-server-ui/pom.xml
+++ b/flowman-server-ui/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-server/pom.xml b/flowman-server/pom.xml
index e488d0fee..ac559cbf8 100644
--- a/flowman-server/pom.xml
+++ b/flowman-server/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <artifactId>flowman-root</artifactId>
        <groupId>com.dimajix.flowman</groupId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-spark-dependencies/pom.xml b/flowman-spark-dependencies/pom.xml
index 1ca5c0c56..feb7aeeb0 100644
--- a/flowman-spark-dependencies/pom.xml
+++ b/flowman-spark-dependencies/pom.xml
@@ -15,7 +15,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-spark-extensions/pom.xml b/flowman-spark-extensions/pom.xml
index 86627a0a9..8694a52b7 100644
--- a/flowman-spark-extensions/pom.xml
+++ b/flowman-spark-extensions/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-spark-testing/pom.xml b/flowman-spark-testing/pom.xml
index 4f26d6c08..f67fabb40 100644
--- a/flowman-spark-testing/pom.xml
+++ b/flowman-spark-testing/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-spec/pom.xml b/flowman-spec/pom.xml
index cc5417ba7..7e621f281 100644
--- a/flowman-spec/pom.xml
+++ b/flowman-spec/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <artifactId>flowman-root</artifactId>
        <groupId>com.dimajix.flowman</groupId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-studio-ui/pom.xml b/flowman-studio-ui/pom.xml
index f541c1132..776334155 100644
--- a/flowman-studio-ui/pom.xml
+++ b/flowman-studio-ui/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-studio/pom.xml b/flowman-studio/pom.xml
index d2b7cf0ec..d0998ad98 100644
--- a/flowman-studio/pom.xml
+++ b/flowman-studio/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <artifactId>flowman-root</artifactId>
        <groupId>com.dimajix.flowman</groupId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-testing/pom.xml b/flowman-testing/pom.xml
index cceedf4e6..8ce559c05 100644
--- a/flowman-testing/pom.xml
+++ b/flowman-testing/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-tools/pom.xml b/flowman-tools/pom.xml
index 4a23156aa..7bf273a01 100644
--- a/flowman-tools/pom.xml
+++ b/flowman-tools/pom.xml
@@ -10,7 +10,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/flowman-yaml-schema/pom.xml b/flowman-yaml-schema/pom.xml
index 7a404c2a4..bb3309410 100644
--- a/flowman-yaml-schema/pom.xml
+++ b/flowman-yaml-schema/pom.xml
@@ -11,7 +11,7 @@
 
    <parent>
        <groupId>com.dimajix.flowman</groupId>
        <artifactId>flowman-root</artifactId>
-        <version>0.29.0-SNAPSHOT</version>
+        <version>0.29.0</version>
        <relativePath>../pom.xml</relativePath>
    </parent>
diff --git a/pom.xml b/pom.xml
index b73310694..48711b31a 100644
--- a/pom.xml
+++ b/pom.xml
@@ -5,7 +5,7 @@
 
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.dimajix.flowman</groupId>
    <artifactId>flowman-root</artifactId>
-    <version>0.29.0-SNAPSHOT</version>
+    <version>0.29.0</version>
    <packaging>pom</packaging>
    <name>Flowman root pom</name>
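
The drive-letter handling introduced in JavaFile.toString above is the one piece of logic in this
series that is easy to misread in diff form, so here is a minimal, self-contained sketch of the same
idea. The object and method names (WindowsDriveDemo, toFileUriString) are illustrative only and do
not exist in the codebase, and the OS check is inlined where the real code consults
FileSystem.WINDOWS. The second half demonstrates the URI quoting behaviour that the new
FileSystemTest assertions pin down.

    import java.util.regex.Pattern

    object WindowsDriveDemo {
        // Same pattern as in the patch: an optional leading slash followed by
        // a drive letter and a colon, e.g. "C:" or "/C:".
        private val HAS_DRIVE_LETTER_SPECIFIER = Pattern.compile("^/?[a-zA-Z]:")

        // The real code additionally guards with FileSystem.WINDOWS; the OS
        // check is inlined here so the sketch stands alone.
        private def hasWindowsDrive(path: String): Boolean =
            System.getProperty("os.name").startsWith("Windows") &&
                HAS_DRIVE_LETTER_SPECIFIER.matcher(path).find()

        // A Windows drive path like "C:\Temp" lacks the leading "/" that a
        // file URI needs after "file:", so an extra slash is inserted; POSIX
        // paths already start with "/".
        def toFileUriString(rawPath: String): String =
            if (hasWindowsDrive(rawPath)) "file:/" + rawPath
            else "file:" + rawPath

        def main(args: Array[String]): Unit = {
            println(toFileUriString("/tmp/data"))  // POSIX:   file:/tmp/data
            println(toFileUriString("C:\\Temp"))   // Windows: file:/C:\Temp

            // The quoting asserted by the new tests: java.io.File.toURI
            // encodes a space as %20 and always encodes a literal '%' as %25,
            // so a path that already contains "%20" round-trips to "%2520".
            println(new java.io.File("/tmp/hour=2022-03-10 20:00:00").toURI)
            // -> file:/tmp/hour=2022-03-10%2020:00:00 (on a POSIX system)
            println(new java.io.File("/tmp/hour=2022-03-10%20:00:00").toURI)
            // -> file:/tmp/hour=2022-03-10%2520:00:00
        }
    }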