diff --git a/docker-compose.yml b/docker-compose.yml
index 2d1cb16..a9b9007 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,7 +1,7 @@
 version: "3.7"
 services:
   spark-master:
-    image: spydernaz/spark-master:latest
+    image: spark-master:latest
     ports:
       - "9090:8080"
       - "7077:7077"
@@ -11,7 +11,7 @@ services:
     environment:
       - "SPARK_LOCAL_IP=spark-master"
   spark-worker:
-    image: spydernaz/spark-worker:latest
+    image: spark-worker:latest
    depends_on:
      - spark-master
    environment:
@@ -23,3 +23,5 @@ services:
    volumes:
       - ./apps:/opt/spark-apps
       - ./data:/opt/spark-data
+    ports:
+      - "8081"
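Two things change in the compose file: the spydernaz/* Docker Hub images are replaced by bare local tags, so Compose no longer pulls prebuilt images, and each worker now publishes its 8081 web UI on an ephemeral host port so scaled-out workers do not collide on the host. A minimal sketch of the resulting workflow (the worker count is illustrative, and the image tags must already exist locally; see the build example at the end of this patch):

    docker-compose up -d --scale spark-worker=2
    # ask Compose which host port was mapped to a given worker's 8081
    docker-compose port --index=2 spark-worker 8081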
diff --git a/docker/base/Dockerfile b/docker/base/Dockerfile
index b2e60f2..814f8db 100644
--- a/docker/base/Dockerfile
+++ b/docker/base/Dockerfile
@@ -1,57 +1,34 @@
-FROM openjdk:8u212-b04-jdk-stretch
-LABEL author="Nathaniel Vala" email="nathanielvala@hotmail.com"
-LABEL version="0.2"
-
-ENV DAEMON_RUN=true
-ENV SPARK_VERSION=2.4.3
-ENV HADOOP_VERSION=2.7
-ENV SCALA_VERSION=2.12.4
-ENV SCALA_HOME=/usr/share/scala
-ENV SPARK_HOME=/spark
-
-
-RUN apt-get update && apt-get install -y curl vim wget software-properties-common ssh net-tools ca-certificates jq
-
-# apt update && apt -y upgrade \
-#    apt install -y wget ca-certificates && \
-#    apt install -y curl bash jq && \
-RUN cd "/tmp" && \
-    wget --no-verbose "https://downloads.typesafe.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz" && \
-    tar xzf "scala-${SCALA_VERSION}.tgz" && \
-    mkdir "${SCALA_HOME}" && \
-    rm "/tmp/scala-${SCALA_VERSION}/bin/"*.bat && \
-    mv "/tmp/scala-${SCALA_VERSION}/bin" "/tmp/scala-${SCALA_VERSION}/lib" "${SCALA_HOME}" && \
-    ln -s "${SCALA_HOME}/bin/"* "/usr/bin/" && \
-    rm -rf "/tmp/"*
-
-
-# RUN apk add --no-cache --virtual=.build-dependencies wget ca-certificates && \
-#    apk add --no-cache bash curl jq && \
-#    cd "/tmp" && \
-#    wget --no-verbose "https://downloads.typesafe.com/scala/${SCALA_VERSION}/scala-${SCALA_VERSION}.tgz" && \
-#    tar xzf "scala-${SCALA_VERSION}.tgz" && \
-#    mkdir "${SCALA_HOME}" && \
-#    rm "/tmp/scala-${SCALA_VERSION}/bin/"*.bat && \
-#    mv "/tmp/scala-${SCALA_VERSION}/bin" "/tmp/scala-${SCALA_VERSION}/lib" "${SCALA_HOME}" && \
-#    ln -s "${SCALA_HOME}/bin/"* "/usr/bin/" && \
-#    apk del .build-dependencies && \
-#    rm -rf "/tmp/"*
+FROM oraclelinux:8-slim
+ARG JAVA_VERSION=8.0.232-open
+ARG SCALA_VERSION=2.12.10
+ARG SBT_VERSION=1.3.7
+ARG SPARK_VERSION=2.4.4
+ARG HADOOP_VERSION=2.7
 
 
-# Add Dependencies for PySpark
-RUN apt-get install -y python3 python3-pip python3-numpy python3-matplotlib python3-scipy python3-pandas python3-simpy
-RUN update-alternatives --install "/usr/bin/python" "python" "$(which python3)" 1
-
-
-#Scala instalation
-RUN export PATH="/usr/local/sbt/bin:$PATH" && apt update && apt install ca-certificates wget tar && mkdir -p "/usr/local/sbt" && wget -qO - --no-check-certificate "https://github.com/sbt/sbt/releases/download/v1.2.8/sbt-1.2.8.tgz" | tar xz -C /usr/local/sbt --strip-components=1 && sbt sbtVersion
-
-RUN wget --no-verbose http://apache.mirror.iphh.net/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && tar -xvzf spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz \
-    && mv spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} spark \
-    && rm spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz
-
-
-
-# Fix the value of PYTHONHASHSEED
-# Note: this is needed when you use Python 3.3 or greater
-ENV PYTHONHASHSEED 1
\ No newline at end of file
+ENV DAEMON_RUN=true
+ENV PYTHONHASHSEED=1
+ENV SDKMAN_DIR=/usr/local/sdkman
+ENV JAVA_HOME=${SDKMAN_DIR}/candidates/java/current/
+ENV SCALA_HOME=${SDKMAN_DIR}/candidates/scala/current/
+ENV SBT_HOME=${SDKMAN_DIR}/candidates/sbt/current/
+ENV SPARK_HOME=/opt/spark
+
+RUN touch /etc/dnf/dnf.conf && microdnf update -y
+RUN microdnf install bash curl ca-certificates unzip zip tar gzip bzip2 which findutils python3 python3-pip -y && microdnf clean all -y
+RUN curl -s "https://get.sdkman.io" | bash
+RUN set -x \
+    && echo "sdkman_auto_answer=true" > $SDKMAN_DIR/etc/config \
+    && echo "sdkman_auto_selfupdate=false" >> $SDKMAN_DIR/etc/config \
+    && echo "sdkman_insecure_ssl=false" >> $SDKMAN_DIR/etc/config
+RUN bash -c "source $SDKMAN_DIR/bin/sdkman-init.sh && \
+    sdk update && sdk upgrade && sdk selfupdate && \
+    sdk install java ${JAVA_VERSION} && sdk default java ${JAVA_VERSION} && \
+    sdk install scala ${SCALA_VERSION} && sdk default scala ${SCALA_VERSION} && \
+    sdk install sbt ${SBT_VERSION} && sdk default sbt ${SBT_VERSION} && \
+    sdk flush archives && sdk flush temp && sdk flush broadcast"
+RUN pip3 --no-cache-dir install matplotlib pandas simpy numpy scipy
+RUN curl -o /tmp/spark.tgz https://www-us.apache.org/dist/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}.tgz && tar -xvzf /tmp/spark.tgz --one-top-level=spark --strip-components 1 -C /opt/ && rm -f /tmp/spark.tgz
+RUN alternatives --set python /usr/bin/python3
+
+WORKDIR ${SPARK_HOME}
\ No newline at end of file
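The rebuilt base image pins its whole toolchain through build arguments, so versions can be overridden per build without editing the Dockerfile. A minimal sketch, assuming the build is run from the repository root and tagged to match the FROM spark-base:latest lines in the images below:

    docker build -t spark-base:latest \
        --build-arg SPARK_VERSION=2.4.4 \
        --build-arg HADOOP_VERSION=2.7 \
        docker/base

One caveat on the download step: Apache dist mirrors such as www-us.apache.org carry only current releases (and this particular mirror has since been retired), so a pinned older version will eventually 404; https://archive.apache.org/dist/spark/ keeps every release and is a drop-in replacement for that URL. Adding -f to the curl invocation would also make the build fail with a clear error on an HTTP failure instead of handing tar an HTML error page.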
"$SPARK_HOME/bin/load-spark-env.sh" mkdir -p $SPARK_MASTER_LOG -export SPARK_HOME=/spark - ln -sf /dev/stdout $SPARK_MASTER_LOG/spark-master.out -cd /spark/bin && /spark/sbin/../bin/spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out +cd $SPARK_HOME/bin && $SPARK_HOME/sbin/../bin/spark-class org.apache.spark.deploy.master.Master --ip $SPARK_MASTER_HOST --port $SPARK_MASTER_PORT --webui-port $SPARK_MASTER_WEBUI_PORT >> $SPARK_MASTER_LOG/spark-master.out diff --git a/docker/spark-submit/Dockerfile b/docker/spark-submit/Dockerfile index 2a7ec66..7b1db3c 100644 --- a/docker/spark-submit/Dockerfile +++ b/docker/spark-submit/Dockerfile @@ -1,4 +1,4 @@ -FROM spydernaz/spark-base:latest +FROM spark-base:latest COPY spark-submit.sh / diff --git a/docker/spark-submit/spark-submit.sh b/docker/spark-submit/spark-submit.sh index 15aa483..8acb323 100644 --- a/docker/spark-submit/spark-submit.sh +++ b/docker/spark-submit/spark-submit.sh @@ -1,6 +1,6 @@ #!/bin/bash -/spark/bin/spark-submit \ +$SPARK_HOME/bin/spark-submit \ --class ${SPARK_APPLICATION_MAIN_CLASS} \ --master ${SPARK_MASTER_URL} \ --deploy-mode cluster \ diff --git a/docker/spark-worker/Dockerfile b/docker/spark-worker/Dockerfile index a014351..1a4943a 100644 --- a/docker/spark-worker/Dockerfile +++ b/docker/spark-worker/Dockerfile @@ -1,10 +1,10 @@ -FROM spydernaz/spark-base:latest +FROM spark-base:latest COPY start-worker.sh / -ENV SPARK_WORKER_WEBUI_PORT 8081 -ENV SPARK_WORKER_LOG /spark/logs -ENV SPARK_MASTER "spark://spark-master:7077" +ENV SPARK_WORKER_WEBUI_PORT=8081 +ENV SPARK_WORKER_LOG=${SPARK_HOME}/logs +ENV SPARK_MASTER="spark://spark-master:7077" EXPOSE 8081 diff --git a/docker/spark-worker/start-worker.sh b/docker/spark-worker/start-worker.sh index eaae186..e18f275 100644 --- a/docker/spark-worker/start-worker.sh +++ b/docker/spark-worker/start-worker.sh @@ -1,12 +1,10 @@ #!/bin/bash -. "/spark/sbin/spark-config.sh" -. "/spark/bin/load-spark-env.sh" +. "$SPARK_HOME/sbin/spark-config.sh" +. "$SPARK_HOME/bin/load-spark-env.sh" mkdir -p $SPARK_WORKER_LOG -export SPARK_HOME=/spark - ln -sf /dev/stdout $SPARK_WORKER_LOG/spark-worker.out -/spark/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out \ No newline at end of file +$SPARK_HOME/sbin/../bin/spark-class org.apache.spark.deploy.worker.Worker --webui-port $SPARK_WORKER_WEBUI_PORT $SPARK_MASTER >> $SPARK_WORKER_LOG/spark-worker.out \ No newline at end of file