aryn-ai · HenryL27 · Apr 11, 2024 · Apr 11, 2024 · Apr 11, 2024 · Apr 12, 2024
diff --git a/.gitignore b/.gitignore
@@ -34,3 +34,6 @@ apps/jupyter/bind_dir/redirect.html
 .venv
 lib/remote-processors/remote_processors/*pb2*
 poetry.toml
+apps/integration/runs/
+notebooks/data/
+test-output.log
diff --git a/apps/integration/integration/automation/integrate b/apps/integration/integration/automation/integrate
@@ -0,0 +1,269 @@
+#!/bin/bash
+
+archname() {
+  local unamearchname="$1"
+  local arch="amd64"
+  [[ "$unamearchname" = "arm64" || "$unamearchname" = "aarch64" ]] && arch="arm64"
+  echo "${arch}"
+}
+
+error() {
+  # Write all arguments to stderr as one line prefixed with "ERROR: ".
+  printf '%s\n' "ERROR: $*" >&2
+}
+
+die() {
+  # Report the given message through error() (stderr), then terminate the
+  # script with exit status 1. When called inside a command substitution only
+  # that subshell exits.
+  error "$@"
+  exit 1
+}
+
+# Timestamp identifying this run, and the docker-style architecture of this host.
+NOW="$(date '+%Y-%m-%d_%H_%M_%S')"
+ARCH="$(archname "$(uname -m)")"
+
+# Each run writes its logs into its own timestamped directory.
+RUNDIR="apps/integration/runs/${NOW}"
+GIT_LOGFILE="$RUNDIR/git.log"
+DOCKER_LOGFILE="$RUNDIR/docker.log"
+POETRY_LOGFILE="$RUNDIR/poetry.log"
+PYTEST_LOGFILE="$RUNDIR/pytest.log"
+QUERY_LOGFILE="$RUNDIR/test_queries.log"
+
+# Parse args
+SKIP_BUILD=0
+SKIP_TESTS=0
+SKIP_PUSH=0
+DO_CLEAN=0
+TAG="integration_tests"
+declare SSH_TARGET
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --help|-h)
+      echo "Utility script for building containers, running integration tests, and pushing images"
+      echo "Make sure to run this from the sycamore root directory."
+      echo "-------------------------------------------------------"
+      echo "Arguments:"
+      echo "  --help                Display this message"
+      echo "  --build               Build images"
+      echo "  --tests               Run integration tests"
+      echo "  --push                Push images"
+      echo "  --clean               Remove logs from previous runs before doing anything."
+      echo "  --tag     [TAG]       When building, running, and/or pushing, use this docker tag."
+      echo "                        Default is 'integration_tests'"
+      echo "  --ssh     [TARGET]    When building and running tests, also build and run on this host."
+      echo "                        Useful for multi-arch builds and tests, e.g. --ssh my-arm-box"
+      exit 0
+      ;;
+    --build)
+      SKIP_BUILD=1
+      echo "Will build images"
+      shift
+      ;;
+    --tests)
+      SKIP_TESTS=1
+      echo "Will run integration tests"
+      shift
+      ;;
+    --push)
+      SKIP_PUSH=1
+      echo "Will push images"
+      shift
+      ;;
+    --tag)
+      [[ -z $2 ]] && die "A tag must be specified when using the --tag arg; e.g. --tag my-tag"
+      [[ $2 =~ [a-z]* ]] || die "Detected tag was $2. Tags should begin with lowercase letters"
+      TAG="$2"
+      echo "Using tag ${TAG}"
+      shift
+      shift
+      ;;
+    --clean)
+      DO_CLEAN=0
+      echo "Will clean ${RUNDIR} before running anything"
+      shift
+      ;;
+    --ssh)
+      [[ -z $2 ]] && die "A configured ssh target must be specified when using the --ssh arg; e.g. --ssh my-host"
+      [[ $2 =~ [a-z]* ]] || die "Detected ssh target was $2. ssh tartgets should begin with lowercase letters"
+      SSH_TARGET="$2"
+      SSH_BUILDX_PORT=18460 # Selected by googling 5d10 and rerolling until small enough
+      echo "Using ssh target ${SSH_TARGET}"
+      shift
+      shift
+      ;;
+  esac
+done
+
+main() {
+  # Drives one integration run from the repository root:
+  #   1. optionally removes RUNDIR, and sets up the dual buildx builder when an
+  #      ssh target was given and building is enabled;
+  #   2. pulls origin/main and, only when new commits arrived, installs
+  #      dependencies, builds images and runs the tests as enabled by flags;
+  #   3. pushes images if requested and every earlier stage succeeded, then
+  #      hands the logs to handle_outputs.
+  # Globals read: DO_CLEAN, SKIP_BUILD, SKIP_TESTS, SKIP_PUSH, SSH_TARGET, TAG,
+  #               RUNDIR and the *_LOGFILE paths.
+  [[ -d ".git" ]] || die "Please run this script from sycamore root!"
+  # Compare numerically: a bare [[ $DO_CLEAN ]] only checks for a non-empty
+  # string, which "0" is as well. RUNDIR is timestamped per run, so this
+  # normally finds nothing to delete.
+  if [[ $DO_CLEAN -ne 0 ]]; then
+    rm -rf "${RUNDIR:?}"
+  fi
+  if [[ -n $SSH_TARGET && $SKIP_BUILD -ne 0 ]]; then
+    create-dual-builder
+    # Tear the builder down on exit and on HUP/INT/QUIT/ABRT.
+    trap cleanup-dual-builder 0 1 2 3 6
+  fi
+  mkdir -p "${RUNDIR}"
+  echo "Building/testing tag ${TAG}" >&2
+  echo "Get the newest git commits" >&2
+  if checkout_main_if_new; then
+    echo "Changes detected. Running Tests" >&2
+    # Each stage runs only if the previous one succeeded; the "passed" marker
+    # is created only when the whole chain succeeds.
+    poetry install --no-root > "${POETRY_LOGFILE}" 2>&1 \
+        && { [[ $SKIP_BUILD -eq 0 ]] || build_images > "${DOCKER_LOGFILE}" 2>&1; } \
+        && { [[ $SKIP_TESTS -eq 0 ]] || runtests > "${PYTEST_LOGFILE}" 2>&1; } \
+        && touch "${RUNDIR}/passed"
+    if [[ $SKIP_PUSH -ne 0 ]]; then
+      # Never publish images from a run whose install, build or tests failed.
+      if [[ -f "${RUNDIR}/passed" ]]; then
+        push_images >> "${DOCKER_LOGFILE}" 2>&1
+      else
+        error "Not pushing images: an earlier stage failed"
+      fi
+    fi
+    handle_outputs
+  else
+    echo "No changes detected. Skipping integration tests" >&2
+  fi
+}
+
+
+checkout_main_if_new() {
+  # Fetches origin/main and, when the fetched commit differs from the current
+  # HEAD, rebases the checkout onto it. git output goes to GIT_LOGFILE.
+  # Returns: 0 when new commits were pulled, 1 when HEAD already matched.
+  # Exits through die() when the working tree is dirty or a git command fails.
+  local old_sha new_sha
+  old_sha="$(git rev-parse HEAD)"
+  # A failed fetch would leave FETCH_HEAD stale or missing, so stop here
+  # rather than comparing against it.
+  git fetch origin main > "${GIT_LOGFILE}" 2>&1 \
+    || die "git fetch failed; see ${GIT_LOGFILE}"
+  new_sha="$(git rev-parse FETCH_HEAD)"
+  if [[ "${old_sha}" == "${new_sha}" ]]; then
+    return 1
+  fi
+  [[ -z $(git status --porcelain) ]] || die "Working tree not clean"
+  git pull --rebase origin main >> "${GIT_LOGFILE}" 2>&1 \
+    || die "git pull --rebase failed; see ${GIT_LOGFILE}"
+  echo "==================" >> "${GIT_LOGFILE}"
+  echo "Using git rev ${new_sha}" >> "${GIT_LOGFILE}"
+  return 0
+}
+
+build_images() {
+  # Builds every image in turn with docker-build-hub, stopping at the first
+  # failure. Returns 0 when all builds succeed, 1 otherwise.
+  echo "Building all images" >&2
+  docker-build-hub apps/crawler/crawler/http/Dockerfile || return 1
+  docker-build-hub apps/crawler/crawler/s3/Dockerfile || return 1
+  docker-build-hub apps/importer/Dockerfile.buildx || return 1
+  docker-build-hub apps/opensearch/Dockerfile || return 1
+  # The jupyter build additionally receives the tag as a build argument.
+  docker-build-hub apps/jupyter/Dockerfile.buildx --build-arg=TAG="${TAG}" || return 1
+  docker-build-hub apps/demo-ui/Dockerfile.buildx || return 1
+  docker-build-hub apps/remote-processor-service/Dockerfile.buildx || return 1
+  return 0
+}
+
+handle_outputs() {
+  echo "Handling test outputs" >&2
+  [[ -f test-output.log ]] && mv test-output.log "${QUERY_LOGFILE}"
+  [[ -f "${RUNDIR}/passed" ]] || touch "${RUNDIR}/failed"
+  aws s3 cp --recursive "${RUNDIR}/" "s3://sycamore-ci/${NOW}/${ARCH}"
+}
+
+push_images() {
+  echo "Pushing tested images to dockerhub" >&2
+  docker-push-hub apps/crawler/crawler/http/Dockerfile \
+  && docker-push-hub apps/crawler/crawler/s3/Dockerfile \
+  && docker-push-hub apps/importer/Dockerfile.buildx \
+  && docker-push-hub apps/opensearch/Dockerfile \
+  && docker-push-hub apps/jupyter/Dockerfile.buildx \
+  && docker-push-hub apps/demo-ui/Dockerfile.buildx \
+  && docker-push-hub apps/remote-processor-service/Dockerfile.buildx \
+  && return 0
+  return 1
+}
+
+runtests() {
+  # Resets the local docker state and runs the integration tests with pytest
+  # against images tagged TAG. When SSH_TARGET is set, the same script is
+  # started on that host in the background and waited for before returning.
+  # Returns: the exit status of the local pytest run.
+  local remote_pid=""
+  local status
+  if [[ -n $SSH_TARGET ]]; then
+    # The flag must be --tests: this script's argument parser has no --test
+    # option.
+    ssh "${SSH_TARGET}" "cd sycamore && ./apps/integration/integration/automation/integrate --tests --clean --tag ${TAG}" &
+    remote_pid=$!
+  fi
+  docker volume rm sycamore_crawl_data sycamore_jupyter_data sycamore_opensearch_data
+  docker network prune -f
+  docker compose up reset
+  poetry run pytest apps/integration/ -p integration.conftest --noconftest --docker-tag "${TAG}"
+  status=$?
+  # this is a complicated command, so:
+  # -p integration.conftest - load conftest with plugins, to capture the custom command line arg (--docker-tag)
+  # --noconftest            - don't load conftest at pytest runtime; it's already loaded
+  # --docker-tag            - specify tag of containers to test
+  if [[ -n "${remote_pid}" ]]; then
+    # Do not leave the remote run behind unobserved. Its exit status is only
+    # reported, not returned: the remote script's status comes from its own
+    # main(), not directly from its pytest run.
+    wait "${remote_pid}" || error "integrate on ${SSH_TARGET} exited with a non-zero status"
+  fi
+  return "${status}"
+}
+
+docker-build-hub() {
+  # Builds the image described by a Dockerfile with `docker buildx build` and
+  # pushes it (--push) as <repo name>:TAG, using a registry build cache.
+  # Arguments: $1  - path to the Dockerfile
+  #            $2+ - extra arguments passed through to `docker buildx build`
+  # Returns:   0 on success; 1 when the argument is missing, no repo name can
+  #            be determined, or the build fails.
+  local docker_file="$1"
+  [[ -n "${docker_file}" ]] || { error "missing docker file argument"; return 1;}
+  local repo_name
+  repo_name="$(_docker-repo-name "${docker_file}")"
+  [[ -n "${repo_name}" ]] || { error "empty repo name"; return 1;}
+  shift
+
+  # _docker-build-args prints several space-separated --build-arg options.
+  # Split them into an array so each one reaches docker as its own argument;
+  # quoting the command substitution would pass them all as a single word.
+  local -a build_args
+  read -r -a build_args <<< "$(_docker-build-args)"
+
+  echo
+  echo "Building in sycamore and pushing to docker hub with repo name '${repo_name}'"
+  docker buildx build "${build_args[@]}" -t "${repo_name}:${TAG}" -f "${docker_file}" \
+     --cache-to type=registry,ref="${repo_name}:build-cache",mode=max \
+     --cache-from type=registry,ref="${repo_name}:build-cache" \
+     --platform="$(_docker-platforms)" "$@" --push . \
+     || { error "buildx failed"; return 1;}
+  echo "Successfully built using docker file $docker_file"
+}
+
+docker-push-hub() {
+  # Pushes the image <repo name>:TAG, where the repo name is read from the
+  # given Dockerfile.
+  # Arguments: $1 - path to the Dockerfile the image was built from
+  # Returns:   0 on success; 1 when the argument is missing, no repo name can
+  #            be determined, or `docker push` fails.
+  local docker_file="$1"
+  [[ -n "${docker_file}" ]] || { error "missing docker file argument"; return 1;}
+  local repo_name
+  repo_name="$(_docker-repo-name "${docker_file}")"
+  [[ -n "${repo_name}" ]] || { error "empty repo name"; return 1;}
+
+  echo
+  echo "Pushing image to docker hub for repo '${repo_name}'"
+  docker push "${repo_name}:${TAG}" || { error "docker push failed"; return 1;}
+  echo "Successfully pushed image previously built from dockerfile ${docker_file}"
+}
+
+_docker-repo-name() {
+  local docker_file="$1"
+  echo "Finding repo name in: ${docker_file}" >&2
+  local repo_name="$(grep '^# Repo name: ' "${docker_file}" | awk '{print $4}')"
+  [[ "${repo_name}" = *private* ]] && die "Private repo ${repo_name} disallowed"
+  if (( $(wc -w <<< ${repo_name}) != 1 )); then
+    echo "Unable to find repo name in ${docker_file}" 1>&2
+    exit 1
+  fi
+  echo "${repo_name}"
+}
+
+_docker-build-args() {
+  # Prints, on one line, the --build-arg options describing the current git
+  # state: branch, short commit plus commit date, and whether the working tree
+  # is clean or carries pending changes (identified by a hash of the diff).
+  local git_branch git_rev commit_date tree_state
+  git_branch="$(git branch --show-current)"
+  git_rev="$(git rev-parse --short HEAD)"
+  commit_date="$(git show -s --format=%ci HEAD)"
+  # Spaces in the date become underscores so the value stays a single word.
+  commit_date="${commit_date// /_}"
+  if [[ -n $(git status --porcelain) ]]; then
+    tree_state="pending_changes_$(git diff HEAD | shasum | awk '{print $1}')"
+  else
+    tree_state=clean
+  fi
+  echo "--build-arg=GIT_BRANCH=${git_branch} --build-arg=GIT_COMMIT=${git_rev}--${commit_date} --build-arg=GIT_DIFF=${tree_state}"
+}
+
+_docker-platforms() {
+  # Prints the value for buildx's --platform option: the local architecture,
+  # plus the ssh target's architecture when a target is set and it differs.
+  local platforms="linux/${ARCH}"
+  local other_arch="${ARCH}"
+  if [[ -n $SSH_TARGET ]]; then
+    other_arch="$(archname "$(ssh "${SSH_TARGET}" uname -m)")"
+  fi
+  if [[ "${other_arch}" != "${ARCH}" ]]; then
+    platforms+=",linux/${other_arch}"
+  fi
+  echo "${platforms}"
+}
+
+create-dual-builder() {
+  # Sets up a buildx builder named "dual-builder" backed by two buildkit
+  # containers: one started on the ssh target and reached through a forwarded
+  # port, and one started locally on the port just below it. The builder is
+  # then selected as the current buildx builder.
+  # Globals: reads SSH_TARGET, SSH_BUILDX_PORT, ARCH; writes REMOTE_ARCH.
+  local remote_port="${SSH_BUILDX_PORT}"
+  local local_port="$((SSH_BUILDX_PORT - 1))"
+
+  # Background tunnel: local remote_port -> remote_port on the ssh target.
+  ssh -N -L "${remote_port}:localhost:${remote_port}" "${SSH_TARGET}" &
+  REMOTE_ARCH="$(archname "$(ssh "${SSH_TARGET}" uname -m)")"
+
+  # buildkit daemon on the ssh target.
+  ssh "${SSH_TARGET}" docker run -d --name=remote-buildkitd --privileged -p "${remote_port}:${remote_port}" \
+        moby/buildkit:latest --addr "tcp://0.0.0.0:${remote_port}"
+  # buildkit daemon on this host (same container name, different docker host).
+  docker run -d --name=remote-buildkitd --privileged -p "${local_port}:${local_port}" \
+        moby/buildkit:latest --addr "tcp://0.0.0.0:${local_port}"
+
+  # One remote-driver builder with a node per architecture.
+  docker buildx create --name dual-builder --platform "linux/${ARCH}" --driver=remote "tcp://localhost:${local_port}"
+  docker buildx create --append --name dual-builder --platform "linux/${REMOTE_ARCH}" --driver=remote "tcp://localhost:${remote_port}"
+  docker buildx use dual-builder
+}
+
+cleanup-dual-builder() {
+  docker buildx rm dual-builder
+  ssh "${SSH_TARGET}" "docker stop remote-buildkitd && docker rm remote-buildkitd"
+  docker stop remote-buildkitd && docker rm remote-buildkitd
+  pgrep -f "${SSH_BUILDX_PORT}:localhost:${SSH_BUILDX_PORT}" | xargs kill
+}
+
+main
diff --git a/apps/integration/integration/automation/runtests.sh b/apps/integration/integration/automation/runtests.sh