diff --git a/.github/workflows/bolt_gluten_ut.yml b/.github/workflows/bolt_gluten_ut.yml new file mode 100644 index 000000000..65beaa6f7 --- /dev/null +++ b/.github/workflows/bolt_gluten_ut.yml @@ -0,0 +1,158 @@ +# Run the full Gluten UT matrix against the Bolt backend via +# scripts/gluten_ut/run.sh (parallel dispatcher with bwrap per-suite isolation, +# slow-suites priority dispatch, case-level blacklist). + +name: Bolt Gluten UT + +on: + pull_request: + branches: [ main ] + workflow_dispatch: + inputs: + gluten_repo: + description: 'Gluten repo to check out; leave blank to use the hardcoded default below.' + required: false + default: '' + gluten_ref: + description: 'Gluten branch/tag/sha to check out; leave blank to use the hardcoded default below.' + required: false + default: '' + +env: + GLUTEN_REPO: ${{ inputs.gluten_repo || 'zhangxffff/gluten' }} + GLUTEN_REF: ${{ inputs.gluten_ref || 'chore/run_gluten_ut' }} + # Matches build-test.yml — ccache + conan cache live under /data on the host. + CCACHE_DIR: /data/ccache-data + CCACHE_MAX_SIZE: '100G' + CI_NUM_THREADS: "16" + IN_CI: '1' + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + spark-ut: + runs-on: [ self-hosted, medium ] + container: + image: bolt-registry:5000/bolt-ci:20260114 + # bwrap (per-suite isolation in scripts/gluten_ut/run.sh) must create a + # namespace. The runner host disables unprivileged user namespaces, so + # seccomp/apparmor unconfined alone is NOT enough — bwrap needs real + # CAP_SYS_ADMIN to take the privileged path (no user namespace) for its + # mounts. cap-add SYS_ADMIN + unconfined seccomp/apparmor grants exactly + # that without the full host exposure of --privileged (no host devices, + # no CAP_SYS_MODULE). 
+ options: --user root --init --cap-add=SYS_ADMIN --security-opt seccomp=unconfined --security-opt apparmor=unconfined + volumes: + - /data/ccache-data:/data/ccache-data + - /data/bolt-gluten-ut-arrow:/root/.m2/repository/org/apache/arrow + - /data/bolt-gluten-ut-spark:/data/bolt-gluten-ut-spark + services: + conanserver: + image: bolt-registry:5000/conan-server:latest + volumes: + - /data/conan-server-data:/var/conan/data + timeout-minutes: 240 + steps: + - name: Checkout bolt (this repo) + uses: actions/checkout@v6 + + - name: Checkout gluten at ${{ env.GLUTEN_REF }} + uses: actions/checkout@v6 + with: + repository: ${{ env.GLUTEN_REPO }} + ref: ${{ env.GLUTEN_REF }} + path: gluten + + - name: Install JDK 17 + bubblewrap + # Full JDK (not -headless) is required: arrow's cmake JNI detection + # needs AWT, which is only present in the full openjdk-17-jdk package. + # bubblewrap is used to run each suite in an isolated environment. + run: | + apt-get update + apt-get install -y --no-install-recommends openjdk-17-jdk bubblewrap + bwrap --version + echo "JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64" >> $GITHUB_ENV + + - name: Set up bolt build environment (conan) + uses: ./.github/actions/bolt-build-base + + - name: Align conan default profile with bolt.profile + # bolt's Makefile installs with `-pr default -pr scripts/conan/bolt.profile`, + # but gluten's Makefile only uses `-pr default`. Merge bolt.profile into default so that + # gluten builds with the same profile as bolt. 
+ run: cat scripts/conan/bolt.profile >> ~/.conan2/profiles/default + + - name: Build local bolt (make release_spark) + run: make release_spark && make export_release + + - name: Build gluten native libs (make release) + working-directory: ${{ github.workspace }}/gluten + run: make release + + - name: Build gluten-arrow if jars missing or patches changed + working-directory: ${{ github.workspace }}/gluten + run: | + ARROW_DIR=/root/.m2/repository/org/apache/arrow + STAMP=$ARROW_DIR/.gluten-15.0.0.stamp + KEY=$(sha256sum ep/build-velox/src/modify_arrow*.patch \ + ep/build-velox/src/cmake-compatibility.patch \ + dev/build_arrow.sh 2>/dev/null \ + | sha256sum | cut -d' ' -f1) + mkdir -p "$ARROW_DIR" + ( + flock -x 200 + if [[ -f $STAMP && "$(cat $STAMP)" == "$KEY" ]] \ + && ls $ARROW_DIR/arrow-dataset/15.0.0-gluten/*.jar > /dev/null 2>&1; then + echo "arrow jars match stamp $KEY — skip rebuild" + else + bash dev/build_arrow.sh + echo "$KEY" > "$STAMP" + fi + ) 200> "$ARROW_DIR/.lock" + + - name: Populate Spark binary + source SQL test resources on /data + # Layout under /data/bolt-gluten-ut-spark/: + # spark_home/ — Spark 3.5.5 binary dist. SPARK_HOME points here. + # spark_src/ — full Spark 3.5.5 source tree. + # spark_home/sql → ../spark_src/sql — gluten tests read source sql/ from here. 
+ run: | + set -e + BASE=/data/bolt-gluten-ut-spark + [[ -d "$BASE/spark_home/jars" && -d "$BASE/spark_src/sql" && -L "$BASE/spark_home/sql" ]] && exit 0 + mkdir -p "$BASE" + # flock guards against concurrent CI jobs + ( + flock -x 200 + [[ -d "$BASE/spark_home/jars" && -d "$BASE/spark_src/sql" && -L "$BASE/spark_home/sql" ]] && exit 0 + command -v aria2c > /dev/null \ + || { apt-get update -qq && apt-get install -y --no-install-recommends aria2; } + URL=https://archive.apache.org/dist/spark/spark-3.5.5 + cd "$BASE" + aria2c --quiet -x16 -s16 -k1M -o bin.tgz "$URL/spark-3.5.5-bin-hadoop3.tgz" + aria2c --quiet -x16 -s16 -k1M -o src.tgz "$URL/spark-3.5.5.tgz" + mkdir -p spark_home spark_src + tar -xzf bin.tgz --strip-components=1 -C spark_home + tar -xzf src.tgz --strip-components=1 -C spark_src + ln -sfn ../spark_src/sql spark_home/sql + rm -f bin.tgz src.tgz + ) 200> "$BASE/.lock" + + - name: Run Gluten UT (parallel, blacklist-aware) + env: + GLUTEN_HOME: ${{ github.workspace }}/gluten + SPARK_HOME: /data/bolt-gluten-ut-spark/spark_home/ + JOBS: '8' + run: bash scripts/gluten_ut/run.sh + + - name: Upload test reports + if: always() + uses: actions/upload-artifact@v4 + with: + name: bolt-gluten-ut-reports + path: | + scripts/gluten_ut/logs/*.log + scripts/gluten_ut/logs/*.tsv + scripts/gluten_ut/logs/reports/**/TEST-*.xml + if-no-files-found: warn diff --git a/.gitignore b/.gitignore index 06eff66a9..5387a84e0 100644 --- a/.gitignore +++ b/.gitignore @@ -337,3 +337,7 @@ files.txt log.txt bolt/version/version.h + +# gluten UT runner output (parallel dispatch logs + reports) +/scripts/gluten_ut/logs/ +/scripts/gluten_ut/stdout diff --git a/scripts/gluten_ut/blacklist.txt b/scripts/gluten_ut/blacklist.txt new file mode 100644 index 000000000..230747446 --- /dev/null +++ b/scripts/gluten_ut/blacklist.txt @@ -0,0 +1,19 @@ +org.apache.gluten.config.AllBoltConfiguration#Check bolt backend configs +org.apache.gluten.execution.BoltExplodeExpressionSuite#(aborted) 
+org.apache.gluten.execution.BoltScanSuite#Test file scheme validation +org.apache.gluten.execution.BoltScanSuite#parquet index based schema evolution +org.apache.gluten.execution.python.ArrowEvalPythonExecSuite#arrow_udf test: with preprojection +org.apache.gluten.execution.python.ArrowEvalPythonExecSuite#arrow_udf test: with unrelated projection +org.apache.gluten.execution.python.ArrowEvalPythonExecSuite#arrow_udf test: without projection +org.apache.gluten.extension.columnar.transition.BoltTransitionSuite#(aborted) +org.apache.gluten.functions.JsonFunctionsValidateSuite#json_object_keys +org.apache.spark.sql.GlutenJsonFunctionsSuite#roundtrip in to_json and from_json - array +org.apache.spark.sql.GlutenSQLQueryTestSuite#datetime-parsing-invalid.sql +org.apache.spark.sql.catalyst.expressions.GlutenCastSuite#Gluten - cast string to timestamp +org.apache.spark.sql.catalyst.expressions.GlutenDateExpressionsSuite#Gluten - to_unix_timestamp +org.apache.spark.sql.catalyst.expressions.GlutenDateExpressionsSuite#Gluten - unix_timestamp +org.apache.spark.sql.catalyst.expressions.GlutenTryCastSuite#Gluten - cast string to timestamp +org.apache.spark.sql.execution.BoltLocalCacheSuite#(aborted) +org.apache.spark.sql.execution.GlutenCoalesceShufflePartitionsSuite#Gluten - determining the number of reducers: plan already partitioned +org.apache.spark.sql.execution.GlutenCoalesceShufflePartitionsSuite#Gluten - determining the number of reducers: plan already partitioned(minNumPostShufflePartitions: 5) +org.apache.spark.sql.execution.datasources.parquet.GlutenParquetIOSuite#SPARK-34817: Read UINT_64 as Decimal from parquet diff --git a/scripts/gluten_ut/run.sh b/scripts/gluten_ut/run.sh new file mode 100755 index 000000000..409a30b94 --- /dev/null +++ b/scripts/gluten_ut/run.sh @@ -0,0 +1,352 @@ +#!/usr/bin/env bash +# Copyright (c) ByteDance Ltd. and/or its affiliates. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# +# Run the Gluten UT matrix against the Bolt backend. +# +# 1. mvn install -DskipTests build jars + test-classes +# 2. scan test-classes/ discover every test suite +# 3. xargs -P JOBS one mvn per suite, slow ones first, +# bwrap-isolated target/surefire{,-reports} +# 4. classify FAILED/ABORTED against blacklist.txt (whole-line fixed-string match). +# +# Required env: GLUTEN_HOME, SPARK_HOME, bubblewrap binary on PATH. +# Optional env: JOBS (parallelism, default nproc/3). +# +# Logs + reports go to $SCRIPT_DIR/logs/. +# blacklist.txt / slow_suites.txt live next to this script. One entry per line, +# no comments, no blanks. Blacklist entry shape: `<suite FQCN>#<test name>` for a +# specific failure, or `<suite FQCN>#(aborted)` for a whole-suite abort. +# +# Exit status: 0 if every failure is on the blacklist, else 1. + +set -euo pipefail + +############################################################################### +# Maven profiles. Override via env to switch Spark versions: +# DEFAULT_SPARK_VERSION=3.5 (default; the version that +# gluten-parent's pom hard- +# codes as the property defaults +# for ${sparkshim.artifactId} / +# ${spark.major.version} / etc.) +# MVN_PROFILES='-Pspark-3.4 -Pspark-ut -Pbackends-bolt -Pceleborn -Pjava-17' +# +# When MVN_PROFILES targets a non-default spark version, run_one_suite adds +# `-am` so gluten-parent / gluten-substrait join the per-suite reactor and +# their property defaults get re-resolved via -P. 
+###############################################################################
+# Both values can be pre-set in the environment; the defaults below apply only
+# when the variable is unset or empty.
+DEFAULT_SPARK_VERSION="${DEFAULT_SPARK_VERSION:-3.5}"
+MVN_PROFILES="${MVN_PROFILES:--Pspark-${DEFAULT_SPARK_VERSION} -Pspark-ut -Pbackends-bolt -Pceleborn -Pjava-17}"
+
+# MVN_AM becomes "-am" only when MVN_PROFILES names a -Pspark-3.x profile that
+# differs from DEFAULT_SPARK_VERSION. The regex recognises 3.x versions only;
+# any other -Pspark-* value (or none at all) leaves MVN_AM empty.
+MVN_AM=""
+if [[ "$MVN_PROFILES" =~ -Pspark-(3\.[0-9]+) ]]; then
+  [[ "${BASH_REMATCH[1]}" != "$DEFAULT_SPARK_VERSION" ]] && MVN_AM="-am"
+fi
+
+###############################################################################
+# Config
+###############################################################################
+# `${VAR:?msg}` aborts the script with msg when VAR is unset or empty.
+: "${GLUTEN_HOME:?GLUTEN_HOME must point to the gluten source checkout}"
+: "${SPARK_HOME:?SPARK_HOME must point to an unpacked Spark source tree (for spark.test.home)}"
+[[ -d "$GLUTEN_HOME" ]] || {
+  echo "GLUTEN_HOME=$GLUTEN_HOME is not a directory" >&2
+  exit 1
+}
+[[ -d "$SPARK_HOME" ]] || {
+  echo "SPARK_HOME=$SPARK_HOME is not a directory" >&2
+  exit 1
+}
+
+# Spark's AbstractCommandBuilder.getScalaVersion() reads either of these dirs
+# in source-build mode (only one allowed, otherwise "ambiguous Scala version").
+# Without it, local-cluster Worker forks die with "Cannot find any build
+# directories" before any Executor launches. The dir only has to exist — it
+# stays empty. Idempotent so safe to repeat across runs.
+# Best-effort: errors are discarded and never fail the script.
+mkdir -p "$SPARK_HOME/launcher/target/scala-2.12" 2> /dev/null || true
+
+# Absolute directory containing this script; the list files and the log
+# directory default to locations relative to it.
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+# Override via env to pick per-spark-version lists (e.g. blacklist-3.4.txt),
+# or to share lists across multiple bolt checkouts.
+BLACKLIST_FILE="${BLACKLIST_FILE:-$SCRIPT_DIR/blacklist.txt}"
+SLOW_SUITES_FILE="${SLOW_SUITES_FILE:-$SCRIPT_DIR/slow_suites.txt}"
+LOG_DIR="${LOG_DIR:-$SCRIPT_DIR/logs}"
+MVN_BIN="${MVN_BIN:-mvn}"
+
+# Empirically each suite needs ~3 active threads (mvn + surefire JVM + Spark
+# internals). cpus/3 saturates CPU without thrashing. Override via JOBS.
+if [[ -z "${JOBS:-}" ]]; then
+  # `grep -c` prints "0" AND exits non-zero when nothing matches, so the
+  # fallback has to replace the captured value rather than be appended to it
+  # (appending would yield "0<newline>4" and break the arithmetic below).
+  cpus=$(grep -c ^processor /proc/cpuinfo 2> /dev/null) || cpus=4
+  JOBS=$((cpus / 3))
+  ((JOBS < 1)) && JOBS=1
+fi
+
+mkdir -p "$LOG_DIR"
+# Every relative path used from here on (find, mvn -pl, module dirs) is
+# resolved against the gluten checkout.
+cd "$GLUTEN_HOME"
+# step() prefixes each banner with "[<total> total | prev <delta>]" so the
+# wall-time of each phase is visible from the banner that opens the NEXT one.
+#   $* : banner text
+# Reads SCRIPT_START, reads and updates LAST_STEP.
+SCRIPT_START=$(date +%s)
+LAST_STEP=$SCRIPT_START
+step() {
+  local now total delta
+  now=$(date +%s)
+  total=$((now - SCRIPT_START))
+  delta=$((now - LAST_STEP))
+  printf '===== [%d:%02d total | prev %d:%02d] %s =====\n' \
+    "$((total / 60))" "$((total % 60))" \
+    "$((delta / 60))" "$((delta % 60))" "$*"
+  LAST_STEP=$now
+}
+echo "GLUTEN_HOME=$GLUTEN_HOME SPARK_HOME=$SPARK_HOME JOBS=$JOBS"
+
+command -v bwrap > /dev/null 2>&1 || {
+  echo "bwrap is required for per-suite target/ isolation. Install bubblewrap." >&2
+  exit 1
+}
+
+###############################################################################
+# Step 1/3: install jars + test-classes
+###############################################################################
+step "Step 1/3: mvn clean install -DskipTests (-T $JOBS)"
+# Clear stale targets. Best-effort: stderr is discarded, so a non-zero exit
+# from find/rm must not abort the script silently under `set -e`.
+find . -path '*/target/test-classes' -prune -exec rm -rf {} + 2> /dev/null || true
+find . -path '*/target/scala-*/test-classes' -prune -exec rm -rf {} + 2> /dev/null || true
+# MVN_PROFILES is intentionally unquoted: it holds several -P flags that must
+# word-split into separate arguments.
+# shellcheck disable=SC2086
+"$MVN_BIN" clean install -T "$JOBS" $MVN_PROFILES \
+  -DskipTests -Dexec.skip \
+  > "$LOG_DIR/_install.log" 2>&1 || {
+  echo "Install step failed; see $LOG_DIR/_install.log" >&2
+  tail -40 "$LOG_DIR/_install.log" >&2
+  exit 1
+}
+
+###############################################################################
+# Step 2/3: discover suites
+###############################################################################
+step "Step 2/3: discover suites"
+SUITE_MAP="$LOG_DIR/_suites.tsv" # tab-separated: <module>\t<fqcn>
+
+# Walk every .class under <module>/target/.../test-classes/ and emit
+# `<module>\t<fqcn>` rows in $SUITE_MAP — one per runnable test suite.
+
+# A class is concrete (runnable) iff javap's declaration line is NOT
+# `abstract class` / `abstract interface` / plain `interface`.
+#   $1 : path to a .class file
+# Returns 0 for a concrete class, 1 otherwise. When javap produces no output
+# (e.g. it is not installed) nothing matches and the class counts as concrete.
+is_concrete_class() {
+  ! javap -p "$1" 2> /dev/null | head -3 \
+    | grep -qE "^(public +)?(abstract +(class|interface)|interface) "
+}
+export -f is_concrete_class
+
+# Class names containing one of these tokens are treated as test suites
+# (matches naming conventions used across gluten + bolt test code). The match
+# below is unanchored and runs against the whole FQCN, so the token may appear
+# anywhere in it, not only at the end.
+SUITE_NAME_RE='(Suite|Spec|Test|Validation|Statistics|Generator|Configuration|EncodingLong)'
+
+# Pipeline stages:
+#   1. find    every <module>/target/[scala-X/]test-classes/*.class — skip
+#              inner/anon classes (`$` in path), scalatest's leftover
+#              DiscoverySuite stubs, and arrow's own Java tests under ep/_ep/.
+#   2. xargs   drop abstract base classes and interfaces via parallel javap.
+#              The path is handed to bash as a positional argument ($1) rather
+#              than spliced into the command string, so characters in a file
+#              name can never be interpreted as shell syntax.
+#   3. sed     rewrite `./<module>/target/[scala-X/]test-classes/<path>.class`
+#              into `<module>\t<path>`.
+#   4. awk     turn path slashes into FQCN dots and keep only suite-shaped names.
+#   5. sort -u dedup by FQCN (same class can land in several modules).
+find . -path '*/test-classes/*.class' \
+  \! -path '*$*' \! -path '*DiscoverySuite*' \! -path '*/ep/_ep/*' \
+  | xargs -P "$JOBS" -I{} bash -c 'is_concrete_class "$1" && echo "$1" || :' _ {} \
+  | sed -nE 's|^\./(.+)/target/(scala-[^/]+/)?test-classes/(.+)\.class$|\1\t\3|p' \
+  | awk -F'\t' -v OFS='\t' -v re="$SUITE_NAME_RE" \
+    '{ gsub("/", ".", $2) } $2 ~ re' \
+  | sort -u -t$'\t' -k2,2 > "$SUITE_MAP"
+
+NUM_RUN=$(wc -l < "$SUITE_MAP" | tr -d ' ')
+echo "Discovered $NUM_RUN suites total."
+[[ -f "$BLACKLIST_FILE" ]] && echo "Blacklist: $(wc -l < "$BLACKLIST_FILE" | tr -d ' ') entries."
+
+###############################################################################
+# Step 3/3: dispatch + summarize
+###############################################################################
+step "Step 3/3: run $NUM_RUN suites with $JOBS parallel jobs"
+
+WORK_ROOT="$LOG_DIR/work"
+REPORTS_ROOT="$LOG_DIR/reports"
+rm -rf "$WORK_ROOT" "$REPORTS_ROOT"
+mkdir -p "$WORK_ROOT" "$REPORTS_ROOT"
+# Drop stale per-suite logs from previous runs; the runner's own `_*.log`
+# files (install / dispatch) are kept.
+find "$LOG_DIR" -maxdepth 1 -type f -name '*.log' \! -name '_*' -delete
+
+# Pre-create per-module bind mountpoints used by run_one_suite below.
+while IFS= read -r module; do
+  [[ -z "$module" ]] && continue
+  rm -rf "$module/target/surefire-reports" 2> /dev/null || true
+  mkdir -p "$module/target/surefire" "$module/target/surefire-reports"
+done < <(cut -f1 "$SUITE_MAP" | sort -u)
+
+# run_one_suite executes in child `bash -c` processes spawned by xargs, so
+# everything it reads must be exported.
+export MVN_BIN GLUTEN_HOME SPARK_HOME LOG_DIR WORK_ROOT REPORTS_ROOT
+export MVN_PROFILES MVN_AM
+
+# Run a single suite through maven inside a bwrap sandbox.
+#   $1 : module path relative to GLUTEN_HOME
+#   $2 : suite FQCN
+# Writes the maven output to $LOG_DIR/<suite>.log and appends a
+# GLUTEN_UT_MVN_RC=<rc> marker line, prints a one-line "done" notice on FD 3
+# and a `finished\t<suite>` line on stdout. Always returns 0; the maven exit
+# code is only recorded in the log marker.
+run_one_suite() {
+  local module="$1" suite="$2"
+  local log="$LOG_DIR/${suite}.log"
+  local sur="$WORK_ROOT/$suite/surefire"
+  local rep="$REPORTS_ROOT/$suite"
+  mkdir -p "$sur" "$rep"
+  local t0
+  t0=$(date +%s)
+  # Find the module's test-classes/ dir (Scala or Java layout).
+  local tc="" d
+  for d in "$GLUTEN_HOME/$module/target/scala-2.12/test-classes" \
+    "$GLUTEN_HOME/$module/target/test-classes"; do
+    [[ -d "$d" ]] && {
+      tc="$d"
+      break
+    }
+  done
+  # Per-suite isolation via bwrap:
+  #   --bind $sur / $rep    : private target/surefire (booter jar) and
+  #                           target/surefire-reports
+  #   --bind $sandbox $tc   : full copy of test-classes/ under /tmp, with the
+  #                           conflicting `unit-tests-working-home/` (used as
+  #                           Spark warehouse + metastore by
+  #                           GlutenSQLTestsTrait.prepareWorkDir) carved out as
+  #                           a fresh dir per suite
+  #   --ro-bind $SPARK_HOME : re-expose SPARK_HOME, otherwise --tmpfs /tmp may
+  #                           hide it
+  local bind_args=()
+  local sandbox=""
+  if [[ -n "$tc" ]]; then
+    sandbox="/tmp/gluten-ut-sandbox/$suite/test-classes"
+    rm -rf "/tmp/gluten-ut-sandbox/$suite"
+    mkdir -p "$sandbox"
+    cp -a "$tc/." "$sandbox/" 2> /dev/null
+    rm -rf "$sandbox/unit-tests-working-home" 2> /dev/null
+    mkdir "$sandbox/unit-tests-working-home"
+    bind_args=(--bind "$sandbox" "$tc")
+  fi
+  local rc=0
+  # MVN_AM / MVN_PROFILES are intentionally unquoted so they word-split into
+  # separate maven arguments (and vanish entirely when empty).
+  # shellcheck disable=SC2086
+  bwrap \
+    --dev-bind / / --tmpfs /tmp \
+    --ro-bind "$SPARK_HOME" "$SPARK_HOME" \
+    --bind "$sur" "$GLUTEN_HOME/$module/target/surefire" \
+    --bind "$rep" "$GLUTEN_HOME/$module/target/surefire-reports" \
+    "${bind_args[@]}" \
+    --chdir "$GLUTEN_HOME" \
+    "$MVN_BIN" surefire:test scalatest:test \
+    -pl "$module" $MVN_AM \
+    $MVN_PROFILES \
+    -DfailIfNoTests=false -Dexec.skip -Dmaven.test.failure.ignore=true \
+    -DargLine="-Dspark.test.home=$SPARK_HOME" \
+    -Dtest="$suite" -DwildcardSuites="$suite" \
+    -DtagsToExclude=org.apache.gluten.tags.UDFTest,org.apache.gluten.tags.EnhancedFeaturesTest,org.apache.gluten.tags.SkipTest \
+    > "$log" 2>&1 || rc=$?
+  [[ -n "$sandbox" ]] && rm -rf "/tmp/gluten-ut-sandbox/$suite"
+  local secs=$(($(date +%s) - t0))
+  # Number of cases from scalatest's last "Total number of tests run" line
+  # (ANSI colour codes stripped first); empty when that line never appeared.
+  local cases
+  cases=$(sed -E 's/\x1b\[[0-9;]*m//g' "$log" \
+    | grep -oE 'Total number of tests run: [0-9]+' | tail -1 \
+    | grep -oE '[0-9]+')
+  # Trailing marker line for summary: distinguishes "mvn died before scalatest"
+  # (rc != 0, no FAILED / ABORTED markers in the log) from "scalatest ran and
+  # the suite passed" (rc == 0 because -Dmaven.test.failure.ignore=true; case
+  # failures still show up as `*** FAILED ***` lines).
+  # Leading \n: mvn's colorised output can end with a trailing ANSI reset and no
+  # newline, which would otherwise prefix the marker (e.g. "\e[0mGLUTEN_UT_MVN_RC=0")
+  # and defeat a '^'-anchored match in the summary below.
+  printf '\nGLUTEN_UT_MVN_RC=%s\n' "$rc" >> "$log"
+  # FD 3 = the parent's original stdout (terminal); see `exec 3>&1` below.
+  printf ' done [%4ds, %4s cases] %s\n' "$secs" "${cases:-?}" "$suite" >&3
+  printf 'finished\t%s\n' "$suite"
+}
+export -f run_one_suite
+
+# Slow-list priority: xargs pulls from this file top-down so the suites
+# named in slow_suites.txt grab the first JOBS workers and the long tail
+# can't dangle. Both partitions keep SUITE_MAP's original order.
+# The first file is recognised via FILENAME == ARGV[1] instead of the usual
+# NR == FNR test: with an EMPTY slow_suites.txt, NR == FNR stays true while
+# SUITE_MAP is being read, which would silently drop every suite.
+DISPATCH_MAP="$LOG_DIR/_suites_dispatch_order.tsv"
+if [[ -f "$SLOW_SUITES_FILE" ]]; then
+  awk 'FILENAME == ARGV[1] { s[$0] = 1; next } $2 in s' "$SLOW_SUITES_FILE" "$SUITE_MAP" > "$DISPATCH_MAP"
+  awk 'FILENAME == ARGV[1] { s[$0] = 1; next } !($2 in s)' "$SLOW_SUITES_FILE" "$SUITE_MAP" >> "$DISPATCH_MAP"
+  echo "Slow-suite priority queue: $(wc -l < "$SLOW_SUITES_FILE") suite(s) dispatched first."
+else
+  cp "$SUITE_MAP" "$DISPATCH_MAP"
+fi
+
+# Save the terminal stdout as FD 3 so run_one_suite can print a one-line
+# "done [...] <suite>" to the user as soon as each suite finishes, even
+# though the dispatcher's own stdout is captured to _dispatch.log.
+exec 3>&1
+(
+  tr '\t' ' ' < "$DISPATCH_MAP" | xargs -P "$JOBS" -L 1 \
+    bash -c 'run_one_suite "$1" "$2"' _
+) > "$LOG_DIR/_dispatch.log" 2>&1 &
+DISPATCH_PID=$!
+
+# Best-effort progress heartbeat.
+# `grep -c` prints "0" and exits non-zero while no suite has finished yet, so
+# the fallback replaces the value instead of appending a second "0" to it.
+while kill -0 $DISPATCH_PID 2> /dev/null; do
+  sleep 10
+  done_count=$(grep -c '^finished\b' "$LOG_DIR/_dispatch.log" 2> /dev/null) || done_count=0
+  echo " progress: $done_count / $NUM_RUN suites complete"
+done
+wait $DISPATCH_PID || true
+
+step "Summary"
+# Walk each per-suite log and emit one key per failure:
+#   <suite>#<test name> — scalatest "*** FAILED ***" line
+#   <suite>#(aborted)   — scalatest "*** ABORTED ***" line
+# Each key is grep -Fxq'd against blacklist.txt; unmatched → unexpected.
+declare -A fired
+expected=0
+unexpected=0
+# Walk the suites that were actually dispatched this run (SUITE_MAP is the
+# canonical list), not $LOG_DIR/*.log — that would also pick up stale per-
+# suite logs left over from a previous run with a different profile / spark
+# version.
+while IFS=$'\t' read -r _module suite; do
+  log="$LOG_DIR/$suite.log"
+  # A suite listed in SUITE_MAP that left no log was never run (e.g. the
+  # dispatcher died early). That is not a pass, so report it.
+  if [[ ! -f "$log" ]]; then
+    unexpected=$((unexpected + 1))
+    echo " ! $suite#(not-run)"
+    continue
+  fi
+  # mvn-failed (rc != 0 → mvn died before scalatest could run) bypasses the
+  # blacklist: it always counts as unexpected, since blacklisting infra
+  # failures would mask real regressions across PRs.
+  # -ao (not '^'-anchored) tolerates any ANSI/CR residue prefixing the marker;
+  # `|| true` keeps a missing marker from aborting the whole summary under set -e.
+  rc=$(grep -aoE 'GLUTEN_UT_MVN_RC=[0-9]+' "$log" | tail -1 | cut -d= -f2 || true)
+  # The marker is the last thing written for a suite, so a log without it means
+  # the run was cut short before completion; treat that as unexpected too.
+  if [[ -z "$rc" ]]; then
+    unexpected=$((unexpected + 1))
+    echo " ! $suite#(incomplete)"
+    continue
+  fi
+  if [[ "$rc" != "0" ]]; then
+    unexpected=$((unexpected + 1))
+    echo " ! $suite#(mvn-failed)"
+    continue
+  fi
+  clean=$(sed -E 's/\x1b\[[0-9;]*m//g' "$log")
+  keys=$(sed -nE 's/^- (.*) \*\*\* FAILED \*\*\*$/'"$suite"'#\1/p' <<< "$clean")
+  # Pattern match instead of `echo | grep -q`: under `set -o pipefail`, grep -q
+  # exiting on its first hit can kill the writer with SIGPIPE on a large log,
+  # making the pipeline report failure and silently dropping the aborted key.
+  [[ "$clean" == *'*** ABORTED ***'* ]] && keys+=$'\n'"$suite#(aborted)"
+  while IFS= read -r key; do
+    [[ -z "$key" ]] && continue
+    # A missing blacklist file simply means nothing is blacklisted.
+    if grep -Fxq -- "$key" "$BLACKLIST_FILE" 2> /dev/null; then
+      fired[$key]=1
+      expected=$((expected + 1))
+    else
+      unexpected=$((unexpected + 1))
+      echo " ! $key"
+    fi
+  done <<< "$keys"
+done < "$SUITE_MAP"
+
+# Blacklist entries that didn't fire this run. If a case stays stale
+# across multiple runs it's a candidate for removal from blacklist.txt.
+# `${fired[$entry]+x}` is used rather than `[[ -v fired[...] ]]` because the
+# latter evaluates the subscript a second time on some bash versions, which
+# misbehaves for test names containing shell metacharacters.
+stale=""
+if [[ -f "$BLACKLIST_FILE" ]]; then
+  stale=$(while IFS= read -r entry; do
+    [[ -z "$entry" ]] && continue
+    [[ -n "${fired[$entry]+x}" ]] || echo " ? $entry"
+  done < "$BLACKLIST_FILE")
+fi
+if [[ -n "$stale" ]]; then
+  echo "stale blacklist entries (didn't fail this run; remove if consistently passing):"
+  echo "$stale"
+fi
+
+echo "expected failures: $expected (on blacklist; not counted)"
+echo "unexpected failures: $unexpected"
+exit $((unexpected > 0 ? 1 : 0))
diff --git a/scripts/gluten_ut/slow_suites.txt b/scripts/gluten_ut/slow_suites.txt
new file mode 100644
index 000000000..edb0e7e22
--- /dev/null
+++ b/scripts/gluten_ut/slow_suites.txt
@@ -0,0 +1,13 @@
+org.apache.spark.sql.GlutenSQLQueryTestSuite
+org.apache.spark.sql.execution.datasources.parquet.GlutenParquetV2FilterSuite
+org.apache.spark.sql.execution.datasources.parquet.GlutenParquetV1FilterSuite
+org.apache.spark.sql.execution.datasources.parquet.GlutenParquetRowIndexSuite
+org.apache.spark.sql.catalyst.expressions.GlutenCastSuite
+org.apache.spark.sql.catalyst.expressions.GlutenTryCastSuite
+org.apache.spark.sql.catalyst.expressions.GlutenDateExpressionsSuite
+org.apache.spark.sql.execution.datasources.orc.GlutenOrcV2SchemaPruningSuite
+org.apache.spark.sql.execution.datasources.orc.GlutenOrcV1SchemaPruningSuite
+org.apache.spark.sql.execution.datasources.parquet.GlutenParquetV2SchemaPruningSuite
+org.apache.spark.sql.execution.datasources.parquet.GlutenParquetV1SchemaPruningSuite
+org.apache.spark.sql.connector.GlutenWriteDistributionAndOrderingSuite
+org.apache.spark.sql.catalyst.expressions.GlutenPredicateSuite