Skip to content

Use only equals in UnsafeRow sort before repartition #51

Use only equals in UnsafeRow sort before repartition

Use only equals in UnsafeRow sort before repartition #51

#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
name: Build and test
on:
push:
branches:
- '**'
workflow_call:
inputs:
ansi_enabled:
required: false
type: boolean
default: false
jobs:
configure-jobs:
name: Configure jobs
runs-on: ubuntu-20.04
outputs:
java: ${{ steps.set-outputs.outputs.java }}
branch: ${{ steps.set-outputs.outputs.branch }}
hadoop: ${{ steps.set-outputs.outputs.hadoop }}
type: ${{ steps.set-outputs.outputs.type }}
envs: ${{ steps.set-outputs.outputs.envs }}
steps:
- name: Configure branch and additional environment variables
id: set-outputs
run: |
if [ "${{ github.event.schedule }}" = "0 1 * * *" ]; then
echo '::set-output name=java::8'
echo '::set-output name=branch::master'
echo '::set-output name=type::scheduled'
echo '::set-output name=envs::{}'
echo '::set-output name=hadoop::hadoop2'
elif [ "${{ github.event.schedule }}" = "0 4 * * *" ]; then
echo '::set-output name=java::8'
echo '::set-output name=branch::master'
echo '::set-output name=type::scheduled'
echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}'
echo '::set-output name=hadoop::hadoop3'
elif [ "${{ github.event.schedule }}" = "0 7 * * *" ]; then
echo '::set-output name=java::8'
echo '::set-output name=branch::branch-3.2'
echo '::set-output name=type::scheduled'
echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}'
echo '::set-output name=hadoop::hadoop3.2'
elif [ "${{ github.event.schedule }}" = "0 10 * * *" ]; then
echo '::set-output name=java::8'
echo '::set-output name=branch::master'
echo '::set-output name=type::pyspark-coverage-scheduled'
echo '::set-output name=envs::{"PYSPARK_CODECOV": "true"}'
echo '::set-output name=hadoop::hadoop3'
elif [ "${{ github.event.schedule }}" = "0 13 * * *" ]; then
echo '::set-output name=java::11'
echo '::set-output name=branch::master'
echo '::set-output name=type::scheduled'
echo '::set-output name=envs::{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}'
echo '::set-output name=hadoop::hadoop3'
elif [ "${{ github.event.schedule }}" = "0 16 * * *" ]; then
echo '::set-output name=java::17'
echo '::set-output name=branch::master'
echo '::set-output name=type::scheduled'
echo '::set-output name=envs::{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}'
echo '::set-output name=hadoop::hadoop3'
else
echo '::set-output name=java::8'
echo '::set-output name=branch::branch-3.3' # Default branch to run on. CHANGE here when a branch is cut out.
echo '::set-output name=type::regular'
echo '::set-output name=envs::{"SPARK_ANSI_SQL_MODE": "${{ inputs.ansi_enabled }}"}'
echo '::set-output name=hadoop::hadoop3'
fi
precondition:
name: Check changes
runs-on: ubuntu-20.04
env:
GITHUB_PREV_SHA: ${{ github.event.before }}
outputs:
required: ${{ steps.set-outputs.outputs.required }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: apache/spark
ref: branch-3.3
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
- name: Check all modules
id: set-outputs
run: |
build=`./dev/is-changed.py -m avro,build,catalyst,core,docker-integration-tests,examples,graphx,hadoop-cloud,hive,hive-thriftserver,kubernetes,kvstore,launcher,mesos,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,spark-ganglia-lgpl,sparkr,sql,sql-kafka-0-10,streaming,streaming-kafka-0-10,streaming-kinesis-asl,tags,unsafe,yarn`
pyspark=`./dev/is-changed.py -m avro,build,catalyst,core,graphx,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,sql,tags,unsafe`
sparkr=`./dev/is-changed.py -m avro,build,catalyst,core,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,repl,sketch,sparkr,sql,tags,unsafe`
tpcds=`./dev/is-changed.py -m build,catalyst,core,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe`
docker=`./dev/is-changed.py -m build,catalyst,core,docker-integration-tests,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe`
echo "{\"build\": \"$build\", \"pyspark\": \"$pyspark\", \"sparkr\": \"$sparkr\", \"tpcds\": \"$tpcds\", \"docker\": \"$docker\"}" > required.json
cat required.json
echo "::set-output name=required::$(cat required.json)"
# Build: build Spark and run the tests for specified modules.
build:
name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})"
needs: [configure-jobs, precondition]
# Run scheduled jobs for Apache Spark only
# Run regular jobs for commit in both Apache Spark and forked repository
if: >-
(github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled')
|| (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true')
# Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
runs-on: ubuntu-20.04
strategy:
fail-fast: false
matrix:
java:
- ${{ needs.configure-jobs.outputs.java }}
hadoop:
- ${{ needs.configure-jobs.outputs.hadoop }}
hive:
- hive2.3
# TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now.
# Kinesis tests depends on external Amazon kinesis service.
# Note that the modules below are from sparktestsupport/modules.py.
modules:
- >-
core, unsafe, kvstore, avro,
network-common, network-shuffle, repl, launcher,
examples, sketch, graphx
- >-
catalyst, hive-thriftserver
- >-
streaming, sql-kafka-0-10, streaming-kafka-0-10,
mllib-local, mllib,
yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl
# Here, we split Hive and SQL tests into some of slow ones and the rest of them.
included-tags: [""]
excluded-tags: [""]
comment: [""]
include:
# Hive tests
- modules: hive
java: ${{ needs.configure-jobs.outputs.java }}
hadoop: ${{ needs.configure-jobs.outputs.hadoop }}
hive: hive2.3
included-tags: org.apache.spark.tags.SlowHiveTest
comment: "- slow tests"
- modules: hive
java: ${{ needs.configure-jobs.outputs.java }}
hadoop: ${{ needs.configure-jobs.outputs.hadoop }}
hive: hive2.3
excluded-tags: org.apache.spark.tags.SlowHiveTest
comment: "- other tests"
# SQL tests
- modules: sql
java: ${{ needs.configure-jobs.outputs.java }}
hadoop: ${{ needs.configure-jobs.outputs.hadoop }}
hive: hive2.3
included-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- slow tests"
- modules: sql
java: ${{ needs.configure-jobs.outputs.java }}
hadoop: ${{ needs.configure-jobs.outputs.hadoop }}
hive: hive2.3
excluded-tags: org.apache.spark.tags.ExtendedSQLTest
comment: "- other tests"
env:
MODULES_TO_TEST: ${{ matrix.modules }}
EXCLUDED_TAGS: ${{ matrix.excluded-tags }}
INCLUDED_TAGS: ${{ matrix.included-tags }}
HADOOP_PROFILE: ${{ matrix.hadoop }}
HIVE_PROFILE: ${{ matrix.hive }}
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: ${{ needs.configure-jobs.outputs.branch }}
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
${{ matrix.java }}-${{ matrix.hadoop }}-coursier-
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v1
with:
java-version: ${{ matrix.java }}
- name: Install Python 3.8
uses: actions/setup-python@v2
# We should install one Python that is higher then 3+ for SQL and Yarn because:
# - SQL component also has Python related tests, for example, IntegratedUDFTestUtils.
# - Yarn has a Python specific test too, for example, YarnClusterSuite.
if: contains(matrix.modules, 'yarn') || (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
with:
python-version: 3.8
architecture: x64
- name: Install Python packages (Python 3.8)
if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-'))
run: |
python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy xmlrunner
python3.8 -m pip list
# Run the tests.
- name: Run tests
env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }}
run: |
# Hive "other tests" test needs larger metaspace size based on experiment.
if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi
export SERIAL_SBT_TESTS=1
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS"
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }}
path: "**/target/unit-tests.log"
pyspark:
needs: [configure-jobs, precondition]
# Run PySpark coverage scheduled jobs for Apache Spark only
# Run scheduled jobs with JDK 17 in Apache Spark
# Run regular jobs for commit in both Apache Spark and forked repository
if: >-
(github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'pyspark-coverage-scheduled')
|| (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled' && needs.configure-jobs.outputs.java == '17')
|| (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).pyspark == 'true')
name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }}"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20220207
strategy:
fail-fast: false
matrix:
java:
- ${{ needs.configure-jobs.outputs.java }}
modules:
- >-
pyspark-sql, pyspark-mllib, pyspark-resource
- >-
pyspark-core, pyspark-streaming, pyspark-ml
- >-
pyspark-pandas
- >-
pyspark-pandas-slow
env:
MODULES_TO_TEST: ${{ matrix.modules }}
HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }}
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
SKIP_UNIDOC: true
SKIP_MIMA: true
METASPACE_SIZE: 1g
SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: branch-3.3
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
pyspark-coursier-
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v1
with:
java-version: ${{ matrix.java }}
- name: List Python packages (Python 3.9, PyPy3)
run: |
python3.9 -m pip list
pypy3 -m pip list
- name: Install Conda for pip packaging test
run: |
curl -s https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh > miniconda.sh
bash miniconda.sh -b -p $HOME/miniconda
# Run the tests.
- name: Run tests
env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }}
run: |
export PATH=$PATH:$HOME/miniconda/bin
./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST"
- name: Upload coverage to Codecov
if: needs.configure-jobs.outputs.type == 'pyspark-coverage-scheduled'
uses: codecov/codecov-action@v2
with:
files: ./python/coverage.xml
flags: unittests
name: PySpark
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-${{ matrix.modules }}--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-${{ matrix.modules }}--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
path: "**/target/unit-tests.log"
sparkr:
needs: [configure-jobs, precondition]
if: >-
(needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).sparkr == 'true')
|| (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled' && needs.configure-jobs.outputs.java == '17')
name: "Build modules: sparkr"
runs-on: ubuntu-20.04
container:
image: dongjoon/apache-spark-github-action-image:20220207
env:
HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }}
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
SKIP_MIMA: true
SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
# In order to fetch changed files
with:
fetch-depth: 0
repository: apache/spark
ref: branch-3.3
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
sparkr-coursier-
- name: Install Java ${{ needs.configure-jobs.outputs.java }}
uses: actions/setup-java@v1
with:
java-version: ${{ needs.configure-jobs.outputs.java }}
- name: Run tests
run: |
# The followings are also used by `r-lib/actions/setup-r` to avoid
# R issues at docker environment
export TZ=UTC
export _R_CHECK_SYSTEM_CLOCK_=FALSE
./dev/run-tests --parallelism 1 --modules sparkr
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-sparkr--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
# Static analysis, and documentation build
lint:
needs: configure-jobs
if: needs.configure-jobs.outputs.type == 'regular'
name: Linters, licenses, dependencies and documentation generation
runs-on: ubuntu-20.04
env:
LC_ALL: C.UTF-8
LANG: C.UTF-8
PYSPARK_DRIVER_PYTHON: python3.9
PYSPARK_PYTHON: python3.9
GITHUB_PREV_SHA: ${{ github.event.before }}
container:
image: dongjoon/apache-spark-github-action-image:20220207
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: apache/spark
ref: branch-3.3
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
# Cache local repositories. Note that GitHub Actions cache has a 2G limit.
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
docs-coursier-
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: docs-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
docs-maven-
- name: Install Python linter dependencies
run: |
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
# Jinja2 3.0.0+ causes error when building with Sphinx.
# See also https://issues.apache.org/jira/browse/SPARK-35375.
python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==21.12b0'
python3.9 -m pip install 'pandas-stubs==1.2.0.53'
- name: Install R linter dependencies and SparkR
run: |
apt update
apt-get install -y libcurl4-openssl-dev libgit2-dev libssl-dev libxml2-dev \
libfontconfig1-dev libharfbuzz-dev libfribidi-dev libfreetype6-dev libpng-dev \
libtiff5-dev libjpeg-dev
Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')"
Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')"
./R/install-dev.sh
- name: Instll JavaScript linter dependencies
run: |
apt update
apt-get install -y nodejs npm
- name: Install dependencies for documentation generation
run: |
# pandoc is required to generate PySpark APIs as well in nbsphinx.
apt-get install -y libcurl4-openssl-dev pandoc
# TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes.
# See also https://github.com/sphinx-doc/sphinx/issues/7551.
# Jinja2 3.0.0+ causes error when building with Sphinx.
# See also https://issues.apache.org/jira/browse/SPARK-35375.
# Pin the MarkupSafe to 2.0.1 to resolve the CI error.
# See also https://issues.apache.org/jira/browse/SPARK-38279.
python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0'
python3.9 -m pip install ipython_genutils # See SPARK-38517
python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8'
python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421
apt-get update -y
apt-get install -y ruby ruby-dev
Rscript -e "install.packages(c('devtools', 'testthat', 'knitr', 'rmarkdown', 'markdown', 'e1071', 'roxygen2', 'ggplot2', 'mvtnorm', 'statmod'), repos='https://cloud.r-project.org/')"
Rscript -e "devtools::install_version('pkgdown', version='2.0.1', repos='https://cloud.r-project.org')"
Rscript -e "devtools::install_version('preferably', version='0.4', repos='https://cloud.r-project.org')"
gem install bundler
cd docs
bundle install
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Scala linter
run: ./dev/lint-scala
- name: Java linter
run: ./dev/lint-java
- name: Python linter
run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python
- name: R linter
run: ./dev/lint-r
- name: JS linter
run: ./dev/lint-js
- name: License test
run: ./dev/check-license
- name: Dependencies test
run: ./dev/test-dependencies.sh
- name: Run documentation build
run: |
if [ -f "./dev/is-changed.py" ]; then
# Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs
pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"`
if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi
if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi
fi
cd docs
bundle exec jekyll build
java-11-17:
needs: [configure-jobs, precondition]
if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true'
name: Java ${{ matrix.java }} build with Maven
strategy:
fail-fast: false
matrix:
java:
- 11
- 17
runs-on: ubuntu-20.04
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: apache/spark
ref: branch-3.3
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Maven local repository
uses: actions/cache@v2
with:
path: ~/.m2/repository
key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }}
restore-keys: |
java${{ matrix.java }}-maven-
- name: Install Java ${{ matrix.java }}
uses: actions/setup-java@v1
with:
java-version: ${{ matrix.java }}
- name: Build with Maven
run: |
export MAVEN_OPTS="-Xss64m -Xmx2g -XX:ReservedCodeCacheSize=1g -Dorg.slf4j.simpleLogger.defaultLogLevel=WARN"
export MAVEN_CLI_OPTS="--no-transfer-progress"
export JAVA_VERSION=${{ matrix.java }}
# It uses Maven's 'install' intentionally, see https://github.com/apache/spark/pull/26414.
./build/mvn $MAVEN_CLI_OPTS -DskipTests -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Djava.version=${JAVA_VERSION/-ea} install
rm -rf ~/.m2/repository/org/apache/spark
scala-213:
needs: [configure-jobs, precondition]
if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true'
name: Scala 2.13 build with SBT
runs-on: ubuntu-20.04
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: apache/spark
ref: branch-3.3
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
scala-213-coursier-
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Build with SBT
run: |
./dev/change-scala-version.sh 2.13
./build/sbt -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile
tpcds-1g:
needs: [configure-jobs, precondition]
if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).tpcds == 'true'
name: Run TPC-DS queries with SF=1
runs-on: ubuntu-20.04
env:
SPARK_LOCAL_IP: localhost
SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }}
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: apache/spark
ref: branch-3.3
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
tpcds-coursier-
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Cache TPC-DS generated data
id: cache-tpcds-sf-1
uses: actions/cache@v2
with:
path: ./tpcds-sf-1
key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
- name: Checkout tpcds-kit repository
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
uses: actions/checkout@v2
with:
repository: databricks/tpcds-kit
ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
path: ./tpcds-kit
- name: Build tpcds-kit
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
run: cd tpcds-kit/tools && make OS=LINUX
- name: Generate TPC-DS (SF=1) table data
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
run: build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"
- name: Run TPC-DS queries (Sort merge join)
run: |
SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=-1
spark.sql.join.preferSortMergeJoin=true
- name: Run TPC-DS queries (Broadcast hash join)
run: |
SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=10485760
- name: Run TPC-DS queries (Shuffled hash join)
run: |
SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite"
env:
SPARK_TPCDS_JOIN_CONF: |
spark.sql.autoBroadcastJoinThreshold=-1
spark.sql.join.forceApplyShuffledHashJoin=true
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-tpcds--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-tpcds--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
path: "**/target/unit-tests.log"
docker-integration-tests:
needs: [configure-jobs, precondition]
if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).docker == 'true'
name: Run Docker integration tests
runs-on: ubuntu-20.04
env:
HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }}
HIVE_PROFILE: hive2.3
GITHUB_PREV_SHA: ${{ github.event.before }}
SPARK_LOCAL_IP: localhost
ORACLE_DOCKER_IMAGE_NAME: gvenzl/oracle-xe:18.4.0
SKIP_MIMA: true
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
with:
fetch-depth: 0
repository: apache/spark
ref: branch-3.3
- name: Sync the current branch with the latest in Apache Spark
if: github.repository != 'apache/spark'
run: |
echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV
git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/}
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' merge --no-commit --progress --squash FETCH_HEAD
git -c user.name='Apache Spark Test Account' -c user.email='[email protected]' commit -m "Merged commit"
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
with:
path: ~/.cache/coursier
key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
docker-integration-coursier-
- name: Install Java 8
uses: actions/setup-java@v1
with:
java-version: 8
- name: Run tests
run: |
./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest
- name: Upload test results to report
if: always()
uses: actions/upload-artifact@v2
with:
name: test-results-docker-integration--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
path: "**/target/test-reports/*.xml"
- name: Upload unit tests log files
if: failure()
uses: actions/upload-artifact@v2
with:
name: unit-tests-log-docker-integration--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3
path: "**/target/unit-tests.log"