Merge pull request #15 from Affirm/hossein/rebase-master-from-upstream

Hossein/rebase master from upstream
Affirm · Aug 3, 2021 · 6bb1615 · 6bb1615
2 parents bd7510b + 530c8ad
commit 6bb1615
Show file tree

Hide file tree

Showing 6,057 changed files with 674,942 additions and 291,286 deletions.
diff --git a/.asf.yaml b/.asf.yaml
@@ -0,0 +1,29 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# https://cwiki.apache.org/confluence/display/INFRA/git+-+.asf.yaml+features
+---
+github:
+  description: "Apache Spark - A unified analytics engine for large-scale data processing"
+  homepage: https://spark.apache.org/
+  labels:
+    - python
+    - scala
+    - r
+    - java
+    - big-data
+    - jdbc
+    - sql
+    - spark
diff --git a/.gitattributes b/.gitattributes
@@ -1,2 +1,7 @@
 *.bat text eol=crlf
 *.cmd text eol=crlf
+*.java text eol=lf
+*.scala text eol=lf
+*.xml text eol=lf
+*.py text eol=lf
+*.R text eol=lf
diff --git a/.github/PULL_REQUEST_TEMPLATE b/.github/PULL_REQUEST_TEMPLATE
@@ -6,6 +6,10 @@ Thanks for sending a pull request!  Here are some tips for you:
   4. Be sure to keep the PR description updated to reflect all changes.
   5. Please write your PR title to summarize what this PR proposes.
   6. If possible, provide a concise example to reproduce the issue for a faster review.
+  7. If you want to add a new configuration, please read the guideline first for naming configurations in
+     'core/src/main/scala/org/apache/spark/internal/config/ConfigEntry.scala'.
+  8. If you want to add or modify an error type or message, please read the guideline first in
+     'core/src/main/resources/error/README.md'.
 -->
 
 ### What changes were proposed in this pull request?
@@ -27,9 +31,11 @@ Please clarify why the changes are needed. For instance,
 -->
 
 
-### Does this PR introduce any user-facing change?
+### Does this PR introduce _any_ user-facing change?
 <!--
+Note that it means *any* user-facing change including all aspects such as the documentation fix.
 If yes, please clarify the previous behavior and the change this PR proposes - provide the console output, description and/or an example to show the behavior difference if possible.
+If possible, please also clarify if this is a user-facing change compared to the released Spark versions or within the unreleased branches such as master.
 If no, write 'No'.
 -->
 

diff --git a/.github/labeler.yml b/.github/labeler.yml
@@ -0,0 +1,152 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+#
+# Pull Request Labeler Github Action Configuration: https://github.com/marketplace/actions/labeler
+#
+# Note that we currently cannot use the negation operator (i.e. `!`) for miniglob matches as they
+# would match any file that doesn't touch them. What's needed is the concept of `any`, which takes a
+# list of constraints / globs and then matches all of the constraints for either `any` of the files or
+# `all` of the files in the change set.
+#
+# However, `any`/`all` are not supported in a released version and testing off of the `main` branch
+# resulted in some other errors when testing.
+#
+# An issue has been opened upstream requesting that a release be cut that has support for all/any:
+#   - https://github.com/actions/labeler/issues/111
+#
+# While we wait for this issue to be handled upstream, we can remove
+# the negated / `!` matches for now and at least have labels again.
+#
+INFRA:
+  - ".github/**/*"
+  - "appveyor.yml"
+  - "tools/**/*"
+  - "dev/create-release/**/*"
+  - ".asf.yaml"
+  - ".gitattributes"
+  - ".gitignore"
+  - "dev/github_jira_sync.py"
+  - "dev/merge_spark_pr.py"
+  - "dev/run-tests-jenkins*"
+BUILD:
+  # Can be supported when a stable release with correct all/any is released
+  # - any: ['dev/**/*', '!dev/github_jira_sync.py', '!dev/merge_spark_pr.py', '!dev/.rat-excludes']
+  - "dev/**/*"
+  - "build/**/*"
+  - "project/**/*"
+  - "assembly/**/*"
+  - "**/*pom.xml"
+  - "bin/docker-image-tool.sh"
+  - "bin/find-spark-home*"
+  - "scalastyle-config.xml"
+  # These can be added in the above `any` clause (and the /dev/**/* glob removed) when
+  # `any`/`all` support is released
+  # - "!dev/github_jira_sync.py"
+  # - "!dev/merge_spark_pr.py"
+  # - "!dev/run-tests-jenkins*"
+  # - "!dev/.rat-excludes"
+DOCS:
+  - "docs/**/*"
+  - "**/README.md"
+  - "**/CONTRIBUTING.md"
+EXAMPLES:
+  - "examples/**/*"
+  - "bin/run-example*"
+# CORE needs to be updated when all/any are released upstream.
+CORE:
+  # - any: ["core/**/*", "!**/*UI.scala", "!**/ui/**/*"] # If any file matches all of the globs defined in the list started by `any`, label is applied.
+  - "core/**/*"
+  - "common/kvstore/**/*"
+  - "common/network-common/**/*"
+  - "common/network-shuffle/**/*"
+  # NOTE(review): the first glob below already matches every .py file under python/pyspark/
+  # (including tests/), so the second entry is redundant as written - confirm whether
+  # "python/pyspark/*.py" was intended for the first one.
+  - "python/pyspark/**/*.py"
+  - "python/pyspark/tests/**/*.py"
+SPARK SUBMIT:
+  - "bin/spark-submit*"
+SPARK SHELL:
+  - "repl/**/*"
+  - "bin/spark-shell*"
+SQL:
+  # Disabled until the labeler action releases `any`/`all` support (see the note at the top of
+  # this file); the negated globs below would be folded into this clause:
+  # - any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming.py", "!python/pyspark/sql/tests/test_streaming.py"]
+  - "**/sql/**/*"
+  - "common/unsafe/**/*"
+  # - "!python/pyspark/sql/avro/**/*"
+  # - "!python/pyspark/sql/streaming.py"
+  # - "!python/pyspark/sql/tests/test_streaming.py"
+  - "bin/spark-sql*"
+  - "bin/beeline*"
+  - "sbin/*thriftserver*.sh"
+  - "**/*SQL*.R"
+  - "**/DataFrame.R"
+  - "**/*WindowSpec.R"
+  - "**/*catalog.R"
+  - "**/*column.R"
+  - "**/*functions.R"
+  - "**/*group.R"
+  - "**/*schema.R"
+  - "**/*types.R"
+AVRO:
+  - "external/avro/**/*"
+  - "python/pyspark/sql/avro/**/*"
+DSTREAM:
+  - "streaming/**/*"
+  - "data/streaming/**/*"
+  # A single `*` does not cross `/`, so the module directories need a trailing `/**/*`
+  # for the label to be applied to files inside them.
+  # NOTE(review): "external/kafka*/**/*" also covers external/kafka-0-10-sql, which is listed
+  # under STRUCTURED STREAMING - confirm that applying both labels is acceptable.
+  - "external/kinesis*/**/*"
+  - "external/kafka*/**/*"
+  - "python/pyspark/streaming/**/*"
+GRAPHX:
+  - "graphx/**/*"
+  - "data/graphx/**/*"
+ML:
+  - "**/ml/**/*"
+  - "**/*mllib_*.R"
+MLLIB:
+  - "**/spark/mllib/**/*"
+  - "mllib-local/**/*"
+  - "python/pyspark/mllib/**/*"
+STRUCTURED STREAMING:
+  - "**/sql/**/streaming/**/*"
+  - "external/kafka-0-10-sql/**/*"
+  - "python/pyspark/sql/streaming.py"
+  - "python/pyspark/sql/tests/test_streaming.py"
+  - "**/*streaming.R"
+PYTHON:
+  - "bin/pyspark*"
+  - "**/python/**/*"
+R:
+  - "**/r/**/*"
+  - "**/R/**/*"
+  - "bin/sparkR*"
+YARN:
+  - "resource-managers/yarn/**/*"
+MESOS:
+  - "resource-managers/mesos/**/*"
+  - "sbin/*mesos*.sh"
+KUBERNETES:
+  - "resource-managers/kubernetes/**/*"
+WINDOWS:
+  - "**/*.cmd"
+  - "R/pkg/tests/fulltests/test_Windows.R"
+WEB UI:
+  - "**/ui/**/*"
+  - "**/*UI.scala"
+DEPLOY:
+  - "sbin/**/*"
+
diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
@@ -0,0 +1,102 @@
+name: Run benchmarks
+
+on:
+  workflow_dispatch:
+    inputs:
+      class:
+        description: 'Benchmark class'
+        required: true
+        default: '*'
+      jdk:
+        description: 'JDK version: 8 or 11'
+        required: true
+        default: '8'
+      failfast:
+        description: 'Failfast: true or false'
+        required: true
+        default: 'true'
+      num-splits:
+        description: 'Number of job splits'
+        required: true
+        default: '1'
+
+jobs:
+  matrix-gen:
+    name: Generate matrix for job splits
+    runs-on: ubuntu-20.04
+    outputs:
+      matrix: ${{ steps.set-matrix.outputs.matrix }}
+    env:
+      SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
+    steps:
+    - name: Generate matrix
+      id: set-matrix
+      # Publishes a JSON array with one entry per split (e.g. [1,2,3]) as the `matrix` step output.
+      # Written to $GITHUB_OUTPUT because the `::set-output` workflow command is deprecated.
+      run: echo "matrix=[$(seq -s, 1 "$SPARK_BENCHMARK_NUM_SPLITS")]" >> "$GITHUB_OUTPUT"
+
+  benchmark:
+    name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)"
+    needs: matrix-gen
+    # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
+    runs-on: ubuntu-20.04
+    strategy:
+      fail-fast: false
+      matrix:
+        split: ${{fromJSON(needs.matrix-gen.outputs.matrix)}}
+    env:
+      SPARK_BENCHMARK_FAILFAST: ${{ github.event.inputs.failfast }}
+      SPARK_BENCHMARK_NUM_SPLITS: ${{ github.event.inputs.num-splits }}
+      SPARK_BENCHMARK_CUR_SPLIT: ${{ matrix.split }}
+      SPARK_GENERATE_BENCHMARK_FILES: 1
+      SPARK_LOCAL_IP: localhost
+      # To prevent spark.test.home not being set. See more detail in SPARK-36007.
+      SPARK_HOME: ${{ github.workspace }}
+    steps:
+    - name: Checkout Spark repository
+      uses: actions/checkout@v2
+      # In order to get diff files
+      with:
+        fetch-depth: 0
+    - name: Cache Scala, SBT and Maven
+      uses: actions/cache@v2
+      with:
+        path: |
+          build/apache-maven-*
+          build/scala-*
+          build/*.jar
+          ~/.sbt
+        key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
+        restore-keys: |
+          build-
+    - name: Cache Coursier local repository
+      uses: actions/cache@v2
+      with:
+        path: ~/.cache/coursier
+        key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
+        restore-keys: |
+          benchmark-coursier-${{ github.event.inputs.jdk }}
+    - name: Install Java ${{ github.event.inputs.jdk }}
+      uses: actions/setup-java@v1
+      with:
+        java-version: ${{ github.event.inputs.jdk }}
+    - name: Run benchmarks
+      # The user-supplied workflow inputs are handed to the script through environment variables
+      # rather than being expanded into the script text, so their contents are always treated as
+      # data and can never be interpreted as shell syntax.
+      env:
+        BENCHMARK_CLASS: ${{ github.event.inputs.class }}
+        BENCHMARK_JDK: ${{ github.event.inputs.jdk }}
+      run: |
+        ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl test:package
+        # Make less noisy
+        cp conf/log4j.properties.template conf/log4j.properties
+        sed -i 's/log4j.rootCategory=INFO, console/log4j.rootCategory=WARN, console/g' conf/log4j.properties
+        # In benchmark, we use local as master so set driver memory only. Note that GitHub Actions has 7 GB memory limit.
+        bin/spark-submit \
+          --driver-memory 6g --class org.apache.spark.benchmark.Benchmarks \
+          --jars "`find . -name '*-SNAPSHOT-tests.jar' -o -name '*avro*-SNAPSHOT.jar' | paste -sd ',' -`" \
+          "`find . -name 'spark-core*-SNAPSHOT-tests.jar'`" \
+          "$BENCHMARK_CLASS"
+        # To keep the directory structure and file permissions, tar them
+        # See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files
+        echo "Preparing the benchmark results:"
+        # The file lists are intentionally left unquoted so that each path becomes its own argument.
+        tar -cvf "benchmark-results-$BENCHMARK_JDK.tar" `git diff --name-only` `git ls-files --others --exclude-standard`
+    - name: Upload benchmark results
+      uses: actions/upload-artifact@v2
+      with:
+        name: benchmark-results-${{ github.event.inputs.jdk }}-${{ matrix.split }}
+        path: benchmark-results-${{ github.event.inputs.jdk }}.tar
+