From 2d763b2e56f0e9505006531414474998159ee1d1 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Thu, 15 Apr 2021 20:15:48 +0800
Subject: [PATCH 01/35] Update README.md
---
README.md | 1 +
1 file changed, 1 insertion(+)
diff --git a/README.md b/README.md
index fc172bae..11d8279e 100644
--- a/README.md
+++ b/README.md
@@ -99,6 +99,7 @@ You will find jar files under oap-common/target and oap-spark/target.
To enable rdd cache on Intel Optane PMem, you need add the following configurations to `spark-defaults.conf`
```
+spark.memory.pmem.extension.enabled true
spark.memory.pmem.initial.path [Your Optane PMem paths, separated by commas]
spark.memory.pmem.initial.size [Your Optane PMem size in GB]
spark.memory.pmem.usable.ratio [from 0 to 1, 0.85 is recommended]
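
For reference, the PMem settings introduced above can also be set programmatically instead of via `spark-defaults.conf`. The sketch below is illustrative only and not part of the patch; the path and size values are placeholders to be replaced with the actual PMem mounts and capacity.

```java
import org.apache.spark.SparkConf;

public class PMemRddCacheConfExample {
  public static void main(String[] args) {
    // Placeholder values; use your real Optane PMem mount paths and capacity.
    SparkConf conf = new SparkConf()
        .setAppName("pmem-rdd-cache-example")
        .setMaster("local[*]")
        .set("spark.memory.pmem.extension.enabled", "true")   // key added by this patch
        .set("spark.memory.pmem.initial.path", "/mnt/pmem0,/mnt/pmem1")
        .set("spark.memory.pmem.initial.size", "64GB")
        .set("spark.memory.pmem.usable.ratio", "0.85");
    // Sanity-check that the keys are visible to the configuration.
    System.out.println(conf.get("spark.memory.pmem.usable.ratio"));
  }
}
```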
From e06463b4a2e810a31bf384a32ffe7d190fe531c3 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:14:33 +0800
Subject: [PATCH 02/35] Update pom.xml
---
pom.xml | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/pom.xml b/pom.xml
index cc236bec..f66d5cc4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -15,7 +15,7 @@
4.12
2.12.10
2.12
- 3.0.0
+ 3.1.1
1.2
2.6
9.4.39.v20210325
From 358c891ea0039354eaec8f5113aad085f7660172 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:15:37 +0800
Subject: [PATCH 03/35] Update MemoryConsumer.java
From a46dddca28b8854803db7487cdda0172856c3c3b Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:15:59 +0800
Subject: [PATCH 04/35] Update PMemManagerInitializer.java
From 832cd473d737e48a8eeaa2d3b67d6437f1794300 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:16:16 +0800
Subject: [PATCH 05/35] Update TaskMemoryManager.java
From 1b59a13f41d18c724404e9b401e674ff6e24fdc0 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:18:41 +0800
Subject: [PATCH 06/35] Update BytesToBytesMap.java
---
.../spark/unsafe/map/BytesToBytesMap.java | 114 +++++++++++++++---
1 file changed, 95 insertions(+), 19 deletions(-)
diff --git a/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
index e7463908..d691baab 100644
--- a/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
+++ b/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java
@@ -393,10 +393,12 @@ public void remove() {
}
private void handleFailedDelete() {
- // remove the spill file from disk
- File file = spillWriters.removeFirst().getFile();
- if (file != null && file.exists() && !file.delete()) {
- logger.error("Was unable to delete spill file {}", file.getAbsolutePath());
+ if (spillWriters.size() > 0) {
+ // remove the spill file from disk
+ File file = spillWriters.removeFirst().getFile();
+ if (file != null && file.exists() && !file.delete()) {
+ logger.error("Was unable to delete spill file {}", file.getAbsolutePath());
+ }
}
}
}
@@ -406,17 +408,10 @@ private void handleFailedDelete() {
*
* For efficiency, all calls to `next()` will return the same {@link Location} object.
*
- * If any other lookups or operations are performed on this map while iterating over it, including
- * `lookup()`, the behavior of the returned iterator is undefined.
+ * The returned iterator is thread-safe. However if the map is modified while iterating over it,
+ * the behavior of the returned iterator is undefined.
*/
public MapIterator iterator() {
- return new MapIterator(numValues, loc, false);
- }
-
- /**
- * Returns a thread safe iterator that iterates of the entries of this map.
- */
- public MapIterator safeIterator() {
return new MapIterator(numValues, new Location(), false);
}
@@ -427,19 +422,82 @@ public MapIterator safeIterator() {
*
* For efficiency, all calls to `next()` will return the same {@link Location} object.
*
- * If any other lookups or operations are performed on this map while iterating over it, including
- * `lookup()`, the behavior of the returned iterator is undefined.
+ * The returned iterator is thread-safe. However if the map is modified while iterating over it,
+ * the behavior of the returned iterator is undefined.
*/
public MapIterator destructiveIterator() {
updatePeakMemoryUsed();
- return new MapIterator(numValues, loc, true);
+ return new MapIterator(numValues, new Location(), true);
+ }
+
+ /**
+ * Iterator for the entries of this map. This is to first iterate over key indices in
+ * `longArray` then accessing values in `dataPages`. NOTE: this is different from `MapIterator`
+ * in the sense that key index is preserved here
+ * (See `UnsafeHashedRelation` for example of usage).
+ */
+ public final class MapIteratorWithKeyIndex implements Iterator<Location> {
+
+ /**
+ * The index in `longArray` where the key is stored.
+ */
+ private int keyIndex = 0;
+
+ private int numRecords;
+ private final Location loc;
+
+ private MapIteratorWithKeyIndex() {
+ this.numRecords = numValues;
+ this.loc = new Location();
+ }
+
+ @Override
+ public boolean hasNext() {
+ return numRecords > 0;
+ }
+
+ @Override
+ public Location next() {
+ if (!loc.isDefined() || !loc.nextValue()) {
+ while (longArray.get(keyIndex * 2) == 0) {
+ keyIndex++;
+ }
+ loc.with(keyIndex, 0, true);
+ keyIndex++;
+ }
+ numRecords--;
+ return loc;
+ }
+ }
+
+ /**
+ * Returns an iterator for iterating over the entries of this map,
+ * by first iterating over the key index inside hash map's `longArray`.
+ *
+ * For efficiency, all calls to `next()` will return the same {@link Location} object.
+ *
+ * The returned iterator is NOT thread-safe. If the map is modified while iterating over it,
+ * the behavior of the returned iterator is undefined.
+ */
+ public MapIteratorWithKeyIndex iteratorWithKeyIndex() {
+ return new MapIteratorWithKeyIndex();
+ }
+
+ /**
+ * The maximum number of allowed keys index.
+ *
+ * The value of allowed keys index is in the range of [0, maxNumKeysIndex - 1].
+ */
+ public int maxNumKeysIndex() {
+ return (int) (longArray.size() / 2);
}
/**
* Looks up a key, and return a {@link Location} handle that can be used to test existence
* and read/write values.
*
- * This function always return the same {@link Location} instance to avoid object allocation.
+ * This function always returns the same {@link Location} instance to avoid object allocation.
+ * This function is not thread-safe.
*/
public Location lookup(Object keyBase, long keyOffset, int keyLength) {
safeLookup(keyBase, keyOffset, keyLength, loc,
@@ -451,7 +509,8 @@ public Location lookup(Object keyBase, long keyOffset, int keyLength) {
* Looks up a key, and return a {@link Location} handle that can be used to test existence
* and read/write values.
*
- * This function always return the same {@link Location} instance to avoid object allocation.
+ * This function always returns the same {@link Location} instance to avoid object allocation.
+ * This function is not thread-safe.
*/
public Location lookup(Object keyBase, long keyOffset, int keyLength, int hash) {
safeLookup(keyBase, keyOffset, keyLength, loc, hash);
@@ -606,6 +665,14 @@ public boolean isDefined() {
return isDefined;
}
+ /**
+ * Returns index for key.
+ */
+ public int getKeyIndex() {
+ assert (isDefined);
+ return pos;
+ }
+
/**
* Returns the base object for key.
*/
@@ -743,14 +810,23 @@ public boolean append(Object kbase, long koff, int klen, Object vbase, long voff
longArray.set(pos * 2 + 1, keyHashcode);
isDefined = true;
+ // If the map has reached its growth threshold, try to grow it.
+ if (numKeys >= growthThreshold) {
// We use two array entries per key, so the array size is twice the capacity.
// We should compare the current capacity of the array, instead of its size.
- if (numKeys >= growthThreshold && longArray.size() / 2 < MAX_CAPACITY) {
+ if (longArray.size() / 2 < MAX_CAPACITY) {
try {
growAndRehash();
} catch (SparkOutOfMemoryError oom) {
canGrowArray = false;
}
+ } else {
+ // The map is already at MAX_CAPACITY and cannot grow. Instead, we prevent it from
+ // accepting any more new elements to make sure we don't exceed the load factor. If we
+ // need to spill later, this allows UnsafeKVExternalSorter to reuse the array for
+ // sorting.
+ canGrowArray = false;
+ }
}
}
return true;
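
The key-index iterator added above is intended for callers such as `UnsafeHashedRelation` that need a stable index per entry. A minimal usage sketch, assuming an already-constructed `BytesToBytesMap` (in practice created with a `TaskMemoryManager` by Spark's aggregate/join operators):

```java
import org.apache.spark.unsafe.map.BytesToBytesMap;

final class KeyIndexScanSketch {
  // Walk all entries while preserving each entry's index inside `longArray`.
  static void scan(BytesToBytesMap map) {
    BytesToBytesMap.MapIteratorWithKeyIndex it = map.iteratorWithKeyIndex();
    while (it.hasNext()) {
      // The iterator reuses a single Location object across next() calls.
      BytesToBytesMap.Location loc = it.next();
      int keyIndex = loc.getKeyIndex();     // stays in [0, map.maxNumKeysIndex())
      int keyLength = loc.getKeyLength();
      int valueLength = loc.getValueLength();
      // ... read key/value bytes via loc.getKeyBase()/getValueBase() and the offsets ...
    }
  }
}
```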
From 1320ad16694812ecd3710c6178a8ae13069305c3 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:20:12 +0800
Subject: [PATCH 07/35] Update MemoryAllocator.java
From 3ae52281e7aa198e45197e65b907d144ff5aead5 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:20:30 +0800
Subject: [PATCH 08/35] Update MemoryBlock.java
From 6350d1b421f9b54bd75cb4412010f50081d15b16 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:20:54 +0800
Subject: [PATCH 09/35] Update PMemReader.java
---
.../apache/spark/util/collection/unsafe/sort/PMemReader.java | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/main/java/org/apache/spark/util/collection/unsafe/sort/PMemReader.java b/src/main/java/org/apache/spark/util/collection/unsafe/sort/PMemReader.java
index 1d2b1ee3..a838cc7d 100644
--- a/src/main/java/org/apache/spark/util/collection/unsafe/sort/PMemReader.java
+++ b/src/main/java/org/apache/spark/util/collection/unsafe/sort/PMemReader.java
@@ -79,6 +79,11 @@ public int getRecordLength() {
public long getKeyPrefix() {
return keyPrefix;
}
+
+ @Override
+ public long getCurrentPageNumber() {
+ throw new UnsupportedOperationException();
+ }
@Override
public void close() {
From 6a0703f4ee1143f2b27345d6ddb300dd6e490a84 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:21:10 +0800
Subject: [PATCH 10/35] Update PMemReaderForUnsafeExternalSorter.java
---
.../unsafe/sort/PMemReaderForUnsafeExternalSorter.java | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/main/java/org/apache/spark/util/collection/unsafe/sort/PMemReaderForUnsafeExternalSorter.java b/src/main/java/org/apache/spark/util/collection/unsafe/sort/PMemReaderForUnsafeExternalSorter.java
index 4a39648b..018d2785 100644
--- a/src/main/java/org/apache/spark/util/collection/unsafe/sort/PMemReaderForUnsafeExternalSorter.java
+++ b/src/main/java/org/apache/spark/util/collection/unsafe/sort/PMemReaderForUnsafeExternalSorter.java
@@ -64,6 +64,11 @@ public void loadNext() {
public int getNumRecords() {
return numRecords;
}
+
+ @Override
+ public long getCurrentPageNumber() {
+ throw new UnsupportedOperationException();
+ }
/**
* load more PMem records in the buffer
From 8d852bf0b8094e6b0b56576594da2a72234c4b43 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:21:29 +0800
Subject: [PATCH 11/35] Update PMemSpillWriterFactory.java
From b29b62f99cf94a024220632617b47a61684a4616 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:21:51 +0800
Subject: [PATCH 12/35] Update PMemSpillWriterType.java
From 0aa0fa86d051e97c3f99607e2cb8ae1a18c3359b Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:22:08 +0800
Subject: [PATCH 13/35] Update PMemWriter.java
From 9e0c90f62992f3b76be859414ba3bbcc271a878b Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:22:24 +0800
Subject: [PATCH 14/35] Update SortedIteratorForSpills.java
From 1fa170624c1be847d9c6a97113a5b0efe80ae0a2 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:23:04 +0800
Subject: [PATCH 15/35] Update SortedPMemPageSpillWriter.java
---
.../collection/unsafe/sort/SortedPMemPageSpillWriter.java | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/src/main/java/org/apache/spark/util/collection/unsafe/sort/SortedPMemPageSpillWriter.java b/src/main/java/org/apache/spark/util/collection/unsafe/sort/SortedPMemPageSpillWriter.java
index bd8ca9c8..3b1324f9 100644
--- a/src/main/java/org/apache/spark/util/collection/unsafe/sort/SortedPMemPageSpillWriter.java
+++ b/src/main/java/org/apache/spark/util/collection/unsafe/sort/SortedPMemPageSpillWriter.java
@@ -212,6 +212,12 @@ public SortedPMemPageSpillReader() throws IOException{
numRecordsOnDisk = diskSpillWriter.recordsSpilled();
}
}
+
+ @Override
+ public long getCurrentPageNumber() {
+ throw new UnsupportedOperationException();
+ }
+
@Override
public boolean hasNext() {
return curNumOfRec < numRecordsOnPMem + numRecordsOnDisk;
From e64a414ac1ac1e1b1a61c47829f1a8229435bdad Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:23:26 +0800
Subject: [PATCH 16/35] Update SpillWriterForUnsafeSorter.java
From 715bab38601c455c2f4abb0859828b1e7ea768d0 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:23:42 +0800
Subject: [PATCH 17/35] Update UnsafeExternalSorter.java
---
.../unsafe/sort/UnsafeExternalSorter.java | 195 ++++++++++++------
1 file changed, 132 insertions(+), 63 deletions(-)
diff --git a/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
index 9f27c80b..7ca3f334 100644
--- a/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
+++ b/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java
@@ -113,11 +113,14 @@ public static UnsafeExternalSorter createWithExistingInMemorySorter(
int initialSize,
long pageSizeBytes,
int numElementsForSpillThreshold,
- UnsafeInMemorySorter inMemorySorter) throws IOException {
+ UnsafeInMemorySorter inMemorySorter,
+ long existingMemoryConsumption) throws IOException {
UnsafeExternalSorter sorter = new UnsafeExternalSorter(taskMemoryManager, blockManager,
serializerManager, taskContext, recordComparatorSupplier, prefixComparator, initialSize,
pageSizeBytes, numElementsForSpillThreshold, inMemorySorter, false /* ignored */);
sorter.spill(Long.MAX_VALUE, sorter);
+ taskContext.taskMetrics().incMemoryBytesSpilled(existingMemoryConsumption);
+ sorter.totalSpillBytes += existingMemoryConsumption;
// The external sorter will be used to insert records, in-memory sorter is not needed.
sorter.inMemSorter = null;
return sorter;
@@ -217,22 +220,26 @@ public long spill(long size, MemoryConsumer trigger) throws IOException {
}
if (inMemSorter == null || inMemSorter.numRecords() <= 0) {
+ // There could still be some memory allocated when there are no records in the in-memory
+ // sorter. We will not spill it however, to ensure that we can always process at least one
+ // record before spilling. See the comments in `allocateMemoryForRecordIfNecessary` for why
+ // this is necessary.
return 0L;
}
logger.info("Thread {} spilling sort data of {} to disk ({} {} so far)",
Thread.currentThread().getId(),
Utils.bytesToString(getMemoryUsage()),
- spillWriters.size() ,
+ spillWriters.size(),
spillWriters.size() > 1 ? " times" : " time");
ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics();
// Sorting records or not will be handled by different spill writer, here null is given instead.
spillWithWriter(null, inMemSorter.numRecords(), writeMetrics, false);
final long spillSize = freeMemory();
- inMemSorter.reset();
// Note that this is more-or-less going to be a multiple of the page size, so wasted space in
// pages will currently be counted as memory spilled even though that space isn't actually
// written to disk. This also counts the space needed to store the sorter's pointer array.
+ inMemSorter.freeMemory();
// Reset the in-memory sorter's pointer array only after freeing up the memory pages holding the
// records. Otherwise, if the task is over allocated memory, then without freeing the memory
// pages, we might not be able to get memory for the pointer array.
@@ -262,14 +269,9 @@ public SpillWriterForUnsafeSorter spillWithWriter(
taskContext.taskMetrics(),
spillToPMemEnabled,
isSorted);
- if (spillWriter == null) {
- logger.error("failed to get SpillWriter, please check related configuration.");
- return null;
- } else {
- spillWriter.write();
- spillWriters.add(spillWriter);
- return spillWriter;
- }
+ spillWriter.write();
+ spillWriters.add(spillWriter);
+ return spillWriter;
}
/**
@@ -343,7 +345,6 @@ private long freeMemory() {
return memoryFreed;
}
-
private void freeSpills() {
for (SpillWriterForUnsafeSorter spillWriter : spillWriters) {
spillWriter.clearAll();
@@ -359,7 +360,7 @@ public void cleanupResources() {
freeSpills();
freeMemory();
if (inMemSorter != null) {
- inMemSorter.free();
+ inMemSorter.freeMemory();
inMemSorter = null;
}
}
@@ -373,40 +374,53 @@ public void cleanupResources() {
private void growPointerArrayIfNecessary() throws IOException {
assert(inMemSorter != null);
if (!inMemSorter.hasSpaceForAnotherRecord()) {
+ if (inMemSorter.numRecords() <= 0) {
+ // Spilling was triggered just before this method was called. The pointer array was freed
+ // during the spill, so a new pointer array needs to be allocated here.
+ LongArray array = allocateArray(inMemSorter.getInitialSize());
+ inMemSorter.expandPointerArray(array);
+ return;
+ }
+
long used = inMemSorter.getMemoryUsage();
- LongArray array;
+ LongArray array = null;
try {
// could trigger spilling
array = allocateArray(used / 8 * 2);
} catch (TooLargePageException e) {
// The pointer array is too big to fix in a single page, spill.
spill();
- return;
} catch (SparkOutOfMemoryError e) {
- // should have trigger spilling
- if (!inMemSorter.hasSpaceForAnotherRecord()) {
+ if (inMemSorter.numRecords() > 0) {
logger.error("Unable to grow the pointer array");
throw e;
}
- return;
+ // The new array could not be allocated, but that is not an issue as it is no longer needed,
+ // as all records were spilled.
}
- // check if spilling is triggered or not
- if (inMemSorter.hasSpaceForAnotherRecord()) {
- freeArray(array);
- } else {
- inMemSorter.expandPointerArray(array);
+
+ if (inMemSorter.numRecords() <= 0) {
+ // Spilling was triggered while trying to allocate the new array.
+ if (array != null) {
+ // We succeeded in allocating the new array, but, since all records were spilled, a
+ // smaller array would also suffice.
+ freeArray(array);
+ }
+ // The pointer array was freed during the spill, so a new pointer array needs to be
+ // allocated here.
+ array = allocateArray(inMemSorter.getInitialSize());
}
+ inMemSorter.expandPointerArray(array);
}
}
/**
- * Allocates more memory in order to insert an additional record. This will request additional
- * memory from the memory manager and spill if the requested memory can not be obtained.
+ * Allocates an additional page in order to insert an additional record. This will request
+ * additional memory from the memory manager and spill if the requested memory can not be
+ * obtained.
*
* @param required the required space in the data page, in bytes, including space for storing
- * the record size. This must be less than or equal to the page size (records
- * that exceed the page size are handled via a different code path which uses
- * special overflow pages).
+ * the record size.
*/
private void acquireNewPageIfNecessary(int required) {
if (currentPage == null ||
@@ -418,6 +432,37 @@ private void acquireNewPageIfNecessary(int required) {
}
}
+ /**
+ * Allocates more memory in order to insert an additional record. This will request additional
+ * memory from the memory manager and spill if the requested memory can not be obtained.
+ *
+ * @param required the required space in the data page, in bytes, including space for storing
+ * the record size.
+ */
+ private void allocateMemoryForRecordIfNecessary(int required) throws IOException {
+ // Step 1:
+ // Ensure that the pointer array has space for another record. This may cause a spill.
+ growPointerArrayIfNecessary();
+ // Step 2:
+ // Ensure that the last page has space for another record. This may cause a spill.
+ acquireNewPageIfNecessary(required);
+ // Step 3:
+ // The allocation in step 2 could have caused a spill, which would have freed the pointer
+ // array allocated in step 1. Therefore we need to check again whether we have to allocate
+ // a new pointer array.
+ //
+ // If the allocation in this step causes a spill event then it will not cause the page
+ // allocated in the previous step to be freed. The function `spill` only frees memory if at
+ // least one record has been inserted in the in-memory sorter. This will not be the case if
+ // we have spilled in the previous step.
+ //
+ // If we did not spill in the previous step then `growPointerArrayIfNecessary` will be a
+ // no-op that does not allocate any memory, and therefore can't cause a spill event.
+ //
+ // Thus there is no need to call `acquireNewPageIfNecessary` again after this step.
+ growPointerArrayIfNecessary();
+ }
+
/**
* Write a record to the sorter.
*/
@@ -432,11 +477,10 @@ public void insertRecord(
spill();
}
- growPointerArrayIfNecessary();
- int uaoSize = UnsafeAlignedOffset.getUaoSize();
+ final int uaoSize = UnsafeAlignedOffset.getUaoSize();
// Need 4 or 8 bytes to store the record length.
final int required = length + uaoSize;
- acquireNewPageIfNecessary(required);
+ allocateMemoryForRecordIfNecessary(required);
final Object base = currentPage.getBaseObject();
final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor);
@@ -459,10 +503,9 @@ public void insertKVRecord(Object keyBase, long keyOffset, int keyLen,
Object valueBase, long valueOffset, int valueLen, long prefix, boolean prefixIsNull)
throws IOException {
- growPointerArrayIfNecessary();
- int uaoSize = UnsafeAlignedOffset.getUaoSize();
+ final int uaoSize = UnsafeAlignedOffset.getUaoSize();
final int required = keyLen + valueLen + (2 * uaoSize);
- acquireNewPageIfNecessary(required);
+ allocateMemoryForRecordIfNecessary(required);
final Object base = currentPage.getBaseObject();
final long recordAddress = taskMemoryManager.encodePageNumberAndOffset(currentPage, pageCursor);
@@ -483,6 +526,7 @@ public void insertKVRecord(Object keyBase, long keyOffset, int keyLen,
*/
public void merge(UnsafeExternalSorter other) throws IOException {
other.spill();
+ totalSpillBytes += other.totalSpillBytes;
spillWriters.addAll(other.spillWriters);
// remove them from `spillWriters`, or the files will be deleted in `cleanupResources`.
other.spillWriters.clear();
@@ -523,10 +567,13 @@ public UnsafeSorterIterator getSortedIterator() throws IOException {
*/
class SpillableIterator extends UnsafeSorterIterator {
private UnsafeSorterIterator upstream;
- private UnsafeSorterIterator nextUpstream = null;
private MemoryBlock lastPage = null;
private boolean loaded = false;
- private int numRecords = 0;
+ private int numRecords;
+ private Object currentBaseObject;
+ private long currentBaseOffset;
+ private int currentRecordLength;
+ private long currentKeyPrefix;
SpillableIterator(UnsafeSorterIterator inMemIterator) {
this.upstream = inMemIterator;
@@ -538,27 +585,33 @@ public int getNumRecords() {
return numRecords;
}
+ @Override
+ public long getCurrentPageNumber() {
+ throw new UnsupportedOperationException();
+ }
+
public long spill() throws IOException {
synchronized (this) {
- if (!(upstream instanceof UnsafeInMemorySorter.SortedIterator && nextUpstream == null
- && numRecords > 0)) {
+ if (inMemSorter == null) {
return 0L;
}
- UnsafeInMemorySorter.SortedIterator inMemIterator =
+
+ long currentPageNumber = upstream.getCurrentPageNumber();
+
+ UnsafeInMemorySorter.SortedIterator inMemIterator =
((UnsafeInMemorySorter.SortedIterator) upstream).clone();
-
+
ShuffleWriteMetrics writeMetrics = new ShuffleWriteMetrics();
long released = 0L;
SpillWriterForUnsafeSorter spillWriter = spillWithWriter(inMemIterator, numRecords, writeMetrics, true);
- nextUpstream = spillWriter.getSpillReader();
- assert(nextUpstream != null);
+ upstream = spillWriter.getSpillReader();
+
synchronized (UnsafeExternalSorter.this) {
// release the pages except the one that is used. There can still be a caller that
// is accessing the current record. We free this page in that caller's next loadNext()
// call.
for (MemoryBlock page : allocatedPages) {
- if (!loaded || page.pageNumber !=
- ((UnsafeInMemorySorter.SortedIterator)upstream).getCurrentPageNumber()) {
+ if (!loaded || page.pageNumber != currentPageNumber) {
released += page.size();
freePage(page);
} else {
@@ -566,13 +619,18 @@ public long spill() throws IOException {
}
}
allocatedPages.clear();
+ if (lastPage != null) {
+ // Add the last page back to the list of allocated pages to make sure it gets freed in
+ // case loadNext() never gets called again.
+ allocatedPages.add(lastPage);
+ }
}
// in-memory sorter will not be used after spilling
assert(inMemSorter != null);
released += inMemSorter.getMemoryUsage();
totalSortTimeNanos += inMemSorter.getSortTimeNanos();
- inMemSorter.free();
+ inMemSorter.freeMemory();
inMemSorter = null;
taskContext.taskMetrics().incMemoryBytesSpilled(released);
taskContext.taskMetrics().incDiskBytesSpilled(writeMetrics.bytesWritten());
@@ -588,26 +646,32 @@ public boolean hasNext() {
@Override
public void loadNext() throws IOException {
+ assert upstream != null;
MemoryBlock pageToFree = null;
try {
synchronized (this) {
loaded = true;
- if (nextUpstream != null) {
- // Just consumed the last record from in memory iterator
- if(lastPage != null) {
- // Do not free the page here, while we are locking `SpillableIterator`. The `freePage`
- // method locks the `TaskMemoryManager`, and it's a bad idea to lock 2 objects in
- // sequence. We may hit dead lock if another thread locks `TaskMemoryManager` and
- // `SpillableIterator` in sequence, which may happen in
- // `TaskMemoryManager.acquireExecutionMemory`.
- pageToFree = lastPage;
- lastPage = null;
- }
- upstream = nextUpstream;
- nextUpstream = null;
+ // Just consumed the last record from the in-memory iterator.
+ if (lastPage != null) {
+ // Do not free the page here, while we are locking `SpillableIterator`. The `freePage`
+ // method locks the `TaskMemoryManager`, and it's a bad idea to lock 2 objects in
+ // sequence. We may hit dead lock if another thread locks `TaskMemoryManager` and
+ // `SpillableIterator` in sequence, which may happen in
+ // `TaskMemoryManager.acquireExecutionMemory`.
+ pageToFree = lastPage;
+ allocatedPages.clear();
+ lastPage = null;
}
numRecords--;
upstream.loadNext();
+
+ // Keep track of the current base object, base offset, record length, and key prefix,
+ // so that the current record can still be read in case a spill is triggered and we
+ // switch to the spill writer's iterator.
+ currentBaseObject = upstream.getBaseObject();
+ currentBaseOffset = upstream.getBaseOffset();
+ currentRecordLength = upstream.getRecordLength();
+ currentKeyPrefix = upstream.getKeyPrefix();
}
} finally {
if (pageToFree != null) {
@@ -618,22 +682,22 @@ public void loadNext() throws IOException {
@Override
public Object getBaseObject() {
- return upstream.getBaseObject();
+ return currentBaseObject;
}
@Override
public long getBaseOffset() {
- return upstream.getBaseOffset();
+ return currentBaseOffset;
}
@Override
public int getRecordLength() {
- return upstream.getRecordLength();
+ return currentRecordLength;
}
@Override
public long getKeyPrefix() {
- return upstream.getKeyPrefix();
+ return currentKeyPrefix;
}
}
@@ -663,7 +727,7 @@ public UnsafeSorterIterator getIterator(int startIndex) throws IOException {
}
i += spillWriter.recordsSpilled();
}
- if (inMemSorter != null) {
+ if (inMemSorter != null && inMemSorter.numRecords() > 0) {
UnsafeSorterIterator iter = inMemSorter.getSortedIterator();
moveOver(iter, startIndex - i);
queue.add(iter);
@@ -709,6 +773,11 @@ static class ChainedIterator extends UnsafeSorterIterator {
public int getNumRecords() {
return numRecords;
}
+
+ @Override
+ public long getCurrentPageNumber() {
+ return current.getCurrentPageNumber();
+ }
@Override
public boolean hasNext() {
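
The comment block in `allocateMemoryForRecordIfNecessary` above is the heart of this patch: acquiring a new page may trigger a spill, a spill frees the pointer array, so the pointer array has to be re-checked afterwards. Below is a deliberately simplified, self-contained model of that ordering; it is not Spark code, and all names and behavior are stand-ins.

```java
final class RecordAllocationModel {
  private long[] pointerArray = new long[16]; // stand-in for the in-memory sorter's LongArray
  private int numRecords = 0;
  private byte[] currentPage;                 // stand-in for the current data page

  void allocateForRecord(int required) {
    growPointerArrayIfNecessary();        // step 1 (in the real sorter this may itself spill)
    acquireNewPageIfNecessary(required);  // step 2: modeled here as spilling whenever a page is needed
    growPointerArrayIfNecessary();        // step 3: re-check, because step 2 freed the pointer array
  }

  private void growPointerArrayIfNecessary() {
    if (pointerArray == null) {
      // A spill freed the array; allocate a fresh small one before inserting the next record.
      pointerArray = new long[16];
    } else if (numRecords >= pointerArray.length) {
      pointerArray = java.util.Arrays.copyOf(pointerArray, pointerArray.length * 2);
    }
  }

  private void acquireNewPageIfNecessary(int required) {
    if (currentPage == null || currentPage.length < required) {
      spill(); // model memory pressure: getting a new page forces a spill
      currentPage = new byte[Math.max(required, 64)];
    }
  }

  private void spill() {
    // In the real sorter, spilling writes records out and frees data pages and the pointer array.
    currentPage = null;
    pointerArray = null;
    numRecords = 0;
  }
}
```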
From 485cc282f267337c8c53175c48bac71a30b4c833 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:23:57 +0800
Subject: [PATCH 18/35] Update UnsafeInMemorySorter.java
---
.../unsafe/sort/UnsafeInMemorySorter.java | 51 +++++++------------
1 file changed, 17 insertions(+), 34 deletions(-)
diff --git a/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
index b1c1d393..575aeea0 100644
--- a/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
+++ b/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java
@@ -159,6 +159,10 @@ private int getUsableCapacity() {
return (int) (array.size() / (radixSortSupport != null ? 2 : 1.5));
}
+ public long getInitialSize() {
+ return initialSize;
+ }
+
/**
* Free the memory used by pointer array.
*/
@@ -168,47 +172,19 @@ public void freeWithoutLongArray() {
}
}
- public void free() {
+ public void freeMemory() {
if (consumer != null) {
if (array != null) {
consumer.freeArray(array);
}
- array = null;
- }
- }
- public void resetWithoutLongArrray() {
- if (consumer != null) {
- // the call to consumer.allocateArray may trigger a spill which in turn access this instance
- // and eventually re-enter this method and try to free the array again. by setting the array
- // to null and its length to 0 we effectively make the spill code-path a no-op. setting the
- // array to null also indicates that it has already been de-allocated which prevents a double
- // de-allocation in free().
+ // Set the array to null instead of allocating a new array. Allocating an array could have
+ // triggered another spill and this method already is called from UnsafeExternalSorter when
+ // spilling. Attempting to allocate while spilling is dangerous, as we could be holding onto
+ // a large partially complete allocation, which may prevent other memory from being allocated.
+ // Instead we will allocate the new array when it is necessary.
array = null;
usableCapacity = 0;
- pos = 0;
- nullBoundaryPos = 0;
- array = consumer.allocateArray(initialSize);
- usableCapacity = getUsableCapacity();
- }
- pos = 0;
- nullBoundaryPos = 0;
- }
-
- public void reset() {
- if (consumer != null) {
- consumer.freeArray(array);
- // the call to consumer.allocateArray may trigger a spill which in turn access this instance
- // and eventually re-enter this method and try to free the array again. by setting the array
- // to null and its length to 0 we effectively make the spill code-path a no-op. setting the
- // array to null also indicates that it has already been de-allocated which prevents a double
- // de-allocation in free().
- array = null;
- usableCapacity = 0;
- pos = 0;
- nullBoundaryPos = 0;
- array = consumer.allocateArray(initialSize);
- usableCapacity = getUsableCapacity();
}
pos = 0;
nullBoundaryPos = 0;
@@ -250,6 +226,7 @@ public boolean hasSpaceForAnotherRecord() {
}
public void expandPointerArray(LongArray newArray) {
+ if (array != null) {
if (newArray.size() < array.size()) {
// checkstyle.off: RegexpSinglelineJava
throw new SparkOutOfMemoryError("Not enough memory to grow pointer array");
@@ -262,6 +239,7 @@ public void expandPointerArray(LongArray newArray) {
newArray.getBaseOffset(),
pos * 8L);
consumer.freeArray(array);
+ }
array = newArray;
usableCapacity = getUsableCapacity();
}
@@ -365,6 +343,7 @@ public void loadNext() {
@Override
public long getBaseOffset() { return baseOffset; }
+ @Override
public long getCurrentPageNumber() {
return currentPageNumber;
}
@@ -387,6 +366,10 @@ public long getCurrentPageNumber() {
* {@code next()} will return the same mutable object.
*/
public UnsafeSorterIterator getSortedIterator() {
+ if (numRecords() == 0) {
+ // `array` might be null, so make sure that it is not accessed by returning early.
+ return new SortedIterator(0, 0);
+ }
int offset = 0;
long start = System.nanoTime();
if (sortComparator != null) {
From 400ca0172346b238a5219cd055c1bfb928a15f2e Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:24:16 +0800
Subject: [PATCH 19/35] Update UnsafeSorterPMemSpillWriter.java
From 68ddd0d2fac8638326ce5246b12b9ee6ed7dc17a Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:24:52 +0800
Subject: [PATCH 20/35] Update UnsafeSorterPMemSpillWriter.java
From facb72e0a8578ccb7d8d82977d8d7dd5cb6bdaa1 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:25:19 +0800
Subject: [PATCH 21/35] Update UnsafeSorterSpillReader.java
---
.../util/collection/unsafe/sort/UnsafeSorterSpillReader.java | 5 +++++
1 file changed, 5 insertions(+)
diff --git a/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java b/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
index 25ca11fb..a7ed81b2 100644
--- a/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
+++ b/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeSorterSpillReader.java
@@ -97,6 +97,11 @@ public int getNumRecords() {
return numRecords;
}
+ @Override
+ public long getCurrentPageNumber() {
+ throw new UnsupportedOperationException();
+ }
+
@Override
public boolean hasNext() {
return (numRecordsRemaining > 0);
From c33de3c71c28ac0c07570dcc4b5e66a66a572318 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:25:36 +0800
Subject: [PATCH 22/35] Update UnsafeSorterSpillWriter.java
From 3d2b3652e92e9135b3612bfa09903d72af6760b6 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:25:56 +0800
Subject: [PATCH 23/35] Update UnsafeSorterStreamSpillReader.java
From 1834c8d30198716bb50bfc30bd3a43d87169d4ad Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:26:15 +0800
Subject: [PATCH 24/35] Update UnsafeSorterStreamSpillWriter.java
From e6f9698d978c234f3765b4210aaf1daddb6fde80 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:26:39 +0800
Subject: [PATCH 25/35] Update package.scala
---
.../spark/internal/config/package.scala | 333 +++++++++++++++---
1 file changed, 290 insertions(+), 43 deletions(-)
diff --git a/src/main/scala/org/apache/spark/internal/config/package.scala b/src/main/scala/org/apache/spark/internal/config/package.scala
index 07661590..8cf62cf4 100644
--- a/src/main/scala/org/apache/spark/internal/config/package.scala
+++ b/src/main/scala/org/apache/spark/internal/config/package.scala
@@ -17,6 +17,7 @@
package org.apache.spark.internal
+import java.util.Locale
import java.util.concurrent.TimeUnit
import org.apache.spark.launcher.SparkLauncher
@@ -271,6 +272,13 @@ package object config {
.timeConf(TimeUnit.MILLISECONDS)
.createWithDefaultString("0")
+ private[spark] val EXECUTOR_METRICS_FILESYSTEM_SCHEMES =
+ ConfigBuilder("spark.executor.metrics.fileSystemSchemes")
+ .doc("The file system schemes to report in executor metrics.")
+ .version("3.1.0")
+ .stringConf
+ .createWithDefaultString("file,hdfs")
+
private[spark] val EXECUTOR_JAVA_OPTIONS =
ConfigBuilder(SparkLauncher.EXECUTOR_EXTRA_JAVA_OPTIONS)
.withPrepended(SparkLauncher.EXECUTOR_DEFAULT_JAVA_OPTIONS)
@@ -302,8 +310,8 @@ package object config {
.createWithDefaultString("1g")
private[spark] val EXECUTOR_MEMORY_OVERHEAD = ConfigBuilder("spark.executor.memoryOverhead")
- .doc("The amount of non-heap memory to be allocated per executor in cluster mode, " +
- "in MiB unless otherwise specified.")
+ .doc("The amount of non-heap memory to be allocated per executor, in MiB unless otherwise" +
+ " specified.")
.version("2.3.0")
.bytesConf(ByteUnit.MiB)
.createOptional
@@ -337,11 +345,31 @@ package object config {
.checkValue(_ >= 0, "The off-heap memory size must not be negative")
.createWithDefault(0)
+ private[spark] val MEMORY_STORAGE_FRACTION = ConfigBuilder("spark.memory.storageFraction")
+ .doc("Amount of storage memory immune to eviction, expressed as a fraction of the " +
+ "size of the region set aside by spark.memory.fraction. The higher this is, the " +
+ "less working memory may be available to execution and tasks may spill to disk more " +
+ "often. Leaving this at the default value is recommended. ")
+ .version("1.6.0")
+ .doubleConf
+ .checkValue(v => v >= 0.0 && v < 1.0, "Storage fraction must be in [0,1)")
+ .createWithDefault(0.5)
+
+ private[spark] val MEMORY_FRACTION = ConfigBuilder("spark.memory.fraction")
+ .doc("Fraction of (heap space - 300MB) used for execution and storage. The " +
+ "lower this is, the more frequently spills and cached data eviction occur. " +
+ "The purpose of this config is to set aside memory for internal metadata, " +
+ "user data structures, and imprecise size estimation in the case of sparse, " +
+ "unusually large records. Leaving this at the default value is recommended. ")
+ .version("1.6.0")
+ .doubleConf
+ .createWithDefault(0.6)
+
val MEMORY_SPILL_PMEM_ENABLED =
ConfigBuilder("spark.memory.spill.pmem.enabled")
.doc("Set memory spill to PMem instead of disk.")
.booleanConf
- .createWithDefault(true)
+ .createWithDefault(false)
val MEMORY_EXTENDED_PATH =
ConfigBuilder("spark.memory.extended.path")
@@ -388,26 +416,6 @@ package object config {
s"${ByteArrayMethods.MAX_ROUNDED_ARRAY_LENGTH}.")
.createWithDefaultString("8m")
- private[spark] val MEMORY_STORAGE_FRACTION = ConfigBuilder("spark.memory.storageFraction")
- .doc("Amount of storage memory immune to eviction, expressed as a fraction of the " +
- "size of the region set aside by spark.memory.fraction. The higher this is, the " +
- "less working memory may be available to execution and tasks may spill to disk more " +
- "often. Leaving this at the default value is recommended. ")
- .version("1.6.0")
- .doubleConf
- .checkValue(v => v >= 0.0 && v < 1.0, "Storage fraction must be in [0,1)")
- .createWithDefault(0.5)
-
- private[spark] val MEMORY_FRACTION = ConfigBuilder("spark.memory.fraction")
- .doc("Fraction of (heap space - 300MB) used for execution and storage. The " +
- "lower this is, the more frequently spills and cached data eviction occur. " +
- "The purpose of this config is to set aside memory for internal metadata, " +
- "user data structures, and imprecise size estimation in the case of sparse, " +
- "unusually large records. Leaving this at the default value is recommended. ")
- .version("1.6.0")
- .doubleConf
- .createWithDefault(0.6)
-
private[spark] val STORAGE_SAFETY_FRACTION = ConfigBuilder("spark.storage.safetyFraction")
.version("1.1.0")
.doubleConf
@@ -464,6 +472,67 @@ package object config {
.intConf
.createWithDefault(1)
+ private[spark] val STORAGE_DECOMMISSION_ENABLED =
+ ConfigBuilder("spark.storage.decommission.enabled")
+ .doc("Whether to decommission the block manager when decommissioning executor")
+ .version("3.1.0")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED =
+ ConfigBuilder("spark.storage.decommission.shuffleBlocks.enabled")
+ .doc("Whether to transfer shuffle blocks during block manager decommissioning. Requires " +
+ "a migratable shuffle resolver (like sort based shuffle)")
+ .version("3.1.0")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val STORAGE_DECOMMISSION_SHUFFLE_MAX_THREADS =
+ ConfigBuilder("spark.storage.decommission.shuffleBlocks.maxThreads")
+ .doc("Maximum number of threads to use in migrating shuffle files.")
+ .version("3.1.0")
+ .intConf
+ .checkValue(_ > 0, "The maximum number of threads should be positive")
+ .createWithDefault(8)
+
+ private[spark] val STORAGE_DECOMMISSION_RDD_BLOCKS_ENABLED =
+ ConfigBuilder("spark.storage.decommission.rddBlocks.enabled")
+ .doc("Whether to transfer RDD blocks during block manager decommissioning.")
+ .version("3.1.0")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val STORAGE_DECOMMISSION_MAX_REPLICATION_FAILURE_PER_BLOCK =
+ ConfigBuilder("spark.storage.decommission.maxReplicationFailuresPerBlock")
+ .internal()
+ .doc("Maximum number of failures which can be handled for the replication of " +
+ "one RDD block when block manager is decommissioning and trying to move its " +
+ "existing blocks.")
+ .version("3.1.0")
+ .intConf
+ .createWithDefault(3)
+
+ private[spark] val STORAGE_DECOMMISSION_REPLICATION_REATTEMPT_INTERVAL =
+ ConfigBuilder("spark.storage.decommission.replicationReattemptInterval")
+ .internal()
+ .doc("The interval of time between consecutive cache block replication reattempts " +
+ "happening on each decommissioning executor (due to storage decommissioning).")
+ .version("3.1.0")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .checkValue(_ > 0, "Time interval between two consecutive attempts of " +
+ "cache block replication should be positive.")
+ .createWithDefaultString("30s")
+
+ private[spark] val STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH =
+ ConfigBuilder("spark.storage.decommission.fallbackStorage.path")
+ .doc("The location for fallback storage during block manager decommissioning. " +
+ "For example, `s3a://spark-storage/`. In case of empty, fallback storage is disabled. " +
+ "The storage should be managed by TTL because Spark will not clean it up.")
+ .version("3.1.0")
+ .stringConf
+ .checkValue(_.endsWith(java.io.File.separator), "Path should end with separator.")
+ .createOptional
+
private[spark] val STORAGE_REPLICATION_TOPOLOGY_FILE =
ConfigBuilder("spark.storage.replication.topologyFile")
.version("2.1.0")
@@ -488,6 +557,13 @@ package object config {
.timeConf(TimeUnit.MILLISECONDS)
.createWithDefaultString(Network.NETWORK_TIMEOUT.defaultValueString)
+ private[spark] val STORAGE_BLOCKMANAGER_HEARTBEAT_TIMEOUT =
+ ConfigBuilder("spark.storage.blockManagerHeartbeatTimeoutMs")
+ .version("0.7.0")
+ .withAlternative("spark.storage.blockManagerSlaveTimeoutMs")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createOptional
+
private[spark] val STORAGE_CLEANUP_FILES_AFTER_EXECUTOR_EXIT =
ConfigBuilder("spark.storage.cleanupFilesAfterExecutorExit")
.doc("Whether or not cleanup the files not served by the external shuffle service " +
@@ -595,6 +671,16 @@ package object config {
.version("1.2.0")
.fallbackConf(DYN_ALLOCATION_SCHEDULER_BACKLOG_TIMEOUT)
+ private[spark] val LEGACY_LOCALITY_WAIT_RESET =
+ ConfigBuilder("spark.locality.wait.legacyResetOnTaskLaunch")
+ .doc("Whether to use the legacy behavior of locality wait, which resets the delay timer " +
+ "anytime a task is scheduled. See Delay Scheduling section of TaskSchedulerImpl's class " +
+ "documentation for more details.")
+ .internal()
+ .version("3.1.0")
+ .booleanConf
+ .createWithDefault(false)
+
private[spark] val LOCALITY_WAIT = ConfigBuilder("spark.locality.wait")
.version("0.5.0")
.timeConf(TimeUnit.MILLISECONDS)
@@ -711,6 +797,13 @@ package object config {
.booleanConf
.createWithDefault(true)
+ private[spark] val EXCLUDE_ON_FAILURE_ENABLED =
+ ConfigBuilder("spark.excludeOnFailure.enabled")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.enabled")
+ .booleanConf
+ .createOptional
+
// Blacklist confs
private[spark] val BLACKLIST_ENABLED =
ConfigBuilder("spark.blacklist.enabled")
@@ -719,38 +812,44 @@ package object config {
.createOptional
private[spark] val MAX_TASK_ATTEMPTS_PER_EXECUTOR =
- ConfigBuilder("spark.blacklist.task.maxTaskAttemptsPerExecutor")
- .version("2.1.0")
+ ConfigBuilder("spark.excludeOnFailure.task.maxTaskAttemptsPerExecutor")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.task.maxTaskAttemptsPerExecutor")
.intConf
.createWithDefault(1)
private[spark] val MAX_TASK_ATTEMPTS_PER_NODE =
- ConfigBuilder("spark.blacklist.task.maxTaskAttemptsPerNode")
- .version("2.1.0")
+ ConfigBuilder("spark.excludeOnFailure.task.maxTaskAttemptsPerNode")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.task.maxTaskAttemptsPerNode")
.intConf
.createWithDefault(2)
private[spark] val MAX_FAILURES_PER_EXEC =
- ConfigBuilder("spark.blacklist.application.maxFailedTasksPerExecutor")
- .version("2.2.0")
+ ConfigBuilder("spark.excludeOnFailure.application.maxFailedTasksPerExecutor")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.application.maxFailedTasksPerExecutor")
.intConf
.createWithDefault(2)
private[spark] val MAX_FAILURES_PER_EXEC_STAGE =
- ConfigBuilder("spark.blacklist.stage.maxFailedTasksPerExecutor")
- .version("2.1.0")
+ ConfigBuilder("spark.excludeOnFailure.stage.maxFailedTasksPerExecutor")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.stage.maxFailedTasksPerExecutor")
.intConf
.createWithDefault(2)
private[spark] val MAX_FAILED_EXEC_PER_NODE =
- ConfigBuilder("spark.blacklist.application.maxFailedExecutorsPerNode")
- .version("2.2.0")
+ ConfigBuilder("spark.excludeOnFailure.application.maxFailedExecutorsPerNode")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.application.maxFailedExecutorsPerNode")
.intConf
.createWithDefault(2)
private[spark] val MAX_FAILED_EXEC_PER_NODE_STAGE =
- ConfigBuilder("spark.blacklist.stage.maxFailedExecutorsPerNode")
- .version("2.1.0")
+ ConfigBuilder("spark.excludeOnFailure.stage.maxFailedExecutorsPerNode")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.stage.maxFailedExecutorsPerNode")
.intConf
.createWithDefault(2)
@@ -778,7 +877,35 @@ package object config {
.version("2.3.0")
.booleanConf
.createWithDefault(false)
- // End blacklist confs
+
+ private[spark] val EXCLUDE_ON_FAILURE_TIMEOUT_CONF =
+ ConfigBuilder("spark.excludeOnFailure.timeout")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.timeout")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createOptional
+
+ private[spark] val EXCLUDE_ON_FAILURE_KILL_ENABLED =
+ ConfigBuilder("spark.excludeOnFailure.killExcludedExecutors")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.killBlacklistedExecutors")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val EXCLUDE_ON_FAILURE_LEGACY_TIMEOUT_CONF =
+ ConfigBuilder("spark.scheduler.executorTaskExcludeOnFailureTime")
+ .internal()
+ .version("3.1.0")
+ .withAlternative("spark.scheduler.executorTaskBlacklistTime")
+ .timeConf(TimeUnit.MILLISECONDS)
+ .createOptional
+
+ private[spark] val EXCLUDE_ON_FAILURE_FETCH_FAILURE_ENABLED =
+ ConfigBuilder("spark.excludeOnFailure.application.fetchFailure.enabled")
+ .version("3.1.0")
+ .withAlternative("spark.blacklist.application.fetchFailure.enabled")
+ .booleanConf
+ .createWithDefault(false)
private[spark] val UNREGISTER_OUTPUT_ON_HOST_ON_FETCH_FAILURE =
ConfigBuilder("spark.files.fetchFailure.unRegisterOutputOnHost")
@@ -1404,10 +1531,9 @@ package object config {
private[spark] val SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED =
ConfigBuilder("spark.shuffle.readHostLocalDisk")
- .doc(s"If enabled (and `${SHUFFLE_USE_OLD_FETCH_PROTOCOL.key}` is disabled and external " +
- s"shuffle `${SHUFFLE_SERVICE_ENABLED.key}` is enabled), shuffle " +
- "blocks requested from those block managers which are running on the same host are read " +
- "from the disk directly instead of being fetched as remote blocks over the network.")
+ .doc(s"If enabled (and `${SHUFFLE_USE_OLD_FETCH_PROTOCOL.key}` is disabled, shuffle " +
+ "blocks requested from those block managers which are running on the same host are " +
+ "read from the disk directly instead of being fetched as remote blocks over the network.")
.version("3.0.0")
.booleanConf
.createWithDefault(true)
@@ -1443,10 +1569,12 @@ package object config {
.createWithDefaultString("365d")
private[spark] val UNSCHEDULABLE_TASKSET_TIMEOUT =
- ConfigBuilder("spark.scheduler.blacklist.unschedulableTaskSetTimeout")
+ ConfigBuilder("spark.scheduler.excludeOnFailure.unschedulableTaskSetTimeout")
.doc("The timeout in seconds to wait to acquire a new executor and schedule a task " +
- "before aborting a TaskSet which is unschedulable because of being completely blacklisted.")
- .version("2.4.1")
+ "before aborting a TaskSet which is unschedulable because all executors are " +
+ "excluded due to failures.")
+ .version("3.1.0")
+ .withAlternative("spark.scheduler.blacklist.unschedulableTaskSetTimeout")
.timeConf(TimeUnit.SECONDS)
.checkValue(v => v >= 0, "The value should be a non negative time value.")
.createWithDefault(120)
@@ -1775,6 +1903,16 @@ package object config {
.toSequence
.createWithDefault(Nil)
+ private[spark] val ARCHIVES = ConfigBuilder("spark.archives")
+ .version("3.1.0")
+ .doc("Comma-separated list of archives to be extracted into the working directory of each " +
+ "executor. .jar, .tar.gz, .tgz and .zip are supported. You can specify the directory " +
+ "name to unpack via adding '#' after the file name to unpack, for example, " +
+ "'file.zip#directory'. This configuration is experimental.")
+ .stringConf
+ .toSequence
+ .createWithDefault(Nil)
+
private[spark] val SUBMIT_DEPLOY_MODE = ConfigBuilder("spark.submit.deployMode")
.version("1.5.0")
.stringConf
@@ -1808,6 +1946,7 @@ package object config {
ConfigBuilder("spark.scheduler.mode")
.version("0.8.0")
.stringConf
+ .transform(_.toUpperCase(Locale.ROOT))
.createWithDefault(SchedulingMode.FIFO.toString)
private[spark] val SCHEDULER_REVIVE_INTERVAL =
@@ -1854,6 +1993,30 @@ package object config {
.timeConf(TimeUnit.MILLISECONDS)
.createOptional
+ private[spark] val DECOMMISSION_ENABLED =
+ ConfigBuilder("spark.decommission.enabled")
+ .doc("When decommission enabled, Spark will try its best to shutdown the executor " +
+ s"gracefully. Spark will try to migrate all the RDD blocks (controlled by " +
+ s"${STORAGE_DECOMMISSION_RDD_BLOCKS_ENABLED.key}) and shuffle blocks (controlled by " +
+ s"${STORAGE_DECOMMISSION_SHUFFLE_BLOCKS_ENABLED.key}) from the decommissioning " +
+ s"executor to a remote executor when ${STORAGE_DECOMMISSION_ENABLED.key} is enabled. " +
+ s"With decommission enabled, Spark will also decommission an executor instead of " +
+ s"killing when ${DYN_ALLOCATION_ENABLED.key} enabled.")
+ .version("3.1.0")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val EXECUTOR_DECOMMISSION_KILL_INTERVAL =
+ ConfigBuilder("spark.executor.decommission.killInterval")
+ .doc("Duration after which a decommissioned executor will be killed forcefully." +
+ "This config is useful for cloud environments where we know in advance when " +
+ "an executor is going to go down after decommissioning signal i.e. around 2 mins " +
+ "in aws spot nodes, 1/2 hrs in spot block nodes etc. This config is currently " +
+ "used to decide what tasks running on decommission executors to speculate.")
+ .version("3.1.0")
+ .timeConf(TimeUnit.SECONDS)
+ .createOptional
+
private[spark] val STAGING_DIR = ConfigBuilder("spark.yarn.stagingDir")
.doc("Staging directory used while submitting applications.")
.version("2.0.0")
@@ -1866,4 +2029,88 @@ package object config {
.bytesConf(ByteUnit.BYTE)
.createOptional
+ private[spark] val RESOURCE_PROFILE_MERGE_CONFLICTS =
+ ConfigBuilder("spark.scheduler.resource.profileMergeConflicts")
+ .doc("If set to true, Spark will merge ResourceProfiles when different profiles " +
+ "are specified in RDDs that get combined into a single stage. When they are merged, " +
+ "Spark chooses the maximum of each resource and creates a new ResourceProfile. The " +
+ "default of false results in Spark throwing an exception if multiple different " +
+ "ResourceProfiles are found in RDDs going into the same stage.")
+ .version("3.1.0")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val STANDALONE_SUBMIT_WAIT_APP_COMPLETION =
+ ConfigBuilder("spark.standalone.submit.waitAppCompletion")
+ .doc("In standalone cluster mode, controls whether the client waits to exit until the " +
+ "application completes. If set to true, the client process will stay alive polling " +
+ "the driver's status. Otherwise, the client process will exit after submission.")
+ .version("3.1.0")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val EXECUTOR_ALLOW_SPARK_CONTEXT =
+ ConfigBuilder("spark.executor.allowSparkContext")
+ .doc("If set to true, SparkContext can be created in executors.")
+ .version("3.0.1")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val EXECUTOR_KILL_ON_FATAL_ERROR_DEPTH =
+ ConfigBuilder("spark.executor.killOnFatalError.depth")
+ .doc("The max depth of the exception chain in a failed task Spark will search for a fatal " +
+ "error to check whether it should kill an executor. 0 means not checking any fatal " +
+ "error, 1 means checking only the exception but not the cause, and so on.")
+ .internal()
+ .version("3.1.0")
+ .intConf
+ .checkValue(_ >= 0, "needs to be a non-negative value")
+ .createWithDefault(5)
+
+ private[spark] val PUSH_BASED_SHUFFLE_ENABLED =
+ ConfigBuilder("spark.shuffle.push.enabled")
+ .doc("Set to 'true' to enable push-based shuffle on the client side and this works in " +
+ "conjunction with the server side flag spark.shuffle.server.mergedShuffleFileManagerImpl " +
+ "which needs to be set with the appropriate " +
+ "org.apache.spark.network.shuffle.MergedShuffleFileManager implementation for push-based " +
+ "shuffle to be enabled")
+ .version("3.1.0")
+ .booleanConf
+ .createWithDefault(false)
+
+ private[spark] val SHUFFLE_MERGER_MAX_RETAINED_LOCATIONS =
+ ConfigBuilder("spark.shuffle.push.maxRetainedMergerLocations")
+ .doc("Maximum number of shuffle push merger locations cached for push based shuffle. " +
+ "Currently, shuffle push merger locations are nothing but external shuffle services " +
+ "which are responsible for handling pushed blocks and merging them and serving " +
+ "merged blocks for later shuffle fetch.")
+ .version("3.1.0")
+ .intConf
+ .createWithDefault(500)
+
+ private[spark] val SHUFFLE_MERGER_LOCATIONS_MIN_THRESHOLD_RATIO =
+ ConfigBuilder("spark.shuffle.push.mergersMinThresholdRatio")
+ .doc("The minimum number of shuffle merger locations required to enable push based " +
+ "shuffle for a stage. This is specified as a ratio of the number of partitions in " +
+ "the child stage. For example, a reduce stage which has 100 partitions and uses the " +
+ "default value 0.05 requires at least 5 unique merger locations to enable push based " +
+ "shuffle. Merger locations are currently defined as external shuffle services.")
+ .version("3.1.0")
+ .doubleConf
+ .createWithDefault(0.05)
+
+ private[spark] val SHUFFLE_MERGER_LOCATIONS_MIN_STATIC_THRESHOLD =
+ ConfigBuilder("spark.shuffle.push.mergersMinStaticThreshold")
+ .doc(s"The static threshold for number of shuffle push merger locations should be " +
+ "available in order to enable push based shuffle for a stage. Note this config " +
+ s"works in conjunction with ${SHUFFLE_MERGER_LOCATIONS_MIN_THRESHOLD_RATIO.key}. " +
+ "Maximum of spark.shuffle.push.mergersMinStaticThreshold and " +
+ s"${SHUFFLE_MERGER_LOCATIONS_MIN_THRESHOLD_RATIO.key} ratio number of mergers needed to " +
+ "enable push based shuffle for a stage. For eg: with 1000 partitions for the child " +
+ "stage with spark.shuffle.push.mergersMinStaticThreshold as 5 and " +
+ s"${SHUFFLE_MERGER_LOCATIONS_MIN_THRESHOLD_RATIO.key} set to 0.05, we would need " +
+ "at least 50 mergers to enable push based shuffle for that stage.")
+ .version("3.1.0")
+ .doubleConf
+ .createWithDefault(5)
}
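
Because every key defined above is an ordinary string-keyed config, the renamed exclude-on-failure settings and the new decommissioning and push-based-shuffle settings can be exercised straight from a `SparkConf`. A hedged sketch with arbitrary example values (the old `spark.blacklist.*` names remain readable through `withAlternative`):

```java
import org.apache.spark.SparkConf;

public class Spark31ConfigExample {
  public static void main(String[] args) {
    SparkConf conf = new SparkConf()
        .setAppName("spark-3.1-config-example")
        // Renamed exclusion keys (formerly spark.blacklist.*).
        .set("spark.excludeOnFailure.enabled", "true")
        .set("spark.excludeOnFailure.task.maxTaskAttemptsPerExecutor", "1")
        // Graceful executor decommissioning with block migration.
        .set("spark.decommission.enabled", "true")
        .set("spark.storage.decommission.enabled", "true")
        .set("spark.storage.decommission.shuffleBlocks.enabled", "true")
        // Push-based shuffle stays off unless the server side also configures
        // spark.shuffle.server.mergedShuffleFileManagerImpl appropriately.
        .set("spark.shuffle.push.enabled", "false");
    System.out.println(conf.toDebugString());
  }
}
```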
From d1ac9c0dd2e315c4b0782497f922c4e18b4dee01 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:26:55 +0800
Subject: [PATCH 26/35] Update ExtendedMemoryPool.scala
---
src/main/scala/org/apache/spark/memory/ExtendedMemoryPool.scala | 1 -
1 file changed, 1 deletion(-)
diff --git a/src/main/scala/org/apache/spark/memory/ExtendedMemoryPool.scala b/src/main/scala/org/apache/spark/memory/ExtendedMemoryPool.scala
index 50dc05b8..9ba43483 100644
--- a/src/main/scala/org/apache/spark/memory/ExtendedMemoryPool.scala
+++ b/src/main/scala/org/apache/spark/memory/ExtendedMemoryPool.scala
@@ -108,4 +108,3 @@ private[memory] class ExtendedMemoryPool(lock: Object) extends MemoryPool(lock)
numBytesToFree
}
}
-
From 3484c925d6b56076aa574661683739c9aef26b23 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:27:09 +0800
Subject: [PATCH 27/35] Update MemoryManager.scala
From ff985190a2cb9bb77dd8b22768f0312853f3d2f6 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:27:21 +0800
Subject: [PATCH 28/35] Update StorageMemoryPool.scala
From 8d96ae9a4837538c0c826f04a81bd7e0f4aeec91 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:27:36 +0800
Subject: [PATCH 29/35] Update UnifiedMemoryManager.scala
From 4c8864dbd5d60f295b82b36b6b0a98a8b569c6a0 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:28:00 +0800
Subject: [PATCH 30/35] Update MemoryStore.scala
---
.../scala/org/apache/spark/storage/memory/MemoryStore.scala | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala b/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala
index 36174159..41302557 100644
--- a/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala
+++ b/src/main/scala/org/apache/spark/storage/memory/MemoryStore.scala
@@ -95,9 +95,11 @@ private[spark] class MemoryStore(
// A mapping from taskAttemptId to amount of memory used for unrolling a block (in bytes)
// All accesses of this map are assumed to have manually synchronized on `memoryManager`
private val onHeapUnrollMemoryMap = mutable.HashMap[Long, Long]()
+
// Note: off-heap unroll memory is only used in putIteratorAsBytes() because off-heap caching
// always stores serialized values.
private val offHeapUnrollMemoryMap = mutable.HashMap[Long, Long]()
+
// Note: pmem unroll memory is only used in putIteratorAsBytes() because pmem caching
// always stores serialized values.
private val pmemUnrollMemoryMap = mutable.HashMap[Long, Long]()
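
The three unroll-memory maps in this hunk all do the same per-task bookkeeping, one per memory mode: record how many bytes a task attempt has reserved for unrolling a block, and release them when the task finishes. The toy class below only illustrates that role; the real MemoryStore synchronizes on `memoryManager` and tracks considerably more state.

```scala
import scala.collection.mutable

// Toy stand-in for onHeapUnrollMemoryMap / offHeapUnrollMemoryMap / pmemUnrollMemoryMap:
// a Long counter of reserved unroll bytes keyed by task attempt id.
class UnrollMemoryBookkeeping {
  private val unrollMemoryMap = mutable.HashMap[Long, Long]()

  // Add `bytes` to the running total reserved by this task attempt.
  def reserve(taskAttemptId: Long, bytes: Long): Unit = synchronized {
    unrollMemoryMap(taskAttemptId) = unrollMemoryMap.getOrElse(taskAttemptId, 0L) + bytes
  }

  // Drop the task's entry and return how much it had reserved.
  def releaseAll(taskAttemptId: Long): Long = synchronized {
    unrollMemoryMap.remove(taskAttemptId).getOrElse(0L)
  }
}
```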
From 1204cc11ca96883f16237381f18bd0e109afe906 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:28:19 +0800
Subject: [PATCH 31/35] Update BlockManager.scala
---
.../apache/spark/storage/BlockManager.scala | 200 +++++++++++++-----
1 file changed, 148 insertions(+), 52 deletions(-)
diff --git a/src/main/scala/org/apache/spark/storage/BlockManager.scala b/src/main/scala/org/apache/spark/storage/BlockManager.scala
index 5b3379c8..c78ca91f 100644
--- a/src/main/scala/org/apache/spark/storage/BlockManager.scala
+++ b/src/main/scala/org/apache/spark/storage/BlockManager.scala
@@ -18,12 +18,13 @@
package org.apache.spark.storage
import java.io._
-import java.lang.ref.{ReferenceQueue => JReferenceQueue, WeakReference}
+import java.lang.ref.{WeakReference, ReferenceQueue => JReferenceQueue}
import java.nio.ByteBuffer
import java.nio.channels.Channels
import java.util.Collections
import java.util.concurrent.{CompletableFuture, ConcurrentHashMap, TimeUnit}
+import scala.collection.JavaConverters._
import scala.collection.mutable
import scala.collection.mutable.HashMap
import scala.concurrent.{ExecutionContext, Future}
@@ -55,7 +56,8 @@ import org.apache.spark.network.util.TransportConf
import org.apache.spark.rpc.RpcEnv
import org.apache.spark.scheduler.ExecutorCacheTaskLocation
import org.apache.spark.serializer.{SerializerInstance, SerializerManager}
-import org.apache.spark.shuffle.{ShuffleManager, ShuffleWriteMetricsReporter}
+import org.apache.spark.shuffle.{MigratableResolver, ShuffleManager, ShuffleWriteMetricsReporter}
+import org.apache.spark.storage.BlockManagerMessages.{DecommissionBlockManager, ReplicateBlock}
import org.apache.spark.storage.memory._
import org.apache.spark.unsafe.Platform
import org.apache.spark.util._
@@ -119,9 +121,7 @@ private[spark] class ByteBufferBlockData(
private[spark] class HostLocalDirManager(
futureExecutionContext: ExecutionContext,
cacheSize: Int,
- externalBlockStoreClient: ExternalBlockStoreClient,
- host: String,
- externalShuffleServicePort: Int) extends Logging {
+ blockStoreClient: BlockStoreClient) extends Logging {
private val executorIdToLocalDirsCache =
CacheBuilder
@@ -129,24 +129,25 @@ private[spark] class HostLocalDirManager(
.maximumSize(cacheSize)
.build[String, Array[String]]()
- private[spark] def getCachedHostLocalDirs()
- : scala.collection.Map[String, Array[String]] = executorIdToLocalDirsCache.synchronized {
- import scala.collection.JavaConverters._
- return executorIdToLocalDirsCache.asMap().asScala
- }
+ private[spark] def getCachedHostLocalDirs: Map[String, Array[String]] =
+ executorIdToLocalDirsCache.synchronized {
+ executorIdToLocalDirsCache.asMap().asScala.toMap
+ }
private[spark] def getHostLocalDirs(
+ host: String,
+ port: Int,
executorIds: Array[String])(
- callback: Try[java.util.Map[String, Array[String]]] => Unit): Unit = {
+ callback: Try[Map[String, Array[String]]] => Unit): Unit = {
val hostLocalDirsCompletable = new CompletableFuture[java.util.Map[String, Array[String]]]
- externalBlockStoreClient.getHostLocalDirs(
+ blockStoreClient.getHostLocalDirs(
host,
- externalShuffleServicePort,
+ port,
executorIds,
hostLocalDirsCompletable)
hostLocalDirsCompletable.whenComplete { (hostLocalDirs, throwable) =>
if (hostLocalDirs != null) {
- callback(Success(hostLocalDirs))
+ callback(Success(hostLocalDirs.asScala.toMap))
executorIdToLocalDirsCache.synchronized {
executorIdToLocalDirsCache.putAll(hostLocalDirs)
}
@@ -164,7 +165,7 @@ private[spark] class HostLocalDirManager(
* Note that [[initialize()]] must be called before the BlockManager is usable.
*/
private[spark] class BlockManager(
- executorId: String,
+ val executorId: String,
rpcEnv: RpcEnv,
val master: BlockManagerMaster,
val serializerManager: SerializerManager,
@@ -255,7 +256,7 @@ private[spark] class BlockManager(
private val maxOnHeapMemory = memoryManager.maxOnHeapStorageMemory
private val maxOffHeapMemory = memoryManager.maxOffHeapStorageMemory
- private val externalShuffleServicePort = StorageUtils.externalShuffleServicePort(conf)
+ private[spark] val externalShuffleServicePort = StorageUtils.externalShuffleServicePort(conf)
var blockManagerId: BlockManagerId = _
@@ -271,9 +272,9 @@ private[spark] class BlockManager(
private val maxFailuresBeforeLocationRefresh =
conf.get(config.BLOCK_FAILURES_BEFORE_LOCATION_REFRESH)
- private val slaveEndpoint = rpcEnv.setupEndpoint(
+ private val storageEndpoint = rpcEnv.setupEndpoint(
"BlockManagerEndpoint" + BlockManager.ID_GENERATOR.next,
- new BlockManagerSlaveEndpoint(rpcEnv, this, mapOutputTracker))
+ new BlockManagerStorageEndpoint(rpcEnv, this, mapOutputTracker))
// Pending re-registration action being executed asynchronously or null if none is pending.
// Accesses should synchronize on asyncReregisterLock.
@@ -287,6 +288,10 @@ private[spark] class BlockManager(
private var blockReplicationPolicy: BlockReplicationPolicy = _
+ // visible for test
+ // This is volatile since if it's defined we should not accept remote blocks.
+ @volatile private[spark] var decommissioner: Option[BlockManagerDecommissioner] = None
+
// A DownloadFileManager used to track all the files of remote blocks which are above the
// specified memory threshold. Files will be deleted automatically based on weak reference.
// Exposed for test
@@ -296,6 +301,26 @@ private[spark] class BlockManager(
var hostLocalDirManager: Option[HostLocalDirManager] = None
+ @inline final private def isDecommissioning() = {
+ decommissioner.isDefined
+ }
+
+ @inline final private def checkShouldStore(blockId: BlockId) = {
+ // Don't reject broadcast blocks since they may be stored during task exec and
+ // don't need to be migrated.
+ if (isDecommissioning() && !blockId.isBroadcast) {
+ throw new BlockSavedOnDecommissionedBlockManagerException(blockId)
+ }
+ }
+
+ // This is a lazy val so someone can still migrate RDDs even if they don't have a
+ // MigratableResolver for shuffles. Used in BlockManagerDecommissioner & block puts.
+ private[storage] lazy val migratableResolver: MigratableResolver = {
+ shuffleManager.shuffleBlockResolver.asInstanceOf[MigratableResolver]
+ }
+
+ override def getLocalDiskDirs: Array[String] = diskBlockManager.localDirsString
+
/**
* Abstraction for storing blocks from bytes, whether they start in memory or on disk.
*
@@ -406,7 +431,7 @@ private[spark] class BlockManager(
ThreadUtils.awaitReady(replicationFuture, Duration.Inf)
} catch {
case NonFatal(t) =>
- throw new Exception("Error occurred while waiting for replication to finish", t)
+ throw new SparkException("Error occurred while waiting for replication to finish", t)
}
}
if (blockWasSuccessfullyStored) {
@@ -508,7 +533,7 @@ private[spark] class BlockManager(
diskBlockManager.localDirsString,
maxOnHeapMemory,
maxOffHeapMemory,
- slaveEndpoint)
+ storageEndpoint)
blockManagerId = if (idFromMaster != null) idFromMaster else id
@@ -524,20 +549,17 @@ private[spark] class BlockManager(
registerWithExternalShuffleServer()
}
- hostLocalDirManager =
+ hostLocalDirManager = {
if (conf.get(config.SHUFFLE_HOST_LOCAL_DISK_READING_ENABLED) &&
!conf.get(config.SHUFFLE_USE_OLD_FETCH_PROTOCOL)) {
- externalBlockStoreClient.map { blockStoreClient =>
- new HostLocalDirManager(
- futureExecutionContext,
- conf.get(config.STORAGE_LOCAL_DISK_BY_EXECUTORS_CACHE_SIZE),
- blockStoreClient,
- blockManagerId.host,
- externalShuffleServicePort)
- }
+ Some(new HostLocalDirManager(
+ futureExecutionContext,
+ conf.get(config.STORAGE_LOCAL_DISK_BY_EXECUTORS_CACHE_SIZE),
+ blockStoreClient))
} else {
None
}
+ }
logInfo(s"Initialized BlockManager: $blockManagerId")
}
@@ -586,8 +608,8 @@ private[spark] class BlockManager(
* an executor crash.
*
* This function deliberately fails silently if the master returns false (indicating that
- * the slave needs to re-register). The error condition will be detected again by the next
- * heart beat attempt or new block registration and another try to re-register all blocks
+ * the storage endpoint needs to re-register). The error condition will be detected again by the
+ * next heart beat attempt or new block registration and another try to re-register all blocks
* will be made then.
*/
private def reportAllBlocks(): Unit = {
@@ -611,7 +633,7 @@ private[spark] class BlockManager(
// TODO: We might need to rate limit re-registering.
logInfo(s"BlockManager $blockManagerId re-registering with master")
master.registerBlockManager(blockManagerId, diskBlockManager.localDirsString, maxOnHeapMemory,
- maxOffHeapMemory, slaveEndpoint)
+ maxOffHeapMemory, storageEndpoint)
reportAllBlocks()
}
@@ -660,7 +682,17 @@ private[spark] class BlockManager(
*/
override def getLocalBlockData(blockId: BlockId): ManagedBuffer = {
if (blockId.isShuffle) {
- shuffleManager.shuffleBlockResolver.getBlockData(blockId)
+ logDebug(s"Getting local shuffle block ${blockId}")
+ try {
+ shuffleManager.shuffleBlockResolver.getBlockData(blockId)
+ } catch {
+ case e: IOException =>
+ if (conf.get(config.STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).isDefined) {
+ FallbackStorage.read(conf, blockId)
+ } else {
+ throw e
+ }
+ }
} else {
getLocalBytes(blockId) match {
case Some(blockData) =>
@@ -693,6 +725,21 @@ private[spark] class BlockManager(
blockId: BlockId,
level: StorageLevel,
classTag: ClassTag[_]): StreamCallbackWithID = {
+
+ checkShouldStore(blockId)
+
+ if (blockId.isShuffle) {
+ logDebug(s"Putting shuffle block ${blockId}")
+ try {
+ return migratableResolver.putShuffleBlockAsStream(blockId, serializerManager)
+ } catch {
+ case e: ClassCastException => throw new SparkException(
+ s"Unexpected shuffle block ${blockId} with unsupported shuffle " +
+ s"resolver ${shuffleManager.shuffleBlockResolver}")
+ }
+ }
+ logDebug(s"Putting regular block ${blockId}")
+ // All other blocks
val (_, tmpFile) = diskBlockManager.createTempLocalBlock()
val channel = new CountingWritableChannel(
Channels.newChannel(serializerManager.wrapForEncryption(new FileOutputStream(tmpFile))))
@@ -713,7 +760,11 @@ private[spark] class BlockManager(
// stream.
channel.close()
val blockSize = channel.getCount
- TempFileBasedBlockStoreUpdater(blockId, level, classTag, tmpFile, blockSize).save()
+ val blockStored = TempFileBasedBlockStoreUpdater(
+ blockId, level, classTag, tmpFile, blockSize).save()
+ if (!blockStored) {
+ throw new Exception(s"Failure while trying to store block $blockId on $blockManagerId.")
+ }
}
override def onFailure(streamId: String, cause: Throwable): Unit = {
@@ -757,9 +808,9 @@ private[spark] class BlockManager(
*
* droppedMemorySize exists to account for when the block is dropped from memory to disk (so
* it is still valid). This ensures that update in master will compensate for the increase in
- * memory on slave.
+ * memory on the storage endpoint.
*/
- private def reportBlockStatus(
+ private[spark] def reportBlockStatus(
blockId: BlockId,
status: BlockStatus,
droppedMemorySize: Long = 0L): Unit = {
@@ -775,7 +826,7 @@ private[spark] class BlockManager(
/**
* Actually send a UpdateBlockInfo message. Returns the master's response,
* which will be true if the block was successfully recorded and false if
- * the slave needs to re-register.
+ * the storage endpoint needs to re-register.
*/
private def tryToReportBlockStatus(
blockId: BlockId,
@@ -973,7 +1024,7 @@ private[spark] class BlockManager(
require(blockId != null, "BlockId is null")
// Because all the remote blocks are registered in driver, it is not necessary to ask
- // all the slave executors to get block status.
+ // all the storage endpoints to get block status.
val locationsAndStatusOption = master.getLocationsAndStatus(blockId, blockManagerId.host)
if (locationsAndStatusOption.isEmpty) {
logDebug(s"Block $blockId is unknown by block manager master")
@@ -1115,7 +1166,7 @@ private[spark] class BlockManager(
blockSize: Long): Option[ManagedBuffer] = {
val file = ExecutorDiskUtils.getFile(localDirs, subDirsPerLocalDir, blockId.name)
if (file.exists()) {
- val mangedBuffer = securityManager.getIOEncryptionKey() match {
+ val managedBuffer = securityManager.getIOEncryptionKey() match {
case Some(key) =>
// Encrypted blocks cannot be memory mapped; return a special object that does decryption
// and provides InputStream / FileRegion implementations for reading the data.
@@ -1126,7 +1177,7 @@ private[spark] class BlockManager(
val transportConf = SparkTransportConf.fromSparkConf(conf, "shuffle")
new FileSegmentManagedBuffer(transportConf, file, 0, file.length)
}
- Some(mangedBuffer)
+ Some(managedBuffer)
} else {
None
}
@@ -1340,6 +1391,7 @@ private[spark] class BlockManager(
require(blockId != null, "BlockId is null")
require(level != null && level.isValid, "StorageLevel is null or invalid")
+ checkShouldStore(blockId)
val putBlockInfo = {
val newInfo = new BlockInfo(level, classTag, tellMaster)
@@ -1596,7 +1648,7 @@ private[spark] class BlockManager(
/**
* Get peer block managers in the system.
*/
- private def getPeers(forceFetch: Boolean): Seq[BlockManagerId] = {
+ private[storage] def getPeers(forceFetch: Boolean): Seq[BlockManagerId] = {
peerFetchLock.synchronized {
val cachedPeersTtl = conf.get(config.STORAGE_CACHED_PEERS_TTL) // milliseconds
val diff = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - lastPeerFetchTimeNs)
@@ -1606,23 +1658,32 @@ private[spark] class BlockManager(
lastPeerFetchTimeNs = System.nanoTime()
logDebug("Fetched peers from master: " + cachedPeers.mkString("[", ",", "]"))
}
- cachedPeers
+ if (cachedPeers.isEmpty &&
+ conf.get(config.STORAGE_DECOMMISSION_FALLBACK_STORAGE_PATH).isDefined) {
+ Seq(FallbackStorage.FALLBACK_BLOCK_MANAGER_ID)
+ } else {
+ cachedPeers
+ }
}
}
/**
- * Called for pro-active replenishment of blocks lost due to executor failures
+ * Replicates a block to peer block managers based on existingReplicas and maxReplicas
*
* @param blockId blockId being replicate
* @param existingReplicas existing block managers that have a replica
* @param maxReplicas maximum replicas needed
+ * @param maxReplicationFailures number of replication failures to tolerate before
+ * giving up.
+ * @return whether block was successfully replicated or not
*/
def replicateBlock(
blockId: BlockId,
existingReplicas: Set[BlockManagerId],
- maxReplicas: Int): Unit = {
+ maxReplicas: Int,
+ maxReplicationFailures: Option[Int] = None): Boolean = {
logInfo(s"Using $blockManagerId to pro-actively replicate $blockId")
- blockInfoManager.lockForReading(blockId).foreach { info =>
+ blockInfoManager.lockForReading(blockId).forall { info =>
val data = doGetLocalBytes(blockId, info)
val storageLevel = StorageLevel(
useDisk = info.level.useDisk,
@@ -1630,11 +1691,13 @@ private[spark] class BlockManager(
useOffHeap = info.level.useOffHeap,
deserialized = info.level.deserialized,
replication = maxReplicas)
- // we know we are called as a result of an executor removal, so we refresh peer cache
- // this way, we won't try to replicate to a missing executor with a stale reference
+ // We know we are called as a result of an executor removal or because the current executor
+ // is getting decommissioned, so we refresh the peer cache before trying replication; this
+ // way we won't try to replicate to a missing executor or to another decommissioning executor.
getPeers(forceFetch = true)
try {
- replicate(blockId, data, storageLevel, info.classTag, existingReplicas)
+ replicate(
+ blockId, data, storageLevel, info.classTag, existingReplicas, maxReplicationFailures)
} finally {
logDebug(s"Releasing lock for $blockId")
releaseLockAndDispose(blockId, data)
@@ -1651,9 +1714,11 @@ private[spark] class BlockManager(
data: BlockData,
level: StorageLevel,
classTag: ClassTag[_],
- existingReplicas: Set[BlockManagerId] = Set.empty): Unit = {
+ existingReplicas: Set[BlockManagerId] = Set.empty,
+ maxReplicationFailures: Option[Int] = None): Boolean = {
- val maxReplicationFailures = conf.get(config.STORAGE_MAX_REPLICATION_FAILURE)
+ val maxReplicationFailureCount = maxReplicationFailures.getOrElse(
+ conf.get(config.STORAGE_MAX_REPLICATION_FAILURE))
val tLevel = StorageLevel(
useDisk = level.useDisk,
useMemory = level.useMemory,
@@ -1677,7 +1742,7 @@ private[spark] class BlockManager(
blockId,
numPeersToReplicateTo)
- while(numFailures <= maxReplicationFailures &&
+ while(numFailures <= maxReplicationFailureCount &&
!peersForReplication.isEmpty &&
peersReplicatedTo.size < numPeersToReplicateTo) {
val peer = peersForReplication.head
@@ -1701,6 +1766,10 @@ private[spark] class BlockManager(
peersForReplication = peersForReplication.tail
peersReplicatedTo += peer
} catch {
+ // Rethrow interrupt exception
+ case e: InterruptedException =>
+ throw e
+ // Everything else we may retry
case NonFatal(e) =>
logWarning(s"Failed to replicate $blockId to $peer, failure #$numFailures", e)
peersFailedToReplicateTo += peer
@@ -1725,9 +1794,11 @@ private[spark] class BlockManager(
if (peersReplicatedTo.size < numPeersToReplicateTo) {
logWarning(s"Block $blockId replicated to only " +
s"${peersReplicatedTo.size} peer(s) instead of $numPeersToReplicateTo peers")
+ return false
}
logDebug(s"block $blockId replicated to ${peersReplicatedTo.mkString(", ")}")
+ return true
}
/**
@@ -1842,6 +1913,30 @@ private[spark] class BlockManager(
blocksToRemove.size
}
+ def decommissionBlockManager(): Unit = storageEndpoint.ask(DecommissionBlockManager)
+
+ private[spark] def decommissionSelf(): Unit = synchronized {
+ decommissioner match {
+ case None =>
+ logInfo("Starting block manager decommissioning process...")
+ decommissioner = Some(new BlockManagerDecommissioner(conf, this))
+ decommissioner.foreach(_.start())
+ case Some(_) =>
+ logDebug("Block manager already in decommissioning state")
+ }
+ }
+
+ /**
+ * Returns the last migration time and a boolean denoting if all the blocks have been migrated.
+ * If any tasks have been running since that time, the boolean may be incorrect.
+ */
+ private[spark] def lastMigrationInfo(): (Long, Boolean) = {
+ decommissioner.map(_.lastMigrationInfo()).getOrElse((0, false))
+ }
+
+ private[storage] def getMigratableRDDBlocks(): Seq[ReplicateBlock] =
+ master.getReplicateInfoForRDDBlocks(blockManagerId)
+
/**
* Remove all blocks belonging to the given broadcast.
*/
@@ -1911,6 +2006,7 @@ private[spark] class BlockManager(
}
def stop(): Unit = {
+ decommissioner.foreach(_.stop())
blockTransferService.close()
if (blockStoreClient ne blockTransferService) {
// Closing should be idempotent, but maybe not for the NioBlockTransferService.
@@ -1918,7 +2014,7 @@ private[spark] class BlockManager(
}
remoteBlockTempFileManager.stop()
diskBlockManager.stop()
- rpcEnv.stop(slaveEndpoint)
+ rpcEnv.stop(storageEndpoint)
blockInfoManager.clear()
memoryStore.clear()
futureExecutionContext.shutdownNow()
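
Taken together, the decommissioning pieces of this patch follow one pattern: `decommissionSelf()` installs a `BlockManagerDecommissioner` at most once, `checkShouldStore` then rejects every new non-broadcast block with `BlockSavedOnDecommissionedBlockManagerException` so migration can drain the existing ones, and `replicateBlock` now returns a `Boolean` (with an optional failure budget) so the decommissioner can tell whether a migration attempt succeeded. The sketch below is a minimal, hypothetical rendering of that gate; the class and exception names are stand-ins, not the real BlockManager types.

```scala
// Minimal sketch of the decommissioning gate; names are illustrative stand-ins.
class BlockRejectedException(blockId: String)
  extends Exception(s"Block $blockId arrived while this block manager is decommissioning")

class DecommissioningStore {
  // Mirrors `decommissioner: Option[BlockManagerDecommissioner]`:
  // once set, the manager is draining and should not accept new non-broadcast blocks.
  @volatile private var decommissioning = false

  def decommissionSelf(): Unit = synchronized {
    // The real code also starts the decommissioner's migration threads here.
    decommissioning = true
  }

  def put(blockId: String, isBroadcast: Boolean): Unit = {
    // Broadcast blocks are still accepted: they are produced during task execution
    // and never need to be migrated. Everything else is rejected while draining.
    if (decommissioning && !isBroadcast) {
      throw new BlockRejectedException(blockId)
    }
    // ... store the block as usual ...
  }
}
```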
From 99d75df71e6c55f149b7a7b17aba8dbbe17cb68a Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:28:56 +0800
Subject: [PATCH 32/35] Update PMemBlockObjectWriter.scala
From 30f5d5f7d075f50850edcb87968edb13fcf24678 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:29:12 +0800
Subject: [PATCH 33/35] Update StorageLevel.scala
---
src/main/scala/org/apache/spark/storage/StorageLevel.scala | 2 ++
1 file changed, 2 insertions(+)
diff --git a/src/main/scala/org/apache/spark/storage/StorageLevel.scala b/src/main/scala/org/apache/spark/storage/StorageLevel.scala
index f31a9f13..01970ca0 100644
--- a/src/main/scala/org/apache/spark/storage/StorageLevel.scala
+++ b/src/main/scala/org/apache/spark/storage/StorageLevel.scala
@@ -166,6 +166,7 @@ object StorageLevel {
val NONE = new StorageLevel(false, false, false, false, false)
val DISK_ONLY = new StorageLevel(true, false, false, false, false)
val DISK_ONLY_2 = new StorageLevel(true, false, false, false, false, 2)
+ val DISK_ONLY_3 = new StorageLevel(true, false, false, false, false, 3)
val MEMORY_ONLY = new StorageLevel(false, true, false, false, true)
val MEMORY_ONLY_2 = new StorageLevel(false, true, false, false, true, 2)
val MEMORY_ONLY_SER = new StorageLevel(false, true, false, false, false)
@@ -187,6 +188,7 @@ object StorageLevel {
case "NONE" => NONE
case "DISK_ONLY" => DISK_ONLY
case "DISK_ONLY_2" => DISK_ONLY_2
+ case "DISK_ONLY_3" => DISK_ONLY_3
case "MEMORY_ONLY" => MEMORY_ONLY
case "MEMORY_ONLY_2" => MEMORY_ONLY_2
case "MEMORY_ONLY_SER" => MEMORY_ONLY_SER
From 3b5f6172d093427b217fa85c946cccef8c5f10b0 Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:29:26 +0800
Subject: [PATCH 34/35] Update SparkEnv.scala
---
src/main/scala/org/apache/spark/SparkEnv.scala | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
diff --git a/src/main/scala/org/apache/spark/SparkEnv.scala b/src/main/scala/org/apache/spark/SparkEnv.scala
index da83f262..9b90ac9e 100644
--- a/src/main/scala/org/apache/spark/SparkEnv.scala
+++ b/src/main/scala/org/apache/spark/SparkEnv.scala
@@ -125,9 +125,12 @@ class SparkEnv (
private[spark]
def destroyPythonWorker(pythonExec: String,
envVars: Map[String, String], worker: Socket): Unit = {
- synchronized {
- val key = (pythonExec, envVars)
- pythonWorkers.get(key).foreach(_.stopWorker(worker))
+ def destroyPythonWorker(pythonExec: String,
+ envVars: Map[String, String], worker: Socket): Unit = {
+ synchronized {
+ val key = (pythonExec, envVars)
+ pythonWorkers.get(key).foreach(_.stopWorker(worker))
+ }
}
}
@@ -377,7 +380,8 @@ object SparkEnv extends Logging {
externalShuffleClient
} else {
None
- }, blockManagerInfo)),
+ }, blockManagerInfo,
+ mapOutputTracker.asInstanceOf[MapOutputTrackerMaster])),
registerOrLookupEndpoint(
BlockManagerMaster.DRIVER_HEARTBEAT_ENDPOINT_NAME,
new BlockManagerMasterHeartbeatEndpoint(rpcEnv, isLocal, blockManagerInfo)),
@@ -463,7 +467,8 @@ object SparkEnv extends Logging {
hadoopConf: Configuration,
schedulingMode: String,
addedJars: Seq[String],
- addedFiles: Seq[String]): Map[String, Seq[(String, String)]] = {
+ addedFiles: Seq[String],
+ addedArchives: Seq[String]): Map[String, Seq[(String, String)]] = {
import Properties._
val jvmInformation = Seq(
@@ -493,7 +498,7 @@ object SparkEnv extends Logging {
.split(File.pathSeparator)
.filterNot(_.isEmpty)
.map((_, "System Classpath"))
- val addedJarsAndFiles = (addedJars ++ addedFiles).map((_, "Added By User"))
+ val addedJarsAndFiles = (addedJars ++ addedFiles ++ addedArchives).map((_, "Added By User"))
val classPaths = (addedJarsAndFiles ++ classPathEntries).sorted
// Add Hadoop properties, it will not ignore configs including in Spark. Some spark
From 0c983ead75485b85ecb70fd4ae2e605cb1d1da1d Mon Sep 17 00:00:00 2001
From: Pei Qi <37234782+JustDoCoder@users.noreply.github.com>
Date: Fri, 16 Apr 2021 09:29:53 +0800
Subject: [PATCH 35/35] Update TestMemoryManager.scala