From 094843d19a579f6d31caff99d16df4889af511ef Mon Sep 17 00:00:00 2001
From: Praveen Rewar <8457124+praveenrewar@users.noreply.github.com>
Date: Mon, 19 Sep 2022 13:24:55 +0530
Subject: [PATCH] Apply as many changes as possible with errors at end (#532)

- Changes that depend on failing changes shouldn't be applied
- exit-early-on-apply-error flag can be used to exit as soon as an error is encountered while applying changes (default true)
- exit-early-on-wait-error flag can be used to exit as soon as an error is encountered while waiting for changes (default true)
---
 pkg/kapp/clusterapply/applying_changes.go   |  25 +-
 pkg/kapp/clusterapply/cluster_change.go     |   2 +-
 pkg/kapp/clusterapply/cluster_change_set.go |  27 ++-
 pkg/kapp/clusterapply/waiting_changes.go    |  27 ++-
 pkg/kapp/cmd/app/apply_flags.go             |   4 +
 test/e2e/apply_wait_error_test.go           | 253 ++++++++++++++++++++
 test/e2e/formatted_error_test.go            |   2 +-
 7 files changed, 317 insertions(+), 23 deletions(-)
 create mode 100644 test/e2e/apply_wait_error_test.go

diff --git a/pkg/kapp/clusterapply/applying_changes.go b/pkg/kapp/clusterapply/applying_changes.go
index 25bda01ad..b240deecf 100644
--- a/pkg/kapp/clusterapply/applying_changes.go
+++ b/pkg/kapp/clusterapply/applying_changes.go
@@ -23,10 +23,11 @@ type ApplyingChanges struct {
 	applied              map[*ctldgraph.Change]struct{}
 	clusterChangeFactory ClusterChangeFactory
 	ui                   UI
+	exitOnError          bool
 }
 
-func NewApplyingChanges(numTotal int, opts ApplyingChangesOpts, clusterChangeFactory ClusterChangeFactory, ui UI) *ApplyingChanges {
-	return &ApplyingChanges{numTotal, opts, map[*ctldgraph.Change]struct{}{}, clusterChangeFactory, ui}
+func NewApplyingChanges(numTotal int, opts ApplyingChangesOpts, clusterChangeFactory ClusterChangeFactory, ui UI, exitOnError bool) *ApplyingChanges {
+	return &ApplyingChanges{numTotal, opts, map[*ctldgraph.Change]struct{}{}, clusterChangeFactory, ui, exitOnError}
 }
 
 type applyResult struct {
@@ -37,14 +38,16 @@ type applyResult struct {
 	Err           error
 }
 
-func (c *ApplyingChanges) Apply(allChanges []*ctldgraph.Change) ([]WaitingChange, error) {
+func (c *ApplyingChanges) Apply(allChanges []*ctldgraph.Change) ([]WaitingChange, []string, error) {
 	startTime := time.Now()
 
+	var unsuccessfulChangeDesc []string
+
 	for {
 		nonAppliedChanges := c.nonAppliedChanges(allChanges)
 		if len(nonAppliedChanges) == 0 {
 			// Do not print applying message if no changes
-			return nil, nil
+			return nil, unsuccessfulChangeDesc, nil
 		}
 
 		c.ui.NotifySection("applying %d changes %s", len(nonAppliedChanges), c.stats())
@@ -88,10 +91,14 @@ func (c *ApplyingChanges) Apply(allChanges []*ctldgraph.Change) ([]WaitingChange
 
 			if result.Err != nil {
 				lastErr = result.Err
-				if result.Retryable {
-					continue
+				if !result.Retryable {
+					if c.exitOnError {
+						return nil, nil, result.Err
+					}
+					unsuccessfulChangeDesc = append(unsuccessfulChangeDesc, result.Err.Error())
+					c.markApplied(result.Change)
 				}
-				return nil, result.Err
+				continue
 			}
 
 			c.markApplied(result.Change)
@@ -99,11 +106,11 @@ func (c *ApplyingChanges) Apply(allChanges []*ctldgraph.Change) ([]WaitingChange
 		}
 
 		if len(appliedChanges) > 0 {
-			return appliedChanges, nil
+			return appliedChanges, unsuccessfulChangeDesc, nil
 		}
 
 		if time.Now().Sub(startTime) > c.opts.Timeout {
-			return nil, fmt.Errorf("Timed out waiting after %s: Last error: %w", c.opts.Timeout, lastErr)
+			return nil, unsuccessfulChangeDesc, fmt.Errorf("Timed out waiting after %s: Last error: %s", c.opts.Timeout, lastErr)
 		}
 
 		time.Sleep(c.opts.CheckInterval)
diff --git a/pkg/kapp/clusterapply/cluster_change.go b/pkg/kapp/clusterapply/cluster_change.go
index 84852677b..19b1921af 100644
--- a/pkg/kapp/clusterapply/cluster_change.go
+++ b/pkg/kapp/clusterapply/cluster_change.go
@@ -280,7 +280,7 @@ func (c *ClusterChange) applyErr(err error) error {
 		}
 	}
 
-	return fmt.Errorf("Applying %s: %s%s", c.ApplyDescription(),
+	return fmt.Errorf("%s: %s%s", c.ApplyDescription(),
 		uierrs.NewSemiStructuredError(err), hintMsg)
 }
 
diff --git a/pkg/kapp/clusterapply/cluster_change_set.go b/pkg/kapp/clusterapply/cluster_change_set.go
index 48d2c81cd..f60261253 100644
--- a/pkg/kapp/clusterapply/cluster_change_set.go
+++ b/pkg/kapp/clusterapply/cluster_change_set.go
@@ -5,7 +5,9 @@ package clusterapply
 
 import (
 	"fmt"
+	"strings"
 
+	uierrs "github.com/cppforlife/go-cli-ui/errors"
 	ctlconf "github.com/vmware-tanzu/carvel-kapp/pkg/kapp/config"
 	ctldiff "github.com/vmware-tanzu/carvel-kapp/pkg/kapp/diff"
 	ctldgraph "github.com/vmware-tanzu/carvel-kapp/pkg/kapp/diffgraph"
@@ -15,6 +17,9 @@ import (
 type ClusterChangeSetOpts struct {
 	ApplyingChangesOpts
 	WaitingChangesOpts
+
+	ExitEarlyOnApplyError bool
+	ExitEarlyOnWaitError  bool
 }
 
 type ClusterChangeSet struct {
@@ -95,18 +100,30 @@ func (c ClusterChangeSet) Apply(changesGraph *ctldgraph.ChangeGraph) error {
 
 	blockedChanges := ctldgraph.NewBlockedChanges(changesGraph)
 	applyingChanges := NewApplyingChanges(
-		expectedNumChanges, c.opts.ApplyingChangesOpts, c.clusterChangeFactory, c.ui)
-	waitingChanges := NewWaitingChanges(expectedNumChanges, c.opts.WaitingChangesOpts, c.ui)
+		expectedNumChanges, c.opts.ApplyingChangesOpts, c.clusterChangeFactory, c.ui, c.opts.ExitEarlyOnApplyError)
+	waitingChanges := NewWaitingChanges(expectedNumChanges, c.opts.WaitingChangesOpts, c.ui, c.opts.ExitEarlyOnWaitError)
+
+	var unsuccessfulChanges []string
 
 	for {
-		appliedChanges, err := applyingChanges.Apply(blockedChanges.Unblocked())
+		appliedChanges, unsuccessfulChangeDesc, err := applyingChanges.Apply(blockedChanges.Unblocked())
 		if err != nil {
 			return err
 		}
 
+		unsuccessfulChanges = append(unsuccessfulChanges, unsuccessfulChangeDesc...)
+
 		waitingChanges.Track(appliedChanges)
 
 		if waitingChanges.IsEmpty() {
+			if len(unsuccessfulChanges) == 1 {
+				return fmt.Errorf("%s", unsuccessfulChanges[0])
+			}
+
+			if len(unsuccessfulChanges) > 0 {
+				return uierrs.NewSemiStructuredError(fmt.Errorf("[%s]", strings.Join(unsuccessfulChanges, ", ")))
+			}
+
 			err := applyingChanges.Complete()
 			if err != nil {
 				c.ui.Notify([]string{fmt.Sprintf("Blocked changes:\n%s\n", blockedChanges.WhyBlocked(blockedChanges.Blocked()))})
@@ -116,11 +133,13 @@ func (c ClusterChangeSet) Apply(changesGraph *ctldgraph.ChangeGraph) error {
 			return waitingChanges.Complete()
 		}
 
-		doneChanges, err := waitingChanges.WaitForAny()
+		doneChanges, unsuccessfulChangeDesc, err := waitingChanges.WaitForAny()
 		if err != nil {
 			return err
 		}
 
+		unsuccessfulChanges = append(unsuccessfulChanges, unsuccessfulChangeDesc...)
+
 		for _, change := range doneChanges {
 			blockedChanges.Unblock(change.Graph)
 		}
diff --git a/pkg/kapp/clusterapply/waiting_changes.go b/pkg/kapp/clusterapply/waiting_changes.go
index 3cedd8430..18695caac 100644
--- a/pkg/kapp/clusterapply/waiting_changes.go
+++ b/pkg/kapp/clusterapply/waiting_changes.go
@@ -27,6 +27,7 @@ type WaitingChanges struct {
 	trackedChanges []WaitingChange
 	opts           WaitingChangesOpts
 	ui             UI
+	exitOnError    bool
 }
 
 type WaitingChange struct {
@@ -35,8 +36,8 @@ type WaitingChange struct {
 	startTime time.Time
 }
 
-func NewWaitingChanges(numTotal int, opts WaitingChangesOpts, ui UI) *WaitingChanges {
-	return &WaitingChanges{numTotal, 0, nil, opts, ui}
+func NewWaitingChanges(numTotal int, opts WaitingChangesOpts, ui UI, exitOnError bool) *WaitingChanges {
+	return &WaitingChanges{numTotal, 0, nil, opts, ui, exitOnError}
 }
 
 func (c *WaitingChanges) Track(changes []WaitingChange) {
@@ -54,7 +55,7 @@ type waitResult struct {
 	Err      error
 }
 
-func (c *WaitingChanges) WaitForAny() ([]WaitingChange, error) {
+func (c *WaitingChanges) WaitForAny() ([]WaitingChange, []string, error) {
 	startTime := time.Now()
 
 	for {
@@ -83,6 +84,7 @@ func (c *WaitingChanges) WaitForAny() ([]WaitingChange, error) {
 
 		var newInProgressChanges []WaitingChange
 		var doneChanges []WaitingChange
+		var unsuccessfulChangeDesc []string
 
 		for i := 0; i < len(c.trackedChanges); i++ {
 			result := <-waitCh
@@ -92,7 +94,12 @@ func (c *WaitingChanges) WaitForAny() ([]WaitingChange, error) {
 			c.ui.Notify(descMsgs)
 
 			if err != nil {
-				return nil, fmt.Errorf("%s: Errored: %w", desc, err)
+				err = fmt.Errorf("%s: Errored: %w", desc, err)
+				if c.exitOnError {
+					return nil, nil, err
+				}
+				unsuccessfulChangeDesc = append(unsuccessfulChangeDesc, err.Error())
+				continue
 			}
 			if state.Done {
 				c.numWaited++
@@ -111,7 +118,11 @@ func (c *WaitingChanges) WaitForAny() ([]WaitingChange, error) {
 				if len(state.Message) > 0 {
 					msg += " (" + state.Message + ")"
 				}
-				return nil, fmt.Errorf("%s: Finished unsuccessfully%s", desc, msg)
+				err := fmt.Errorf("%s: Finished unsuccessfully%s", desc, msg)
+				if c.exitOnError {
+					return nil, nil, err
+				}
+				unsuccessfulChangeDesc = append(unsuccessfulChangeDesc, err.Error())
 
 			case state.Done && state.Successful:
 				doneChanges = append(doneChanges, change)
@@ -120,8 +131,8 @@ func (c *WaitingChanges) WaitForAny() ([]WaitingChange, error) {
 
 		c.trackedChanges = newInProgressChanges
 
-		if len(c.trackedChanges) == 0 || len(doneChanges) > 0 {
-			return doneChanges, nil
+		if len(c.trackedChanges) == 0 || len(doneChanges) > 0 || len(unsuccessfulChangeDesc) > 0 {
+			return doneChanges, unsuccessfulChangeDesc, nil
 		}
 
 		if time.Now().Sub(startTime) > c.opts.Timeout {
@@ -129,7 +140,7 @@ func (c *WaitingChanges) WaitForAny() ([]WaitingChange, error) {
 			for _, change := range c.trackedChanges {
 				trackedResourcesDesc = append(trackedResourcesDesc, change.Cluster.Resource().Description())
 			}
-			return nil, uierrs.NewSemiStructuredError(fmt.Errorf("Timed out waiting after %s for resources: [%s]", c.opts.Timeout, strings.Join(trackedResourcesDesc, ", ")))
+			return nil, unsuccessfulChangeDesc, uierrs.NewSemiStructuredError(fmt.Errorf("Timed out waiting after %s for resources: [%s]", c.opts.Timeout, strings.Join(trackedResourcesDesc, ", ")))
 		}
 
 		time.Sleep(c.opts.CheckInterval)
diff --git a/pkg/kapp/cmd/app/apply_flags.go b/pkg/kapp/cmd/app/apply_flags.go
index ce4865a35..ecbd50293 100644
--- a/pkg/kapp/cmd/app/apply_flags.go
+++ b/pkg/kapp/cmd/app/apply_flags.go
@@ -50,6 +50,8 @@ func (s *ApplyFlags) SetWithDefaults(prefix string, defaults ApplyFlags, cmd *co
 	cmd.Flags().StringVar(&s.AddOrUpdateChangeOpts.DefaultUpdateStrategy, prefix+"apply-default-update-strategy",
 		defaults.AddOrUpdateChangeOpts.DefaultUpdateStrategy, "Change default update strategy")
 
+	cmd.Flags().BoolVar(&s.ExitEarlyOnApplyError, prefix+"exit-early-on-apply-error", true, "Exit quickly on apply failure")
+
 	cmd.Flags().BoolVar(&s.Wait, prefix+"wait", defaults.Wait, "Set to wait for changes to be applied")
 	cmd.Flags().BoolVar(&s.WaitIgnored, prefix+"wait-ignored", defaults.WaitIgnored, "Set to wait for ignored changes to be applied")
 
@@ -63,6 +65,8 @@ func (s *ApplyFlags) SetWithDefaults(prefix string, defaults ApplyFlags, cmd *co
 		5, "Maximum number of concurrent wait operations")
 
 	cmd.Flags().BoolVar(&s.ExitStatus, prefix+"apply-exit-status", false, "Return specific exit status based on number of changes")
+
+	cmd.Flags().BoolVar(&s.ExitEarlyOnWaitError, prefix+"exit-early-on-wait-error", true, "Exit quickly on wait failure")
 }
 
 func mustParseDuration(str string) time.Duration {
diff --git a/test/e2e/apply_wait_error_test.go b/test/e2e/apply_wait_error_test.go
new file mode 100644
index 000000000..a4fc2a971
--- /dev/null
+++ b/test/e2e/apply_wait_error_test.go
@@ -0,0 +1,253 @@
+// Copyright 2022 VMware, Inc.
+// SPDX-License-Identifier: Apache-2.0
+
+package e2e
+
+import (
+	"fmt"
+	"strings"
+	"testing"
+
+	uitest "github.com/cppforlife/go-cli-ui/ui/test"
+	"github.com/stretchr/testify/require"
+)
+
+func TestApplyWaitErrors(t *testing.T) {
+	env := BuildEnv(t)
+	logger := Logger{}
+	kapp := Kapp{t, env.Namespace, env.KappBinaryPath, logger}
+
+	yaml1 := `
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: my-job-fail
+  annotations:
+    kapp.k14s.io/change-group: "job-fail"
+spec:
+  template:
+    metadata:
+      name: my-job-fail
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: my-job-fail
+          image: busybox
+          command: [ "sh", "-c", "exit 1" ]
+  backoffLimit: 0
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: my-job-fail-2
+  annotations:
+    kapp.k14s.io/change-group: "job-fail"
+spec:
+  template:
+    metadata:
+      name: my-job-fail-2
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: my-job-fail-2
+          image: busybox
+          command: [ "sh", "-c", "exit 1" ]
+  backoffLimit: 0
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: my-job-succeed
+  annotations:
+    kapp.k14s.io/change-group: "job-succeed"
+    kapp.k14s.io/change-rule: "upsert before upserting service-not-deployed"
+spec:
+  template:
+    metadata:
+      name: my-job-succeed
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: my-job-succeed
+          image: busybox
+          command: [ "sh", "-c", "exit 0" ]
+  backoffLimit: 0
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: service-succeed
+  annotations:
+    kapp.k14s.io/change-rule: "upsert after upserting job-succeed"
+spec:
+  ports:
+  - port: 80
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: service-fail
+  annotations:
+    kapp.k14s.io/change-group: "service-fail"
+    kapp.k14s.io/change-rule: "upsert after upserting job-succeed"
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: service-not-deployed
+  annotations:
+    kapp.k14s.io/change-group: "service-not-deployed"
+    kapp.k14s.io/change-rule: "upsert after upserting job-fail"
+spec:
+  ports:
+  - port: 80
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: service-not-deployed-2
+  annotations:
+    kapp.k14s.io/change-rule: "upsert after upserting service-fail"
+spec:
+  ports:
+  - port: 80
+`
+
+	name := "test-multiple-errors"
+	cleanUp := func() {
+		kapp.Run([]string{"delete", "-a", name})
+	}
+
+	cleanUp()
+	defer cleanUp()
+
+	logger.Section("deploy with multiple errors and exit early on error set to false", func() {
+		_, err := kapp.RunWithOpts([]string{"deploy", "-f", "-", "-a", name, "--exit-early-on-apply-error=false", "--exit-early-on-wait-error=false"},
+			RunOpts{StdinReader: strings.NewReader(yaml1), AllowError: true})
+
+		expectedErrs := []string{
+			`kapp: Error:`,
+			fmt.Sprintf(`- waiting on reconcile job/my-job-fail-2 (batch/v1) namespace: %s: Finished unsuccessfully (Failed with reason BackoffLimitExceeded: Job has reached the specified backoff limit)`, env.Namespace),
+			fmt.Sprintf(`- waiting on reconcile job/my-job-fail (batch/v1) namespace: %s: Finished unsuccessfully (Failed with reason BackoffLimitExceeded: Job has reached the specified backoff limit)`, env.Namespace),
+			fmt.Sprintf(`- create service/service-fail (v1) namespace: %s: Creating resource service/service-fail (v1) namespace: kapp-test: API server says: Service "service-fail" is invalid: spec.ports: Required value (reason: Invalid)`, env.Namespace),
+		}
+
+		for _, expectedErr := range expectedErrs {
+			require.Containsf(t, err.Error(), expectedErr, "Expected to see expected err in output, but did not")
+		}
+
+		out := kapp.Run([]string{"inspect", "-a", name, "--filter-kind", "Service", "--json"})
+
+		expectedResources := []map[string]string{{
+			"age":             "<replaced>",
+			"kind":            "Service",
+			"name":            "service-succeed",
+			"namespace":       env.Namespace,
+			"owner":           "kapp",
+			"reconcile_info":  "",
+			"reconcile_state": "ok",
+		}}
+
+		inspectOut := uitest.JSONUIFromBytes(t, []byte(out))
+
+		require.Exactlyf(t, expectedResources, replaceAge(inspectOut.Tables[0].Rows), "Expected to see correct changes")
+	})
+}
+
+func TestExitEarlyOnApplyErrorFlag(t *testing.T) {
+	env := BuildEnv(t)
+	logger := Logger{}
+	kapp := Kapp{t, env.Namespace, env.KappBinaryPath, logger}
+
+	yaml1 := `
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: service-fail
+---
+apiVersion: batch/v1
+kind: Job
+metadata:
+  name: my-job-succeed
+  annotations:
+    kapp.k14s.io/change-group: "job-succeed"
+spec:
+  template:
+    metadata:
+      name: my-job-succeed
+    spec:
+      restartPolicy: Never
+      containers:
+        - name: my-job-succeed
+          image: busybox
+          command: [ "sh", "-c", "exit 0" ]
+  backoffLimit: 0
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: service-succeed
+  annotations:
+    kapp.k14s.io/change-rule: "upsert after upserting job-succeed"
+spec:
+  ports:
+  - port: 80
+`
+
+	name := "test-exit-early-on-apply-error"
+	cleanUp := func() {
+		kapp.Run([]string{"delete", "-a", name})
+	}
+
+	cleanUp()
+	defer cleanUp()
+
+	expectedErr := strings.ReplaceAll(`kapp: Error: create service/service-fail (v1) namespace: <test-ns>:
+  Creating resource service/service-fail (v1) namespace: <test-ns>:
+    API server says:
+      Service "service-fail" is invalid: spec.ports:
+        Required value (reason: Invalid)
+`, "<test-ns>", env.Namespace)
+
+	logger.Section("deploy with exit-early-on-apply-error=false", func() {
+		_, err := kapp.RunWithOpts([]string{"deploy", "-f", "-", "-a", name, "--exit-early-on-apply-error=false"},
+			RunOpts{StdinReader: strings.NewReader(yaml1), AllowError: true})
+
+		require.Containsf(t, err.Error(), expectedErr, "Expected to see expected err in output, but did not")
+
+		out := kapp.Run([]string{"inspect", "-a", name, "--json", "--filter-kind", "Service"})
+
+		expectedResources := []map[string]string{{
+			"age":             "<replaced>",
+			"kind":            "Service",
+			"name":            "service-succeed",
+			"namespace":       env.Namespace,
+			"owner":           "kapp",
+			"reconcile_info":  "",
+			"reconcile_state": "ok",
+		}}
+
+		inspectOut := uitest.JSONUIFromBytes(t, []byte(out))
+
+		require.Exactlyf(t, expectedResources, replaceAge(inspectOut.Tables[0].Rows), "Expected to see correct changes")
+	})
+
+	cleanUp()
+
+	logger.Section("deploy with exit-early-on-apply-error=true", func() {
+		_, err := kapp.RunWithOpts([]string{"deploy", "-f", "-", "-a", name},
+			RunOpts{StdinReader: strings.NewReader(yaml1), AllowError: true})
+
+		require.Containsf(t, err.Error(), expectedErr, "Expected to see expected err in output, but did not")
+
+		out := kapp.Run([]string{"inspect", "-a", name, "--json", "--filter-kind", "Service"})
+
+		expectedResources := []map[string]string{}
+
+		inspectOut := uitest.JSONUIFromBytes(t, []byte(out))
+
+		require.Exactlyf(t, expectedResources, replaceAge(inspectOut.Tables[0].Rows), "Expected to see correct changes")
+	})
+}
diff --git a/test/e2e/formatted_error_test.go b/test/e2e/formatted_error_test.go
index 345c77f0c..7dfe27782 100644
--- a/test/e2e/formatted_error_test.go
+++ b/test/e2e/formatted_error_test.go
@@ -50,7 +50,7 @@ spec:
 
 	logger.Section("deploy with errors", func() {
 		expectedErr := strings.TrimSpace(`
-kapp: Error: Applying create job/successful-job (batch/v1) namespace: default:
+kapp: Error: create job/successful-job (batch/v1) namespace: default:
   Creating resource job/successful-job (batch/v1) namespace: default:
     API server says:
       Job.batch "successful-job" is invalid: