From 4d26204c631442e777b7f0156feba509a332880c Mon Sep 17 00:00:00 2001
From: Isak Styf <isak.styf@masarinconsulting.se>
Date: Fri, 1 Nov 2024 16:39:27 +0100
Subject: [PATCH 1/5] abort kill delay if process closes after interrupt

if the process to be killed honours the SIGINT and closes down,
we should not sleep before trying to kill a process that is
no longer there
---
 runner/util_linux.go | 43 ++++++++++++++++++++++++++++++++++++------
 runner/util_unix.go  | 45 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 76 insertions(+), 12 deletions(-)

diff --git a/runner/util_linux.go b/runner/util_linux.go
index 658671b3..27cb5650 100644
--- a/runner/util_linux.go
+++ b/runner/util_linux.go
@@ -1,6 +1,7 @@
 package runner
 
 import (
+	"context"
 	"io"
 	"os/exec"
 	"syscall"
@@ -12,20 +13,50 @@ import (
 func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 	pid = cmd.Process.Pid
 
+	var killDelay time.Duration
+
 	if e.config.Build.SendInterrupt {
+		e.mainDebug("sending interrupt to process %d", pid)
 		// Sending a signal to make it clear to the process that it is time to turn off
 		if err = syscall.Kill(-pid, syscall.SIGINT); err != nil {
 			return
 		}
-		time.Sleep(e.config.killDelay())
+		// the kill delay is 0 by default unless the user has configured send_interrupt=true
+		// in which case it is fetched from the kill_delay setting in the .air.toml
+		killDelay = e.config.killDelay()
+		e.mainDebug("setting a kill timer for %s", killDelay.String())
 	}
 
-	// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
-	err = syscall.Kill(-pid, syscall.SIGKILL)
+	waitResult := make(chan error)
+	go func() {
+		defer close(waitResult)
+		_, _ = cmd.Process.Wait()
+	}()
+
+	// prepare a cancel context that can stop the killing if it is not needed
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	killResult := make(chan error)
+	// Spawn a goroutine that will kill the process after kill delay if we have not
+	// received a wait result before that.
+	go func() {
+		select {
+		case <-time.After(killDelay):
+			// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
+			killResult <- syscall.Kill(-pid, syscall.SIGKILL)
+		case <-ctx.Done():
+			return
+		}
+	}()
 
-	// Wait releases any resources associated with the Process.
-	_, _ = cmd.Process.Wait()
-	return
+	for {
+		select {
+		case err = <-killResult:
+		case <-waitResult:
+			return
+		}
+	}
 }
 
 func (e *Engine) startCmd(cmd string) (*exec.Cmd, io.ReadCloser, io.ReadCloser, error) {
diff --git a/runner/util_unix.go b/runner/util_unix.go
index afd47c89..8259d0ee 100644
--- a/runner/util_unix.go
+++ b/runner/util_unix.go
@@ -3,6 +3,7 @@
 package runner
 
 import (
+	"context"
 	"io"
 	"os"
 	"os/exec"
@@ -13,18 +14,50 @@ import (
 func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 	pid = cmd.Process.Pid
 
+	var killDelay time.Duration
+
 	if e.config.Build.SendInterrupt {
+		e.mainDebug("sending interrupt to process %d", pid)
 		// Sending a signal to make it clear to the process that it is time to turn off
 		if err = syscall.Kill(-pid, syscall.SIGINT); err != nil {
 			return
 		}
-		time.Sleep(e.config.killDelay())
+		// the kill delay is 0 by default unless the user has configured send_interrupt=true
+		// in which case it is fetched from the kill_delay setting in the .air.toml
+		killDelay = e.config.killDelay()
+		e.mainDebug("setting a kill timer for %s", killDelay.String())
+	}
+
+	waitResult := make(chan error)
+	go func() {
+		defer close(waitResult)
+		_, _ = cmd.Process.Wait()
+	}()
+
+	// prepare a cancel context that can stop the killing if it is not needed
+	ctx, cancel := context.WithCancel(context.Background())
+	defer cancel()
+
+	killResult := make(chan error)
+	// Spawn a goroutine that will kill the process after kill delay if we have not
+	// received a wait result before that.
+	go func() {
+		select {
+		case <-time.After(killDelay):
+			// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
+			killResult <- syscall.Kill(-pid, syscall.SIGKILL)
+		case <-ctx.Done():
+			return
+		}
+	}()
+
+	for {
+		select {
+		case err = <-killResult:
+		case <-waitResult:
+			return
+		}
 	}
-	// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
-	err = syscall.Kill(-pid, syscall.SIGKILL)
-	// Wait releases any resources associated with the Process.
-	_, _ = cmd.Process.Wait()
-	return pid, err
 }
 
 func (e *Engine) startCmd(cmd string) (*exec.Cmd, io.ReadCloser, io.ReadCloser, error) {

From ffd34c9e1592b39e3e96a9e277ff51fbd5be7314 Mon Sep 17 00:00:00 2001
From: Isak Styf <isak.styf@masarinconsulting.se>
Date: Fri, 1 Nov 2024 18:57:26 +0100
Subject: [PATCH 2/5] make sure to wait for both wait and kill results when
 kill delay is 0

---
 runner/util_linux.go | 28 +++++++++++++++++++++-------
 runner/util_test.go  |  3 ++-
 runner/util_unix.go  | 28 +++++++++++++++++++++-------
 3 files changed, 44 insertions(+), 15 deletions(-)

diff --git a/runner/util_linux.go b/runner/util_linux.go
index 27cb5650..ce1db1ff 100644
--- a/runner/util_linux.go
+++ b/runner/util_linux.go
@@ -2,6 +2,7 @@ package runner
 
 import (
 	"context"
+	"errors"
 	"io"
 	"os/exec"
 	"syscall"
@@ -27,12 +28,6 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 		e.mainDebug("setting a kill timer for %s", killDelay.String())
 	}
 
-	waitResult := make(chan error)
-	go func() {
-		defer close(waitResult)
-		_, _ = cmd.Process.Wait()
-	}()
-
 	// prepare a cancel context that can stop the killing if it is not needed
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
@@ -50,10 +45,29 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 		}
 	}()
 
+	waitResult := make(chan error)
+	go func() {
+		_, err := cmd.Process.Wait()
+		waitResult <- err
+	}()
+
+	results := make([]error, 0, 2)
+
 	for {
+		// collect the responses from the kill and wait goroutines
 		select {
 		case err = <-killResult:
-		case <-waitResult:
+			results = append(results, err)
+		case err = <-waitResult:
+			results = append(results, err)
+			// if we have a kill delay, we ignore the kill result
+			if killDelay > 0 {
+				results = append(results, nil)
+			}
+		}
+
+		if len(results) == 2 {
+			err = errors.Join(results...)
 			return
 		}
 	}
diff --git a/runner/util_test.go b/runner/util_test.go
index 275d5b78..3731fb7f 100644
--- a/runner/util_test.go
+++ b/runner/util_test.go
@@ -165,7 +165,8 @@ func Test_killCmd_no_process(t *testing.T) {
 		t.Errorf("expected error but got none")
 	}
 	if !errors.Is(err, syscall.ESRCH) {
-		t.Errorf("expected '%s' but got '%s'", syscall.ESRCH, errors.Unwrap(err))
+		err = errors.Unwrap(err)
+		t.Errorf("expected '%s' but got '%s'", syscall.ESRCH, err.Error())
 	}
 }
 
diff --git a/runner/util_unix.go b/runner/util_unix.go
index 8259d0ee..325835a3 100644
--- a/runner/util_unix.go
+++ b/runner/util_unix.go
@@ -4,6 +4,7 @@ package runner
 
 import (
 	"context"
+	"errors"
 	"io"
 	"os"
 	"os/exec"
@@ -28,12 +29,6 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 		e.mainDebug("setting a kill timer for %s", killDelay.String())
 	}
 
-	waitResult := make(chan error)
-	go func() {
-		defer close(waitResult)
-		_, _ = cmd.Process.Wait()
-	}()
-
 	// prepare a cancel context that can stop the killing if it is not needed
 	ctx, cancel := context.WithCancel(context.Background())
 	defer cancel()
@@ -51,10 +46,29 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 		}
 	}()
 
+	waitResult := make(chan error)
+	go func() {
+		_, err := cmd.Process.Wait()
+		waitResult <- err
+	}()
+
+	results := make([]error, 0, 2)
+
 	for {
+		// collect the responses from the kill and wait goroutines
 		select {
 		case err = <-killResult:
-		case <-waitResult:
+			results = append(results, err)
+		case err = <-waitResult:
+			results = append(results, err)
+			// if we have a kill delay, we ignore the kill result
+			if killDelay > 0 {
+				results = append(results, nil)
+			}
+		}
+
+		if len(results) == 2 {
+			err = errors.Join(results...)
 			return
 		}
 	}

From 45409a063f44a96487c7f0a34e1ca5c998651fe3 Mon Sep 17 00:00:00 2001
From: Isak Styf <isak.styf@masarinconsulting.se>
Date: Fri, 1 Nov 2024 21:37:28 +0100
Subject: [PATCH 3/5] use cmd.process.signal and cmd.process.kill instead of
 raw syscalls

---
 runner/util_linux.go | 23 ++++++++++++-----------
 runner/util_unix.go  | 21 +++++++++++----------
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/runner/util_linux.go b/runner/util_linux.go
index ce1db1ff..9b8f0762 100644
--- a/runner/util_linux.go
+++ b/runner/util_linux.go
@@ -4,8 +4,8 @@ import (
 	"context"
 	"errors"
 	"io"
+	"os"
 	"os/exec"
-	"syscall"
 	"time"
 
 	"github.com/creack/pty"
@@ -16,10 +16,16 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 
 	var killDelay time.Duration
 
+	waitResult := make(chan error)
+	go func() {
+		defer close(waitResult)
+		_, _ = cmd.Process.Wait()
+	}()
+
 	if e.config.Build.SendInterrupt {
 		e.mainDebug("sending interrupt to process %d", pid)
 		// Sending a signal to make it clear to the process that it is time to turn off
-		if err = syscall.Kill(-pid, syscall.SIGINT); err != nil {
+		if err = cmd.Process.Signal(os.Interrupt); err != nil {
 			return
 		}
 		// the kill delay is 0 by default unless the user has configured send_interrupt=true
@@ -38,19 +44,14 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 	go func() {
 		select {
 		case <-time.After(killDelay):
-			// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
-			killResult <- syscall.Kill(-pid, syscall.SIGKILL)
+			e.mainDebug("kill timer expired")
+			killResult <- cmd.Process.Kill()
 		case <-ctx.Done():
+			e.mainDebug("kill timer canceled")
 			return
 		}
 	}()
 
-	waitResult := make(chan error)
-	go func() {
-		_, err := cmd.Process.Wait()
-		waitResult <- err
-	}()
-
 	results := make([]error, 0, 2)
 
 	for {
@@ -61,7 +62,7 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 		case err = <-waitResult:
 			results = append(results, err)
 			// if we have a kill delay, we ignore the kill result
-			if killDelay > 0 {
+			if killDelay > 0 && len(results) == 1 {
 				results = append(results, nil)
 			}
 		}
diff --git a/runner/util_unix.go b/runner/util_unix.go
index 325835a3..ecadb19a 100644
--- a/runner/util_unix.go
+++ b/runner/util_unix.go
@@ -17,10 +17,16 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 
 	var killDelay time.Duration
 
+	waitResult := make(chan error)
+	go func() {
+		defer close(waitResult)
+		_, _ = cmd.Process.Wait()
+	}()
+
 	if e.config.Build.SendInterrupt {
 		e.mainDebug("sending interrupt to process %d", pid)
 		// Sending a signal to make it clear to the process that it is time to turn off
-		if err = syscall.Kill(-pid, syscall.SIGINT); err != nil {
+		if err = cmd.Process.Signal(os.Interrupt); err != nil {
 			return
 		}
 		// the kill delay is 0 by default unless the user has configured send_interrupt=true
@@ -39,19 +45,14 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 	go func() {
 		select {
 		case <-time.After(killDelay):
-			// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
-			killResult <- syscall.Kill(-pid, syscall.SIGKILL)
+			e.mainDebug("kill timer expired")
+			killResult <- cmd.Process.Kill()
 		case <-ctx.Done():
+			e.mainDebug("kill timer canceled")
 			return
 		}
 	}()
 
-	waitResult := make(chan error)
-	go func() {
-		_, err := cmd.Process.Wait()
-		waitResult <- err
-	}()
-
 	results := make([]error, 0, 2)
 
 	for {
@@ -62,7 +63,7 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 		case err = <-waitResult:
 			results = append(results, err)
 			// if we have a kill delay, we ignore the kill result
-			if killDelay > 0 {
+			if killDelay > 0 && len(results) == 1 {
 				results = append(results, nil)
 			}
 		}

From 4d976a7bb684367e8f06c70f8e9926bdce911a43 Mon Sep 17 00:00:00 2001
From: Isak Styf <isak.styf@masarinconsulting.se>
Date: Fri, 1 Nov 2024 23:19:03 +0100
Subject: [PATCH 4/5] switch back to syscall as os.process.signal or kill do
 not signal the entire group

---
 runner/util_linux.go | 9 +++++----
 runner/util_unix.go  | 7 ++++---
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/runner/util_linux.go b/runner/util_linux.go
index 9b8f0762..c8bbf8b6 100644
--- a/runner/util_linux.go
+++ b/runner/util_linux.go
@@ -4,8 +4,8 @@ import (
 	"context"
 	"errors"
 	"io"
-	"os"
 	"os/exec"
+	"syscall"
 	"time"
 
 	"github.com/creack/pty"
@@ -24,8 +24,8 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 
 	if e.config.Build.SendInterrupt {
 		e.mainDebug("sending interrupt to process %d", pid)
-		// Sending a signal to make it clear to the process that it is time to turn off
-		if err = cmd.Process.Signal(os.Interrupt); err != nil {
+		// Sending a signal to make it clear to the process group that it is time to turn off
+		if err = syscall.Kill(-pid, syscall.SIGINT); err != nil {
 			return
 		}
 		// the kill delay is 0 by default unless the user has configured send_interrupt=true
@@ -45,7 +45,8 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 		select {
 		case <-time.After(killDelay):
 			e.mainDebug("kill timer expired")
-			killResult <- cmd.Process.Kill()
+			// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
+			killResult <- syscall.Kill(-pid, syscall.SIGKILL)
 		case <-ctx.Done():
 			e.mainDebug("kill timer canceled")
 			return
diff --git a/runner/util_unix.go b/runner/util_unix.go
index ecadb19a..6201871c 100644
--- a/runner/util_unix.go
+++ b/runner/util_unix.go
@@ -25,8 +25,8 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 
 	if e.config.Build.SendInterrupt {
 		e.mainDebug("sending interrupt to process %d", pid)
-		// Sending a signal to make it clear to the process that it is time to turn off
-		if err = cmd.Process.Signal(os.Interrupt); err != nil {
+		// Sending a signal to make it clear to the process group that it is time to turn off
+		if err = syscall.Kill(-pid, syscall.SIGINT); err != nil {
 			return
 		}
 		// the kill delay is 0 by default unless the user has configured send_interrupt=true
@@ -46,7 +46,8 @@ func (e *Engine) killCmd(cmd *exec.Cmd) (pid int, err error) {
 		select {
 		case <-time.After(killDelay):
 			e.mainDebug("kill timer expired")
-			killResult <- cmd.Process.Kill()
+			// https://stackoverflow.com/questions/22470193/why-wont-go-kill-a-child-process-correctly
+			killResult <- syscall.Kill(-pid, syscall.SIGKILL)
 		case <-ctx.Done():
 			e.mainDebug("kill timer canceled")
 			return

From 52428bf7f43a18e0e96774071cccbf761d8bb1ca Mon Sep 17 00:00:00 2001
From: Isak Styf <isak.styf@masarinconsulting.se>
Date: Fri, 1 Nov 2024 23:29:49 +0100
Subject: [PATCH 5/5] remove unstable and slightly dangerous kill test

the test paniced if it ever got a nil err since t.Failf does not
imply that test is stopped

selecting a random pid to kill might have interesting side effects
when run and its result can never be guaranteed to be the same, meaning
that the test had intermittent failures

no code depends on the specific return code of killCmd, so there
does not seem to be a reason to have a unit test for it
---
 runner/util_test.go | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/runner/util_test.go b/runner/util_test.go
index 3731fb7f..8fd0a4b8 100644
--- a/runner/util_test.go
+++ b/runner/util_test.go
@@ -1,7 +1,6 @@
 package runner
 
 import (
-	"errors"
 	"fmt"
 	"os"
 	"os/exec"
@@ -10,7 +9,6 @@ import (
 	"runtime"
 	"strconv"
 	"strings"
-	"syscall"
 	"testing"
 	"time"
 
@@ -148,28 +146,6 @@ func TestAdaptToVariousPlatforms(t *testing.T) {
 	}
 }
 
-func Test_killCmd_no_process(t *testing.T) {
-	e := Engine{
-		config: &Config{
-			Build: cfgBuild{
-				SendInterrupt: false,
-			},
-		},
-	}
-	_, err := e.killCmd(&exec.Cmd{
-		Process: &os.Process{
-			Pid: 9999,
-		},
-	})
-	if err == nil {
-		t.Errorf("expected error but got none")
-	}
-	if !errors.Is(err, syscall.ESRCH) {
-		err = errors.Unwrap(err)
-		t.Errorf("expected '%s' but got '%s'", syscall.ESRCH, err.Error())
-	}
-}
-
 func Test_killCmd_SendInterrupt_false(t *testing.T) {
 	_, b, _, _ := runtime.Caller(0)