diff --git a/collector/container/Dockerfile b/collector/container/Dockerfile index 3ab2377157..17d789c368 100644 --- a/collector/container/Dockerfile +++ b/collector/container/Dockerfile @@ -30,6 +30,7 @@ COPY LICENSE-kernel-modules.txt /kernel-modules/LICENSE COPY container/libs/libsinsp-wrapper.so /usr/local/lib/ COPY container/bin/collector /usr/local/bin/ COPY container/bin/self-checks /usr/local/bin/self-checks +COPY container/status-check.sh /usr/local/bin/status-check.sh RUN echo '/usr/local/lib' > /etc/ld.so.conf.d/usrlocallib.conf && \ ldconfig && \ @@ -38,6 +39,14 @@ RUN echo '/usr/local/lib' > /etc/ld.so.conf.d/usrlocallib.conf && \ EXPOSE 8080 9090 +HEALTHCHECK \ + # health checks within the first 5s are not counted as failure + --start-period=5s \ + # perform health check every 5s + --interval=5s \ + # the command uses /ready API + CMD /usr/local/bin/status-check.sh + ENTRYPOINT ["/bootstrap.sh"] CMD collector-wrapper.sh \ diff --git a/collector/container/status-check.sh b/collector/container/status-check.sh new file mode 100755 index 0000000000..9a8d4fa5be --- /dev/null +++ b/collector/container/status-check.sh @@ -0,0 +1,21 @@ +#!/usr/bin/env bash + +# /ready API will return the following formatted response: +# { +# "collector" : { +# "drops" : 0, +# "events" : 9330070, +# "node" : "node.name", +# "preemptions" : 0 +# }, +# "status" : "ok" +# } +# +# Take the status line, split it by ":" and trim spaces and quotes. +STATUS=$(curl -s localhost:8080/ready | grep 'status' | awk -F ':' '{print $2}' | tr -d '"' | tr -d ' ') + +if [[ "${STATUS}" = "ok" ]]; then + exit 0 +else + exit 1 +fi diff --git a/integration-tests/suites/base.go b/integration-tests/suites/base.go index a0f8272c8a..c68bd9950e 100644 --- a/integration-tests/suites/base.go +++ b/integration-tests/suites/base.go @@ -69,9 +69,17 @@ func (s *IntegrationTestSuiteBase) StartCollector(disableGRPC bool, options *com s.Require().NoError(s.Collector().Setup(options)) s.Require().NoError(s.Collector().Launch()) + // Wait for collector to report healthy, includes initial setup and probes + // loading. It doesn't make sense to wait for very long, limit it to 1 min. + _, err := s.waitForContainerToBecomeHealthy( + "collector", + s.Collector().ContainerID, + defaultWaitTickSeconds, 1*time.Minute) + s.Require().NoError(err) // wait for self-check process to guarantee collector is started - s.Sensor().WaitProcessesN(s.Collector().ContainerID, 30*time.Second, 1) + selfCheckOk := s.Sensor().WaitProcessesN(s.Collector().ContainerID, 30*time.Second, 1) + s.Require().True(selfCheckOk) } // StopCollector will tear down the collector container and stop @@ -235,17 +243,33 @@ func (s *IntegrationTestSuiteBase) launchContainer(name string, args ...string) return outLines[len(outLines)-1], err } -func (s *IntegrationTestSuiteBase) waitForContainerToExit(containerName, containerID string, tickSeconds time.Duration) (bool, error) { +// Wait for a container to become a certain status. +// - tickSeconds -- how often to check for the status +// - timeoutThreshold -- the overall time limit for waiting, +// defaulting to 30 min if zero +// - filter -- description of the desired status +func (s *IntegrationTestSuiteBase) waitForContainerStatus( + containerName string, + containerID string, + tickSeconds time.Duration, + timeoutThreshold time.Duration, + filter string) (bool, error) { + cmd := []string{ common.RuntimeCommand, "ps", "-qa", "--filter", "id=" + containerID, - "--filter", "status=exited", + "--filter", filter, } start := time.Now() tick := time.Tick(tickSeconds) tickElapsed := time.Tick(1 * time.Minute) - timeout := time.After(30 * time.Minute) + + if timeoutThreshold == 0 { + timeoutThreshold = 30 * time.Minute + } + timeout := time.After(timeoutThreshold) + for { select { case <-tick: @@ -256,17 +280,41 @@ func (s *IntegrationTestSuiteBase) waitForContainerToExit(containerName, contain return true, nil } if err != nil { - fmt.Printf("Retrying waitForContainerToExit(%s, %s): Error: %v\n", containerName, containerID, err) + fmt.Printf("Retrying waitForContainerStatus(%s, %s): Error: %v\n", + containerName, containerID, err) } case <-timeout: - fmt.Printf("Timed out waiting for container %s to exit, elapsed Time: %s\n", containerName, time.Since(start)) - return false, nil + fmt.Printf("Timed out waiting for container %s to become %s, elapsed Time: %s\n", + containerName, filter, time.Since(start)) + return false, fmt.Errorf("Timeout waiting for container %s to become %s after %v", + containerName, filter, timeoutThreshold) case <-tickElapsed: - fmt.Printf("Waiting for container: %s, elapsed time: %s\n", containerName, time.Since(start)) + fmt.Printf("Waiting for container %s to become %s, elapsed time: %s\n", + containerName, filter, time.Since(start)) } } } +func (s *IntegrationTestSuiteBase) waitForContainerToBecomeHealthy( + containerName string, + containerID string, + tickSeconds time.Duration, + timeoutThreshold time.Duration) (bool, error) { + + return s.waitForContainerStatus(containerName, containerID, tickSeconds, + timeoutThreshold, "health=healthy") +} + +func (s *IntegrationTestSuiteBase) waitForContainerToExit( + containerName string, + containerID string, + tickSeconds time.Duration, + timeoutThreshold time.Duration) (bool, error) { + + return s.waitForContainerStatus(containerName, containerID, tickSeconds, + timeoutThreshold, "status=exited") +} + func (s *IntegrationTestSuiteBase) execContainer(containerName string, command []string) (string, error) { cmd := []string{common.RuntimeCommand, "exec", containerName} cmd = append(cmd, command...) @@ -342,7 +390,7 @@ func (s *IntegrationTestSuiteBase) RunCollectorBenchmark() { containerID, err := s.launchContainer(benchmarkName, benchmarkArgs...) s.Require().NoError(err) - _, err = s.waitForContainerToExit(benchmarkName, containerID, defaultWaitTickSeconds) + _, err = s.waitForContainerToExit(benchmarkName, containerID, defaultWaitTickSeconds, 0) s.Require().NoError(err) benchmarkLogs, err := s.containerLogs("benchmark") diff --git a/integration-tests/suites/benchmark.go b/integration-tests/suites/benchmark.go index 0d079ae5af..d9fbfd8a40 100644 --- a/integration-tests/suites/benchmark.go +++ b/integration-tests/suites/benchmark.go @@ -81,7 +81,7 @@ func (b *BenchmarkTestSuiteBase) RunInitContainer() { containerID, err := b.launchContainer("host-init", cmd...) require.NoError(b.T(), err) - if finished, _ := b.waitForContainerToExit("host-init", containerID, 5*time.Second); !finished { + if finished, _ := b.waitForContainerToExit("host-init", containerID, 5*time.Second, 0); !finished { logs, err := b.containerLogs("host-init") if err == nil { fmt.Println(logs) diff --git a/integration-tests/suites/image_json.go b/integration-tests/suites/image_json.go index 014cfec82b..784d86be3c 100644 --- a/integration-tests/suites/image_json.go +++ b/integration-tests/suites/image_json.go @@ -23,7 +23,7 @@ func (s *ImageLabelJSONTestSuite) TestRunImageWithJSONLabel() { containerID, err := s.launchContainer(name, image) s.Require().NoError(err) - _, err = s.waitForContainerToExit(name, containerID, defaultWaitTickSeconds) + _, err = s.waitForContainerToExit(name, containerID, defaultWaitTickSeconds, 0) s.Require().NoError(err) } diff --git a/integration-tests/suites/mock_sensor/expect_proc.go b/integration-tests/suites/mock_sensor/expect_proc.go index cd0511a7a4..709e9b9af6 100644 --- a/integration-tests/suites/mock_sensor/expect_proc.go +++ b/integration-tests/suites/mock_sensor/expect_proc.go @@ -18,7 +18,7 @@ func (s *MockSensor) ExpectProcessesN(t *testing.T, containerID string, timeout } func (s *MockSensor) WaitProcessesN(containerID string, timeout time.Duration, n int) bool { - return len(s.waitProcessesN(func() {}, containerID, timeout, n)) == n + return len(s.waitProcessesN(func() {}, containerID, timeout, n)) >= n } func (s *MockSensor) ExpectProcesses( @@ -101,7 +101,7 @@ func (s *MockSensor) ExpectLineages(t *testing.T, containerID string, timeout ti } func (s *MockSensor) waitProcessesN(timeoutFn func(), containerID string, timeout time.Duration, n int) []types.ProcessInfo { - if len(s.Processes(containerID)) == n { + if len(s.Processes(containerID)) >= n { return s.Processes(containerID) } @@ -116,7 +116,7 @@ loop: continue loop } - if len(s.Processes(containerID)) == n { + if len(s.Processes(containerID)) >= n { return s.Processes(containerID) } } diff --git a/integration-tests/suites/perf_event_open.go b/integration-tests/suites/perf_event_open.go index 1e51f13816..957a88d956 100644 --- a/integration-tests/suites/perf_event_open.go +++ b/integration-tests/suites/perf_event_open.go @@ -30,7 +30,7 @@ func (s *PerfEventOpenTestSuite) TestReadingTracepoints() { containerID, err := s.launchContainer("perf-event-open", "--privileged", image, "", "STDOUT") s.Require().NoError(err) - if finished, _ := s.waitForContainerToExit("perf-event-open", containerID, 5*time.Second); finished { + if finished, _ := s.waitForContainerToExit("perf-event-open", containerID, 5*time.Second, 0); finished { logs, err := s.containerLogs("perf-event-open") if err != nil { fmt.Println(logs)