Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Make tests verbose about startup failures #1473

Merged
merged 4 commits into from
Dec 19, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions collector/container/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ COPY LICENSE-kernel-modules.txt /kernel-modules/LICENSE
COPY container/libs/libsinsp-wrapper.so /usr/local/lib/
COPY container/bin/collector /usr/local/bin/
COPY container/bin/self-checks /usr/local/bin/self-checks
COPY container/status-check.sh /usr/local/bin/status-check.sh

RUN echo '/usr/local/lib' > /etc/ld.so.conf.d/usrlocallib.conf && \
ldconfig && \
Expand All @@ -38,6 +39,14 @@ RUN echo '/usr/local/lib' > /etc/ld.so.conf.d/usrlocallib.conf && \

EXPOSE 8080 9090

# Container health probe, implemented by status-check.sh.
HEALTHCHECK \
# failed health checks within the first 5s are not counted as failures
--start-period=5s \
# perform a health check every 5s
--interval=5s \
# the command queries the /ready API and succeeds only when it reports status "ok"
CMD /usr/local/bin/status-check.sh

ENTRYPOINT ["/bootstrap.sh"]

CMD collector-wrapper.sh \
Expand Down
21 changes: 21 additions & 0 deletions collector/container/status-check.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
#!/usr/bin/env bash

# Health check for the collector container.
#
# Exits 0 when the /ready endpoint reports status "ok" and 1 in every other
# case (endpoint unreachable, request timed out, or any other status value).
#
# /ready API will return the following formatted response:
# {
#   "collector" : {
#     "drops" : 0,
#     "events" : 9330070,
#     "node" : "node.name",
#     "preemptions" : 0
#   },
#   "status" : "ok"
# }
#
# Select only the line carrying the quoted "status" key, so that another
# field whose value merely contains the word (e.g. a node name) cannot match.
# Then split it by ":" and strip quotes, spaces and a possible trailing comma.
# The request is bounded with --max-time so a stuck endpoint makes the check
# fail instead of hanging.
STATUS=$(curl -s --max-time 5 localhost:8080/ready \
    | grep '"status"' \
    | awk -F ':' '{print $2}' \
    | tr -d '", ')

if [[ "${STATUS}" == "ok" ]]; then
    exit 0
fi

exit 1
66 changes: 57 additions & 9 deletions integration-tests/suites/base.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,17 @@ func (s *IntegrationTestSuiteBase) StartCollector(disableGRPC bool, options *com

s.Require().NoError(s.Collector().Setup(options))
s.Require().NoError(s.Collector().Launch())
// Wait for collector to report healthy, includes initial setup and probes
// loading. It doesn't make sense to wait for very long, limit it to 1 min.
_, err := s.waitForContainerToBecomeHealthy(
"collector",
s.Collector().ContainerID,
defaultWaitTickSeconds, 1*time.Minute)
s.Require().NoError(err)

// wait for self-check process to guarantee collector is started
s.Sensor().WaitProcessesN(s.Collector().ContainerID, 30*time.Second, 1)
selfCheckOk := s.Sensor().WaitProcessesN(s.Collector().ContainerID, 30*time.Second, 1)
s.Require().True(selfCheckOk)
}

// StopCollector will tear down the collector container and stop
Expand Down Expand Up @@ -235,17 +243,33 @@ func (s *IntegrationTestSuiteBase) launchContainer(name string, args ...string)
return outLines[len(outLines)-1], err
}

func (s *IntegrationTestSuiteBase) waitForContainerToExit(containerName, containerID string, tickSeconds time.Duration) (bool, error) {
// Wait for a container to become a certain status.
// - tickSeconds -- how often to check for the status
// - timeoutThreshold -- the overall time limit for waiting,
// defaulting to 30 min if zero
// - filter -- description of the desired status
func (s *IntegrationTestSuiteBase) waitForContainerStatus(
containerName string,
containerID string,
tickSeconds time.Duration,
timeoutThreshold time.Duration,
filter string) (bool, error) {

cmd := []string{
common.RuntimeCommand, "ps", "-qa",
"--filter", "id=" + containerID,
"--filter", "status=exited",
"--filter", filter,
}

start := time.Now()
tick := time.Tick(tickSeconds)
tickElapsed := time.Tick(1 * time.Minute)
timeout := time.After(30 * time.Minute)

if timeoutThreshold == 0 {
timeoutThreshold = 30 * time.Minute
}
timeout := time.After(timeoutThreshold)

for {
select {
case <-tick:
Expand All @@ -256,17 +280,41 @@ func (s *IntegrationTestSuiteBase) waitForContainerToExit(containerName, contain
return true, nil
}
if err != nil {
fmt.Printf("Retrying waitForContainerToExit(%s, %s): Error: %v\n", containerName, containerID, err)
fmt.Printf("Retrying waitForContainerStatus(%s, %s): Error: %v\n",
containerName, containerID, err)
}
case <-timeout:
fmt.Printf("Timed out waiting for container %s to exit, elapsed Time: %s\n", containerName, time.Since(start))
return false, nil
fmt.Printf("Timed out waiting for container %s to become %s, elapsed Time: %s\n",
containerName, filter, time.Since(start))
return false, fmt.Errorf("Timeout waiting for container %s to become %s after %v",
containerName, filter, timeoutThreshold)
case <-tickElapsed:
fmt.Printf("Waiting for container: %s, elapsed time: %s\n", containerName, time.Since(start))
fmt.Printf("Waiting for container %s to become %s, elapsed time: %s\n",
containerName, filter, time.Since(start))
}
}
}

// waitForContainerToBecomeHealthy waits until the container with the given ID
// matches the "health=healthy" runtime filter. All arguments are handed to
// waitForContainerStatus unchanged, and its results are returned as-is.
func (s *IntegrationTestSuiteBase) waitForContainerToBecomeHealthy(
	containerName, containerID string,
	tickSeconds, timeoutThreshold time.Duration,
) (bool, error) {
	const healthyFilter = "health=healthy"

	return s.waitForContainerStatus(
		containerName, containerID, tickSeconds, timeoutThreshold, healthyFilter)
}

// waitForContainerToExit waits until the container with the given ID matches
// the "status=exited" runtime filter. All arguments are handed to
// waitForContainerStatus unchanged, and its results are returned as-is.
func (s *IntegrationTestSuiteBase) waitForContainerToExit(
	containerName, containerID string,
	tickSeconds, timeoutThreshold time.Duration,
) (bool, error) {
	const exitedFilter = "status=exited"

	return s.waitForContainerStatus(
		containerName, containerID, tickSeconds, timeoutThreshold, exitedFilter)
}

func (s *IntegrationTestSuiteBase) execContainer(containerName string, command []string) (string, error) {
cmd := []string{common.RuntimeCommand, "exec", containerName}
cmd = append(cmd, command...)
Expand Down Expand Up @@ -342,7 +390,7 @@ func (s *IntegrationTestSuiteBase) RunCollectorBenchmark() {
containerID, err := s.launchContainer(benchmarkName, benchmarkArgs...)
s.Require().NoError(err)

_, err = s.waitForContainerToExit(benchmarkName, containerID, defaultWaitTickSeconds)
_, err = s.waitForContainerToExit(benchmarkName, containerID, defaultWaitTickSeconds, 0)
s.Require().NoError(err)

benchmarkLogs, err := s.containerLogs("benchmark")
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/suites/benchmark.go
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ func (b *BenchmarkTestSuiteBase) RunInitContainer() {
containerID, err := b.launchContainer("host-init", cmd...)
require.NoError(b.T(), err)

if finished, _ := b.waitForContainerToExit("host-init", containerID, 5*time.Second); !finished {
if finished, _ := b.waitForContainerToExit("host-init", containerID, 5*time.Second, 0); !finished {
logs, err := b.containerLogs("host-init")
if err == nil {
fmt.Println(logs)
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/suites/image_json.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ func (s *ImageLabelJSONTestSuite) TestRunImageWithJSONLabel() {
containerID, err := s.launchContainer(name, image)
s.Require().NoError(err)

_, err = s.waitForContainerToExit(name, containerID, defaultWaitTickSeconds)
_, err = s.waitForContainerToExit(name, containerID, defaultWaitTickSeconds, 0)
s.Require().NoError(err)
}

Expand Down
6 changes: 3 additions & 3 deletions integration-tests/suites/mock_sensor/expect_proc.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func (s *MockSensor) ExpectProcessesN(t *testing.T, containerID string, timeout
}

func (s *MockSensor) WaitProcessesN(containerID string, timeout time.Duration, n int) bool {
return len(s.waitProcessesN(func() {}, containerID, timeout, n)) == n
return len(s.waitProcessesN(func() {}, containerID, timeout, n)) >= n
}

func (s *MockSensor) ExpectProcesses(
Expand Down Expand Up @@ -101,7 +101,7 @@ func (s *MockSensor) ExpectLineages(t *testing.T, containerID string, timeout ti
}

func (s *MockSensor) waitProcessesN(timeoutFn func(), containerID string, timeout time.Duration, n int) []types.ProcessInfo {
if len(s.Processes(containerID)) == n {
if len(s.Processes(containerID)) >= n {
return s.Processes(containerID)
}

Expand All @@ -116,7 +116,7 @@ loop:
continue loop
}

if len(s.Processes(containerID)) == n {
if len(s.Processes(containerID)) >= n {
return s.Processes(containerID)
}
}
Expand Down
2 changes: 1 addition & 1 deletion integration-tests/suites/perf_event_open.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ func (s *PerfEventOpenTestSuite) TestReadingTracepoints() {
containerID, err := s.launchContainer("perf-event-open", "--privileged", image, "", "STDOUT")
s.Require().NoError(err)

if finished, _ := s.waitForContainerToExit("perf-event-open", containerID, 5*time.Second); finished {
if finished, _ := s.waitForContainerToExit("perf-event-open", containerID, 5*time.Second, 0); finished {
logs, err := s.containerLogs("perf-event-open")
if err != nil {
fmt.Println(logs)
Expand Down