Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TestMultipleVMs_Isolated is still flaky #581

Open
kzys opened this issue Feb 25, 2022 · 2 comments
Open

TestMultipleVMs_Isolated is still flaky #581

kzys opened this issue Feb 25, 2022 · 2 comments

Comments

@kzys
Copy link
Contributor

kzys commented Feb 25, 2022

https://buildkite.com/firecracker-microvm/firecracker-containerd/builds/2535#8267eaab-d952-4276-a956-c304790b50b0

=== RUN   TestMultipleVMs_Isolated
    service_integ_test.go:263: TestMultipleVMs_Isolated: will run 100 vm's
    service_integ_test.go:450: 
        	Error Trace:	service_integ_test.go:450
        	Error:      	Received unexpected error:
        	            	unexpected errors from VM 76: failed to create a VM while there are 0 Firecracker processes: rpc error: code = Unknown desc = failed to create VM: failed to dial the VM over vsock: non-temporary vsock dial failure: failed to dial "rootfs/firecracker.vsock" within 100ms: dial unix rootfs/firecracker.vsock: connect: resource temporarily unavailable
        	Test:       	TestMultipleVMs_Isolated
--- FAIL: TestMultipleVMs_Isolated (132.93s)

The whole error message doesn't sound right.

non-temporary vsock dial failure: failed to dial "rootfs/firecracker.vsock" within 100ms: dial unix rootfs/firecracker.vsock: connect: resource temporarily unavailable

@kzys
Copy link
Contributor Author

kzys commented Apr 27, 2022

We are hitting one hour timeout even. Something is wrong.

=== RUN   TestMultipleVMs_Isolated

    service_integ_test.go:269: TestMultipleVMs_Isolated: will run 100 vm's

panic: test timed out after 1h0m0s



goroutine 7359 [running]:

testing.(*M).startAlarm.func1()

	/usr/local/go/src/testing/testing.go:1710 +0x11f

created by time.goFunc

	/usr/local/go/src/time/sleep.go:180 +0x52



goroutine 1 [chan receive, 60 minutes]:

testing.(*T).Run(0xc00057c480, 0x1f48a06, 0x18, 0x1f8a3d8, 0x1)

	/usr/local/go/src/testing/testing.go:1249 +0x610

testing.runTests.func1(0xc00057c480)

	/usr/local/go/src/testing/testing.go:1521 +0xa7

testing.tRunner(0xc00057c480, 0xc0006bfce0)

	/usr/local/go/src/testing/testing.go:1203 +0x203

testing.runTests(0xc000312000, 0x2a85360, 0x2e, 0x2e, 0xc09240127ac50e15, 0x34635650755, 0x2af7be0, 0xc0006bfda0)

	/usr/local/go/src/testing/testing.go:1519 +0x613

testing.(*M).Run(0xc000310000, 0x0)

	/usr/local/go/src/testing/testing.go:1427 +0x3b4

main.main()

	_testmain.go:133 +0x237



goroutine 120 [semacquire, 59 minutes]:

sync.runtime_Semacquire(0xc00061ab20)

	/usr/local/go/src/runtime/sema.go:56 +0x45

sync.(*WaitGroup).Wait(0xc00061ab18)

	/usr/local/go/src/sync/waitgroup.go:130 +0xe5

golang.org/x/sync/errgroup.(*Group).Wait(0xc00061ab10, 0xc0006d26f0, 0x212e860)

	/go/pkg/mod/golang.org/x/[email protected]/errgroup/errgroup.go:40 +0x45

github.com/firecracker-microvm/firecracker-containerd/runtime.TestMultipleVMs_Isolated(0xc00057c600)

	/src/runtime/service_integ_test.go:450 +0xe45

testing.tRunner(0xc00057c600, 0x1f8a3d8)

	/usr/local/go/src/testing/testing.go:1203 +0x203

created by testing.(*T).Run

	/usr/local/go/src/testing/testing.go:1248 +0x5d8



goroutine 121 [select, 59 minutes]:

google.golang.org/grpc.(*ccBalancerWrapper).watcher(0xc000232900)

	/go/pkg/mod/google.golang.org/[email protected]/balancer_conn_wrappers.go:69 +0x197

created by google.golang.org/grpc.newCCBalancerWrapper

	/go/pkg/mod/google.golang.org/[email protected]/balancer_conn_wrappers.go:60 +0x2fc



goroutine 122 [chan receive, 59 minutes]:

google.golang.org/grpc.(*addrConn).resetTransport(0xc000550dc0)

	/go/pkg/mod/google.golang.org/[email protected]/clientconn.go:1214 +0x8fd

created by google.golang.org/grpc.(*addrConn).connect

	/go/pkg/mod/google.golang.org/[email protected]/clientconn.go:844 +0x107



goroutine 135 [IO wait, 59 minutes]:

internal/poll.runtime_pollWait(0x7ff85bad2658, 0x72, 0x2a976d8)

	/usr/local/go/src/runtime/netpoll.go:227 +0x55

internal/poll.(*pollDesc).wait(0xc0003edf18, 0x72, 0x8000, 0x8000, 0xffffffffffffffff)

	/usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0xe6

internal/poll.(*pollDesc).waitRead(...)

	/usr/local/go/src/internal/poll/fd_poll_runtime.go:92

internal/poll.(*FD).Read(0xc0003edf00, 0xc0004d8000, 0x8000, 0x8000, 0x0, 0x0, 0x0)

	/usr/local/go/src/internal/poll/fd_unix.go:166 +0x28f

net.(*netFD).Read(0xc0003edf00, 0xc0004d8000, 0x8000, 0x8000, 0x11, 0x0, 0x0)

	/usr/local/go/src/net/fd_posix.go:55 +0x69

net.(*conn).Read(0xc00057e000, 0xc0004d8000, 0x8000, 0x8000, 0xc0004d8000, 0x9, 0x9047c0)

	/usr/local/go/src/net/net.go:183 +0xec

bufio.(*Reader).Read(0xc0001920c0, 0xc000250120, 0x9, 0x9, 0x800010601, 0x1060100000000, 0x8)

	/usr/local/go/src/bufio/bufio.go:227 +0x7ba

io.ReadAtLeast(0x210ce60, 0xc0001920c0, 0xc000250120, 0x9, 0x9, 0x9, 0x1c9c710, 0x7ff82aee0008, 0xc0004ac0b8)

	/usr/local/go/src/io/io.go:328 +0x9d

io.ReadFull(...)

	/usr/local/go/src/io/io.go:347

golang.org/x/net/http2.readFrameHeader(0xc000250120, 0x9, 0x9, 0x210ce60, 0xc0001920c0, 0x0, 0xc000000000, 0x7ff85bacf378, 0x1)

	/go/pkg/mod/golang.org/x/[email protected]/http2/frame.go:237 +0xb1

golang.org/x/net/http2.(*Framer).ReadFrame(0xc0002500e0, 0xc0007f4d68, 0xc0007f4d68, 0x0, 0x0)

	/go/pkg/mod/golang.org/x/[email protected]/http2/frame.go:498 +0xff

google.golang.org/grpc/internal/transport.(*http2Client).reader(0xc00000c1e0)

	/go/pkg/mod/google.golang.org/[email protected]/internal/transport/http2_client.go:1353 +0x2c5

created by google.golang.org/grpc/internal/transport.newHTTP2Client

	/go/pkg/mod/google.golang.org/[email protected]/internal/transport/http2_client.go:346 +0x1489



goroutine 136 [select, 59 minutes]:

google.golang.org/grpc/internal/transport.(*controlBuffer).get(0xc0004ac050, 0x1, 0x0, 0x0, 0x0, 0x0)

	/go/pkg/mod/google.golang.org/[email protected]/internal/transport/controlbuf.go:407 +0x1ea

google.golang.org/grpc/internal/transport.(*loopyWriter).run(0xc00060e0c0, 0x0, 0x0)

	/go/pkg/mod/google.golang.org/[email protected]/internal/transport/controlbuf.go:527 +0x2df

google.golang.org/grpc/internal/transport.newHTTP2Client.func3(0xc00000c1e0)

	/go/pkg/mod/google.golang.org/[email protected]/internal/transport/http2_client.go:396 +0x12e

created by google.golang.org/grpc/internal/transport.newHTTP2Client

	/go/pkg/mod/google.golang.org/[email protected]/internal/transport/http2_client.go:394 +0x1b27



goroutine 138 [select, 59 minutes]:

github.com/containerd/ttrpc.(*Client).run.func1(0xc00024cc80, 0xc0005ba0e0)

	/go/pkg/mod/github.com/containerd/[email protected]/client.go:265 +0x153

created by github.com/containerd/ttrpc.(*Client).run

	/go/pkg/mod/github.com/containerd/[email protected]/client.go:262 +0x148



goroutine 24 [select, 59 minutes]:

github.com/containerd/ttrpc.(*Client).run(0xc00024cc80)

	/go/pkg/mod/github.com/containerd/[email protected]/client.go:330 +0x2a5

created by github.com/containerd/ttrpc.NewClient

	/go/pkg/mod/github.com/containerd/[email protected]/client.go:94 +0x35b



goroutine 139 [IO wait, 59 minutes]:

internal/poll.runtime_pollWait(0x7ff85bad2570, 0x72, 0x2a976d8)

	/usr/local/go/src/runtime/netpoll.go:227 +0x55

internal/poll.(*pollDesc).wait(0xc00024ca98, 0x72, 0x1000, 0x1000, 0xffffffffffffffff)

	/usr/local/go/src/internal/poll/fd_poll_runtime.go:87 +0xe6

internal/poll.(*pollDesc).waitRead(...)

	/usr/local/go/src/internal/poll/fd_poll_runtime.go:92

internal/poll.(*FD).Read(0xc00024ca80, 0xc0006fb000, 0x1000, 0x1000, 0x0, 0x0, 0x0)

	/usr/local/go/src/internal/poll/fd_unix.go:166 +0x28f

net.(*netFD).Read(0xc00024ca80, 0xc0006fb000, 0x1000, 0x1000, 0x1f184a0, 0x8fc9bc, 0x1c9c710)

	/usr/local/go/src/net/fd_posix.go:55 +0x69

net.(*conn).Read(0xc00057e258, 0xc0006fb000, 0x1000, 0x1000, 0x8, 0xc001a05bd0, 0x8b800f)

	/usr/local/go/src/net/net.go:183 +0xec

bufio.(*Reader).Read(0xc00003d0e0, 0xc0004bcee0, 0xa, 0xa, 0x80, 0xc0005dd000, 0xc001a05d30)

	/usr/local/go/src/bufio/bufio.go:227 +0x7ba

io.ReadAtLeast(0x210ce60, 0xc00003d0e0, 0xc0004bcee0, 0xa, 0xa, 0xa, 0xc001a05d70, 0x8b69b5, 0x8ec7fc)

	/usr/local/go/src/io/io.go:328 +0x9d

io.ReadFull(...)

	/usr/local/go/src/io/io.go:347

github.com/containerd/ttrpc.readMessageHeader(0xc0004bcee0, 0xa, 0xa, 0x210ce60, 0xc00003d0e0, 0x0, 0xc0004bce50, 0xc001a05df8, 0x8fbbaa)

	/go/pkg/mod/github.com/containerd/[email protected]/channel.go:53 +0x99

github.com/containerd/ttrpc.(*channel).recv(0xc0004bcec0, 0xc0004c4780, 0xc001a05f00, 0x0, 0x0, 0x0, 0x0, 0x0)

	/go/pkg/mod/github.com/containerd/[email protected]/channel.go:101 +0xa5

github.com/containerd/ttrpc.(*Client).run.func2(0xc0001c2180, 0xc00024cc80, 0xc0005ba0e0)

	/go/pkg/mod/github.com/containerd/[email protected]/client.go:294 +0xfa

created by github.com/containerd/ttrpc.(*Client).run

	/go/pkg/mod/github.com/containerd/[email protected]/client.go:286 +0x187



goroutine 327 [select, 59 minutes]:

github.com/containerd/ttrpc.(*Client).dispatch(0xc00024cc80, 0x212e860, 0xc0004bcf80, 0xc00043c8a0, 0xc000c12240, 0x7ff85c83fa68, 0x10)

	/go/pkg/mod/github.com/containerd/[email protected]/client.go:163 +0x505

github.com/containerd/ttrpc.defaultClientInterceptor(0x212e860, 0xc0004bcf80, 0xc00043c8a0, 0xc000c12240, 0xc0006200c0, 0xc0006200d0, 0x15, 0x10)

	/go/pkg/mod/github.com/containerd/[email protected]/interceptor.go:49 +0x62

github.com/containerd/ttrpc.(*Client).Call(0xc00024cc80, 0x212e860, 0xc0004bcf80, 0x1f3855c, 0xb, 0x1f34967, 0x8, 0x1ef77a0, 0xc00067e820, 0x1eca380, ...)

	/go/pkg/mod/github.com/containerd/[email protected]/client.go:132 +0x495

github.com/firecracker-microvm/firecracker-containerd/proto/service/fccontrol/ttrpc.(*firecrackerClient).CreateVM(0xc00057e260, 0x212e860, 0xc0004bcf80, 0xc00067e820, 0x0, 0x0, 0x5)

	/src/proto/service/fccontrol/ttrpc/fccontrol.pb.go:172 +0x137

github.com/firecracker-microvm/firecracker-containerd/runtime.TestMultipleVMs_Isolated.func1(0x212e860, 0xc0004bcf80, 0x0, 0x0)

	/src/runtime/service_integ_test.go:360 +0x7f9

github.com/firecracker-microvm/firecracker-containerd/runtime.TestMultipleVMs_Isolated.func2(0xc000ba7750, 0x0)

	/src/runtime/service_integ_test.go:442 +0x5f

golang.org/x/sync/errgroup.(*Group).Go.func1(0xc00061ab10, 0xc000b84120)

	/go/pkg/mod/golang.org/x/[email protected]/errgroup/errgroup.go:57 +0x95

created by golang.org/x/sync/errgroup.(*Group).Go

	/go/pkg/mod/golang.org/x/[email protected]/errgroup/errgroup.go:54 +0x74

exit status 2

FAIL	github.com/firecracker-microvm/firecracker-containerd/runtime	3600.109s

make[1]: *** [Makefile:56: integ-test-TestMultipleVMs_Isolated] Error 1

make[1]: Leaving directory '/local/home/buildkite-agent/builds/ip-10-0-0-199-36/firecracker-microvm/firecracker-containerd/runtime'

make: *** [Makefile:52: integ-test] Error 2

kzys added a commit to kzys/firecracker-containerd that referenced this issue Apr 29, 2022
TestMultipleVMs_Isolated is still unstable (see firecracker-microvm#581). Apparently having
multiple simultaneous requests from the same client is known to be
problematic (see containerd/ttrpc#72).

This commit workarounds the issue by making a client per VM.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 20, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 20, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 20, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 20, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 21, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 21, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 21, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 21, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 21, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
The test is still unstable (see firecracker-microvm#581) and we couldn't spend much time
root-causing the issue.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry that.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 23, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 24, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 24, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 24, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 24, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 24, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 24, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
kzys added a commit to kzys/firecracker-containerd that referenced this issue May 24, 2022
This test is still unstable (see firecracker-microvm#581) and we tend to just retry
the test until it succeeds.

This commit makes the retry slightly faster by separating the test from
other runtime isolated tests and adding concurrency_group.

Signed-off-by: Kazuyoshi Kato <[email protected]>
@kzys
Copy link
Contributor Author

kzys commented May 26, 2022

There were Firecracker processes that couldn't progress much. I attached strace and then it made them move forward somehow. Maybe there is a timing issue?

fc-strace.zip

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Projects
None yet
Development

No branches or pull requests

2 participants