-
Notifications
You must be signed in to change notification settings - Fork 132
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: improve agent startup time #5976
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,14 +8,15 @@ | |
"strings" | ||
|
||
"github.com/gofiber/fiber/v2/middleware/cors" | ||
"google.golang.org/grpc" | ||
|
||
executorsclientv1 "github.com/kubeshop/testkube-operator/pkg/client/executors/v1" | ||
testkubeclientset "github.com/kubeshop/testkube-operator/pkg/clientset/versioned" | ||
"github.com/kubeshop/testkube/cmd/api-server/commons" | ||
"github.com/kubeshop/testkube/cmd/api-server/services" | ||
"github.com/kubeshop/testkube/internal/app/api/debug" | ||
"github.com/kubeshop/testkube/internal/app/api/oauth" | ||
"github.com/kubeshop/testkube/internal/common" | ||
"github.com/kubeshop/testkube/pkg/api/v1/testkube" | ||
cloudartifacts "github.com/kubeshop/testkube/pkg/cloud/data/artifact" | ||
cloudtestworkflow "github.com/kubeshop/testkube/pkg/cloud/data/testworkflow" | ||
"github.com/kubeshop/testkube/pkg/event/kind/cdevent" | ||
|
@@ -30,9 +31,6 @@ | |
"github.com/kubeshop/testkube/pkg/tcl/schedulertcl" | ||
"github.com/kubeshop/testkube/pkg/testworkflows/executionworker/executionworkertypes" | ||
"github.com/kubeshop/testkube/pkg/testworkflows/testworkflowprocessor/presets" | ||
|
||
"github.com/kubeshop/testkube/internal/common" | ||
"github.com/kubeshop/testkube/pkg/api/v1/testkube" | ||
"github.com/kubeshop/testkube/pkg/version" | ||
|
||
"golang.org/x/sync/errgroup" | ||
|
@@ -126,16 +124,32 @@ | |
|
||
inspector := commons.CreateImageInspector(cfg, configMapClient, secretClient) | ||
|
||
// Connect to NATS | ||
nc := commons.MustCreateNATSConnection(cfg) | ||
eventBus := bus.NewNATSBus(nc) | ||
if cfg.Trace { | ||
eventBus.TraceEvents() | ||
} | ||
eventsEmitter := event.NewEmitter(eventBus, cfg.TestkubeClusterName) | ||
|
||
var testWorkflowsClient testworkflowsclientv1.Interface | ||
var testWorkflowTemplatesClient testworkflowsclientv1.TestWorkflowTemplatesInterface | ||
|
||
var grpcClient cloud.TestKubeCloudAPIClient | ||
var grpcConn *grpc.ClientConn | ||
// Use local network for local access | ||
controlPlaneUrl := cfg.TestkubeProURL | ||
if strings.HasPrefix(controlPlaneUrl, fmt.Sprintf("%s:%d", cfg.APIServerFullname, cfg.GRPCServerPort)) { | ||
controlPlaneUrl = fmt.Sprintf("127.0.0.1:%d", cfg.GRPCServerPort) | ||
} | ||
|
||
log.DefaultLogger.Infow("Connecting to control plane", "server", controlPlaneUrl, "insecure", cfg.TestkubeProTLSInsecure, "skipVerify", cfg.TestkubeProSkipVerify, "certFile", cfg.TestkubeProCertFile, "keyFile", cfg.TestkubeProKeyFile, "caFile", cfg.TestkubeProCAFile) | ||
grpcConn, err := agent.NewClient(controlPlaneUrl, cfg.TestkubeProTLSInsecure, cfg.TestkubeProSkipVerify, cfg.TestkubeProCertFile, cfg.TestkubeProKeyFile, cfg.TestkubeProCAFile) | ||
grpcClient := cloud.NewTestKubeCloudAPIClient(grpcConn) | ||
defer grpcConn.Close() | ||
|
||
// First request will 'wait for ready' with a timeout of agent.InitialTimeout. | ||
proContext, err := commons.ReadProContext(ctx, cfg, grpcClient) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Moved this up a bit, as it groups nicely with the grpcConnection part, this is where we actually wait for the connection to move from idle -> transient error -> ready. |
||
commons.ExitOnError("cannot retrieve pro context from control plane", err) | ||
|
||
grpcConn, err = agent.NewGRPCConnection( | ||
ctx, | ||
cfg.TestkubeProTLSInsecure, | ||
|
@@ -148,8 +162,6 @@ | |
) | ||
commons.ExitOnError("error creating gRPC connection", err) | ||
|
||
grpcClient = cloud.NewTestKubeCloudAPIClient(grpcConn) | ||
|
||
if mode == common.ModeAgent && cfg.WorkflowStorage == "control-plane" { | ||
testWorkflowsClient = cloudtestworkflow.NewCloudTestWorkflowRepository(grpcClient, grpcConn, cfg.TestkubeProAPIKey) | ||
testWorkflowTemplatesClient = cloudtestworkflow.NewCloudTestWorkflowTemplateRepository(grpcClient, grpcConn, cfg.TestkubeProAPIKey) | ||
|
@@ -167,15 +179,7 @@ | |
triggerLeaseBackend := triggers.NewAcquireAlwaysLeaseBackend() | ||
artifactStorage := cloudartifacts.NewCloudArtifactsStorage(grpcClient, grpcConn, cfg.TestkubeProAPIKey) | ||
|
||
nc := commons.MustCreateNATSConnection(cfg) | ||
eventBus := bus.NewNATSBus(nc) | ||
if cfg.Trace { | ||
eventBus.TraceEvents() | ||
} | ||
eventsEmitter := event.NewEmitter(eventBus, cfg.TestkubeClusterName) | ||
|
||
// Check Pro/Enterprise subscription | ||
proContext := commons.ReadProContext(ctx, cfg, grpcClient) | ||
subscriptionChecker, err := checktcl.NewSubscriptionChecker(ctx, proContext, grpcClient, grpcConn) | ||
commons.ExitOnError("Failed creating subscription checker", err) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -31,7 +31,8 @@ import ( | |
) | ||
|
||
const ( | ||
timeout = 10 * time.Second | ||
InitialTimeout = 30 * time.Second | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With gRPC backoff this would amount to: 1 + 1.6 + 2.5 + 4 +6.5 + 10.5 -> CRASH after 30s, last attempt at 26.1s. Kubernetes will then restart the pod and it will try again. |
||
Timeout = 10 * time.Second | ||
apiKeyMeta = "api-key" | ||
clusterIDMeta = "cluster-id" | ||
cloudMigrateMeta = "migrate" | ||
|
@@ -44,6 +45,50 @@ const ( | |
// buffer up to five messages per worker | ||
const bufferSizePerWorker = 5 | ||
|
||
func NewClient(server string, isInsecure, skipVerify bool, | ||
certFile, keyFile, caFile string) (*grpc.ClientConn, error) { | ||
creds, err := newTransportCredentials(isInsecure, skipVerify, certFile, keyFile, caFile) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
userAgentStr := version.Version + "/" + version.Commit | ||
keepAlive := keepalive.ClientParameters{ | ||
Time: 10 * time.Second, | ||
Timeout: 5 * time.Second, | ||
PermitWithoutStream: true, | ||
} | ||
|
||
return grpc.NewClient(server, grpc.WithUserAgent(userAgentStr), grpc.WithKeepaliveParams(keepAlive), grpc.WithTransportCredentials(creds)) | ||
} | ||
|
||
func newTransportCredentials( | ||
isInsecure, skipVerify bool, | ||
certFile, keyFile, caFile string, | ||
) (credentials.TransportCredentials, error) { | ||
if isInsecure { | ||
return insecure.NewCredentials(), nil | ||
} | ||
|
||
tlsConfig := &tls.Config{MinVersion: tls.VersionTLS12} | ||
if skipVerify { | ||
tlsConfig = &tls.Config{InsecureSkipVerify: true} | ||
return credentials.NewTLS(tlsConfig), nil | ||
} | ||
|
||
if certFile != "" && keyFile != "" { | ||
if err := clientCert(tlsConfig, certFile, keyFile); err != nil { | ||
return nil, err | ||
} | ||
} | ||
if caFile != "" { | ||
if err := rootCAs(tlsConfig, caFile); err != nil { | ||
return nil, err | ||
} | ||
} | ||
return credentials.NewTLS(tlsConfig), nil | ||
} | ||
|
||
func NewGRPCConnection( | ||
ctx context.Context, | ||
isInsecure bool, | ||
|
@@ -52,7 +97,7 @@ func NewGRPCConnection( | |
certFile, keyFile, caFile string, | ||
logger *zap.SugaredLogger, | ||
) (*grpc.ClientConn, error) { | ||
ctx, cancel := context.WithTimeout(ctx, timeout) | ||
ctx, cancel := context.WithTimeout(ctx, Timeout) | ||
defer cancel() | ||
tlsConfig := &tls.Config{MinVersion: tls.VersionTLS12} | ||
if skipVerify { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Moving NATS first staggers the gRPC connection a bit when we start agent and control plane together. gRPC backoff is quite steep, but I want to avoid tweaking it until absolutely needed as these defaults are there for a reason.