diff --git a/api/types/usertasks/object.go b/api/types/usertasks/object.go index fe34b21d562c..72a71fe05d25 100644 --- a/api/types/usertasks/object.go +++ b/api/types/usertasks/object.go @@ -90,40 +90,40 @@ const ( // This value is used to populate the UserTasks.Spec.IssueType for Discover EC2 tasks. // The Web UI will then use those identifiers to show detailed instructions on how to fix the issue. const ( - // AutoDiscoverEC2IssueScriptInstanceNotRegistered is used to identify instances that failed to auto-enroll + // AutoDiscoverEC2IssueSSMInstanceNotRegistered is used to identify instances that failed to auto-enroll // because they are not present in Amazon Systems Manager. // This usually means that the Instance does not have the SSM Agent running, // or that the instance's IAM Profile does not allow have the managed IAM Policy AmazonSSMManagedInstanceCore assigned to it. - AutoDiscoverEC2IssueScriptInstanceNotRegistered = "ec2-ssm-agent-not-registered" + AutoDiscoverEC2IssueSSMInstanceNotRegistered = "ec2-ssm-agent-not-registered" - // AutoDiscoverEC2IssueScriptInstanceConnectionLost is used to identify instances that failed to auto-enroll + // AutoDiscoverEC2IssueSSMInstanceConnectionLost is used to identify instances that failed to auto-enroll // because the agent lost connection to Amazon Systems Manager. // This can happen if the user changed some setting in the instance's network or IAM profile. - AutoDiscoverEC2IssueScriptInstanceConnectionLost = "ec2-ssm-agent-connection-lost" + AutoDiscoverEC2IssueSSMInstanceConnectionLost = "ec2-ssm-agent-connection-lost" - // AutoDiscoverEC2IssueScriptInstanceUnsupportedOS is used to identify instances that failed to auto-enroll + // AutoDiscoverEC2IssueSSMInstanceUnsupportedOS is used to identify instances that failed to auto-enroll // because its OS is not supported by teleport. // This can happen if the instance is running Windows. - AutoDiscoverEC2IssueScriptInstanceUnsupportedOS = "ec2-ssm-unsupported-os" + AutoDiscoverEC2IssueSSMInstanceUnsupportedOS = "ec2-ssm-unsupported-os" - // AutoDiscoverEC2IssueScriptFailure is used to identify instances that failed to auto-enroll + // AutoDiscoverEC2IssueSSMScriptFailure is used to identify instances that failed to auto-enroll // because the installation script failed. // The invocation url must be included in the report, so that users can see what was wrong. - AutoDiscoverEC2IssueScriptFailure = "ec2-ssm-script-failure" + AutoDiscoverEC2IssueSSMScriptFailure = "ec2-ssm-script-failure" - // AutoDiscoverEC2IssueInvocationFailure is used to identify instances that failed to auto-enroll + // AutoDiscoverEC2IssueSSMInvocationFailure is used to identify instances that failed to auto-enroll // because the SSM Script Run (also known as Invocation) failed. // This happens when there's a failure with permissions or an invalid configuration (eg, invalid document name). - AutoDiscoverEC2IssueInvocationFailure = "ec2-ssm-invocation-failure" + AutoDiscoverEC2IssueSSMInvocationFailure = "ec2-ssm-invocation-failure" ) // discoverEC2IssueTypes is a list of issue types that can occur when trying to auto enroll EC2 instances. var discoverEC2IssueTypes = []string{ - AutoDiscoverEC2IssueScriptInstanceNotRegistered, - AutoDiscoverEC2IssueScriptInstanceConnectionLost, - AutoDiscoverEC2IssueScriptInstanceUnsupportedOS, - AutoDiscoverEC2IssueScriptFailure, - AutoDiscoverEC2IssueInvocationFailure, + AutoDiscoverEC2IssueSSMInstanceNotRegistered, + AutoDiscoverEC2IssueSSMInstanceConnectionLost, + AutoDiscoverEC2IssueSSMInstanceUnsupportedOS, + AutoDiscoverEC2IssueSSMScriptFailure, + AutoDiscoverEC2IssueSSMInvocationFailure, } // ValidateUserTask validates the UserTask object without modifying it. diff --git a/lib/auth/authclient/api.go b/lib/auth/authclient/api.go index 4f5849ffa647..80b25dc26d2b 100644 --- a/lib/auth/authclient/api.go +++ b/lib/auth/authclient/api.go @@ -755,6 +755,9 @@ type ReadDiscoveryAccessPoint interface { // GetProxies returns a list of registered proxies. GetProxies() ([]types.Server, error) + + // GetUserTask gets a single User Task by its name. + GetUserTask(ctx context.Context, name string) (*usertasksv1.UserTask, error) } // DiscoveryAccessPoint is an API interface implemented by a certificate authority (CA) to be diff --git a/lib/srv/discovery/discovery.go b/lib/srv/discovery/discovery.go index c2c16eeb43ec..adb2c6727de8 100644 --- a/lib/srv/discovery/discovery.go +++ b/lib/srv/discovery/discovery.go @@ -36,15 +36,18 @@ import ( "github.com/gravitational/trace" "github.com/jonboulle/clockwork" "github.com/sirupsen/logrus" + "google.golang.org/protobuf/types/known/timestamppb" "k8s.io/client-go/kubernetes" "k8s.io/client-go/rest" "github.com/gravitational/teleport" "github.com/gravitational/teleport/api/client/proto" + usertasksv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/usertasks/v1" usageeventsv1 "github.com/gravitational/teleport/api/gen/proto/go/usageevents/v1" "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/api/types/discoveryconfig" apievents "github.com/gravitational/teleport/api/types/events" + "github.com/gravitational/teleport/api/types/usertasks" "github.com/gravitational/teleport/api/utils/retryutils" "github.com/gravitational/teleport/lib/auth/authclient" "github.com/gravitational/teleport/lib/cloud" @@ -329,6 +332,7 @@ type Server struct { awsSyncStatus awsSyncStatus awsEC2ResourcesStatus awsResourcesStatus + awsEC2Tasks awsEC2Tasks // caRotationCh receives nodes that need to have their CAs rotated. caRotationCh chan []types.Server @@ -459,7 +463,8 @@ func (s *Server) initAWSWatchers(matchers []types.AWSMatcher) error { server.WithPollInterval(s.PollInterval), server.WithTriggerFetchC(s.newDiscoveryConfigChangedSub()), server.WithPreFetchHookFn(func() { - s.awsEC2ResourcesStatus.iterationStarted() + s.awsEC2ResourcesStatus.reset() + s.awsEC2Tasks.reset() }), ) if err != nil { @@ -972,6 +977,24 @@ func (s *Server) handleEC2RemoteInstallation(instances *server.EC2Instances) err discoveryConfig: instances.DiscoveryConfig, integration: instances.Integration, }, len(req.Instances)) + + for _, instance := range req.Instances { + s.awsEC2Tasks.addFailedEnrollment( + awsEC2TaskKey{ + accountID: instances.AccountID, + integration: instances.Integration, + issueType: usertasks.AutoDiscoverEC2IssueSSMInvocationFailure, + region: instances.Region, + }, + &usertasksv1.DiscoverEC2Instance{ + // TODO(marco): add instance name + DiscoveryConfig: instances.DiscoveryConfig, + DiscoveryGroup: s.DiscoveryGroup, + InstanceId: instance.InstanceID, + SyncTime: timestamppb.New(s.clock.Now()), + }, + ) + } return trace.Wrap(err) } return nil @@ -1084,6 +1107,7 @@ func (s *Server) handleEC2Discovery() { } s.updateDiscoveryConfigStatus(instances.EC2.DiscoveryConfig) + s.upsertTasksForAWSEC2FailedEnrollments() case <-s.ctx.Done(): s.ec2Watcher.Stop() return diff --git a/lib/srv/discovery/discovery_test.go b/lib/srv/discovery/discovery_test.go index 9a0488460bc5..974dd1bfcdad 100644 --- a/lib/srv/discovery/discovery_test.go +++ b/lib/srv/discovery/discovery_test.go @@ -65,6 +65,7 @@ import ( "github.com/gravitational/teleport/api/defaults" discoveryconfigv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/discoveryconfig/v1" integrationpb "github.com/gravitational/teleport/api/gen/proto/go/teleport/integration/v1" + usertasksv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/usertasks/v1" usageeventsv1 "github.com/gravitational/teleport/api/gen/proto/go/usageevents/v1" "github.com/gravitational/teleport/api/internalutils/stream" "github.com/gravitational/teleport/api/types" @@ -199,9 +200,14 @@ func genEC2Instances(n int) []*ec2.Instance { type mockSSMInstaller struct { mu sync.Mutex installedInstances map[string]struct{} + runError error } func (m *mockSSMInstaller) Run(_ context.Context, req server.SSMRunRequest) error { + if m.runError != nil { + return m.runError + } + m.mu.Lock() defer m.mu.Unlock() for _, inst := range req.Instances { @@ -250,8 +256,9 @@ func TestDiscoveryServer(t *testing.T) { ) require.NoError(t, err) + dcForEC2SSMWithIntegrationName := uuid.NewString() dcForEC2SSMWithIntegration, err := discoveryconfig.NewDiscoveryConfig( - header.Metadata{Name: uuid.NewString()}, + header.Metadata{Name: dcForEC2SSMWithIntegrationName}, discoveryconfig.Spec{ DiscoveryGroup: defaultDiscoveryGroup, AWS: []types.AWSMatcher{{ @@ -269,6 +276,26 @@ func TestDiscoveryServer(t *testing.T) { ) require.NoError(t, err) + discoveryConfigForUserTaskTestName := uuid.NewString() + discoveryConfigForUserTaskTest, err := discoveryconfig.NewDiscoveryConfig( + header.Metadata{Name: discoveryConfigForUserTaskTestName}, + discoveryconfig.Spec{ + DiscoveryGroup: defaultDiscoveryGroup, + AWS: []types.AWSMatcher{{ + Types: []string{"ec2"}, + Regions: []string{"eu-west-2"}, + Tags: map[string]utils.Strings{"RunDiscover": {"please"}}, + SSM: &types.AWSSSM{DocumentName: "document"}, + Params: &types.InstallerParams{ + InstallTeleport: true, + EnrollMode: types.InstallParamEnrollMode_INSTALL_PARAM_ENROLL_MODE_SCRIPT, + }, + Integration: "my-integration", + }}, + }, + ) + require.NoError(t, err) + tcs := []struct { name string // presentInstances is a list of servers already present in teleport @@ -280,6 +307,8 @@ func TestDiscoveryServer(t *testing.T) { staticMatchers Matchers wantInstalledInstances []string wantDiscoveryConfigStatus *discoveryconfig.Status + userTasksDiscoverEC2Check require.ValueAssertionFunc + ssmRunError error }{ { name: "no nodes present, 1 found ", @@ -538,6 +567,74 @@ func TestDiscoveryServer(t *testing.T) { }, wantInstalledInstances: []string{"instance-id-1"}, }, + { + name: "one node found but SSM Run fails and DiscoverEC2 User Task is created", + presentInstances: []types.Server{}, + foundEC2Instances: []*ec2.Instance{ + { + InstanceId: aws.String("instance-id-1"), + Tags: []*ec2.Tag{{ + Key: aws.String("env"), + Value: aws.String("dev"), + }}, + State: &ec2.InstanceState{ + Name: aws.String(ec2.InstanceStateNameRunning), + }, + }, + }, + ssm: &mockSSMClient{ + commandOutput: &ssm.SendCommandOutput{ + Command: &ssm.Command{ + CommandId: aws.String("command-id-1"), + }, + }, + invokeOutput: &ssm.GetCommandInvocationOutput{ + Status: aws.String(ssm.CommandStatusSuccess), + ResponseCode: aws.Int64(0), + }, + }, + ssmRunError: trace.BadParameter("ssm run failed"), + emitter: &mockEmitter{ + eventHandler: func(t *testing.T, ae events.AuditEvent, server *Server) { + t.Helper() + require.Equal(t, &events.SSMRun{ + Metadata: events.Metadata{ + Type: libevents.SSMRunEvent, + Code: libevents.SSMRunSuccessCode, + }, + CommandID: "command-id-1", + AccountID: "owner", + InstanceID: "instance-id-1", + Region: "eu-central-1", + ExitCode: 0, + Status: ssm.CommandStatusSuccess, + }, ae) + }, + }, + staticMatchers: Matchers{}, + discoveryConfig: discoveryConfigForUserTaskTest, + wantInstalledInstances: []string{}, + userTasksDiscoverEC2Check: func(tt require.TestingT, i1 interface{}, i2 ...interface{}) { + existingTasks, ok := i1.([]*usertasksv1.UserTask) + require.True(t, ok, "failed to get existing tasks: %T", i1) + require.Len(t, existingTasks, 1) + existingTask := existingTasks[0] + + require.Equal(t, "OPEN", existingTask.GetSpec().State) + require.Equal(t, "my-integration", existingTask.GetSpec().Integration) + require.Equal(t, "ec2-ssm-invocation-failure", existingTask.GetSpec().IssueType) + require.Equal(t, "owner", existingTask.GetSpec().GetDiscoverEc2().GetAccountId()) + require.Equal(t, "eu-west-2", existingTask.GetSpec().GetDiscoverEc2().GetRegion()) + + taskInstances := existingTask.GetSpec().GetDiscoverEc2().Instances + require.Contains(t, taskInstances, "instance-id-1") + taskInstance := taskInstances["instance-id-1"] + + require.Equal(t, "instance-id-1", taskInstance.InstanceId) + require.Equal(t, discoveryConfigForUserTaskTestName, taskInstance.DiscoveryConfig) + require.Equal(t, defaultDiscoveryGroup, taskInstance.DiscoveryGroup) + }, + }, } for _, tc := range tcs { @@ -586,6 +683,7 @@ func TestDiscoveryServer(t *testing.T) { reporter := &mockUsageReporter{} installer := &mockSSMInstaller{ installedInstances: make(map[string]struct{}), + runError: tc.ssmRunError, } tlsServer.Auth().SetUsageReporter(reporter) @@ -640,6 +738,20 @@ func TestDiscoveryServer(t *testing.T) { return true }, 500*time.Millisecond, 50*time.Millisecond) } + if tc.userTasksDiscoverEC2Check != nil { + var allUserTasks []*usertasksv1.UserTask + var nextToken string + for { + var userTasks []*usertasksv1.UserTask + userTasks, nextToken, err = tlsServer.Auth().UserTasks.ListUserTasks(context.Background(), 0, "") + require.NoError(t, err) + allUserTasks = append(allUserTasks, userTasks...) + if nextToken == "" { + break + } + } + tc.userTasksDiscoverEC2Check(t, allUserTasks) + } }) } } diff --git a/lib/srv/discovery/status.go b/lib/srv/discovery/status.go index a7194d87372b..8d6960e557e6 100644 --- a/lib/srv/discovery/status.go +++ b/lib/srv/discovery/status.go @@ -25,10 +25,17 @@ import ( "time" "github.com/gravitational/trace" + "github.com/sirupsen/logrus" + "google.golang.org/protobuf/types/known/timestamppb" discoveryconfigv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/discoveryconfig/v1" + usertasksv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/usertasks/v1" + "github.com/gravitational/teleport/api/types" "github.com/gravitational/teleport/api/types/discoveryconfig" + "github.com/gravitational/teleport/api/types/usertasks" + "github.com/gravitational/teleport/api/utils/retryutils" libevents "github.com/gravitational/teleport/lib/events" + "github.com/gravitational/teleport/lib/services" aws_sync "github.com/gravitational/teleport/lib/srv/discovery/fetchers/aws-sync" "github.com/gravitational/teleport/lib/srv/server" ) @@ -204,7 +211,7 @@ type awsResourceGroupResult struct { failed int } -func (d *awsResourcesStatus) iterationStarted() { +func (d *awsResourcesStatus) reset() { d.mu.Lock() defer d.mu.Unlock() @@ -293,5 +300,232 @@ func (s *Server) ReportEC2SSMInstallationResult(ctx context.Context, result *ser s.updateDiscoveryConfigStatus(result.DiscoveryConfig) + s.awsEC2Tasks.addFailedEnrollment( + awsEC2TaskKey{ + integration: result.IntegrationName, + issueType: result.IssueType, + accountID: result.SSMRunEvent.AccountID, + region: result.SSMRunEvent.Region, + }, + &usertasksv1.DiscoverEC2Instance{ + // TODO(marco): add instance name + InvocationUrl: result.SSMRunEvent.InvocationURL, + DiscoveryConfig: result.DiscoveryConfig, + DiscoveryGroup: s.DiscoveryGroup, + SyncTime: timestamppb.New(result.SSMRunEvent.Time), + InstanceId: result.SSMRunEvent.InstanceID, + }, + ) + + return nil +} + +// awsEC2Tasks contains the Discover EC2 User Tasks that must be reported to the user. +type awsEC2Tasks struct { + mu sync.RWMutex + // instancesIssues maps the Discover EC2 User Task grouping parts to a set of instances metadata. + instancesIssues map[awsEC2TaskKey]map[string]*usertasksv1.DiscoverEC2Instance + // issuesSyncQueue is used to register which groups were changed in memory but were not yet sent to the cluster. + // When upserting User Tasks, if the group is not in issuesSyncQueue, + // then the cluster already has the latest version of this particular group. + issuesSyncQueue map[awsEC2TaskKey]struct{} +} + +// awsEC2TaskKey identifies a UserTask group. +type awsEC2TaskKey struct { + integration string + issueType string + accountID string + region string +} + +// iterationStarted clears out any in memory issues that were recorded. +// This is used when starting a new Auto Discover EC2 watcher iteration. +func (d *awsEC2Tasks) reset() { + d.mu.Lock() + defer d.mu.Unlock() + + d.instancesIssues = make(map[awsEC2TaskKey]map[string]*usertasksv1.DiscoverEC2Instance) + d.issuesSyncQueue = make(map[awsEC2TaskKey]struct{}) +} + +// addFailedEnrollment adds an enrollment failure of a given instance. +func (d *awsEC2Tasks) addFailedEnrollment(g awsEC2TaskKey, instance *usertasksv1.DiscoverEC2Instance) { + // Only failures associated with an Integration are reported. + // There's no major blocking for showing non-integration User Tasks, but this keeps scope smaller. + if g.integration == "" { + return + } + + d.mu.Lock() + defer d.mu.Unlock() + if d.instancesIssues == nil { + d.instancesIssues = make(map[awsEC2TaskKey]map[string]*usertasksv1.DiscoverEC2Instance) + } + if _, ok := d.instancesIssues[g]; !ok { + d.instancesIssues[g] = make(map[string]*usertasksv1.DiscoverEC2Instance) + } + d.instancesIssues[g][instance.InstanceId] = instance + + if d.issuesSyncQueue == nil { + d.issuesSyncQueue = make(map[awsEC2TaskKey]struct{}) + } + d.issuesSyncQueue[g] = struct{}{} +} + +// acquireSemaphoreForUserTask tries to acquire a semaphore lock for this user task. +// It returns a func which must be called to release the lock. +// It also returns a context which is tied to the lease and will be canceled if the lease ends. +func (s *Server) acquireSemaphoreForUserTask(userTaskName string) (releaseFn func(), ctx context.Context, err error) { + // Use the deterministic task name as semaphore name. + semaphoreName := userTaskName + semaphoreExpiration := 5 * time.Second + + // AcquireSemaphoreLock will retry until the semaphore is acquired. + // This prevents multiple discovery services to write AWS resources in parallel. + // lease must be released to cleanup the resource in auth server. + lease, err := services.AcquireSemaphoreLockWithRetry( + s.ctx, + services.SemaphoreLockConfigWithRetry{ + SemaphoreLockConfig: services.SemaphoreLockConfig{ + Service: s.AccessPoint, + Params: types.AcquireSemaphoreRequest{ + SemaphoreKind: types.KindUserTask, + SemaphoreName: semaphoreName, + MaxLeases: 1, + Holder: s.Config.ServerID, + }, + Expiry: semaphoreExpiration, + Clock: s.clock, + }, + Retry: retryutils.LinearConfig{ + Clock: s.clock, + First: time.Second, + Step: semaphoreExpiration / 2, + Max: semaphoreExpiration, + Jitter: retryutils.NewJitter(), + }, + }, + ) + if err != nil { + return nil, nil, trace.Wrap(err) + } + + // Once the lease parent context is canceled, the lease will be released. + ctxWithLease, cancel := context.WithCancel(lease) + + releaseFn = func() { + cancel() + lease.Stop() + if err := lease.Wait(); err != nil { + s.Log.WithError(err).WithField("semaphore", userTaskName).Warn("error cleaning up UserTask semaphore") + } + } + + return releaseFn, ctxWithLease, nil +} + +// mergeUpsertDiscoverEC2Task takes the current DiscoverEC2 User Task issues stored in memory and +// merges them against the ones that exist in the cluster. +// +// All of this flow is protected by a lock to ensure there's no race between this and other DiscoveryServices. +func (s *Server) mergeUpsertDiscoverEC2Task(taskGroup awsEC2TaskKey, failedInstances map[string]*usertasksv1.DiscoverEC2Instance) error { + userTaskName := usertasks.TaskNameForDiscoverEC2(usertasks.TaskNameForDiscoverEC2Parts{ + Integration: taskGroup.integration, + IssueType: taskGroup.issueType, + AccountID: taskGroup.accountID, + Region: taskGroup.region, + }) + + releaseFn, ctxWithLease, err := s.acquireSemaphoreForUserTask(userTaskName) + if err != nil { + return trace.Wrap(err) + } + defer releaseFn() + + // Fetch the current task because it might have instances discovered by another group of DiscoveryServices. + currentUserTask, err := s.AccessPoint.GetUserTask(ctxWithLease, userTaskName) + switch { + case trace.IsNotFound(err): + case err != nil: + return trace.Wrap(err) + default: + failedInstances = s.discoverEC2UserTaskAddExistingInstances(currentUserTask, failedInstances) + } + + // If the DiscoveryService is stopped, or the issue does not happen again + // the task is removed to prevent users from working on issues that are no longer happening. + taskExpiration := s.clock.Now().Add(2 * s.PollInterval) + + task, err := usertasks.NewDiscoverEC2UserTask( + &usertasksv1.UserTaskSpec{ + Integration: taskGroup.integration, + TaskType: usertasks.TaskTypeDiscoverEC2, + IssueType: taskGroup.issueType, + State: usertasks.TaskStateOpen, + DiscoverEc2: &usertasksv1.DiscoverEC2{ + AccountId: taskGroup.accountID, + Region: taskGroup.region, + Instances: failedInstances, + }, + }, + usertasks.WithExpiration(taskExpiration), + ) + if err != nil { + return trace.Wrap(err) + } + + if _, err := s.AccessPoint.UpsertUserTask(ctxWithLease, task); err != nil { + return trace.Wrap(err) + } + return nil } + +// discoverEC2UserTaskAddExistingInstances takes the UserTask stored in the cluster and merges it into the existing map of failed instances. +func (s *Server) discoverEC2UserTaskAddExistingInstances(currentUserTask *usertasksv1.UserTask, failedInstances map[string]*usertasksv1.DiscoverEC2Instance) map[string]*usertasksv1.DiscoverEC2Instance { + for existingInstanceID, existingInstance := range currentUserTask.Spec.DiscoverEc2.Instances { + // Each DiscoveryService works on all the DiscoveryConfigs assigned to a given DiscoveryGroup. + // So, it's safe to say that current DiscoveryService has the last state for a given DiscoveryGroup. + // If other instances exist for this DiscoveryGroup, they can be discarded because, as said before, the current DiscoveryService has the last state for a given DiscoveryGroup. + if existingInstance.DiscoveryGroup == s.DiscoveryGroup { + continue + } + + // For existing instances whose sync time is too far in the past, just drop them. + // This ensures that if an instance is removed from AWS, it will eventually disappear from the User Tasks' instance list. + // It might also be the case that the DiscoveryConfig was changed and the instance is no longer matched (because of labels/regions or other matchers). + instanceIssueExpiration := s.clock.Now().Add(-2 * s.PollInterval) + if existingInstance.SyncTime.AsTime().Before(instanceIssueExpiration) { + continue + } + + // Merge existing cluster state into in-memory object. + failedInstances[existingInstanceID] = existingInstance + } + return failedInstances +} + +func (s *Server) upsertTasksForAWSEC2FailedEnrollments() { + s.awsEC2Tasks.mu.Lock() + defer s.awsEC2Tasks.mu.Unlock() + for g := range s.awsEC2Tasks.issuesSyncQueue { + instancesIssueByID := s.awsEC2Tasks.instancesIssues[g] + if len(instancesIssueByID) == 0 { + continue + } + + if err := s.mergeUpsertDiscoverEC2Task(g, instancesIssueByID); err != nil { + s.Log.WithError(err).WithFields(logrus.Fields{ + "integration": g.integration, + "issue_type": g.issueType, + "aws_account_id": g.accountID, + "aws_region": g.region, + }, + ).Warning("Failed to create discover ec2 user task.", g.integration, g.issueType, g.accountID, g.region) + continue + } + + delete(s.awsEC2Tasks.issuesSyncQueue, g) + } +} diff --git a/lib/srv/server/ssm_install.go b/lib/srv/server/ssm_install.go index b97fe8eb23de..ac74601e7f3a 100644 --- a/lib/srv/server/ssm_install.go +++ b/lib/srv/server/ssm_install.go @@ -35,6 +35,7 @@ import ( "github.com/gravitational/teleport" apievents "github.com/gravitational/teleport/api/types/events" + "github.com/gravitational/teleport/api/types/usertasks" awslib "github.com/gravitational/teleport/lib/cloud/aws" libevents "github.com/gravitational/teleport/lib/events" ) @@ -59,6 +60,9 @@ type SSMInstallationResult struct { // DiscoveryConfig is the DiscoveryConfig name which originated this Run Request. // Empty if using static matchers (coming from the `teleport.yaml`). DiscoveryConfig string + // IssueType identifies the type of issue that occurred if the installation failed. + // These are well known identifiers that can be found at types.AutoDiscoverEC2Issue*. + IssueType string } // SSMInstaller handles running SSM commands that install Teleport on EC2 instances. @@ -191,7 +195,7 @@ func (si *SSMInstaller) Run(ctx context.Context, req SSMRunRequest) error { return trace.Wrap(g.Wait()) } -func invalidSSMInstanceInstallationResult(req SSMRunRequest, instanceID, status string) *SSMInstallationResult { +func invalidSSMInstanceInstallationResult(req SSMRunRequest, instanceID, status, issueType string) *SSMInstallationResult { return &SSMInstallationResult{ SSMRunEvent: &apievents.SSMRun{ Metadata: apievents.Metadata{ @@ -207,6 +211,7 @@ func invalidSSMInstanceInstallationResult(req SSMRunRequest, instanceID, status }, IntegrationName: req.IntegrationName, DiscoveryConfig: req.DiscoveryConfig, + IssueType: issueType, } } @@ -215,6 +220,7 @@ func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRu for _, instanceID := range instanceIDsState.missing { installationResult := invalidSSMInstanceInstallationResult(req, instanceID, "EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.", + usertasks.AutoDiscoverEC2IssueSSMInstanceNotRegistered, ) if err := si.ReportSSMInstallationResultFunc(ctx, installationResult); err != nil { errs = append(errs, trace.Wrap(err)) @@ -224,6 +230,7 @@ func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRu for _, instanceID := range instanceIDsState.connectionLost { installationResult := invalidSSMInstanceInstallationResult(req, instanceID, "SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.", + usertasks.AutoDiscoverEC2IssueSSMInstanceConnectionLost, ) if err := si.ReportSSMInstallationResultFunc(ctx, installationResult); err != nil { errs = append(errs, trace.Wrap(err)) @@ -233,6 +240,7 @@ func (si *SSMInstaller) emitInvalidInstanceEvents(ctx context.Context, req SSMRu for _, instanceID := range instanceIDsState.unsupportedOS { installationResult := invalidSSMInstanceInstallationResult(req, instanceID, "EC2 instance is running an unsupported Operating System. Only Linux is supported.", + usertasks.AutoDiscoverEC2IssueSSMInstanceUnsupportedOS, ) if err := si.ReportSSMInstallationResultFunc(ctx, installationResult); err != nil { errs = append(errs, trace.Wrap(err)) @@ -350,6 +358,7 @@ func (si *SSMInstaller) checkCommand(ctx context.Context, req SSMRunRequest, com SSMRunEvent: invocationResultEvent, IntegrationName: req.IntegrationName, DiscoveryConfig: req.DiscoveryConfig, + IssueType: usertasks.AutoDiscoverEC2IssueSSMScriptFailure, })) } @@ -363,6 +372,7 @@ func (si *SSMInstaller) checkCommand(ctx context.Context, req SSMRunRequest, com SSMRunEvent: stepResultEvent, IntegrationName: req.IntegrationName, DiscoveryConfig: req.DiscoveryConfig, + IssueType: usertasks.AutoDiscoverEC2IssueSSMScriptFailure, })) } } diff --git a/lib/srv/server/ssm_install_test.go b/lib/srv/server/ssm_install_test.go index 8e5225ee3a88..5d24398433bc 100644 --- a/lib/srv/server/ssm_install_test.go +++ b/lib/srv/server/ssm_install_test.go @@ -144,6 +144,7 @@ func TestSSMInstaller(t *testing.T) { Status: ssm.CommandStatusSuccess, InvocationURL: "https://eu-central-1.console.aws.amazon.com/systems-manager/run-command/command-id-1/instance-id-1", }, + IssueType: "ec2-ssm-script-failure", }}, }, { @@ -188,6 +189,7 @@ func TestSSMInstaller(t *testing.T) { Status: ssm.CommandStatusSuccess, InvocationURL: "https://eu-central-1.console.aws.amazon.com/systems-manager/run-command/command-id-1/instance-id-1", }, + IssueType: "ec2-ssm-script-failure", }}, }, { @@ -234,6 +236,7 @@ func TestSSMInstaller(t *testing.T) { StandardError: "timeout error", InvocationURL: "https://eu-central-1.console.aws.amazon.com/systems-manager/run-command/command-id-1/instance-id-1", }, + IssueType: "ec2-ssm-script-failure", }}, }, { @@ -284,6 +287,7 @@ func TestSSMInstaller(t *testing.T) { StandardError: "timeout error", InvocationURL: "https://eu-central-1.console.aws.amazon.com/systems-manager/run-command/command-id-1/instance-id-1", }, + IssueType: "ec2-ssm-script-failure", }}, }, { @@ -351,6 +355,7 @@ func TestSSMInstaller(t *testing.T) { Status: ssm.CommandStatusSuccess, InvocationURL: "https://eu-central-1.console.aws.amazon.com/systems-manager/run-command/command-id-1/instance-id-1", }, + IssueType: "ec2-ssm-script-failure", }, { SSMRunEvent: &events.SSMRun{ @@ -365,6 +370,7 @@ func TestSSMInstaller(t *testing.T) { ExitCode: -1, Status: "SSM Agent in EC2 Instance is not connecting to SSM Service. Restart or reinstall the SSM service. See https://docs.aws.amazon.com/systems-manager/latest/userguide/ami-preinstalled-agent.html#verify-ssm-agent-status for more details.", }, + IssueType: "ec2-ssm-agent-connection-lost", }, { SSMRunEvent: &events.SSMRun{ @@ -379,6 +385,7 @@ func TestSSMInstaller(t *testing.T) { ExitCode: -1, Status: "EC2 instance is running an unsupported Operating System. Only Linux is supported.", }, + IssueType: "ec2-ssm-unsupported-os", }, { SSMRunEvent: &events.SSMRun{ @@ -393,6 +400,7 @@ func TestSSMInstaller(t *testing.T) { ExitCode: -1, Status: "EC2 Instance is not registered in SSM. Make sure that the instance has AmazonSSMManagedInstanceCore policy assigned.", }, + IssueType: "ec2-ssm-agent-not-registered", }, }, }, @@ -448,6 +456,7 @@ func TestSSMInstaller(t *testing.T) { StandardOutput: "custom output", InvocationURL: "https://eu-central-1.console.aws.amazon.com/systems-manager/run-command/command-id-1/instance-id-1", }, + IssueType: "ec2-ssm-script-failure", }}, }, { @@ -488,6 +497,7 @@ func TestSSMInstaller(t *testing.T) { Status: ssm.CommandStatusSuccess, InvocationURL: "https://eu-central-1.console.aws.amazon.com/systems-manager/run-command/command-id-1/instance-id-1", }, + IssueType: "ec2-ssm-script-failure", }}, }, // todo(amk): test that incomplete commands eventually return diff --git a/lib/web/usertasks_test.go b/lib/web/usertasks_test.go index 0e77cb8cd638..79a8a553e799 100644 --- a/lib/web/usertasks_test.go +++ b/lib/web/usertasks_test.go @@ -63,11 +63,11 @@ func TestUserTask(t *testing.T) { } issueTypes := []string{ - usertasks.AutoDiscoverEC2IssueInvocationFailure, - usertasks.AutoDiscoverEC2IssueScriptFailure, - usertasks.AutoDiscoverEC2IssueScriptInstanceConnectionLost, - usertasks.AutoDiscoverEC2IssueScriptInstanceNotRegistered, - usertasks.AutoDiscoverEC2IssueScriptInstanceUnsupportedOS, + usertasks.AutoDiscoverEC2IssueSSMInvocationFailure, + usertasks.AutoDiscoverEC2IssueSSMScriptFailure, + usertasks.AutoDiscoverEC2IssueSSMInstanceConnectionLost, + usertasks.AutoDiscoverEC2IssueSSMInstanceNotRegistered, + usertasks.AutoDiscoverEC2IssueSSMInstanceUnsupportedOS, } var userTaskForTest *usertasksv1.UserTask for _, issueType := range issueTypes {