Skip to content

Commit

Permalink
Create DiscoverEC2 User Tasks when Auto Discover fails on EC2 instanc…
Browse files Browse the repository at this point in the history
…es (#47064)

* Create DiscoverEC2 User Tasks for failed EC2 enrollments

This PR changes the DiscoveryService to start creating and updating
Discover EC2 User Tasks.

So, what are Discover EC2 User Tasks?
When users set up Auto Discover for EC2 Instances, they don't have a
good way of checking for issues on their configured matchers.

We created User Tasks as a way to warn Users that something's wrong.
Each User Task should describe an issue that happened and a way to fix
it.
This has potential to be used to report unexpected events trough the
whole system, which are not errors per se, but something the user should
take action in order to improve the situation.
In this case, we are creating a sub type of those tasks: DiscoverEC2.

From now on, when the DiscoveryService fails to auto-enroll an instance,
it will create a DiscoverEC2 User Task grouping all the failed instances
by the following props:
- integration
- issue type
- account id
- region

A follow up PR will also create notifications so that the user can
actually be notified on those User Tasks and take action.

* improve naming and split function into multiple methods

* rename task issues variable name

* fix context early cancelation
  • Loading branch information
marcoandredinis authored Oct 16, 2024
1 parent 95af4c1 commit 34f9bb7
Show file tree
Hide file tree
Showing 8 changed files with 417 additions and 24 deletions.
30 changes: 15 additions & 15 deletions api/types/usertasks/object.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,40 +90,40 @@ const (
// This value is used to populate the UserTasks.Spec.IssueType for Discover EC2 tasks.
// The Web UI will then use those identifiers to show detailed instructions on how to fix the issue.
const (
// AutoDiscoverEC2IssueScriptInstanceNotRegistered is used to identify instances that failed to auto-enroll
// AutoDiscoverEC2IssueSSMInstanceNotRegistered is used to identify instances that failed to auto-enroll
// because they are not present in Amazon Systems Manager.
// This usually means that the Instance does not have the SSM Agent running,
// or that the instance's IAM Profile does not allow have the managed IAM Policy AmazonSSMManagedInstanceCore assigned to it.
AutoDiscoverEC2IssueScriptInstanceNotRegistered = "ec2-ssm-agent-not-registered"
AutoDiscoverEC2IssueSSMInstanceNotRegistered = "ec2-ssm-agent-not-registered"

// AutoDiscoverEC2IssueScriptInstanceConnectionLost is used to identify instances that failed to auto-enroll
// AutoDiscoverEC2IssueSSMInstanceConnectionLost is used to identify instances that failed to auto-enroll
// because the agent lost connection to Amazon Systems Manager.
// This can happen if the user changed some setting in the instance's network or IAM profile.
AutoDiscoverEC2IssueScriptInstanceConnectionLost = "ec2-ssm-agent-connection-lost"
AutoDiscoverEC2IssueSSMInstanceConnectionLost = "ec2-ssm-agent-connection-lost"

// AutoDiscoverEC2IssueScriptInstanceUnsupportedOS is used to identify instances that failed to auto-enroll
// AutoDiscoverEC2IssueSSMInstanceUnsupportedOS is used to identify instances that failed to auto-enroll
// because its OS is not supported by teleport.
// This can happen if the instance is running Windows.
AutoDiscoverEC2IssueScriptInstanceUnsupportedOS = "ec2-ssm-unsupported-os"
AutoDiscoverEC2IssueSSMInstanceUnsupportedOS = "ec2-ssm-unsupported-os"

// AutoDiscoverEC2IssueScriptFailure is used to identify instances that failed to auto-enroll
// AutoDiscoverEC2IssueSSMScriptFailure is used to identify instances that failed to auto-enroll
// because the installation script failed.
// The invocation url must be included in the report, so that users can see what was wrong.
AutoDiscoverEC2IssueScriptFailure = "ec2-ssm-script-failure"
AutoDiscoverEC2IssueSSMScriptFailure = "ec2-ssm-script-failure"

// AutoDiscoverEC2IssueInvocationFailure is used to identify instances that failed to auto-enroll
// AutoDiscoverEC2IssueSSMInvocationFailure is used to identify instances that failed to auto-enroll
// because the SSM Script Run (also known as Invocation) failed.
// This happens when there's a failure with permissions or an invalid configuration (eg, invalid document name).
AutoDiscoverEC2IssueInvocationFailure = "ec2-ssm-invocation-failure"
AutoDiscoverEC2IssueSSMInvocationFailure = "ec2-ssm-invocation-failure"
)

// discoverEC2IssueTypes is a list of issue types that can occur when trying to auto enroll EC2 instances.
var discoverEC2IssueTypes = []string{
AutoDiscoverEC2IssueScriptInstanceNotRegistered,
AutoDiscoverEC2IssueScriptInstanceConnectionLost,
AutoDiscoverEC2IssueScriptInstanceUnsupportedOS,
AutoDiscoverEC2IssueScriptFailure,
AutoDiscoverEC2IssueInvocationFailure,
AutoDiscoverEC2IssueSSMInstanceNotRegistered,
AutoDiscoverEC2IssueSSMInstanceConnectionLost,
AutoDiscoverEC2IssueSSMInstanceUnsupportedOS,
AutoDiscoverEC2IssueSSMScriptFailure,
AutoDiscoverEC2IssueSSMInvocationFailure,
}

// ValidateUserTask validates the UserTask object without modifying it.
Expand Down
3 changes: 3 additions & 0 deletions lib/auth/authclient/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -755,6 +755,9 @@ type ReadDiscoveryAccessPoint interface {

// GetProxies returns a list of registered proxies.
GetProxies() ([]types.Server, error)

// GetUserTask gets a single User Task by its name.
GetUserTask(ctx context.Context, name string) (*usertasksv1.UserTask, error)
}

// DiscoveryAccessPoint is an API interface implemented by a certificate authority (CA) to be
Expand Down
26 changes: 25 additions & 1 deletion lib/srv/discovery/discovery.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,15 +36,18 @@ import (
"github.com/gravitational/trace"
"github.com/jonboulle/clockwork"
"github.com/sirupsen/logrus"
"google.golang.org/protobuf/types/known/timestamppb"
"k8s.io/client-go/kubernetes"
"k8s.io/client-go/rest"

"github.com/gravitational/teleport"
"github.com/gravitational/teleport/api/client/proto"
usertasksv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/usertasks/v1"
usageeventsv1 "github.com/gravitational/teleport/api/gen/proto/go/usageevents/v1"
"github.com/gravitational/teleport/api/types"
"github.com/gravitational/teleport/api/types/discoveryconfig"
apievents "github.com/gravitational/teleport/api/types/events"
"github.com/gravitational/teleport/api/types/usertasks"
"github.com/gravitational/teleport/api/utils/retryutils"
"github.com/gravitational/teleport/lib/auth/authclient"
"github.com/gravitational/teleport/lib/cloud"
Expand Down Expand Up @@ -329,6 +332,7 @@ type Server struct {

awsSyncStatus awsSyncStatus
awsEC2ResourcesStatus awsResourcesStatus
awsEC2Tasks awsEC2Tasks

// caRotationCh receives nodes that need to have their CAs rotated.
caRotationCh chan []types.Server
Expand Down Expand Up @@ -459,7 +463,8 @@ func (s *Server) initAWSWatchers(matchers []types.AWSMatcher) error {
server.WithPollInterval(s.PollInterval),
server.WithTriggerFetchC(s.newDiscoveryConfigChangedSub()),
server.WithPreFetchHookFn(func() {
s.awsEC2ResourcesStatus.iterationStarted()
s.awsEC2ResourcesStatus.reset()
s.awsEC2Tasks.reset()
}),
)
if err != nil {
Expand Down Expand Up @@ -972,6 +977,24 @@ func (s *Server) handleEC2RemoteInstallation(instances *server.EC2Instances) err
discoveryConfig: instances.DiscoveryConfig,
integration: instances.Integration,
}, len(req.Instances))

for _, instance := range req.Instances {
s.awsEC2Tasks.addFailedEnrollment(
awsEC2TaskKey{
accountID: instances.AccountID,
integration: instances.Integration,
issueType: usertasks.AutoDiscoverEC2IssueSSMInvocationFailure,
region: instances.Region,
},
&usertasksv1.DiscoverEC2Instance{
// TODO(marco): add instance name
DiscoveryConfig: instances.DiscoveryConfig,
DiscoveryGroup: s.DiscoveryGroup,
InstanceId: instance.InstanceID,
SyncTime: timestamppb.New(s.clock.Now()),
},
)
}
return trace.Wrap(err)
}
return nil
Expand Down Expand Up @@ -1084,6 +1107,7 @@ func (s *Server) handleEC2Discovery() {
}

s.updateDiscoveryConfigStatus(instances.EC2.DiscoveryConfig)
s.upsertTasksForAWSEC2FailedEnrollments()
case <-s.ctx.Done():
s.ec2Watcher.Stop()
return
Expand Down
114 changes: 113 additions & 1 deletion lib/srv/discovery/discovery_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ import (
"github.com/gravitational/teleport/api/defaults"
discoveryconfigv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/discoveryconfig/v1"
integrationpb "github.com/gravitational/teleport/api/gen/proto/go/teleport/integration/v1"
usertasksv1 "github.com/gravitational/teleport/api/gen/proto/go/teleport/usertasks/v1"
usageeventsv1 "github.com/gravitational/teleport/api/gen/proto/go/usageevents/v1"
"github.com/gravitational/teleport/api/internalutils/stream"
"github.com/gravitational/teleport/api/types"
Expand Down Expand Up @@ -199,9 +200,14 @@ func genEC2Instances(n int) []*ec2.Instance {
type mockSSMInstaller struct {
mu sync.Mutex
installedInstances map[string]struct{}
runError error
}

func (m *mockSSMInstaller) Run(_ context.Context, req server.SSMRunRequest) error {
if m.runError != nil {
return m.runError
}

m.mu.Lock()
defer m.mu.Unlock()
for _, inst := range req.Instances {
Expand Down Expand Up @@ -250,8 +256,9 @@ func TestDiscoveryServer(t *testing.T) {
)
require.NoError(t, err)

dcForEC2SSMWithIntegrationName := uuid.NewString()
dcForEC2SSMWithIntegration, err := discoveryconfig.NewDiscoveryConfig(
header.Metadata{Name: uuid.NewString()},
header.Metadata{Name: dcForEC2SSMWithIntegrationName},
discoveryconfig.Spec{
DiscoveryGroup: defaultDiscoveryGroup,
AWS: []types.AWSMatcher{{
Expand All @@ -269,6 +276,26 @@ func TestDiscoveryServer(t *testing.T) {
)
require.NoError(t, err)

discoveryConfigForUserTaskTestName := uuid.NewString()
discoveryConfigForUserTaskTest, err := discoveryconfig.NewDiscoveryConfig(
header.Metadata{Name: discoveryConfigForUserTaskTestName},
discoveryconfig.Spec{
DiscoveryGroup: defaultDiscoveryGroup,
AWS: []types.AWSMatcher{{
Types: []string{"ec2"},
Regions: []string{"eu-west-2"},
Tags: map[string]utils.Strings{"RunDiscover": {"please"}},
SSM: &types.AWSSSM{DocumentName: "document"},
Params: &types.InstallerParams{
InstallTeleport: true,
EnrollMode: types.InstallParamEnrollMode_INSTALL_PARAM_ENROLL_MODE_SCRIPT,
},
Integration: "my-integration",
}},
},
)
require.NoError(t, err)

tcs := []struct {
name string
// presentInstances is a list of servers already present in teleport
Expand All @@ -280,6 +307,8 @@ func TestDiscoveryServer(t *testing.T) {
staticMatchers Matchers
wantInstalledInstances []string
wantDiscoveryConfigStatus *discoveryconfig.Status
userTasksDiscoverEC2Check require.ValueAssertionFunc
ssmRunError error
}{
{
name: "no nodes present, 1 found ",
Expand Down Expand Up @@ -538,6 +567,74 @@ func TestDiscoveryServer(t *testing.T) {
},
wantInstalledInstances: []string{"instance-id-1"},
},
{
name: "one node found but SSM Run fails and DiscoverEC2 User Task is created",
presentInstances: []types.Server{},
foundEC2Instances: []*ec2.Instance{
{
InstanceId: aws.String("instance-id-1"),
Tags: []*ec2.Tag{{
Key: aws.String("env"),
Value: aws.String("dev"),
}},
State: &ec2.InstanceState{
Name: aws.String(ec2.InstanceStateNameRunning),
},
},
},
ssm: &mockSSMClient{
commandOutput: &ssm.SendCommandOutput{
Command: &ssm.Command{
CommandId: aws.String("command-id-1"),
},
},
invokeOutput: &ssm.GetCommandInvocationOutput{
Status: aws.String(ssm.CommandStatusSuccess),
ResponseCode: aws.Int64(0),
},
},
ssmRunError: trace.BadParameter("ssm run failed"),
emitter: &mockEmitter{
eventHandler: func(t *testing.T, ae events.AuditEvent, server *Server) {
t.Helper()
require.Equal(t, &events.SSMRun{
Metadata: events.Metadata{
Type: libevents.SSMRunEvent,
Code: libevents.SSMRunSuccessCode,
},
CommandID: "command-id-1",
AccountID: "owner",
InstanceID: "instance-id-1",
Region: "eu-central-1",
ExitCode: 0,
Status: ssm.CommandStatusSuccess,
}, ae)
},
},
staticMatchers: Matchers{},
discoveryConfig: discoveryConfigForUserTaskTest,
wantInstalledInstances: []string{},
userTasksDiscoverEC2Check: func(tt require.TestingT, i1 interface{}, i2 ...interface{}) {
existingTasks, ok := i1.([]*usertasksv1.UserTask)
require.True(t, ok, "failed to get existing tasks: %T", i1)
require.Len(t, existingTasks, 1)
existingTask := existingTasks[0]

require.Equal(t, "OPEN", existingTask.GetSpec().State)
require.Equal(t, "my-integration", existingTask.GetSpec().Integration)
require.Equal(t, "ec2-ssm-invocation-failure", existingTask.GetSpec().IssueType)
require.Equal(t, "owner", existingTask.GetSpec().GetDiscoverEc2().GetAccountId())
require.Equal(t, "eu-west-2", existingTask.GetSpec().GetDiscoverEc2().GetRegion())

taskInstances := existingTask.GetSpec().GetDiscoverEc2().Instances
require.Contains(t, taskInstances, "instance-id-1")
taskInstance := taskInstances["instance-id-1"]

require.Equal(t, "instance-id-1", taskInstance.InstanceId)
require.Equal(t, discoveryConfigForUserTaskTestName, taskInstance.DiscoveryConfig)
require.Equal(t, defaultDiscoveryGroup, taskInstance.DiscoveryGroup)
},
},
}

for _, tc := range tcs {
Expand Down Expand Up @@ -586,6 +683,7 @@ func TestDiscoveryServer(t *testing.T) {
reporter := &mockUsageReporter{}
installer := &mockSSMInstaller{
installedInstances: make(map[string]struct{}),
runError: tc.ssmRunError,
}
tlsServer.Auth().SetUsageReporter(reporter)

Expand Down Expand Up @@ -640,6 +738,20 @@ func TestDiscoveryServer(t *testing.T) {
return true
}, 500*time.Millisecond, 50*time.Millisecond)
}
if tc.userTasksDiscoverEC2Check != nil {
var allUserTasks []*usertasksv1.UserTask
var nextToken string
for {
var userTasks []*usertasksv1.UserTask
userTasks, nextToken, err = tlsServer.Auth().UserTasks.ListUserTasks(context.Background(), 0, "")
require.NoError(t, err)
allUserTasks = append(allUserTasks, userTasks...)
if nextToken == "" {
break
}
}
tc.userTasksDiscoverEC2Check(t, allUserTasks)
}
})
}
}
Expand Down
Loading

0 comments on commit 34f9bb7

Please sign in to comment.