Skip to content

Commit

Permalink
Revert "refactor(disruption): use fixed maxUnavailable (#639)"
Browse files Browse the repository at this point in the history
This reverts commit 4fe0422.
  • Loading branch information
hspedro committed Oct 22, 2024
1 parent 9682f8f commit 3bc165a
Show file tree
Hide file tree
Showing 41 changed files with 649 additions and 656 deletions.
1 change: 0 additions & 1 deletion cmd/managementapi/wire.go
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,6 @@ func initializeManagementMux(ctx context.Context, conf config.Config) (*runtime.
providers.ProvideDefinitionConstructors,

// services
service.NewSchedulerManagerConfig,
service.NewSchedulerManager,
service.NewOperationManager,

Expand Down
6 changes: 1 addition & 5 deletions cmd/managementapi/wire_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

13 changes: 11 additions & 2 deletions cmd/runtimewatcher/wire.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ import (
"github.com/topfreegames/maestro/internal/core/services/events"
"github.com/topfreegames/maestro/internal/core/services/workers"
"github.com/topfreegames/maestro/internal/core/worker"
workerconfigs "github.com/topfreegames/maestro/internal/core/worker/config"
"github.com/topfreegames/maestro/internal/core/worker/runtimewatcher"
"github.com/topfreegames/maestro/internal/service"
)
Expand All @@ -42,16 +43,24 @@ func provideRuntimeWatcherBuilder() *worker.WorkerBuilder {
}
}

func provideRuntimeWatcherConfig(c config.Config) *workerconfigs.RuntimeWatcherConfig {
return &workerconfigs.RuntimeWatcherConfig{
DisruptionWorkerIntervalSeconds: c.GetDuration("runtimeWatcher.disruptionWorker.intervalSeconds"),
DisruptionSafetyPercentage: c.GetFloat64("runtimeWatcher.disruptionWorker.safetyPercentage"),
}
}

var WorkerOptionsSet = wire.NewSet(
service.NewRuntimeKubernetes,
service.NewRoomStorageRedis,
RoomManagerSet,
wire.Struct(new(worker.WorkerOptions), "RoomManager", "Runtime"))
provideRuntimeWatcherConfig,
wire.Struct(new(worker.WorkerOptions), "Runtime", "RoomStorage", "RoomManager", "RuntimeWatcherConfig"))

var RoomManagerSet = wire.NewSet(
service.NewSchedulerStoragePg,
service.NewClockTime,
service.NewPortAllocatorRandom,
service.NewRoomStorageRedis,
service.NewGameRoomInstanceStorageRedis,
service.NewSchedulerCacheRedis,
service.NewRoomManagerConfig,
Expand Down
28 changes: 20 additions & 8 deletions cmd/runtimewatcher/wire_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 0 additions & 1 deletion cmd/worker/wire.go
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,6 @@ func initializeWorker(c config.Config, builder *worker.WorkerBuilder) (*workerss
worker.ProvideWorkerOptions,
workersservice.NewWorkersManager,
service.NewSchedulerManager,
service.NewSchedulerManagerConfig,
)

return &workersservice.WorkersManager{}, nil
Expand Down
6 changes: 1 addition & 5 deletions cmd/worker/wire_gen.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 0 additions & 2 deletions config/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -79,8 +79,6 @@ operations:
limit: 1000

services:
schedulerManager:
defaultPdbMaxUnavailable: "5%"
roomManager:
roomPingTimeoutMillis: 240000
roomInitializationTimeoutMillis: 120000
Expand Down
46 changes: 2 additions & 44 deletions docs/reference/Scheduler.md
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,7 @@ autoscaling:
"metadata": {}
}
}
],
"pdbMaxUnavailable": "5%",
]
"autoscaling": {
"enabled": true,
"min": 10,
Expand Down Expand Up @@ -235,7 +234,6 @@ forwarders: Forwarders
autoscaling: Autoscaling
spec: Spec
annotation: Map
pdbMaxUnavailable: String
```
- **Name**: Scheduler name. This name is unique and will be the same name used for the kubernetes namespace. It's
Expand All @@ -256,7 +254,6 @@ pdbMaxUnavailable: String
used by them, limits and images. More info [here](#spec).
- **annotations**: Allows annotations for the scheduler's game room. Know more about annotations on
Kubernetes [here](https://kubernetes.io/docs/concepts/overview/working-with-objects/annotations)
- **pdbMaxUnavailable**: Defines the disruption budget for game rooms. Optional and defaults to 5%. Value can be defined as a string representing the % between 0 and 100, "15%", or a raw number of rooms "100".
### PortRange
The **PortRange** is used to select a random port for a GRU between **start** and **end**.
Expand Down Expand Up @@ -406,43 +403,4 @@ It is represented as:
- **name**: Name of the port. Facilitates on recognition;
- **protocol**: Port protocol. Can be UDP, TCP or SCTP.;
- **port**: The port exposed.
- **hostPortRange**: The [port range](#portrange) for the port to be allocated in the host. Mutually exclusive with the port range configured in the root structure.

#### PDB Max Unavailable

A string value that defines the disruption budget of Game Rooms from a specific scheduler.
Maestro will create a [PDB Resource](https://kubernetes.io/docs/tasks/run-application/configure-pdb/)
to prevent evictions drastically impacting availability of the Game Rooms.

By default this value is set to 5%, so at worst runtime can evit 5% of the pods. There is no way to control
what pods will be evicted - if it prefers ready, pending, etc.

The configuration can be specified with this order of precedence:

1. Value specified in Scheduler's definition

```json
{
"pdbMaxUnavailable": "10%"
}
```

2. Value specified in the ENV VAR:

```shell
MAESTRO_SERVICES_SCHEDULERMANAGER_DEFAULTPDBMAXUNAVAILABLE="10%"
```

3. Value specified in the [config.yaml](../../config/config.yaml):

```yaml
services:
schedulerManager:
defaultPdbMaxUnavailable: "5%"
```

4. Value specified in [code](../../internal/core/entities/pdb/pdb.go) that defaults to 5%:

```go
const DefaultPdbMaxUnavailablePercentage = "5%"
```
- **hostPortRange**: The [port range](#portrange) for the port to be allocated in the host. Mutually exclusive with the port range configured in the root structure.
64 changes: 56 additions & 8 deletions internal/adapters/runtime/kubernetes/scheduler.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,18 +27,19 @@ import (
"strconv"

"github.com/topfreegames/maestro/internal/core/entities"
pdbEntity "github.com/topfreegames/maestro/internal/core/entities/pdb"
"github.com/topfreegames/maestro/internal/core/ports/errors"
"go.uber.org/zap"
v1 "k8s.io/api/core/v1"
v1Policy "k8s.io/api/policy/v1"
kerrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/intstr"
)

const (
MajorKubeVersionPDB int = 1
MinorKubeVersionPDB int = 21
DefaultDisruptionSafetyPercentage float64 = 0.05
MajorKubeVersionPDB int = 1
MinorKubeVersionPDB int = 21
)

func (k *kubernetes) isPDBSupported() bool {
Expand Down Expand Up @@ -95,7 +96,10 @@ func (k *kubernetes) createPDBFromScheduler(ctx context.Context, scheduler *enti
},
},
Spec: v1Policy.PodDisruptionBudgetSpec{
MaxUnavailable: pdbEntity.ConvertStrToSpec(scheduler.PdbMaxUnavailable),
MinAvailable: &intstr.IntOrString{
Type: intstr.Int,
IntVal: int32(0),
},
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"maestro-scheduler": scheduler.Name,
Expand Down Expand Up @@ -169,8 +173,29 @@ func (k *kubernetes) DeleteScheduler(ctx context.Context, scheduler *entities.Sc
return nil
}

func (k *kubernetes) UpdateScheduler(ctx context.Context, scheduler *entities.Scheduler) error {
// Check if PDB exists, if not, create it
func (k *kubernetes) MitigateDisruption(
ctx context.Context,
scheduler *entities.Scheduler,
roomAmount int,
safetyPercentage float64,
) error {
if scheduler == nil {
return errors.NewErrInvalidArgument("empty pointer received for scheduler, can not mitigate disruptions")
}

incSafetyPercentage := 1.0
if safetyPercentage < DefaultDisruptionSafetyPercentage {
k.logger.Warn(
"invalid safety percentage, using default percentage",
zap.Float64("safetyPercentage", safetyPercentage),
zap.Float64("DefaultDisruptionSafetyPercentage", DefaultDisruptionSafetyPercentage),
)
safetyPercentage = DefaultDisruptionSafetyPercentage
}
incSafetyPercentage += safetyPercentage

// For kubernetes mitigating disruptions means updating the current PDB
// minAvailable to the number of occupied rooms if above a threshold
pdb, err := k.clientSet.PolicyV1().PodDisruptionBudgets(scheduler.Name).Get(ctx, scheduler.Name, metav1.GetOptions{})
if err != nil && !kerrors.IsNotFound(err) {
// Non-recoverable errors
Expand All @@ -184,8 +209,31 @@ func (k *kubernetes) UpdateScheduler(ctx context.Context, scheduler *entities.Sc
}
}

var currentPdbMinAvailable int32
// PDB might exist and is based on MaxUnavailable
if pdb.Spec.MinAvailable != nil {
currentPdbMinAvailable = pdb.Spec.MinAvailable.IntVal
}

if currentPdbMinAvailable == int32(float64(roomAmount)*incSafetyPercentage) {
return nil
}

// In theory, the PDB object can be changed in the runtime in the meantime after
// fetching initial state/ask for creation (beginning of the function) and before
// updating the value. This should never happen in production because there is only
// one agent setting this PDB in the namespace and it's the worker. However, on tests
// we were seeing intermittent failures running parallel cases, hence why adding this
// code it is safer to update the PDB object
pdb, err = k.clientSet.PolicyV1().PodDisruptionBudgets(scheduler.Name).Get(ctx, scheduler.Name, metav1.GetOptions{})
if err != nil || pdb == nil {
return errors.NewErrUnexpected("non recoverable error when getting PDB for scheduler '%s': %s", scheduler.Name, err)
}
pdb.Spec = v1Policy.PodDisruptionBudgetSpec{
MaxUnavailable: pdbEntity.ConvertStrToSpec(scheduler.PdbMaxUnavailable),
MinAvailable: &intstr.IntOrString{
Type: intstr.Int,
IntVal: int32(float64(roomAmount) * incSafetyPercentage),
},
Selector: &metav1.LabelSelector{
MatchLabels: map[string]string{
"maestro-scheduler": scheduler.Name,
Expand All @@ -195,7 +243,7 @@ func (k *kubernetes) UpdateScheduler(ctx context.Context, scheduler *entities.Sc

_, err = k.clientSet.PolicyV1().PodDisruptionBudgets(scheduler.Name).Update(ctx, pdb, metav1.UpdateOptions{})
if err != nil {
return errors.NewErrUnexpected("error updating PDB for scheduler '%s': %s", scheduler.Name, err)
return errors.NewErrUnexpected("error updating PDB to mitigate disruptions for scheduler '%s': %s", scheduler.Name, err)
}

return nil
Expand Down
Loading

0 comments on commit 3bc165a

Please sign in to comment.