Skip to content

Commit

Permalink
Merge pull request #2397 from ASFHyP3/develop
Browse files Browse the repository at this point in the history
Release v7.8.0
  • Loading branch information
AndrewPlayer3 authored Aug 30, 2024
2 parents 66de587 + ef99d1c commit 9b75b01
Show file tree
Hide file tree
Showing 26 changed files with 212 additions and 19 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy-enterprise-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:
job_spec/ARIA_AUTORIFT.yml
job_spec/ARIA_RAIDER.yml
job_spec/INSAR_ISCE.yml
instance_types: m6id.xlarge,m6id.2xlarge,m6id.4xlarge,m6id.8xlarge,m6idn.xlarge,m6idn.2xlarge,m6idn.4xlarge,m6idn.8xlarge
instance_types: c6id.xlarge,c6id.2xlarge,c6id.4xlarge,c6id.8xlarge
default_max_vcpus: 640
expanded_max_vcpus: 640
required_surplus: 0
Expand Down
6 changes: 3 additions & 3 deletions .github/workflows/deploy-enterprise.yml
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ jobs:
job_spec/ARIA_AUTORIFT.yml
job_spec/ARIA_RAIDER.yml
job_spec/INSAR_ISCE.yml
instance_types: m6id.xlarge,m6id.2xlarge,m6id.4xlarge,m6id.8xlarge,m6idn.xlarge,m6idn.2xlarge,m6idn.4xlarge,m6idn.8xlarge
instance_types: c6id.xlarge,c6id.2xlarge,c6id.4xlarge,c6id.8xlarge
default_max_vcpus: 4000 # Max: 13000
expanded_max_vcpus: 4000 # Max: 13000
required_surplus: 0
Expand All @@ -65,7 +65,7 @@ jobs:
job_spec/ARIA_AUTORIFT.yml
job_spec/ARIA_RAIDER.yml
job_spec/INSAR_ISCE.yml
instance_types: m6id.xlarge,m6id.2xlarge,m6id.4xlarge,m6id.8xlarge,m6idn.xlarge,m6idn.2xlarge,m6idn.4xlarge,m6idn.8xlarge
instance_types: c6id.xlarge,c6id.2xlarge,c6id.4xlarge,c6id.8xlarge
default_max_vcpus: 1000 # Max: 10316
expanded_max_vcpus: 1000 # Max: 10316
required_surplus: 0
Expand All @@ -85,7 +85,7 @@ jobs:
job_spec/ARIA_AUTORIFT.yml
job_spec/ARIA_RAIDER.yml
job_spec/INSAR_ISCE.yml
instance_types: m6id.xlarge,m6id.2xlarge,m6id.4xlarge,m6id.8xlarge,m6idn.xlarge,m6idn.2xlarge,m6idn.4xlarge,m6idn.8xlarge
instance_types: c6id.xlarge,c6id.2xlarge,c6id.4xlarge,c6id.8xlarge
default_max_vcpus: 1600 # Max: 1652
expanded_max_vcpus: 1600 # Max: 1652
required_surplus: 0
Expand Down
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,17 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).


## [7.8.0]

### Added
- Allow overriding certain AWS Batch compute environment parameters (including instance types and AMI) within a job spec.
- Allow job spec tasks to require GPU resources.

### Changed
- The `SRG_GSLC` job type now runs within a GPU environment.
- Reverted ARIA HyP3 deployments back to the C instance family, including the job-spec CLI parameter `omp-num-threads` that ensures multiple jobs fit on a single instance.
- Deployments with INSAR_ISCE.yml job specs will now use a dedicated compute environment with on-demand instances instead of spot instances for INSAR_ISCE jobs.

## [7.7.2]

### Changed
Expand Down
53 changes: 51 additions & 2 deletions apps/compute-cf.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,15 @@ Outputs:
JobQueueArn:
Value: !Ref BatchJobQueue

{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set name = job_spec['compute_environment']['name'] %}
{{ name }}ComputeEnvironmentArn:
Value: !Ref {{ name }}ComputeEnvironment

{{ name }}JobQueueArn:
Value: !Ref {{ name }}JobQueue
{% endfor %}

TaskRoleArn:
Value: !GetAtt TaskRole.Arn

Expand Down Expand Up @@ -85,18 +94,58 @@ Resources:
Tags:
Name: !Ref AWS::StackName

BatchJobQueue:
Type: AWS::Batch::JobQueue
Properties:
Priority: 1
ComputeEnvironmentOrder:
- ComputeEnvironment: !Ref ComputeEnvironment
Order: 1
SchedulingPolicyArn: !Ref SchedulingPolicy

SchedulingPolicy:
Type: AWS::Batch::SchedulingPolicy

BatchJobQueue:
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set env = job_spec['compute_environment'] %}
{% set name = env['name'] %}
{% set instance_types = env['instance_types'].split(',') if 'instance_types' in env else '!Ref InstanceTypes' %}
{% set ami_id = env['ami_id'] if 'ami_id' in env else '!Ref AmiId' %}
{% set type = env['allocation_type'] if 'allocation_type' in env else 'SPOT' %}
{% set strategy = env['allocation_strategy'] if 'allocation_strategy' in env else 'SPOT_PRICE_CAPACITY_OPTIMIZED' %}
{{ name }}ComputeEnvironment:
Type: AWS::Batch::ComputeEnvironment
Properties:
ServiceRole: !GetAtt BatchServiceRole.Arn
Type: MANAGED
ComputeResources:
Type: {{ type }}
AllocationStrategy: {{ strategy }}
MinvCpus: 0
MaxvCpus: !Ref MaxvCpus
InstanceTypes: {{ instance_types }}
ImageId: {{ ami_id }}
Subnets: !Ref SubnetIds
InstanceRole: !Ref InstanceProfile
SecurityGroupIds:
- !Ref SecurityGroup
LaunchTemplate:
LaunchTemplateId: !Ref LaunchTemplate
Version: !GetAtt LaunchTemplate.LatestVersionNumber
Tags:
Name: !Ref AWS::StackName

{{ name }}JobQueue:
Type: AWS::Batch::JobQueue
Properties:
Priority: 1
ComputeEnvironmentOrder:
- ComputeEnvironment: !Ref ComputeEnvironment
- ComputeEnvironment: !Ref {{ name }}ComputeEnvironment
Order: 1
SchedulingPolicyArn: !Ref SchedulingPolicy

{% endfor %}

TaskRole:
Type: {{ 'Custom::JplRole' if security_environment in ('JPL', 'JPL-public') else 'AWS::IAM::Role' }}
Properties:
Expand Down
8 changes: 8 additions & 0 deletions apps/handle-batch-event/handle-batch-event-cf.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ Parameters:
JobQueueArn:
Type: String

{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{{ job_spec['compute_environment']['name'] }}JobQueueArn:
Type: String
{% endfor %}

JobsTable:
Type: String

Expand Down Expand Up @@ -95,6 +100,9 @@ Resources:
detail:
jobQueue:
- !Ref JobQueueArn
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
- !Ref {{ job_spec['compute_environment']['name'] }}JobQueueArn
{% endfor %}
status:
- RUNNING
Targets:
Expand Down
12 changes: 12 additions & 0 deletions apps/main-cf.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,10 @@ Resources:
Properties:
Parameters:
ComputeEnvironmentArn: !GetAtt Cluster.Outputs.ComputeEnvironmentArn
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set name = job_spec['compute_environment']['name'] %}
{{ name }}ComputeEnvironmentArn: !GetAtt Cluster.Outputs.{{ name }}ComputeEnvironmentArn
{% endfor %}
DefaultMaxvCpus: !Ref DefaultMaxvCpus
ExpandedMaxvCpus: !Ref ExpandedMaxvCpus
MonthlyBudget: !Ref MonthlyBudget
Expand All @@ -169,6 +173,10 @@ Resources:
Properties:
Parameters:
JobQueueArn: !GetAtt Cluster.Outputs.JobQueueArn
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set name = job_spec['compute_environment']['name'] %}
{{ name }}JobQueueArn: !GetAtt Cluster.Outputs.{{ name }}JobQueueArn
{% endfor %}
JobsTable: !Ref JobsTable
{% if security_environment == 'EDC' %}
SecurityGroupId: !GetAtt Cluster.Outputs.SecurityGroupId
Expand All @@ -181,6 +189,10 @@ Resources:
Properties:
Parameters:
JobQueueArn: !GetAtt Cluster.Outputs.JobQueueArn
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set name = job_spec['compute_environment']['name'] %}
{{ name }}JobQueueArn: !GetAtt Cluster.Outputs.{{ name }}JobQueueArn
{% endfor %}
TaskRoleArn: !GetAtt Cluster.Outputs.TaskRoleArn
JobsTable: !Ref JobsTable
Bucket: !Ref ContentBucket
Expand Down
64 changes: 63 additions & 1 deletion apps/scale-cluster/scale-cluster-cf.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,12 @@ Parameters:
ComputeEnvironmentArn:
Type: String

{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set name = job_spec['compute_environment']['name'] %}
{{ name }}ComputeEnvironmentArn:
Type: String
{% endfor %}

DefaultMaxvCpus:
Type: Number
MinValue: 0
Expand Down Expand Up @@ -79,7 +85,11 @@ Resources:
Resource: "*"
- Effect: Allow
Action: batch:UpdateComputeEnvironment
Resource: !Ref ComputeEnvironmentArn
Resource:
- !Ref ComputeEnvironmentArn
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
- !Ref {{ job_spec['compute_environment']['name'] }}ComputeEnvironmentArn
{% endfor %}

Lambda:
Type: AWS::Lambda::Function
Expand Down Expand Up @@ -118,6 +128,11 @@ Resources:
Targets:
- Arn: !GetAtt Lambda.Arn
Id: lambda
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set name = job_spec['compute_environment']['name'] %}
- Arn: !GetAtt {{ name }}Lambda.Arn
Id: {{ name }}lambda
{% endfor %}

EventPermission:
Type: AWS::Lambda::Permission
Expand All @@ -126,3 +141,50 @@ Resources:
Action: lambda:InvokeFunction
Principal: events.amazonaws.com
SourceArn: !GetAtt Schedule.Arn

# Per-environment scaling resources: for every job spec whose compute
# environment name is not 'Default', declare a log group, a scaling Lambda,
# its event invoke config, and an invoke permission. Each logical ID is
# prefixed with that environment name.
# NOTE(review): logical IDs derive from the environment name only, so two job
# specs sharing one non-default name would render the same IDs twice - confirm
# that names are unique across job specs.
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set name = job_spec['compute_environment']['name'] %}
# Log group for this environment's Lambda, retained for 90 days. The Jinja
# expression renders to /aws/lambda/${<name>Lambda}, which !Sub resolves
# against the Lambda resource declared just below.
{{ name }}LogGroup:
Type: AWS::Logs::LogGroup
Properties:
LogGroupName: !Sub "{{ '/aws/lambda/${' + name + 'Lambda}' }}"
RetentionInDays: 90

# Scaling function for this environment. Only COMPUTE_ENVIRONMENT_ARN is
# environment-specific; the budget and vCPU variables reference names that do
# not carry the environment prefix, and the function uses the Role resource.
{{ name }}Lambda:
Type: AWS::Lambda::Function
Properties:
Environment:
Variables:
COMPUTE_ENVIRONMENT_ARN: !Ref {{ name }}ComputeEnvironmentArn
MONTHLY_BUDGET: !Ref MonthlyBudget
DEFAULT_MAX_VCPUS: !Ref DefaultMaxvCpus
EXPANDED_MAX_VCPUS: !Ref ExpandedMaxvCpus
REQUIRED_SURPLUS: !Ref RequiredSurplus
Code: src/
Handler: scale_cluster.lambda_handler
MemorySize: 128
Role: !GetAtt Role.Arn
Runtime: python3.9
Timeout: 30
# VPC settings are rendered only for the EDC security environment.
{% if security_environment == 'EDC' %}
VpcConfig:
SecurityGroupIds:
- !Ref SecurityGroupId
SubnetIds: !Ref SubnetIds
{% endif %}

# Sets the retry count for asynchronous invocations of $LATEST to zero.
{{ name }}EventInvokeConfig:
Type: AWS::Lambda::EventInvokeConfig
Properties:
FunctionName: !Ref {{ name }}Lambda
Qualifier: $LATEST
MaximumRetryAttempts: 0

# Lets the events.amazonaws.com principal invoke this environment's Lambda,
# scoped to the Schedule resource's ARN.
{{ name }}EventPermission:
Type: AWS::Lambda::Permission
Properties:
FunctionName: !GetAtt {{ name }}Lambda.Arn
Action: lambda:InvokeFunction
Principal: events.amazonaws.com
SourceArn: !GetAtt Schedule.Arn
{% endfor %}
4 changes: 3 additions & 1 deletion apps/step-function.json.j2
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,9 @@
"Parameters": {
"JobDefinition": "{{ '${'+ snake_to_pascal_case(task['name']) + '}' }}",
"JobName.$": "$.job_id",
"JobQueue": "${JobQueueArn}",
{% set name = job_spec['compute_environment']['name'] %}
{% set job_queue = name + 'JobQueueArn' if 'Default' != name else 'JobQueueArn' %}
"JobQueue": "{{ '${' + job_queue + '}' }}",
"ShareIdentifier": "default",
"SchedulingPriorityOverride.$": "$.priority",
"Parameters.$": "$.job_parameters",
Expand Down
17 changes: 17 additions & 0 deletions apps/workflow-cf.yml.j2
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@ Parameters:
JobQueueArn:
Type: String

{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{{ job_spec['compute_environment']['name'] }}JobQueueArn:
Type: String
{% endfor %}

JobsTable:
Type: String

Expand Down Expand Up @@ -36,6 +41,7 @@ Outputs:
StepFunctionArn:
Value: !Ref StepFunction


Resources:
{% for job_type, job_spec in job_types.items() %}
{% for task in job_spec['tasks'] %}
Expand All @@ -60,6 +66,10 @@ Resources:
Value: "{{ task['vcpu'] }}"
- Type: MEMORY
Value: "{{ task['memory'] }}"
{% if 'gpu' in task %}
- Type: GPU
Value: "{{ task['gpu'] }}"
{% endif %}
Command:
{% for command in task['command'] %}
- {{ command }}
Expand All @@ -83,6 +93,10 @@ Resources:
DefinitionS3Location: step-function.json
DefinitionSubstitutions:
JobQueueArn: !Ref JobQueueArn
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
{% set name = job_spec['compute_environment']['name'] %}
{{ name }}JobQueueArn: !Ref {{ name }}JobQueueArn
{% endfor %}
{% for job_type, job_spec in job_types.items() %}
{% for task in job_spec['tasks'] %}
{{ snake_to_pascal_case(task['name']) }}: !Ref {{ snake_to_pascal_case(task['name']) }}
Expand Down Expand Up @@ -124,6 +138,9 @@ Resources:
Action: batch:SubmitJob
Resource:
- !Ref JobQueueArn
{% for job_type, job_spec in job_types.items() if 'Default' != job_spec['compute_environment']['name'] %}
- !Ref {{ job_spec['compute_environment']['name'] }}JobQueueArn
{% endfor %}
{% for job_type, job_spec in job_types.items() %}
{% for task in job_spec['tasks'] %}
- !Ref {{ snake_to_pascal_case(task['name']) }}
Expand Down
2 changes: 2 additions & 0 deletions job_spec/ARIA_AUTORIFT.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ AUTORIFT:
DEFAULT:
cost: 1.0
validators: []
compute_environment:
name: 'Default'
tasks:
- name: ''
image: ghcr.io/asfhyp3/hyp3-autorift
Expand Down
2 changes: 2 additions & 0 deletions job_spec/ARIA_RAIDER.yml
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ ARIA_RAIDER:
DEFAULT:
cost: 1.0
validators: []
compute_environment:
name: 'Default'
tasks:
- name: ''
image: ghcr.io/dbekaert/raider
Expand Down
2 changes: 2 additions & 0 deletions job_spec/AUTORIFT.yml
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ AUTORIFT:
DEFAULT:
cost: 1.0
validators: []
compute_environment:
name: 'Default'
tasks:
- name: ''
image: ghcr.io/asfhyp3/hyp3-autorift
Expand Down
2 changes: 2 additions & 0 deletions job_spec/AUTORIFT_ITS_LIVE.yml
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ AUTORIFT:
DEFAULT:
cost: 1.0
validators: []
compute_environment:
name: 'Default'
tasks:
- name: ''
image: ghcr.io/asfhyp3/hyp3-autorift
Expand Down
2 changes: 2 additions & 0 deletions job_spec/INSAR_GAMMA.yml
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ INSAR_GAMMA:
cost: 1.0
validators:
- check_dem_coverage
compute_environment:
name: 'Default'
tasks:
- name: ''
image: 845172464411.dkr.ecr.us-west-2.amazonaws.com/hyp3-gamma
Expand Down
Loading

0 comments on commit 9b75b01

Please sign in to comment.