Skip to content

Commit

Permalink
Merge pull request #31 from rewiringamerica/natalie/list_jobs
Browse files Browse the repository at this point in the history
Updates to --list_jobs and batch settings
  • Loading branch information
nweires authored Nov 27, 2023
2 parents a019c83 + ac67e71 commit e034567
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 28 deletions.
65 changes: 43 additions & 22 deletions buildstockbatch/gcp/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,31 +382,43 @@ def clean(self):
delete_job(self.gcp_batch_job_name)
self.clean_postprocessing_job()

def show_jobs(self):
    """Show the existing GCP Batch and Cloud Run jobs that match this project, if any.

    Looks up the simulation job in GCP Batch by ``self.gcp_batch_job_name`` and
    the post-processing job in Cloud Run by ``self.postprocessing_job_name``,
    logging a short status summary for each, plus a console URL for browsing
    all jobs in the project.
    """
    # GCP Batch job that runs the simulations
    client = batch_v1.BatchServiceClient()
    try:
        job = client.get_job(batch_v1.GetJobRequest(name=self.gcp_batch_job_name))
        logger.info("Batch job")
        logger.info(f"  Name: {job.name}")
        logger.info(f"  UID: {job.uid}")
        logger.info(f"  Status: {job.status.state.name}")
        # Sum per-task-group status counts into a single overall tally.
        task_counts = collections.defaultdict(int)
        for group in job.status.task_groups.values():
            for status, count in group.counts.items():
                task_counts[status] += count
        logger.info(f"  Task statuses: {dict(task_counts)}")
        logger.debug(f"Full job info:\n{job}")
    except exceptions.NotFound:
        logger.info(f"No existing Batch jobs match: {self.gcp_batch_job_name}")
    # NOTE(review): the scraped diff does not preserve indentation, so it is
    # ambiguous whether this link is logged only in the NotFound case or
    # unconditionally; logging it unconditionally — confirm against upstream.
    logger.info(f"See all Batch jobs at https://console.cloud.google.com/batch/jobs?project={self.gcp_project}")

    # Postprocessing Cloud Run job
    jobs_client = run_v2.JobsClient()
    try:
        job = jobs_client.get_job(name=self.postprocessing_job_name)
        last_execution = job.latest_created_execution
        # An execution without a completion_time is still in progress.
        status = "Running"
        if last_execution.completion_time:
            status = "Completed"
        logger.info("Post-processing Cloud Run job")
        logger.info(f"  Name: {job.name}")
        logger.info(f"  Status of latest run ({last_execution.name}): {status}")
        logger.debug(f"Full job info:\n{job}")
    except exceptions.NotFound:
        logger.info(f"No existing Cloud Run jobs match {self.postprocessing_job_name}")
    # NOTE(review): same indentation ambiguity as the Batch link above.
    logger.info(f"See all Cloud Run jobs at https://console.cloud.google.com/run/jobs?project={self.gcp_project}")

def run_batch(self):
"""
Expand Down Expand Up @@ -570,13 +582,22 @@ def run_batch(self):
memory_mib=job_env_cfg.get("memory_mib", 1024),
)

# Give three minutes per simulation, plus ten minutes for job overhead
task_duration_secs = 60 * (10 + n_sims_per_job * 3)
task = batch_v1.TaskSpec(
runnables=[runnable],
compute_resource=resources,
# TODO: Confirm what happens if this fails repeatedly, or for only some tasks, and document it.
max_retry_count=2,
# TODO: How long does this timeout need to be?
max_run_duration="5000s",
# Allow retries, but only when the machine is preempted.
max_retry_count=3,
lifecycle_policies=[
batch_v1.LifecyclePolicy(
action=batch_v1.LifecyclePolicy.Action.RETRY_TASK,
action_condition=batch_v1.LifecyclePolicy.ActionCondition(
exit_codes=[50001] # Exit code for preemptions
),
)
],
max_run_duration=f"{task_duration_secs}s",
)

# How many of these tasks to run.
Expand Down Expand Up @@ -1087,7 +1108,7 @@ def main():
help="Only validate the project YAML file and references. Nothing is executed",
action="store_true",
)
group.add_argument("--list_jobs", help="List existing jobs", action="store_true")
group.add_argument("--show_jobs", help="List existing jobs", action="store_true")
group.add_argument(
"--postprocessonly",
help="Only do postprocessing, useful for when the simulations are already done",
Expand Down Expand Up @@ -1120,8 +1141,8 @@ def main():
if args.clean:
batch.clean()
return
if args.list_jobs:
batch.list_jobs()
if args.show_jobs:
batch.show_jobs()
return
elif args.postprocessonly:
batch.build_image()
Expand Down
12 changes: 6 additions & 6 deletions docs/run_sims.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Running a project file is straightforward. Call the ``buildstock_local`` command

Eagle
~~~~~
After you have :ref:`activated the appropriate conda environment on Eagle <eagle_install>`,
After you have :ref:`activated the appropriate conda environment on Eagle <eagle_install>`,
you can submit a project file to be simulated by passing it to the ``buildstock_eagle`` command.

.. command-output:: buildstock_eagle --help
Expand All @@ -40,8 +40,8 @@ Eagle specific project configuration
To get a project to run on Eagle, you will need to make a few changes to your :doc:`project_defn`.
First, the ``output_directory`` should be in ``/scratch/your_username/some_directory`` or in ``/projects`` somewhere.
Building stock simulations generate a lot of output quickly, and the ``/scratch`` and ``/projects`` filesystems are
equipped to handle that kind of I/O throughput, whereas your ``/home`` directory is not; heavy output there may cause
stability issues across the whole system.
equipped to handle that kind of I/O throughput, whereas your ``/home`` directory is not; heavy output there may cause
stability issues across the whole system.

Next, you will need to add an ``eagle`` top level key to the project file, which will look something like this

Expand Down Expand Up @@ -69,7 +69,7 @@ jobs runs a batch of simulations on a single compute node. Sometimes a handful
of jobs will fail due to issues with Eagle (filesystem or timeouts). If most of
the jobs succeeded, rather than rerun everything you can resubmit just the jobs
that failed with the ``--rerun_failed`` command line argument. This will also
clear out and rerun the postprocessing.
clear out and rerun the postprocessing.


Amazon Web Services
Expand Down Expand Up @@ -152,9 +152,9 @@ this option makes it easier to quickly assign a new ID with each run. It also ma
List existing jobs
..................

Run ``buildstock_gcp --list_jobs your_project_file.yml`` to see a list of all existing
Run ``buildstock_gcp --show_jobs your_project_file.yml`` to see the existing
jobs matching the project specified. This can show you whether a previously-started job
has completed or is still running.
has completed, is still running, or has already been cleaned up.


Cleaning up after yourself
Expand Down

0 comments on commit e034567

Please sign in to comment.