Skip to content

Commit

Permalink
Merge pull request #31 from rewiringamerica/natalie/list_jobs
Browse files Browse the repository at this point in the history
Updates to --list_jobs and batch settings
  • Loading branch information
nweires authored Nov 27, 2023
2 parents a019c83 + ac67e71 commit e034567
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 28 deletions.
65 changes: 43 additions & 22 deletions buildstockbatch/gcp/gcp.py
Original file line number Diff line number Diff line change
Expand Up @@ -382,31 +382,43 @@ def clean(self):
delete_job(self.gcp_batch_job_name)
self.clean_postprocessing_job()

def show_jobs(self):
    """Show the existing GCP Batch and Cloud Run jobs that match this project, if any.

    Looks up the simulation job in GCP Batch by ``self.gcp_batch_job_name`` and
    the post-processing job in Cloud Run by ``self.postprocessing_job_name``,
    logging a short status summary for each, plus a console URL for browsing
    all jobs in the project.
    """
    # GCP Batch job that runs the simulations
    client = batch_v1.BatchServiceClient()
    try:
        job = client.get_job(batch_v1.GetJobRequest(name=self.gcp_batch_job_name))
        logger.info("Batch job")
        logger.info(f"  Name: {job.name}")
        logger.info(f"  UID: {job.uid}")
        logger.info(f"  Status: {job.status.state.name}")
        # Sum per-task-group status counts into a single overall tally.
        task_counts = collections.defaultdict(int)
        for group in job.status.task_groups.values():
            for status, count in group.counts.items():
                task_counts[status] += count
        logger.info(f"  Task statuses: {dict(task_counts)}")
        logger.debug(f"Full job info:\n{job}")
    except exceptions.NotFound:
        logger.info(f"No existing Batch jobs match: {self.gcp_batch_job_name}")
    # NOTE(review): the scraped diff does not preserve indentation, so it is
    # ambiguous whether this link is logged only in the NotFound case or
    # unconditionally; logging it unconditionally — confirm against upstream.
    logger.info(f"See all Batch jobs at https://console.cloud.google.com/batch/jobs?project={self.gcp_project}")

    # Postprocessing Cloud Run job
    jobs_client = run_v2.JobsClient()
    try:
        job = jobs_client.get_job(name=self.postprocessing_job_name)
        last_execution = job.latest_created_execution
        # An execution without a completion_time is still in progress.
        status = "Running"
        if last_execution.completion_time:
            status = "Completed"
        logger.info("Post-processing Cloud Run job")
        logger.info(f"  Name: {job.name}")
        logger.info(f"  Status of latest run ({last_execution.name}): {status}")
        logger.debug(f"Full job info:\n{job}")
    except exceptions.NotFound:
        logger.info(f"No existing Cloud Run jobs match {self.postprocessing_job_name}")
    # NOTE(review): same indentation ambiguity as the Batch link above.
    logger.info(f"See all Cloud Run jobs at https://console.cloud.google.com/run/jobs?project={self.gcp_project}")

def run_batch(self):
"""
Expand Down Expand Up @@ -570,13 +582,22 @@ def run_batch(self):
memory_mib=job_env_cfg.get("memory_mib", 1024),
)

# Give three minutes per simulation, plus ten minutes for job overhead
task_duration_secs = 60 * (10 + n_sims_per_job * 3)
task = batch_v1.TaskSpec(
runnables=[runnable],
compute_resource=resources,
# TODO: Confirm what happens if this fails repeatedly, or for only some tasks, and document it.
max_retry_count=2,
# TODO: How long does this timeout need to be?
max_run_duration="5000s",
# Allow retries, but only when the machine is preempted.
max_retry_count=3,
lifecycle_policies=[
batch_v1.LifecyclePolicy(
action=batch_v1.LifecyclePolicy.Action.RETRY_TASK,
action_condition=batch_v1.LifecyclePolicy.ActionCondition(
exit_codes=[50001] # Exit code for preemptions
),
)
],
max_run_duration=f"{task_duration_secs}s",
)

# How many of these tasks to run.
Expand Down Expand Up @@ -1087,7 +1108,7 @@ def main():
help="Only validate the project YAML file and references. Nothing is executed",
action="store_true",
)
group.add_argument("--list_jobs", help="List existing jobs", action="store_true")
group.add_argument("--show_jobs", help="List existing jobs", action="store_true")
group.add_argument(
"--postprocessonly",
help="Only do postprocessing, useful for when the simulations are already done",
Expand Down Expand Up @@ -1120,8 +1141,8 @@ def main():
if args.clean:
batch.clean()
return
if args.list_jobs:
batch.list_jobs()
if args.show_jobs:
batch.show_jobs()
return
elif args.postprocessonly:
batch.build_image()
Expand Down
12 changes: 6 additions & 6 deletions docs/run_sims.rst
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ Running a project file is straightforward. Call the ``buildstock_local`` command

Eagle
~~~~~
After you have :ref:`activated the appropriate conda environment on Eagle <eagle_install>`,
After you have :ref:`activated the appropriate conda environment on Eagle <eagle_install>`,
you can submit a project file to be simulated by passing it to the ``buildstock_eagle`` command.

.. command-output:: buildstock_eagle --help
Expand All @@ -40,8 +40,8 @@ Eagle specific project configuration
To get a project to run on Eagle, you will need to make a few changes to your :doc:`project_defn`.
First, the ``output_directory`` should be in ``/scratch/your_username/some_directory`` or in ``/projects`` somewhere.
Building stock simulations generate a lot of output quickly, and the ``/scratch`` and ``/projects`` filesystems are
equipped to handle that kind of I/O throughput, whereas your ``/home`` directory is not; heavy output there may cause
stability issues across the whole system.
equipped to handle that kind of I/O throughput, whereas your ``/home`` directory is not; heavy output there may cause
stability issues across the whole system.

Next, you will need to add an ``eagle`` top level key to the project file, which will look something like this

Expand Down Expand Up @@ -69,7 +69,7 @@ jobs runs a batch of simulations on a single compute node. Sometimes a handful
of jobs will fail due to issues with Eagle (filesystem or timeouts). If most of
the jobs succeeded, rather than rerun everything you can resubmit just the jobs
that failed with the ``--rerun_failed`` command line argument. This will also
clear out and rerun the postprocessing.
clear out and rerun the postprocessing.


Amazon Web Services
Expand Down Expand Up @@ -152,9 +152,9 @@ this option makes it easier to quickly assign a new ID with each run. It also ma
List existing jobs
..................

Run ``buildstock_gcp --list_jobs your_project_file.yml`` to see a list of all existing
Run ``buildstock_gcp --show_jobs your_project_file.yml`` to see the existing
jobs matching the project specified. This can show you whether a previously-started job
has completed or is still running.
has completed, is still running, or has already been cleaned up.


Cleaning up after yourself
Expand Down

0 comments on commit e034567

Please sign in to comment.