From 9bb18ef26ae4b3779c73c2d26f9580238efd5fd3 Mon Sep 17 00:00:00 2001
From: Hugo Herter <git@hugoherter.com>
Date: Mon, 25 Sep 2023 16:20:30 +0200
Subject: [PATCH] fixup! Fix: Errors in allocation exited entire scheduling

---
 vm_supervisor/views/__init__.py | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/vm_supervisor/views/__init__.py b/vm_supervisor/views/__init__.py
index 57842abff..7f1808ecd 100644
--- a/vm_supervisor/views/__init__.py
+++ b/vm_supervisor/views/__init__.py
@@ -1,16 +1,15 @@
-import asyncio
 import binascii
-import json
 import logging
 from hashlib import sha256
 from pathlib import Path
 from string import Template
-from typing import Awaitable, Coroutine, Dict, List, Optional
+from typing import Awaitable, Dict, Optional
 
 import aiodns
 import aiohttp
 from aiohttp import web
 from aiohttp.web_exceptions import HTTPNotFound
+from aleph_message.exceptions import UnknownHashError
 from aleph_message.models import ItemHash
 from pydantic import ValidationError
 
@@ -222,7 +221,9 @@ async def update_allocations(request: web.Request):
 
     # Second start persistent VMs and instances sequentially to limit resource usage.
 
+    # Expected VM start failures, caught per VM so one error does not abort scheduling:
     vm_creation_exceptions = (
+        UnknownHashError,
         ResourceDownloadError,
         FileTooLargeError,
         VmSetupError,
@@ -234,9 +235,9 @@ async def update_allocations(request: web.Request):
 
     # Schedule the start of persistent VMs:
     for vm_hash in allocation.persistent_vms:
-        vm_hash = ItemHash(vm_hash)
-        logger.info(f"Starting long running VM {vm_hash}")
         try:
+            logger.info(f"Starting long running VM '{vm_hash}'")
+            vm_hash = ItemHash(vm_hash)
             await start_persistent_vm(vm_hash, pubsub)
         except vm_creation_exceptions as error:
             logger.exception(error)
@@ -244,13 +245,13 @@ async def update_allocations(request: web.Request):
 
     # Schedule the start of instances:
     for instance_hash in allocation.instances:
-        instance_hash = ItemHash(instance_hash)
-        logger.info(f"Starting instance {instance_hash}")
+        logger.info(f"Starting instance '{instance_hash}'")
         try:
-            await start_persistent_vm(vm_hash, pubsub)
+            instance_hash = ItemHash(instance_hash)
+            await start_persistent_vm(instance_hash, pubsub)
         except vm_creation_exceptions as error:
             logger.exception(error)
-            scheduling_errors[vm_hash] = error
+            scheduling_errors[instance_hash] = error
 
     # Log unsupported features
     if allocation.on_demand_vms:
@@ -274,6 +275,9 @@ async def update_allocations(request: web.Request):
             "success": not failing,
             "successful": list(successful),
             "failing": list(failing),
+            "errors": {
+                vm_hash: repr(error) for vm_hash, error in scheduling_errors.items()
+            },
         },
         status=status_code,
     )