From 9bb18ef26ae4b3779c73c2d26f9580238efd5fd3 Mon Sep 17 00:00:00 2001 From: Hugo Herter Date: Mon, 25 Sep 2023 16:20:30 +0200 Subject: [PATCH] fixup! Fix: Errors in allocation exited entire scheduling --- vm_supervisor/views/__init__.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/vm_supervisor/views/__init__.py b/vm_supervisor/views/__init__.py index 57842abff..7f1808ecd 100644 --- a/vm_supervisor/views/__init__.py +++ b/vm_supervisor/views/__init__.py @@ -1,16 +1,15 @@ -import asyncio import binascii -import json import logging from hashlib import sha256 from pathlib import Path from string import Template -from typing import Awaitable, Coroutine, Dict, List, Optional +from typing import Awaitable, Dict, Optional import aiodns import aiohttp from aiohttp import web from aiohttp.web_exceptions import HTTPNotFound +from aleph_message.exceptions import UnknownHashError from aleph_message.models import ItemHash from pydantic import ValidationError @@ -222,7 +221,9 @@ async def update_allocations(request: web.Request): # Second start persistent VMs and instances sequentially to limit resource usage. + # Exceptions that can be raised when starting a VM: vm_creation_exceptions = ( + UnknownHashError, ResourceDownloadError, FileTooLargeError, VmSetupError, @@ -234,9 +235,9 @@ async def update_allocations(request: web.Request): # Schedule the start of persistent VMs: for vm_hash in allocation.persistent_vms: - vm_hash = ItemHash(vm_hash) - logger.info(f"Starting long running VM {vm_hash}") try: + logger.info(f"Starting long running VM '{vm_hash}'") + vm_hash = ItemHash(vm_hash) await start_persistent_vm(vm_hash, pubsub) except vm_creation_exceptions as error: logger.exception(error) @@ -244,13 +245,13 @@ async def update_allocations(request: web.Request): # Schedule the start of instances: for instance_hash in allocation.instances: - instance_hash = ItemHash(instance_hash) - logger.info(f"Starting instance {instance_hash}") + logger.info(f"Starting instance '{instance_hash}'") try: - await start_persistent_vm(vm_hash, pubsub) + instance_hash = ItemHash(instance_hash) + await start_persistent_vm(instance_hash, pubsub) except vm_creation_exceptions as error: logger.exception(error) - scheduling_errors[vm_hash] = error + scheduling_errors[instance_hash] = error # Log unsupported features if allocation.on_demand_vms: @@ -274,6 +275,9 @@ async def update_allocations(request: web.Request): "success": not failing, "successful": list(successful), "failing": list(failing), + "errors": { + vm_hash: repr(error) for vm_hash, error in scheduling_errors.items() + }, }, status=status_code, )