Skip to content

Commit

Permalink
fixup! Fix: Errors in allocation exited entire scheduling
Browse files Browse the repository at this point in the history
  • Loading branch information
hoh committed Sep 25, 2023
1 parent 6098a39 commit 9bb18ef
Showing 1 changed file with 13 additions and 9 deletions.
22 changes: 13 additions & 9 deletions vm_supervisor/views/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,15 @@
import asyncio
import binascii
import json
import logging
from hashlib import sha256
from pathlib import Path
from string import Template
from typing import Awaitable, Coroutine, Dict, List, Optional
from typing import Awaitable, Dict, Optional

import aiodns
import aiohttp
from aiohttp import web
from aiohttp.web_exceptions import HTTPNotFound
from aleph_message.exceptions import UnknownHashError
from aleph_message.models import ItemHash
from pydantic import ValidationError

Expand Down Expand Up
@@ -222,7 +221,9 @@ async def update_allocations(request: web.Request):

# Second, start persistent VMs and instances sequentially to limit resource usage.

# Exceptions that can be raised when starting a VM:
vm_creation_exceptions = (
UnknownHashError,
ResourceDownloadError,
FileTooLargeError,
VmSetupError,
Expand All
@@ -234,23 +235,23 @@ async def update_allocations(request: web.Request):

# Schedule the start of persistent VMs:
for vm_hash in allocation.persistent_vms:
vm_hash = ItemHash(vm_hash)
logger.info(f"Starting long running VM {vm_hash}")
try:
logger.info(f"Starting long running VM '{vm_hash}'")
vm_hash = ItemHash(vm_hash)
await start_persistent_vm(vm_hash, pubsub)
except vm_creation_exceptions as error:
logger.exception(error)
scheduling_errors[vm_hash] = error

# Schedule the start of instances:
for instance_hash in allocation.instances:
instance_hash = ItemHash(instance_hash)
logger.info(f"Starting instance {instance_hash}")
logger.info(f"Starting instance '{instance_hash}'")
try:
await start_persistent_vm(vm_hash, pubsub)
instance_hash = ItemHash(instance_hash)
await start_persistent_vm(instance_hash, pubsub)
except vm_creation_exceptions as error:
logger.exception(error)
scheduling_errors[vm_hash] = error
scheduling_errors[instance_hash] = error

# Log unsupported features
if allocation.on_demand_vms:
Expand All
@@ -274,6 +275,9 @@ async def update_allocations(request: web.Request):
"success": not failing,
"successful": list(successful),
"failing": list(failing),
"errors": {
vm_hash: repr(error) for vm_hash, error in scheduling_errors.items()
},
},
status=status_code,
)

0 comments on commit 9bb18ef

Please sign in to comment.