Skip to content

Commit

Permalink
Fix resume runtime after a pause (#4904)
Browse files: browse the repository at this point in the history
  • Loading branch information
rbren authored Nov 12, 2024
1 parent d9c5f11 commit 0633a99
Showing 1 changed file with 35 additions and 7 deletions.
42 changes: 35 additions & 7 deletions openhands/runtime/impl/remote/remote_runtime.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,6 +138,7 @@ def _check_existing_runtime(self) -> bool:
response = self._send_request(
'GET',
f'{self.config.sandbox.remote_runtime_api_url}/sessions/{self.sid}',
is_retry=False,
timeout=5,
)
except requests.HTTPError as e:
Expand Down Expand Up @@ -168,6 +169,7 @@ def _build_runtime(self):
response = self._send_request(
'GET',
f'{self.config.sandbox.remote_runtime_api_url}/registry_prefix',
is_retry=False,
timeout=10,
)
response_json = response.json()
Expand Down Expand Up @@ -198,6 +200,7 @@ def _build_runtime(self):
response = self._send_request(
'GET',
f'{self.config.sandbox.remote_runtime_api_url}/image_exists',
is_retry=False,
params={'image': self.container_image},
timeout=10,
)
Expand Down Expand Up @@ -234,6 +237,7 @@ def _start_runtime(self):
response = self._send_request(
'POST',
f'{self.config.sandbox.remote_runtime_api_url}/start',
is_retry=False,
json=start_request,
)
self._parse_runtime_response(response)
Expand All @@ -246,6 +250,7 @@ def _resume_runtime(self):
self._send_request(
'POST',
f'{self.config.sandbox.remote_runtime_api_url}/resume',
is_retry=False,
json={'runtime_id': self.runtime_id},
timeout=30,
)
Expand Down Expand Up @@ -283,14 +288,12 @@ def _wait_until_alive_impl(self):
assert runtime_data['runtime_id'] == self.runtime_id
assert 'pod_status' in runtime_data
pod_status = runtime_data['pod_status']
self.log('debug', runtime_data)
self.log('debug', f'Pod status: {pod_status}')

# FIXME: We should fix it at the backend of /start endpoint, make sure
# the pod is created before returning the response.
# Retry a period of time to give the cluster time to start the pod
if pod_status == 'Not Found':
raise RuntimeNotReadyError(
f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
)
if pod_status == 'Ready':
try:
self._send_request(
Expand All @@ -305,12 +308,23 @@ def _wait_until_alive_impl(self):
f'Runtime /alive failed to respond with 200: {e}'
)
return
if pod_status in ('Failed', 'Unknown'):
elif (
pod_status == 'Not Found'
or pod_status == 'Pending'
or pod_status == 'Running'
): # nb: Running is not yet Ready
raise RuntimeNotReadyError(
f'Runtime (ID={self.runtime_id}) is not yet ready. Status: {pod_status}'
)
elif pod_status in ('Failed', 'Unknown'):
# clean up the runtime
self.close()
raise RuntimeError(
f'Runtime (ID={self.runtime_id}) failed to start. Current status: {pod_status}'
)
else:
# Maybe this should be a hard failure, but passing through in case the API changes
self.log('warning', f'Unknown pod status: {pod_status}')

self.log(
'debug',
Expand All @@ -327,6 +341,7 @@ def close(self, timeout: int = 10):
response = self._send_request(
'POST',
f'{self.config.sandbox.remote_runtime_api_url}/stop',
is_retry=False,
json={'runtime_id': self.runtime_id},
timeout=timeout,
)
Expand All @@ -342,7 +357,7 @@ def close(self, timeout: int = 10):
finally:
self.session.close()

def run_action(self, action: Action) -> Observation:
def run_action(self, action: Action, is_retry: bool = False) -> Observation:
if action.timeout is None:
action.timeout = self.config.sandbox.timeout
if isinstance(action, FileEditAction):
Expand All @@ -367,6 +382,7 @@ def run_action(self, action: Action) -> Observation:
response = self._send_request(
'POST',
f'{self.runtime_url}/execute_action',
is_retry=False,
json=request_body,
# wait a few more seconds to get the timeout error from client side
timeout=action.timeout + 5,
Expand All @@ -380,7 +396,7 @@ def run_action(self, action: Action) -> Observation:
)
return obs

def _send_request(self, method, url, **kwargs):
def _send_request(self, method, url, is_retry=False, **kwargs):
is_runtime_request = self.runtime_url and self.runtime_url in url
try:
return send_request(self.session, method, url, **kwargs)
Expand All @@ -392,6 +408,15 @@ def _send_request(self, method, url, **kwargs):
raise RuntimeDisconnectedError(
f'404 error while connecting to {self.runtime_url}'
)
elif is_runtime_request and e.response.status_code == 503:
if not is_retry:
self.log('warning', 'Runtime appears to be paused. Resuming...')
self._resume_runtime()
self._wait_until_alive()
return self._send_request(method, url, True, **kwargs)
else:
raise e

else:
raise e

Expand Down Expand Up @@ -444,6 +469,7 @@ def copy_to(
response = self._send_request(
'POST',
f'{self.runtime_url}/upload_file',
is_retry=False,
files=upload_data,
params=params,
timeout=300,
Expand All @@ -467,6 +493,7 @@ def list_files(self, path: str | None = None) -> list[str]:
response = self._send_request(
'POST',
f'{self.runtime_url}/list_files',
is_retry=False,
json=data,
timeout=30,
)
Expand All @@ -480,6 +507,7 @@ def copy_from(self, path: str) -> Path:
response = self._send_request(
'GET',
f'{self.runtime_url}/download_files',
is_retry=False,
params=params,
stream=True,
timeout=30,
Expand Down

0 comments on commit 0633a99

Please sign in to comment.