-
Notifications
You must be signed in to change notification settings - Fork 27
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[Fetch Migration] Improvements to subprocess handling #372
Changes from all commits
1bb3905
c1d0c79
347bc5a
6af8de7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,8 +1,10 @@ | ||
import argparse | ||
import logging | ||
import math | ||
import subprocess | ||
import time | ||
from subprocess import Popen | ||
from typing import Optional, List | ||
import math | ||
|
||
import requests | ||
from prometheus_client import Metric | ||
|
@@ -11,20 +13,35 @@ | |
from endpoint_info import EndpointInfo | ||
from migration_monitor_params import MigrationMonitorParams | ||
|
||
__PROMETHEUS_METRICS_ENDPOINT = "/metrics/prometheus" | ||
__SHUTDOWN_ENDPOINT = "/shutdown" | ||
# Path to the Data Prepper Prometheus metrics API endpoint | ||
# Used to monitor the progress of the migration | ||
__METRICS_API_PATH = "/metrics/prometheus" | ||
__SHUTDOWN_API_PATH = "/shutdown" | ||
__DOC_SUCCESS_METRIC = "_opensearch_documentsSuccess" | ||
__RECORDS_IN_FLIGHT_METRIC = "_BlockingBuffer_recordsInFlight" | ||
__NO_PARTITIONS_METRIC = "_noPartitionsAcquired" | ||
|
||
|
||
# Gracefully shutdown a subprocess | ||
def shutdown_process(proc: Popen) -> Optional[int]: | ||
# Process is still running, send SIGTERM | ||
proc.terminate() | ||
try: | ||
proc.wait(timeout=60) | ||
except subprocess.TimeoutExpired: | ||
if proc.returncode is None: | ||
# Failed to terminate, send SIGKILL | ||
proc.kill() | ||
return proc.returncode | ||
|
||
|
||
def shutdown_pipeline(endpoint: EndpointInfo): | ||
shutdown_endpoint = endpoint.url + __SHUTDOWN_ENDPOINT | ||
shutdown_endpoint = endpoint.url + __SHUTDOWN_API_PATH | ||
requests.post(shutdown_endpoint, auth=endpoint.auth, verify=endpoint.verify_ssl) | ||
|
||
|
||
def fetch_prometheus_metrics(endpoint: EndpointInfo) -> Optional[List[Metric]]: | ||
metrics_endpoint = endpoint.url + __PROMETHEUS_METRICS_ENDPOINT | ||
metrics_endpoint = endpoint.url + __METRICS_API_PATH | ||
try: | ||
response = requests.get(metrics_endpoint, auth=endpoint.auth, verify=endpoint.verify_ssl) | ||
response.raise_for_status() | ||
|
@@ -65,35 +82,82 @@ def check_if_complete(doc_count: Optional[int], in_flight: Optional[int], no_par | |
return False | ||
|
||
|
||
def check_and_log_progress(endpoint_info: EndpointInfo, target_doc_count: int, prev_no_partitions_count: int) -> \ | ||
tuple[bool, int]: | ||
terminal: bool = False | ||
# If the API call fails, the response is empty | ||
metrics = fetch_prometheus_metrics(endpoint_info) | ||
if metrics is not None: | ||
success_docs = get_metric_value(metrics, __DOC_SUCCESS_METRIC) | ||
rec_in_flight = get_metric_value(metrics, __RECORDS_IN_FLIGHT_METRIC) | ||
no_partitions_count = get_metric_value(metrics, __NO_PARTITIONS_METRIC) | ||
Comment on lines
+91
to
+93
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What can go wrong in getting these values, since we're relying on not just one but two processes? |
||
if success_docs is not None: # pragma no cover | ||
completion_percentage: int = math.floor((success_docs * 100) / target_doc_count) | ||
progress_message: str = "Completed " + str(success_docs) + \ | ||
" docs ( " + str(completion_percentage) + "% )" | ||
logging.info(progress_message) | ||
else: | ||
logging.warning("Could not fetch progress stats from Data Prepper response, " + | ||
"will retry on next polling cycle...") | ||
terminal = check_if_complete(success_docs, rec_in_flight, no_partitions_count, prev_no_partitions_count, | ||
target_doc_count) | ||
if not terminal: | ||
# Save no_partitions_count | ||
prev_no_partitions_count = no_partitions_count | ||
else: | ||
logging.warning("Data Prepper metrics API call failed, will retry on next polling cycle...") | ||
# TODO - Handle idle non-terminal pipeline | ||
return terminal, prev_no_partitions_count | ||
|
||
|
||
def monitor_local(args: MigrationMonitorParams, dp_process: Popen, poll_interval_seconds: int = 30) -> Optional[int]: | ||
kartg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
endpoint_info = EndpointInfo(args.data_prepper_endpoint) | ||
target_doc_count: int = args.target_count | ||
# Counter to track the no_partition_count metric | ||
no_partition_count: int = 0 | ||
is_migration_complete = False | ||
logging.info("Starting migration monitor until target doc count: " + str(target_doc_count)) | ||
# Sets returncode. A value of None means the subprocess has not yet terminated | ||
dp_process.poll() | ||
while dp_process.returncode is None and not is_migration_complete: | ||
try: | ||
dp_process.wait(timeout=poll_interval_seconds) | ||
except subprocess.TimeoutExpired: | ||
pass | ||
if dp_process.returncode is None: | ||
is_migration_complete, no_partition_count = check_and_log_progress( | ||
endpoint_info, target_doc_count, no_partition_count) | ||
# Loop terminated | ||
if not is_migration_complete: | ||
logging.error("Migration did not complete, process exited with code: " + str(dp_process.returncode)) | ||
# TODO - Implement rollback | ||
kartg marked this conversation as resolved.
Show resolved
Hide resolved
|
||
logging.error("Please delete any partially migrated indices before retrying the migration.") | ||
return dp_process.returncode | ||
else: | ||
# Shut down Data Prepper pipeline via API | ||
logging.info("Migration monitor observed successful migration and idle pipeline, shutting down...\n") | ||
shutdown_pipeline(endpoint_info) | ||
if dp_process.returncode is None: | ||
# Workaround for https://github.com/opensearch-project/data-prepper/issues/3141 | ||
return shutdown_process(dp_process) | ||
else: | ||
return dp_process.returncode | ||
|
||
|
||
def run(args: MigrationMonitorParams, poll_interval_seconds: int = 30) -> None: | ||
endpoint = EndpointInfo(args.data_prepper_endpoint) | ||
endpoint_info = EndpointInfo(args.data_prepper_endpoint) | ||
target_doc_count: int = args.target_count | ||
prev_no_partitions_count = 0 | ||
terminal = False | ||
# Counter to track the no_partition_count metric | ||
no_partition_count: int = 0 | ||
is_migration_complete = False | ||
logging.info("Starting migration monitor until target doc count: " + str(target_doc_count)) | ||
while not terminal: | ||
while not is_migration_complete: | ||
time.sleep(poll_interval_seconds) | ||
# If the API call fails, the response is empty | ||
metrics = fetch_prometheus_metrics(endpoint) | ||
if metrics is not None: | ||
success_docs = get_metric_value(metrics, __DOC_SUCCESS_METRIC) | ||
rec_in_flight = get_metric_value(metrics, __RECORDS_IN_FLIGHT_METRIC) | ||
no_partitions_count = get_metric_value(metrics, __NO_PARTITIONS_METRIC) | ||
if success_docs is not None: # pragma no cover | ||
completion_percentage: int = math.floor((success_docs * 100) / target_doc_count) | ||
progress_message: str = "Completed " + str(success_docs) + \ | ||
" docs ( " + str(completion_percentage) + "% )" | ||
logging.info(progress_message) | ||
else: | ||
logging.info("Could not fetch metrics from Data Prepper, will retry on next polling cycle...") | ||
terminal = check_if_complete(success_docs, rec_in_flight, no_partitions_count, | ||
prev_no_partitions_count, target_doc_count) | ||
if not terminal: | ||
# Save no_partitions_count | ||
prev_no_partitions_count = no_partitions_count | ||
is_migration_complete, no_partition_count = check_and_log_progress( | ||
endpoint_info, target_doc_count, no_partition_count) | ||
# Loop terminated, shut down the Data Prepper pipeline | ||
logging.info("Migration monitor observed successful migration and idle pipeline, shutting down...\n") | ||
shutdown_pipeline(endpoint) | ||
shutdown_pipeline(endpoint_info) | ||
|
||
|
||
if __name__ == '__main__': # pragma no cover | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can your code confirm that Data Prepper is bound to only localhost? I could see a lot of difficulty happening if a second managing agent came along and started acting on it — or if somebody else DDoS'ed the API (while it was putting a lot of load on the source cluster).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, this code needs an update to stop accepting `dp_endpoint`. Currently the `localhost` configuration is driven only by the Dockerfile (opensearch-migrations/FetchMigration/Dockerfile, line 22 in commit 2ec00c0).
This is obviously not a strong check. I'll make that change in a follow-up PR.