Let bsub retry on LSF message "Request from non-LSF host rejected" #9195

Merged · 7 commits · Nov 13, 2024
2 changes: 1 addition & 1 deletion .github/workflows/test_ert.yml
@@ -50,7 +50,7 @@ jobs:
- name: CLI Test
if: inputs.test-type == 'cli-tests'
run: |
-pytest --cov=ert --cov-report=xml:cov1.xml --junit-xml=junit.xml -n logical -v --benchmark-disable --dist loadgroup tests/ui_tests/cli
+pytest --cov=ert --cov-report=xml:cov1.xml --junit-xml=junit.xml -o junit_family=legacy -n logical --maxprocesses=2 -v --benchmark-disable --dist loadgroup tests/ui_tests/cli

- name: Unit Test
if: inputs.test-type == 'unit-tests'
7 changes: 3 additions & 4 deletions ci/testkomodo.sh
@@ -15,7 +15,7 @@ install_test_dependencies () {
pip install ".[dev]"
}

-run_ert_with_opm () {
+run_ert_with_opm() {
pushd "${CI_TEST_ROOT}"

cp -r "${CI_SOURCE_ROOT}/test-data/flow_example" ert_with_opm
@@ -24,7 +24,7 @@ run_ert_with_opm () {
ert test_run flow.ert ||
(
# In case ert fails, print log files if they are there:
-cat spe1_out/realization-0/iter-0/STATUS || true
+cat spe1_out/realization-0/iter-0/STATUS || true
cat spe1_out/realization-0/iter-0/ERROR || true
cat spe1_out/realization-0/iter-0/FLOW.stderr.0 || true
cat spe1_out/realization-0/iter-0/FLOW.stdout.0 || true
@@ -41,7 +41,7 @@ start_tests () {
pushd ${CI_TEST_ROOT}/tests

# Run all ert tests except tests evaluating memory consumption and tests requiring windows manager (GUI tests)
-pytest --eclipse-simulator -n logical --show-capture=stderr -v --max-worker-restart 0 \
+pytest --eclipse-simulator -n auto --show-capture=stderr -v --max-worker-restart 0 \
-m "not limit_memory and not requires_window_manager" --benchmark-disable --dist loadgroup
return_code_ert_main_tests=$?

@@ -72,7 +72,6 @@ start_tests () {

set -e


return_code_combined_tests=0
# We error if one or more returncodes are nonzero
if [ "$return_code_ert_main_tests" -ne 0 ]; then
4 changes: 2 additions & 2 deletions pyproject.toml
@@ -60,7 +60,7 @@ dependencies = [
"python-dateutil",
"python-multipart",
"pyyaml",
"qtpy",
"qtpy==2.4.1",
"requests",
"resfo",
"scipy >= 1.10.1",
@@ -71,7 +71,7 @@ dependencies = [
"tqdm>=4.62.0",
"typing_extensions>=4.5",
"uvicorn >= 0.17.0",
"websockets",
"websockets < 14",
"xarray",
"xtgeo >= 3.3.0",
]
9 changes: 8 additions & 1 deletion src/ert/scheduler/lsf_driver.py
@@ -94,7 +94,14 @@ class RunningJob:
LSF_INFO_JSON_FILENAME = "lsf_info.json"
FLAKY_SSH_RETURNCODE = 255
JOB_ALREADY_FINISHED_BKILL_MSG = "Job has already finished"
BSUB_FAILURE_MESSAGES = ("Job not submitted",)
BSUB_FAILURE_MESSAGES = (
"Error in rusage section",
"Expeced number, string",
"No such queue",
"Too many processors requested",
"cannot be used in the resource requirement section",
"duplicate section",
)


def _parse_jobs_dict(jobs: Mapping[str, JobState]) -> dict[str, AnyJob]:
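The new tuple enumerates bsub error messages that are permanent, so retrying the same submission cannot help, while everything else, notably the transient "Request from non-LSF host rejected" message this PR targets, is now retried. As a rough illustration only (the helper name and structure below are hypothetical and not the actual lsf_driver API), the screening could look like this:

```python
# Hypothetical sketch, not the actual ert scheduler code: screen bsub output
# so permanent configuration errors fail fast while transient errors are retried.
FLAKY_SSH_RETURNCODE = 255
PERMANENT_BSUB_MESSAGES = (
    "Error in rusage section",
    "No such queue",
    "Too many processors requested",
)


def should_retry_bsub(returncode: int, stderr: str) -> bool:
    """Return True if a failed bsub invocation looks worth retrying."""
    if returncode == 0:
        return False  # submission succeeded, nothing to retry
    if any(message in stderr for message in PERMANENT_BSUB_MESSAGES):
        return False  # permanent error, the same request will fail again
    # Everything else is treated as transient, e.g. returncode 255 from a
    # flaky ssh hop or "Request from non-LSF host rejected".
    return True
```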
25 changes: 21 additions & 4 deletions tests/unit_tests/forward_model_runner/test_job.py
@@ -122,7 +122,7 @@ def max_memory_per_subprocess_layer(layers: int) -> int:
job = Job(
{
"executable": executable,
"argList": [str(layers), str(int(1e6))],
"argList": [str(layers), str(int(1e7))],
},
0,
)
@@ -144,7 +144,8 @@ def max_memory_per_subprocess_layer(layers: int) -> int:
assert max_seens[1] + memory_per_numbers_list < max_seens[2]


-@pytest.mark.flaky(reruns=3)
+@pytest.mark.integration_test
+@pytest.mark.flaky(reruns=5)
@pytest.mark.usefixtures("use_tmpdir")
def test_memory_profile_in_running_events():
scriptname = "increasing_memory.py"
@@ -190,10 +191,26 @@ def test_memory_profile_in_running_events():
# Avoid the tail of the array, then the process is tearing down
).all(), f"Emitted memory usage not increasing, got {emitted_rss_values[:-3]=}"

+memory_deltas = np.diff(np.array(emitted_rss_values[7:]))
+if not len(memory_deltas):
+    # This can happen if memory profiling is lagging behind the process
+    # we are trying to track.
+    memory_deltas = np.diff(np.array(emitted_rss_values[2:]))
+
+lenience_factor = 4
+# Ideally this is 1 which corresponds to being able to track every memory
+# allocation perfectly. But on loaded hardware, some of the allocations can be
+# missed due to process scheduling. Bump as needed.
+
assert (
-    np.diff(np.array(emitted_rss_values[7:])).max() < 3 * 1024 * 1024
+    max(memory_deltas) < lenience_factor * 1024 * 1024
    # Avoid the first steps, which includes the Python interpreters memory usage
-), f"Memory increased too sharply, missing a measurement? Got {emitted_rss_values[7:]=}"
+), (
+    "Memory increased too sharply, missing a measurement? "
+    f"Got {emitted_rss_values=} with selected diffs {memory_deltas}. "
+    "If the maximal number is at the beginning, it is probably the Python process "
+    "startup that is tracked."
+)

if sys.platform.startswith("darwin"):
# No oom_score on MacOS
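The reworked assertion simply bounds the largest jump between consecutive RSS samples. A self-contained sketch of that check, using made-up sample values rather than the event stream the real test consumes:

```python
import numpy as np

# Made-up RSS samples in bytes; the real test collects them from the emitted
# running events while the tracked script allocates roughly 1 MB per step.
emitted_rss_values = [30.0e6, 30.9e6, 31.8e6, 32.9e6, 33.7e6, 34.8e6]

# Prefer skipping the first samples (interpreter startup); fall back to a
# shorter skip if profiling lagged behind and few samples were captured.
memory_deltas = np.diff(np.array(emitted_rss_values[7:]))
if not len(memory_deltas):
    memory_deltas = np.diff(np.array(emitted_rss_values[2:]))

lenience_factor = 4  # tolerate a few missed samples on loaded hardware
assert max(memory_deltas) < lenience_factor * 1024 * 1024
```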
8 changes: 2 additions & 6 deletions tests/unit_tests/scheduler/test_generic_driver.py
@@ -162,13 +162,9 @@ async def test_kill_actually_kills(driver: Driver, tmp_path, pytestconfig):
# Allow more time when tested on a real compute cluster to avoid false positives.
job_kill_window = 60
test_grace_time = 120
-elif sys.platform.startswith("darwin"):
-    # Mitigate flakiness on low-power test nodes
-    job_kill_window = 5
-    test_grace_time = 8
else:
-    job_kill_window = 1
-    test_grace_time = 2
+    job_kill_window = 5  # Busy test nodes require a long kill window
+    test_grace_time = 8

async def kill_job_once_started(iens):
nonlocal driver
3 changes: 2 additions & 1 deletion tests/unit_tests/scheduler/test_lsf_driver.py
@@ -578,7 +578,6 @@ async def test_that_bsub_will_retry_and_fail(
" '&' cannot be used in the resource requirement section. Job not submitted.",
),
(255, "Error in rusage section. Job not submitted."),
(255, "Job not submitted."),
],
)
async def test_that_bsub_will_fail_without_retries(
@@ -604,6 +603,8 @@ async def test_that_bsub_will_fail_without_retries(
[
(0, "void"),
(FLAKY_SSH_RETURNCODE, ""),
(0, "Request from non-LSF host rejected"),
(FLAKY_SSH_RETURNCODE, "Request from non-LSF host rejected"),
],
)
async def test_that_bsub_will_retry_and_succeed(
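The two new parametrizations assert that a submission whose output contains "Request from non-LSF host rejected" is retried until it succeeds, both with exit code 0 and with the flaky-ssh exit code 255. The real test exercises this through the suite's mocked bsub; purely as an illustration (the helper below is hypothetical and not part of the test suite), a fake bsub that rejects the first attempt and accepts the second could be written like this:

```python
import stat
import subprocess
from pathlib import Path


def write_flaky_bsub(directory: Path) -> Path:
    """Write a fake bsub that fails once with the transient LSF message."""
    script = directory / "bsub"
    marker = directory / "attempted"
    script.write_text(
        "#!/bin/sh\n"
        f'if [ ! -f "{marker}" ]; then\n'
        f'  touch "{marker}"\n'
        "  echo 'Request from non-LSF host rejected' >&2\n"
        "  exit 255\n"
        "fi\n"
        "echo 'Job <1> is submitted to default queue <normal>.'\n"
    )
    script.chmod(script.stat().st_mode | stat.S_IEXEC)
    return script


if __name__ == "__main__":
    fake_bsub = write_flaky_bsub(Path.cwd())
    for attempt in (1, 2):
        # First call is rejected, second succeeds; a retrying driver would
        # therefore end up with a submitted job.
        result = subprocess.run([str(fake_bsub)], capture_output=True, text=True)
        print(attempt, result.returncode, (result.stdout or result.stderr).strip())
```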