Skip to content

Commit

Permalink
Consistently reproduce crash and improve test reliability
Browse files Browse the repository at this point in the history
- Add test to consistently reproduce rdb load callback crash
- Avoid checking close_asap when no data was processed

Signed-off-by: naglera <[email protected]>
  • Loading branch information
naglera committed Oct 21, 2024
1 parent 2479711 commit 6f9d737
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 5 deletions.
8 changes: 4 additions & 4 deletions src/rdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -2929,14 +2929,14 @@ int rdbLoadProgressCallback(rio *r, const void *buf, size_t len) {
loadingAbsProgress(r->processed_bytes);
processEventsWhileBlocked();
processModuleLoadingProgressEvent(0);
if (server.repl_provisional_primary.close_asap == 1) {
serverLog(LL_WARNING, "Primary main connection dropped during RDB load callback");
return -1;
}
}
if (server.repl_state == REPL_STATE_TRANSFER && rioCheckType(r) == RIO_TYPE_CONN) {
server.stat_net_repl_input_bytes += len;
}
if (server.repl_provisional_primary.close_asap == 1) {
serverLog(LL_WARNING, "Primary main connection dropped during RDB load callback");
return -1;
}
return 0;
}

Expand Down
66 changes: 65 additions & 1 deletion tests/integration/dual-channel-replication.tcl
Original file line number Diff line number Diff line change
Expand Up @@ -1174,7 +1174,7 @@ start_server {tags {"dual-channel-replication external:skip"}} {
}

$primary debug log "killing replica main connection"
set replica_main_conn_id [get_client_id_by_last_cmd $primary "sync"]
set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
assert {$replica_main_conn_id != ""}
set loglines [count_log_lines -1]
$primary client kill id $replica_main_conn_id
Expand All @@ -1197,3 +1197,67 @@ start_server {tags {"dual-channel-replication external:skip"}} {
stop_write_load $load_handle
}
}


# Regression test for https://github.com/valkey-io/valkey/issues/1152
#
# Scenario: with dual-channel replication enabled, the primary kills the
# replica's main (psync) connection while the replica is loading the RDB it
# receives over the rdb-channel. The replica is expected to log the drop from
# its RDB load callback and then start a new sync attempt.
start_server {tags {"dual-channel-replication external:skip"}} {
    set primary [srv 0 client]
    set primary_host [srv 0 host]
    set primary_port [srv 0 port]

    $primary config set repl-diskless-sync yes
    $primary config set dual-channel-replication-enabled yes
    $primary config set loglevel debug
    # Delay the start of the next diskless sync so the failed attempt can be
    # observed before the replica's retry kicks in.
    $primary config set repl-diskless-sync-delay 5

    # Populate a large dataset and slow down RDB generation so the sync is
    # still in progress when the main connection gets killed.
    # NOTE(review): the previous comment claimed the RDB takes ~100 sec to
    # generate; a negative rdb-key-save-delay is presumably not a fixed
    # per-key sleep, so confirm the real duration against the server code.
    $primary debug populate 1000000 primary 1
    $primary config set rdb-key-save-delay -1000

    start_server {} {
        # Inside this nested block level 0 is the replica and level -1 is the
        # primary started above.
        set replica [srv 0 client]
        set replica_host [srv 0 host]
        set replica_port [srv 0 port]
        set replica_log [srv 0 stdout]

        $replica config set dual-channel-replication-enabled yes
        $replica config set loglevel debug
        $replica config set repl-timeout 10
        $replica config set repl-diskless-load flush-before-load

        test "Replica notice main-connection killed during rdb load callback" {
            # Remember the replica's log position before the sync starts.
            set loglines [count_log_lines 0]
            $replica replicaof $primary_host $primary_port
            # Wait for sync session to start: both channels must be visible
            # on the primary and its bgsave must be running.
            wait_for_condition 500 1000 {
                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
                [s -1 rdb_bgsave_in_progress] eq 1
            } else {
                fail "replica didn't start sync session in time"
            }
            # Make sure the replica has actually begun loading the RDB.
            wait_for_log_messages 0 {"*Loading RDB produced by Valkey version*"} $loglines 1000 10
            $primary set key val

            # Validate the connection id before using it anywhere, so a
            # failed lookup is reported by the assert rather than by a
            # confusing log line or a malformed CLIENT KILL.
            set replica_main_conn_id [get_client_id_by_last_cmd $primary "psync"]
            assert {$replica_main_conn_id != ""}
            $primary debug log "killing replica main connection $replica_main_conn_id"

            # Re-sample the replica's log position so the wait below only
            # matches messages written after the kill.
            set loglines [count_log_lines 0]
            $primary client kill id $replica_main_conn_id
            # Wait for primary to abort the sync
            wait_for_condition 50 1000 {
                [string match {*replicas_waiting_psync:0*} [$primary info replication]]
            } else {
                fail "Primary did not free repl buf block after sync failure"
            }
            wait_for_log_messages 0 {"*Primary main connection dropped during RDB load callback*"} $loglines 1000 10
            # Replica should retry
            wait_for_condition 500 1000 {
                [string match "*slave*,state=wait_bgsave*,type=rdb-channel*" [$primary info replication]] &&
                [string match "*slave*,state=bg_transfer*,type=main-channel*" [$primary info replication]] &&
                [s -1 rdb_bgsave_in_progress] eq 1
            } else {
                fail "replica didn't retry after connection close"
            }
        }
    }
}

0 comments on commit 6f9d737

Please sign in to comment.