-
Notifications
You must be signed in to change notification settings - Fork 372
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Sentry Nodes lagging behind validator in consensus process #2430
Comments
Posting also the config.toml:
# Mechanism to connect to the ABCI application: socket | grpc
abci = "socket"
# Database backend: goleveldb | boltdb
# * goleveldb (github.com/syndtr/goleveldb - most popular implementation)
#   - pure go
#   - stable
# * boltdb (uses etcd's fork of bolt - go.etcd.io/bbolt)
#   - EXPERIMENTAL
#   - may be faster in some use-cases (random reads - indexer)
#   - use boltdb build tag (go build -tags boltdb)
db_backend = "goleveldb"
# Database directory
db_dir = "db"
# If this node is many blocks behind the tip of the chain, FastSync
# allows it to catch up quickly by downloading blocks in parallel
# and verifying their commits
fast_sync = true
# If true, query the ABCI app on connecting to a new peer
# so the app can decide if we should keep the connection or not
filter_peers = false
# NOTE(review): empty value presumably falls back to the default node home — confirm
home = ""
# A custom human readable name for this node
moniker = "val1"
# Path to the JSON file containing the private key to use for node authentication in the p2p protocol
node_key_file = "secrets/node_key.json"
# Path to the JSON file containing the private key to use as a validator in the consensus protocol
priv_validator_key_file = "secrets/priv_validator_key.json"
# TCP or UNIX socket address for Tendermint to listen on for
# connections from an external PrivValidator process
priv_validator_laddr = ""
# Path to the JSON file containing the last sign state of a validator
priv_validator_state_file = "priv_validator_state.json"
# TCP or UNIX socket address for the profiling server to listen on
# (empty disables the profiling server — assumed; confirm)
prof_laddr = ""
# TCP or UNIX socket address of the ABCI application,
# or the name of an ABCI application compiled in with the Tendermint binary
proxy_app = "tcp://127.0.0.1:26658"
##### consensus configuration options #####
[consensus]
# EmptyBlocks mode and possible interval between empty blocks
create_empty_blocks = true
create_empty_blocks_interval = "0s"
# NOTE(review): empty value presumably falls back to the default node home — confirm
home = ""
# Reactor sleep duration parameters
peer_gossip_sleep_duration = "100ms"
peer_query_maj23_sleep_duration = "2s"
# Make progress as soon as we have all the precommits (as if TimeoutCommit = 0)
skip_timeout_commit = false
# Per-step consensus timeouts; the *_delta values presumably extend the
# corresponding timeout on each additional round — confirm against the docs
timeout_commit = "1s"
timeout_precommit = "1s"
timeout_precommit_delta = "500ms"
timeout_prevote = "1s"
timeout_prevote_delta = "500ms"
timeout_propose = "3s"
timeout_propose_delta = "500ms"
# Location of the consensus write-ahead log
wal_file = "wal/cs.wal/wal"
##### mempool configuration options #####
[mempool]
# Gossip mempool transactions to peers (assumed — confirm)
broadcast = true
# Size of the cache (used to filter transactions we saw earlier) in transactions
cache_size = 10000
# NOTE(review): empty value presumably falls back to the default node home — confirm
home = ""
# Limit the total size of all txs in the mempool.
# This only accounts for raw transactions (e.g. given 1MB transactions and
# max_pending_txs_bytes=5MB, mempool will only accept 5 transactions).
max_pending_txs_bytes = 1073741824
# Re-run CheckTx on remaining mempool txs after each block commit (assumed — confirm)
recheck = true
# Maximum number of transactions in the mempool
size = 5000
# Directory for the mempool write-ahead log; empty presumably disables it — confirm
wal_dir = ""
##### peer to peer configuration options #####
[p2p]
# Toggle to disable guard against peers connecting from the same ip.
allow_duplicate_ip = false
# Timeout for dialing an outbound peer connection (assumed — confirm)
dial_timeout = "3s"
# Address to advertise to peers for them to dial
# If empty, will use the same port as the laddr,
# and will introspect on the listener or use UPnP
# to figure out the address.
external_address = ""
# Time to wait before flushing messages out on the connection
flush_throttle_timeout = "10ms"
# Peer connection configuration.
handshake_timeout = "20s"
# NOTE(review): empty value presumably falls back to the default node home — confirm
home = ""
# Address to listen for incoming connections
laddr = "tcp://gnodevx-gnoland-val1-0:26656"
# Maximum number of inbound peers
max_num_inbound_peers = 40
# Maximum number of outbound peers to connect to, excluding persistent peers
max_num_outbound_peers = 10
# Maximum size of a message packet payload, in bytes
max_packet_msg_payload_size = 10240
# Comma separated list of nodes to keep persistent connections to
# (here: the two sentry nodes this validator hides behind)
persistent_peers = "g1pc09jrzqu2j87syp6n9k4v35kphe4x56hawkt7@gnodevx-gnoland-sen1-headless.gnoland:26656,g1phpp92d4a60376yr4vpfff2q4a9gh4m8yf09hr@gnodevx-gnoland-sen2-headless.gnoland:26656"
# Set true to enable the peer-exchange reactor
# (disabled on a validator in a sentry-node setup)
pex = false
# Comma separated list of peer IDs to keep private (will not be gossiped to other peers)
private_peer_ids = ""
# Rate at which packets can be received, in bytes/second
recv_rate = 20000000
# Seed mode, in which node constantly crawls the network and looks for
# peers. If another node asks it for addresses, it responds and disconnects.
#
# Does not work if the peer-exchange reactor is disabled.
seed_mode = false
# Issue: https://github.com/gnolang/gno/issues/2308
# Comma separated list of seed nodes to connect to
seeds = ""
# Rate at which packets can be sent, in bytes/second
send_rate = 20000000
# Testing-only knobs: simulate dial failures / fuzz p2p connections.
# Leave false outside of tests.
test_dial_fail = false
test_fuzz = false
# UPNP port forwarding
upnp = false
# Parameters for p2p connection fuzzing (presumably only used when test_fuzz = true — confirm)
[p2p.test_fuzz_config]
MaxDelay = "3s"
Mode = 0
ProbDropConn = 0.0
ProbDropRW = 0.2
ProbSleep = 0.0
##### rpc server configuration options #####
[rpc]
# A list of non-simple headers the client is allowed to use with cross-domain requests
cors_allowed_headers = ["Origin", "Accept", "Content-Type", "X-Requested-With", "X-Server-Time"]
# A list of methods the client is allowed to use with cross-domain requests
cors_allowed_methods = ["HEAD", "GET", "POST", "OPTIONS"]
# A list of origins a cross-domain request can be executed from
# Default value '[]' disables cors support
# Use '["*"]' to allow any origin
cors_allowed_origins = ["*"]
# TCP or UNIX socket address for the gRPC server to listen on
# NOTE: This server only supports /broadcast_tx_commit
grpc_laddr = ""
# Maximum number of simultaneous connections.
# Does not include RPC (HTTP&WebSocket) connections. See max_open_connections
# If you want to accept a larger number than the default, make sure
# you increase your OS limits.
# 0 - unlimited.
# Should be < {ulimit -Sn} - {MaxNumInboundPeers} - {MaxNumOutboundPeers} - {N of wal, db and other open files}
# 1024 - 40 - 10 - 50 = 924 = ~900
grpc_max_open_connections = 900
# NOTE(review): empty value presumably falls back to the default node home — confirm
home = ""
# TCP or UNIX socket address for the RPC server to listen on
laddr = "tcp://gnodevx-gnoland-val1-0:26657"
# Maximum size of request body, in bytes
max_body_bytes = 1000000
# Maximum size of request header, in bytes
max_header_bytes = 1048576
# Maximum number of simultaneous connections (including WebSocket).
# Does not include gRPC connections. See grpc_max_open_connections
# If you want to accept a larger number than the default, make sure
# you increase your OS limits.
# 0 - unlimited.
# Should be < {ulimit -Sn} - {MaxNumInboundPeers} - {MaxNumOutboundPeers} - {N of wal, db and other open files}
# 1024 - 40 - 10 - 50 = 924 = ~900
max_open_connections = 900
# How long to wait for a tx to be committed during /broadcast_tx_commit.
# WARNING: Using a value larger than 10s will result in increasing the
# global HTTP write timeout, which applies to all connections and endpoints.
# See https://github.com/tendermint/classic/issues/3435
timeout_broadcast_tx_commit = "10s"
# The path to a file containing certificate that is used to create the HTTPS server.
# Might be either absolute path or path related to tendermint's config directory.
# If the certificate is signed by a certificate authority,
# the certFile should be the concatenation of the server's certificate, any intermediates,
# and the CA's certificate.
# NOTE: both tls_cert_file and tls_key_file must be present for Tendermint to create HTTPS server. Otherwise, HTTP server is run.
tls_cert_file = ""
# The path to a file containing matching private key that is used to create the HTTPS server.
# Might be either absolute path or path related to tendermint's config directory.
# NOTE: both tls_cert_file and tls_key_file must be present for Tendermint to create HTTPS server. Otherwise, HTTP server is run.
tls_key_file = ""
# Activate unsafe RPC commands like /dial_seeds and /unsafe_flush_mempool
unsafe = false
##### node telemetry #####
[telemetry]
enabled = true
# the endpoint to export metrics to, like a local OpenTelemetry collector
exporter_endpoint = "grafana-k8s-monitoring-alloy.grafana-system.svc.cluster.local:4317"
# Identifiers under which metrics are reported
# (presumably the OpenTelemetry meter and service names — confirm)
meter_name = "gnodevx"
service_name = "gnodevx"
##### event store #####
[tx_event_store]
# Type of event store ("none" presumably disables tx event persistence — confirm)
event_store_type = "none"
# Event store parameters
[tx_event_store.event_store_params]
Only sentry2 is running in sentry mode, right?
@r3v4s I have 2 nodes in sentry mode and 1 node as validator.
|
@r3v4s recommended that in the case of having multiple validators the Sentry nodes would catch up. I have tested the scenario of having 2 sentries and 2 validators and indeed the issue of lagging behind got solved.
@zivkovicmilos Although the issue got solved, I think it is still quite crucial to explore the root cause. The scenario of having a couple of sentries in front of one unique validator is not a rare scenario but actually quite common.
I had to reread this twice, and pinch myself. Is this actually real? As far as I'm aware, we don't have any smart protocol logic for faster sentry node sync 🫠
I'll make some tests on my side and see how it's working. As a reminder, here is the config for sentry node architecture:

Validator node configuration:
pex = false
persistent_peers = [ "list of sentry nodes" ]
addr_book_strict = false

Sentry node configuration:
pex = true
persistent_peers = [ "validator node, optionally other sentry nodes" ]
private_peer_ids = "validator node id"
addr_book_strict = false
@albttx the |
Yes, since addrbook.json has been removed 😅 |
As you can see in my pr #2438 , i made a 3 validators cluster with 1 validator is sentry mode system, and it's working just fine :) |
Oh sorry, question was |
Not quite sure yet; I've had a similar experience. Adding a secondary validator did fix it, so I just recommended adding another one. I'll dig into this issue.
@albttx the setup you have brought up is different from the one here described in this scenario. I've brought up the scenario described in the issue with your setup and I can replicate the issue. A graph here for the sake of clearness: graph TD;
validator:val1-->sentry:sen1;
validator:val1-->sentry:sen2;
Also to be precise. In this setup validator |
I just tested your system, it's working for me... Using my PR, here is the docker-compose.yml
ps: of course, you need to edit the genesis.json and have only gno3 as validator |
Just tried with gno3-sentry-1 and gno3-sentry-2 having gno3 as persistent_peers (useless since there is already the connection on the other side), and it works. My guess would be that you have either an issue in your
Very first config.toml (from sentry) has configs of..
Can you check if your |
|
Error discovered working on #2430 On `/status` all node are showed with `voting_power=1` ![image](https://github.com/gnolang/gno/assets/8089712/09b3bd4c-3060-4681-a7a1-82c425734d8e) This fixes it :)
After a call with @mazzy89 , we agreed on: the log
ps: This code is still in tendermint and cometbft:
We didn't make the test, but almost sure we could replicate it there with the same infrastructure. |
Error discovered working on gnolang#2430 On `/status` all node are showed with `voting_power=1` ![image](https://github.com/gnolang/gno/assets/8089712/09b3bd4c-3060-4681-a7a1-82c425734d8e) This fixes it :)
Sentry Nodes lagging behind validator in consensus process
Description
I have configured a network having 2 Sentry nodes and 1 validator. Beyond the default values and the values necessary to set up a node as a Sentry, here a copy of the
config.toml
for a Sentry node:The Sentry nodes, which run as a full-node, are left behind compared to the validator.
Your environment
Steps to reproduce
Spin up a network from scratch crafting a genesis.
Expected behaviour
Sentry nodes will commit newer blocks right after the validator commits blocks following consensus.
Actual behaviour
The validator gossips the transactions and finishes consensus faster than the sentry. The sentry gets the commit but not yet the block, so it cannot continue because it does not know the block.
Logs
Proposed solution
cc @zivkovicmilos
The text was updated successfully, but these errors were encountered: