From 3df22e2907aa41cee1ae6b95f59c643bdf81b16b Mon Sep 17 00:00:00 2001
From: Giulio Calacoci
Date: Mon, 31 Jul 2023 16:41:04 +0200
Subject: [PATCH] Add a new primary_checkpoint_timeout configuration option

If `primary_conninfo` is set, it is possible to add a
`primary_checkpoint_timeout` option for a server. This is the maximum
time (in seconds) for Barman to wait for a new WAL file to be produced
before forcing the execution of a checkpoint on the primary.

Signed-off-by: Giulio Calacoci
---
 barman/config.py                              |  3 +
 barman/postgres.py                            | 29 ++++++++++
 barman/server.py                              |  1 +
 .../50-primary_checkpoint_timeout.md          | 13 +++++
 doc/manual/50-feature-details.en.md           | 58 +++++++++++--------
 tests/test_config.py                          |  2 +
 tests/test_postgres.py                        | 52 +++++++++++++++++
 tests/testing_helpers.py                      |  1 +
 8 files changed, 136 insertions(+), 23 deletions(-)
 create mode 100644 doc/barman.5.d/50-primary_checkpoint_timeout.md

diff --git a/barman/config.py b/barman/config.py
index 537a993a4..91695cf12 100644
--- a/barman/config.py
+++ b/barman/config.py
@@ -491,6 +491,7 @@ class ServerConfig(object):
         "pre_recovery_script",
         "pre_wal_delete_script",
         "pre_wal_delete_retry_script",
+        "primary_checkpoint_timeout",
         "primary_conninfo",
         "primary_ssh_command",
         "recovery_options",
@@ -615,6 +616,7 @@ class ServerConfig(object):
         "parallel_jobs": "1",
         "parallel_jobs_start_batch_period": "1",
         "parallel_jobs_start_batch_size": "10",
+        "primary_checkpoint_timeout": "0",
         "recovery_options": "",
         "create_slot": "manual",
         "retention_policy_mode": "auto",
@@ -658,6 +660,7 @@ class ServerConfig(object):
         "parallel_jobs": int,
         "parallel_jobs_start_batch_period": int,
         "parallel_jobs_start_batch_size": int,
+        "primary_checkpoint_timeout": int,
         "recovery_options": RecoveryOptions,
         "recovery_staging_path": parse_recovery_staging_path,
         "create_slot": parse_create_slot,
diff --git a/barman/postgres.py b/barman/postgres.py
index 1cf519ea2..c6ebf652d 100644
--- a/barman/postgres.py
+++ b/barman/postgres.py
@@ -21,6 +21,7 @@
 """
 
 import atexit
+import datetime
 import logging
 from abc import ABCMeta
 from multiprocessing import Process, Queue
@@ -1581,6 +1582,7 @@ def __init__(
         primary_conninfo,
         immediate_checkpoint=False,
         slot_name=None,
+        primary_checkpoint_timeout=0,
         application_name="barman",
     ):
         """
@@ -1606,6 +1608,7 @@ def __init__(
         # The standby needs a connection to the primary so that it can
         # perform WAL switches itself when calling pg_backup_stop.
         self.primary = PostgreSQLConnection(self.primary_conninfo)
+        self.primary_checkpoint_timeout = primary_checkpoint_timeout
 
     def close(self):
         """Close the connection to PostgreSQL."""
@@ -1658,6 +1661,32 @@ def switch_wal_in_background(self, done_q, times=10, wait=10):
                 except Empty:
                     # An empty queue just means we haven't yet been told to stop
                     pass
+            if self.primary_checkpoint_timeout:
+                _logger.warning(
+                    "Barman attempted to switch WALs %s times on the primary "
+                    "server, but the backup has not yet completed. "
+                    "A checkpoint will be forced on the primary server "
+                    "in %s seconds to ensure the backup can complete."
+                    % (times, self.primary_checkpoint_timeout)
+                )
+                sleep_time = datetime.datetime.now() + datetime.timedelta(
+                    seconds=self.primary_checkpoint_timeout
+                )
+                while True:
+                    try:
+                        # Always check if the queue is empty, so we know to stop
+                        # before the checkpoint execution
+                        if done_q.get(timeout=wait):
+                            return
+                    except Empty:
+                        # If the queue is empty, we can proceed to the checkpoint
+                        # if enough time has passed
+                        if sleep_time < datetime.datetime.now():
+                            self.primary.checkpoint()
+                            self.primary.switch_wal()
+                            break
+            # break out of the loop after the checkpoint and wal switch
+            # execution. The connection will be closed in the finally statement
         finally:
             # Close the connection since only this subprocess will ever use it
             self.primary.close()
diff --git a/barman/server.py b/barman/server.py
index c6be673e1..67102546d 100644
--- a/barman/server.py
+++ b/barman/server.py
@@ -295,6 +295,7 @@ def _init_postgres(self, config):
                 config.primary_conninfo,
                 config.immediate_checkpoint,
                 config.slot_name,
+                config.primary_checkpoint_timeout,
             )
         else:
             self.postgres = PostgreSQLConnection(
diff --git a/doc/barman.5.d/50-primary_checkpoint_timeout.md b/doc/barman.5.d/50-primary_checkpoint_timeout.md
new file mode 100644
index 000000000..5a32f285c
--- /dev/null
+++ b/doc/barman.5.d/50-primary_checkpoint_timeout.md
@@ -0,0 +1,13 @@
+primary_checkpoint_timeout
+:   This defines the number of seconds that Barman will wait at the end of a
+    backup if no new WAL files are produced, before forcing a checkpoint on
+    the primary server.
+
+    If not set or set to 0, Barman will not force a checkpoint on the primary,
+    and will wait indefinitely for new WAL files to be produced.
+
+    The value of this option should be greater than the value of the
+    `archive_timeout` set on the primary server.
+
+    This option works only if the `primary_conninfo` option is set, and it is
+    ignored otherwise.
diff --git a/doc/manual/50-feature-details.en.md b/doc/manual/50-feature-details.en.md
index 75e68ac59..bd2e6900b 100644
--- a/doc/manual/50-feature-details.en.md
+++ b/doc/manual/50-feature-details.en.md
@@ -100,7 +100,6 @@ tablespace in the above option. If found, the specified bandwidth limit
 will be enforced. If not, the default bandwidth limit for that server
 will be applied.
 
-
 ### Network Compression
 
 It is possible to reduce the size of transferred data using
@@ -118,8 +117,8 @@ Setting this option to `true` will enable data compression during
 network transfers (for both backup and recovery). By default it is set
 to `false`.
 
-
 ### Backup Compression
+
 Barman can use the compression features of pg_basebackup in order to
 compress the backup data during the backup process. This can be enabled
 using the `backup_compression` config option (global/per server):
@@ -128,9 +127,10 @@ using the `backup_compression` config option (global/per server):
 > in this section are not available with the `rsync` or `local-rsync`
 > backup methods. Only with `postgres` backup method.
 
-#### Compression algorithms 
+#### Compression algorithms
+
 Setting this option will cause pg_basebackup to compress the backup
-using the specified compression algorithm. Currently, supported 
+using the specified compression algorithm. Currently, supported
 algorithm in Barman are: `gzip` `lz4` and `zstd`.
 
 ``` ini
@@ -138,7 +138,7 @@ backup_compression = gzip|lz4|zstd
 ```
 
 Barman requires the CLI utility for the selected compression algorithm
-to be available on both the Barman server *and* the PostgreSQL server.
+to be available on both the Barman server _and_ the PostgreSQL server.
 The CLI utility is used to extract the backup label from the
 compressed backup and to decompress the backup on the PostgreSQL server during
 recovery. These can be installed through system packages named `gzip`,
@@ -161,23 +161,26 @@ recovery. These can be installed through system packages named `gzip`,
 > section for more information.
 
 #### Compression workers
-This optional parameter allows compression using multiple threads to increase compression speed (default being 0). 
+
+This optional parameter allows compression using multiple threads to increase compression speed (default being 0).
 
 ```ini
 backup_compression_workers = 2
 ```
 
 > **Note:** This option is only available with `zstd` compression.
- 
+
 > **Note:** `zstd` version must be 1.5.0 or higher. Or 1.4.4 or higher compiled with multithreading option.
 
 #### Compression level
+
 The compression level can be specified using the `backup_compression_level`
 option. This should be set to an integer value supported by the compression
 algorithm specified in `backup_compression`.
 
 #### Compression location
+
 When using Barman with PostgreSQL version 15 or higher it is possible
 to specify for compression to happen on the server (i.e. PostgreSQL
 will compress the backup) or on the client (i.e. pg_basebackup
@@ -201,6 +204,7 @@ in order to have pg_basebackup uncompress the data before writing it to disk:
 
 
 #### Compression format
+
 ``` ini
 backup_compression_format = plain|tar
 ```
@@ -209,20 +213,20 @@ If `backup_compression_format` is unset or has the value `tar` then the
 backup will be written to disk as compressed tarballs. A description
 of both the `plain` and `tar` formats can be found in the [pg_basebackup
 documentation][pg_basebackup-documentation].
- 
+
 > **IMPORTANT:** Barman uses external tools to manage compressed backups.
 > Depending on the `backup_compression` and `backup_compression_format`
-> You may need to install one or more tools on the Postgres server and 
+> You may need to install one or more tools on the Postgres server and
 > the Barman server.
 > The following table will help you choose according to your configuration.
 
-| **backup_compression** | **backup_compression_format** | **Postgres server** | **Barman server** | 
+| **backup_compression** | **backup_compression_format** | **Postgres server** | **Barman server** |
 |:---------:|:---------------------:|:-------------------------:|:----------------------:|
-| gzip | plain | **tar** | None | 
-| gzip | tar | **tar** | **tar** | 
-| lz4 | plain | **tar, lz4** | None | 
+| gzip | plain | **tar** | None |
+| gzip | tar | **tar** | **tar** |
+| lz4 | plain | **tar, lz4** | None |
 | lz4 | tar | **tar, lz4** | **tar, lz4** |
-| zstd | plain | **tar, zstd** | None | 
+| zstd | plain | **tar, zstd** | None |
 | zstd | tar | **tar, zstd** | **tar, zstd** |
 
 ### Concurrent backup
@@ -285,10 +289,19 @@ having to wait for a WAL switch to occur naturally.
 
 > **NOTE:** It is especially important that `primary_conninfo` is
 > set if the standby is to be backed up when there is little or no write
-> traffic on the primary. If `primary_conninfo` is not set then the
-> backup will still run however it will wait at the stop backup stage
-> until the current WAL semgent on the primary is newer than the latest
-> WAL required by the backup.
+> traffic on the primary.
+
+As of Barman 3.8.0, if `primary_conninfo` is set, it is possible to add a
+`primary_checkpoint_timeout` option for a server. This is the maximum time
+(in seconds) that Barman will wait for a new WAL file to be produced
+before forcing the execution of a checkpoint on the primary.
+The `primary_checkpoint_timeout` option should be set to a number of seconds
+greater than the value of the `archive_timeout` option set on the primary server.
+
+If `primary_conninfo` is not set, the
+backup will still run; however, it will wait at the stop backup stage
+until the current WAL segment on the primary is newer than the latest
+WAL required by the backup.
 
 Barman currently requires that WAL files and backup data come from the
 same PostgreSQL server. In the case that the standby is promoted to primary
@@ -380,6 +393,7 @@ backup_method = local-rsync
 ```
 
 ## Archiving features
+
 ### WAL compression
 
 The `barman cron` command will compress WAL files if the `compression`
@@ -397,7 +411,7 @@ values:
 - `custom_decompression_filter`: a decompression filter
 - `custom_compression_magic`: a hex string to identify a custom compressed wal file
 
-> *NOTE:* All methods but `pybzip2` and `pygzip` require `barman
+> _NOTE:_ All methods but `pybzip2` and `pygzip` require `barman
 > archive-wal` to fork a new process.
 
 ### Synchronous WAL streaming
@@ -416,7 +430,7 @@ First of all, you need to retrieve the application name of the Barman
 
 ``` bash
 barman@backup$ barman show-servers pg|grep streaming_archiver_name
-        streaming_archiver_name: barman_receive_wal 
+        streaming_archiver_name: barman_receive_wal
 ```
 
 Then the application name should be added to the `postgresql.conf`
@@ -460,8 +474,8 @@ Status of streaming clients for server 'pg':
     Flush location  : 0/9000098 (diff: 0 B)
 ```
 
-
 ## Catalog management features
+
 ### Minimum redundancy safety
 
 You can define the minimum number of periodic backups for a PostgreSQL
@@ -479,7 +493,6 @@ This will protect you from accidental `barman delete` operations.
 > minimum redundancy requirements. Regularly check Barman's log for
 > messages on this topic.
 
-
 ### Retention policies
 
 Barman supports **retention policies** for backups.
@@ -611,7 +624,6 @@ Currently, the only allowed value for `wal_retention_policy` is the special
 value `main`, that maps the retention policy of archive logs to that of
 base backups.
 
-
 ## Hook scripts
 
 Barman allows a database administrator to run hook scripts on these
diff --git a/tests/test_config.py b/tests/test_config.py
index 5c6ea41a1..eab3d82e3 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -177,6 +177,7 @@ def test_config(self):
                 "backup_method": "rsync",
                 "max_incoming_wals_queue": None,
                 "primary_conninfo": None,
+                "primary_checkpoint_timeout": 0,
             }
         )
         assert main.__dict__ == expected
@@ -214,6 +215,7 @@ def test_config(self):
                 "errors_directory": "/some/barman/home/web/errors",
                 "max_incoming_wals_queue": None,
                 "primary_conninfo": None,
+                "primary_checkpoint_timeout": 0,
             }
         )
         assert web.__dict__ == expected
diff --git a/tests/test_postgres.py b/tests/test_postgres.py
index d82d048c5..baab2da5e 100644
--- a/tests/test_postgres.py
+++ b/tests/test_postgres.py
@@ -1902,6 +1902,58 @@ def test_switch_wal_in_background_stops_when_asked(
         # THEN switch_wal is called on the primary exactly once
         assert mock_primary_conn.switch_wal.call_count == 1
 
+    @patch("barman.postgres.PostgreSQLConnection")
+    @patch("barman.postgres.super")
+    def test_switch_wal_in_background_calls_checkpoint(
+        self, _mock_super, mock_psql_conn, caplog
+    ):
+        """
+        Verify switch_wal_in_background runs the expected number of times and
+        then executes a checkpoint once `primary_checkpoint_timeout` has elapsed.
+        """
+        # GIVEN a connection to a standby PostgreSQL instance
+        main_proc_primary_conn = Mock()
+        child_proc_primary_conn = Mock()
+        mock_psql_conn.side_effect = [main_proc_primary_conn, child_proc_primary_conn]
+        standby = StandbyPostgreSQLConnection(
+            self._standby_conninfo, self._primary_conninfo
+        )
+
+        # WHEN switch_wal_in_background is called with times=2 AND primary_checkpoint_timeout
+        # is set to 5 seconds
+        standby.primary_checkpoint_timeout = 5
+        times = 2
+        start_time = datetime.datetime.now()
+        standby.switch_wal_in_background(Queue(), times, 0)
+        end_time = datetime.datetime.now()
+
+        # THEN switch_wal is called on the primary conn in the child process
+        # 3 times (2 before the timer and 1 after the checkpoint)
+        assert child_proc_primary_conn.switch_wal.call_count == times + 1
+
+        # AND switch_wal is not called on the primary conn in the parent process
+        assert main_proc_primary_conn.switch_wal.call_count == 0
+
+        # AND the checkpoint is called on the primary conn in the child process
+        assert child_proc_primary_conn.checkpoint.call_count == 1
+
+        # AND the duration between the start and end of the function is greater than
+        # the primary_checkpoint_timeout value
+        assert (end_time - start_time).total_seconds() > 5
+
+        # AND a warning is logged that the checkpoint was called
+        assert (
+            "Barman attempted to switch WALs %s times on the primary "
+            "server, but the backup has not yet completed. "
+            "A checkpoint will be forced on the primary server "
+            "in %s seconds to ensure the backup can complete."
+            % (times, standby.primary_checkpoint_timeout)
+            in caplog.text
+        )
+
+        # AND the child process primary conn was closed
+        child_proc_primary_conn.close.assert_called_once()
+
     @pytest.mark.parametrize(
         "stop_fun", ("stop_concurrent_backup", "stop_exclusive_backup")
     )
diff --git a/tests/testing_helpers.py b/tests/testing_helpers.py
index 6f111c4bd..4a381aa92 100644
--- a/tests/testing_helpers.py
+++ b/tests/testing_helpers.py
@@ -324,6 +324,7 @@ def build_config_dictionary(config_keys=None):
         "parallel_jobs": 1,
         "create_slot": "manual",
         "forward_config_path": False,
+        "primary_checkpoint_timeout": 0,
         "primary_conninfo": None,
         "snapshot_disks": None,
         "snapshot_instance": None,
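
For reviewers who want to see the new setting in context, here is a minimal, hypothetical sketch of a server section in `barman.conf` using the option introduced by this patch. The server name, host names and the 1800-second value are illustrative assumptions and are not part of the patch; the only requirements stated above are that `primary_conninfo` is set and that the timeout is greater than the primary's `archive_timeout`.

``` ini
; Hypothetical standby server section (names and hosts are examples only)
[standby-pg]
description = "Standby backed up while WAL is produced on the primary"
conninfo = host=standby.example.com user=barman dbname=postgres
streaming_conninfo = host=standby.example.com user=streaming_barman
backup_method = postgres
streaming_archiver = on
slot_name = barman

; Connection to the primary, used for WAL switches during the backup
primary_conninfo = host=primary.example.com user=barman dbname=postgres

; Force a checkpoint on the primary if no new WAL file appears within
; 30 minutes; keep this greater than archive_timeout on the primary.
; 0 (the default) disables the forced checkpoint and waits indefinitely.
primary_checkpoint_timeout = 1800
```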