Add a new primary_checkpoint_timeout configuration option
If `primary_conninfo` is set, it is possible to add a
`primary_checkpoint_timeout` option for a server. This option sets the
maximum time (in seconds) that Barman will wait for a new WAL file to be
produced before forcing the execution of a checkpoint on the primary.

Signed-off-by: Giulio Calacoci <[email protected]>
gcalacoci committed Jul 31, 2023
1 parent 59602a4 commit 3df22e2
Showing 8 changed files with 136 additions and 23 deletions.
3 changes: 3 additions & 0 deletions barman/config.py
@@ -491,6 +491,7 @@ class ServerConfig(object):
"pre_recovery_script",
"pre_wal_delete_script",
"pre_wal_delete_retry_script",
"primary_checkpoint_timeout",
"primary_conninfo",
"primary_ssh_command",
"recovery_options",
@@ -615,6 +616,7 @@ class ServerConfig(object):
"parallel_jobs": "1",
"parallel_jobs_start_batch_period": "1",
"parallel_jobs_start_batch_size": "10",
"primary_checkpoint_timeout": "0",
"recovery_options": "",
"create_slot": "manual",
"retention_policy_mode": "auto",
@@ -658,6 +660,7 @@ class ServerConfig(object):
"parallel_jobs": int,
"parallel_jobs_start_batch_period": int,
"parallel_jobs_start_batch_size": int,
"primary_checkpoint_timeout": int,
"recovery_options": RecoveryOptions,
"recovery_staging_path": parse_recovery_staging_path,
"create_slot": parse_create_slot,
29 changes: 29 additions & 0 deletions barman/postgres.py
@@ -21,6 +21,7 @@
"""

import atexit
import datetime
import logging
from abc import ABCMeta
from multiprocessing import Process, Queue
@@ -1581,6 +1582,7 @@ def __init__(
primary_conninfo,
immediate_checkpoint=False,
slot_name=None,
primary_checkpoint_timeout=0,
application_name="barman",
):
"""
@@ -1606,6 +1608,7 @@ def __init__(
# The standby needs a connection to the primary so that it can
# perform WAL switches itself when calling pg_backup_stop.
self.primary = PostgreSQLConnection(self.primary_conninfo)
self.primary_checkpoint_timeout = primary_checkpoint_timeout

def close(self):
"""Close the connection to PostgreSQL."""
@@ -1658,6 +1661,32 @@ def switch_wal_in_background(self, done_q, times=10, wait=10):
except Empty:
# An empty queue just means we haven't yet been told to stop
pass
if self.primary_checkpoint_timeout:
_logger.warning(
"Barman attempted to switch WALs %s times on the primary "
"server, but the backup has not yet completed. "
"A checkpoint will be forced on the primary server "
"in %s seconds to ensure the backup can complete."
% (times, self.primary_checkpoint_timeout)
)
sleep_time = datetime.datetime.now() + datetime.timedelta(
seconds=self.primary_checkpoint_timeout
)
while True:
try:
# Always check if the queue is empty, so we know to stop
# before the checkpoint execution
if done_q.get(timeout=wait):
return
except Empty:
# If the queue is empty, we can proceed to the checkpoint
# if enough time has passed
if sleep_time < datetime.datetime.now():
self.primary.checkpoint()
self.primary.switch_wal()
break
# break out of the loop after the checkpoint and wal switch
# execution. The connection will be closed in the finally statement
finally:
# Close the connection since only this subprocess will ever use it
self.primary.close()
1 change: 1 addition & 0 deletions barman/server.py
@@ -295,6 +295,7 @@ def _init_postgres(self, config):
config.primary_conninfo,
config.immediate_checkpoint,
config.slot_name,
config.primary_checkpoint_timeout,
)
else:
self.postgres = PostgreSQLConnection(
13 changes: 13 additions & 0 deletions doc/barman.5.d/50-primary_checkpoint_timeout.md
@@ -0,0 +1,13 @@
primary_checkpoint_timeout
: This defines the number of seconds that Barman will wait at the end of a
backup, if no new WAL files are produced, before forcing a checkpoint on
the primary server.

If not set or set to 0, Barman will not force a checkpoint on the primary,
and will wait indefinitely for new WAL files to be produced.

The value of this option should be greater than the value of the
`archive_timeout` set on the primary server.

This option works only if the `primary_conninfo` option is set; it is
ignored otherwise.
58 changes: 35 additions & 23 deletions doc/manual/50-feature-details.en.md
@@ -100,7 +100,6 @@ tablespace in the above option. If found, the specified bandwidth
limit will be enforced. If not, the default bandwidth limit for that
server will be applied.


### Network Compression

It is possible to reduce the size of transferred data using
@@ -118,8 +117,8 @@ Setting this option to `true` will enable data compression during
network transfers (for both backup and recovery). By default it is set
to `false`.
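
A minimal sketch of enabling this for a server, assuming the standard `network_compression` option (the value shown is illustrative):

``` ini
network_compression = true
```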


### Backup Compression

Barman can use the compression features of pg_basebackup in order to
compress the backup data during the backup process. This can be enabled
using the `backup_compression` config option (global/per server):
@@ -128,17 +127,18 @@ using the `backup_compression` config option (global/per server):
> in this section are not available with the `rsync` or `local-rsync`
> backup methods; they are available only with the `postgres` backup method.

#### Compression algorithms

Setting this option will cause pg_basebackup to compress the backup
using the specified compression algorithm. Currently, the supported
algorithms in Barman are: `gzip`, `lz4` and `zstd`.

``` ini
backup_compression = gzip|lz4|zstd
```

Barman requires the CLI utility for the selected compression algorithm
to be available on both the Barman server *and* the PostgreSQL server.
The CLI utility is used to extract the backup label from the compressed
backup and to decompress the backup on the PostgreSQL server during
recovery. These can be installed through system packages named `gzip`,
@@ -161,23 +161,26 @@ recovery. These can be installed through system packages named `gzip`,
> section for more information.
#### Compression workers

This optional parameter allows compression using multiple threads to increase compression speed (default being 0).

```ini
backup_compression_workers = 2
```

> **Note:** This option is only available with `zstd` compression.
> **Note:** `zstd` version must be 1.5.0 or higher, or 1.4.4 or higher compiled with the multithreading option.
#### Compression level

The compression level can be specified using the
`backup_compression_level` option. This should be set to an integer
value supported by the compression algorithm specified in
`backup_compression`.
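
For example, a sketch that pairs `zstd` with a moderate compression level (the value shown is illustrative):

``` ini
backup_compression = zstd
backup_compression_level = 3
```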

#### Compression location

When using Barman with PostgreSQL version 15 or higher it is possible
to specify for compression to happen on the server (i.e. PostgreSQL
will compress the backup) or on the client (i.e. pg_basebackup
@@ -201,6 +204,7 @@ in order to have pg_basebackup uncompress the data before writing it
to disk:

#### Compression format

``` ini
backup_compression_format = plain|tar
```
@@ -209,20 +213,20 @@ If `backup_compression_format` is unset or has the value `tar` then
the backup will be written to disk as compressed tarballs. A description
of both the `plain` and `tar` formats can be found in the [pg_basebackup
documentation][pg_basebackup-documentation].

> **IMPORTANT:** Barman uses external tools to manage compressed backups.
> Depending on the `backup_compression` and `backup_compression_format`,
> you may need to install one or more tools on the Postgres server and
> the Barman server.
> The following table will help you choose according to your configuration.

| **backup_compression** | **backup_compression_format** | **Postgres server** | **Barman server** |
|:---------:|:---------------------:|:-------------------------:|:----------------------:|
| gzip | plain | **tar** | None |
| gzip | tar | **tar** | **tar** |
| lz4 | plain | **tar, lz4** | None |
| lz4 | tar | **tar, lz4** | **tar, lz4** |
| zstd | plain | **tar, zstd** | None |
| zstd | tar | **tar, zstd** | **tar, zstd** |

### Concurrent backup
@@ -285,10 +289,19 @@ having to wait for a WAL switch to occur naturally.

> **NOTE:** It is especially important that `primary_conninfo` is
> set if the standby is to be backed up when there is little or no write
> traffic on the primary.

As of Barman 3.8.0, if `primary_conninfo` is set, it is possible to add a
`primary_checkpoint_timeout` option for a server. This option sets the maximum
time (in seconds) that Barman will wait for a new WAL file to be produced
before forcing the execution of a checkpoint on the primary.
The `primary_checkpoint_timeout` option should be set to a number of seconds
greater than the value of the `archive_timeout` option set on the primary server.

If `primary_conninfo` is not set then the backup will still run; however, it
will wait at the stop backup stage until the current WAL segment on the
primary is newer than the latest WAL required by the backup.
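
As an illustration only, a standby server whose primary has `archive_timeout = 60` could be configured along these lines (the server name, connection strings and timeout value are hypothetical):

``` ini
[standby-pg]
conninfo = host=standby-host user=barman dbname=postgres
primary_conninfo = host=primary-host user=barman dbname=postgres
backup_method = postgres
primary_checkpoint_timeout = 120
```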

Barman currently requires that WAL files and backup data come from the
same PostgreSQL server. In the case that the standby is promoted to primary
@@ -380,6 +393,7 @@ backup_method = local-rsync
```

## Archiving features

### WAL compression

The `barman cron` command will compress WAL files if the `compression`
@@ -397,7 +411,7 @@ values:
- `custom_decompression_filter`: a decompression filter
- `custom_compression_magic`: a hex string to identify a custom compressed wal file

> *NOTE:* All methods but `pybzip2` and `pygzip` require `barman
> archive-wal` to fork a new process.
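
A minimal sketch of enabling WAL compression for a server, assuming `gzip` is available on the Barman host (any supported method can be used instead):

``` ini
compression = gzip
```
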
### Synchronous WAL streaming
@@ -416,7 +430,7 @@ First of all, you need to retrieve the application name of the Barman

``` bash
barman@backup$ barman show-servers pg|grep streaming_archiver_name
streaming_archiver_name: barman_receive_wal
```

Then the application name should be added to the `postgresql.conf`
@@ -460,8 +474,8 @@ Status of streaming clients for server 'pg':
Flush location : 0/9000098 (diff: 0 B)
```


## Catalog management features

### Minimum redundancy safety

You can define the minimum number of periodic backups for a PostgreSQL
@@ -479,7 +493,6 @@ This will protect you from accidental `barman delete` operations.
> minimum redundancy requirements. Regularly check Barman's log for
> messages on this topic.

### Retention policies

Barman supports **retention policies** for backups.
@@ -611,7 +624,6 @@ Currently, the only allowed value for `wal_retention_policy` is the
special value `main`, that maps the retention policy of archive logs
to that of base backups.


## Hook scripts

Barman allows a database administrator to run hook scripts on these
2 changes: 2 additions & 0 deletions tests/test_config.py
@@ -177,6 +177,7 @@ def test_config(self):
"backup_method": "rsync",
"max_incoming_wals_queue": None,
"primary_conninfo": None,
"primary_checkpoint_timeout": 0,
}
)
assert main.__dict__ == expected
@@ -214,6 +215,7 @@ def test_config(self):
"errors_directory": "/some/barman/home/web/errors",
"max_incoming_wals_queue": None,
"primary_conninfo": None,
"primary_checkpoint_timeout": 0,
}
)
assert web.__dict__ == expected
52 changes: 52 additions & 0 deletions tests/test_postgres.py
@@ -1902,6 +1902,58 @@ def test_switch_wal_in_background_stops_when_asked(
# THEN switch_wal is called on the primary exactly once
assert mock_primary_conn.switch_wal.call_count == 1

@patch("barman.postgres.PostgreSQLConnection")
@patch("barman.postgres.super")
def test_switch_wal_in_background_calls_checkpoint(
self, _mock_super, mock_psql_conn, caplog
):
"""
Verify switch_wal_in_background runs the expected number of times and
then executes a checkpoint after `primary_checkpoint_timeout` value.
"""
# GIVEN a connection to a standby PostgreSQL instance
main_proc_primary_conn = Mock()
child_proc_primary_conn = Mock()
mock_psql_conn.side_effect = [main_proc_primary_conn, child_proc_primary_conn]
standby = StandbyPostgreSQLConnection(
self._standby_conninfo, self._primary_conninfo
)

# WHEN switch_wal_in_background is called with times=2 AND primary_checkpoint_timeout
# is set to 5 seconds
standby.primary_checkpoint_timeout = 5
times = 2
start_time = datetime.datetime.now()
standby.switch_wal_in_background(Queue(), times, 0)
end_time = datetime.datetime.now()

# THEN switch_wal is called on the primary conn in the child process
# 3 times (2 before the timer and 1 after the checkpoint)
assert child_proc_primary_conn.switch_wal.call_count == times + 1

# AND switch_wal is not called on the primary conn in the parent process
assert main_proc_primary_conn.switch_wal.call_count == 0

# AND the checkpoint is called on the primary conn in the child process
assert child_proc_primary_conn.checkpoint.call_count == 1

# AND the duration between the start and end of the function is greater than
# the primary_checkpoint_timeout value
assert (end_time - start_time).total_seconds() > 5

# AND a warning is logged that the checkpoint was called
assert (
"Barman attempted to switch WALs %s times on the primary "
"server, but the backup has not yet completed. "
"A checkpoint will be forced on the primary server "
"in %s seconds to ensure the backup can complete."
% (times, standby.primary_checkpoint_timeout)
in caplog.text
)

# AND the child process primary conn was closed
child_proc_primary_conn.close.assert_called_once()

@pytest.mark.parametrize(
"stop_fun", ("stop_concurrent_backup", "stop_exclusive_backup")
)
1 change: 1 addition & 0 deletions tests/testing_helpers.py
@@ -324,6 +324,7 @@ def build_config_dictionary(config_keys=None):
"parallel_jobs": 1,
"create_slot": "manual",
"forward_config_path": False,
"primary_checkpoint_timeout": 0,
"primary_conninfo": None,
"snapshot_disks": None,
"snapshot_instance": None,
