Add a new primary_checkpoint_timeout configuration option
If `primary_conninfo` is set, it is possible to add a
`primary_checkpoint_timeout` option for a server. This option sets the
maximum time (in seconds) that Barman will wait for a new WAL file to be
produced before forcing the execution of a checkpoint on the primary.

Signed-off-by: Giulio Calacoci <[email protected]>
gcalacoci committed Jul 31, 2023
1 parent 59602a4 commit 3df22e2
Showing 8 changed files with 136 additions and 23 deletions.
3 changes: 3 additions & 0 deletions barman/config.py
@@ -491,6 +491,7 @@ class ServerConfig(object):
"pre_recovery_script",
"pre_wal_delete_script",
"pre_wal_delete_retry_script",
"primary_checkpoint_timeout",
"primary_conninfo",
"primary_ssh_command",
"recovery_options",
@@ -615,6 +616,7 @@ class ServerConfig(object):
"parallel_jobs": "1",
"parallel_jobs_start_batch_period": "1",
"parallel_jobs_start_batch_size": "10",
"primary_checkpoint_timeout": "0",
"recovery_options": "",
"create_slot": "manual",
"retention_policy_mode": "auto",
@@ -658,6 +660,7 @@ class ServerConfig(object):
"parallel_jobs": int,
"parallel_jobs_start_batch_period": int,
"parallel_jobs_start_batch_size": int,
"primary_checkpoint_timeout": int,
"recovery_options": RecoveryOptions,
"recovery_staging_path": parse_recovery_staging_path,
"create_slot": parse_create_slot,
29 changes: 29 additions & 0 deletions barman/postgres.py
@@ -21,6 +21,7 @@
"""

import atexit
import datetime
import logging
from abc import ABCMeta
from multiprocessing import Process, Queue
@@ -1581,6 +1582,7 @@ def __init__(
primary_conninfo,
immediate_checkpoint=False,
slot_name=None,
primary_checkpoint_timeout=0,
application_name="barman",
):
"""
@@ -1606,6 +1608,7 @@ def __init__(
# The standby needs a connection to the primary so that it can
# perform WAL switches itself when calling pg_backup_stop.
self.primary = PostgreSQLConnection(self.primary_conninfo)
self.primary_checkpoint_timeout = primary_checkpoint_timeout

def close(self):
"""Close the connection to PostgreSQL."""
@@ -1658,6 +1661,32 @@ def switch_wal_in_background(self, done_q, times=10, wait=10):
except Empty:
# An empty queue just means we haven't yet been told to stop
pass
if self.primary_checkpoint_timeout:
_logger.warning(
"Barman attempted to switch WALs %s times on the primary "
"server, but the backup has not yet completed. "
"A checkpoint will be forced on the primary server "
"in %s seconds to ensure the backup can complete."
% (times, self.primary_checkpoint_timeout)
)
sleep_time = datetime.datetime.now() + datetime.timedelta(
seconds=self.primary_checkpoint_timeout
)
while True:
try:
# Always check if the queue is empty, so we know to stop
# before the checkpoint execution
if done_q.get(timeout=wait):
return
except Empty:
# If the queue is empty, we can proceed to the checkpoint
# if enough time has passed
if sleep_time < datetime.datetime.now():
self.primary.checkpoint()
self.primary.switch_wal()
break
# break out of the loop after the checkpoint and wal switch
# execution. The connection will be closed in the finally statement
finally:
# Close the connection since only this subprocess will ever use it
self.primary.close()
1 change: 1 addition & 0 deletions barman/server.py
@@ -295,6 +295,7 @@ def _init_postgres(self, config):
config.primary_conninfo,
config.immediate_checkpoint,
config.slot_name,
config.primary_checkpoint_timeout,
)
else:
self.postgres = PostgreSQLConnection(
13 changes: 13 additions & 0 deletions doc/barman.5.d/50-primary_checkpoint_timeout.md
@@ -0,0 +1,13 @@
primary_checkpoint_timeout
: This defines the number of seconds that Barman will wait at the end of a
backup, if no new WAL files are produced, before forcing a checkpoint on
the primary server.

If not set or set to 0, Barman will not force a checkpoint on the primary,
and will wait indefinitely for new WAL files to be produced.

The value of this option should be greater than the value of the
`archive_timeout` set on the primary server.

This option works only if the `primary_conninfo` option is set; it is
ignored otherwise.
58 changes: 35 additions & 23 deletions doc/manual/50-feature-details.en.md
@@ -100,7 +100,6 @@ tablespace in the above option. If found, the specified bandwidth
limit will be enforced. If not, the default bandwidth limit for that
server will be applied.


### Network Compression

It is possible to reduce the size of transferred data using
@@ -118,8 +117,8 @@ Setting this option to `true` will enable data compression during
network transfers (for both backup and recovery). By default it is set
to `false`.
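
A minimal sketch of enabling this for a server, assuming the standard `network_compression` option (the value shown is illustrative):

``` ini
network_compression = true
```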


### Backup Compression

Barman can use the compression features of pg_basebackup in order to
compress the backup data during the backup process. This can be enabled
using the `backup_compression` config option (global/per server):
@@ -128,17 +127,18 @@ using the `backup_compression` config option (global/per server):
> in this section are not available with the `rsync` or `local-rsync`
> backup methods; they are available only with the `postgres` backup method.

#### Compression algorithms

Setting this option will cause pg_basebackup to compress the backup
using the specified compression algorithm. Currently, the supported
algorithms in Barman are: `gzip`, `lz4` and `zstd`.

``` ini
backup_compression = gzip|lz4|zstd
```

Barman requires the CLI utility for the selected compression algorithm
to be available on both the Barman server *and* the PostgreSQL server.
The CLI utility is used to extract the backup label from the compressed
backup and to decompress the backup on the PostgreSQL server during
recovery. These can be installed through system packages named `gzip`,
@@ -161,23 +161,26 @@ recovery. These can be installed through system packages named `gzip`,
> section for more information.
#### Compression workers

This optional parameter allows compression using multiple threads to increase compression speed (default being 0).

```ini
backup_compression_workers = 2
```

> **Note:** This option is only available with `zstd` compression.
> **Note:** `zstd` version must be 1.5.0 or higher, or 1.4.4 or higher compiled with the multithreading option.
#### Compression level

The compression level can be specified using the
`backup_compression_level` option. This should be set to an integer
value supported by the compression algorithm specified in
`backup_compression`.
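
For example, a sketch that pairs `zstd` with a moderate compression level (the value shown is illustrative):

``` ini
backup_compression = zstd
backup_compression_level = 3
```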

#### Compression location

When using Barman with PostgreSQL version 15 or higher it is possible
to specify for compression to happen on the server (i.e. PostgreSQL
will compress the backup) or on the client (i.e. pg_basebackup
@@ -201,6 +204,7 @@ in order to have pg_basebackup uncompress the data before writing it
to disk:

#### Compression format

``` ini
backup_compression_format = plain|tar
```
@@ -209,20 +213,20 @@ If `backup_compression_format` is unset or has the value `tar` then
the backup will be written to disk as compressed tarballs. A description
of both the `plain` and `tar` formats can be found in the [pg_basebackup
documentation][pg_basebackup-documentation].

> **IMPORTANT:** Barman uses external tools to manage compressed backups.
> Depending on the `backup_compression` and `backup_compression_format`,
> you may need to install one or more tools on the Postgres server and
> the Barman server.
> The following table will help you choose according to your configuration.

| **backup_compression** | **backup_compression_format** | **Postgres server** | **Barman server** |
|:---------:|:---------------------:|:-------------------------:|:----------------------:|
| gzip | plain | **tar** | None |
| gzip | tar | **tar** | **tar** |
| lz4 | plain | **tar, lz4** | None |
| lz4 | tar | **tar, lz4** | **tar, lz4** |
| zstd | plain | **tar, zstd** | None |
| zstd | tar | **tar, zstd** | **tar, zstd** |

### Concurrent backup
@@ -285,10 +289,19 @@ having to wait for a WAL switch to occur naturally.

> **NOTE:** It is especially important that `primary_conninfo` is
> set if the standby is to be backed up when there is little or no write
> traffic on the primary.

As of Barman 3.8.0, if `primary_conninfo` is set, it is possible to add a
`primary_checkpoint_timeout` option for a server. This option sets the maximum
time (in seconds) that Barman will wait for a new WAL file to be produced
before forcing the execution of a checkpoint on the primary.
The `primary_checkpoint_timeout` option should be set to a number of seconds
greater than the value of the `archive_timeout` option set on the primary server.

If `primary_conninfo` is not set then the backup will still run; however, it
will wait at the stop backup stage until the current WAL segment on the
primary is newer than the latest WAL required by the backup.
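
As an illustration only, a standby server whose primary has `archive_timeout = 60` could be configured along these lines (the server name, connection strings and timeout value are hypothetical):

``` ini
[standby-pg]
conninfo = host=standby-host user=barman dbname=postgres
primary_conninfo = host=primary-host user=barman dbname=postgres
backup_method = postgres
primary_checkpoint_timeout = 120
```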

Barman currently requires that WAL files and backup data come from the
same PostgreSQL server. In the case that the standby is promoted to primary
@@ -380,6 +393,7 @@ backup_method = local-rsync
```

## Archiving features

### WAL compression

The `barman cron` command will compress WAL files if the `compression`
@@ -397,7 +411,7 @@ values:
- `custom_decompression_filter`: a decompression filter
- `custom_compression_magic`: a hex string to identify a custom compressed wal file

> *NOTE:* All methods but `pybzip2` and `pygzip` require `barman
> archive-wal` to fork a new process.
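
A minimal sketch of enabling WAL compression for a server, assuming `gzip` is available on the Barman host (any supported method can be used instead):

``` ini
compression = gzip
```
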
### Synchronous WAL streaming
@@ -416,7 +430,7 @@ First of all, you need to retrieve the application name of the Barman

``` bash
barman@backup$ barman show-servers pg|grep streaming_archiver_name
streaming_archiver_name: barman_receive_wal
```

Then the application name should be added to the `postgresql.conf`
@@ -460,8 +474,8 @@ Status of streaming clients for server 'pg':
Flush location : 0/9000098 (diff: 0 B)
```


## Catalog management features

### Minimum redundancy safety

You can define the minimum number of periodic backups for a PostgreSQL
@@ -479,7 +493,6 @@ This will protect you from accidental `barman delete` operations.
> minimum redundancy requirements. Regularly check Barman's log for
> messages on this topic.

### Retention policies

Barman supports **retention policies** for backups.
@@ -611,7 +624,6 @@ Currently, the only allowed value for `wal_retention_policy` is the
special value `main`, that maps the retention policy of archive logs
to that of base backups.


## Hook scripts

Barman allows a database administrator to run hook scripts on these
2 changes: 2 additions & 0 deletions tests/test_config.py
@@ -177,6 +177,7 @@ def test_config(self):
"backup_method": "rsync",
"max_incoming_wals_queue": None,
"primary_conninfo": None,
"primary_checkpoint_timeout": 0,
}
)
assert main.__dict__ == expected
@@ -214,6 +215,7 @@ def test_config(self):
"errors_directory": "/some/barman/home/web/errors",
"max_incoming_wals_queue": None,
"primary_conninfo": None,
"primary_checkpoint_timeout": 0,
}
)
assert web.__dict__ == expected
52 changes: 52 additions & 0 deletions tests/test_postgres.py
@@ -1902,6 +1902,58 @@ def test_switch_wal_in_background_stops_when_asked(
# THEN switch_wal is called on the primary exactly once
assert mock_primary_conn.switch_wal.call_count == 1

@patch("barman.postgres.PostgreSQLConnection")
@patch("barman.postgres.super")
def test_switch_wal_in_background_calls_checkpoint(
self, _mock_super, mock_psql_conn, caplog
):
"""
Verify switch_wal_in_background runs the expected number of times and
then executes a checkpoint after `primary_checkpoint_timeout` value.
"""
# GIVEN a connection to a standby PostgreSQL instance
main_proc_primary_conn = Mock()
child_proc_primary_conn = Mock()
mock_psql_conn.side_effect = [main_proc_primary_conn, child_proc_primary_conn]
standby = StandbyPostgreSQLConnection(
self._standby_conninfo, self._primary_conninfo
)

# WHEN switch_wal_in_background is called with times=2 AND primary_checkpoint_timeout
# is set to 5 seconds
standby.primary_checkpoint_timeout = 5
times = 2
start_time = datetime.datetime.now()
standby.switch_wal_in_background(Queue(), times, 0)
end_time = datetime.datetime.now()

# THEN switch_wal is called on the primary conn in the child process
# 3 times (2 before the timer and 1 after the checkpoint)
assert child_proc_primary_conn.switch_wal.call_count == times + 1

# AND switch_wal is not called on the primary conn in the parent process
assert main_proc_primary_conn.switch_wal.call_count == 0

# AND the checkpoint is called on the primary conn in the child process
assert child_proc_primary_conn.checkpoint.call_count == 1

# AND the duration between the start and end of the function is greater than
# the primary_checkpoint_timeout value
assert (end_time - start_time).total_seconds() > 5

# AND a warning is logged that the checkpoint was called
assert (
"Barman attempted to switch WALs %s times on the primary "
"server, but the backup has not yet completed. "
"A checkpoint will be forced on the primary server "
"in %s seconds to ensure the backup can complete."
% (times, standby.primary_checkpoint_timeout)
in caplog.text
)

# AND the child process primary conn was closed
child_proc_primary_conn.close.assert_called_once()

@pytest.mark.parametrize(
"stop_fun", ("stop_concurrent_backup", "stop_exclusive_backup")
)
1 change: 1 addition & 0 deletions tests/testing_helpers.py
@@ -324,6 +324,7 @@ def build_config_dictionary(config_keys=None):
"parallel_jobs": 1,
"create_slot": "manual",
"forward_config_path": False,
"primary_checkpoint_timeout": 0,
"primary_conninfo": None,
"snapshot_disks": None,
"snapshot_instance": None,
