From a46d1bd872bdfd0f85fc0525f26c95370de1d943 Mon Sep 17 00:00:00 2001
From: Lucas Kent <rubickent@gmail.com>
Date: Wed, 6 Nov 2024 18:44:44 +1100
Subject: [PATCH] Set a magic config which makes our kafka integration tests
 complete in half the time (#1800)

---
 shotover-proxy/benches/windsock/kafka/bench.rs           | 9 +++++++++
 .../docker-compose-short-idle-timeout.yaml               | 7 +++++++
 .../kafka/cluster-1-rack/docker-compose.yaml             | 7 +++++++
 .../kafka/cluster-2-racks/docker-compose.yaml            | 7 +++++++
 .../kafka/cluster-3-racks/docker-compose.yaml            | 9 ++++++++-
 .../test-configs/kafka/cluster-mtls/docker-compose.yaml  | 7 +++++++
 .../kafka/cluster-sasl-plain/docker-compose.yaml         | 7 +++++++
 .../cluster-sasl-scram-over-mtls/docker-compose.yaml     | 7 +++++++
 .../kafka/cluster-sasl-scram/docker-compose.yaml         | 7 +++++++
 .../test-configs/kafka/cluster-tls/docker-compose.yaml   | 7 +++++++
 10 files changed, 73 insertions(+), 1 deletion(-)

diff --git a/shotover-proxy/benches/windsock/kafka/bench.rs b/shotover-proxy/benches/windsock/kafka/bench.rs
index 920f1f3e6..0c64814c2 100644
--- a/shotover-proxy/benches/windsock/kafka/bench.rs
+++ b/shotover-proxy/benches/windsock/kafka/bench.rs
@@ -171,6 +171,15 @@ impl KafkaBench {
                             "KAFKA_CFG_PROCESS_ROLES".to_owned(),
                             "controller,broker".to_owned(),
                         ),
+                        // This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+                        // new consumer group by avoiding constant rebalances as each initial consumer joins.
+                        // See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+                        //
+                        // However, for this benchmark we already discard the initial results as a warmup stage, so it is better to just have the benchmark start up faster.
+                        (
+                            "KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS".to_owned(),
+                            "0".to_owned(),
+                        ),
                         (
                             "KAFKA_HEAP_OPTS".to_owned(),
                             "-Xmx4096M -Xms4096M".to_owned(),
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-1-rack/docker-compose-short-idle-timeout.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-1-rack/docker-compose-short-idle-timeout.yaml
index 7701306be..dd37eaeb9 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-1-rack/docker-compose-short-idle-timeout.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-1-rack/docker-compose-short-idle-timeout.yaml
@@ -31,6 +31,13 @@ services:
 
       # connections.max.idle.ms is set to 20s for testing shotovers handling of idle connection timeouts
       KAFKA_CFG_CONNECTIONS_MAX_IDLE_MS: 20000
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-1-rack/docker-compose.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-1-rack/docker-compose.yaml
index bc3eac691..7c9bd5940 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-1-rack/docker-compose.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-1-rack/docker-compose.yaml
@@ -28,6 +28,13 @@ services:
       KAFKA_CFG_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_MIN_ISR: 2
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-2-racks/docker-compose.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-2-racks/docker-compose.yaml
index e1a5b226c..91eec63b2 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-2-racks/docker-compose.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-2-racks/docker-compose.yaml
@@ -29,6 +29,13 @@ services:
       KAFKA_CFG_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_MIN_ISR: 2
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-3-racks/docker-compose.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-3-racks/docker-compose.yaml
index 12fc2886a..e19e9ee7a 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-3-racks/docker-compose.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-3-racks/docker-compose.yaml
@@ -29,6 +29,13 @@ services:
       KAFKA_CFG_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_MIN_ISR: 2
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka
@@ -53,4 +60,4 @@ services:
       KAFKA_CFG_ADVERTISED_LISTENERS: "BROKER://172.16.1.4:9092"
       KAFKA_CFG_NODE_ID: 2
       KAFKA_CFG_BROKER_RACK: "rack3"
-    volumes: *volumes
\ No newline at end of file
+    volumes: *volumes
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-mtls/docker-compose.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-mtls/docker-compose.yaml
index dea55ff05..0a92c0d46 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-mtls/docker-compose.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-mtls/docker-compose.yaml
@@ -31,6 +31,13 @@ services:
       KAFKA_CFG_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_MIN_ISR: 2
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-sasl-plain/docker-compose.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-sasl-plain/docker-compose.yaml
index 128d42886..24fce30c1 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-sasl-plain/docker-compose.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-sasl-plain/docker-compose.yaml
@@ -34,6 +34,13 @@ services:
       KAFKA_CFG_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_MIN_ISR: 2
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-sasl-scram-over-mtls/docker-compose.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-sasl-scram-over-mtls/docker-compose.yaml
index 5bab7e723..3224801de 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-sasl-scram-over-mtls/docker-compose.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-sasl-scram-over-mtls/docker-compose.yaml
@@ -42,6 +42,13 @@ services:
       KAFKA_CFG_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_MIN_ISR: 2
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-sasl-scram/docker-compose.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-sasl-scram/docker-compose.yaml
index 2881064e2..b2ffe5fb3 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-sasl-scram/docker-compose.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-sasl-scram/docker-compose.yaml
@@ -34,6 +34,13 @@ services:
       KAFKA_CFG_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_MIN_ISR: 2
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka
diff --git a/shotover-proxy/tests/test-configs/kafka/cluster-tls/docker-compose.yaml b/shotover-proxy/tests/test-configs/kafka/cluster-tls/docker-compose.yaml
index 7c2724660..fc283add6 100644
--- a/shotover-proxy/tests/test-configs/kafka/cluster-tls/docker-compose.yaml
+++ b/shotover-proxy/tests/test-configs/kafka/cluster-tls/docker-compose.yaml
@@ -29,6 +29,13 @@ services:
       KAFKA_CFG_OFFSETS_TOPIC_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 3
       KAFKA_CFG_TRANSACTION_STATE_LOG_MIN_ISR: 2
+
+      # This cfg is set to 3000 by default, which for a typical workload reduces the overhead of creating a
+      # new consumer group by avoiding constant rebalances as each initial consumer joins.
+      # See: https://cwiki.apache.org/confluence/display/KAFKA/KIP-134%3A+Delay+initial+consumer+group+rebalance
+      #
+      # However, for an integration test workload we are constantly spinning up single-consumer groups, so the default value makes the tests take twice as long to run.
+      KAFKA_CFG_GROUP_INITIAL_REBALANCE_DELAY_MS: "0"
     volumes: &volumes
       - type: tmpfs
         target: /bitnami/kafka