Skip to content

Commit

Permalink
(GH-4013) Clean up stranded partitions
Browse files Browse the repository at this point in the history
If a partition is fully detached, but fails to be dropped during its GC
operation, subsequent GC operations will not see that partition at all.
It will be stranded and PuppetDB will never remove it.

During GC, search for stranded partitions that need to be removed and
add them to the list of partitions that need to be dropped.

There is no structural way to tell the difference between a
non-partitioned table and a detached partition table. This PR uses a
regular expression, which means that PuppetDB cannot have any
non-partitioned tables matching the regular expressions used to identify
stranded partitions.
  • Loading branch information
austb committed Oct 17, 2024
1 parent 74eeca3 commit 5e1d0f5
Show file tree
Hide file tree
Showing 2 changed files with 54 additions and 14 deletions.
40 changes: 29 additions & 11 deletions src/puppetlabs/puppetdb/scf/storage.clj
Original file line number Diff line number Diff line change
Expand Up @@ -1642,6 +1642,21 @@
(jdbc/do-commands (format "ALTER TABLE %s DETACH PARTITION %s FINALIZE" parent pending))
(str pending))))

(defn find-stranded-partitions
"Identify tables that match the child format of a partitioned table (like reports_historical)
that are not present in the pg_inherits table. These partitions have been detached, but failed
to be deleted.
Tables that are not partitioned will also not be in the pg_inherits table, so you MUST
write a child-format that does not match any non-partitioned tables.
Returns a list of strings. Each string is a stranded partition that should be removed."
[child-format]
(->> ["SELECT tablename FROM pg_tables WHERE tablename ~ ? AND tablename NOT IN ( SELECT inhrelid::regclass::text FROM pg_catalog.pg_inherits)"
child-format]
jdbc/query-to-vec
(map (comp str :tablename))))

(defn prune-daily-partitions
"Either detaches or drops obsolete day-oriented partitions
older than the date. Deletes or detaches only the oldest such candidate if
Expand Down Expand Up @@ -1725,7 +1740,7 @@
"Drops the given set of tables. Will throw an SQLException termination if the
operation takes much longer than PDB_GC_DAILY_PARTITION_DROP_LOCK_TIMEOUT_MS."
[old-partition-tables update-lock-status status-key]
(let [drop #(doseq [table old-partition-tables]
(let [drop #(doseq [table (distinct old-partition-tables)]
(try
(update-lock-status status-key inc)
(jdbc/do-commands
Expand All @@ -1749,9 +1764,10 @@
;; PG14+
(let [detached-tables
(detach-daily-partitions "resource_events_historical" date incremental?
update-lock-status :write-locking-resource-events)]
update-lock-status :write-locking-resource-events)
stranded-tables (find-stranded-partitions "^resource_events_\\d\\d\\d\\d\\d\\d\\d\\dz$")]
(jdbc/with-db-transaction []
(drop-partition-tables! detached-tables
(drop-partition-tables! (concat detached-tables stranded-tables)
update-lock-status :write-locking-resource-events)))))

(defn delete-reports-older-than-in-pg-11!
Expand Down Expand Up @@ -1792,13 +1808,15 @@
;; PG14+
;; Detach partition concurrently must take place outside of a transaction.
(let [detached-resource-event-tables
(detach-daily-partitions "resource_events_historical" effective-resource-events-ttl
incremental? update-lock-status
:write-locking-resource-events)
(detach-daily-partitions "resource_events_historical" effective-resource-events-ttl
incremental? update-lock-status
:write-locking-resource-events)
stranded-events-tables (find-stranded-partitions "^resource_events_\\d\\d\\d\\d\\d\\d\\d\\dz$")
detached-report-tables
(detach-daily-partitions "reports_historical" report-ttl
incremental? update-lock-status
:write-locking-reports)]
(detach-daily-partitions "reports_historical" report-ttl
incremental? update-lock-status
:write-locking-reports)
stranded-reports-tables (find-stranded-partitions "^reports_\\d\\d\\d\\d\\d\\d\\d\\dz$")]
;; Now we can delete the partitions with less intrusive locking.
(jdbc/with-db-transaction []
;; Nothing should acquire locks on the detached tables, but to be safe, acquire
Expand All @@ -1808,9 +1826,9 @@
;; force a resource-events GC. prior to partitioning, this would have happened
;; via a cascade when the report was deleted, but now we just drop whole tables
;; of resource events.
(drop-partition-tables! detached-resource-event-tables
(drop-partition-tables! (concat detached-resource-event-tables stranded-events-tables)
update-lock-status :write-locking-resource-events)
(drop-partition-tables! detached-report-tables
(drop-partition-tables! (concat detached-report-tables stranded-reports-tables)
update-lock-status :write-locking-reports))))))

;; A db version that is "allowed" but not supported is deprecated
Expand Down
28 changes: 25 additions & 3 deletions test/puppetlabs/puppetdb/cli/services_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
[puppetlabs.puppetdb.scf.partitioning
:refer [create-resource-events-partition
create-reports-partition
get-temporal-partitions]]
get-temporal-partitions]
:as part]
[puppetlabs.trapperkeeper.testutils.logging
:refer [with-log-output
logs-matching
Expand Down Expand Up @@ -592,7 +593,7 @@
(with-test-db
(let [config (-> (create-temp-config)
(assoc :database *db* :read-database *read-db*)
(assoc-in [:database :gc-interval] "0.01"))
(assoc-in [:database :gc-interval] "60"))
store-report #(sync-command-post (svc-utils/pdb-cmd-url)
example-certname
"store report"
Expand Down Expand Up @@ -662,7 +663,28 @@
(is (empty?
(jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table])))

(jdbc/do-commands "DELETE FROM reports"))))))))
(jdbc/do-commands "DELETE FROM reports")))

(testing "a detached partition that was not removed is cleaned up by gc"
(let [old-ts (-> 2 time/days time/ago)
partition-table (format "reports_%s"
(part/date-suffix (part/to-zoned-date-time (time/to-timestamp old-ts))))]
(store-report (time/to-string old-ts))
(store-report (to-string (now)))

;; Strand the partition before calling GC
(jdbc/do-commands-outside-txn
(format "ALTER TABLE reports_historical DETACH PARTITION %s CONCURRENTLY" partition-table))

(svcs/sweep-reports! *db* {:incremental? false
:report-ttl (time/parse-period "1d")
:resource-events-ttl (time/parse-period "1d")
:db-lock-status db-lock-status})

(is (empty?
(jdbc/query ["SELECT tablename FROM pg_tables WHERE tablename = ?" partition-table])))

(jdbc/do-commands "DELETE FROM reports"))))))))

(deftest reports-analysis
;; For now, just test for the initial invocation
Expand Down

0 comments on commit 5e1d0f5

Please sign in to comment.