From ce8de9590b92e2b6b0c72fe0841fafb50e05ecd1 Mon Sep 17 00:00:00 2001 From: Matthew de Detrich Date: Fri, 18 Feb 2022 12:17:32 +0100 Subject: [PATCH] Add sbt-paradox docs --- .github/workflows/ci.yml | 59 +++++++++++++ .jvmopts | 3 + CONTRIBUTING.md | 17 +++- README.md | 12 ++- build.sbt | 88 ++++++++++++++++++- .../kafka/backup/BackupClientInterface.scala | 31 +++---- .../backup/configs/TimeConfiguration.scala | 10 +-- .../{application.conf => reference.conf} | 0 core-s3/src/main/resources/reference.conf | 1 + docs/src/main/paradox/application/index.md | 49 +++++++++++ docs/src/main/paradox/backup/configuration.md | 24 +++++ docs/src/main/paradox/backup/index.md | 31 +++++++ docs/src/main/paradox/ci.md | 42 +++++++++ docs/src/main/paradox/doc-generation.md | 42 +++++++++ docs/src/main/paradox/index.md | 26 ++++++ docs/src/main/paradox/overview.md | 19 ++++ docs/src/main/paradox/persistence/design.md | 26 ++++++ docs/src/main/paradox/persistence/index.md | 12 +++ .../paradox/persistence/s3/configuration.md | 15 ++++ docs/src/main/paradox/persistence/s3/index.md | 12 +++ .../src/main/paradox/restore/configuration.md | 13 +++ docs/src/main/paradox/restore/index.md | 13 +++ docs/src/main/paradox/security.md | 29 ++++++ project/plugins.sbt | 33 ++++--- project/project-info.conf | 57 ++++++++++++ 25 files changed, 626 insertions(+), 38 deletions(-) create mode 100644 .jvmopts rename core-restore/src/main/resources/{application.conf => reference.conf} (100%) create mode 100644 docs/src/main/paradox/application/index.md create mode 100644 docs/src/main/paradox/backup/configuration.md create mode 100644 docs/src/main/paradox/backup/index.md create mode 100644 docs/src/main/paradox/ci.md create mode 100644 docs/src/main/paradox/doc-generation.md create mode 100644 docs/src/main/paradox/index.md create mode 100644 docs/src/main/paradox/overview.md create mode 100644 docs/src/main/paradox/persistence/design.md create mode 100644 docs/src/main/paradox/persistence/index.md create mode 100644 docs/src/main/paradox/persistence/s3/configuration.md create mode 100644 docs/src/main/paradox/persistence/s3/index.md create mode 100644 docs/src/main/paradox/restore/configuration.md create mode 100644 docs/src/main/paradox/restore/index.md create mode 100644 docs/src/main/paradox/security.md create mode 100644 project/project-info.conf diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 35e04efa..8f4cfa6b 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,8 +64,67 @@ jobs: - name: Build project run: sbt ++${{ matrix.scala }} clean coverage test + - name: Compile docs + run: sbt ++${{ matrix.scala }} docs/makeSite + - name: Upload coverage data to Coveralls env: COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }} COVERALLS_FLAG_NAME: Scala ${{ matrix.scala }} run: sbt ++${{ matrix.scala }} coverageReport coverageAggregate coveralls + + - name: Compress target directories + run: tar cf targets.tar target cli-compaction/target compaction-gcs/target backup-s3/target compaction-s3/target docs/target cli-backup/target core-restore/target restore-s3/target core-gcs/target core-compaction/target core-s3/target core-backup/target core-cli/target cli-restore/target core/target restore-gcs/target backup-gcs/target project/target + + - name: Upload target directories + uses: actions/upload-artifact@v2 + with: + name: target-${{ matrix.os }}-${{ matrix.scala }}-${{ matrix.java }} + path: targets.tar + + publish: + name: Publish Artifacts + needs: [build] + if: github.event_name != 
'pull_request' && (github.ref == 'refs/heads/main') + strategy: + matrix: + os: [ubuntu-latest] + scala: [2.13.8] + java: [temurin@11] + runs-on: ${{ matrix.os }} + steps: + - name: Checkout current branch (full) + uses: actions/checkout@v2 + with: + fetch-depth: 0 + + - name: Setup Java (temurin@11) + if: matrix.java == 'temurin@11' + uses: actions/setup-java@v2 + with: + distribution: temurin + java-version: 11 + + - name: Cache sbt + uses: actions/cache@v2 + with: + path: | + ~/.sbt + ~/.ivy2/cache + ~/.coursier/cache/v1 + ~/.cache/coursier/v1 + ~/AppData/Local/Coursier/Cache/v1 + ~/Library/Caches/Coursier/v1 + key: ${{ runner.os }}-sbt-cache-v2-${{ hashFiles('**/*.sbt') }}-${{ hashFiles('project/build.properties') }} + + - name: Download target directories (2.13.8) + uses: actions/download-artifact@v2 + with: + name: target-${{ matrix.os }}-2.13.8-${{ matrix.java }} + + - name: Inflate target directories (2.13.8) + run: | + tar xf targets.tar + rm targets.tar + + - run: sbt ++${{ matrix.scala }} docs/ghpagesPushSite diff --git a/.jvmopts b/.jvmopts new file mode 100644 index 00000000..74196af4 --- /dev/null +++ b/.jvmopts @@ -0,0 +1,3 @@ +-XX:+IgnoreUnrecognizedVMOptions +--add-opens java.base/java.lang=ALL-UNNAMED +--illegal-access=permit diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index cc7c13f7..41f357bc 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -67,8 +67,8 @@ the [scalafmt installation guide][scalafmt-installation-link] for more details * There are native builds of Scalafmt that let you run a `scalafmt` as a CLI tool, see the CLI section in [scalafmt installation guide][scalafmt-installation-link] -Note that a github action exists which will check that your PR is formatted when you create it. The check runs -separately ad in parallel to the main build/tests +Note that a github action exists which will check that your code is formatted whenever you create a PR. For more details +read the [documentation](https://aiven.github.io/guardian-for-apache-kafka/ci.html#scalafmt) ## sbt - Compiling, Building and Testing @@ -81,6 +81,19 @@ it will start a REPL session where you can type in commands, i.e. * `core/compile` will only compile the `core` project. See [build.sbt](build.sbt) to get a reference for how the projects are named * `publishLocal` will publish the project into the local `~/.m2` repository +* `clean` will clean all builds targets (including documentation) from the project. Note that sbt stores build +in sub-directories named `target` +* `reload` will reload sbt which is used when the [sbt][sbt-link] build definition is changed + +## sbt - documentation + +Documentation is also built within SBT, i.e. + +* `docs/makeSite` will compile documentation +* `docs/previewSite` will compile documentation (if needed) and open the result in your system's default browser + +For details about how the document generation works go +[here](https://aiven.github.io/guardian-for-apache-kafka/doc-generation.html) [adopt-openjdk-link]: https://adoptopenjdk.net/ [metals-link]: https://scalameta.org/metals/ diff --git a/README.md b/README.md index 74d65e93..82c3c11e 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,14 @@ # Guardian for Apache Kafka® -Guardian is a backup and restore tool for Apache Kafka clusters. It is designed to continuously stream kafka topics -into persistent/object storages such as S3 and also provides tools for restoring said backups. +Guardian is a backup and restore tool for Apache Kafka clusters. 
It is designed to continuously stream kafka topics into +persistent/object storages such as S3 and also provides tools for restoring said backups. + +## Documentation + +* [Guardian reference](https://aiven.github.io/guardian-for-apache-kafka/) documentation. ## Trademarks -Apache Kafka is either a registered trademark or trademark of the Apache Software Foundation in the United States and/or other countries. + +Apache Kafka is either a registered trademark or trademark of the Apache Software Foundation in the United States and/or +other countries. diff --git a/build.sbt b/build.sbt index cd56e065..730e955d 100644 --- a/build.sbt +++ b/build.sbt @@ -1,4 +1,5 @@ import com.jsuereth.sbtpgp.PgpKeys.publishSigned +import com.lightbend.paradox.apidoc.ApidocPlugin.autoImport.apidocRootPackage ThisBuild / scalaVersion := "2.13.8" ThisBuild / organization := "aiven.io" @@ -75,6 +76,33 @@ val cliSettings = Seq( val baseName = "guardian-for-apache-kafka" +lazy val guardian = project + .in(file(".")) + .enablePlugins(ScalaUnidocPlugin) + .disablePlugins(SitePlugin) + .aggregate( + core, + coreCli, + coreS3, + coreGCS, + coreBackup, + backupS3, + backupGCS, + cliBackup, + coreCompaction, + compactionS3, + compactionGCS, + cliCompaction, + coreRestore, + restoreS3, + restoreGCS, + cliRestore + ) + .settings( + publish / skip := true, + crossScalaVersions := List() // workaround for https://github.com/sbt/sbt/issues/3465 + ) + lazy val core = project .in(file("core")) .settings( @@ -268,12 +296,65 @@ lazy val cliRestore = project ) .enablePlugins(JavaAppPackaging) +def binaryVersion(key: String): String = key.substring(0, key.lastIndexOf('.')) + +lazy val docs = project + .enablePlugins(ParadoxPlugin, ParadoxSitePlugin, PreprocessPlugin, GhpagesPlugin) + .settings( + Compile / paradox / name := "Guardian for Apache Kafka", + publish / skip := true, + makeSite := makeSite.dependsOn(LocalRootProject / ScalaUnidoc / doc).value, + previewPath := (Paradox / siteSubdirName).value, + paradoxTheme := Some(builtinParadoxTheme("generic")), + apidocRootPackage := "io.aiven.guardian", + Preprocess / siteSubdirName := s"api/${projectInfoVersion.value}", + Preprocess / sourceDirectory := (LocalRootProject / ScalaUnidoc / unidoc / target).value, + git.remoteRepo := scmInfo.value.get.connection.replace("scm:git:", ""), + paradoxGroups := Map("Language" -> Seq("Scala")), + paradoxProperties ++= Map( + "akka.version" -> akkaVersion, + "akka-http.version" -> akkaHttpVersion, + "akka-streams-json.version" -> akkaStreamsJson, + "pure-config.version" -> pureConfigVersion, + "decline.version" -> declineVersion, + "scala-logging.version" -> scalaLoggingVersion, + "extref.akka.base_url" -> s"https://doc.akka.io/docs/akka/${binaryVersion(akkaVersion)}/%s", + "extref.akka-stream-json.base_url" -> s"https://github.com/mdedetrich/akka-streams-json", + "extref.alpakka.base_url" -> s"https://doc.akka.io/api/alpakka/${binaryVersion(alpakkaVersion)}/%s", + "extref.alpakka-docs.base_url" -> s"https://docs.akka.io/docs/alpakka/${binaryVersion(alpakkaVersion)}/%s", + "extref.pureconfig.base_url" -> s"https://pureconfig.github.io/docs/", + "scaladoc.io.aiven.guardian.base_url" -> s"/guardian-for-apache-kafka/${(Preprocess / siteSubdirName).value}/" + ) + ) + +ThisBuild / homepage := Some(url("https://github.com/aiven/akka-streams-json")) + +ThisBuild / scmInfo := Some( + ScmInfo(url("https://github.com/aiven/guardian-for-apache-kafka"), + "scm:git:git@github.com:aiven/guardian-for-apache-kafka.git" + ) +) + +ThisBuild / developers := List( 
+ Developer("jlprat", "Josep Prat", "josep.prat@aiven.io", url("https://github.com/jlprat")), + Developer("mdedetrich", "Matthew de Detrich", "matthew.dedetrich@aiven.io", url("https://github.com/mdedetrich")), + Developer("reta", "Andriy Redko", "andriy.redko@aiven.io", url("https://github.com/reta")) +) + +maintainer := "matthew.dedetrich@aiven.io" + +ThisBuild / licenses += ("Apache-2.0", url("https://opensource.org/licenses/Apache-2.0")) + // This is currently causing problems, see https://github.com/djspiewak/sbt-github-actions/issues/74 ThisBuild / githubWorkflowUseSbtThinClient := false -ThisBuild / githubWorkflowTargetBranches := Seq("main") // Once we have branches per version, add the pattern here +ThisBuild / githubWorkflowTargetBranches := Seq("main") + +// Once we have branches per version, add the pattern here, see +// https://github.com/djspiewak/sbt-github-actions#integration-with-sbt-ci-release +ThisBuild / githubWorkflowPublishTargetBranches := Seq(RefPredicate.Equals(Ref.Branch("main"))) -ThisBuild / githubWorkflowPublishTargetBranches := Seq() +ThisBuild / githubWorkflowPublish := Seq(WorkflowStep.Sbt(List("docs/ghpagesPushSite"))) ThisBuild / githubWorkflowBuildPreamble := Seq( WorkflowStep.Sbt(List("scalafixAll --check"), name = Some("Linter: Scalafix checks")) @@ -300,7 +381,8 @@ ThisBuild / githubWorkflowEnv ++= Map( ThisBuild / githubWorkflowJavaVersions := List(JavaSpec.temurin("11")) ThisBuild / githubWorkflowBuild := Seq( - WorkflowStep.Sbt(List("clean", "coverage", "test"), name = Some("Build project")) + WorkflowStep.Sbt(List("clean", "coverage", "test"), name = Some("Build project")), + WorkflowStep.Sbt(List("docs/makeSite"), name = Some("Compile docs")) ) ThisBuild / githubWorkflowBuildPostamble ++= Seq( diff --git a/core-backup/src/main/scala/io/aiven/guardian/kafka/backup/BackupClientInterface.scala b/core-backup/src/main/scala/io/aiven/guardian/kafka/backup/BackupClientInterface.scala index c57904e2..39f30c75 100644 --- a/core-backup/src/main/scala/io/aiven/guardian/kafka/backup/BackupClientInterface.scala +++ b/core-backup/src/main/scala/io/aiven/guardian/kafka/backup/BackupClientInterface.scala @@ -69,38 +69,39 @@ trait BackupClientInterface[T <: KafkaClientInterface] extends StrictLogging { * @param key * The object key or filename for what is currently being backed up * @return - * A [[Future]] with a [[UploadStateResult]] data structure that optionally contains the state associated with - * `key` along with the previous latest state before `key` (if it exists) + * A [[scala.concurrent.Future]] with a [[UploadStateResult]] data structure that optionally contains the state + * associated with `key` along with the previous latest state before `key` (if it exists) */ def getCurrentUploadState(key: String): Future[UploadStateResult] /** A sink that is executed whenever a previously existing Backup needs to be terminated and closed. Generally - * speaking this [[Sink]] is similar to the [[backupToStorageSink]] except that - * [[kafkaClientInterface.CursorContext]] is not required since no Kafka messages are being written. + * speaking this [[akka.stream.scaladsl.Sink]] is similar to the `backupToStorageSink` except that + * `kafkaClientInterface.CursorContext` is not required since no Kafka messages are being written. 
* - * Note that the terminate refers to the fact that this Sink is executed with a `null]` [[Source]] which when written - * to an already existing unfinished backup terminates the containing JSON array so that it becomes valid parsable - * JSON. + * Note that the terminate refers to the fact that this Sink is executed with a `null]` + * [[akka.stream.scaladsl.Source]] which when written to an already existing unfinished backup terminates the + * containing JSON array so that it becomes valid parsable JSON. * @param previousState * A data structure containing both the [[State]] along with the associated key which you can refer to in order to - * define your [[Sink]] + * define your [[akka.stream.scaladsl.Sink]] * @return - * A [[Sink]] that points to an existing key defined by `previousState.previousKey` + * A [[akka.stream.scaladsl.Sink]] that points to an existing key defined by `previousState.previousKey` */ def backupToStorageTerminateSink(previousState: PreviousState): Sink[ByteString, Future[BackupResult]] - /** Override this method to define how to backup a `ByteString` combined with Kafka + /** Override this method to define how to backup a [[akka.util.ByteString]] combined with Kafka * `kafkaClientInterface.CursorContext` to a `DataSource` * @param key * The object key or filename for what is being backed up * @param currentState * The current state if it exists. If this is empty then a new backup is being created with the associated `key` - * otherwise if this contains a [[State]] then the defined [[Sink]] needs to handle resuming a previously - * unfinished backup with that `key` by directly appending the [[ByteString]] data. + * otherwise if this contains a [[State]] then the defined [[akka.stream.scaladsl.Sink]] needs to handle resuming a + * previously unfinished backup with that `key` by directly appending the [[akka.util.ByteString]] data. * @return - * A [[Sink]] that given a [[ByteString]] (containing a single Kafka [[ReducedConsumerRecord]]) along with its - * [[kafkaClientInterface.CursorContext]] backs up the data to your data storage. The [[Sink]] is also responsible - * for executing [[kafkaClientInterface.commitCursor]] when the data is successfully backed up + * A [[akka.stream.scaladsl.Sink]] that given a [[akka.util.ByteString]] (containing a single Kafka + * [[io.aiven.guardian.kafka.models.ReducedConsumerRecord]]) along with its `kafkaClientInterface.CursorContext` + * backs up the data to your data storage. The [[akka.stream.scaladsl.Sink]] is also responsible for executing + * `kafkaClientInterface.commitCursor` when the data is successfully backed up */ def backupToStorageSink(key: String, currentState: Option[State] diff --git a/core-backup/src/main/scala/io/aiven/guardian/kafka/backup/configs/TimeConfiguration.scala b/core-backup/src/main/scala/io/aiven/guardian/kafka/backup/configs/TimeConfiguration.scala index 89dd41f4..3b87f30b 100644 --- a/core-backup/src/main/scala/io/aiven/guardian/kafka/backup/configs/TimeConfiguration.scala +++ b/core-backup/src/main/scala/io/aiven/guardian/kafka/backup/configs/TimeConfiguration.scala @@ -13,11 +13,11 @@ sealed trait TimeConfiguration */ final case class PeriodFromFirst(duration: FiniteDuration) extends TimeConfiguration -/** Backs up objects/files by collecting received Kafka messages into a single time slice based on a [[ChronoUnit]]. - * When suspending/resuming the backup client, this option will reuse existing objects/files if they fall into the - * currently configured `chronoUnit`. 
+/** Backs up objects/files by collecting received Kafka messages into a single time slice based on a
+ *   [[java.time.temporal.ChronoUnit]]. When suspending/resuming the backup client, this option will reuse existing
+ *   objects/files if they fall into the currently configured `chronoUnit`.
  * @param chronoUnit
- *   Timestamps for kafka messages that are contained within the configured [[ChronoUnit]] will be placed into the same
- *   object/file.
+ *   Timestamps for kafka messages that are contained within the configured [[java.time.temporal.ChronoUnit]] will be
+ *   placed into the same object/file.
  */
 final case class ChronoUnitSlice(chronoUnit: ChronoUnit) extends TimeConfiguration
diff --git a/core-restore/src/main/resources/application.conf b/core-restore/src/main/resources/reference.conf
similarity index 100%
rename from core-restore/src/main/resources/application.conf
rename to core-restore/src/main/resources/reference.conf
diff --git a/core-s3/src/main/resources/reference.conf b/core-s3/src/main/resources/reference.conf
index f2f26bc1..19f46913 100644
--- a/core-s3/src/main/resources/reference.conf
+++ b/core-s3/src/main/resources/reference.conf
@@ -6,6 +6,7 @@ alpakka.s3 {
    scheme = ${?ALPAKKA_S3_FORWARD_PROXY_SCHEME}
    host = ${?ALPAKKA_S3_FORWARD_PROXY_HOST}
    port = ${?ALPAKKA_S3_FORWARD_PROXY_PORT}
+
    credentials {
      username = ${?ALPAKKA_S3_FORWARD_PROXY_CREDENTIALS_USERNAME}
      password = ${?ALPAKKA_S3_FORWARD_PROXY_CREDENTIALS_PASSWORD}
diff --git a/docs/src/main/paradox/application/index.md b/docs/src/main/paradox/application/index.md
new file mode 100644
index 00000000..f118fd0d
--- /dev/null
+++ b/docs/src/main/paradox/application/index.md
@@ -0,0 +1,49 @@
+# Application
+
+Guardian is also packaged as a set of applications that let you run it via a CLI interface. Currently, the
+binaries provided are
+
+* restore: A binary which, when executed, allows you to restore an existing backup.
+* backup: A continuously running binary that performs the backup operation.
+
+The CLI follows POSIX guidelines, which means you can use `--help` as an argument to get information on all of the
+parameters.
+
+## Package formats
+
+Guardian is currently packaged using [sbt-native-packager](https://github.com/sbt/sbt-native-packager) to provide the
+following formats by using the sbt shell.
+
+* `rpm`
+  * restore: `cliRestore/rpm:packageBin`. Created `rpm` file will be contained
+    in `cli-restore/target/rpm/RPMS/noarch/`
+  * backup: `cliBackup/rpm:packageBin`. Created `rpm` file will be contained in `cli-backup/target/rpm/RPMS/noarch/`
+    NOTE: In order to build packages you need to have the [rpm-tools](https://rpm.org/) (specifically `rpmbuild`)
+    installed and available on `PATH`. Please consult your Linux distribution for more info
+* `zip`
+  * restore: `cliRestore/universal:packageBin`. Created `zip` file will be contained
+    in `cli-restore/target/universal/`
+  * backup: `cliBackup/universal:packageBin`. Created `zip` file will be contained in `cli-backup/target/universal/`
+* `tar`
+  * restore: `cliRestore/universal:packageZipTarball`. Created `tar` file will be contained
+    in `cli-restore/target/universal/`
+  * backup: `cliBackup/universal:packageZipTarball`. Created `tar` file will be contained
+    in `cli-backup/target/universal/`
+* `xz`
+  * restore: `cliRestore/universal:packageXzTarball`. Created `xz` file will be contained
+    in `cli-restore/target/universal/`
+  * backup: `cliBackup/universal:packageXzTarball`.
Created `xz` file will be contained
+    in `cli-backup/target/universal/`
+
+Note that for these package formats you need to have a JRE installed on your system to run the packaged application.
+For more details about packaging read the [docs](https://sbt-native-packager.readthedocs.io/en/latest/)
+
+## Design
+
+Each application is contained within a corresponding sbt submodule, i.e. the application for `backup` is contained
+within the `cli-backup` sbt submodule. The `core-cli` sbt submodule contains common cli arguments (i.e. `kafka-topics`).
+
+Scala packaging has been disabled for these submodules, which means that publishing/packaging Guardian won't push any
+built `.jar` files for them. This is because it's unnecessary: these applications are meant to be run as binaries and
+not included as a library. By the same token this also means that the cli modules are built with global inlining
+using `"-opt-inline-from:**"`, see [here](https://www.lightbend.com/blog/scala-inliner-optimizer) for more info.
diff --git a/docs/src/main/paradox/backup/configuration.md b/docs/src/main/paradox/backup/configuration.md
new file mode 100644
index 00000000..7f83a312
--- /dev/null
+++ b/docs/src/main/paradox/backup/configuration.md
@@ -0,0 +1,24 @@
+# Configuration
+
+## Reference
+
+@@snip (/core-backup/src/main/resources/reference.conf)
+
+Scala API doc @apidoc[kafka.backup.configs.Backup]
+
+## Explanation
+
+* `kafka-group-id`: The group id for the Kafka consumer that's used in the backup tool
+* `time-configuration`: How to slice the persisted keys/files based on time
+  * `type`: The type of time configuration. Either `period-from-first` or `chrono-unit-slice`
+    * `period-from-first`: Guardian will split up the backup keys/files determined by the `duration` specified. The
+      key/filename will be determined by the timestamp of the first message received from the Kafka consumer, with
+      each further key/filename being incremented by the configured `duration`. If Guardian is shut down then it
+      will terminate and complete the stream, with the final element in the JSON array being a `null`
+      * This is done so it's possible to determine whether a backup has been terminated by a shutdown of Guardian and
+        also because it's not really possible to resume using arbitrary durations.
+    * `chrono-unit-slice`: Guardian will split up the backup keys/files determined by the `chrono-unit`, which
+      represents intervals such as days and weeks. As such, when using this setting it's possible for Guardian to
+      resume from a previous uncompleted backup.
+  * `duration`: If the time configuration is `period-from-first` then this determines the maximum period of time for
+    each time slice.
+  * `chrono-unit`: If the time configuration is `chrono-unit-slice` then the `chrono-unit` determines the interval
+    (e.g. days, weeks) that each time slice covers.
diff --git a/docs/src/main/paradox/backup/index.md b/docs/src/main/paradox/backup/index.md
new file mode 100644
index 00000000..25402fb6
--- /dev/null
+++ b/docs/src/main/paradox/backup/index.md
@@ -0,0 +1,31 @@
+# Backup
+
+The backup module is responsible for backing up a specific set of Kafka topics into persistent storage. The backup
+runs as a continuous stream that is split into configurable time buckets. The format for backups is JSON, consisting
+of a large JSON array filled with JSON objects that have the following format.
+
+```json
+{
+  "topic": "kafka topic",
+  "partition": 0,
+  "offset": 0,
+  "key": "a2V5",
+  "value": "dmFsdWU=",
+  "timestamp": 0,
+  "timestamp_type": 0
+}
+```
+
+The `key` and `value` are Base64 encoded byte arrays (in the above example `"a2V5"` decodes to the string `key`
+and `"dmFsdWU="` decodes to the string `value`). This is due to the fact that the backup tool can make no assumptions
+about the format of the key or value, so we encode the raw byte arrays.
+
+One thing to note is that it's possible for the last JSON object in the JSON array to be `null`, see the
+[configuration](configuration.md) page for more info.
+
+@@toc { depth=2 }
+
+@@@ index
+
+* [configuration](configuration.md)
+
+@@@
diff --git a/docs/src/main/paradox/ci.md b/docs/src/main/paradox/ci.md
new file mode 100644
index 00000000..47d3c9a4
--- /dev/null
+++ b/docs/src/main/paradox/ci.md
@@ -0,0 +1,42 @@
+# CI - Continuous Integration
+
+Guardian uses github actions to perform CI whenever a pull request is made and when a pull request is merged into
+the main branch. CI is also responsible for publishing the documentation to GitHub Pages. The integration with github
+actions for the main build is performed using [sbt-github-actions][sbt-github-actions-link].
+
+## Design
+
+One thing to note about [sbt-github-actions][sbt-github-actions-link] is that it generates the github workflow files
+directly from the sbt [build definition file](https://github.com/aiven/guardian-for-apache-kafka/blob/main/build.sbt).
+This means that `build.sbt` is the source of truth and hence [sbt-github-actions][sbt-github-actions-link] also
+checks that the github workflow is in sync with `build.sbt` as part of the CI process.
+
+Essentially that means any changes to `build.sbt` (such as updating Scala versions) can also cause changes in github
+workflow actions. Likewise, if you need to make any custom changes to
+the [ci.yaml](https://github.com/aiven/guardian-for-apache-kafka/blob/main/.github/workflows/ci.yml) you need to do this
+in `build.sbt` using the [sbt-github-actions][sbt-github-actions-link] sbt DSL.
+
+To regenerate the relevant github workflow files after changes to `build.sbt` are done you need to run
+
+```
+githubWorkflowGenerate
+```
+
+in the sbt shell. For more information go [here](https://github.com/djspiewak/sbt-github-actions#generative-plugin)
+
+## Scalafmt
+
+In addition and separately to [sbt-github-actions][sbt-github-actions-link], Guardian also has a [scalafmt][scalafmt-link]
+pipeline that checks that the code is correctly formatted on each PR. This allows
+the [scalafmt pipeline](https://github.com/aiven/guardian-for-apache-kafka/blob/main/.github/workflows/format.yml) to
+run at the same time the main build does. Furthermore, it
+uses [scalafmt-native](https://scalameta.org/scalafmt/docs/installation.html#native-image) for improved runtime
+performance (typically it takes 5-10 seconds to check that the entire project is formatted).
+
+This means that if you ever update the scalafmt version in
+the [configuration file](https://github.com/aiven/guardian-for-apache-kafka/blob/main/.scalafmt.conf#L1) you also need
+to update it in
+the [scalafmt-pipeline](https://github.com/aiven/guardian-for-apache-kafka/blob/main/.github/workflows/format.yml#L26).
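+
+As a concrete illustration of the "`build.sbt` is the source of truth" approach described in the Design section above,
+the following excerpt (taken from this change) shows the sbt-github-actions DSL that produces the `Build project` and
+`Compile docs` steps of the generated workflow:
+
+```scala
+// build.sbt: each WorkflowStep.Sbt entry below is rendered into one step of
+// .github/workflows/ci.yml when the githubWorkflowGenerate task is run
+ThisBuild / githubWorkflowBuild := Seq(
+  WorkflowStep.Sbt(List("clean", "coverage", "test"), name = Some("Build project")),
+  WorkflowStep.Sbt(List("docs/makeSite"), name = Some("Compile docs"))
+)
+```
+
+Running `githubWorkflowGenerate` afterwards regenerates the workflow files from this definition.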
+ +[sbt-github-actions-link]: https://github.com/djspiewak/sbt-github-actions +[scalafmt-link]: https://scalameta.org/scalafmt/ diff --git a/docs/src/main/paradox/doc-generation.md b/docs/src/main/paradox/doc-generation.md new file mode 100644 index 00000000..ab0a3149 --- /dev/null +++ b/docs/src/main/paradox/doc-generation.md @@ -0,0 +1,42 @@ +# Document Generation + +Guardian uses [sbt-paradox][sbt-paradox-link] as the main plugin for generating documentation which is hosted +using [github pages][github-pages-link]. In addition various other plugins are used which are noted below + +* [sbt-paradox-api-doc](https://github.com/lightbend/sbt-paradox-apidoc): Allows you to directly link to Scala + documentation using the `@@apidoc` directive +* [sbt-paradox-project-info](https://github.com/lightbend/sbt-paradox-project-info): Provides an `@@projectInfo` + directive that derives common information about the project (such as dependencies, project info etc etc) +* [sbt-site](https://github.com/sbt/sbt-site): Used in conjunction with [sbt-paradox][sbt-paradox-link] to generate the + final site structure +* [sbt-ghpages](https://github.com/sbt/sbt-ghpages): Used for uploading the final site + to [github-pages][github-pages-link]. +* [sbt-unidoc](https://github.com/sbt/sbt-unidoc): Used to aggregate/concatenate documentation Scala API documentation + from various sbt modules into a single documentation result + +## Design + +[sbt-paradox][sbt-paradox-link] generates documentation using standard [Markdown](https://www.markdownguide.org/). The +documentation can be found in the [docs-folder](https://github.com/aiven/guardian-for-apache-kafka/tree/main/docs). Note +that this folder also corresponds to a sbt-module which is also named `docs` which also means that commands related to +documentation are run in that sbt sub-project (i.e. `docs/makeSite` generates the documentation site). + +Guardian also uses [scaladoc][scaladoc-link] which is already included within Scala compiler/SBT to generate Scala API +documentation. +[scaladoc][scaladoc-link] is analogous to Java's own [javadoc](https://en.wikipedia.org/wiki/Javadoc) which generates +API documentation that is written within the code itself. + +One advantage of using [sbt-paradox][sbt-paradox-link] and its various plugins as the main driver for documentation +generation is it that checks at document generation (i.e. compile time) that the docs are well-formed. This checking +includes + +* references to other links +* references to specific Scala API documentation directly using Scala classes/objects/traits +* TOC (table of contents) are well-formed (e.g. you don't have markdown files in `docs` which aren't referenced + anywhere) +* references to versions from Guardians various Scala submodules are always up-to-date +* references to code snippets + +[sbt-paradox-link]: https://github.com/lightbend/paradox +[github-pages-link]: https://pages.github.com/ +[scaladoc-link]: https://docs.scala-lang.org/style/scaladoc.html diff --git a/docs/src/main/paradox/index.md b/docs/src/main/paradox/index.md new file mode 100644 index 00000000..c0bf2e1a --- /dev/null +++ b/docs/src/main/paradox/index.md @@ -0,0 +1,26 @@ +# Guardian for Apache Kafka Documentation + +Guardian for Apache Kafka is an open source utility for backing up [Apache Kafka](https://kafka.apache.org/) clusters. 
+It is built using [Scala](https://www.scala-lang.org/) entirely +with [Akka-Streams](https://doc.akka.io/docs/akka/current/stream/index.html) +to ensure that the tool runs reliably and as desired with large datasets in different scenarios. + +@@toc { depth=2 } + +@@@ index + +* [overview](overview.md) +* [security](security.md) +* [ci](ci.md) +* [doc-generation](doc-generation.md) +* [application](application/index.md) +* [backup](backup/index.md) +* [persistence](persistence/index.md) +* [restore](restore/index.md) + +@@@ + +## Trademarks + +Apache Kafka is either a registered trademark or trademark of the Apache Software Foundation in the United States and/or +other countries. diff --git a/docs/src/main/paradox/overview.md b/docs/src/main/paradox/overview.md new file mode 100644 index 00000000..40593802 --- /dev/null +++ b/docs/src/main/paradox/overview.md @@ -0,0 +1,19 @@ +# Overview + +Guardian for Apache Kafka is an open source utility for backing up [Apache Kafka](https://kafka.apache.org/) clusters. +It is built using [Scala](https://www.scala-lang.org/) entirely +with [Akka-Streams](https://doc.akka.io/docs/akka/current/stream/index.html) +to ensure that the tool runs as desired with large datasets in different scenarios. + +## Versions + +The core modules are compiled against: + +* Akka Streams $akka.version$+ (@extref:[Reference](akka:stream/index.html), [Github](https://github.com/akka/akka)) +* Akka Streams Json $akka-streams-json.version$+ ([Github](https://github.com/mdedetrich/akka-streams-json)) +* PureConfig $pure-config.version$+ ([Reference](https://pureconfig.github.io/docs/), [Github](https://github.com/pureconfig/pureconfig)) +* ScalaLogging $scala-logging.version$+ ([Github](https://github.com/lightbend/scala-logging)) + +The cli modules are compiled against: + +* Decline $decline.version$+ ([Reference](https://ben.kirw.in/decline/), [Github](https://github.com/bkirwi/decline)) diff --git a/docs/src/main/paradox/persistence/design.md b/docs/src/main/paradox/persistence/design.md new file mode 100644 index 00000000..c282b13d --- /dev/null +++ b/docs/src/main/paradox/persistence/design.md @@ -0,0 +1,26 @@ +# Design + +Storage mechanisms are implemented via the @apidoc[BackupClientInterface] and @apidoc[RestoreClientInterface]. To add +custom storage mechanisms you need to implement these methods. These interfaces are designed to be as simple as possible +while being completely abstract to allow for any theoretical storage mechanism. + +## BackupClientInterface + +The @apidoc[BackupClientInterface] implements the entire backup flow including the resuming from a previously terminated +backup. Of note is the @apidoc[BackupClientInterface.State](BackupClientInterface) which is the data structure that is +returned when any previously existing backup for that key exists. This is provided to +@apidoc[BackupClientInterface.backupToStorageSink](BackupClientInterface) indicating whether the backup being performed +is a new backup or resuming from a previous one with the retrieval of the current state being defined by +@apidoc[BackupClientInterface.getCurrentUploadState](BackupClientInterface). + +Note that when implementing @apidoc[BackupClientInterface] you do not need to handle the corner cases regarding the +contents of the byte string when resuming/suspending/terminating, this is automatically handled for you. Essentially you +just need to handle how to store/push `ByteString` into the storage of your choice. 
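+
+To make the contract more concrete, below is a deliberately simplified sketch (not Guardian's actual S3/GCS
+implementation; the local file path is purely hypothetical) of the kind of `Sink` a `backupToStorageSink`
+implementation could hand back, simply appending the emitted `ByteString` chunks to a file named after the backup key:
+
+```scala
+import java.nio.file.{Paths, StandardOpenOption}
+
+import scala.concurrent.Future
+
+import akka.stream.IOResult
+import akka.stream.scaladsl.{FileIO, Sink}
+import akka.util.ByteString
+
+object LocalFileBackupExample {
+  // Append every incoming ByteString chunk to a local file derived from the backup key.
+  // A real implementation would also commit the Kafka cursor once the data is safely stored.
+  def appendToFileSink(key: String): Sink[ByteString, Future[IOResult]] =
+    FileIO.toPath(
+      Paths.get(s"/tmp/guardian-backup/$key"),
+      Set(StandardOpenOption.CREATE, StandardOpenOption.WRITE, StandardOpenOption.APPEND)
+    )
+}
+```
+
+A real persistence module such as the S3 one instead streams these chunks to object storage and uses the [[State]]
+returned by @apidoc[BackupClientInterface.getCurrentUploadState](BackupClientInterface) to resume unfinished uploads.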
+## RestoreClientInterface
+
+The @apidoc[RestoreClientInterface] implements restoration from an existing backup. Implementing this is quite simple:
+you need to define @apidoc[RestoreClientInterface.retrieveBackupKeys](RestoreClientInterface), which returns all valid
+keys to restore (i.e. don't include currently in progress backup keys), and
+@apidoc[RestoreClientInterface.downloadFlow](RestoreClientInterface), which is an akka-stream `Flow` that takes
+a `String` which is the key and outputs the content of that key.
diff --git a/docs/src/main/paradox/persistence/index.md b/docs/src/main/paradox/persistence/index.md
new file mode 100644
index 00000000..3d1f949e
--- /dev/null
+++ b/docs/src/main/paradox/persistence/index.md
@@ -0,0 +1,12 @@
+# Persistence Modules
+
+Guardian for Apache Kafka has a modular architecture that provides support for different persistence backends.
+
+@@toc { depth=2 }
+
+@@@ index
+
+* [design](design.md)
+* [S3](s3/index.md)
+
+@@@
\ No newline at end of file
diff --git a/docs/src/main/paradox/persistence/s3/configuration.md b/docs/src/main/paradox/persistence/s3/configuration.md
new file mode 100644
index 00000000..ba270dbd
--- /dev/null
+++ b/docs/src/main/paradox/persistence/s3/configuration.md
@@ -0,0 +1,15 @@
+# S3
+
+## Reference
+
+@@snip (/core-s3/src/main/resources/reference.conf)
+
+Scala API doc @apidoc[kafka.s3.configs.S3]
+
+## Explanation
+
+* `s3-headers`: See @extref:[documentation](alpakka:akka/stream/alpakka/s3/headers/index.html)
+* `alpakka.s3`: See @extref:[documentation](alpakka-docs:s3.html#configuration)
+* `s3-config`: Core S3 configuration about where to persist the data
+  * `data-bucket`: The main S3 bucket where data is backed up and where to restore data from
+  * `data-bucket-prefix`: S3 prefix configuration to be used when searching for the bucket
diff --git a/docs/src/main/paradox/persistence/s3/index.md b/docs/src/main/paradox/persistence/s3/index.md
new file mode 100644
index 00000000..0b97c2eb
--- /dev/null
+++ b/docs/src/main/paradox/persistence/s3/index.md
@@ -0,0 +1,12 @@
+# S3
+
+The S3 persistence module allows you to store kafka backups on [AWS S3 Cloud Storage](https://aws.amazon.com/s3/).
+
+@@toc { depth=2 }
+
+@@@ index
+
+* [configuration](configuration.md)
+
+@@@
+
diff --git a/docs/src/main/paradox/restore/configuration.md b/docs/src/main/paradox/restore/configuration.md
new file mode 100644
index 00000000..853a3328
--- /dev/null
+++ b/docs/src/main/paradox/restore/configuration.md
@@ -0,0 +1,13 @@
+# Configuration
+
+## Reference
+
+@@snip (/core-restore/src/main/resources/reference.conf)
+
+Scala API doc @apidoc[kafka.restore.configs.Restore]
+
+## Explanation
+
+* `from-when`: An `ISO-8601` time that specifies from when topics need to be restored. Note that the time used is based
+  on the original Kafka timestamp and **NOT** the current time.
+* `override-topics`: A mapping of currently backed up topics to a new topic in the destination Kafka cluster
diff --git a/docs/src/main/paradox/restore/index.md b/docs/src/main/paradox/restore/index.md
new file mode 100644
index 00000000..931bb71d
--- /dev/null
+++ b/docs/src/main/paradox/restore/index.md
@@ -0,0 +1,13 @@
+# Restore
+
+The restore module is responsible for streaming data from a backup storage location into a fresh cluster in the
+event of disaster recovery. The restore is able to work with any format of backed up files created by Guardian's
+backup tool.
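+
+Persistence modules plug into the restore stream through `RestoreClientInterface` (see the persistence design page):
+restore only needs the list of valid backup keys and a `Flow` that downloads each key's content. A minimal sketch,
+assuming purely for illustration that backups are plain files on the local filesystem rather than objects in S3/GCS:
+
+```scala
+import java.nio.file.Paths
+
+import akka.NotUsed
+import akka.stream.scaladsl.{FileIO, Flow}
+import akka.util.ByteString
+
+object LocalFileRestoreExample {
+  // For every backup key emitted upstream, stream back the contents of the
+  // corresponding local file as ByteString chunks for the restore stream to parse.
+  val downloadFlow: Flow[String, ByteString, NotUsed] =
+    Flow[String].flatMapConcat { key =>
+      FileIO.fromPath(Paths.get(s"/tmp/guardian-backup/$key"))
+    }
+}
+```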
+@@toc { depth=2 }
+
+@@@ index
+
+* [configuration](configuration.md)
+
+@@@
diff --git a/docs/src/main/paradox/security.md b/docs/src/main/paradox/security.md
new file mode 100644
index 00000000..8eb764c3
--- /dev/null
+++ b/docs/src/main/paradox/security.md
@@ -0,0 +1,29 @@
+# Security
+
+## OWASP Report
+
+Guardian uses [sbt-dependency-check](https://github.com/albuch/sbt-dependency-check) to generate
+a [dependency-check-report][dependency-check-report-link], which checks direct and transitive dependencies for
+vulnerabilities against the [NVD](https://nvd.nist.gov/). The report is an HTML file that can be viewed in a standard
+browser.
+
+### Generating a report
+
+You can use the sbt shell to generate a report at any time using
+
+```
+dependencyCheckAggregate
+```
+
+This will overwrite the [current report file][dependency-check-report-link].
+
+### Suppressing false positives
+
+Sometimes it is possible that a false positive gets generated in the report. To suppress a false positive, first open
+the [report file][dependency-check-report-link] in a supported browser. In the list of found vulnerabilities there
+should be a suppress button which, when clicked, displays a popup containing an XML suppression entry. You then add
+that `<suppress>` tag entry to the
+existing [suppression-file](https://github.com/aiven/guardian-for-apache-kafka/edit/main/dependency-check/suppression.xml).
+Finally, regenerate the report using sbt's `dependencyCheckAggregate`
+
+[dependency-check-report-link]: https://github.com/aiven/guardian-for-apache-kafka/blob/main/dependency-check/dependency-check-report.html
diff --git a/project/plugins.sbt b/project/plugins.sbt
index 323c250b..5ee3a87a 100644
--- a/project/plugins.sbt
+++ b/project/plugins.sbt
@@ -1,10 +1,23 @@
-addSbtPlugin("org.scalameta" % "sbt-scalafmt" % "2.4.6")
-addSbtPlugin("com.lightbend.paradox" % "sbt-paradox" % "0.9.2")
-addSbtPlugin("com.github.sbt" % "sbt-native-packager" % "1.9.9")
-addSbtPlugin("com.codecommit" % "sbt-github-actions" % "0.14.2")
-addSbtPlugin("com.github.sbt" % "sbt-pgp" % "2.1.2")
-addSbtPlugin("com.github.sbt" % "sbt-release" % "1.1.0")
-addSbtPlugin("ch.epfl.scala" % "sbt-scalafix" % "0.9.34")
-addSbtPlugin("org.scoverage" % "sbt-scoverage" % "1.9.3")
-addSbtPlugin("org.scoverage" % "sbt-coveralls" % "1.3.2")
-addSbtPlugin("net.vonbuchholtz" % "sbt-dependency-check" % "4.0.0")
+addSbtPlugin("org.scalameta"                     % "sbt-scalafmt"             % "2.4.6")
+addSbtPlugin("com.lightbend.paradox"             % "sbt-paradox"              % "0.9.2")
+addSbtPlugin("com.lightbend.paradox"             % "sbt-paradox-apidoc"       % "0.10+8-1685fc09")
+addSbtPlugin("com.lightbend.paradox"             % "sbt-paradox-project-info" % "1.1.3")
+addSbtPlugin("com.github.sbt"                    % "sbt-unidoc"               % "0.5.0")
+addSbtPlugin("com.typesafe.sbt"                  % "sbt-ghpages"              % "0.6.3")
+addSbtPlugin("com.thoughtworks.sbt-api-mappings" % "sbt-api-mappings"         % "3.0.0+82-b1fe858b")
+addSbtPlugin("com.typesafe.sbt"                  % "sbt-site"                 % "1.4.1")
+addSbtPlugin("com.github.sbt"                    % "sbt-native-packager"      % "1.9.9")
+addSbtPlugin("com.codecommit"                    % "sbt-github-actions"       % "0.14.2")
+addSbtPlugin("com.github.sbt"                    % "sbt-pgp"                  % "2.1.2")
+addSbtPlugin("com.github.sbt"                    % "sbt-release"              % "1.1.0")
+addSbtPlugin("ch.epfl.scala"                     % "sbt-scalafix"             % "0.9.34")
+addSbtPlugin("org.scoverage"                     % "sbt-scoverage"            % "1.9.3")
+addSbtPlugin("org.scoverage"                     % "sbt-coveralls"            % "1.3.2")
+addSbtPlugin("net.vonbuchholtz"                  % "sbt-dependency-check"     % "4.0.0")
+
+// This is here to bump dependencies for sbt-paradox/sbt-site, see
https://github.com/sirthias/parboiled/issues/175 and https://github.com/sirthias/parboiled/issues/128 +libraryDependencies ++= Seq( + "org.parboiled" %% "parboiled-scala" % "1.4.0", + "org.parboiled" % "parboiled-java" % "1.4.0" +) diff --git a/project/project-info.conf b/project/project-info.conf new file mode 100644 index 00000000..234e62b3 --- /dev/null +++ b/project/project-info.conf @@ -0,0 +1,57 @@ +project-info { + version: "current" + labels: "https://github.com/aiven/guardian-for-apache-kafka/labels/p%3A" + scaladoc: "https://doc.akka.io/api/alpakka/"${project-info.version}"/akka/stream/alpakka/" + shared-info { + jdk-versions: ["Adopt OpenJDK 11", "Adopt OpenJDK 17"] + snapshots: { + url: "other-docs/snapshots.html" + text: "Snapshots are available" + new-tab: false + } + issues: { + url: "https://github.com/aiven/guardian-for-apache-kafka/issues" + text: "Github issues" + } + release-notes: { + url: "https://github.com/aiven/guardian-for-apache-kafka/releases" + text: "GitHub releases" + } + } + backupS3: ${project-info.shared-info} { + title: "Backup S3" + jpms-name: "io.aiven.guardian.kafka.backup.s3" + } + cliBackup: ${project-info.shared-info} { + title: "CLI Backup" + jpms-name: "io.aiven.guardian.kafka.backup" + } + cliRestore: ${project-info.shared-info} { + title: "CLI Restore" + jpms-name: "io.aiven.guardian.kafka.restore" + } + core: ${project-info.shared-info} { + title: "Core" + jpms-name: "io.aiven.guardian.kafka" + } + coreBackup: ${project-info.shared-info} { + title: "Core Backup" + jpms-name: "io.aiven.guardian.kafka.backup" + } + coreCli: ${project-info.shared-info} { + title: "Core CLI" + jpms-name: "io.aiven.guardian.cli" + } + coreRestore: ${project-info.shared-info} { + title: "Core Restore" + jpms-name: "io.aiven.guardian.kafka.restore" + } + coreS3: ${project-info.shared-info} { + title: "Core S3" + jpms-name: "io.aiven.guardian.kafka.restore" + } + restoreS3: ${project-info.shared-info} { + title: "Restore S3" + jpms-name: "io.aiven.guardian.kafka.restore.s3" + } +}