Add sbt-paradox docs #139

Merged
merged 1 commit on Mar 10, 2022
59 changes: 59 additions & 0 deletions .github/workflows/ci.yml
@@ -64,8 +64,67 @@ jobs:
- name: Build project
run: sbt ++${{ matrix.scala }} clean coverage test

- name: Compile docs
run: sbt ++${{ matrix.scala }} docs/makeSite

- name: Upload coverage data to Coveralls
env:
COVERALLS_REPO_TOKEN: ${{ secrets.GITHUB_TOKEN }}
COVERALLS_FLAG_NAME: Scala ${{ matrix.scala }}
run: sbt ++${{ matrix.scala }} coverageReport coverageAggregate coveralls

- name: Compress target directories
run: tar cf targets.tar target cli-compaction/target compaction-gcs/target backup-s3/target compaction-s3/target docs/target cli-backup/target core-restore/target restore-s3/target core-gcs/target core-compaction/target core-s3/target core-backup/target core-cli/target cli-restore/target core/target restore-gcs/target backup-gcs/target project/target

- name: Upload target directories
uses: actions/upload-artifact@v2
with:
name: target-${{ matrix.os }}-${{ matrix.scala }}-${{ matrix.java }}
path: targets.tar

publish:
name: Publish Artifacts
needs: [build]
if: github.event_name != 'pull_request' && (github.ref == 'refs/heads/main')
strategy:
matrix:
os: [ubuntu-latest]
scala: [2.13.8]
java: [temurin@11]
runs-on: ${{ matrix.os }}
steps:
- name: Checkout current branch (full)
uses: actions/checkout@v2
with:
fetch-depth: 0

- name: Setup Java (temurin@11)
if: matrix.java == 'temurin@11'
uses: actions/setup-java@v2
with:
distribution: temurin
java-version: 11

- name: Cache sbt
uses: actions/cache@v2
with:
path: |
~/.sbt
~/.ivy2/cache
~/.coursier/cache/v1
~/.cache/coursier/v1
~/AppData/Local/Coursier/Cache/v1
~/Library/Caches/Coursier/v1
key: ${{ runner.os }}-sbt-cache-v2-${{ hashFiles('**/*.sbt') }}-${{ hashFiles('project/build.properties') }}

- name: Download target directories (2.13.8)
uses: actions/download-artifact@v2
with:
name: target-${{ matrix.os }}-2.13.8-${{ matrix.java }}

- name: Inflate target directories (2.13.8)
run: |
tar xf targets.tar
rm targets.tar

- run: sbt ++${{ matrix.scala }} docs/ghpagesPushSite
3 changes: 3 additions & 0 deletions .jvmopts
@@ -0,0 +1,3 @@
-XX:+IgnoreUnrecognizedVMOptions
Contributor

why do you need -XX:+IgnoreUnrecognizedVMOptions?

Contributor Author
@mdedetrich mdedetrich Mar 10, 2022

In that file, some options only apply to specific JVM versions, and by default, if you pass the JVM an option that doesn't exist, it will immediately terminate. --illegal-access=permit only works on JVM 16 and --add-opens java.base/java.lang=ALL-UNNAMED only works on JVM 17

Contributor

Maybe we could do better?

Why I'm concerned: we should not silence invalid JVM command-line options/flags; users should be aware of any mismatches here.

Contributor Author
@mdedetrich mdedetrich Mar 10, 2022

I thought of this originally, but the issue here is the typical ways that sbt is used. For example, a lot of people (myself included) use IntelliJ's sbt integration when working with Scala projects. When using sbt like this, the only way to pass JVM arguments is with .jvmopts, since IntelliJ behind the scenes uses sbt directly via a jar for its integration. The same applies to the official sbt launcher script which is included when you install sbt via your operating system's package manager (homebrew, apt, pacman, etc.).

Ideally sbt would support dynamically applying different JVM options depending on the JVM version, but this doesn't exist yet. Note the complication here is that these JVM args are for the build tool itself, not for the runtime of the project that gets compiled. It's trivial for sbt to use different JVM arguments when you run tests/programs compiled by sbt (sbt supports forking the JVM with different JVM args when running tests), but in this case the JVM arguments are needed for the build tool itself (the documentation generation is done via an sbt plugin)
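To illustrate the part sbt can already handle, here is a minimal build.sbt sketch of forking tests with their own JVM arguments (the flag shown is just an example, not taken from this build):

```scala
// Forked test JVMs get their own flags; these apply to the tests, not to sbt itself
Test / fork := true
Test / javaOptions ++= Seq(
  "--add-opens", "java.base/java.lang=ALL-UNNAMED" // example flag only
)
```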

Contributor

👍 all set!

--add-opens java.base/java.lang=ALL-UNNAMED
--illegal-access=permit
17 changes: 15 additions & 2 deletions CONTRIBUTING.md
@@ -67,8 +67,8 @@ the [scalafmt installation guide][scalafmt-installation-link] for more details
* There are native builds of Scalafmt that let you run a `scalafmt` as a CLI tool, see the CLI section in
[scalafmt installation guide][scalafmt-installation-link]

Note that a github action exists which will check that your PR is formatted when you create it. The check runs
separately ad in parallel to the main build/tests
Note that a github action exists which will check that your code is formatted whenever you create a PR. For more details
read the [documentation](https://aiven.github.io/guardian-for-apache-kafka/ci.html#scalafmt)

## sbt - Compiling, Building and Testing

@@ -81,6 +81,19 @@ it will start a REPL session where you can type in commands, i.e.
* `core/compile` will only compile the `core` project. See [build.sbt](build.sbt) to get a reference for how the projects
are named
* `publishLocal` will publish the project into the local `~/.m2` repository
* `clean` will clean all build targets (including documentation) from the project. Note that sbt stores build output
  in sub-directories named `target`
* `reload` will reload sbt, which is needed whenever the [sbt][sbt-link] build definition is changed

## sbt - Documentation

Documentation is also built within sbt, i.e.

* `docs/makeSite` will compile documentation
* `docs/previewSite` will compile documentation (if needed) and open the result in your system's default browser

For details about how the documentation generation works, see
[the documentation](https://aiven.github.io/guardian-for-apache-kafka/doc-generation.html)

[adopt-openjdk-link]: https://adoptopenjdk.net/
[metals-link]: https://scalameta.org/metals/
12 changes: 9 additions & 3 deletions README.md
@@ -4,8 +4,14 @@

# Guardian for Apache Kafka®

Guardian is a backup and restore tool for Apache Kafka clusters. It is designed to continuously stream kafka topics
into persistent/object storages such as S3 and also provides tools for restoring said backups.
Guardian is a backup and restore tool for Apache Kafka clusters. It is designed to continuously stream Kafka topics into
persistent/object storage such as S3 and also provides tools for restoring said backups.

## Documentation

* [Guardian reference](https://aiven.github.io/guardian-for-apache-kafka/) documentation.

## Trademarks
Apache Kafka is either a registered trademark or trademark of the Apache Software Foundation in the United States and/or other countries.

Apache Kafka is either a registered trademark or trademark of the Apache Software Foundation in the United States and/or
other countries.
88 changes: 85 additions & 3 deletions build.sbt
@@ -1,4 +1,5 @@
import com.jsuereth.sbtpgp.PgpKeys.publishSigned
import com.lightbend.paradox.apidoc.ApidocPlugin.autoImport.apidocRootPackage

ThisBuild / scalaVersion := "2.13.8"
ThisBuild / organization := "aiven.io"
@@ -75,6 +76,33 @@ val cliSettings = Seq(

val baseName = "guardian-for-apache-kafka"

lazy val guardian = project
.in(file("."))
.enablePlugins(ScalaUnidocPlugin)
.disablePlugins(SitePlugin)
.aggregate(
core,
coreCli,
coreS3,
coreGCS,
coreBackup,
backupS3,
backupGCS,
cliBackup,
coreCompaction,
compactionS3,
compactionGCS,
cliCompaction,
coreRestore,
restoreS3,
restoreGCS,
cliRestore
)
.settings(
publish / skip := true,
crossScalaVersions := List() // workaround for https://github.com/sbt/sbt/issues/3465
)

lazy val core = project
.in(file("core"))
.settings(
Expand Down Expand Up @@ -268,12 +296,65 @@ lazy val cliRestore = project
)
.enablePlugins(JavaAppPackaging)

// Strips the patch segment from a version string, e.g. "2.6.19" -> "2.6", used to build versioned documentation URLs
def binaryVersion(key: String): String = key.substring(0, key.lastIndexOf('.'))

lazy val docs = project
.enablePlugins(ParadoxPlugin, ParadoxSitePlugin, PreprocessPlugin, GhpagesPlugin)
.settings(
Compile / paradox / name := "Guardian for Apache Kafka",
publish / skip := true,
makeSite := makeSite.dependsOn(LocalRootProject / ScalaUnidoc / doc).value,
previewPath := (Paradox / siteSubdirName).value,
paradoxTheme := Some(builtinParadoxTheme("generic")),
apidocRootPackage := "io.aiven.guardian",
Preprocess / siteSubdirName := s"api/${projectInfoVersion.value}",
Preprocess / sourceDirectory := (LocalRootProject / ScalaUnidoc / unidoc / target).value,
git.remoteRepo := scmInfo.value.get.connection.replace("scm:git:", ""),
paradoxGroups := Map("Language" -> Seq("Scala")),
paradoxProperties ++= Map(
"akka.version" -> akkaVersion,
"akka-http.version" -> akkaHttpVersion,
"akka-streams-json.version" -> akkaStreamsJson,
"pure-config.version" -> pureConfigVersion,
"decline.version" -> declineVersion,
"scala-logging.version" -> scalaLoggingVersion,
"extref.akka.base_url" -> s"https://doc.akka.io/docs/akka/${binaryVersion(akkaVersion)}/%s",
"extref.akka-stream-json.base_url" -> s"https://github.com/mdedetrich/akka-streams-json",
"extref.alpakka.base_url" -> s"https://doc.akka.io/api/alpakka/${binaryVersion(alpakkaVersion)}/%s",
"extref.alpakka-docs.base_url" -> s"https://docs.akka.io/docs/alpakka/${binaryVersion(alpakkaVersion)}/%s",
"extref.pureconfig.base_url" -> s"https://pureconfig.github.io/docs/",
"scaladoc.io.aiven.guardian.base_url" -> s"/guardian-for-apache-kafka/${(Preprocess / siteSubdirName).value}/"
)
)

ThisBuild / homepage := Some(url("https://github.com/aiven/guardian-for-apache-kafka"))

ThisBuild / scmInfo := Some(
ScmInfo(url("https://github.com/aiven/guardian-for-apache-kafka"),
"scm:git:[email protected]:aiven/guardian-for-apache-kafka.git"
)
)

ThisBuild / developers := List(
Developer("jlprat", "Josep Prat", "[email protected]", url("https://github.com/jlprat")),
Developer("mdedetrich", "Matthew de Detrich", "[email protected]", url("https://github.com/mdedetrich")),
Developer("reta", "Andriy Redko", "[email protected]", url("https://github.com/reta"))
)

maintainer := "[email protected]"

ThisBuild / licenses += ("Apache-2.0", url("https://opensource.org/licenses/Apache-2.0"))

// This is currently causing problems, see https://github.com/djspiewak/sbt-github-actions/issues/74
ThisBuild / githubWorkflowUseSbtThinClient := false

ThisBuild / githubWorkflowTargetBranches := Seq("main") // Once we have branches per version, add the pattern here
ThisBuild / githubWorkflowTargetBranches := Seq("main")

// Once we have branches per version, add the pattern here, see
// https://github.com/djspiewak/sbt-github-actions#integration-with-sbt-ci-release
ThisBuild / githubWorkflowPublishTargetBranches := Seq(RefPredicate.Equals(Ref.Branch("main")))

ThisBuild / githubWorkflowPublishTargetBranches := Seq()
ThisBuild / githubWorkflowPublish := Seq(WorkflowStep.Sbt(List("docs/ghpagesPushSite")))

ThisBuild / githubWorkflowBuildPreamble := Seq(
WorkflowStep.Sbt(List("scalafixAll --check"), name = Some("Linter: Scalafix checks"))
@@ -300,7 +381,8 @@ ThisBuild / githubWorkflowEnv ++= Map(
ThisBuild / githubWorkflowJavaVersions := List(JavaSpec.temurin("11"))

ThisBuild / githubWorkflowBuild := Seq(
WorkflowStep.Sbt(List("clean", "coverage", "test"), name = Some("Build project"))
WorkflowStep.Sbt(List("clean", "coverage", "test"), name = Some("Build project")),
WorkflowStep.Sbt(List("docs/makeSite"), name = Some("Compile docs"))
)

ThisBuild / githubWorkflowBuildPostamble ++= Seq(
@@ -69,38 +69,39 @@ trait BackupClientInterface[T <: KafkaClientInterface] extends StrictLogging {
* @param key
* The object key or filename for what is currently being backed up
* @return
* A [[Future]] with a [[UploadStateResult]] data structure that optionally contains the state associated with
* `key` along with the previous latest state before `key` (if it exists)
* A [[scala.concurrent.Future]] with a [[UploadStateResult]] data structure that optionally contains the state
* associated with `key` along with the previous latest state before `key` (if it exists)
*/
def getCurrentUploadState(key: String): Future[UploadStateResult]

/** A sink that is executed whenever a previously existing Backup needs to be terminated and closed. Generally
* speaking this [[Sink]] is similar to the [[backupToStorageSink]] except that
* [[kafkaClientInterface.CursorContext]] is not required since no Kafka messages are being written.
* speaking this [[akka.stream.scaladsl.Sink]] is similar to the `backupToStorageSink` except that
* `kafkaClientInterface.CursorContext` is not required since no Kafka messages are being written.
*
* Note that the terminate refers to the fact that this Sink is executed with a `null]` [[Source]] which when written
* to an already existing unfinished backup terminates the containing JSON array so that it becomes valid parsable
* JSON.
Note that "terminate" refers to the fact that this Sink is executed with a `null`
[[akka.stream.scaladsl.Source]] which, when written to an already existing unfinished backup, terminates the
containing JSON array so that it becomes valid parsable JSON.
* @param previousState
* A data structure containing both the [[State]] along with the associated key which you can refer to in order to
* define your [[Sink]]
* define your [[akka.stream.scaladsl.Sink]]
* @return
* A [[Sink]] that points to an existing key defined by `previousState.previousKey`
* A [[akka.stream.scaladsl.Sink]] that points to an existing key defined by `previousState.previousKey`
*/
def backupToStorageTerminateSink(previousState: PreviousState): Sink[ByteString, Future[BackupResult]]

/** Override this method to define how to backup a `ByteString` combined with Kafka
/** Override this method to define how to backup a [[akka.util.ByteString]] combined with Kafka
* `kafkaClientInterface.CursorContext` to a `DataSource`
* @param key
* The object key or filename for what is being backed up
* @param currentState
* The current state if it exists. If this is empty then a new backup is being created with the associated `key`
* otherwise if this contains a [[State]] then the defined [[Sink]] needs to handle resuming a previously
* unfinished backup with that `key` by directly appending the [[ByteString]] data.
* otherwise if this contains a [[State]] then the defined [[akka.stream.scaladsl.Sink]] needs to handle resuming a
* previously unfinished backup with that `key` by directly appending the [[akka.util.ByteString]] data.
* @return
* A [[Sink]] that given a [[ByteString]] (containing a single Kafka [[ReducedConsumerRecord]]) along with its
* [[kafkaClientInterface.CursorContext]] backs up the data to your data storage. The [[Sink]] is also responsible
* for executing [[kafkaClientInterface.commitCursor]] when the data is successfully backed up
* A [[akka.stream.scaladsl.Sink]] that given a [[akka.util.ByteString]] (containing a single Kafka
* [[io.aiven.guardian.kafka.models.ReducedConsumerRecord]]) along with its `kafkaClientInterface.CursorContext`
* backs up the data to your data storage. The [[akka.stream.scaladsl.Sink]] is also responsible for executing
* `kafkaClientInterface.commitCursor` when the data is successfully backed up
*/
def backupToStorageSink(key: String,
currentState: Option[State]
@@ -13,11 +13,11 @@ sealed trait TimeConfiguration
*/
final case class PeriodFromFirst(duration: FiniteDuration) extends TimeConfiguration

/** Backs up objects/files by collecting received Kafka messages into a single time slice based on a [[ChronoUnit]].
* When suspending/resuming the backup client, this option will reuse existing objects/files if they fall into the
* currently configured `chronoUnit`.
/** Backs up objects/files by collecting received Kafka messages into a single time slice based on a
* [[java.time.temporal.ChronoUnit]]. When suspending/resuming the backup client, this option will reuse existing
* objects/files if they fall into the currently configured `chronoUnit`.
* @param chronoUnit
* Timestamps for kafka messages that are contained within the configured [[ChronoUnit]] will be placed into the same
* object/file.
* Timestamps for kafka messages that are contained within the configured [[java.time.temporal.ChronoUnit]] will be
* placed into the same object/file.
*/
final case class ChronoUnitSlice(chronoUnit: ChronoUnit) extends TimeConfiguration
1 change: 1 addition & 0 deletions core-s3/src/main/resources/reference.conf
@@ -6,6 +6,7 @@ alpakka.s3 {
scheme = ${?ALPAKKA_S3_FORWARD_PROXY_SCHEME}
host = ${?ALPAKKA_S3_FORWARD_PROXY_HOST}
port = ${?ALPAKKA_S3_FORWARD_PROXY_PORT}

credentials {
username = ${?ALPAKKA_S3_FORWARD_PROXY_CREDENTIALS_USERNAME}
password = ${?ALPAKKA_S3_FORWARD_PROXY_CREDENTIALS_PASSWORD}
49 changes: 49 additions & 0 deletions docs/src/main/paradox/application/index.md
@@ -0,0 +1,49 @@
# Application

Guardian is also packaged as various applications that let you run it via a CLI interface. Currently, the
binaries provided are:

* backup: A continuously running binary that performs the backup operation.
* restore: A binary which, when executed, restores an existing backup.

The CLI follows POSIX guidelines, which means you can pass `--help` as an argument to get information on all of the
parameters.
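
For illustration, the build pulls in the decline library for argument parsing (see `decline.version` in the build.sbt diff above); a minimal sketch of a POSIX-style command with automatic `--help` support follows — the option names are hypothetical, not Guardian's actual flags:

```scala
import cats.implicits._
import com.monovore.decline._

// Sketch only: option names are hypothetical, not Guardian's real CLI flags
object BackupCliSketch extends CommandApp(
  name = "backup",
  header = "Continuously back up Kafka topics into object storage",
  main = {
    val topics  = Opts.options[String]("kafka-topics", help = "Topics to back up")
    val groupId = Opts.option[String]("kafka-group-id", help = "Kafka consumer group id")
    (topics, groupId).mapN { (ts, gid) =>
      println(s"Backing up ${ts.toList.mkString(", ")} with consumer group $gid")
    }
  }
)
```

Running such a command with `--help` prints the generated usage text, which is the behaviour the paragraph above refers to.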

## Package formats

Guardian is currently packaged using [sbt-native-packager](https://github.com/sbt/sbt-native-packager) to provide the
following formats, built from the sbt shell:

* `rpm`
* restore: `cliRestore/rpm:packageBin`. Created `rpm` file will be contained
in `cli-restore/target/rpm/RPMS/noarch/`
* backup: `cliBackup/rpm:packageBin`. Created `rpm` file will be contained in `cli-backup/target/rpm/RPMS/noarch/`
NOTE: In order to build packages you need to have the [rpm-tools](https://rpm.org/) (specifically `rpmbuild`)
installed and available on `PATH`. Please consult your Linux distribution for more info
* `zip`
* restore: `cliRestore/universal:packageBin`. Created `zip` file will be contained
in `cli-restore/target/universal/`
* backup: `cliBackup/universal:packageBin`. Created `zip` file will be contained in `cli-backup/target/universal/`
* `tar`
* restore: `cliRestore/universal:packageZipTarball`. Created `tar` file will be contained
in `cli-restore/target/universal/`
* backup: `cliBackup/universal:packageZipTarball`. Created `tar` file will be contained
in `cli-backup/target/universal/`
* `xz`
* restore: `cliRestore/universal:packageXzTarball`. Created `xz` file will be contained
in `cli-restore/target/universal/`
* backup: `cliBackup/universal:packageXzTarball`. Created `xz` file will be contained
in `cli-backup/target/universal/`

Note that for these package formats you need to have a JRE installed on your system to run the package. For more details
about packaging read the [sbt-native-packager docs](https://sbt-native-packager.readthedocs.io/en/latest/)
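
For orientation, these tasks come from sbt-native-packager being enabled on the CLI submodules (the build.sbt diff in this PR shows `.enablePlugins(JavaAppPackaging)`); a rough sketch of that wiring, with the rpm metadata values as assumptions:

```scala
// Sketch only: JavaAppPackaging provides the universal (zip/tar/xz) and rpm packaging tasks
lazy val cliBackupSketch = project
  .in(file("cli-backup"))
  .enablePlugins(JavaAppPackaging)
  .settings(
    maintainer := "[email protected]",
    rpmVendor  := "Aiven",           // assumption: rpm metadata is not shown in this diff
    rpmLicense := Some("Apache-2.0") // assumption: required by the rpm packager
  )
```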

## Design

Each application is contained within a corresponding sbt submodule, e.g. the application for `backup` is contained
within the `cli-backup` sbt submodule. The `core-cli` sbt submodule contains common CLI arguments (e.g. `kafka-topics`).

Scala packaging has been disabled for these submodules, which means that when publishing/packaging Guardian it won't
push any built `.jar` files. This is unnecessary because you are meant to run these applications as binaries rather than
include them as libraries. By the same token, the cli modules are built with global inlining
using `"-opt-inline-from:**"`, see [here](https://www.lightbend.com/blog/scala-inliner-optimizer) for more info.
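
A rough sketch of what such CLI-module settings can look like in sbt — the exact settings block is not part of this diff, so treat the shape as an assumption:

```scala
// Sketch (assumed shape): CLI modules publish no library jar and compile with global inlining
val cliInliningSketch = Seq(
  publish / skip := true,                          // don't publish the CLI modules as libraries
  Compile / packageDoc / publishArtifact := false, // skip scaladoc artifacts for the binaries
  scalacOptions ++= Seq(
    "-opt:l:inline",      // enable the Scala 2 optimizer's inliner
    "-opt-inline-from:**" // allow inlining from anywhere on the classpath (global inlining)
  )
)
```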
24 changes: 24 additions & 0 deletions docs/src/main/paradox/backup/configuration.md
@@ -0,0 +1,24 @@
# Configuration

## Reference

@@snip (/core-backup/src/main/resources/reference.conf)

Scala API doc @apidoc[kafka.backup.configs.Backup]

## Explanation

* `kafka-group-id`: The group id for the Kafka consumer that's used in the backup tool
* `time-configuration`: How to slice the persisted keys/files based on time
* `type`: The type of time configuration. Either `period-from-first` or `chrono-unit-slice`
* `period-from-first`: Guardian will split up the backup keys/files determined by the `duration` specified. The
key/filename will be determined by the timestamp of the first message received from the Kafka consumer, with
each further key/filename being incremented by the configured `duration`. If Guardian is shut down then it
will terminate and complete the stream, with the final element in the JSON array being a `null`
* This is done so it's possible to determine whether a backup was terminated by a shutdown of Guardian, and
also because it's not really possible to resume using arbitrary durations.
* `chrono-unit-slice`: Guardian will split up the backup keys/files determined by the `chrono-unit`, which
represents intervals such as days and weeks. As such, when using this setting it's possible for Guardian to
resume from a previous uncompleted backup.
* `duration`: If the configuration is `period-from-first` then this determines the maximum period of time for each time slice.
* `chrono-unit`: If the configuration is `chrono-unit-slice` then the `chrono-unit` determines the interval (e.g. days or weeks) that each key/file covers.
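
These two modes correspond to the `TimeConfiguration` case classes shown earlier in this diff (`PeriodFromFirst` and `ChronoUnitSlice`); a minimal sketch of constructing them — the import path is assumed from the build's `apidocRootPackage`, and the values are purely illustrative:

```scala
import java.time.temporal.ChronoUnit
import scala.concurrent.duration._

// Assumed package path; the case classes themselves appear in this PR's diff
import io.aiven.guardian.kafka.backup.configs.{ChronoUnitSlice, PeriodFromFirst, TimeConfiguration}

val byPeriod: TimeConfiguration   = PeriodFromFirst(1.hour)          // new key/file every hour after the first message
val byCalendar: TimeConfiguration = ChronoUnitSlice(ChronoUnit.DAYS) // one key/file per day; can resume an unfinished slice
```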