Skip to content

Commit

Permalink
Merge branch 'develop' into update/sentry-logback-7.16.0
Browse files Browse the repository at this point in the history
  • Loading branch information
davidangb authored Nov 5, 2024
2 parents 06d88fa + f49bf78 commit 3c36a68
Show file tree
Hide file tree
Showing 28 changed files with 800 additions and 24 deletions.
2 changes: 1 addition & 1 deletion automation/Dockerfile-tests
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
FROM sbtscala/scala-sbt:eclipse-temurin-jammy-17.0.10_7_1.10.4_2.13.15
FROM sbtscala/scala-sbt:eclipse-temurin-17.0.13_11_1.10.5_2.13.15

COPY src /app/src
COPY test.sh /app
Expand Down
2 changes: 1 addition & 1 deletion automation/project/build.properties
Original file line number Diff line number Diff line change
@@ -1 +1 @@
sbt.version = 1.10.4
sbt.version = 1.10.5
2 changes: 1 addition & 1 deletion jenkins/ittests.sh
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ set -eux
./docker/run-es.sh start

# execute tests, overriding elasticsearch.urls to point at the linked container
SBT_IMAGE=sbtscala/scala-sbt:eclipse-temurin-jammy-17.0.10_7_1.10.4_2.13.15
SBT_IMAGE=sbtscala/scala-sbt:eclipse-temurin-17.0.13_11_1.10.5_2.13.15
docker run --rm \
--link elasticsearch-ittest:elasticsearch-ittest \
-v sbt-cache:/root/.sbt \
Expand Down
2 changes: 1 addition & 1 deletion local-dev/templates/docker-rsync-local-orch.sh
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,7 @@ start_server () {
-p 5051:5051 \
--network=fc-orch \
-e JAVA_OPTS="$DOCKER_JAVA_OPTS" \
sbtscala/scala-sbt:eclipse-temurin-jammy-17.0.10_7_1.10.4_2.13.15 \
sbtscala/scala-sbt:eclipse-temurin-17.0.13_11_1.10.5_2.13.15 \
bash -c "git config --global --add safe.directory /app && sbt \~reStart"

docker cp config/firecloud-account.pem orch-sbt:/etc/firecloud-account.pem
Expand Down
4 changes: 2 additions & 2 deletions project/Dependencies.scala
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ object Dependencies {
excludeGuava("org.broadinstitute.dsde.workbench" %% "workbench-util" % s"0.10-$workbenchLibsHash"),
"org.broadinstitute.dsde.workbench" %% "workbench-google2" % s"0.36-$workbenchLibsHash",
"org.broadinstitute.dsde.workbench" %% "workbench-oauth2" % s"0.8-$workbenchLibsHash",
"org.broadinstitute.dsde.workbench" %% "sam-client" % "v0.0.287",
"org.broadinstitute.dsde.workbench" %% "sam-client" % "v0.0.296",
"org.broadinstitute.dsde.workbench" %% "workbench-notifications" %s"0.8-$workbenchLibsHash",
"org.databiosphere" % "workspacedataservice-client-okhttp-jakarta" % "0.2.167-SNAPSHOT",
"bio.terra" % "externalcreds-client-resttemplate" % "1.44.0-20240725.201427-1" excludeAll(excludeSpring, excludeSpringBoot),
Expand Down Expand Up @@ -96,7 +96,7 @@ object Dependencies {
"org.scalatest" %% "scalatest" % "3.2.19" % "test",
"org.mock-server" % "mockserver-netty-no-dependencies" % "5.15.0" % "test",
// provides testing mocks
"com.google.cloud" % "google-cloud-nio" % "0.127.25" % "test",
"com.google.cloud" % "google-cloud-nio" % "0.127.26" % "test",
"org.scalatestplus" %% "mockito-4-5" % "3.2.12.0" % "test"
)
}
2 changes: 1 addition & 1 deletion project/build.properties
Original file line number Diff line number Diff line change
@@ -1 +1 @@
sbt.version=1.10.4
sbt.version=1.10.5
2 changes: 1 addition & 1 deletion script/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ function make_jar()

docker run --rm -e GIT_MODEL_HASH=${GIT_MODEL_HASH} \
-v $PWD:/working -w /working -v jar-cache:/root/.ivy -v jar-cache:/root/.ivy2 \
sbtscala/scala-sbt:eclipse-temurin-jammy-17.0.10_7_1.10.4_2.13.15 /working/src/docker/install.sh /working
sbtscala/scala-sbt:eclipse-temurin-17.0.13_11_1.10.5_2.13.15 /working/src/docker/install.sh /working
}

function docker_cmd()
Expand Down
2 changes: 1 addition & 1 deletion script/build_jar.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ docker run --rm -e GIT_MODEL_HASH=${GIT_MODEL_HASH} \
-v $PWD:/working \
-v jar-cache:/root/.ivy -v jar-cache:/root/.ivy2 \
-w /working \
sbtscala/scala-sbt:eclipse-temurin-jammy-17.0.10_7_1.10.4_2.13.15 /working/src/docker/clean_install.sh /working
sbtscala/scala-sbt:eclipse-temurin-17.0.13_11_1.10.5_2.13.15 /working/src/docker/clean_install.sh /working

EXIT_CODE=$?

Expand Down
4 changes: 4 additions & 0 deletions src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,7 @@ googlecloud {
"153601": 0.045
}
}

firecloud {
max-filematching-bucket-files = 25000
}
90 changes: 83 additions & 7 deletions src/main/resources/swagger/api-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3688,6 +3688,55 @@ paths:
description: Internal Server Error
content: {}
x-passthrough: false
/api/workspaces/{workspaceNamespace}/{workspaceName}/entities/{entityType}/paired-tsv:
post:
tags:
- Entities
summary: |
Download a TSV of files in the workspace's bucket, paired by naming convention
description: |
Lists the files in the workspace's bucket, filtered to a specified prefix. Then,
attempts to pair those files to each other based on well-known naming conventions.
Downloads a TSV containing the result of those pairings.
operationId: bucketPairedTSV
parameters:
- $ref: '#/components/parameters/workspaceNamespaceParam'
- $ref: '#/components/parameters/workspaceNameParam'
- $ref: '#/components/parameters/entityTypeParam'
requestBody:
content:
'application/json':
schema:
$ref: '#/components/schemas/FileMatchingOptions'
examples:
minimally-required:
value:
prefix: my-bucket-prefix
disable-recursion:
value:
prefix: my-bucket-prefix/
recursive: false
rename-columns:
value:
prefix: my-bucket-prefix
read1Name: my-column-name-one
read2Name: my-column-name-two
required: true
responses:
200:
description: URL to saved file
content:
text/plain:
schema:
type: string
format: binary
404:
description: Workspace or entity type does not exist
content: {}
500:
description: Internal Server Error
content: {}
x-passthrough: false
/api/workspaces/{workspaceNamespace}/{workspaceName}/entityQuery/{entityType}:
get:
tags:
Expand Down Expand Up @@ -7473,6 +7522,28 @@ components:
ExtendedEnabled:
allOf:
- $ref: '#/components/schemas/Enabled'
FileMatchingOptions:
type: object
required:
- prefix
properties:
prefix:
type: string
description: |
Bucket prefix in which to look. If `recursive` is false, this must include a trailing
slash when specifying a subdirectory.
read1Name:
type: string
description: column name to use for the primary "read 1" file
default: read1
read2Name:
type: string
description: column name to use for the matching "read 2" file
default: read2
recursive:
type: boolean
description: whether to list files in subdirectories of the prefix
default: true
FireCloudPermission:
required:
- role
Expand Down Expand Up @@ -7749,40 +7820,45 @@ components:
format: int32
default: 0
MethodQuery:
required:
- namespace
- name
- payload
- entityType
type: object
properties:
namespace:
type: string
description: Namespace which contains AgoraEntity.
default: YOUR_NAMESPACE
example: YOUR_NAMESPACE
name:
type: string
description: Name of the AgoraEntity.
default: BWA
example: BWA
synopsis:
type: string
description: Synopsis which contains AgoraEntity.
default: Quickly aligns short nucleotide sequences.
example: Quickly aligns short nucleotide sequences.
snapshotComment:
type: string
description: Snapshot comment of AgoraEntity
default: Improved spline reticulation
example: Improved spline reticulation
documentation:
type: string
description: Documentation of the AgoraEntity.
default: |
example: |
BWA is a software package for mapping low-divergent sequences
against a large reference genome, such as the human genome.
It consists of three algorithms: BWA-backtrack, BWA-SW and BWA-MEM.
payload:
type: string
description: Payload of method -- must be in WDL format
default: |
example: |
task wc {File in_file command { cat ${in_file} | wc -l } output { Int count = read_int(stdout()) }}
entityType:
type: string
description: Type of the AgoraEntity -- Task or Workflow.
default: Task
example: Task
MethodShort:
required:
- managers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ object FireCloudConfig {
lazy val supportDomain = firecloud.getString("supportDomain")
lazy val supportPrefix = firecloud.getString("supportPrefix")
lazy val userAdminAccount = firecloud.getString("userAdminAccount")
// maximum number of bucket files the file-matching feature will consider; value comes from
// config key firecloud.max-filematching-bucket-files (reference.conf default: 25000)
lazy val maxFileMatchingFileCount = firecloud.getInt("max-filematching-bucket-files")
}

object Shibboleth {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ trait GoogleServicesDAO extends ReportsSubsystemStatus {
def publishMessages(fullyQualifiedTopic: String, messages: Seq[String]): Future[Unit]

def getBucket(bucketName: String, petKey: String): Option[Bucket]

/**
  * Lists the objects in the given GCS bucket, optionally filtered to a prefix.
  *
  * @param bucketName the bucket to list
  * @param prefix if present, restrict results to object names starting with this prefix
  * @param recursive whether to include objects in "subdirectories" of the prefix
  * @return the matching objects in the bucket
  */
def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName]
}
Original file line number Diff line number Diff line change
Expand Up @@ -400,4 +400,22 @@ class HttpGoogleServicesDAO(priceListUrl: String, defaultPriceList: GooglePriceL
getScopedServiceAccountCredentials(firecloudAdminSACreds, authScopes)
.refreshAccessToken()
.getTokenValue

/**
  * Lists the objects in the given GCS bucket, optionally filtered to a prefix.
  *
  * @param bucketName the bucket to list
  * @param prefix if present, only objects whose names start with this prefix are returned;
  *               when absent, the whole bucket is listed
  * @param recursive whether to list objects in "subdirectories" of the prefix (default true)
  * @return the matching objects in the bucket
  */
override def listBucket(bucketName: GcsBucketName,
                        prefix: Option[String],
                        recursive: Boolean = true
): List[GcsObjectName] = {
  // listObjectsWithPrefix handles paginating through results if there are more results than
  // the `maxPageSize` setting.
  val listAttempt = getStorageResource.use { storageService =>
    storageService
      .listObjectsWithPrefix(bucketName, prefix.getOrElse(""), maxPageSize = 2000, isRecursive = recursive)
      .compile
      .toList
  }

  // execute the list operation synchronously and return its results
  // (the effect is only described above; nothing runs until unsafeRunSync)
  listAttempt.unsafeRunSync()
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package org.broadinstitute.dsde.firecloud.filematch

import com.typesafe.scalalogging.LazyLogging
import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult}
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy}

import java.nio.file.Path

/**
 * Given a list of files, pair those files based on their naming conventions.
 * At the time of writing, this involves recognizing Illumina single end and paired end read patterns
 * such as those defined at https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm
 *
 * In the future, we may support additional naming conventions
 */
class FileMatcher extends LazyLogging {

  // the ordered list of recognition strategies to try against each file; first hit wins
  private val matchingStrategies: List[FileRecognitionStrategy] = List(new IlluminaPairedEndStrategy())

  /**
   * Given a list of files, pair up those files according to our known recognition strategies.
   * @param pathList the list of files to inspect
   * @return pairing results, sorted by first file
   */
  def pairPaths(pathList: List[Path]): List[FileMatchResult] =
    performPairing(pathList)

  /**
   * Given a list of files, pair up those files according to our known recognition strategies.
   * @param fileList the list of files to inspect, as Strings
   * @return pairing results, sorted by first file
   */
  def pairFiles(fileList: List[String]): List[FileMatchResult] = {
    // convert fileList to pathList, then delegate
    val pathList = fileList.map(file => new java.io.File(file).toPath)
    pairPaths(pathList)
  }

  /**
   * Implementation for file pairing. This executes in three steps:
   * 1. Use our known file recognition strategies to identify all "read 1" files in the file list
   * 2. Search for all "read 2" files in the file list which match the previously-identified "read 1"s
   * 3. Handle the remaining files which are not recognized as either "read 1" or "read 2"
   *
   * @param pathList the list of files to inspect
   * @return pairing results, sorted by first file
   */
  private def performPairing(pathList: List[Path]): List[FileMatchResult] = {
    // step 1: find every path in the incoming pathList that is recognized by one of our known patterns
    val desiredPairings: List[SuccessfulMatchResult] = findFirstFiles(pathList)

    // remove the recognized firstFiles from the outstanding pathList
    val remainingPaths: List[Path] = pathList diff desiredPairings.map(_.firstFile)

    // step 2: process the recognized "read 1" files, and look for their desired pairings in the
    // outstanding pathList. This results in either SuccessfulMatchResult when the desired pairing
    // is found, or PartialMatchResult when the desired pairing is not found.
    val pairingResults: List[FileMatchResult] = findSecondFiles(remainingPaths, desiredPairings)

    // step 3: remove the recognized "read 2" files from the outstanding pathList; anything left
    // was recognized as neither a "read 1" nor a "read 2"
    val unrecognizedPaths: List[Path] = remainingPaths diff pairingResults.collect { case s: SuccessfulMatchResult =>
      s.secondFile
    }
    // translate the unrecognized paths into a FileMatchResult
    val unrecognizedResults: List[FailedMatchResult] = unrecognizedPaths.map(path => FailedMatchResult(path))

    // return combined results, sorted by firstFile for stable output
    (pairingResults ++ unrecognizedResults).sortBy(r => r.firstFile)
  }

  /**
   * find every path in the incoming pathList that is recognized as a "read 1" by our known patterns
   * @param pathList the list of files to inspect
   * @return pairing results
   */
  private def findFirstFiles(pathList: List[Path]): List[SuccessfulMatchResult] =
    // evaluate each path against the strategies exactly once, then keep only the successes.
    // map + collect with a proper partial-function literal avoids passing a non-exhaustive
    // match lambda to collect (which would risk a MatchError on unrecognized files).
    pathList
      .map(tryPairingStrategies)
      .collect { case success: SuccessfulMatchResult => success }

  /**
   * find every path in the incoming pathList that is recognized as a "read 2" by our known patterns
   *
   * @param pathList the list of files to inspect
   * @param desiredPairings the "read 2" files to look for in the pathList
   * @return pairing results
   */
  private def findSecondFiles(pathList: List[Path],
                              desiredPairings: List[SuccessfulMatchResult]
  ): List[FileMatchResult] = {
    // build a Set once for O(1) membership checks, instead of a linear scan of pathList
    // for every desired pairing (O(n*m) in the original formulation)
    val availablePaths: Set[Path] = pathList.toSet
    desiredPairings.map { desiredPairing =>
      // search for the desired pairing's secondFile in the set of actual files
      if (availablePaths.contains(desiredPairing.secondFile)) desiredPairing
      else desiredPairing.toPartial
    }
  }

  /**
   * Attempt all the configured file recognition strategies against the supplied file.
   *
   * @param file the file to try to recognize
   * @return SuccessfulMatchResult if the file is recognized; FailedMatchResult if not
   */
  private def tryPairingStrategies(file: Path): FileMatchResult =
    // lazily try each strategy in order via an iterator; the first SuccessfulMatchResult wins.
    // If no strategy recognizes the file, fall back to a FailedMatchResult.
    matchingStrategies.iterator
      .map(strategy => strategy.matchFirstFile(file))
      .collectFirst { case success: SuccessfulMatchResult => success }
      .getOrElse(FailedMatchResult(file))

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package org.broadinstitute.dsde.firecloud.filematch

import spray.json.DefaultJsonProtocol.jsonFormat4
import spray.json.RootJsonFormat
import spray.json.DefaultJsonProtocol._

/**
 * Request payload, specified by end users, to control file-matching functionality.
 *
 * Optional fields are modeled as Option so the server can distinguish "not supplied" from an
 * explicit value; per the API spec, read1Name/read2Name default to "read1"/"read2" and
 * recursive defaults to true when omitted.
 *
 * @param prefix bucket prefix in which to list files; per the API spec, a trailing slash is
 *               required when specifying a subdirectory with recursive=false
 * @param read1Name name for the "read1" column
 * @param read2Name name for the "read2" column
 * @param recursive should bucket-listing be recursive?
 */
case class FileMatchingOptions(prefix: String,
read1Name: Option[String] = None,
read2Name: Option[String] = None,
recursive: Option[Boolean] = None
)

// spray-json (de)serialization support for FileMatchingOptions;
// jsonFormat4 derives a format covering all four case-class fields
object FileMatchingOptionsFormat {
implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] = jsonFormat4(FileMatchingOptions)
}
Loading

0 comments on commit 3c36a68

Please sign in to comment.