From 611acf8f9ef43554d868d269024a33fc0e912d03 Mon Sep 17 00:00:00 2001 From: David An Date: Tue, 5 Nov 2024 10:11:28 -0500 Subject: [PATCH] CORE-123: new file-pairing API for data uploader (#1452) --- src/main/resources/reference.conf | 4 + src/main/resources/swagger/api-docs.yaml | 71 ++++++++++ .../dsde/firecloud/FireCloudConfig.scala | 1 + .../dataaccess/GoogleServicesDAO.scala | 2 + .../dataaccess/HttpGoogleServicesDAO.scala | 18 +++ .../firecloud/filematch/FileMatcher.scala | 124 +++++++++++++++++ .../filematch/FileMatchingOptions.scala | 22 +++ .../filematch/result/FailedMatchResult.scala | 16 +++ .../filematch/result/FileMatchResult.scala | 10 ++ .../filematch/result/PartialMatchResult.scala | 16 +++ .../result/SuccessfulMatchResult.scala | 19 +++ .../strategy/FileRecognitionStrategy.scala | 14 ++ .../strategy/IlluminaPairedEndStrategy.scala | 48 +++++++ .../service/ExportEntitiesByTypeActor.scala | 113 ++++++++++++++- .../webservice/ExportEntitiesApiService.scala | 36 ++++- src/test/resources/reference.conf | 4 + .../firecloud/filematch/FileMatcherSpec.scala | 68 +++++++++ .../IlluminaPairedEndStrategySpec.scala | 68 +++++++++ .../mock/MockGoogleServicesDAO.scala | 3 + .../ExportEntitiesByTypeServiceSpec.scala | 130 +++++++++++++++++- 20 files changed, 779 insertions(+), 8 deletions(-) create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcher.scala create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FailedMatchResult.scala create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/PartialMatchResult.scala create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala create mode 100644 src/test/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcherSpec.scala create mode 100644 src/test/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategySpec.scala diff --git a/src/main/resources/reference.conf b/src/main/resources/reference.conf index 77aa81b92..932731d32 100644 --- a/src/main/resources/reference.conf +++ b/src/main/resources/reference.conf @@ -94,3 +94,7 @@ googlecloud { "153601": 0.045 } } + +firecloud { + max-filematching-bucket-files = 25000 +} diff --git a/src/main/resources/swagger/api-docs.yaml b/src/main/resources/swagger/api-docs.yaml index f01dd1b72..c101a6ce3 100755 --- a/src/main/resources/swagger/api-docs.yaml +++ b/src/main/resources/swagger/api-docs.yaml @@ -3688,6 +3688,55 @@ paths: description: Internal Server Error content: {} x-passthrough: false + /api/workspaces/{workspaceNamespace}/{workspaceName}/entities/{entityType}/paired-tsv: + post: + tags: + - Entities + summary: | + Download a TSV of files in the workspace's bucket, paired by naming convention + description: | + Lists the files in the workspace's bucket, filtered to a specified prefix. Then, + attempts to pair those files to each other based on well-known naming conventions. + Downloads a TSV containing the result of those pairings. + operationId: bucketPairedTSV + parameters: + - $ref: '#/components/parameters/workspaceNamespaceParam' + - $ref: '#/components/parameters/workspaceNameParam' + - $ref: '#/components/parameters/entityTypeParam' + requestBody: + content: + 'application/json': + schema: + $ref: '#/components/schemas/FileMatchingOptions' + examples: + minimally-required: + value: + prefix: my-bucket-prefix + disable-recursion: + value: + prefix: my-bucket-prefix/ + recursive: false + rename-columns: + value: + prefix: my-bucket-prefix + read1Name: my-column-name-one + read2Name: my-column-name-two + required: true + responses: + 200: + description: URL to saved file + content: + text/plain: + schema: + type: string + format: binary + 404: + description: Workspace or entity type does not exist + content: {} + 500: + description: Internal Server Error + content: {} + x-passthrough: false /api/workspaces/{workspaceNamespace}/{workspaceName}/entityQuery/{entityType}: get: tags: @@ -7473,6 +7522,28 @@ components: ExtendedEnabled: allOf: - $ref: '#/components/schemas/Enabled' + FileMatchingOptions: + type: object + required: + - prefix + properties: + prefix: + type: string + description: | + Bucket prefix in which to look. If `recursive` is false, this must include a trailing + slash when specifying a subdirectory. + read1Name: + type: string + description: column name to use for the primary "read 1" file + default: read1 + read2Name: + type: string + description: column name to use for the matching "read 2" file + default: read2 + recursive: + type: boolean + description: whether to list files in subdirectories of the prefix + default: true FireCloudPermission: required: - role diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/FireCloudConfig.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/FireCloudConfig.scala index 756546b03..850efc8e6 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/FireCloudConfig.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/FireCloudConfig.scala @@ -153,6 +153,7 @@ object FireCloudConfig { lazy val supportDomain = firecloud.getString("supportDomain") lazy val supportPrefix = firecloud.getString("supportPrefix") lazy val userAdminAccount = firecloud.getString("userAdminAccount") + lazy val maxFileMatchingFileCount = firecloud.getInt("max-filematching-bucket-files") } object Shibboleth { diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/GoogleServicesDAO.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/GoogleServicesDAO.scala index ac8299417..0202023f5 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/GoogleServicesDAO.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/GoogleServicesDAO.scala @@ -40,4 +40,6 @@ trait GoogleServicesDAO extends ReportsSubsystemStatus { def publishMessages(fullyQualifiedTopic: String, messages: Seq[String]): Future[Unit] def getBucket(bucketName: String, petKey: String): Option[Bucket] + + def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName] } diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/HttpGoogleServicesDAO.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/HttpGoogleServicesDAO.scala index 630492c5b..022b43485 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/HttpGoogleServicesDAO.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/HttpGoogleServicesDAO.scala @@ -400,4 +400,22 @@ class HttpGoogleServicesDAO(priceListUrl: String, defaultPriceList: GooglePriceL getScopedServiceAccountCredentials(firecloudAdminSACreds, authScopes) .refreshAccessToken() .getTokenValue + + override def listBucket(bucketName: GcsBucketName, + prefix: Option[String], + recursive: Boolean = true + ): List[GcsObjectName] = { + // listObjectsWithPrefix handles paginating through results if there are more results than + // the `maxPageSize` setting. + val listAttempt = getStorageResource.use { storageService => + storageService + .listObjectsWithPrefix(bucketName, prefix.getOrElse(""), maxPageSize = 2000, isRecursive = recursive) + .compile + .toList + } + + // execute the upload + listAttempt.unsafeRunSync() + } + } diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcher.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcher.scala new file mode 100644 index 000000000..a7f770c74 --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcher.scala @@ -0,0 +1,124 @@ +package org.broadinstitute.dsde.firecloud.filematch + +import com.typesafe.scalalogging.LazyLogging +import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult} +import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy} + +import java.nio.file.Path + +/** + * Given a list of files, pair those files based on their naming conventions. + * At the time of writing, this involves recognizing Illumina single end and paired end read patterns + * such as those defined at https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm + * + * In the future, we may support additional naming conventions + */ +class FileMatcher extends LazyLogging { + + // the list of recognition strategies to use + private val matchingStrategies: List[FileRecognitionStrategy] = List(new IlluminaPairedEndStrategy()) + + /** + * Given a list of files, pair up those files according to our known recognition strategies. + * @param pathList the list of files to inspect + * @return pairing results + */ + def pairPaths(pathList: List[Path]): List[FileMatchResult] = + performPairing(pathList) + + /** + * Given a list of files, pair up those files according to our known recognition strategies. + * @param fileList the list of files to inspect, as Strings + * @return pairing results + */ + def pairFiles(fileList: List[String]): List[FileMatchResult] = { + // convert fileList to pathList + val pathList = fileList.map(file => new java.io.File(file).toPath) + pairPaths(pathList) + } + + /** + * Implementation for file pairing. This executes in three steps: + * 1. Use our known file recognition strategies to identify all "read 1" files in the file list + * 2. Search for all "read 2" files in the file list which match the previously-identified "read 1"s + * 3. Handle the remaining files which are not recognized as either "read 1" or "read 2" + * + * @param pathList the list of files to inspect + * @return pairing results + */ + private def performPairing(pathList: List[Path]): List[FileMatchResult] = { + // find every path in the incoming pathList that is recognized by one of our known patterns + val desiredPairings: List[SuccessfulMatchResult] = findFirstFiles(pathList) + + // remove the recognized firstFiles from the outstanding pathList + val remainingPaths: List[Path] = pathList diff desiredPairings.map(_.firstFile) + + // process the recognized "read 1" files, and look for their desired pairings in the outstanding pathList. + // this will result in either SuccessfulMatchResult when the desired pairing is found, or PartialMatchResult + // when the desired pairing is not found + val pairingResults: List[FileMatchResult] = findSecondFiles(remainingPaths, desiredPairings) + + // remove the recognized "read 2" files from the outstanding pathList + val unrecognizedPaths: List[Path] = remainingPaths diff pairingResults.collect { case s: SuccessfulMatchResult => + s.secondFile + } + // translate the unrecognized paths into a FileMatchResult + val unrecognizedResults: List[FailedMatchResult] = unrecognizedPaths.map(path => FailedMatchResult(path)) + + // return results, sorted by firstFile + (pairingResults ++ unrecognizedResults).sortBy(r => r.firstFile) + } + + /** + * find every path in the incoming pathList that is recognized as a "read 1" by our known patterns + * @param pathList the list of files to inspect + * @return pairing results + */ + private def findFirstFiles(pathList: List[Path]): List[SuccessfulMatchResult] = + pathList.collect { path => + tryPairingStrategies(path) match { + case success: SuccessfulMatchResult => success + } + } + + /** + * find every path in the incoming pathList that is recognized as a "read 2" by our known patterns + * + * @param pathList the list of files to inspect + * @param desiredPairings the "read 2" files to look for in the pathList + * @return pairing results + */ + private def findSecondFiles(pathList: List[Path], + desiredPairings: List[SuccessfulMatchResult] + ): List[FileMatchResult] = + desiredPairings.map { desiredPairing => + // search for the desired pairing's secondFile in the list of actual files + pathList.find(p => p.equals(desiredPairing.secondFile)) match { + case Some(_) => desiredPairing + case None => desiredPairing.toPartial + } + } + + /** + * Attempt all the configured file recognition strategies against the supplied file. + * + * @param file the file to try to recognize + * @return SuccessfulMatchResult if the file is recognized; FailedMatchResult if not + */ + private def tryPairingStrategies(file: Path): FileMatchResult = { + // does the current file hit on any of our file-matching patterns? + // Iterate over the matching strategies and return the first successful match result. + val strategyHit = matchingStrategies.collectFirst(strategy => + strategy.matchFirstFile(file) match { + case success: SuccessfulMatchResult => success + } + ) + strategyHit match { + // The current file is recognized by one of our recognition strategies + case Some(desiredResult: SuccessfulMatchResult) => desiredResult + // the current file is not recognized + case _ => FailedMatchResult(file) + } + } + +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala new file mode 100644 index 000000000..f3511ef7a --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala @@ -0,0 +1,22 @@ +package org.broadinstitute.dsde.firecloud.filematch + +import spray.json.DefaultJsonProtocol.jsonFormat4 +import spray.json.RootJsonFormat +import spray.json.DefaultJsonProtocol._ + +/** + * Request payload, specified by end users, to control file-matching functionality + * @param prefix bucket prefix in which to list files + * @param read1Name name for the "read1" column + * @param read2Name name for the "read2" column + * @param recursive should bucket-listing be recursive? + */ +case class FileMatchingOptions(prefix: String, + read1Name: Option[String] = None, + read2Name: Option[String] = None, + recursive: Option[Boolean] = None +) + +object FileMatchingOptionsFormat { + implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] = jsonFormat4(FileMatchingOptions) +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FailedMatchResult.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FailedMatchResult.scala new file mode 100644 index 000000000..a4bb9a133 --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FailedMatchResult.scala @@ -0,0 +1,16 @@ +package org.broadinstitute.dsde.firecloud.filematch.result + +import com.google.common.annotations.VisibleForTesting + +import java.nio.file.Path + +/** + * FileMatchResult indicating that the file did not hit on any known pattern. + */ +case class FailedMatchResult(firstFile: Path) extends FileMatchResult {} + +@VisibleForTesting +object FailedMatchResult { + def fromString(firstFile: String): FailedMatchResult = + FailedMatchResult(new java.io.File(firstFile).toPath) +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala new file mode 100644 index 000000000..e50af036b --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala @@ -0,0 +1,10 @@ +package org.broadinstitute.dsde.firecloud.filematch.result + +import java.nio.file.Path + +/** + * Marker trait for failed/partial/successful file-matching results + */ +trait FileMatchResult { + def firstFile: Path +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/PartialMatchResult.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/PartialMatchResult.scala new file mode 100644 index 000000000..76a372919 --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/PartialMatchResult.scala @@ -0,0 +1,16 @@ +package org.broadinstitute.dsde.firecloud.filematch.result + +import com.google.common.annotations.VisibleForTesting + +import java.nio.file.Path + +/** + * FileMatchResult indicating that the file successfully hit a known pattern, but no paired file could be found. + */ +case class PartialMatchResult(firstFile: Path, id: String) extends FileMatchResult {} + +@VisibleForTesting +object PartialMatchResult { + def fromStrings(firstFile: String, id: String): PartialMatchResult = + PartialMatchResult(new java.io.File(firstFile).toPath, id) +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala new file mode 100644 index 000000000..017100fd8 --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala @@ -0,0 +1,19 @@ +package org.broadinstitute.dsde.firecloud.filematch.result + +import com.google.common.annotations.VisibleForTesting + +import java.nio.file.Path + +/** + * FileMatchResult indicating that the file successfully hit a known pattern. + */ +case class SuccessfulMatchResult(firstFile: Path, secondFile: Path, id: String) extends FileMatchResult { + // convert this SuccessfulMatchResult to a PartialMatchResult + def toPartial: PartialMatchResult = PartialMatchResult(firstFile, id) +} + +@VisibleForTesting +object SuccessfulMatchResult { + def fromStrings(firstFile: String, secondFile: String, id: String): SuccessfulMatchResult = + SuccessfulMatchResult(new java.io.File(firstFile).toPath, new java.io.File(secondFile).toPath, id) +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala new file mode 100644 index 000000000..d81fd6bca --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala @@ -0,0 +1,14 @@ +package org.broadinstitute.dsde.firecloud.filematch.strategy + +import org.broadinstitute.dsde.firecloud.filematch.result.FileMatchResult + +import java.nio.file.Path + +/** + * Marker trait representing file-naming conventions used for pairing matched reads. + */ +trait FileRecognitionStrategy { + + def matchFirstFile(path: Path): FileMatchResult + +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala new file mode 100644 index 000000000..124585c4f --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala @@ -0,0 +1,48 @@ +package org.broadinstitute.dsde.firecloud.filematch.strategy + +import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult} +import org.broadinstitute.dsde.firecloud.filematch.strategy.IlluminaPairedEndStrategy.FILE_ENDINGS + +import java.nio.file.Path + +/** + * Naming conventions for Illumina single end and paired end read patterns. Examples of files recognized: + * + * Sample1_01.fastq.gz -> Sample1_02.fastq.gz + * sample01_1.fastq.gz -> sample01_2.fastq.gz + * sample01_R1.fastq.gz -> sample01_R2.fastq.gz + * sample01_F.fastq.gz -> sample01_R.fastq.gz + * sample01_R1.fastq -> sample01_R2.fastq + * SampleName_S1_L001_R1_001.fastq.gz -> SampleName_S1_L001_R2_001.fastq.gz + */ +class IlluminaPairedEndStrategy extends FileRecognitionStrategy { + override def matchFirstFile(path: Path): FileMatchResult = { + // search known patterns for a "read1" file + val foundMatch = FILE_ENDINGS.find { case (key, _) => path.toString.endsWith(key) } + + foundMatch match { + // we found a "read1" + case Some((key, value)) => + // generate the id: strip the suffix from the filename. + val id = path.getFileName.toString.replace(key, "") + // generate the second filename: replace the first suffix with the second suffix + val secondFile = new java.io.File(path.toString.replace(key, value)) + SuccessfulMatchResult(path, secondFile.toPath, id) + + // the file is not recognized + case None => FailedMatchResult(path) + } + } +} + +object IlluminaPairedEndStrategy { + // if the first file ends with ${key}, then the second file should end with ${value} + val FILE_ENDINGS: Map[String, String] = Map( + "_01.fastq.gz" -> "_02.fastq.gz", + "_1.fastq.gz" -> "_2.fastq.gz", + "_R1.fastq.gz" -> "_R2.fastq.gz", + "_F.fastq.gz" -> "_R.fastq.gz", + "_R1.fastq" -> "_R2.fastq", + "_R1_001.fastq.gz" -> "_R2_001.fastq.gz" + ) +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeActor.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeActor.scala index e26c73f2f..79b8a04b4 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeActor.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeActor.scala @@ -1,23 +1,31 @@ package org.broadinstitute.dsde.firecloud.service import akka.actor.ActorSystem -import akka.http.scaladsl.model.headers.{`Content-Disposition`, Connection, ContentDispositionTypes} import akka.http.scaladsl.model._ +import akka.http.scaladsl.model.headers.{`Content-Disposition`, Connection, ContentDispositionTypes} import akka.stream._ import akka.stream.scaladsl.{Source => AkkaSource, _} import akka.util.{ByteString, Timeout} import better.files.File import com.typesafe.scalalogging.LazyLogging import org.broadinstitute.dsde.firecloud.dataaccess.{GoogleServicesDAO, RawlsDAO} +import org.broadinstitute.dsde.firecloud.filematch.result.{ + FailedMatchResult, + FileMatchResult, + PartialMatchResult, + SuccessfulMatchResult +} +import org.broadinstitute.dsde.firecloud.filematch.{FileMatcher, FileMatchingOptions} import org.broadinstitute.dsde.firecloud.model.ModelJsonProtocol._ -import org.broadinstitute.dsde.firecloud.model.{UserInfo, _} +import org.broadinstitute.dsde.firecloud.model._ import org.broadinstitute.dsde.firecloud.utils.TSVFormatter import org.broadinstitute.dsde.firecloud.{Application, FireCloudConfig, FireCloudExceptionWithErrorReport} -import org.broadinstitute.dsde.rawls.model.WorkspaceAccessLevels.WorkspaceAccessLevel +import org.broadinstitute.dsde.rawls.StringValidationUtils import org.broadinstitute.dsde.rawls.model._ import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GcsObjectName, GcsPath} import spray.json._ +import java.nio.file.Path import java.time.Instant import scala.concurrent.duration._ import scala.concurrent.{ExecutionContext, Future} @@ -32,14 +40,17 @@ case class ExportEntitiesByTypeArguments( model: Option[String] ) -object ExportEntitiesByTypeActor { +object ExportEntitiesByTypeActor extends StringValidationUtils { sealed trait ExportEntitiesByTypeMessage case object ExportEntities extends ExportEntitiesByTypeMessage + implicit val errorReportSource: ErrorReportSource = ErrorReportSource(ExportEntitiesByTypeActor.getClass.getName) + def constructor(app: Application, system: ActorSystem)(exportArgs: ExportEntitiesByTypeArguments)(implicit executionContext: ExecutionContext - ) = + ) = { + validateUserDefinedString(exportArgs.entityType) new ExportEntitiesByTypeActor( app.rawlsDAO, app.googleServicesDAO, @@ -51,6 +62,7 @@ object ExportEntitiesByTypeActor { exportArgs.model, system ) + } } /** @@ -72,7 +84,10 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO, model: Option[String], argSystem: ActorSystem )(implicit protected val executionContext: ExecutionContext) - extends LazyLogging { + extends LazyLogging + with StringValidationUtils { + + implicit val errorReportSource: ErrorReportSource = ErrorReportSource(ExportEntitiesByTypeActor.getClass.getName) implicit val timeout: Timeout = Timeout(1 minute) implicit val userInfo: UserInfo = argUserInfo @@ -84,6 +99,9 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO, case None => ModelSchemaRegistry.getModelForSchemaType(SchemaTypes.FIRECLOUD) } + // maximum allowed count of files in a bucket for the file-matching API + private val maxFileMatchingFileCount = FireCloudConfig.FireCloud.maxFileMatchingFileCount + def ExportEntities = streamEntities() /** @@ -369,4 +387,87 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO, response.results } + /** + * Perform file-matching for this workspace's bucket. Lists the files in the bucket, filtered by a bucket prefix, + * then executes `FileMatcher.pairPaths` on those files. Finally, creates a TSV out of the paired results. + * + * @see [[FileMatcher]] + * @param matchingOptions configuration options for file matching + * @return contents of the resultant TSV + */ + def matchBucketFiles(matchingOptions: FileMatchingOptions): Future[String] = { + // generate defaults for options + val read1Name = matchingOptions.read1Name.getOrElse("read1") + val read2Name = matchingOptions.read2Name.getOrElse("read2") + val recursive = matchingOptions.recursive.getOrElse(true) + + validateUserDefinedString(read1Name) + validateUserDefinedString(read2Name) + validateAttributeName(AttributeName.fromDelimitedName(read1Name), entityType) + validateAttributeName(AttributeName.fromDelimitedName(read2Name), entityType) + + // retrieve workspace so we can get its bucket + rawlsDAO.getWorkspace(workspaceNamespace, workspaceName)(userInfo) map { workspaceResponse => + val workspaceBucket = GcsBucketName(workspaceResponse.workspace.bucketName) + + // list all files in bucket which match matchingOptions.prefix + logger.info("listing bucket files ...") + val fileList: List[GcsObjectName] = + googleServicesDao.listBucket(workspaceBucket, Option(matchingOptions.prefix), recursive) + + // sanity check + if (fileList.length > maxFileMatchingFileCount) { + throw new FireCloudExceptionWithErrorReport(errorReport = + ErrorReport(StatusCodes.BadRequest, s"Too many files in bucket (${fileList.length}); cannot continue.") + ) + } + logger.info(s"found ${fileList.length} files") + + // transform the list of GcsObjectName to a list of java.nio.Path + val pathList: List[Path] = fileList.map(gcsObject => new java.io.File(gcsObject.value).toPath) + + // perform the pairing + logger.info("starting pairing analysis ...") + val pairs: List[FileMatchResult] = new FileMatcher().pairPaths(pathList) + logger.info(s"completed pairing; result is ${pairs.length} rows") + + // TSV headers + val entityHeaders: IndexedSeq[String] = IndexedSeq(s"entity:${entityType}_id", read1Name, read2Name) + + // transform the matched pairs into entities + val entities: List[Entity] = pairs.map { + case SuccessfulMatchResult(firstFile, secondFile, id) => + val attributes = Map( + AttributeName.withDefaultNS(read1Name) -> AttributeString(qualifyBucketFile(firstFile, workspaceBucket)), + AttributeName.withDefaultNS(read2Name) -> AttributeString(qualifyBucketFile(secondFile, workspaceBucket)) + ) + Entity(id, entityType, attributes) + case PartialMatchResult(firstFile, id) => + val attributes = Map( + AttributeName.withDefaultNS(read1Name) -> AttributeString(qualifyBucketFile(firstFile, workspaceBucket)) + ) + Entity(id, entityType, attributes) + case FailedMatchResult(firstFile) => + val attributes = Map( + AttributeName.withDefaultNS(read1Name) -> AttributeString(qualifyBucketFile(firstFile, workspaceBucket)) + ) + // can't use the file path directly as an entity id; it can contain slashes or other illegal chars + val id = firstFile.toString.replaceAll("[^A-z0-9_-]", "_") + Entity(id, entityType, attributes) + } + + val headerString = entityHeaders.mkString("\t") + "\n" + + // transform the entities into a TSV + val rows = TSVFormatter.makeEntityRows(entityType, entities, entityHeaders) + val rowString = rows.map(_.mkString("\t")).mkString("\n") + "\n" + + headerString + rowString + } + } + + // helper to turn a file path into a fully-qualified gs:// url + private def qualifyBucketFile(file: Path, workspaceBucket: GcsBucketName): String = + s"gs://${workspaceBucket.value}/$file" + } diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/webservice/ExportEntitiesApiService.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/webservice/ExportEntitiesApiService.scala index 8b6722fbd..f934ee89a 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/webservice/ExportEntitiesApiService.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/webservice/ExportEntitiesApiService.scala @@ -1,10 +1,15 @@ package org.broadinstitute.dsde.firecloud.webservice import akka.http.scaladsl.client.RequestBuilding +import akka.http.scaladsl.marshallers.sprayjson.SprayJsonSupport import akka.http.scaladsl.model.StatusCodes.OK +import akka.http.scaladsl.model.headers.{`Content-Disposition`, Connection, ContentDispositionTypes} +import akka.http.scaladsl.model._ import akka.http.scaladsl.server.{Directives, Route} import com.typesafe.scalalogging.LazyLogging import org.apache.commons.lang3.StringUtils +import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptions +import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptionsFormat.fileMatchingOptionsFormat import org.broadinstitute.dsde.firecloud.service.PerRequest.RequestComplete import org.broadinstitute.dsde.firecloud.service.{ExportEntitiesByTypeActor, ExportEntitiesByTypeArguments} import org.broadinstitute.dsde.firecloud.utils.StandardUserInfoDirectives @@ -16,7 +21,8 @@ trait ExportEntitiesApiService extends Directives with RequestBuilding with StandardUserInfoDirectives - with LazyLogging { + with LazyLogging + with SprayJsonSupport { val exportEntitiesByTypeConstructor: ExportEntitiesByTypeArguments => ExportEntitiesByTypeActor @@ -61,5 +67,33 @@ trait ExportEntitiesApiService } } } + } ~ path("api" / "workspaces" / Segment / Segment / "entities" / Segment / "paired-tsv") { + (workspaceNamespace, workspaceName, entityType) => + requireUserInfo() { userInfo => + post { + entity(as[FileMatchingOptions]) { matchingOptions => + val exportArgs = + ExportEntitiesByTypeArguments(userInfo, workspaceNamespace, workspaceName, entityType, None, None) + + complete { + exportEntitiesByTypeConstructor(exportArgs).matchBucketFiles(matchingOptions) map { pairs => + // download the TSV as an attachment: + asDownloadableTsv(pairs, s"$entityType.tsv") + } + } + } + } + } } + + // given the contents of a TSV, generate a HttpResponse with the appropriate headers to download that TSV file. + private def asDownloadableTsv(contents: String, filename: String) = HttpResponse( + entity = + HttpEntity.apply(ContentType.apply(MediaTypes.`text/tab-separated-values`, HttpCharsets.`UTF-8`), contents), + headers = List( + Connection("Keep-Alive"), + `Content-Disposition`.apply(ContentDispositionTypes.attachment, Map("filename" -> filename)) + ) + ) + } diff --git a/src/test/resources/reference.conf b/src/test/resources/reference.conf index f3d0d4267..b7f262515 100644 --- a/src/test/resources/reference.conf +++ b/src/test/resources/reference.conf @@ -101,3 +101,7 @@ spray.can.host-connector { notification { fullyQualifiedNotificationTopic = "dummy" } + +firecloud { + max-filematching-bucket-files = 25000 +} diff --git a/src/test/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcherSpec.scala b/src/test/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcherSpec.scala new file mode 100644 index 000000000..894901a6d --- /dev/null +++ b/src/test/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcherSpec.scala @@ -0,0 +1,68 @@ +package org.broadinstitute.dsde.firecloud.filematch + +import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, PartialMatchResult, SuccessfulMatchResult} +import org.scalatest.freespec.AnyFreeSpec +import org.scalatest.matchers.should.Matchers + +class FileMatcherSpec extends AnyFreeSpec with Matchers { + + "FileMatcher" - { + "pairFiles" - { + "should match when input is ideal" in { + val input = List("Sample1_01.fastq.gz", "Sample1_02.fastq.gz", "Sample2_01.fastq.gz", "Sample2_02.fastq.gz") + + val expected = List( + SuccessfulMatchResult.fromStrings("Sample1_01.fastq.gz", "Sample1_02.fastq.gz", "Sample1"), + SuccessfulMatchResult.fromStrings("Sample2_01.fastq.gz", "Sample2_02.fastq.gz", "Sample2") + ) + val actual = new FileMatcher().pairFiles(input) + + actual shouldBe expected + } + "should still return results when no matches exist" in { + val input = List("Sample1_01.fastq.gz", "Sample2_01.fastq.gz", "Sample3_01.fastq.gz", "Sample4_01.fastq.gz") + + val expected = List( + PartialMatchResult.fromStrings("Sample1_01.fastq.gz", "Sample1"), + PartialMatchResult.fromStrings("Sample2_01.fastq.gz", "Sample2"), + PartialMatchResult.fromStrings("Sample3_01.fastq.gz", "Sample3"), + PartialMatchResult.fromStrings("Sample4_01.fastq.gz", "Sample4") + ) + val actual = new FileMatcher().pairFiles(input) + + actual shouldBe expected + } + "should return results when some but not all matches exist" in { + val input = List("Sample1_01.fastq.gz", "Sample2_01.fastq.gz", "Sample1_02.fastq.gz", "Sample4_01.fastq.gz") + + val expected = List( + SuccessfulMatchResult.fromStrings("Sample1_01.fastq.gz", "Sample1_02.fastq.gz", "Sample1"), + PartialMatchResult.fromStrings("Sample2_01.fastq.gz", "Sample2"), + PartialMatchResult.fromStrings("Sample4_01.fastq.gz", "Sample4") + ) + val actual = new FileMatcher().pairFiles(input) + + actual shouldBe expected + } + "should return results when some inputs dont hit the regex at all" in { + val input = List("Sample1_01.fastq.gz", + "Sample2_01.fastq.gz", + "Sample1_02.fastq.gz", + "anotherfile.txt", + "my-cat-picture.jpg" + ) + + val expected = List( + SuccessfulMatchResult.fromStrings("Sample1_01.fastq.gz", "Sample1_02.fastq.gz", "Sample1"), + PartialMatchResult.fromStrings("Sample2_01.fastq.gz", "Sample2"), + FailedMatchResult.fromString("anotherfile.txt"), + FailedMatchResult.fromString("my-cat-picture.jpg") + ) + val actual = new FileMatcher().pairFiles(input) + + actual shouldBe expected + } + } + } + +} diff --git a/src/test/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategySpec.scala b/src/test/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategySpec.scala new file mode 100644 index 000000000..01a97fa19 --- /dev/null +++ b/src/test/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategySpec.scala @@ -0,0 +1,68 @@ +package org.broadinstitute.dsde.firecloud.filematch.strategy + +import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult} +import org.scalatest.freespec.AnyFreeSpec +import org.scalatest.matchers.should.Matchers + +class IlluminaPairedEndStrategySpec extends AnyFreeSpec with Matchers { + + val strategy = new IlluminaPairedEndStrategy + + /** + * Naming conventions for Illumina single end and paired end read patterns. Examples of files recognized: + * + * SampleName_S1_L001_R1_001.fastq.gz -> SampleName_S1_L001_R2_001.fastq.gz + */ + + // set of input, expected test cases + val recognizedTestCases: Map[String, FileMatchResult] = Map( + "Sample1_01.fastq.gz" -> SuccessfulMatchResult(toPath("Sample1_01.fastq.gz"), + toPath("Sample1_02.fastq.gz"), + "Sample1" + ), + "someSubdirectory/Sample1_01.fastq.gz" -> SuccessfulMatchResult(toPath("someSubdirectory/Sample1_01.fastq.gz"), + toPath("someSubdirectory/Sample1_02.fastq.gz"), + "Sample1" + ), + "sample42_1.fastq.gz" -> SuccessfulMatchResult(toPath("sample42_1.fastq.gz"), + toPath("sample42_2.fastq.gz"), + "sample42" + ), + "sample01_R1.fastq.gz" -> SuccessfulMatchResult(toPath("sample01_R1.fastq.gz"), + toPath("sample01_R2.fastq.gz"), + "sample01" + ), + "/foo/bar/789a_F.fastq.gz" -> SuccessfulMatchResult(toPath("/foo/bar/789a_F.fastq.gz"), + toPath("/foo/bar/789a_R.fastq.gz"), + "789a" + ), + "sample01_R1.fastq" -> SuccessfulMatchResult(toPath("sample01_R1.fastq"), toPath("sample01_R2.fastq"), "sample01"), + "SampleName_S1_L001_R1_001.fastq.gz" -> SuccessfulMatchResult(toPath("SampleName_S1_L001_R1_001.fastq.gz"), + toPath("SampleName_S1_L001_R2_001.fastq.gz"), + "SampleName_S1_L001" + ) + ) + + val unrecognizedInputs: List[String] = + List("my-cat-picture.png", "Sample1_01.fastq.gz/is/a/bad/directory/name", "Sample1_01.fasta.gz", "Sample1_01.bam") + + "IlluminaPairedEndStrategy" - { + recognizedTestCases foreach { case (inputString, expectedMatchResult) => + s"should hit on recognized input file $inputString" in { + val matchResult = strategy.matchFirstFile(toPath(inputString)) + matchResult shouldBe expectedMatchResult + } + } + + unrecognizedInputs foreach { inputString => + s"should miss on unrecognized input file $inputString" in { + val matchResult = strategy.matchFirstFile(toPath(inputString)) + matchResult shouldBe FailedMatchResult(toPath(inputString)) + } + } + + } + + private def toPath(input: String) = new java.io.File(input).toPath + +} diff --git a/src/test/scala/org/broadinstitute/dsde/firecloud/mock/MockGoogleServicesDAO.scala b/src/test/scala/org/broadinstitute/dsde/firecloud/mock/MockGoogleServicesDAO.scala index 4c1cd1ee2..0bb373c01 100644 --- a/src/test/scala/org/broadinstitute/dsde/firecloud/mock/MockGoogleServicesDAO.scala +++ b/src/test/scala/org/broadinstitute/dsde/firecloud/mock/MockGoogleServicesDAO.scala @@ -118,4 +118,7 @@ class MockGoogleServicesDAO extends GoogleServicesDAO { pubsubMessages.addAll(messages.asJava) Future.successful(()) } + + override def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName] = + List() } diff --git a/src/test/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeServiceSpec.scala b/src/test/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeServiceSpec.scala index cdcff12ea..83e7f9562 100644 --- a/src/test/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeServiceSpec.scala +++ b/src/test/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeServiceSpec.scala @@ -10,13 +10,20 @@ import akka.http.scaladsl.server.Route.{seal => sealRoute} import akka.http.scaladsl.testkit.RouteTestTimeout import akka.http.scaladsl.unmarshalling.Unmarshal import better.files.File +import org.apache.commons.lang3.StringUtils +import org.broadinstitute.dsde.firecloud.FireCloudConfig import org.broadinstitute.dsde.firecloud.dataaccess.MockRawlsDAO +import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptions +import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptionsFormat.fileMatchingOptionsFormat import org.broadinstitute.dsde.firecloud.mock.MockGoogleServicesDAO import org.broadinstitute.dsde.firecloud.model._ import org.broadinstitute.dsde.firecloud.webservice.{CookieAuthedApiService, ExportEntitiesApiService} +import org.broadinstitute.dsde.rawls.model.ErrorReport +import org.broadinstitute.dsde.rawls.model.WorkspaceJsonSupport.ErrorReportFormat import org.broadinstitute.dsde.workbench.model.google.{GcsBucketName, GcsObjectName} +import org.mockito.ArgumentMatchers import org.mockito.ArgumentMatchers.any -import org.mockito.Mockito.{reset, spy, times, verify} +import org.mockito.Mockito.{reset, spy, times, verify, when} import org.scalatest.BeforeAndAfterEach import scala.concurrent.duration._ @@ -301,6 +308,127 @@ class ExportEntitiesByTypeServiceSpec } + "ExportEntitiesApiService-filematching" - { + List(true, false) foreach { recursive => + s"should pass on recursive flag '$recursive' to bucket listing" in { + val fileMatchingOptions = FileMatchingOptions("prefix", recursive = Option(recursive)) + Post("/api/workspaces/broad-dsde-dev/valid/entities/sample_set/paired-tsv", + fileMatchingOptions + ) ~> dummyUserIdHeaders("1234") ~> sealRoute( + exportEntitiesRoutes + ) ~> check { + handled should be(true) + status should be(OK) + verify(mockitoGoogleServicesDao, times(1)).listBucket(any[GcsBucketName], + any[Option[String]], + ArgumentMatchers.eq(recursive) + ) + } + } + } + + s"should default recursive flag to true if omitted" in { + val fileMatchingOptions = FileMatchingOptions("prefix", recursive = None) + Post("/api/workspaces/broad-dsde-dev/valid/entities/sample_set/paired-tsv", + fileMatchingOptions + ) ~> dummyUserIdHeaders("1234") ~> sealRoute( + exportEntitiesRoutes + ) ~> check { + handled should be(true) + status should be(OK) + verify(mockitoGoogleServicesDao, times(1)).listBucket(any[GcsBucketName], + any[Option[String]], + ArgumentMatchers.eq(true) + ) + } + } + + "should use read1/read2 column names if specified" in { + val fileMatchingOptions = + FileMatchingOptions("prefix", read1Name = Option("my-col-1"), read2Name = Option("column-two")) + Post("/api/workspaces/broad-dsde-dev/valid/entities/my-entity-type/paired-tsv", + fileMatchingOptions + ) ~> dummyUserIdHeaders("1234") ~> sealRoute( + exportEntitiesRoutes + ) ~> check { + handled should be(true) + status should be(OK) + responseAs[String] should startWith("entity:my-entity-type_id\tmy-col-1\tcolumn-two") + } + } + + "should default read1/read2 column names to 'read1' and 'read2' if omitted" in { + val fileMatchingOptions = FileMatchingOptions("prefix", read1Name = None, read2Name = None) + Post("/api/workspaces/broad-dsde-dev/valid/entities/my-entity-type/paired-tsv", + fileMatchingOptions + ) ~> dummyUserIdHeaders("1234") ~> sealRoute( + exportEntitiesRoutes + ) ~> check { + handled should be(true) + status should be(OK) + responseAs[String] should startWith("entity:my-entity-type_id\tread1\tread2") + } + } + + "should set a content-type of tab-separated-values on the response" in { + val fileMatchingOptions = FileMatchingOptions("prefix") + Post("/api/workspaces/broad-dsde-dev/valid/entities/my-entity-type/paired-tsv", + fileMatchingOptions + ) ~> dummyUserIdHeaders("1234") ~> sealRoute( + exportEntitiesRoutes + ) ~> check { + handled should be(true) + status should be(OK) + contentType shouldEqual ContentType(MediaTypes.`text/tab-separated-values`, HttpCharsets.`UTF-8`) + } + } + + s"should fully-qualify file paths and sanitize ids in the response" in { + val bucketListResponse = List( + GcsObjectName("unit-test/ExportEntitiesByTypeServiceSpec/file1"), + GcsObjectName("unit-test/ExportEntitiesByTypeServiceSpec/file2") + ) + when(mockitoGoogleServicesDao.listBucket(any(), any(), any())).thenReturn(bucketListResponse) + val fileMatchingOptions = FileMatchingOptions("prefix") + Post("/api/workspaces/broad-dsde-dev/valid/entities/my-entity-type/paired-tsv", + fileMatchingOptions + ) ~> dummyUserIdHeaders("1234") ~> sealRoute( + exportEntitiesRoutes + ) ~> check { + handled should be(true) + status should be(OK) + + val expected = + "entity:my-entity-type_id\tread1\tread2\n" + + "unit-test_ExportEntitiesByTypeServiceSpec_file1\tgs://bucketName/unit-test/ExportEntitiesByTypeServiceSpec/file1\t\n" + + "unit-test_ExportEntitiesByTypeServiceSpec_file2\tgs://bucketName/unit-test/ExportEntitiesByTypeServiceSpec/file2\t\n" + + responseAs[String] shouldBe expected + } + } + + s"should throw an error if the bucket contains too many files" in { + val configuredMax = FireCloudConfig.FireCloud.maxFileMatchingFileCount + val bucketListResponse: List[GcsObjectName] = + Range.apply(0, configuredMax + 1).map(idx => GcsObjectName(s"file$idx")).toList + + when(mockitoGoogleServicesDao.listBucket(any(), any(), any())).thenReturn(bucketListResponse) + val fileMatchingOptions = FileMatchingOptions("prefix") + Post("/api/workspaces/broad-dsde-dev/valid/entities/my-entity-type/paired-tsv", + fileMatchingOptions + ) ~> dummyUserIdHeaders("1234") ~> sealRoute( + exportEntitiesRoutes + ) ~> check { + handled should be(true) + status should be(BadRequest) + contentType shouldEqual ContentTypes.`application/json` + val actualError = responseAs[ErrorReport] + actualError.message should startWith("Too many files in bucket") + } + } + + } + val validCookieFireCloudEntitiesLargeSampleTSVPath = "/cookie-authed/workspaces/broad-dsde-dev/large/entities/sample/tsv" val validCookieFireCloudEntitiesSampleSetTSVPath =