From 485ee0af256a78feba0762c46c968f9c9b9c764a Mon Sep 17 00:00:00 2001 From: David An Date: Fri, 1 Nov 2024 12:58:42 -0400 Subject: [PATCH] updates --- src/main/resources/swagger/api-docs.yaml | 30 ++++++--- .../dataaccess/GoogleServicesDAO.scala | 2 +- .../dataaccess/HttpGoogleServicesDAO.scala | 7 ++- .../{utils => filematch}/FileMatcher.scala | 3 +- .../filematch/FileMatchingOptions.scala | 15 +++++ .../strategy/FileRecognitionStrategy.scala | 3 + .../strategy/IlluminaPairedEndStrategy.scala | 46 +++++++------- .../service/ExportEntitiesByTypeActor.scala | 40 ++++++------ .../webservice/ExportEntitiesApiService.scala | 61 ++++++++----------- .../FileMatcherSpec.scala | 2 +- .../mock/MockGoogleServicesDAO.scala | 3 +- 11 files changed, 119 insertions(+), 93 deletions(-) rename src/main/scala/org/broadinstitute/dsde/firecloud/{utils => filematch}/FileMatcher.scala (98%) create mode 100644 src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala rename src/test/scala/org/broadinstitute/dsde/firecloud/{utils => filematch}/FileMatcherSpec.scala (98%) diff --git a/src/main/resources/swagger/api-docs.yaml b/src/main/resources/swagger/api-docs.yaml index 7573b78bf..dd859d030 100755 --- a/src/main/resources/swagger/api-docs.yaml +++ b/src/main/resources/swagger/api-docs.yaml @@ -3703,13 +3703,7 @@ paths: content: 'application/json': schema: - type: object - required: - - prefix - properties: - prefix: - type: string - description: bucket prefix in which to look + $ref: '#/components/schemas/FileMatchingOptions' required: true responses: 200: @@ -7511,6 +7505,28 @@ components: ExtendedEnabled: allOf: - $ref: '#/components/schemas/Enabled' + FileMatchingOptions: + type: object + required: + - prefix + properties: + prefix: + type: string + description: | + Bucket prefix in which to look. If `recursive` is false, this must include a trailing + slash when specifying a subdirectory. + read1Name: + type: string + description: column name to use for the primary "read 1" file + default: read1 + read2Name: + type: string + description: column name to use for the matching "read 2" file + default: read2 + recursive: + type: boolean + description: whether to list files in subdirectories of the prefix + default: true FireCloudPermission: required: - role diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/GoogleServicesDAO.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/GoogleServicesDAO.scala index 8c06a4b83..0202023f5 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/GoogleServicesDAO.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/GoogleServicesDAO.scala @@ -41,5 +41,5 @@ trait GoogleServicesDAO extends ReportsSubsystemStatus { def getBucket(bucketName: String, petKey: String): Option[Bucket] - def listBucket(bucketName: GcsBucketName, prefix: Option[String]): List[GcsObjectName] + def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName] } diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/HttpGoogleServicesDAO.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/HttpGoogleServicesDAO.scala index 51f13ed48..022b43485 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/HttpGoogleServicesDAO.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/dataaccess/HttpGoogleServicesDAO.scala @@ -401,12 +401,15 @@ class HttpGoogleServicesDAO(priceListUrl: String, defaultPriceList: GooglePriceL .refreshAccessToken() .getTokenValue - override def listBucket(bucketName: GcsBucketName, prefix: Option[String]): List[GcsObjectName] = { + override def listBucket(bucketName: GcsBucketName, + prefix: Option[String], + recursive: Boolean = true + ): List[GcsObjectName] = { // listObjectsWithPrefix handles paginating through results if there are more results than // the `maxPageSize` setting. val listAttempt = getStorageResource.use { storageService => storageService - .listObjectsWithPrefix(bucketName, prefix.getOrElse(""), maxPageSize = 2000, isRecursive = true) + .listObjectsWithPrefix(bucketName, prefix.getOrElse(""), maxPageSize = 2000, isRecursive = recursive) .compile .toList } diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcher.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcher.scala similarity index 98% rename from src/main/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcher.scala rename to src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcher.scala index be4d151aa..a3470db32 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcher.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcher.scala @@ -1,7 +1,6 @@ -package org.broadinstitute.dsde.firecloud.utils +package org.broadinstitute.dsde.firecloud.filematch import com.typesafe.scalalogging.LazyLogging -import org.broadinstitute.dsde.firecloud.filematch.result import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult} import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala new file mode 100644 index 000000000..4dbf7c54c --- /dev/null +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala @@ -0,0 +1,15 @@ +package org.broadinstitute.dsde.firecloud.filematch + +import spray.json.DefaultJsonProtocol.jsonFormat4 +import spray.json.RootJsonFormat +import spray.json.DefaultJsonProtocol._ + +case class FileMatchingOptions(prefix: String, + read1Name: Option[String] = None, + read2Name: Option[String] = None, + recursive: Option[Boolean] = None +) + +object FileMatchingOptionsFormat { + implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] = jsonFormat4(FileMatchingOptions) +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala index 1c3cf2783..d81fd6bca 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala @@ -4,6 +4,9 @@ import org.broadinstitute.dsde.firecloud.filematch.result.FileMatchResult import java.nio.file.Path +/** + * Marker trait representing file-naming conventions used for pairing matched reads. + */ trait FileRecognitionStrategy { def matchFirstFile(path: Path): FileMatchResult diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala index 17de75b48..124585c4f 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/filematch/strategy/IlluminaPairedEndStrategy.scala @@ -5,32 +5,23 @@ import org.broadinstitute.dsde.firecloud.filematch.strategy.IlluminaPairedEndStr import java.nio.file.Path -/* -Sample1_01.fastq.gz -> Sample1_02.fastq.gz -sample01_1.fastq.gz -> sample01_2.fastq.gz -sample01_R1.fastq.gz -> sample01_R2.fastq.gz -sample01_F.fastq.gz -> sample01_R.fastq.gz -sample01_R1.fastq -> sample01_R2.fastq -SampleName_S1_L001_R1_001.fastq.gz -> SampleName_S1_L001_R2_001.fastq.gz - */ - -object IlluminaPairedEndStrategy { - // if the first file ends with ${key}, then the second file should end with ${value} - val FILE_ENDINGS: Map[String, String] = Map( - "_01.fastq.gz" -> "_02.fastq.gz", - "_1.fastq.gz" -> "_2.fastq.gz", - "_R1.fastq.gz" -> "_R2.fastq.gz", - "_F.fastq.gz" -> "_R.fastq.gz", - "_R1.fastq" -> "_R2.fastq", - "_R1_001.fastq.gz" -> "_R2_001.fastq.gz" - ) -} - +/** + * Naming conventions for Illumina single end and paired end read patterns. Examples of files recognized: + * + * Sample1_01.fastq.gz -> Sample1_02.fastq.gz + * sample01_1.fastq.gz -> sample01_2.fastq.gz + * sample01_R1.fastq.gz -> sample01_R2.fastq.gz + * sample01_F.fastq.gz -> sample01_R.fastq.gz + * sample01_R1.fastq -> sample01_R2.fastq + * SampleName_S1_L001_R1_001.fastq.gz -> SampleName_S1_L001_R2_001.fastq.gz + */ class IlluminaPairedEndStrategy extends FileRecognitionStrategy { override def matchFirstFile(path: Path): FileMatchResult = { + // search known patterns for a "read1" file val foundMatch = FILE_ENDINGS.find { case (key, _) => path.toString.endsWith(key) } foundMatch match { + // we found a "read1" case Some((key, value)) => // generate the id: strip the suffix from the filename. val id = path.getFileName.toString.replace(key, "") @@ -38,7 +29,20 @@ class IlluminaPairedEndStrategy extends FileRecognitionStrategy { val secondFile = new java.io.File(path.toString.replace(key, value)) SuccessfulMatchResult(path, secondFile.toPath, id) + // the file is not recognized case None => FailedMatchResult(path) } } } + +object IlluminaPairedEndStrategy { + // if the first file ends with ${key}, then the second file should end with ${value} + val FILE_ENDINGS: Map[String, String] = Map( + "_01.fastq.gz" -> "_02.fastq.gz", + "_1.fastq.gz" -> "_2.fastq.gz", + "_R1.fastq.gz" -> "_R2.fastq.gz", + "_F.fastq.gz" -> "_R.fastq.gz", + "_R1.fastq" -> "_R2.fastq", + "_R1_001.fastq.gz" -> "_R2_001.fastq.gz" + ) +} diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeActor.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeActor.scala index b424cd77e..15a6a0acd 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeActor.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/service/ExportEntitiesByTypeActor.scala @@ -9,6 +9,8 @@ import akka.util.{ByteString, Timeout} import better.files.File import com.typesafe.scalalogging.LazyLogging import org.broadinstitute.dsde.firecloud.dataaccess.{GoogleServicesDAO, RawlsDAO} +import org.broadinstitute.dsde.firecloud.filematch.{FileMatcher, FileMatchingOptions} +import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptionsFormat.fileMatchingOptionsFormat import org.broadinstitute.dsde.firecloud.filematch.result.{ FailedMatchResult, FileMatchResult, @@ -16,9 +18,8 @@ import org.broadinstitute.dsde.firecloud.filematch.result.{ SuccessfulMatchResult } import org.broadinstitute.dsde.firecloud.model.ModelJsonProtocol._ -import org.broadinstitute.dsde.firecloud.model.{UserInfo, _} -import org.broadinstitute.dsde.firecloud.service.ExportEntitiesByTypeActor.FileMatchingOptions -import org.broadinstitute.dsde.firecloud.utils.{FileMatcher, TSVFormatter} +import org.broadinstitute.dsde.firecloud.model._ +import org.broadinstitute.dsde.firecloud.utils.TSVFormatter import org.broadinstitute.dsde.firecloud.{Application, FireCloudConfig, FireCloudExceptionWithErrorReport} import org.broadinstitute.dsde.rawls.model.WorkspaceAccessLevels.WorkspaceAccessLevel import org.broadinstitute.dsde.rawls.model._ @@ -59,18 +60,6 @@ object ExportEntitiesByTypeActor { exportArgs.model, system ) - - // ******************************************************************************************************************* - // POC of file-matching for AJ-2025 - // ******************************************************************************************************************* - import spray.json.DefaultJsonProtocol._ - - case class FileMatchingOptions(prefix: String) - - implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] = jsonFormat1(FileMatchingOptions) - // ******************************************************************************************************************* - // POC of file-matching for AJ-2025: - // ******************************************************************************************************************* } /** @@ -395,12 +384,18 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO, // those files based on Illumina single end and paired end read patterns // ******************************************************************************************************************* - def matchBucketFiles(matchingOptions: FileMatchingOptions): Future[String] = + def matchBucketFiles(matchingOptions: FileMatchingOptions): Future[String] = { + // generate defaults for options + val read1Name = matchingOptions.read1Name.getOrElse("read1") + val read2Name = matchingOptions.read2Name.getOrElse("read2") + val recursive = matchingOptions.recursive.getOrElse(true) + // retrieve workspace so we can get its bucket rawlsDAO.getWorkspace(workspaceNamespace, workspaceName)(userInfo) map { workspaceResponse => val workspaceBucket = GcsBucketName(workspaceResponse.workspace.bucketName) // list all files in bucket which match matchingOptions.prefix - val fileList: List[GcsObjectName] = googleServicesDao.listBucket(workspaceBucket, Option(matchingOptions.prefix)) + val fileList: List[GcsObjectName] = + googleServicesDao.listBucket(workspaceBucket, Option(matchingOptions.prefix), recursive) // transform the list of GcsObjectName to a list of java.nio.Path val pathList: List[Path] = fileList.map(gcsObject => new java.io.File(gcsObject.value).toPath) @@ -409,24 +404,24 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO, val pairs: List[FileMatchResult] = new FileMatcher().pairPaths(pathList) // TSV headers - val entityHeaders: IndexedSeq[String] = IndexedSeq(s"entity:${entityType}_id", "read1", "read2") + val entityHeaders: IndexedSeq[String] = IndexedSeq(s"entity:${entityType}_id", read1Name, read2Name) // transform the matched pairs into entities val entities: List[Entity] = pairs.map { case SuccessfulMatchResult(firstFile, secondFile, id) => val attributes = Map( - AttributeName.withDefaultNS("read1") -> AttributeString(firstFile.toString), - AttributeName.withDefaultNS("read2") -> AttributeString(secondFile.toString) + AttributeName.withDefaultNS(read1Name) -> AttributeString(firstFile.toString), + AttributeName.withDefaultNS(read2Name) -> AttributeString(secondFile.toString) ) Entity(id, entityType, attributes) case PartialMatchResult(firstFile, id) => val attributes = Map( - AttributeName.withDefaultNS("read1") -> AttributeString(firstFile.toString) + AttributeName.withDefaultNS(read1Name) -> AttributeString(firstFile.toString) ) Entity(id, entityType, attributes) case FailedMatchResult(firstFile) => val attributes = Map( - AttributeName.withDefaultNS("read1") -> AttributeString(firstFile.toString) + AttributeName.withDefaultNS(read1Name) -> AttributeString(firstFile.toString) ) Entity(firstFile.toString, entityType, attributes) @@ -440,5 +435,6 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO, headerString + rowString } + } } diff --git a/src/main/scala/org/broadinstitute/dsde/firecloud/webservice/ExportEntitiesApiService.scala b/src/main/scala/org/broadinstitute/dsde/firecloud/webservice/ExportEntitiesApiService.scala index 21cb1e9a3..6dfb4508c 100644 --- a/src/main/scala/org/broadinstitute/dsde/firecloud/webservice/ExportEntitiesApiService.scala +++ b/src/main/scala/org/broadinstitute/dsde/firecloud/webservice/ExportEntitiesApiService.scala @@ -8,6 +8,8 @@ import akka.http.scaladsl.model.headers.{`Content-Disposition`, `Content-Type`, import akka.http.scaladsl.server.{Directives, Route} import com.typesafe.scalalogging.LazyLogging import org.apache.commons.lang3.StringUtils +import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptions +import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptionsFormat.fileMatchingOptionsFormat import org.broadinstitute.dsde.firecloud.service.PerRequest.{RequestComplete, RequestCompleteWithHeaders} import org.broadinstitute.dsde.firecloud.service.{ExportEntitiesByTypeActor, ExportEntitiesByTypeArguments} import org.broadinstitute.dsde.firecloud.utils.StandardUserInfoDirectives @@ -65,48 +67,35 @@ trait ExportEntitiesApiService } } } - } ~ - // ******************************************************************************************************************* - // POC of file-matching for AJ-2025 - // ******************************************************************************************************************* - // TODO: add swagger definition - path("api" / "workspaces" / Segment / Segment / "entities" / Segment / "tsv" / "frombucket") { - (workspaceNamespace, workspaceName, entityType) => - requireUserInfo() { userInfo => - post { - import ExportEntitiesByTypeActor._ - entity(as[FileMatchingOptions]) { matchingOptions => - val attributeNames = None - val model = None - val exportArgs = ExportEntitiesByTypeArguments(userInfo, - workspaceNamespace, - workspaceName, - entityType, - attributeNames, - model - ) + } ~ path("api" / "workspaces" / Segment / Segment / "entities" / Segment / "tsv" / "frombucket") { + (workspaceNamespace, workspaceName, entityType) => + requireUserInfo() { userInfo => + post { + entity(as[FileMatchingOptions]) { matchingOptions => + val exportArgs = + ExportEntitiesByTypeArguments(userInfo, workspaceNamespace, workspaceName, entityType, None, None) - complete { - exportEntitiesByTypeConstructor(exportArgs).matchBucketFiles(matchingOptions) map { pairs => - // download the TSV as an attachment: - RequestCompleteWithHeaders( - (OK, pairs), - `Content-Type`.apply( - ContentType.apply(MediaTypes.`text/tab-separated-values`, HttpCharsets.`UTF-8`) - ), - `Content-Disposition`.apply(ContentDispositionTypes.attachment, - Map("filename" -> "filematching.tsv") - ) - ) + complete { + exportEntitiesByTypeConstructor(exportArgs).matchBucketFiles(matchingOptions) map { pairs => + // download the TSV as an attachment: +// RequestCompleteWithHeaders( +// (OK, pairs), +// `Content-Type`.apply( +// ContentType.apply(MediaTypes.`text/tab-separated-values`, HttpCharsets.`UTF-8`) +// ), +// `Content-Disposition`.apply(ContentDispositionTypes.attachment, +// Map("filename" -> "filematching.tsv") +// ) +// ) - // for easy debugging: output the TSV as text - // RequestComplete(OK, pairs) - } + // for easy debugging: output the TSV as text + RequestComplete(OK, pairs) } } } } - } + } + } // ******************************************************************************************************************* // POC of file-matching for AJ-2025 // ******************************************************************************************************************* diff --git a/src/test/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcherSpec.scala b/src/test/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcherSpec.scala similarity index 98% rename from src/test/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcherSpec.scala rename to src/test/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcherSpec.scala index f305649b7..894901a6d 100644 --- a/src/test/scala/org/broadinstitute/dsde/firecloud/utils/FileMatcherSpec.scala +++ b/src/test/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcherSpec.scala @@ -1,4 +1,4 @@ -package org.broadinstitute.dsde.firecloud.utils +package org.broadinstitute.dsde.firecloud.filematch import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, PartialMatchResult, SuccessfulMatchResult} import org.scalatest.freespec.AnyFreeSpec diff --git a/src/test/scala/org/broadinstitute/dsde/firecloud/mock/MockGoogleServicesDAO.scala b/src/test/scala/org/broadinstitute/dsde/firecloud/mock/MockGoogleServicesDAO.scala index 9124b55ee..a00d24fad 100644 --- a/src/test/scala/org/broadinstitute/dsde/firecloud/mock/MockGoogleServicesDAO.scala +++ b/src/test/scala/org/broadinstitute/dsde/firecloud/mock/MockGoogleServicesDAO.scala @@ -119,5 +119,6 @@ class MockGoogleServicesDAO extends GoogleServicesDAO { Future.successful(()) } - override def listBucket(bucketName: GcsBucketName, prefix: Option[String]): List[GcsObjectName] = ??? + override def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName] = + ??? }