Skip to content

Commit

Permalink
updates
Browse files Browse the repository at this point in the history
  • Loading branch information
davidangb committed Nov 1, 2024
1 parent 7857f88 commit 485ee0a
Show file tree
Hide file tree
Showing 11 changed files with 119 additions and 93 deletions.
30 changes: 23 additions & 7 deletions src/main/resources/swagger/api-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3703,13 +3703,7 @@ paths:
content:
'application/json':
schema:
type: object
required:
- prefix
properties:
prefix:
type: string
description: bucket prefix in which to look
$ref: '#/components/schemas/FileMatchingOptions'
required: true
responses:
200:
Expand Down Expand Up @@ -7511,6 +7505,28 @@ components:
ExtendedEnabled:
allOf:
- $ref: '#/components/schemas/Enabled'
FileMatchingOptions:
type: object
required:
- prefix
properties:
prefix:
type: string
description: |
Bucket prefix in which to look. If `recursive` is false, this must include a trailing
slash when specifying a subdirectory.
read1Name:
type: string
description: column name to use for the primary "read 1" file
default: read1
read2Name:
type: string
description: column name to use for the matching "read 2" file
default: read2
recursive:
type: boolean
description: whether to list files in subdirectories of the prefix
default: true
FireCloudPermission:
required:
- role
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,5 +41,5 @@ trait GoogleServicesDAO extends ReportsSubsystemStatus {

def getBucket(bucketName: String, petKey: String): Option[Bucket]

def listBucket(bucketName: GcsBucketName, prefix: Option[String]): List[GcsObjectName]
def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName]
}
Original file line number Diff line number Diff line change
Expand Up @@ -401,12 +401,15 @@ class HttpGoogleServicesDAO(priceListUrl: String, defaultPriceList: GooglePriceL
.refreshAccessToken()
.getTokenValue

override def listBucket(bucketName: GcsBucketName, prefix: Option[String]): List[GcsObjectName] = {
override def listBucket(bucketName: GcsBucketName,
prefix: Option[String],
recursive: Boolean = true
): List[GcsObjectName] = {
// listObjectsWithPrefix handles paginating through results if there are more results than
// the `maxPageSize` setting.
val listAttempt = getStorageResource.use { storageService =>
storageService
.listObjectsWithPrefix(bucketName, prefix.getOrElse(""), maxPageSize = 2000, isRecursive = true)
.listObjectsWithPrefix(bucketName, prefix.getOrElse(""), maxPageSize = 2000, isRecursive = recursive)
.compile
.toList
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
package org.broadinstitute.dsde.firecloud.utils
package org.broadinstitute.dsde.firecloud.filematch

import com.typesafe.scalalogging.LazyLogging
import org.broadinstitute.dsde.firecloud.filematch.result
import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult}
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy}

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
package org.broadinstitute.dsde.firecloud.filematch

import spray.json.DefaultJsonProtocol.jsonFormat4
import spray.json.RootJsonFormat
import spray.json.DefaultJsonProtocol._

case class FileMatchingOptions(prefix: String,
read1Name: Option[String] = None,
read2Name: Option[String] = None,
recursive: Option[Boolean] = None
)

object FileMatchingOptionsFormat {
implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] = jsonFormat4(FileMatchingOptions)
}
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,9 @@ import org.broadinstitute.dsde.firecloud.filematch.result.FileMatchResult

import java.nio.file.Path

/**
* Marker trait representing file-naming conventions used for pairing matched reads.
*/
trait FileRecognitionStrategy {

def matchFirstFile(path: Path): FileMatchResult
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,40 +5,44 @@ import org.broadinstitute.dsde.firecloud.filematch.strategy.IlluminaPairedEndStr

import java.nio.file.Path

/*
Sample1_01.fastq.gz -> Sample1_02.fastq.gz
sample01_1.fastq.gz -> sample01_2.fastq.gz
sample01_R1.fastq.gz -> sample01_R2.fastq.gz
sample01_F.fastq.gz -> sample01_R.fastq.gz
sample01_R1.fastq -> sample01_R2.fastq
SampleName_S1_L001_R1_001.fastq.gz -> SampleName_S1_L001_R2_001.fastq.gz
*/

object IlluminaPairedEndStrategy {
// if the first file ends with ${key}, then the second file should end with ${value}
val FILE_ENDINGS: Map[String, String] = Map(
"_01.fastq.gz" -> "_02.fastq.gz",
"_1.fastq.gz" -> "_2.fastq.gz",
"_R1.fastq.gz" -> "_R2.fastq.gz",
"_F.fastq.gz" -> "_R.fastq.gz",
"_R1.fastq" -> "_R2.fastq",
"_R1_001.fastq.gz" -> "_R2_001.fastq.gz"
)
}

/**
* Naming conventions for Illumina single end and paired end read patterns. Examples of files recognized:
*
* Sample1_01.fastq.gz -> Sample1_02.fastq.gz
* sample01_1.fastq.gz -> sample01_2.fastq.gz
* sample01_R1.fastq.gz -> sample01_R2.fastq.gz
* sample01_F.fastq.gz -> sample01_R.fastq.gz
* sample01_R1.fastq -> sample01_R2.fastq
* SampleName_S1_L001_R1_001.fastq.gz -> SampleName_S1_L001_R2_001.fastq.gz
*/
class IlluminaPairedEndStrategy extends FileRecognitionStrategy {
override def matchFirstFile(path: Path): FileMatchResult = {
// search known patterns for a "read1" file
val foundMatch = FILE_ENDINGS.find { case (key, _) => path.toString.endsWith(key) }

foundMatch match {
// we found a "read1"
case Some((key, value)) =>
// generate the id: strip the suffix from the filename.
val id = path.getFileName.toString.replace(key, "")
// generate the second filename: replace the first suffix with the second suffix
val secondFile = new java.io.File(path.toString.replace(key, value))
SuccessfulMatchResult(path, secondFile.toPath, id)

// the file is not recognized
case None => FailedMatchResult(path)
}
}
}

object IlluminaPairedEndStrategy {
// if the first file ends with ${key}, then the second file should end with ${value}
val FILE_ENDINGS: Map[String, String] = Map(
"_01.fastq.gz" -> "_02.fastq.gz",
"_1.fastq.gz" -> "_2.fastq.gz",
"_R1.fastq.gz" -> "_R2.fastq.gz",
"_F.fastq.gz" -> "_R.fastq.gz",
"_R1.fastq" -> "_R2.fastq",
"_R1_001.fastq.gz" -> "_R2_001.fastq.gz"
)
}
Original file line number Diff line number Diff line change
Expand Up @@ -9,16 +9,17 @@ import akka.util.{ByteString, Timeout}
import better.files.File
import com.typesafe.scalalogging.LazyLogging
import org.broadinstitute.dsde.firecloud.dataaccess.{GoogleServicesDAO, RawlsDAO}
import org.broadinstitute.dsde.firecloud.filematch.{FileMatcher, FileMatchingOptions}
import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptionsFormat.fileMatchingOptionsFormat
import org.broadinstitute.dsde.firecloud.filematch.result.{
FailedMatchResult,
FileMatchResult,
PartialMatchResult,
SuccessfulMatchResult
}
import org.broadinstitute.dsde.firecloud.model.ModelJsonProtocol._
import org.broadinstitute.dsde.firecloud.model.{UserInfo, _}
import org.broadinstitute.dsde.firecloud.service.ExportEntitiesByTypeActor.FileMatchingOptions
import org.broadinstitute.dsde.firecloud.utils.{FileMatcher, TSVFormatter}
import org.broadinstitute.dsde.firecloud.model._
import org.broadinstitute.dsde.firecloud.utils.TSVFormatter
import org.broadinstitute.dsde.firecloud.{Application, FireCloudConfig, FireCloudExceptionWithErrorReport}
import org.broadinstitute.dsde.rawls.model.WorkspaceAccessLevels.WorkspaceAccessLevel
import org.broadinstitute.dsde.rawls.model._
Expand Down Expand Up @@ -59,18 +60,6 @@ object ExportEntitiesByTypeActor {
exportArgs.model,
system
)

// *******************************************************************************************************************
// POC of file-matching for AJ-2025
// *******************************************************************************************************************
import spray.json.DefaultJsonProtocol._

case class FileMatchingOptions(prefix: String)

implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] = jsonFormat1(FileMatchingOptions)
// *******************************************************************************************************************
// POC of file-matching for AJ-2025:
// *******************************************************************************************************************
}

/**
Expand Down Expand Up @@ -395,12 +384,18 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO,
// those files based on Illumina single end and paired end read patterns
// *******************************************************************************************************************

def matchBucketFiles(matchingOptions: FileMatchingOptions): Future[String] =
def matchBucketFiles(matchingOptions: FileMatchingOptions): Future[String] = {
// generate defaults for options
val read1Name = matchingOptions.read1Name.getOrElse("read1")
val read2Name = matchingOptions.read2Name.getOrElse("read2")
val recursive = matchingOptions.recursive.getOrElse(true)

// retrieve workspace so we can get its bucket
rawlsDAO.getWorkspace(workspaceNamespace, workspaceName)(userInfo) map { workspaceResponse =>
val workspaceBucket = GcsBucketName(workspaceResponse.workspace.bucketName)
// list all files in bucket which match matchingOptions.prefix
val fileList: List[GcsObjectName] = googleServicesDao.listBucket(workspaceBucket, Option(matchingOptions.prefix))
val fileList: List[GcsObjectName] =
googleServicesDao.listBucket(workspaceBucket, Option(matchingOptions.prefix), recursive)

// transform the list of GcsObjectName to a list of java.nio.Path
val pathList: List[Path] = fileList.map(gcsObject => new java.io.File(gcsObject.value).toPath)
Expand All @@ -409,24 +404,24 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO,
val pairs: List[FileMatchResult] = new FileMatcher().pairPaths(pathList)

// TSV headers
val entityHeaders: IndexedSeq[String] = IndexedSeq(s"entity:${entityType}_id", "read1", "read2")
val entityHeaders: IndexedSeq[String] = IndexedSeq(s"entity:${entityType}_id", read1Name, read2Name)

// transform the matched pairs into entities
val entities: List[Entity] = pairs.map {
case SuccessfulMatchResult(firstFile, secondFile, id) =>
val attributes = Map(
AttributeName.withDefaultNS("read1") -> AttributeString(firstFile.toString),
AttributeName.withDefaultNS("read2") -> AttributeString(secondFile.toString)
AttributeName.withDefaultNS(read1Name) -> AttributeString(firstFile.toString),
AttributeName.withDefaultNS(read2Name) -> AttributeString(secondFile.toString)
)
Entity(id, entityType, attributes)
case PartialMatchResult(firstFile, id) =>
val attributes = Map(
AttributeName.withDefaultNS("read1") -> AttributeString(firstFile.toString)
AttributeName.withDefaultNS(read1Name) -> AttributeString(firstFile.toString)
)
Entity(id, entityType, attributes)
case FailedMatchResult(firstFile) =>
val attributes = Map(
AttributeName.withDefaultNS("read1") -> AttributeString(firstFile.toString)
AttributeName.withDefaultNS(read1Name) -> AttributeString(firstFile.toString)
)
Entity(firstFile.toString, entityType, attributes)

Expand All @@ -440,5 +435,6 @@ class ExportEntitiesByTypeActor(rawlsDAO: RawlsDAO,

headerString + rowString
}
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import akka.http.scaladsl.model.headers.{`Content-Disposition`, `Content-Type`,
import akka.http.scaladsl.server.{Directives, Route}
import com.typesafe.scalalogging.LazyLogging
import org.apache.commons.lang3.StringUtils
import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptions
import org.broadinstitute.dsde.firecloud.filematch.FileMatchingOptionsFormat.fileMatchingOptionsFormat
import org.broadinstitute.dsde.firecloud.service.PerRequest.{RequestComplete, RequestCompleteWithHeaders}
import org.broadinstitute.dsde.firecloud.service.{ExportEntitiesByTypeActor, ExportEntitiesByTypeArguments}
import org.broadinstitute.dsde.firecloud.utils.StandardUserInfoDirectives
Expand Down Expand Up @@ -65,48 +67,35 @@ trait ExportEntitiesApiService
}
}
}
} ~
// *******************************************************************************************************************
// POC of file-matching for AJ-2025
// *******************************************************************************************************************
// TODO: add swagger definition
path("api" / "workspaces" / Segment / Segment / "entities" / Segment / "tsv" / "frombucket") {
(workspaceNamespace, workspaceName, entityType) =>
requireUserInfo() { userInfo =>
post {
import ExportEntitiesByTypeActor._
entity(as[FileMatchingOptions]) { matchingOptions =>
val attributeNames = None
val model = None
val exportArgs = ExportEntitiesByTypeArguments(userInfo,
workspaceNamespace,
workspaceName,
entityType,
attributeNames,
model
)
} ~ path("api" / "workspaces" / Segment / Segment / "entities" / Segment / "tsv" / "frombucket") {
(workspaceNamespace, workspaceName, entityType) =>
requireUserInfo() { userInfo =>
post {
entity(as[FileMatchingOptions]) { matchingOptions =>
val exportArgs =
ExportEntitiesByTypeArguments(userInfo, workspaceNamespace, workspaceName, entityType, None, None)

complete {
exportEntitiesByTypeConstructor(exportArgs).matchBucketFiles(matchingOptions) map { pairs =>
// download the TSV as an attachment:
RequestCompleteWithHeaders(
(OK, pairs),
`Content-Type`.apply(
ContentType.apply(MediaTypes.`text/tab-separated-values`, HttpCharsets.`UTF-8`)
),
`Content-Disposition`.apply(ContentDispositionTypes.attachment,
Map("filename" -> "filematching.tsv")
)
)
complete {
exportEntitiesByTypeConstructor(exportArgs).matchBucketFiles(matchingOptions) map { pairs =>
// download the TSV as an attachment:
// RequestCompleteWithHeaders(
// (OK, pairs),
// `Content-Type`.apply(
// ContentType.apply(MediaTypes.`text/tab-separated-values`, HttpCharsets.`UTF-8`)
// ),
// `Content-Disposition`.apply(ContentDispositionTypes.attachment,
// Map("filename" -> "filematching.tsv")
// )
// )

// for easy debugging: output the TSV as text
// RequestComplete(OK, pairs)
}
// for easy debugging: output the TSV as text
RequestComplete(OK, pairs)
}
}
}
}
}
}
}
// *******************************************************************************************************************
// POC of file-matching for AJ-2025
// *******************************************************************************************************************
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.broadinstitute.dsde.firecloud.utils
package org.broadinstitute.dsde.firecloud.filematch

import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, PartialMatchResult, SuccessfulMatchResult}
import org.scalatest.freespec.AnyFreeSpec
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,5 +119,6 @@ class MockGoogleServicesDAO extends GoogleServicesDAO {
Future.successful(())
}

override def listBucket(bucketName: GcsBucketName, prefix: Option[String]): List[GcsObjectName] = ???
override def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName] =
???
}

0 comments on commit 485ee0a

Please sign in to comment.