Skip to content

Commit

Permalink
CORE-123: new file-pairing API for data uploader (#1452)
Browse files Browse the repository at this point in the history
  • Loading branch information
davidangb authored Nov 5, 2024
1 parent 775cb6d commit 611acf8
Show file tree
Hide file tree
Showing 20 changed files with 779 additions and 8 deletions.
4 changes: 4 additions & 0 deletions src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,7 @@ googlecloud {
"153601": 0.045
}
}

firecloud {
max-filematching-bucket-files = 25000
}
71 changes: 71 additions & 0 deletions src/main/resources/swagger/api-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3688,6 +3688,55 @@ paths:
description: Internal Server Error
content: {}
x-passthrough: false
/api/workspaces/{workspaceNamespace}/{workspaceName}/entities/{entityType}/paired-tsv:
post:
tags:
- Entities
summary: |
Download a TSV of files in the workspace's bucket, paired by naming convention
description: |
Lists the files in the workspace's bucket, filtered to a specified prefix. Then,
attempts to pair those files to each other based on well-known naming conventions.
Downloads a TSV containing the result of those pairings.
operationId: bucketPairedTSV
parameters:
- $ref: '#/components/parameters/workspaceNamespaceParam'
- $ref: '#/components/parameters/workspaceNameParam'
- $ref: '#/components/parameters/entityTypeParam'
requestBody:
content:
'application/json':
schema:
$ref: '#/components/schemas/FileMatchingOptions'
examples:
minimally-required:
value:
prefix: my-bucket-prefix
disable-recursion:
value:
prefix: my-bucket-prefix/
recursive: false
rename-columns:
value:
prefix: my-bucket-prefix
read1Name: my-column-name-one
read2Name: my-column-name-two
required: true
responses:
200:
description: TSV file containing the file-pairing results
content:
text/plain:
schema:
type: string
format: binary
404:
description: Workspace or entity type does not exist
content: {}
500:
description: Internal Server Error
content: {}
x-passthrough: false
/api/workspaces/{workspaceNamespace}/{workspaceName}/entityQuery/{entityType}:
get:
tags:
Expand Down Expand Up @@ -7473,6 +7522,28 @@ components:
ExtendedEnabled:
allOf:
- $ref: '#/components/schemas/Enabled'
FileMatchingOptions:
type: object
required:
- prefix
properties:
prefix:
type: string
description: |
Bucket prefix in which to look. If `recursive` is false, this must include a trailing
slash when specifying a subdirectory.
read1Name:
type: string
description: column name to use for the primary "read 1" file
default: read1
read2Name:
type: string
description: column name to use for the matching "read 2" file
default: read2
recursive:
type: boolean
description: whether to list files in subdirectories of the prefix
default: true
FireCloudPermission:
required:
- role
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ object FireCloudConfig {
// each of these settings is read lazily from the `firecloud` config object
lazy val supportDomain = firecloud.getString("supportDomain")
lazy val supportPrefix = firecloud.getString("supportPrefix")
lazy val userAdminAccount = firecloud.getString("userAdminAccount")
// integer read from the `max-filematching-bucket-files` key; reference.conf ships a default of 25000.
// NOTE(review): presumably the upper bound on bucket files considered by the file-matching API;
// the code that enforces it is not visible here - confirm against the caller.
lazy val maxFileMatchingFileCount = firecloud.getInt("max-filematching-bucket-files")
}

object Shibboleth {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ trait GoogleServicesDAO extends ReportsSubsystemStatus {
def publishMessages(fullyQualifiedTopic: String, messages: Seq[String]): Future[Unit]

def getBucket(bucketName: String, petKey: String): Option[Bucket]

/**
 * List the objects in a bucket.
 *
 * @param bucketName the bucket to list
 * @param prefix     optional object-name prefix used to filter the listing
 * @param recursive  whether objects in subdirectories of the prefix should be included
 * @return the names of the objects found
 */
def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName]
}
Original file line number Diff line number Diff line change
Expand Up @@ -400,4 +400,22 @@ class HttpGoogleServicesDAO(priceListUrl: String, defaultPriceList: GooglePriceL
getScopedServiceAccountCredentials(firecloudAdminSACreds, authScopes)
.refreshAccessToken()
.getTokenValue

/**
 * List the objects in a bucket, blocking until the complete listing is available.
 *
 * @param bucketName the bucket to list
 * @param prefix     optional object-name prefix; when absent, an empty prefix is used
 * @param recursive  passed through as the `isRecursive` flag of the underlying listing call
 * @return every object name produced by the listing, collected into a List
 */
override def listBucket(bucketName: GcsBucketName,
                        prefix: Option[String],
                        recursive: Boolean = true
): List[GcsObjectName] = {
  // a missing prefix is expressed to the storage service as the empty string
  val effectivePrefix = prefix.getOrElse("")

  // describe the listing; nothing runs until the effect is executed below.
  // listObjectsWithPrefix handles paginating through results if there are more results than
  // the `maxPageSize` setting.
  val listing = getStorageResource.use { storage =>
    val objectStream =
      storage.listObjectsWithPrefix(bucketName, effectivePrefix, maxPageSize = 2000, isRecursive = recursive)
    objectStream.compile.toList
  }

  // execute the listing synchronously and return its result
  listing.unsafeRunSync()
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package org.broadinstitute.dsde.firecloud.filematch

import com.typesafe.scalalogging.LazyLogging
import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult}
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy}

import java.nio.file.Path

/**
 * Given a list of files, pair those files based on their naming conventions.
 * At the time of writing, this involves recognizing Illumina single end and paired end read patterns
 * such as those defined at https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm
 *
 * In the future, we may support additional naming conventions
 */
class FileMatcher extends LazyLogging {

  // the list of recognition strategies to use; strategies are attempted in list order
  private val matchingStrategies: List[FileRecognitionStrategy] = List(new IlluminaPairedEndStrategy())

  /**
   * Given a list of files, pair up those files according to our known recognition strategies.
   * @param pathList the list of files to inspect
   * @return pairing results, sorted by each result's firstFile
   */
  def pairPaths(pathList: List[Path]): List[FileMatchResult] =
    performPairing(pathList)

  /**
   * Given a list of files, pair up those files according to our known recognition strategies.
   * @param fileList the list of files to inspect, as Strings
   * @return pairing results, sorted by each result's firstFile
   */
  def pairFiles(fileList: List[String]): List[FileMatchResult] = {
    // convert fileList to pathList
    val pathList = fileList.map(file => new java.io.File(file).toPath)
    pairPaths(pathList)
  }

  /**
   * Implementation for file pairing. This executes in three steps:
   *   1. Use our known file recognition strategies to identify all "read 1" files in the file list
   *   2. Search for all "read 2" files in the file list which match the previously-identified "read 1"s
   *   3. Handle the remaining files which are not recognized as either "read 1" or "read 2"
   *
   * @param pathList the list of files to inspect
   * @return pairing results, sorted by each result's firstFile
   */
  private def performPairing(pathList: List[Path]): List[FileMatchResult] = {
    // find every path in the incoming pathList that is recognized by one of our known patterns
    val desiredPairings: List[SuccessfulMatchResult] = findFirstFiles(pathList)

    // remove the recognized firstFiles from the outstanding pathList
    val remainingPaths: List[Path] = pathList diff desiredPairings.map(_.firstFile)

    // process the recognized "read 1" files, and look for their desired pairings in the outstanding pathList.
    // this will result in either SuccessfulMatchResult when the desired pairing is found, or PartialMatchResult
    // when the desired pairing is not found
    val pairingResults: List[FileMatchResult] = findSecondFiles(remainingPaths, desiredPairings)

    // remove the recognized "read 2" files from the outstanding pathList
    val pairedSecondFiles: List[Path] = pairingResults.collect { case s: SuccessfulMatchResult => s.secondFile }
    val unrecognizedPaths: List[Path] = remainingPaths diff pairedSecondFiles

    // translate the unrecognized paths into a FileMatchResult
    val unrecognizedResults: List[FailedMatchResult] = unrecognizedPaths.map(path => FailedMatchResult(path))

    // return results, sorted by firstFile
    (pairingResults ++ unrecognizedResults).sortBy(r => r.firstFile)
  }

  /**
   * find every path in the incoming pathList that is recognized as a "read 1" by our known patterns
   * @param pathList the list of files to inspect
   * @return one SuccessfulMatchResult per recognized "read 1" file, in pathList order; unrecognized
   *         files are omitted
   */
  private def findFirstFiles(pathList: List[Path]): List[SuccessfulMatchResult] =
    // run the strategies over each path, keeping only the successful hits. The explicit
    // `case` partial function makes the filtering visible instead of relying on a lambda
    // being silently converted into a PartialFunction.
    pathList.iterator
      .map(tryPairingStrategies)
      .collect { case success: SuccessfulMatchResult => success }
      .toList

  /**
   * find every path in the incoming pathList that is recognized as a "read 2" by our known patterns
   *
   * @param pathList        the list of files to inspect
   * @param desiredPairings the pairings whose "read 2" files we look for in the pathList
   * @return for each desired pairing, in order: the pairing itself if its secondFile is present in
   *         pathList, else its PartialMatchResult equivalent
   */
  private def findSecondFiles(pathList: List[Path],
                              desiredPairings: List[SuccessfulMatchResult]
  ): List[FileMatchResult] = {
    // index the candidate paths once so that each lookup is a hash lookup rather than a
    // linear scan of pathList per desired pairing
    val availablePaths: Set[Path] = pathList.toSet

    desiredPairings.map { desiredPairing =>
      // search for the desired pairing's secondFile in the set of actual files
      if (availablePaths.contains(desiredPairing.secondFile)) desiredPairing
      else desiredPairing.toPartial
    }
  }

  /**
   * Attempt all the configured file recognition strategies against the supplied file.
   *
   * @param file the file to try to recognize
   * @return SuccessfulMatchResult if the file is recognized; FailedMatchResult if not
   */
  private def tryPairingStrategies(file: Path): FileMatchResult =
    // does the current file hit on any of our file-matching patterns?
    // Walk the strategies lazily, so evaluation stops at the first successful match result.
    matchingStrategies.iterator
      .map(_.matchFirstFile(file))
      .collectFirst { case success: SuccessfulMatchResult => success }
      // the current file is not recognized by any strategy
      .getOrElse(FailedMatchResult(file))

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package org.broadinstitute.dsde.firecloud.filematch

import spray.json.DefaultJsonProtocol.jsonFormat4
import spray.json.RootJsonFormat
import spray.json.DefaultJsonProtocol._

/**
 * Options, supplied by end users in the request payload, that control file-matching.
 * Only `prefix` is mandatory; every other option defaults to `None`.
 *
 * @param prefix    bucket prefix in which to list files
 * @param read1Name name for the "read1" column
 * @param read2Name name for the "read2" column
 * @param recursive should bucket-listing be recursive?
 */
case class FileMatchingOptions(
  prefix: String,
  read1Name: Option[String] = None,
  read2Name: Option[String] = None,
  recursive: Option[Boolean] = None
)

/**
 * spray-json (de)serialization support for [[FileMatchingOptions]].
 */
object FileMatchingOptionsFormat {
  // four-field format built from the case class constructor
  implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] =
    jsonFormat4(FileMatchingOptions.apply)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import com.google.common.annotations.VisibleForTesting

import java.nio.file.Path

/**
 * FileMatchResult for a file that was not recognized by any known pattern.
 *
 * @param firstFile the unrecognized file
 */
case class FailedMatchResult(firstFile: Path) extends FileMatchResult

@VisibleForTesting
object FailedMatchResult {

  /**
   * Build a FailedMatchResult from a String path instead of a Path.
   *
   * @param firstFile the unrecognized file, as a String
   */
  def fromString(firstFile: String): FailedMatchResult = {
    val asPath = new java.io.File(firstFile).toPath
    FailedMatchResult(asPath)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import java.nio.file.Path

/**
 * Common supertype of the failed/partial/successful file-matching results.
 * Every result exposes the file it was computed for as `firstFile`.
 */
trait FileMatchResult { def firstFile: Path }
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import com.google.common.annotations.VisibleForTesting

import java.nio.file.Path

/**
 * FileMatchResult for a file that hit a known pattern, but whose paired file could not be found.
 *
 * @param firstFile the recognized file
 * @param id        the id associated with the match
 */
case class PartialMatchResult(firstFile: Path, id: String) extends FileMatchResult

@VisibleForTesting
object PartialMatchResult {

  /**
   * Build a PartialMatchResult from a String path instead of a Path.
   *
   * @param firstFile the recognized file, as a String
   * @param id        the id associated with the match
   */
  def fromStrings(firstFile: String, id: String): PartialMatchResult = {
    val asPath = new java.io.File(firstFile).toPath
    PartialMatchResult(asPath, id)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import com.google.common.annotations.VisibleForTesting

import java.nio.file.Path

/**
 * FileMatchResult for a file that successfully hit a known pattern.
 *
 * @param firstFile  the recognized file
 * @param secondFile the file that pairs with firstFile
 * @param id         the id associated with the match
 */
case class SuccessfulMatchResult(firstFile: Path, secondFile: Path, id: String) extends FileMatchResult {

  /** This result as a PartialMatchResult: firstFile and id are kept, secondFile is dropped. */
  def toPartial: PartialMatchResult =
    PartialMatchResult(firstFile, id)
}

@VisibleForTesting
object SuccessfulMatchResult {

  /**
   * Build a SuccessfulMatchResult from String paths instead of Paths.
   *
   * @param firstFile  the recognized file, as a String
   * @param secondFile the paired file, as a String
   * @param id         the id associated with the match
   */
  def fromStrings(firstFile: String, secondFile: String, id: String): SuccessfulMatchResult = {
    val first = new java.io.File(firstFile).toPath
    val second = new java.io.File(secondFile).toPath
    SuccessfulMatchResult(first, second, id)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package org.broadinstitute.dsde.firecloud.filematch.strategy

import org.broadinstitute.dsde.firecloud.filematch.result.FileMatchResult

import java.nio.file.Path

/**
 * A file-naming convention used for pairing matched reads.
 * Implementations inspect a single path and report the outcome as a FileMatchResult.
 */
trait FileRecognitionStrategy {
  /**
   * Attempt to recognize the supplied path under this naming convention.
   *
   * @param path the file to inspect
   * @return the outcome of the recognition attempt
   */
  def matchFirstFile(path: Path): FileMatchResult
}
Loading

0 comments on commit 611acf8

Please sign in to comment.