Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CORE-123: new file-pairing API for data uploader #1452

Merged
merged 23 commits into from
Nov 5, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions src/main/resources/reference.conf
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,7 @@ googlecloud {
"153601": 0.045
}
}

# Settings for the file-matching ("paired TSV") feature.
# `max-filematching-bucket-files` is read by FireCloudConfig as `maxFileMatchingFileCount`.
# NOTE(review): presumably the largest bucket listing the feature will accept; where the
# limit is enforced is not visible here - confirm.
firecloud {
max-filematching-bucket-files = 25000
}
71 changes: 71 additions & 0 deletions src/main/resources/swagger/api-docs.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3688,6 +3688,55 @@ paths:
description: Internal Server Error
content: {}
x-passthrough: false
/api/workspaces/{workspaceNamespace}/{workspaceName}/entities/{entityType}/paired-tsv:
post:
tags:
- Entities
summary: |
Download a TSV of files in the workspace's bucket, paired by naming convention
description: |
Lists the files in the workspace's bucket, filtered to a specified prefix. Then,
attempts to pair those files to each other based on well-known naming conventions.
Downloads a TSV containing the result of those pairings.
operationId: bucketPairedTSV
parameters:
- $ref: '#/components/parameters/workspaceNamespaceParam'
- $ref: '#/components/parameters/workspaceNameParam'
- $ref: '#/components/parameters/entityTypeParam'
requestBody:
content:
'application/json':
schema:
$ref: '#/components/schemas/FileMatchingOptions'
examples:
minimally-required:
value:
prefix: my-bucket-prefix
disable-recursion:
value:
prefix: my-bucket-prefix/
recursive: false
rename-columns:
value:
prefix: my-bucket-prefix
read1Name: my-column-name-one
read2Name: my-column-name-two
required: true
responses:
200:
description: URL to saved file
content:
text/plain:
schema:
type: string
format: binary
404:
description: Workspace or entity type does not exist
content: {}
500:
description: Internal Server Error
content: {}
x-passthrough: false
/api/workspaces/{workspaceNamespace}/{workspaceName}/entityQuery/{entityType}:
get:
tags:
Expand Down Expand Up @@ -7473,6 +7522,28 @@ components:
ExtendedEnabled:
allOf:
- $ref: '#/components/schemas/Enabled'
FileMatchingOptions:
type: object
required:
- prefix
properties:
prefix:
type: string
description: |
Bucket prefix in which to look. If `recursive` is false, this must include a trailing
slash when specifying a subdirectory.
read1Name:
type: string
description: column name to use for the primary "read 1" file
default: read1
read2Name:
type: string
description: column name to use for the matching "read 2" file
default: read2
recursive:
type: boolean
description: whether to list files in subdirectories of the prefix
default: true
FireCloudPermission:
required:
- role
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,7 @@ object FireCloudConfig {
lazy val supportDomain = firecloud.getString("supportDomain")
lazy val supportPrefix = firecloud.getString("supportPrefix")
lazy val userAdminAccount = firecloud.getString("userAdminAccount")
// Integer value of the `max-filematching-bucket-files` config key.
// NOTE(review): presumably an upper bound on how many bucket files the file-matching
// feature will process; the enforcing code is not visible here - confirm.
lazy val maxFileMatchingFileCount = firecloud.getInt("max-filematching-bucket-files")
}

object Shibboleth {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,4 +40,6 @@ trait GoogleServicesDAO extends ReportsSubsystemStatus {
def publishMessages(fullyQualifiedTopic: String, messages: Seq[String]): Future[Unit]

def getBucket(bucketName: String, petKey: String): Option[Bucket]

/**
 * Lists the objects in a bucket.
 *
 * @param bucketName the bucket to list
 * @param prefix optional object-name prefix used to filter the listing
 * @param recursive whether the listing should be recursive
 * @return the names of the objects found
 */
def listBucket(bucketName: GcsBucketName, prefix: Option[String], recursive: Boolean): List[GcsObjectName]
}
Original file line number Diff line number Diff line change
Expand Up @@ -400,4 +400,22 @@ class HttpGoogleServicesDAO(priceListUrl: String, defaultPriceList: GooglePriceL
getScopedServiceAccountCredentials(firecloudAdminSACreds, authScopes)
.refreshAccessToken()
.getTokenValue

/**
 * Lists the objects in a bucket whose names start with the supplied prefix.
 *
 * @param bucketName the bucket to list
 * @param prefix     object-name prefix to filter by; `None` is treated as the empty prefix
 * @param recursive  passed to the storage service as `isRecursive`
 * @return the names of all objects returned by the storage service
 */
override def listBucket(bucketName: GcsBucketName,
                        prefix: Option[String],
                        recursive: Boolean = true
): List[GcsObjectName] = {
  // listObjectsWithPrefix handles paginating through results if there are more results than
  // the `maxPageSize` setting.
  val listAttempt = getStorageResource.use { storageService =>
    storageService
      .listObjectsWithPrefix(bucketName, prefix.getOrElse(""), maxPageSize = 2000, isRecursive = recursive)
      .compile
      .toList
  }

  // run the listing synchronously and return the collected object names
  listAttempt.unsafeRunSync()
}

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package org.broadinstitute.dsde.firecloud.filematch

import com.typesafe.scalalogging.LazyLogging
import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult}
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy}

import java.nio.file.Path

/**
* Given a list of files, pair those files based on their naming conventions.
* At the time of writing, this involves recognizing Illumina single end and paired end read patterns
* such as those defined at https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm
*
* In the future, we may support additional naming conventions
*/
class FileMatcher extends LazyLogging {

  // the list of recognition strategies to use; strategies are tried in list order
  private val matchingStrategies: List[FileRecognitionStrategy] = List(new IlluminaPairedEndStrategy())

  /**
   * Given a list of files, pair up those files according to our known recognition strategies.
   *
   * @param pathList the list of files to inspect
   * @return pairing results, sorted by each result's `firstFile`
   */
  def pairPaths(pathList: List[Path]): List[FileMatchResult] =
    performPairing(pathList)

  /**
   * Given a list of files, pair up those files according to our known recognition strategies.
   *
   * @param fileList the list of files to inspect, as Strings
   * @return pairing results, sorted by each result's `firstFile`
   */
  def pairFiles(fileList: List[String]): List[FileMatchResult] = {
    // convert fileList to pathList
    val pathList = fileList.map(file => new java.io.File(file).toPath)
    pairPaths(pathList)
  }

  /**
   * Implementation for file pairing. This executes in three steps:
   *   1. Use our known file recognition strategies to identify all "read 1" files in the file list
   *   2. Search for all "read 2" files in the file list which match the previously-identified "read 1"s
   *   3. Handle the remaining files which are not recognized as either "read 1" or "read 2"
   *
   * @param pathList the list of files to inspect
   * @return pairing results, sorted by each result's `firstFile`
   */
  private def performPairing(pathList: List[Path]): List[FileMatchResult] = {
    // find every path in the incoming pathList that is recognized by one of our known patterns.
    // at this point each result's secondFile is only the *desired* pairing; it has not yet been
    // checked against the files that actually exist.
    val desiredPairings: List[SuccessfulMatchResult] = findFirstFiles(pathList)

    // remove the recognized firstFiles from the outstanding pathList
    val remainingPaths: List[Path] = pathList diff desiredPairings.map(_.firstFile)

    // process the recognized "read 1" files, and look for their desired pairings in the outstanding pathList.
    // this will result in either SuccessfulMatchResult when the desired pairing is found, or PartialMatchResult
    // when the desired pairing is not found
    val pairingResults: List[FileMatchResult] = findSecondFiles(remainingPaths, desiredPairings)

    // remove the recognized "read 2" files from the outstanding pathList
    val pairedSecondFiles: List[Path] = pairingResults.collect { case s: SuccessfulMatchResult => s.secondFile }
    val unrecognizedPaths: List[Path] = remainingPaths diff pairedSecondFiles

    // translate the unrecognized paths into a FileMatchResult
    val unrecognizedResults: List[FailedMatchResult] = unrecognizedPaths.map(path => FailedMatchResult(path))

    // return results, sorted by firstFile
    (pairingResults ++ unrecognizedResults).sortBy(r => r.firstFile)
  }

  /**
   * Find every path in the incoming pathList that is recognized as a "read 1" by our known patterns.
   *
   * @param pathList the list of files to inspect
   * @return one SuccessfulMatchResult per recognized "read 1" file, in pathList order
   */
  private def findFirstFiles(pathList: List[Path]): List[SuccessfulMatchResult] =
    pathList
      .map(tryPairingStrategies)
      .collect { case success: SuccessfulMatchResult => success }

  /**
   * For each desired pairing, check whether its "read 2" file actually exists in the pathList.
   *
   * @param pathList the list of files to inspect
   * @param desiredPairings the recognized "read 1" results whose "read 2" files should be looked up
   * @return the original SuccessfulMatchResult when the "read 2" file is present in pathList;
   *         a PartialMatchResult when it is not
   */
  private def findSecondFiles(pathList: List[Path],
                              desiredPairings: List[SuccessfulMatchResult]
  ): List[FileMatchResult] = {
    // build the lookup once so each desired pairing is a constant-time membership check,
    // instead of a linear scan of pathList per pairing
    val availablePaths: Set[Path] = pathList.toSet

    desiredPairings.map { desiredPairing =>
      if (availablePaths.contains(desiredPairing.secondFile)) desiredPairing
      else desiredPairing.toPartial
    }
  }

  /**
   * Attempt all the configured file recognition strategies against the supplied file.
   *
   * @param file the file to try to recognize
   * @return the first SuccessfulMatchResult produced by a strategy, if the file is recognized;
   *         FailedMatchResult if no strategy recognizes it
   */
  private def tryPairingStrategies(file: Path): FileMatchResult =
    // the iterator keeps evaluation lazy: strategies after the first hit are never invoked
    matchingStrategies.iterator
      .map(strategy => strategy.matchFirstFile(file))
      .collectFirst { case success: SuccessfulMatchResult => success }
      .getOrElse(FailedMatchResult(file))

}
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
package org.broadinstitute.dsde.firecloud.filematch

import spray.json.DefaultJsonProtocol.jsonFormat4
import spray.json.RootJsonFormat
import spray.json.DefaultJsonProtocol._

/**
 * Request payload, specified by end users, to control file-matching functionality.
 *
 * Only `prefix` is required; every other field is optional and is None when omitted.
 * NOTE(review): fallback values for omitted fields are presumably applied by the code that
 * consumes this class, which is not visible here - confirm.
 *
 * @param prefix bucket prefix in which to list files
 * @param read1Name name for the "read1" column
 * @param read2Name name for the "read2" column
 * @param recursive should bucket-listing be recursive?
 */
case class FileMatchingOptions(prefix: String,
read1Name: Option[String] = None,
read2Name: Option[String] = None,
recursive: Option[Boolean] = None
)

/**
 * spray-json (de)serialization support for [[FileMatchingOptions]].
 */
object FileMatchingOptionsFormat {
  // jsonFormat4 derives the format from the case class's four constructor fields
  implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] =
    jsonFormat4(FileMatchingOptions.apply)
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import com.google.common.annotations.VisibleForTesting

import java.nio.file.Path

/**
 * A [[FileMatchResult]] for a file that did not hit on any known naming pattern.
 *
 * @param firstFile the file that was not recognized
 */
case class FailedMatchResult(firstFile: Path) extends FileMatchResult

@VisibleForTesting
object FailedMatchResult {

  /**
   * Convenience constructor taking the file as a String instead of a Path.
   *
   * @param firstFile the unrecognized file, as a String
   */
  def fromString(firstFile: String): FailedMatchResult = {
    val firstPath = new java.io.File(firstFile).toPath
    FailedMatchResult(firstPath)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import java.nio.file.Path

/**
 * Common supertype for failed/partial/successful file-matching results.
 * Every result exposes the file it describes through `firstFile`.
 */
trait FileMatchResult {
// the file this result was computed for
def firstFile: Path
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import com.google.common.annotations.VisibleForTesting

import java.nio.file.Path

/**
 * A [[FileMatchResult]] for a file that successfully hit a known pattern, but for which no
 * paired file could be found.
 *
 * @param firstFile the recognized file
 * @param id        identifier carried alongside the recognized file
 */
case class PartialMatchResult(firstFile: Path, id: String) extends FileMatchResult

@VisibleForTesting
object PartialMatchResult {

  /**
   * Convenience constructor taking the file as a String instead of a Path.
   *
   * @param firstFile the recognized file, as a String
   * @param id        identifier to carry on the result
   */
  def fromStrings(firstFile: String, id: String): PartialMatchResult = {
    val firstPath = new java.io.File(firstFile).toPath
    PartialMatchResult(firstPath, id)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
package org.broadinstitute.dsde.firecloud.filematch.result

import com.google.common.annotations.VisibleForTesting

import java.nio.file.Path

/**
 * A [[FileMatchResult]] for a file that successfully hit a known pattern.
 *
 * @param firstFile  the recognized file
 * @param secondFile the file paired with `firstFile`
 * @param id         identifier carried alongside the pairing
 */
case class SuccessfulMatchResult(firstFile: Path, secondFile: Path, id: String) extends FileMatchResult {

  /** Converts this result to a [[PartialMatchResult]], keeping `firstFile` and `id` and dropping `secondFile`. */
  def toPartial: PartialMatchResult = PartialMatchResult(firstFile = firstFile, id = id)
}

@VisibleForTesting
object SuccessfulMatchResult {

  /**
   * Convenience constructor taking both files as Strings instead of Paths.
   *
   * @param firstFile  the recognized file, as a String
   * @param secondFile the paired file, as a String
   * @param id         identifier to carry on the result
   */
  def fromStrings(firstFile: String, secondFile: String, id: String): SuccessfulMatchResult = {
    val firstPath = new java.io.File(firstFile).toPath
    val secondPath = new java.io.File(secondFile).toPath
    SuccessfulMatchResult(firstPath, secondPath, id)
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
package org.broadinstitute.dsde.firecloud.filematch.strategy

import org.broadinstitute.dsde.firecloud.filematch.result.FileMatchResult

import java.nio.file.Path

/**
 * A file-naming convention used for pairing matched reads.
 * FileMatcher holds a list of these strategies and tries each one against every file.
 */
trait FileRecognitionStrategy {

// Attempt to recognize `path` as a "read 1" file under this naming convention.
// FileMatcher treats a SuccessfulMatchResult return value as a hit and any other
// FileMatchResult as a miss.
// NOTE(review): what a strategy returns on a miss is not visible here - confirm.
def matchFirstFile(path: Path): FileMatchResult

}
Loading
Loading