-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
CORE-123: new file-pairing API for data uploader (#1452)
- Loading branch information
Showing
20 changed files
with
779 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -94,3 +94,7 @@ googlecloud { | |
"153601": 0.045 | ||
} | ||
} | ||
|
||
firecloud { | ||
max-filematching-bucket-files = 25000 | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
124 changes: 124 additions & 0 deletions
124
src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatcher.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
package org.broadinstitute.dsde.firecloud.filematch | ||
|
||
import com.typesafe.scalalogging.LazyLogging | ||
import org.broadinstitute.dsde.firecloud.filematch.result.{FailedMatchResult, FileMatchResult, SuccessfulMatchResult} | ||
import org.broadinstitute.dsde.firecloud.filematch.strategy.{FileRecognitionStrategy, IlluminaPairedEndStrategy} | ||
|
||
import java.nio.file.Path | ||
|
||
/**
 * Given a list of files, pair those files based on their naming conventions.
 * At the time of writing, this involves recognizing Illumina single end and paired end read patterns
 * such as those defined at https://support.illumina.com/help/BaseSpace_Sequence_Hub_OLH_009008_2/Source/Informatics/BS/NamingConvention_FASTQ-files-swBS.htm
 *
 * Recognition is delegated to the strategies listed in `matchingStrategies`; supporting an additional
 * naming convention means adding another strategy to that list.
 */
class FileMatcher extends LazyLogging {

  // the recognition strategies to try, in order; the first strategy that recognizes a file wins
  private val matchingStrategies: List[FileRecognitionStrategy] = List(new IlluminaPairedEndStrategy())

  /**
   * Given a list of files, pair up those files according to our known recognition strategies.
   *
   * @param pathList the list of files to inspect
   * @return one result per recognized "read 1" file (successful or partial) plus one failed result per
   *         file that was neither a recognized "read 1" nor a matched "read 2", sorted by firstFile
   */
  def pairPaths(pathList: List[Path]): List[FileMatchResult] =
    performPairing(pathList)

  /**
   * Given a list of files, pair up those files according to our known recognition strategies.
   *
   * @param fileList the list of files to inspect, as Strings
   * @return pairing results, exactly as returned by [[pairPaths]]
   */
  def pairFiles(fileList: List[String]): List[FileMatchResult] = {
    // convert each String to a Path before delegating
    val pathList = fileList.map(file => new java.io.File(file).toPath)
    pairPaths(pathList)
  }

  /**
   * Implementation for file pairing. This executes in three steps:
   *   1. Use our known file recognition strategies to identify all "read 1" files in the file list
   *   2. Search for all "read 2" files in the file list which match the previously-identified "read 1"s
   *   3. Handle the remaining files which are not recognized as either "read 1" or "read 2"
   *
   * @param pathList the list of files to inspect
   * @return pairing results, sorted by firstFile
   */
  private def performPairing(pathList: List[Path]): List[FileMatchResult] = {
    // step 1: every recognized "read 1", together with the "read 2" path it would like to be paired with
    val desiredPairings: List[SuccessfulMatchResult] = findFirstFiles(pathList)

    // the recognized "read 1" files are accounted for; drop them from the outstanding paths
    val remainingPaths: List[Path] = pathList.diff(desiredPairings.map(_.firstFile))

    // step 2: keep a pairing as successful when its "read 2" is actually present in the outstanding
    // paths; downgrade it to a PartialMatchResult when it is not
    val pairingResults: List[FileMatchResult] = findSecondFiles(remainingPaths, desiredPairings)

    // the "read 2" files that were consumed by a successful pairing
    val pairedSecondFiles: List[Path] = pairingResults.collect { case s: SuccessfulMatchResult =>
      s.secondFile
    }

    // step 3: whatever is still outstanding was recognized as neither "read 1" nor "read 2"
    val unrecognizedResults: List[FailedMatchResult] =
      remainingPaths.diff(pairedSecondFiles).map(path => FailedMatchResult(path))

    // return results, sorted by firstFile
    (pairingResults ++ unrecognizedResults).sortBy(r => r.firstFile)
  }

  /**
   * Find every path in the incoming pathList that is recognized as a "read 1" by our known patterns.
   *
   * @param pathList the list of files to inspect
   * @return one SuccessfulMatchResult per recognized path, in pathList order; unrecognized paths are omitted
   */
  private def findFirstFiles(pathList: List[Path]): List[SuccessfulMatchResult] =
    pathList.map(tryPairingStrategies).collect { case success: SuccessfulMatchResult => success }

  /**
   * Check which of the desired pairings have their "read 2" file present in the supplied paths.
   *
   * @param pathList        the list of files in which to look for "read 2" files
   * @param desiredPairings the pairings whose secondFile should be looked for in the pathList
   * @return for each desired pairing, in order: the pairing itself if its secondFile is present in
   *         pathList, otherwise its PartialMatchResult equivalent
   */
  private def findSecondFiles(pathList: List[Path],
                              desiredPairings: List[SuccessfulMatchResult]
  ): List[FileMatchResult] = {
    // build the lookup once; a linear search per pairing is quadratic on large bucket listings
    val availablePaths: Set[Path] = pathList.toSet
    desiredPairings.map { desiredPairing =>
      if (availablePaths.contains(desiredPairing.secondFile)) desiredPairing
      else desiredPairing.toPartial
    }
  }

  /**
   * Attempt all the configured file recognition strategies against the supplied file.
   *
   * @param file the file to try to recognize
   * @return the SuccessfulMatchResult of the first strategy that recognizes the file;
   *         FailedMatchResult if no strategy does
   */
  private def tryPairingStrategies(file: Path): FileMatchResult =
    // iterate lazily so that strategies after the first hit are never evaluated
    matchingStrategies.iterator
      .map(strategy => strategy.matchFirstFile(file))
      .collectFirst { case success: SuccessfulMatchResult => success }
      .getOrElse(FailedMatchResult(file))

}
22 changes: 22 additions & 0 deletions
22
src/main/scala/org/broadinstitute/dsde/firecloud/filematch/FileMatchingOptions.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
package org.broadinstitute.dsde.firecloud.filematch | ||
|
||
import spray.json.DefaultJsonProtocol.jsonFormat4 | ||
import spray.json.RootJsonFormat | ||
import spray.json.DefaultJsonProtocol._ | ||
|
||
/**
 * Request payload, specified by end users, to control file-matching functionality.
 * Only `prefix` is required; every other field defaults to `None` when omitted.
 *
 * @param prefix    bucket prefix in which to list files
 * @param read1Name name for the "read1" column
 * @param read2Name name for the "read2" column
 * @param recursive should bucket-listing be recursive?
 */
case class FileMatchingOptions(
  prefix: String,
  read1Name: Option[String] = None,
  read2Name: Option[String] = None,
  recursive: Option[Boolean] = None
)
|
||
/**
 * spray-json (de)serialization support for [[FileMatchingOptions]].
 * Import `FileMatchingOptionsFormat._` to bring the implicit format into scope.
 */
object FileMatchingOptionsFormat {
  // four-field case class, so the four-argument format builder is used
  implicit val fileMatchingOptionsFormat: RootJsonFormat[FileMatchingOptions] =
    jsonFormat4(FileMatchingOptions.apply)
}
16 changes: 16 additions & 0 deletions
16
src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FailedMatchResult.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
package org.broadinstitute.dsde.firecloud.filematch.result | ||
|
||
import com.google.common.annotations.VisibleForTesting | ||
|
||
import java.nio.file.Path | ||
|
||
/**
 * FileMatchResult for a file that did not hit on any known pattern.
 *
 * @param firstFile the file that was not recognized
 */
case class FailedMatchResult(firstFile: Path) extends FileMatchResult
|
||
@VisibleForTesting
object FailedMatchResult {

  /**
   * Convenience constructor taking the file as a String instead of a Path.
   *
   * @param firstFile the unrecognized file, as a String
   * @return a FailedMatchResult wrapping the equivalent Path
   */
  def fromString(firstFile: String): FailedMatchResult = {
    val firstPath = new java.io.File(firstFile).toPath
    FailedMatchResult(firstPath)
  }
}
10 changes: 10 additions & 0 deletions
10
src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/FileMatchResult.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package org.broadinstitute.dsde.firecloud.filematch.result | ||
|
||
import java.nio.file.Path | ||
|
||
/**
 * Common supertype for failed/partial/successful file-matching results.
 * Every result, whatever its outcome, identifies the file it was computed for.
 */
trait FileMatchResult {

  /** The file this result describes. */
  def firstFile: Path
}
16 changes: 16 additions & 0 deletions
16
src/main/scala/org/broadinstitute/dsde/firecloud/filematch/result/PartialMatchResult.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,16 @@ | ||
package org.broadinstitute.dsde.firecloud.filematch.result | ||
|
||
import com.google.common.annotations.VisibleForTesting | ||
|
||
import java.nio.file.Path | ||
|
||
/**
 * FileMatchResult for a file that successfully hit a known pattern, but for which no paired file
 * could be found.
 *
 * @param firstFile the file that was recognized
 * @param id        the id associated with the recognized file
 */
case class PartialMatchResult(firstFile: Path, id: String) extends FileMatchResult
|
||
@VisibleForTesting
object PartialMatchResult {

  /**
   * Convenience constructor taking the file as a String instead of a Path.
   *
   * @param firstFile the recognized file, as a String
   * @param id        the id associated with the recognized file
   * @return a PartialMatchResult wrapping the equivalent Path
   */
  def fromStrings(firstFile: String, id: String): PartialMatchResult = {
    val firstPath = new java.io.File(firstFile).toPath
    PartialMatchResult(firstPath, id)
  }
}
19 changes: 19 additions & 0 deletions
19
...main/scala/org/broadinstitute/dsde/firecloud/filematch/result/SuccessfulMatchResult.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
package org.broadinstitute.dsde.firecloud.filematch.result | ||
|
||
import com.google.common.annotations.VisibleForTesting | ||
|
||
import java.nio.file.Path | ||
|
||
/**
 * FileMatchResult for a file that successfully hit a known pattern.
 *
 * @param firstFile  the file that was recognized
 * @param secondFile the file that `firstFile` is paired with
 * @param id         the id associated with the pairing
 */
case class SuccessfulMatchResult(firstFile: Path, secondFile: Path, id: String) extends FileMatchResult {

  /** Converts this result to a [[PartialMatchResult]], keeping `firstFile` and `id` and dropping `secondFile`. */
  def toPartial: PartialMatchResult =
    PartialMatchResult(firstFile = firstFile, id = id)
}
|
||
@VisibleForTesting
object SuccessfulMatchResult {

  /**
   * Convenience constructor taking both files as Strings instead of Paths.
   *
   * @param firstFile  the recognized file, as a String
   * @param secondFile the paired file, as a String
   * @param id         the id associated with the pairing
   * @return a SuccessfulMatchResult wrapping the equivalent Paths
   */
  def fromStrings(firstFile: String, secondFile: String, id: String): SuccessfulMatchResult = {
    val firstPath = new java.io.File(firstFile).toPath
    val secondPath = new java.io.File(secondFile).toPath
    SuccessfulMatchResult(firstPath, secondPath, id)
  }
}
14 changes: 14 additions & 0 deletions
14
.../scala/org/broadinstitute/dsde/firecloud/filematch/strategy/FileRecognitionStrategy.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
package org.broadinstitute.dsde.firecloud.filematch.strategy | ||
|
||
import org.broadinstitute.dsde.firecloud.filematch.result.FileMatchResult | ||
|
||
import java.nio.file.Path | ||
|
||
/**
 * A file-naming convention used for pairing matched reads.
 * Each implementation knows how to inspect a single path and report what it recognized.
 */
trait FileRecognitionStrategy {

  /**
   * Inspect a single path against this strategy's naming convention.
   *
   * @param path the file to inspect
   * @return the FileMatchResult describing whether, and how, the path was recognized
   */
  def matchFirstFile(path: Path): FileMatchResult

}
Oops, something went wrong.