Skip to content

Commit

Permalink
Merge pull request #302 from liulab-dfci/dev
Browse files Browse the repository at this point in the history
Fix an issue of regarding IGLLx, like IGLL1, as a constant gene from recent IMGT reference sequences
  • Loading branch information
mourisl authored Aug 12, 2024
2 parents 78314de + 37429ff commit 81c0811
Show file tree
Hide file tree
Showing 4 changed files with 72 additions and 27 deletions.
7 changes: 4 additions & 3 deletions BarcodeTranslator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,10 @@ class BarcodeTranslator {
khiter_t barcode_translate_table_iter =
kh_get(barcodeHash, barcode_translate_table_, bc_from.c_str());
if (barcode_translate_table_iter == kh_end(barcode_translate_table_)) {
std::cerr << "Barcode " << bc_from << " does not exist in the translation table."
<< std::endl;
exit(-1);
//std::cerr << "Barcode " << bc_from << " does not exist in the translation table."
// << std::endl;
//exit(-1);
return ret; // ret should be empty
}
std::string bc_to(
kh_value(barcode_translate_table_, barcode_translate_table_iter));
Expand Down
76 changes: 57 additions & 19 deletions FastqExtractor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ char usage[] = "./fastq-extractor [OPTIONS]:\n"
//"\t--umiEnd INT: the end position of barcode in the barcode sequence (default: length-1)\n"
//"\t--umiRevComp: whether the barcode need to be reverse complemented (default: not used)\n"
"\t--readFormat STRING: format for read, barcode and UMI files (example: r1:0:-1,r2:0:-1,bc:0:15,um:16:-1 for paired-end files with barcode and UMI)\n"
"\t--skipBarcodeErrorRead: do not output reads that failed barcode correction or barcode translation. (default: not set)\n"
;

static const char *short_options = "f:u:1:2:o:t:" ;
Expand All @@ -54,6 +55,7 @@ static struct option long_options[] = {
{ "umiRevComp", no_argument, 0, 10012},
{ "readFormat", required_argument, 0, 10013},
{ "barcodeTranslate", required_argument, 0, 10014},
{ "skipBarcodeErrorRead", no_argument, 0, 10015},
{ (char *)0, 0, 0, 0}
} ;

Expand Down Expand Up @@ -141,9 +143,10 @@ void OutputSeq( FILE *fp, const char *name, char *seq, char *qual, ReadFormatter
}

// Maybe barcode read quality could be useful in future.
void OutputBarcode( FILE *fp, const char *name, char *barcode, char *qual,
// return: 1. output. 0: should skip
int OutputBarcode( FILE *fp, const char *name, char *barcode, char *qual,
ReadFormatter &readFormatter, int readCategory, BarcodeCorrector *barcodeCorrector,
BarcodeTranslator *barcodeTranslator)
BarcodeTranslator *barcodeTranslator, bool skipError)
{
if (barcode && barcode[0] != '\0')
{
Expand All @@ -156,17 +159,37 @@ void OutputBarcode( FILE *fp, const char *name, char *barcode, char *qual,
if (barcodeTranslator)
{
std::string newbc = barcodeTranslator->Translate(bcBuffer, strlen(bcBuffer)) ;
if (newbc.empty())
{
if (skipError)
return 0 ;
else
{
fprintf(stderr, "Barcode %s does not exist in the translation table.\n", bcBuffer) ;
exit(-1) ;
}
}
fprintf( fp, ">%s\n%s\n", name, newbc.c_str()) ;

}
else
fprintf( fp, ">%s\n%s\n", name, bcBuffer ) ;
}
else
fprintf(fp, ">%s\nmissing_barcode\n", name) ;
{
if (skipError)
return 0 ;
else
fprintf(fp, ">%s\nmissing_barcode\n", name) ;
}
}
else
fprintf( fp, ">%s\nmissing_barcode\n", name ) ;
{
if (skipError)
return 0 ;
else
fprintf( fp, ">%s\nmissing_barcode\n", name ) ;
}
return 1 ;
}

void *ProcessReads_Thread( void *pArg )
Expand Down Expand Up @@ -266,8 +289,9 @@ int main( int argc, char *argv[] )
int umiStart = 0 ;
int umiEnd = -1 ;
bool umiRevComp = false ;
bool skipBarcodeErrorRead = false ;
ReadFormatter readFormatter ;

while ( 1 )
{
c = getopt_long( argc, argv, short_options, long_options, &option_index ) ;
Expand Down Expand Up @@ -365,6 +389,10 @@ int main( int argc, char *argv[] )
{
barcodeTranslator.SetTranslateTable(optarg) ;
}
else if (c == 10015) //skipBarcodeErrorRead
{
skipBarcodeErrorRead = true ;
}
else
{
fprintf( stderr, "Unknown parameter %s\n", optarg ) ;
Expand Down Expand Up @@ -473,17 +501,21 @@ int main( int argc, char *argv[] )
++goodCandidate ;
if ( goodCandidate )
{
OutputSeq( fp1, reads.id, reads.seq, reads.qual, readFormatter, FORMAT_READ1 ) ;
if ( hasMate )
OutputSeq( fp2, reads.id, mateReads.seq, mateReads.qual, readFormatter, FORMAT_READ2 ) ;
int barcodeNoError = 1 ;
if ( hasBarcode )
OutputBarcode( fpBc, reads.id, barcodeFile.seq, barcodeFile.qual,
barcodeNoError = OutputBarcode( fpBc, reads.id, barcodeFile.seq, barcodeFile.qual,
readFormatter, FORMAT_BARCODE,
hasBarcodeWhitelist ? &barcodeCorrector : NULL,
barcodeTranslator.isSet() ? &barcodeTranslator : NULL) ;
barcodeTranslator.isSet() ? &barcodeTranslator : NULL, skipBarcodeErrorRead) ;
if (barcodeNoError == 0)
continue ;

OutputSeq( fp1, reads.id, reads.seq, reads.qual, readFormatter, FORMAT_READ1 ) ;
if ( hasMate )
OutputSeq( fp2, reads.id, mateReads.seq, mateReads.qual, readFormatter, FORMAT_READ2 ) ;
if ( hasUmi )
OutputBarcode( fpUmi, reads.id, umiFile.seq, umiFile.qual,
readFormatter, FORMAT_UMI, NULL, NULL) ;
readFormatter, FORMAT_UMI, NULL, NULL, false) ;
}
}
}
Expand Down Expand Up @@ -578,18 +610,24 @@ int main( int argc, char *argv[] )
{
if ( readBatch[i].id[0] == '\0' )
continue ;
OutputSeq( fp1, readBatch[i].id, readBatch[i].seq, readBatch[i].qual, readFormatter, FORMAT_READ1 ) ;
if ( readBatch2 != NULL )
OutputSeq( fp2, readBatch[i].id, readBatch2[i].seq, readBatch2[i].qual, readFormatter, FORMAT_READ2) ;

int barcodeNoError = 1 ;
if ( hasBarcode )
OutputBarcode( fpBc, readBatch[i].id, barcodeBatch[i].seq, barcodeBatch[i].qual,
barcodeNoError = OutputBarcode(
fpBc, readBatch[i].id, barcodeBatch[i].seq, barcodeBatch[i].qual,
readFormatter, FORMAT_BARCODE,
hasBarcodeWhitelist ? &barcodeCorrector : NULL,
barcodeTranslator.isSet() ? &barcodeTranslator : NULL) ;

barcodeTranslator.isSet() ? &barcodeTranslator : NULL,
skipBarcodeErrorRead) ;
if (barcodeNoError == 0)
continue ;

OutputSeq( fp1, readBatch[i].id, readBatch[i].seq, readBatch[i].qual, readFormatter, FORMAT_READ1 ) ;
if ( readBatch2 != NULL )
OutputSeq( fp2, readBatch[i].id, readBatch2[i].seq, readBatch2[i].qual, readFormatter, FORMAT_READ2) ;
if ( hasUmi )
OutputBarcode( fpUmi, readBatch[i].id, umiBatch[i].seq, umiBatch[i].qual,
readFormatter, FORMAT_UMI, NULL, NULL) ;
readFormatter, FORMAT_UMI, NULL, NULL, false) ;
}
}

Expand Down
14 changes: 10 additions & 4 deletions SeqSet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -4622,13 +4622,19 @@ class SeqSet
switch ( name[3] )
{
case 'V': geneType = 0 ; break ;
case 'D':
case 'D':
if ( name[4] >= '0' && name[4] <= '9' )
geneType = 1 ;
else
geneType = 3 ;
break ;
case 'J': geneType = 2 ; break ;
case 'L':
if (GetChainType(name) == 2)
{
geneType = -1 ; // IGLL genes
break ;
}
default: geneType = 3 ; break ;
}
return geneType ;
Expand Down Expand Up @@ -5687,7 +5693,7 @@ class SeqSet
for (i = 0 ; i < overlapCnt ; ++i)
{
int geneType = GetGeneType( seqs[ overlaps[i].seqIdx ].name ) ;
if (geneCompared[geneType] == 1)
if (geneType < 0 || geneCompared[geneType] == 1)
continue ;

if (geneUsed[geneType] == -1)
Expand All @@ -5708,9 +5714,9 @@ class SeqSet

for ( i = 0 ; i < overlapCnt ; ++i )
{
// Remove the hits on the D gene
// Remove the hits on the D gene or other random genes (genetype==-1)
int geneType = GetGeneType( seqs[ overlaps[i].seqIdx ].name ) ;
if ( geneType == 1 )
if ( geneType < 0 || geneType == 1 )
continue ;

// Remove those overlaps that was secondary as well.
Expand Down
2 changes: 1 addition & 1 deletion run-trust4
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use Cwd qw(cwd abs_path) ;
use File::Basename ;
use File::Path qw(make_path) ;

my $version = "v1.1.3-r526" ;
my $version = "v1.1.4-r530" ;

die "TRUST4 $version usage: ./run-trust4 [OPTIONS]:\n".
"Required:\n".
Expand Down

0 comments on commit 81c0811

Please sign in to comment.