Skip to content

Commit

Permalink
Biostar398854
Browse files Browse the repository at this point in the history
  • Loading branch information
lindenb committed Apr 18, 2024
1 parent 6fe7338 commit c789c6a
Show file tree
Hide file tree
Showing 3 changed files with 102 additions and 101 deletions.
4 changes: 2 additions & 2 deletions docs/JvarkitCentral.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ JVARKIT
=======

Author : Pierre Lindenbaum Phd. Institut du Thorax. Nantes. France.
Version : 4f4bf4d7d
Compilation : 20240418092015
Version : 6fe7338c2
Compilation : 20240418142746
Github : https://github.com/lindenb/jvarkit
Issues : https://github.com/lindenb/jvarkit/issues

Expand Down
6 changes: 3 additions & 3 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@ JVARKIT
=======

Author : Pierre Lindenbaum Phd. Institut du Thorax. Nantes. France.
Version : 4f4bf4d7d
Compilation : 20240418092015
Version : 6fe7338c2
Compilation : 20240418142746
Github : https://github.com/lindenb/jvarkit
Issues : https://github.com/lindenb/jvarkit/issues

Expand Down Expand Up @@ -94,7 +94,7 @@ Please, read [how to run and install jvarkit](JvarkitCentral.md)
| [biostar332826](Biostar332826.md) | Fast Extraction of Variants from a list of IDs | 20180817 | 20210412 |
| [biostar336589](Biostar336589.md) | displays circular map as SVG from BED and REF file | 20180907 | 20210818 |
| [biostar352930](Biostar352930.md) | Fills the empty SEQ(*) and QUAL(*) in a bam file using the the reads with the same name carrying this information. | | |
| [biostar398854](Biostar398854.md) | Extract every CDS sequences from a VCF file | 20190916 | 20190916 |
| [biostar398854](Biostar398854.md) | Extract every CDS sequences from a VCF file | 20190916 | 20240418 |
| [biostar404363](Biostar404363.md) | introduce artificial mutation SNV in bam | 20191023 | 20191024 |
| [biostar480685](Biostar480685.md) | paired-end bam clip bases outside insert range | 20201223 | 20200220 |
| [biostar489074](Biostar489074.md) | call variants for every paired overlaping read | 20200205 | 20210412 |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,6 @@ of this software and associated documentation files (the "Software"), to deal
import htsjdk.samtools.reference.ReferenceSequence;
import htsjdk.samtools.reference.ReferenceSequenceFile;
import htsjdk.samtools.reference.ReferenceSequenceFileFactory;
import htsjdk.samtools.util.CloserUtil;
import htsjdk.samtools.util.SequenceUtil;
import htsjdk.variant.variantcontext.Allele;
import htsjdk.variant.variantcontext.Genotype;
Expand All @@ -71,7 +70,7 @@ of this software and associated documentation files (the "Software"), to deal
keywords= {"vcf","gtf"},
biostars=398854,
creationDate="20190916",
modificationDate="20190916",
modificationDate="20240418",
jvarkit_amalgamion = true,
menu="Biostars"
)
Expand All @@ -84,118 +83,120 @@ public class Biostar398854 extends Launcher {
@Parameter(names={"-R","--reference"},description=INDEXED_FASTA_REFERENCE_DESCRIPTION,required=true)
private Path faidx = null;

private ReferenceSequenceFile referenceSequenceFile = null;;

@Override
public int doWork(final List<String> args) {
PrintWriter out = null;
try {
this.referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx);
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(this.referenceSequenceFile);
try(ReferenceSequenceFile referenceSequenceFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(this.faidx)) {
final SAMSequenceDictionary dict = SequenceDictionaryUtils.extractRequired(referenceSequenceFile);


try(VCFReader in = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)),true)) {
out = super.openPathOrStdoutAsPrintWriter(this.outputFile);
final PrintWriter final_out= out;
final List<String> samples = in.getHeader().getSampleNamesInOrder();


try(GtfReader gtfReader= new GtfReader(this.gtfIn)) {
final SAMSequenceDictionary dict2 = in.getHeader().getSequenceDictionary();
if(dict2!=null) SequenceUtil.assertSequenceDictionariesEqual(dict, dict2);
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().
stream().
flatMap(G->G.getTranscripts().stream()).
filter(T->T.hasCDS()).
forEach(transcript->{
final List<VariantContext> variants1 = in.query(transcript).
stream().
filter(V->V.isVariant() && AcidNucleics.isATGCN(V.getReference()) && V.getAlternateAlleles().stream().anyMatch(A->AcidNucleics.isATGCN(A))).
collect(Collectors.toCollection(ArrayList::new));

if(variants1.isEmpty()) return;

final int positions1[]=transcript.getAllCds().
stream().
flatMapToInt(CDS->IntStream.rangeClosed(CDS.getStart(), CDS.getEnd())).
toArray();


final List<VariantContext> variants = variants1.
stream().
filter(V->{
int insert = Arrays.binarySearch(positions1, V.getStart());
return insert>=0 && insert < positions1.length;
}).
collect(Collectors.toCollection(ArrayList::new));
if(variants.isEmpty()) return;

final ReferenceSequence refSeq = this.referenceSequenceFile.getSubsequenceAt(transcript.getContig(),transcript.getStart(), transcript.getEnd());



for(int nSample=0;nSample<=/* yes <= */ samples.size();nSample++)
{
final String fastaName= transcript.getId()+" "+transcript.getGene().getId()+" "+transcript.getGene().getGeneName()+" "+(nSample< samples.size()?samples.get(nSample):"ALL")+
" " + transcript.getContig()+":"+transcript.getStart()+"-"+transcript.getEnd()+"("+transcript.getStrand()+")";
final StringBuilder sb = new StringBuilder();
int array_index=0;

while(array_index< positions1.length) {
final int x1 = positions1[array_index];
try(VCFReader in = VCFReaderFactory.makeDefault().open(Paths.get(oneAndOnlyOneFile(args)),true)) {
try(final PrintWriter out = super.openPathOrStdoutAsPrintWriter(this.outputFile)) {
final List<String> samples = in.getHeader().getSampleNamesInOrder();


try(GtfReader gtfReader= new GtfReader(this.gtfIn)) {
final SAMSequenceDictionary dict2 = in.getHeader().getSequenceDictionary();
if(dict2!=null) SequenceUtil.assertSequenceDictionariesEqual(dict, dict2);
gtfReader.setContigNameConverter(ContigNameConverter.fromOneDictionary(dict));
gtfReader.getAllGenes().
stream().
flatMap(G->G.getTranscripts().stream()).
filter(T->T.hasCDS()).
forEach(transcript->{
final List<VariantContext> variants1 = in.query(transcript).
stream().
filter(V->V.isVariant() && AcidNucleics.isATGCN(V.getReference()) && V.getAlternateAlleles().stream().anyMatch(A->AcidNucleics.isATGCN(A))).
collect(Collectors.toCollection(ArrayList::new));

final int refseqidx0 = x1-transcript.getStart();
if(variants1.isEmpty()) return;

char refbase = (refseqidx0<0 || refseqidx0>=refSeq.length()?'N':(char)refSeq.getBases()[refseqidx0]);
final int x1_final = x1;
String base= String.valueOf(refbase);
final VariantContext ctx=variants.stream().filter(V->V.getStart()==x1_final).findFirst().orElse(null);
Allele alt = ctx==null?null:ctx.getAlternateAlleles().stream().filter(A->AcidNucleics.isATGCN(A)).findFirst().orElse(null);
if(ctx!=null && nSample< samples.size()) {
final Genotype gt = ctx.getGenotype(nSample);
alt = gt.getAlleles().stream().filter(A->!A.isReference() &&! A.isNoCall() && AcidNucleics.isATGCN(A)).findFirst().orElse(null);
}
final int positions1[]=transcript.getAllCds().
stream().
flatMapToInt(CDS->IntStream.rangeClosed(CDS.getStart(), CDS.getEnd())).
toArray();


if(alt!=null)
final List<VariantContext> variants = variants1.
stream().
filter(V->{
int insert = Arrays.binarySearch(positions1, V.getStart());
return insert>=0 && insert < positions1.length;
}).
collect(Collectors.toCollection(ArrayList::new));
if(variants.isEmpty()) return;

final ReferenceSequence refSeq = referenceSequenceFile.getSubsequenceAt(transcript.getContig(),transcript.getStart(), transcript.getEnd());



for(int nSample=0;nSample<=/* yes <= last if for ALL samples */ samples.size();nSample++)
{
base = alt.getBaseString().toUpperCase();
int i=0;
while(i< ctx.getReference().length() && array_index<positions1.length)
{
array_index++;
i++;
final String fastaName= transcript.getId()+" "+transcript.getGene().getId()+" "+transcript.getGene().getGeneName()+" "+(nSample< samples.size()?samples.get(nSample):"ALL")+
" " + transcript.getContig()+":"+transcript.getStart()+"-"+transcript.getEnd()+"("+transcript.getStrand()+")";
final StringBuilder sb = new StringBuilder();
int array_index=0;

while(array_index< positions1.length) {
final int x1 = positions1[array_index];

final int refseqidx0 = x1-transcript.getStart();

final char refbase = (refseqidx0<0 || refseqidx0>=refSeq.length()?'N':(char)refSeq.getBases()[refseqidx0]);
final int x1_final = x1;
String base= String.valueOf(refbase);
final VariantContext ctx=variants.stream().filter(V->V.getStart()==x1_final).findFirst().orElse(null);
Allele alt = ctx==null?null:ctx.getAlternateAlleles().stream().
filter(A->AcidNucleics.isATGCN(A)).
findFirst().
orElse(null);
if(ctx!=null && nSample< samples.size()) {
final Genotype gt = ctx.getGenotype(nSample);
if(gt.hasAltAllele() && (!gt.hasDP() || gt.getDP()>0)) {
alt = gt.getAlleles().stream().
filter(A->!A.isReference() &&! A.isNoCall() && AcidNucleics.isATGCN(A)).
findFirst().
orElse(null);
}
}

if(alt!=null)
{
base = alt.getBaseString().toUpperCase();
int i=0;
while(i< ctx.getReference().length() && array_index<positions1.length)
{
array_index++;
i++;
}
}
else
{
base = base.toLowerCase();
array_index++;
}
sb.append(base);
}

final String fastaSeq = transcript.isNegativeStrand()?AcidNucleics.reverseComplement(sb.toString()):sb.toString();
out.print(">");
out.println(fastaName);
out.println(fastaSeq);
}
else
{
base = base.toLowerCase();
array_index++;
}
sb.append(base);
}

String fastaSeq = transcript.isNegativeStrand()?AcidNucleics.reverseComplement(sb.toString()):sb.toString();
final_out.print(">");
final_out.println(fastaName);
final_out.println(fastaSeq);
}
}
);
}
out.flush();
out.close();
out=null;
}
} // end for each transcripts
);
} // end gtf
out.flush();
} // end out
} // end vcf reader
} // end ref
return 0;
} catch (final Throwable e) {
LOG.error(e);
return -1;
}
finally
{
CloserUtil.close(out);
CloserUtil.close(this.referenceSequenceFile);
}
}

Expand Down

0 comments on commit c789c6a

Please sign in to comment.