diff --git a/README.md b/README.md index 4b5855a..7d1cc1b 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # BBMap/BBTools -(Not Offical) BBMap short read aligner for DNA/RNAseq, and other bioinformatic tools. + +(Not Official) BBMap short read aligner for DNA/RNAseq, and other bioinformatic tools. +BBTools bioinformatics tools, including BBMap. + +I have moved those dozens of shell scripts from root to `./sh/` to make it tidy. * [SEQanswers Page](http://seqanswers.com/forums/showthread.php?t=41057) * [SourceForge Page](https://sourceforge.net/projects/bbmap/) diff --git a/build.xml b/build.xml new file mode 100755 index 0000000..48c29db --- /dev/null +++ b/build.xml @@ -0,0 +1,43 @@ + + + Brian Bushnell's tools! + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/current/align2/AbstractIndex.java b/current/align2/AbstractIndex.java new file mode 100755 index 0000000..e63f81d --- /dev/null +++ b/current/align2/AbstractIndex.java @@ -0,0 +1,226 @@ +package align2; + +import java.util.ArrayList; + +import stream.SiteScore; + +/** + * @author Brian Bushnell + * @date Oct 15, 2013 + * + */ +public abstract class AbstractIndex { + + AbstractIndex(int keylen, int kfilter, int pointsMatch, int minChrom_, int maxChrom_, MSA msa_){ + KEYLEN=keylen; + KEYSPACE=1<<(2*KEYLEN); + BASE_KEY_HIT_SCORE=pointsMatch*KEYLEN; + KFILTER=kfilter; + msa=msa_; + + minChrom=minChrom_; + maxChrom=maxChrom_; + assert(minChrom==MINCHROM); + assert(maxChrom==MAXCHROM); + assert(minChrom<=maxChrom); + } + + final int count(int key){ +// assert(false); + if(COUNTS!=null){return COUNTS[key];} //TODO: Benchmark speed and memory usage with counts=null. Probably only works for single-block genomes. +// assert(false); + final Block b=index[0]; + final int rkey=KeyRing.reverseComplementKey(key, KEYLEN, COLORSPACE); + int a=b.length(key); + return key==rkey ? 
a : a+b.length(rkey); + } + + static final boolean overlap(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=b1 && b2>=a1; + } + + /** Is (a1, b1) within (a2, b2) ? */ + static final boolean isWithin(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a1>=a2 && b1<=b2; + } + + + /** Generates a term that increases score with how far apart the two farthest perfect matches are. + * Assumes that the centerIndex corresponds to the leftmost perfect match. */ + final int scoreY(int[] locs, int centerIndex, int offsets[]){ + int center=locs[centerIndex]; +// int rightIndex=centerIndex; +// for(int i=centerIndex; i findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id); + + long callsToScore=0; + long callsToExtendScore=0; + long initialKeys=0; + long initialKeyIterations=0; + long initialKeys2=0; + long initialKeyIterations2=0; + long usedKeys=0; + long usedKeyIterations=0; + + static final int HIT_HIST_LEN=40; + final long[] hist_hits=new long[HIT_HIST_LEN+1]; + final long[] hist_hits_score=new long[HIT_HIST_LEN+1]; + final long[] hist_hits_extend=new long[HIT_HIST_LEN+1]; + + final int minChrom; + final int maxChrom; + + static int MINCHROM=1; + static int MAXCHROM=Integer.MAX_VALUE; + + static final boolean SUBSUME_SAME_START_SITES=true; //Not recommended if slow alignment is disabled. + static final boolean SUBSUME_SAME_STOP_SITES=true; //Not recommended if slow alignment is disabled. + + /** + * True: Slightly slower.
+ * False: Faster, but may mask detection of some ambiguously mapping reads. + */ + static final boolean LIMIT_SUBSUMPTION_LENGTH_TO_2X=true; + + /** Not recommended if slow alignment is disabled. Can conceal sites that should be marked as ambiguous. */ + static final boolean SUBSUME_OVERLAPPING_SITES=false; + + static final boolean SHRINK_BEFORE_WALK=true; + + /** More accurate but uses chromosome arrays while mapping */ + static final boolean USE_EXTENDED_SCORE=true; //Calculate score more slowly by extending keys + + /** Even more accurate but even slower than normal extended score calculation. + * Scores are compatible with slow-aligned scores. */ + static final boolean USE_AFFINE_SCORE=true && USE_EXTENDED_SCORE; //Calculate score even more slowly + + + public static final boolean RETAIN_BEST_SCORES=true; + public static final boolean RETAIN_BEST_QCUTOFF=true; + + public static boolean QUIT_AFTER_TWO_PERFECTS=true; + static final boolean DYNAMICALLY_TRIM_LOW_SCORES=true; + + + static final boolean REMOVE_CLUMPY=true; //Remove keys like AAAAAA or GCGCGC that self-overlap and thus occur in clumps + + + /** If no hits are found, search again with slower parameters (less of genome excluded) */ + static final boolean DOUBLE_SEARCH_NO_HIT=false; + /** Only this fraction of the originally removed genome fraction (FRACTION_GENOME_TO_EXCLUDE) + * is removed for the second pass */ + static final float DOUBLE_SEARCH_THRESH_MULT=0.25f; //Must be less than 1. + + static boolean PERFECTMODE=false; + static boolean SEMIPERFECTMODE=false; + static final boolean REMOVE_FREQUENT_GENOME_FRACTION=true; //Default true; false is more accurate + + /** Ignore longest site list(s) when doing a slow walk. */ + static final boolean TRIM_LONG_HIT_LISTS=false; //Increases speed with tiny loss of accuracy. 
Default: true for clean or synthetic, false for noisy real data + + + public static final boolean TRIM_BY_GREEDY=true; //default: true + + public static int MIN_APPROX_HITS_TO_KEEP=1; //Default 2 for skimmer, 1 otherwise, min 1; lower is more accurate + + + public static final boolean TRIM_BY_TOTAL_SITE_COUNT=false; //default: false + /** Length histogram index of maximum average hit list length to use. + * The max number of sites to search is calculated by (#keys)*(lengthHistogram[chrom][MAX_AVERAGE_SITES_TO_SEARCH]). + * Then, while the actual number of sites exceeds this, the longest hit list should be removed. + */ + + static int MAX_USABLE_LENGTH=Integer.MAX_VALUE; + static int MAX_USABLE_LENGTH2=Integer.MAX_VALUE; + + + public static void clear(){ + index=null; + lengthHistogram=null; + COUNTS=null; + } + + static Block[] index; + static int[] lengthHistogram=null; + static int[] COUNTS=null; + + final int KEYLEN; //default 12, suggested 10 ~ 13, max 15; bigger is faster but uses more RAM + final int KEYSPACE; + /** Site must have at least this many contiguous matches */ + final int KFILTER; + final MSA msa; + final int BASE_KEY_HIT_SCORE; + + + boolean verbose=false; + static boolean verbose2=false; + + + static int NUM_CHROM_BITS=3; + static int CHROMS_PER_BLOCK=(1<<(NUM_CHROM_BITS)); + + static final int MINGAP=Shared.MINGAP; + static final int MINGAP2=(MINGAP+128); //Depends on read length... + + static boolean COLORSPACE=false; + static boolean USE_CAMELWALK=false; + + static final boolean ADD_LIST_SIZE_BONUS=false; + static final byte[] LIST_SIZE_BONUS=new byte[100]; + + public static boolean GENERATE_KEY_SCORES_FROM_QUALITY=true; //True: Much faster and more accurate. + public static boolean GENERATE_BASE_SCORES_FROM_QUALITY=true; //True: Faster, and at least as accurate. 
+ + static final int calcListSizeBonus(int[] array){ + if(array==null || array.length>LIST_SIZE_BONUS.length-1){return 0;} + return LIST_SIZE_BONUS[array.length]; + } + + static final int calcListSizeBonus(int size){ + if(size>LIST_SIZE_BONUS.length-1){return 0;} + return LIST_SIZE_BONUS[size]; + } + + static{ + final int len=LIST_SIZE_BONUS.length; +// for(int i=1; i0; + + MIN_APPROX_HITS_TO_KEEP=MIN_APPROX_HITS_TO_KEEP_; + USE_EXTENDED_SCORE=USE_EXTENDED_SCORE_; + BASE_HIT_SCORE=BASE_HIT_SCORE_; + BASE_KEY_HIT_SCORE=BASE_HIT_SCORE*keylen_; + USE_AFFINE_SCORE=USE_AFFINE_SCORE_; + EXPECTED_LEN_LIMIT=(ALIGN_COLUMNS()*17)/20-(2*(SLOW_ALIGN_PADDING+10)); //TODO: Due to some bug in expected length calculation, this is low. + MAX_INDEL=MAX_INDEL_; + ALIGN_COLUMNS=ALIGN_COLUMNS(); + + /* ------------ */ + + + KEYLEN=keylen_; + keyDensity=keyDensity_; + maxKeyDensity=maxKeyDensity_; + minKeyDensity=minKeyDensity_; + maxDesiredKeys=maxDesiredKeys_; + + MINIMUM_ALIGNMENT_SCORE_RATIO=MINIMUM_ALIGNMENT_SCORE_RATIO_; + MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED=Tools.max(MINIMUM_ALIGNMENT_SCORE_RATIO*.80f, 1-((1-MINIMUM_ALIGNMENT_SCORE_RATIO)*1.4f)); + MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE=Tools.max(MINIMUM_ALIGNMENT_SCORE_RATIO*.60f, 1-((1-MINIMUM_ALIGNMENT_SCORE_RATIO)*1.8f)); +// TRIM_LIST=TRIM_LIST_; + MAKE_MATCH_STRING=(MAKE_MATCH_STRING_ || STRICT_MAX_INDEL_); + assert(SLOW_ALIGN_PADDING>=0); + + DONT_OUTPUT_UNMAPPED_READS=DONT_OUTPUT_UNMAPPED_READS_; + DONT_OUTPUT_BLACKLISTED_READS=DONT_OUTPUT_BLACKLISTED_READS_; + MAX_SITESCORES_TO_PRINT=MAX_SITESCORES_TO_PRINT_; + PRINT_SECONDARY_ALIGNMENTS=PRINT_SECONDARY_ALIGNMENTS_; + QUICK_MATCH_STRINGS=((QUICK_MATCH_STRINGS_ || STRICT_MAX_INDEL_) && MAKE_MATCH_STRING); + + RCOMP_MATE=RCOMP_MATE_; + PERFECTMODE=PERFECTMODE_; + SEMIPERFECTMODE=SEMIPERFECTMODE_; + FORBID_SELF_MAPPING=FORBID_SELF_MAPPING_; + assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO"; + +// TIP_DELETION_SEARCH_RANGE=TIP_DELETION_SEARCH_RANGE_; 
+// FIND_TIP_DELETIONS=TIP_DELETION_SEARCH_RANGE>0; +// EXPECTED_LEN_LIMIT=(ALIGN_COLUMNS*17)/20-(2*(SLOW_ALIGN_PADDING+10)); //TODO: Due to some bug in expected length calculation, this is low. + MSA_TYPE=MSA_TYPE_; + EXTRA_PADDING=(BANDWIDTH<1 && (MSA.bandwidthRatio<=0 || MSA.bandwidthRatio>=0.2f) ? + EXTRA_PADDING : Tools.min(EXTRA_PADDING, Tools.max(BANDWIDTH/4, (int)(MSA.bandwidthRatio*60)))); + + if(SLOW_ALIGN || MAKE_MATCH_STRING){ + msa=MSA.makeMSA(ALIGN_ROWS(), ALIGN_COLUMNS(), colorspace, MSA_TYPE); + POINTS_MATCH=msa.POINTS_MATCH(); + POINTS_MATCH2=msa.POINTS_MATCH2(); +// CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2); +// CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2); +// CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2); +// CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2); +// CLEARZONE3=(int)(CLEARZONE_RATIO3*POINTS_MATCH2); + CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1; + }else{ + POINTS_MATCH=70; + POINTS_MATCH2=100; + msa=null; +// CLEARZONE1=0; +// CLEARZONE1b=0; +// CLEARZONE1c=0; +// CLEARZONEP=0; +// CLEARZONE3=0; + CLEARZONE1e=0; + } + +// CLEARZONE1b_CUTOFF_FLAT=CLEARZONE1b_CUTOFF_FLAT_RATIO*POINTS_MATCH2; +// CLEARZONE1c_CUTOFF_FLAT=CLEARZONE1c_CUTOFF_FLAT_RATIO*POINTS_MATCH2; +// INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3); + + if(translateToBaseSpace){ + tcr=new TranslateColorspaceRead(MSA.makeMSA(ALIGN_ROWS(), ALIGN_COLUMNS()+500, false, MSA_TYPE)); + if(msa!=null){assert(msa.colorspace);} + }else{ + tcr=null; + } + +// index=new BBIndex(KEYLEN, minChrom, maxChrom, KFILTER, msa); + GENERATE_KEY_SCORES_FROM_QUALITY=AbstractIndex.GENERATE_KEY_SCORES_FROM_QUALITY; + readstats=(ReadStats.COLLECT_MATCH_STATS || ReadStats.COLLECT_QUALITY_STATS || ReadStats.COLLECT_INSERT_STATS ? 
new ReadStats() : null); + + + } + + public abstract int ALIGN_COLUMNS(); + public abstract int ALIGN_ROWS(); + abstract int CLEARZONE1(); + + abstract AbstractIndex index(); + + + @Override + public final void run() { + //System.err.println("Waiting on a list... (initial)"); + + ListNum ln=cris.nextList(); + ArrayList readlist=ln.list; + +// long count=System.currentTimeMillis(); +// String os=System.getProperty("os.name"); +// int procs=Runtime.getRuntime().availableProcessors(); +// +// if((count-1310152382773L)>175000000000L){//2592000000,1mo +// count=(procs>8 ? 1 : 2)+((hashCode()&0xFFFFFFF)%5); +// } + final boolean black=(Blacklist.hasBlacklist()); + final boolean MAKE_QUALITY_HISTOGRAM=(readstats==null ? false : ReadStats.COLLECT_QUALITY_STATS); + final boolean MAKE_MATCH_HISTOGRAM=(readstats==null ? false : ReadStats.COLLECT_MATCH_STATS); + final boolean MAKE_INSERT_HISTOGRAM=(readstats==null ? false : ReadStats.COLLECT_INSERT_STATS); + + if(SKIP_INITIAL>0){ + while(!readlist.isEmpty()){ + + if(readlist.get(readlist.size()-1).numericID(1), black, ln.id); + + cris.returnList(ln, readlist.isEmpty()); +// if(count>0){ +// cris.returnList(ln, readlist.isEmpty()); +// count--; +// } + + //System.err.println("Waiting on a list..."); + ln=cris.nextList(); + readlist=ln.list; + } + } + + while(!readlist.isEmpty()){ + + //System.err.println("Got a list of size "+readlist.size()); + for(int i=0; i0){ + SiteScore ss=r.topSite(); + r.start=ss.start; + r.stop=ss.stop; + r.chrom=ss.chrom; + r.setStrand(ss.strand); + r.mapScore=ss.score; + Read rt=tcr.translateToBasespace(r); + if(rt!=null){ + readlist.set(i, rt); + } + } + } + +// System.err.println("Returning a list..."+"\n"+readlist); + + writeList(readlist, black, ln.id); + + + //System.err.println("Left from adding list "+readlist.get(0).numericID); + + cris.returnList(ln, readlist.isEmpty()); +// if(count>0){ +// cris.returnList(ln, readlist.isEmpty()); +// count--; +// } + //System.err.println("Waiting on a 
list..."); + ln=cris.nextList(); + readlist=ln.list; + } + + + + //System.err.println("Returning a list... (final)"); + assert(readlist.isEmpty()); + cris.returnList(ln, readlist.isEmpty()); + finish(); + } + + private final void writeList(ArrayList readlist, boolean black, long listNumID){ + if(outStreamMapped!=null){ + ArrayList x=new ArrayList(readlist.size()); + for(Read r1 : readlist){ + if(r1!=null){ + Read r2=r1.mate; + if(r1.mapped() || (r2!=null && r2.mapped())){ + if(!black || !Blacklist.inBlacklist(r1)){x.add(r1);} + } + } + } + outStreamMapped.add(x, listNumID); + } + + if(outStreamBlack!=null){ + ArrayList x=new ArrayList(readlist.size()); + for(Read r1 : readlist){ + if(black && Blacklist.inBlacklist(r1)){x.add(r1);} + } + outStreamBlack.add(x, listNumID); + } + + if(BBSplitter.streamTable!=null || BBSplitter.TRACK_SET_STATS || BBSplitter.TRACK_SCAF_STATS){ + BBSplitter.printReads(readlist, listNumID, null, CLEARZONE1()); + } + + if(outStreamUnmapped!=null){ + ArrayList x=new ArrayList(readlist.size()); + for(Read r1 : readlist){ + if(r1!=null){ + Read r2=r1.mate; + if(!(r1.mapped() || (r2!=null && r2.mapped()))){ + x.add(r1); + } + } + } + outStreamUnmapped.add(x, listNumID); + } + +// System.err.println("outputStream = "+outputStream==null ? "null" : "real"); + if(outStream!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. + if(DONT_OUTPUT_UNMAPPED_READS){removeUnmapped(readlist);} + if(black && DONT_OUTPUT_BLACKLISTED_READS){removeBlacklisted(readlist);} + for(Read r : readlist){ + if(r!=null){ + r.obj=null; + assert(r.bases!=null); + if(r.sites!=null && r.sites.isEmpty()){r.sites=null;} + } + } +// System.err.println("Adding list of length "+readlist.size()); + outStream.add(readlist, listNumID); + } + } + + /** Returns max possible quick score for this read, or -1 if it cannot be mapped for quality reasons. + * A positive score will be returned if it CAN be mapped, but no hits are found. 
*/ + public final int quickMap(final Read r, final byte[] basesM){ + final AbstractIndex index=index(); + byte[] basesP=r.bases; + basesAtQuickmap+=basesP.length; + if(basesP.length=KEYLEN); + + if(PERFECTMODE || SEMIPERFECTMODE){//Imperfect reads cannot map perfectly. + if(r.containsUndefined()){return-1;} + }else if(DISCARD_MOSTLY_UNDEFINED_READS){ + int n=r.countUndefined(); + if(n>25 && basesP.length-n=200){ + keyDen3=maxKeyDensity-0.5f; + }else{ + keyDen3=maxKeyDensity-0.003333333333f*(basesP.length-50); //0.003333... = 0.5/150 + } + + keyDen3=Tools.max(keyDensity, keyDen3); + + if(GENERATE_KEY_SCORES_FROM_QUALITY){ + QualityTools.makeKeyProbs(r.quality, KEYLEN, keyProbs); + + boolean offsetsMode3=true; + if(offsetsMode3){ + offsets=KeyRing.makeOffsets3(keyProbs, r.bases.length, KEYLEN, keyDen2, keyDen3, 2, (PERFECTMODE || SEMIPERFECTMODE)); + }else{ + //Old version; usually worse. + offsets=KeyRing.makeOffsets2(keyProbs, r.bases.length, KEYLEN, keyDen2, keyDen3, 2); + int numKeys=(offsets==null ? 0 : offsets.length+1); + int maxRounds=0;//(PERFECTMODE || SEMIPERFECTMODE) ? 
0 : 9999;//(numKeys)/2; + while(maxRounds>0 && offsets!=null && offsets.length0.50f){return -1;} //Default .5f; higher gives more false positives, lower gives more false negatives + if(verbose){System.err.println("Prob all errors = "+probAllErrors+"\n\n");} + }else{ + Arrays.fill(keyScoresP, BASE_KEY_HIT_SCORE); + } + if(verbose){System.err.println("Made key scores: "+Arrays.toString(keyScoresP));} + + keysUsed+=offsets.length; + int maxScore=index.maxScore(offsets, baseScoresP, keyScoresP, basesP.length, true); + if(verbose){System.err.println("Max Score: "+maxScore);} + assert(maxScore>0); + + ArrayList list=index.findAdvanced(basesP, basesM, r.quality, baseScoresP, keyScoresP, offsets, r.numericID); + if(verbose){System.err.println("list: "+list);} + + r.sites=list; + removeOutOfBounds(r, DONT_OUTPUT_UNMAPPED_READS, (outStream!=null && outStream.SAM), EXPECTED_LEN_LIMIT); + assert(Read.CHECKSITES(list, r.bases, basesM, r.numericID)); + if(FORBID_SELF_MAPPING){forbidSelfMapping(list, r.originalSite);} + + if(list==null || list.isEmpty()){ + r.sites=null; + }else{ + r.sites=list; + if(!SLOW_ALIGN && AbstractIndex.USE_AFFINE_SCORE){ + for(SiteScore ss : list){ss.slowScore=ss.quickScore;} + } + } +// assert(r.list!=null); //Less efficient, but easier to code later. + + return maxScore; + } + + + /** + * Returns number of scores of at least maxImperfectSwScore. + * If problems are encountered such that it is prudent to do slow-alignment, a number lower than 1 will be returned. 
+ */ + final int scoreNoIndels(final Read r, final byte[] basesP, final byte[] basesM, final int maxSwScore, final int maxImperfectSwScore){ + + if(!SLOW_ALIGN || r.numSites()==0){return 0;} + + int numPerfectScores=0; + int numNearPerfectScores=0; + int bestScoreNoIndel=Integer.MIN_VALUE; + + boolean forceSlow=false; + + for(int j=0; j=maxImperfectSwScore && ss.stop-ss.start+1!=bases.length){ + int slowScoreNoIndel2=msa.scoreNoIndels(bases, cha.array, ss.stop-bases.length+1, null); + if(slowScoreNoIndel2>=maxImperfectSwScore){ + slowScoreNoIndel=slowScoreNoIndel2; + ss.start=ss.stop-bases.length+1; + ss.setPerfect(bases); + } + } + + ss.slowScore=slowScoreNoIndel; + ss.score=slowScoreNoIndel; + + if(slowScoreNoIndel>=maxImperfectSwScore){ + if(verbose){System.err.print("C3");} + numNearPerfectScores++; + + ss.stop=ss.start+bases.length-1; + ss.gaps=null; + if(slowScoreNoIndel>=maxSwScore){ + if(verbose){System.err.print("C4");} + assert(slowScoreNoIndel==maxSwScore) : slowScoreNoIndel+">"+maxSwScore; + numPerfectScores++; + ss.perfect=ss.semiperfect=true; + }else{ + if(verbose){System.err.print("C5");} + assert(!ss.perfect); + ss.setPerfect(bases); + assert(!ss.perfect); + } + if(QUICK_MATCH_STRINGS && !ss.perfect && (PRINT_SECONDARY_ALIGNMENTS || slowScoreNoIndel>=bestScoreNoIndel)){ + ss.match=msa.genMatchNoIndels(bases, cha.array, ss.start); + } + }else if(oldScore>=maxImperfectSwScore){ + if(verbose){System.err.print("C6");} + forceSlow=true; + } + } + + if(verbose){System.err.print("\nto "+ss+"\n");} + + bestScoreNoIndel=Tools.max(ss.slowScore, bestScoreNoIndel); +// assert(CHECKSITE(ss, bases)); + } + return (forceSlow ? 
-numNearPerfectScores : numNearPerfectScores); + } + +// @Deprecated +// /** Assumes list is sorted */ +// public final void genMatchString_old(final Read r, final byte[] basesP, final byte[] basesM, final int maxImperfectSwScore, final int maxSwScore, boolean setSSScore, final boolean recur){ +// assert(Read.CHECKSITES(r, basesM)); +// assert(checkTopSite(r)); +// assert(r.mate!=null || r.list==null || r.list.size()==0 || r.list.get(0).score==r.mapScore) : "\n"+r.toText(false)+"\n"; //Came from BBMapAcc; not sure if it is correct +// assert(msa!=null); +// if(r.list==null || r.list.isEmpty()){ +// r.chrom=-1; +// assert(r.mate!=null || r.list==null || r.list.size()==0 || r.list.get(0).score==r.mapScore) : "\n"+r.toText(false)+"\n"; +// return; +// } +// +// final SiteScore ss=r.list.get(0); +// assert(r.start==ss.start); +// assert(r.stop==ss.stop); +// assert(r.chrom==ss.chrom); +// assert(r.strand()==ss.strand); +// +// final int minMsaLimit; +// if(PAIRED){ +//// minMsaLimit=-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore); +// minMsaLimit=-1+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore); +//// minMsaLimit=0; +// }else{ +// minMsaLimit=-1+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore); +//// minMsaLimit=0; +// } +// +// if(GEN_MATCH_FAST){ +//// r.start=ss.start; +//// r.stop=ss.stop; +//// r.chrom=ss.chrom; +//// r.strand=ss.strand; +// +// assert(!(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE) || AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY || +// (ss.slowScore==maxSwScore) == r.perfect()) : +// r.bases.length+", "+ss.toText()+", "+maxSwScore+", "+ss.slowScore+", "+r.perfect(); +// +// //TODO: This WAS disabled because I saw a read marked perfect with a sub in it, probably with quality 0 at that point. +// if((SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE) && r.perfect()){ +// assert(r.stop-r.start==(r.bases.length-1)); +// r.match=ss.match=makePerfectMatchString(r.bases.length); +// assert(ss.isPerfect(ss.plus() ? 
basesP : basesM)) : r; //TODO: Slow assertion +// assert(Read.CHECKSITES(r, basesM)); +// assert(checkTopSite(r)); // TODO remove this +// }else +// { +// int oldScore=ss.slowScore; +// assert(r.start==ss.start && r.stop==ss.stop); +// assert(r.gaps==null || r.gaps[0]==r.start && r.gaps[r.gaps.length-1]==r.stop); +// int padding=(ss.perfect || ss.semiperfect ? 0 : Tools.max(SLOW_ALIGN_PADDING, 6)); +// +// if(verbose){System.err.println("Attempting to realign read:\n"+r+"\npadding="+padding+"\nrescued="+r.rescued());} +// +// TranslateColorspaceRead.realign_new(r, msa, padding, true, minMsaLimit, MAX_INDEL<1); //Also generates the match string +// r.gaps=ss.gaps=GapTools.fixGaps(r.start, r.stop, r.gaps, Shared.MINGAP); +// +// if(verbose){System.err.println("Realigned read:\n"+r+"\npadding="+padding+"\nrescued="+r.rescued());} +// assert(Read.CHECKSITES(r, basesM)); //***123 +// assert(ss==r.list.get(0)) : "Site order changed"; +// +// if(r.mapScore0 ? 80 : 20)+SLOW_ALIGN_PADDING; +// int expectedLen=GapTools.calcGrefLen(r.start, r.stop, r.gaps); //TODO Gaps should be correct here!!! +// int remaining=(msa.maxColumns-expectedLen-2); +// extra=Tools.max(0, Tools.min(remaining/2, extra)); +// TranslateColorspaceRead.realign_new(r, msa, extra, true, minMsaLimit, false); +// r.gaps=ss.gaps=GapTools.fixGaps(r.start, r.stop, r.gaps, Shared.MINGAP); +// assert(Read.CHECKSITES(r, basesM)); +// +// if(verbose){ +// System.err.println(" -> "+r.start+","+r.stop+","+r.mapScore+ +// (r.originalSite==null ? 
"" : "\t*"+r.originalSite)+"\t(extra = "+extra+")"); +// } +// } +// if(verbose){System.err.println("---- B ----");} +// assert(ss==r.list.get(0)) : "Site order changed"; +// ss.match=r.match; +// +// //TODO: This is new, make sure it does not break anything (Note: It did, but should be fixed now) +// assert(Read.CHECKSITES(r, basesM)); +// { +// ss.slowScore=r.mapScore; +// if(setSSScore){ss.score=r.mapScore;} +// assert(r.mate!=null || r.list==null || r.list.size()==0 || r.list.get(0).score==r.mapScore) : "\n"+r.toText(false)+"\n"; +// if(ss.start!=r.start || ss.stop!=r.stop){ +// if(verbose){ +// System.err.println("---- C ----"); +// System.err.println(ss); +// System.err.println(r.list.get(0)); +// System.err.println(r.start+","+r.stop+","+r.mapScore); +// } +// ss.start=r.start; +// ss.stop=r.stop; +// ss.match=r.match; +// if(!AMBIGUOUS_RANDOM || !r.ambiguous()){ +// if(verbose){ +// System.err.println("---- D ----"); +// System.err.println(ss); +// System.err.println(r.list.get(0)); +// System.err.println(r.start+","+r.stop+","+r.mapScore); +// } +// assert(ss==r.list.get(0)) : "Site order changed\n"+ss+"\n"+r.list.get(0)+"\n"; assert(checkTopSite(r)); // TODO remove this +// if(!r.paired()){ +// Tools.mergeDuplicateSites(r.list, false, false); +// Collections.sort(r.list); +// final SiteScore ss2=r.list.get(0); +// if(ss!=ss2){//Fixes a super rare case +// ss.setPerfect(ss.plus() ? basesP : basesM, false); +//// System.err.println("**********************\n\nCalled setPerfect on "+ss+"\tp="+ss.perfect+", sp="+ss.semiperfect); +//// assert(Read.CHECKSITE(ss, ss.plus() ? basesP : basesM, r.numericID)); +//// System.err.println("INDEX = "+r.list.indexOf(ss)); +//// ss2.setPerfect(r.bases, false); +//// assert(Read.CHECKSITE(ss2, ss2.plus() ? basesP : basesM, r.numericID)); +// r.setFromSite(ss2); +//// System.err.println("**********************\n\nCalled setPerfect on "+ss+"\tp="+ss.perfect+", sp="+ss.semiperfect); +//// assert(Read.CHECKSITE(ss, ss.plus() ? 
basesP : basesM, r.numericID)); +//// assert(Read.CHECKSITES(r, basesM)); +// genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, setSSScore, recur); +//// r.setPerfectFlag(maxSwScore); +// assert(checkTopSite(r));//124 +// assert(Read.CHECKSITES(r, basesM));//124 +// return; +// } +// }else{ +// for(int i=r.list.size()-1; i>0; i--){ +// if(ss.positionalMatch(r.list.get(i), true)){r.list.remove(i);} +// } +// } +// } +// assert(ss==r.list.get(0)) : "Site order changed\n"+ss+"\n"+r.list.get(0)+"\n"; +// assert(checkTopSite(r)); // TODO remove this +// } +// assert(ss==r.list.get(0)) : "Site order changed\n"+ss+"\n"+r.list.get(0)+"\n"; +// assert(checkTopSite(r)); // TODO remove this +// if(verbose){ +// System.err.println("---- D2 ----"); +// System.err.println(ss); +// System.err.println(r.list.get(0)); +// System.err.println(r.start+","+r.stop+","+r.mapScore); +// } +// } +// assert(ss==r.list.get(0)) : "Site order changed"; +// assert(checkTopSite(r)); // TODO remove this +// if(verbose){ +// System.err.println("---- D3 ----"); +// System.err.println(ss); +// System.err.println(r.list.get(0)); +// System.err.println(r.start+","+r.stop+","+r.mapScore); +// System.err.println("Checking perfect status: r.perfect="+r.perfect()+", ss.perfect="+ss.perfect+", ss.semi="+ss.semiperfect+ +// ", maxSwScore="+maxSwScore+", r.score="+r.mapScore+", ss.slowScore="+ss.slowScore+"\n"+r); +// } +// r.setPerfectFlag(maxSwScore); +// if(verbose){ +// System.err.println("---- E ----"); +// System.err.println("Checking perfect status: r.perfect="+r.perfect()+", ss.perfect="+ss.perfect+", ss.semi="+ss.semiperfect+ +// ", maxSwScore="+maxSwScore+", r.score="+r.mapScore+", ss.slowScore="+ss.slowScore+"\n"+r); +// } +// assert(r.match==ss.match) : r.perfect()+", "+ss.perfect+", "+ss.semiperfect+"\n"+new String(r.match)+"\n"+new String(ss.match); +// if(r.perfect()){ +// ss.perfect=ss.semiperfect=true; +// assert(Read.CHECKSITES(r, basesM)) : r.perfect()+", "+ss.perfect+", 
"+ss.semiperfect+"\n"+new String(r.match)+"\n"+new String(ss.match); //***123 +// }else{ +// final byte[] bases=(ss.plus() ? basesP : basesM); +//// if(r.match!=null && r.containsNonNM()){ +//// ss.perfect=ss.semiperfect=false; //This should be fine, but failed when a match string contained X. +// if(r.match!=null && r.containsSDI()){ +// ss.perfect=ss.semiperfect=false; +//// ss.setPerfect(bases, false); +//// r.setPerfect(ss.perfect); +// assert(Read.CHECKSITES(r, basesM)) : r.perfect()+", "+ss.perfect+", "+ss.semiperfect+"\n"+new String(r.match)+"\n"+new String(ss.match); //***123 +// }else{ +// //rare +// ss.setPerfect(bases, false); +// r.setPerfect(ss.perfect); +// assert(Read.CHECKSITES(r, basesM)) : r.perfect()+", "+ss.perfect+", "+ss.semiperfect+"\n"+new String(r.match)+"\n"+new String(ss.match); //***123 +// } +// } +//// assert(Read.CHECKSITES(r, basesM)) : r.perfect()+", "+ss.perfect+", "+ss.semiperfect; //***123 +// assert(checkTopSite(r)); // TODO remove this +// assert(r.perfect()==ss.perfect); +// assert(!r.perfect() || r.stop-r.start==(r.bases.length-1)); +// if(verbose){ +// System.err.println("Checking perfect status: r.perfect="+r.perfect()+", ss.perfect="+ss.perfect+", ss.semi="+ss.semiperfect+ +// ", maxSwScore="+maxSwScore+", r.score="+r.mapScore+", ss.slowScore="+ss.slowScore+"\n"+r); +// } +// } +// }else{ +// if(verbose){System.err.println("---- F ----");} +// byte[] bases=(ss.plus() ? 
basesP : basesM); +// // int[] swscoreArray=msa.fillAndScore(bases, ss, 0); +// +// if(r.perfect()){ +// r.match=makePerfectMatchString(r.bases.length); +// }else{ +// ChromosomeArray cha=Data.getChromosome(ss.chrom); +// assert(false) : "TODO: This does not take strand into account"; +// if(ss.slowScore>=maxImperfectSwScore){ +// //TODO +// } +// +// if(msa!=null){ +// assert(false) : "0 is not good here; try a non-indel match string."; +// int[] max=msa.fillLimited(bases, cha.array, ss.start, ss.stop, 0, ss.gaps); +// // System.err.print("*"); +// r.match=msa.traceback(bases, cha.array, ss.start, ss.stop, max[0], max[1], max[2], ss.gaps!=null); +// } +// } +// } +// if(verbose){System.err.println("---- G ----");} +// +// assert(Read.CHECKSITES(r, basesM)); //***123 +// assert(checkTopSite(r)); // TODO remove this +// if((!AMBIGUOUS_RANDOM || !r.ambiguous()) && recur && r.list.get(0)!=ss){ +// r.setFromTopSite(); +// genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, setSSScore, recur); +// assert(checkTopSite(r)); // TODO remove this +// }else{ +// +// //Corrects a mysterious bug encountered with paired reads, in which semiperfect reads are not flagged semiperfect. +// //TODO: Find out reason for this and correct it, then disable this block. +// if(verbose){ +// System.err.println("Checking perfect status: r.perfect="+r.perfect()+", ss.perfect="+ss.perfect+", ss.semi="+ss.semiperfect+ +// ", maxSwScore="+maxSwScore+", r.score="+r.mapScore+", ss.slowScore="+ss.slowScore+"\n"+r); +// } +// assert(Read.CHECKSITES(r, basesM));//***123 +// assert(checkTopSite(r)); // TODO remove this +// if(!r.perfect()){ +// if(verbose){System.err.println("Correcting perfect status");} +// if(r.mate!=null && r.list!=null && r.list.size()>0){ +// SiteScore ss2=r.list.get(0); +// if(verbose){System.err.println("Checking perfect status2: ss2.perfect="+ss2.perfect+", ss2.semi="+ss2.semiperfect+"\nss="+ss+"\nss2="+ss2);} +// byte[] bases=(ss2.plus() ? 
basesP : basesM); +// ss2.setPerfect(bases, false); +// r.setPerfect(ss2.perfect); +// if(verbose){System.err.println("New perfect status: r.perfect="+r.perfect()+", ss2.perfect="+ss2.perfect+", ss2.semi="+ss2.semiperfect);} +// assert(Read.CHECKSITE(ss2, bases, r.numericID)); +// assert(checkTopSite(r)); // TODO remove this +// } +// } +// assert(Read.CHECKSITES(r, basesM)); +// assert(checkTopSite(r)); // TODO remove this +// } +// assert(checkTopSite(r)); // TODO remove this +// } + + /** Assumes list is sorted */ + public final void genMatchString(final Read r, final byte[] basesP, final byte[] basesM, final int maxImperfectSwScore, final int maxSwScore, boolean setSSScore, final boolean recur){ + if(verbose){System.err.println("\n\n\n\n\ngenMatchString for read\n"+r+"\n\n\n\n\n");} + assert(Read.CHECKSITES(r, basesM)); + assert(checkTopSite(r)); + + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; //Came from BBMapAcc; not sure if it is correct + assert(msa!=null); + if(r.numSites()==0){ + r.chrom=-1; + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + return; + } + + if(PRINT_SECONDARY_ALIGNMENTS){ + capSiteList(r, MAX_SITESCORES_TO_PRINT+3, PRINT_SECONDARY_ALIGNMENTS); + } + + if(QUICK_MATCH_STRINGS && PRINT_SECONDARY_ALIGNMENTS && USE_SS_MATCH_FOR_PRIMARY){} //TODO What was this line for? 
+ + int best=Integer.MIN_VALUE; + int scoreChanged=0; + + for(int i=0; i0){ + if(best>=ss.slowScore && !PRINT_SECONDARY_ALIGNMENTS){ + if(verbose){System.err.println("break triggered by low score");} + break; + } + } + + int oldScore=ss.slowScore; + if(ss.match==null || (i==0 && !USE_SS_MATCH_FOR_PRIMARY)){ + genMatchStringForSite(r.numericID, ss, basesP, basesM, maxImperfectSwScore, maxSwScore, r.mate); + if(setSSScore){ss.score=ss.slowScore;} + } + if(i>0 && ss.match==null && !r.paired()){r.sites.remove(i);} + else{ + if(oldScore!=ss.slowScore){scoreChanged++;} + best=Tools.max(ss.slowScore, best); + } + + if(verbose){System.err.println("**************** best="+best+", scoreChanged="+scoreChanged+"\nconsidered ss "+ss);} + } + + if(verbose){System.err.println("Finished basic match generation. best="+best+", scoreChanged="+scoreChanged+", AMBIGUOUS_RANDOM="+AMBIGUOUS_RANDOM+", ambiguous="+r.ambiguous());} + if(scoreChanged>0 && (!AMBIGUOUS_RANDOM || !r.ambiguous())){ + if(!r.paired()){ + if(verbose){System.err.println("GMS 1");} + Tools.mergeDuplicateSites(r.sites, false, false); + Collections.sort(r.sites); + int prevScore=0; + for(int i=0; i0 && ss.match==null){r.sites.remove(i);} + i--; + } + if(i>0 || !PRINT_SECONDARY_ALIGNMENTS){ + if(verbose){System.err.println("GMS 4");} + break; + } + } + }else{ + if(verbose){System.err.println("GMS 5");} + SiteScore ss=r.topSite(); + for(int i=r.sites.size()-1; i>0; i--){ + if(verbose){System.err.println("GMS 6");} + if(ss.positionalMatch(r.sites.get(i), true)){r.sites.remove(i);} + } + } + } + + + final SiteScore ss=r.topSite(); + assert(ss==r.topSite()); + +// assert(ss.slowScore>0) : ss.slowScore+", "+best+", "+r.mapScore; + + r.start=ss.start; + r.stop=ss.stop; + r.chrom=ss.chrom; + r.setStrand(ss.strand); + r.match=ss.match; + r.gaps=ss.gaps; + r.mapScore=ss.slowScore; + r.setPerfect(ss.perfect()); + r.setRescued(ss.rescued()); + + assert(Read.CHECKSITES(r, basesM)); + assert(checkTopSite(r)); + +// assert(false) : 
r.numericID+", "+ss.slowScore+", "+r.mapScore; + } + + + protected final int genMatchStringForSite(final long id, final SiteScore ss, final byte[] basesP, final byte[] basesM, final int maxImperfectSwScore, final int maxSwScore, final Read mate){ + final byte[] bases=ss.plus() ? basesP : basesM; + assert(Read.CHECKSITE(ss, bases, id)); + assert(msa!=null); + + + final int minMsaLimit; + if(PAIRED){ +// minMsaLimit=-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore); + minMsaLimit=-1+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore); +// minMsaLimit=0; + }else{ + minMsaLimit=-1+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore); +// minMsaLimit=0; + } + + if(GEN_MATCH_FAST){ + + assert(!(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE) || AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY || + (ss.slowScore==maxSwScore) == ss.perfect()) : + bases.length+", "+ss.toText()+", "+maxSwScore+", "+ss.slowScore+", "+ss.perfect()+", "+ss.semiperfect(); + + //TODO: This WAS disabled because I saw a read marked perfect with a sub in it, probably with quality 0 at that point. + if((SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE) && ss.perfect()){ + assert(ss.stop-ss.start==(bases.length-1)); + ss.match=makePerfectMatchString(bases.length); + assert(ss.isPerfect(bases)) : id+", "+ss; //TODO: Slow assertion + }else{ + int oldScore=ss.slowScore; + assert(ss.gaps==null || ss.gaps[0]==ss.start && ss.gaps[ss.gaps.length-1]==ss.stop); + int padding=(ss.perfect || ss.semiperfect ? 
0 : Tools.max(SLOW_ALIGN_PADDING, 6)); + + if(verbose){System.err.println("Attempting to realign read:\n"+id+", "+ss+"\npadding="+padding+"\nrescued="+ss.rescued());} + + TranslateColorspaceRead.realign_new(ss, bases, msa, padding, true, minMsaLimit, MAX_INDEL<1, id); //Also generates the match string + ss.gaps=GapTools.fixGaps(ss.start, ss.stop, ss.gaps, Shared.MINGAP); + + if(verbose){System.err.println("Realigned read:\n"+id+", "+ss+"\npadding="+padding+"\nrescued="+ss.rescued());} + assert(Read.CHECKSITE(ss, bases, id)); + + if(ss.slowScore"+ss.slowScore); + } + + int extra=(MAX_INDEL>0 ? 80 : 20)+SLOW_ALIGN_PADDING; + int expectedLen=GapTools.calcGrefLen(ss.start, ss.stop, ss.gaps); //TODO Gaps should be correct here!!! + int remaining=(msa.maxColumns-expectedLen-2); + extra=Tools.max(0, Tools.min(remaining/2, extra)); + TranslateColorspaceRead.realign_new(ss, bases, msa, extra, true, minMsaLimit, false, id); + ss.gaps=GapTools.fixGaps(ss.start, ss.stop, ss.gaps, Shared.MINGAP); + assert(Read.CHECKSITE(ss, bases, id)); + + if(verbose){ + System.err.println("\n-> "+ss.start+","+ss.stop+","+ss.slowScore+ + /*(r.originalSite==null ? 
"" : "\t*"+r.originalSite)+*/"\t(extra = "+extra+")"); + } + } + if(verbose){System.err.println("---- B ----");} + assert(Read.CHECKSITE(ss, bases, id)); + + if(verbose){ + System.err.println("---- D3 ----"); + System.err.println(ss); + System.err.println("Checking perfect status: ss.perfect="+ss.perfect()+", ss.semi="+ss.semiperfect()+ + ", maxSwScore="+maxSwScore+", ss.slowScore="+ss.slowScore); + } + ss.setPerfectFlag(maxSwScore, bases); + if(verbose){ + System.err.println("---- E ----"); + System.err.println("Checking perfect status: ss.perfect="+ss.perfect()+", ss.semi="+ss.semiperfect()+ + ", maxSwScore="+maxSwScore+", ss.slowScore="+ss.slowScore); + } + + assert(Read.CHECKSITE(ss, bases, id)); + } + }else{ + if(verbose){System.err.println("---- F ----");} + ChromosomeArray cha=Data.getChromosome(ss.chrom); + + if(ss.perfect()){ + ss.match=makePerfectMatchString(bases.length); + }else{ + assert(false) : "TODO: This does not take strand into account"; + if(ss.slowScore>=maxImperfectSwScore){ + //TODO + } + + if(msa!=null){ + assert(false) : "0 is not good here; try a non-indel match string."; + int[] max=msa.fillLimited(bases, cha.array, ss.start, ss.stop, 0, ss.gaps); + // System.err.print("*"); + ss.match=msa.traceback(bases, cha.array, ss.start, ss.stop, max[0], max[1], max[2], ss.gaps!=null); + } + } + } + if(verbose){System.err.println("---- G ----");} + + assert(Read.CHECKSITE(ss, bases, id)); + return ss.slowScore; + } + + + + /** Returns the number of additional bases away that should be searched for slow align. 
+ * This should probably be called between quickMap and slowAlign, only on + * sites where stop-start<=bases.length-1 */ + final void findTipDeletions(final Read r, final byte[] basesP, final byte[] basesM, final int maxSwScore, final int maxImperfectScore){ + + boolean findRight=r.quality==null || (r.minQualityLastNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_MIN_QUALITY && + r.avgQualityLastNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_AVG_QUALITY); + boolean findLeft=r.quality==null || (r.minQualityFirstNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_MIN_QUALITY && + r.avgQualityFirstNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_AVG_QUALITY); + if(!findRight && !findLeft){ +// System.err.print("."); + return; + } +// System.err.print("*"); + + for(SiteScore ss : r.sites){ + final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM); + if(!ss.semiperfect && ss.slowScore=maxImperfectScore /*&& ss.stop-ss.start<=basesP.length-1*/){return false;} + assert(lookRight || lookLeft); + assert(TIP_DELETION_MAX_TIPLEN>2); + if(bases.length<=2*TIP_DELETION_MAX_TIPLEN){return false;} + assert(TIP_DELETION_MAX_TIPLEN0); + + int maxSearch=TIP_DELETION_SEARCH_RANGE; + maxSearch=Tools.min(maxSearch, ALIGN_COLUMNS-(SLOW_RESCUE_PADDING+8+Tools.max(bases.length, ss.stop-ss.start))); + if(maxSearch<1){return false;} + + boolean changed=false; + + if(lookRight){ + int x=findTipDeletionsRight(bases, ss.chrom, ss.stop, maxSearch, TIP_DELETION_MAX_TIPLEN); + if(x>0){ + assert(x+ss.stop-ss.start0){ + assert(y+ss.stop-ss.start(anchor.sites.size()); + } + + final int maxLooseSwScore=msa.maxQuality(basesP.length); + final int maxAnchorSwScore=msa.maxQuality(anchor.bases.length); + final int maxImperfectScore=msa.maxImperfectScore(basesP.length); + + final int bestLooseScore=loose.sites.isEmpty() ? 
0 : loose.topSite().slowScore; + final int bestAnchorScore=anchor.topSite().slowScore; + + if(bestLooseScore==maxLooseSwScore && bestAnchorScore==maxAnchorSwScore + && anchor.topSite().pairedScore>0){return;} + + int rescueScoreLimit=(int)(0.95f*bestAnchorScore); +// int retainScoreLimit=(int)(bestLooseScore>0 ? 0.58f*bestLooseScore : 0.58f*maxLooseSwScore); + int retainScoreLimit=Tools.max((int)(0.68f*bestLooseScore), (int)(0.4f*maxLooseSwScore)); + int retainScoreLimit2=Tools.max((int)(0.95f*bestLooseScore), (int)(0.55f*maxLooseSwScore)); + final int maxMismatches=PERFECTMODE ? 0 : (bestLooseScore>maxImperfectScore) ? 5 : (int)(0.60f*basesP.length-1); //Higher number is more lenient + assert(PERFECTMODE || maxMismatches>1 || loose.bases.length<16) : loose; //Added the <16 qualifier when a 4bp read failed this assertion + + final boolean findTipDeletions=FIND_TIP_DELETIONS && bestLooseScore=TIP_DELETION_MIN_QUALITY + && loose.avgQualityLastNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_AVG_QUALITY); + final boolean findLeft=findTipDeletions && loose.quality==null || (loose.minQualityFirstNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_MIN_QUALITY + && loose.avgQualityFirstNBases(TIP_DELETION_MAX_TIPLEN)>=TIP_DELETION_AVG_QUALITY); + +// int searchIntoAnchor=Tools.max(20, Tools.min(anchor.bases.length, loose.bases.length)); + for(SiteScore ssa : anchor.sites){ + if(ssa.slowScoreretainScoreLimit && ss.isInBounds()){ + if(ss.score>retainScoreLimit2){//Set them as paired to make them more resistant to being discarded + ss.pairedScore=Tools.max(ss.pairedScore, ss.slowScore+ssa.slowScore/4); + ssa.pairedScore=Tools.max(ssa.pairedScore, ssa.slowScore+ss.slowScore/4); + assert(ss.pairedScore>0); + assert(ssa.pairedScore>0); + } + loose.sites.add(ss); + } + } + } + }else{ + assert(ssa.pairedScore>0); + assert(ssa.pairedScore>ssa.quickScore || ssa.pairedScore>ssa.slowScore) : ssa.toText(); + } + } + } + + + final void slowRescue(final byte[] bases, SiteScore ss, final int 
maxScore, final int maxImperfectScore, + boolean findTipDeletionsRight, boolean findTipDeletionsLeft){ + + int swscoreNoIndel=msa.scoreNoIndels(bases, ss.chrom, ss.start); + final int oldStart=ss.start; + + if(swscoreNoIndel0){ + ss.slowScore=swscoreNoIndel; + if(findTipDeletionsRight || findTipDeletionsLeft){ + boolean changed=findTipDeletions(ss, bases, maxImperfectScore, findTipDeletionsRight, findTipDeletionsLeft); + if(changed){ + ss.match=null; + swscoreNoIndel=msa.scoreNoIndels(bases, ss.chrom, ss.start); + } + } + + final int minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxScore); + + final int minscore=Tools.max(swscoreNoIndel, minMsaLimit); + final int[] swscoreArray=msa.fillAndScoreLimited(bases, ss.chrom, ss.start, ss.stop, SLOW_RESCUE_PADDING, minscore, ss.gaps); + + if(swscoreArray!=null){ + ss.slowScore=ss.score=swscoreArray[0]; + ss.start=swscoreArray[1]; + ss.stop=swscoreArray[2]; + + if(QUICK_MATCH_STRINGS && swscoreArray!=null && swscoreArray.length==6 && swscoreArray[0]>=minscore && (PRINT_SECONDARY_ALIGNMENTS || USE_SS_MATCH_FOR_PRIMARY)){ + assert(swscoreArray.length==6) : swscoreArray.length; + assert(swscoreArray[0]>=minscore) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore; + ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-SLOW_RESCUE_PADDING, ss.stop+SLOW_RESCUE_PADDING, + swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null); + ss.fixXY(bases, true, msa); + }else{ss.match=null;} + + }else{ + ss.slowScore=ss.score=swscoreNoIndel; + ss.start=oldStart; + ss.stop=ss.start+bases.length-1; + } + }else{ + ss.slowScore=ss.score=swscoreNoIndel; + ss.stop=ss.start+bases.length-1; + } + ss.pairedScore=ss.score+1; + assert(ss.slowScore<=maxScore); + ss.perfect=(ss.slowScore==maxScore); + if(ss.perfect){ss.semiperfect=true;} + else{ss.setPerfect(bases);} + } + + + protected static final void capSiteList(Read r, int cap, boolean printSecondary){ + if(r==null || r.sites==null || 
cap<0){return;} + if(cap==0){r.sites=null;} + else{ + for(int i=r.sites.size()-1; i>=cap; i--){r.sites.remove(i);} + } + if(!printSecondary || r.numSites()<2){return;} + int max=r.topSite().slowScore; + int min=Tools.min(max-500, (int)(max*.95f)); + for(int i=r.sites.size()-1; i>0; i--){ + if(r.sites.get(i).slowScore0; i--){ + SiteScore ss2=r.sites.get(i); + if(ss1.chrom==ss2.chrom && ss1.strand==ss2.strand && ss1.start==ss2.start && ss1.stop==ss2.stop){ + if(!Shared.anomaly){ +// Shared.anomaly=true; +// System.err.println("Ignoring anomalous duplicate site: "+"\n"+r.toText(false)+(r.mate==null ? "" : "\n"+r.mate.toText(false))+"\n"); + System.err.println("Ignoring anomalous duplicate site for rid="+r.numericID); +// new Exception().printStackTrace(System.err); + } + r.sites.remove(i); + x++; + }else{break;} + } + return x; + } + + protected final void removeUnmapped(ArrayList list){ + for(int i=0; i list){ + for(int i=0; i list){ + for(int i=0; i list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain); + + public final int trimListAdvanced(ArrayList list, boolean retainPaired, boolean retainSemiperfect, int maxScore, boolean specialCasePerfect, + int minSitesToRetain, int maxSitesToRetain, boolean indexUsesExtendedScore, float thresh){ + if(list==null || list.size()==0){return -99999;} + if(list.size()==1){return list.get(0).score;} + + final int highestScore; + if(indexUsesExtendedScore){ + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(highestScore==maxScore && specialCasePerfect){ + Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + return highestScore; + } + + }else{ + highestScore=Tools.trimSiteList(list, .4f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + thresh=thresh*0.5f; + } + + 
int lim, lastScore=list.get(0).score; + long area=lastScore; + for(lim=1; lim list, final byte[] basesP, final byte[] basesM, + final int maxSwScore, final int maxImperfectSwScore); + + /** This is only for saving ambiguous xy which is now irrelevant */ + public final boolean processAmbiguous(ArrayList list, boolean primary, boolean removeAmbiguous, int clearzone, boolean save_xy){ + if(!save_xy){return true;} + assert(false) : "Needs to be redone with contig names."; + + assert(list.size()>1); + boolean ambiguous=true; +// if(save_xy && minChrom<=24 && maxChrom>=24){ +// int best=list.get(0).score; +// +// //Remove everything outside of the clearzone +// for(int i=list.size()-1; i>0; i--){ +// assert(best>=list.get(i).score); +// if(best-list.get(i).score>clearzone){ +//// assert(i>1); //No longer true because of clearzone/clearzone2 +// list.remove(i); +// }else{ +//// assert(i>0); //Maybe no longer true because of clearzone/clearzone2 +// break; +// } +// } +// +// +// assert(list.size()>1); +// int Xcount=0; +// int Ycount=0; +// for(SiteScore ss : list){ +// assert(ss.score-list.get(0).score<=clearzone); +// if(ss.chrom==23){ +// Xcount++; +// }else if(ss.chrom==24){ +// Ycount++; +// } +// } +// if(Xcount>1 || Ycount>2 || (Xcount+Ycount)0; i--){list.remove(i);} +// assert(list.size()==1); +// } +// } + assert(list.size()>=1); + + if(ambiguous){ + assert(list.size()>1); + if(removeAmbiguous){ + list.clear(); + } + } + + return ambiguous; + } + + + public abstract void calcStatistics1(final Read r, final int maxSwScore, final int maxPossibleQuickScore); + + + public abstract void calcStatistics2(final Read r, final int maxSwScore, final int maxPossibleQuickScore); + +// /** Assumes list is sorted */ +// public abstract void genMatchString(final Read r, final byte[] basesP, final byte[] basesM, final int maxImperfectSwScore, final int maxSwScore, boolean setSSScore, boolean recur); + + public abstract void processRead(Read r, final byte[] basesM); + + 
@Deprecated + protected final boolean applyClearzone3_old(Read r, int CLEARZONE3, float INV_CLEARZONE3){ + + assert(!r.paired()); //This is currently for unpaired reads + if(!r.mapped() || r.ambiguous() || r.discarded() || r.numSites()<2){return false;} + + final int score1=r.topSite().slowScore; + final int score2=r.sites.get(1).slowScore; + final int score3=(r.sites.size()>2 ? r.sites.get(2).slowScore : -1); + int dif=score1-score2; + + assert(r.mapScore==score1) : r.mapScore+", "+r.topSite().toText(); + + assert(score1==r.mapScore); + assert(score1>=score2) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n"; + if(dif>=CLEARZONE3){return false;} + +// final int dif2=40+(CLEARZONE3-dif)/3; +// final int dif2=(CLEARZONE3-dif)/2; + int dif2=(CLEARZONE3-dif); + + float f=dif2*INV_CLEARZONE3; + + int sub=(dif2+2*(int)(f*dif2)); + + if(score3!=-1){ + assert(score1>=score3); + dif=score1-score3; + assert(score1>=score3); + if(dif0; + } + + + protected final boolean applyClearzone3(Read r, int CLEARZONE3, float INV_CLEARZONE3){ + + assert(!r.paired()); //This is currently for unpaired reads + final ArrayList list=r.sites; + if(!r.mapped() || r.ambiguous() || r.discarded() || list==null || list.size()<2){return false;} + + final int score1=list.get(0).slowScore; + assert(r.mapScore==score1) : r.mapScore+", "+list.get(0).toText()+"\n"+r; + + float sub=0; + final int max=Tools.min(CZ3_MULTS.length, list.size()); + for(int i=1; i2 && ss2.slowScore=CLEARZONE3){break;} +// int dif2=(CLEARZONE3-dif); +// float f=dif2*INV_CLEARZONE3; +// sub+=(dif2+2*(f*dif2))*CZ3_MULTS[i]; + float f=calcCZ3_fraction(score1, ss2.slowScore, CLEARZONE3, INV_CLEARZONE3); + if(f<=0){break;} + sub+=(f*CZ3_MULTS[i]); + } + } + assert(sub>=0); + if(sub<=0){return false;} + + float sub2; +// float asymptote=8f+0.0267f*r.bases.length; + float asymptote=4f+0.03f*r.bases.length; + sub=sub*1.8f; + sub2=CLEARZONE3*((asymptote*sub)/(sub+asymptote)); +// sub2=CLEARZONE3*sub; 
+// System.out.println("sub="+sub+", sub2="+sub2+", CLEARZONE3="+CLEARZONE3+", (5*sub)="+(5*sub)+", (sub+5*CLEARZONE3)="+(sub+5*CLEARZONE3)); + int subi=(int)(sub2+0.5f); + if(subi>=r.mapScore-300){ + subi=r.mapScore-300; + } + if(subi<=0){return false;} + + for(SiteScore ss : list){ + ss.score-=subi; + ss.slowScore-=subi; + } + r.mapScore-=subi; + assert(r.mapScore>200); + return true; + } + + +// protected float calcCZ3(int score1, int score2, int CLEARZONE3, float INV_CLEARZONE3){ +// +// int dif=score1-score2; +// if(dif>=CLEARZONE3){return 0;} +// //Now dif is between 0 and CZ3 +// +//// final int dif2=40+(CLEARZONE3-dif)/3; +//// final int dif2=(CLEARZONE3-dif)/2; +// int dif2=(CLEARZONE3-dif); //dif2 is higher if the scores are closer. +// +// float f=dif2*INV_CLEARZONE3; //f ranges linearly from 1 (if the scores are identical) to 0 (when score2 is maximally below score1) +// +// float f2=f*f; +// float f7=(float)Math.pow(f, .7); +// +//// return (dif2+2f*f*dif2+2f*Tools.min(f2,0.5f)*dif2); +// return (CLEARZONE3*f7+2f*f*dif2+2f*Tools.min(f2,0.5f)*dif2); +// } + + + protected float calcCZ3_fraction(int score1, int score2, int CLEARZONE3, float INV_CLEARZONE3){ + + int dif=score1-score2; + if(dif>=CLEARZONE3){return 0;} + //Now dif is between 0 and CZ3 + +// final int dif2=40+(CLEARZONE3-dif)/3; +// final int dif2=(CLEARZONE3-dif)/2; + int dif2=(CLEARZONE3-dif); //dif2 is higher if the scores are closer. 
+ + float f=dif2*INV_CLEARZONE3; //f ranges linearly from 1 (if the scores are identical) to 0 (when score2 is maximally below score1) + + float f2=f*f; +// float f7=(float)Math.pow(f, .7); + +// return (dif2+2f*f*dif2+2f*Tools.min(f2,0.5f)*dif2); + return f+2f*f2+2f*f2*f; + } + + /** Returns number of perfect pairs */ + public abstract int pairSiteScoresInitial(Read r, Read r2, boolean trim); + + + + + + protected static void pairSiteScoresFinal(Read r, Read r2, boolean trim, boolean setScore, int MAX_PAIR_DIST, int AVERAGE_PAIR_DIST, + boolean SAME_STRAND_PAIRS, boolean REQUIRE_CORRECT_STRANDS_PAIRS, int maxTrimSitesToRetain){ + + if(r.sites!=null){ + for(SiteScore ss : r.sites){ss.pairedScore=0;} + } + if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ss.pairedScore=0;} + } + + if(r.numSites()<1 || r2.numSites()<1){return;} + + SiteScore.PCOMP.sort(r.sites); + SiteScore.PCOMP.sort(r2.sites); + + int maxPairedScore1=-1; + int maxPairedScore2=-1; + + +// if(verbose){ +// System.out.println(r.list.size()+", "+r2.list.size()); +// System.out.println(); +// for(SiteScore ss : r.list){ +// System.out.println(ss.toText()); +// } +// System.out.println(); +// for(SiteScore ss : r2.list){ +// System.out.println(ss.toText()); +// } +// System.out.println(); +// } + + final float mult1=Tools.min(1/2f, Tools.max(1/4f, (r.bases.length/(4f*r2.bases.length)))); + final float mult2=Tools.min(1/2f, Tools.max(1/4f, (r2.bases.length/(4f*r.bases.length)))); + + final int ilimit=r.sites.size()-1; + final int jlimit=r2.sites.size()-1; + + final int outerDistLimit=(Tools.max(r.bases.length, r2.bases.length)*OUTER_DIST_MULT)/OUTER_DIST_DIV; //Minimum pairing distance + final int expectedFragLength=AVERAGE_PAIR_DIST+r.bases.length+r2.bases.length; + + if(verboseS){ + System.err.println("************************** PAIRING ********************************"); + System.err.println("outerDistLimit="+outerDistLimit+", MAX_PAIR_DIST="+MAX_PAIR_DIST); + } + + for(int i=0, j=0; i<=ilimit && 
j<=jlimit; i++){ + SiteScore ss1=r.sites.get(i); + SiteScore ss2=r2.sites.get(j); + + while(jMAX_PAIR_DIST))){ + j++; +// if(verbose){System.err.println("a.Incrementing j->"+j);} + ss2=r2.sites.get(j); + } + + for(int k=j; k<=jlimit; k++){ + ss2=r2.sites.get(k); + + if(verboseS){ + System.err.println("Considering sites:\n"+ss1+"\n"+ss2); + } + + if(ss2.chrom>ss1.chrom){break;} + // if(verbose){System.err.println("Same chrom");} + if(ss2.start-ss1.stop>MAX_PAIR_DIST){break;} + + final int innerdist; + final int outerdist; + + //assert(!SAME_STRAND_PAIRS) : "TODO"; + + if(REQUIRE_CORRECT_STRANDS_PAIRS){ + if(ss1.strand!=ss2.strand){ + if(ss1.strand==Gene.PLUS){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + + if(verboseS){ + System.err.println("innerdist="+innerdist+", outerdist="+outerdist); + } + +// if(ss1.start<=ss2.start){ +// innerdist=ss2.start-ss1.stop; +// outerdist=ss2.stop-ss1.start; +// }else{ +// innerdist=ss1.start-ss2.stop; +// outerdist=ss1.stop-ss2.start; +// } + assert(outerdist>=innerdist) : "outerdist=outerDistLimit && innerdist<=MAX_PAIR_DIST){ + + boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS); + // if(verbose){System.err.println("strandOK="+strandOK);} + + if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){ + + int deviation=absdif(AVERAGE_PAIR_DIST, innerdist); + + final int pairedScore1; + final int pairedScore2; + if(strandOK){ + // pairedScore1=ss1.score+(int)(ss2.score*mult1); + // pairedScore2=ss2.score+(int)(ss1.score*mult2); + + pairedScore1=ss1.score+1+ + Tools.max(1, 
(int)(ss2.score*mult1)-(((deviation)*ss2.score)/Tools.max(100,(10*expectedFragLength+100)))); + pairedScore2=ss2.score+1+ + Tools.max(1, (int)(ss1.score*mult2)-(((deviation)*ss1.score)/Tools.max(100,(10*expectedFragLength+100)))); + + + }else{//e.g. a junction + pairedScore1=ss1.score+ss2.score/16; + pairedScore2=ss2.score+ss1.score/16; + } + + if(verboseS){ + System.err.println("strandOK="+strandOK+"\tpairedScore1="+pairedScore1+", pairedScore2="+pairedScore2); + System.err.println(" \tscore1="+ss1.score+", score2="+ss2.score); + } + + ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); + ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); + maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); + maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + // if(verbose){System.err.println("Paired:\nss1="+ss1.toText()+", ss2="+ss2.toText());} + } + }else{ + // if(verbose){System.err.println("Out of range");} + } + } + // if(verbose){System.err.println("\nss1="+ss1.toText()+", ss2="+ss2.toText());} + + } + + if(setScore){ + for(SiteScore ss : r.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} + } + for(SiteScore ss : r2.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} + } + } + + if(trim){ +// Tools.trimSitesBelowCutoffInplace(r.list, (int)(maxPairedScore1*.95f), false); +// Tools.trimSitesBelowCutoffInplace(r2.list, (int)(maxPairedScore2*.95f), false); + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.95f), false, true, 1, maxTrimSitesToRetain); + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.95f), false, true, 1, maxTrimSitesToRetain); + } + } + + protected final boolean canPair(SiteScore ss1, SiteScore ss2, int len1, int len2, + boolean REQUIRE_CORRECT_STRANDS_PAIRS, boolean SAME_STRAND_PAIRS, int MAX_PAIR_DIST){ + if(ss1.chrom!=ss2.chrom){return false;} + if(REQUIRE_CORRECT_STRANDS_PAIRS){ + boolean 
strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS); + if(!strandOK){return false;} + } +// int dist=0; +// +// if(ss1.start<=ss2.start){ +// dist=ss2.start-ss1.stop; +// }else if(ss1.start>ss2.start){ +// dist=ss1.start-ss2.stop; +// } +// +// return (dist>=MIN_PAIR_DIST && dist<=MAX_PAIR_DIST); + +// final int outerDistLimit=MIN_PAIR_DIST+len1+len2; +// final int outerDistLimit=(Tools.max(len1, len2)*(OUTER_DIST_MULT2))/OUTER_DIST_DIV; + final int outerDistLimit=(Tools.max(len1, len2)*(OUTER_DIST_MULT))/OUTER_DIST_DIV; + int innerdist=0; + int outerdist=0; + + if(verboseS){ + System.err.println("canPair: outerDistLimit="+outerDistLimit); + } + +// if(ss1.start<=ss2.start){ +// innerdist=ss2.start-ss1.stop; +// outerdist=ss2.stop-ss1.start; +// }else if(ss1.start>ss2.start){ +// innerdist=ss1.start-ss2.stop; +// outerdist=ss1.stop-ss2.start; +// } +// assert(outerdist>=innerdist); + + //assert(!SAME_STRAND_PAIRS) : "TODO"; + + if(REQUIRE_CORRECT_STRANDS_PAIRS){ + if(ss1.strand!=ss2.strand){ + if(ss1.strand==Gene.PLUS){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + + return (outerdist>=outerDistLimit && innerdist<=MAX_PAIR_DIST); + } + + +// /** Returns the number of additional bases away that should be searched for slow align. 
+// * This should probably be called between quickMap and slowAlign, only on +// * sites where stop-start<=bases.length-1 */ +// public abstract void findTipDeletions(final Read r, final byte[] basesP, final byte[] basesM, final int maxSwScore, final int maxImperfectScore); +// +// public abstract boolean findTipDeletions(SiteScore ss, final byte[] bases, final int maxImperfectScore, boolean lookRight, boolean lookLeft); + + + /** Returns the number of additional bases away that should be searched for slow align. + * This should probably be called between quickMap and slowAlign, only on + * sites where stop-start<=bases.length-1 */ + protected final int findTipDeletionsRight(final byte[] bases, final int chrom, + int originalStop, int searchDist, int tiplen){ + ChromosomeArray cha=Data.getChromosome(chrom); + byte[] ref=cha.array; + if(originalStop1); + if(tiplen<4){return 0;} +// System.err.println("Tiplen="+tiplen+", mismatches="+originalMismatches); +// System.err.print("* "); + + searchDist=Tools.min(searchDist, 30*originalMismatches); + int lastIndexToStart=Tools.min(ref.length-1, originalStop+searchDist); + for(int start=originalStop+1; start<=lastIndexToStart && minMismatches>0; start++){ +// System.err.print("_"); + int mismatches=0; + for(int j=0; j2 || originalMismatches-minMismatches<2){ + return 0; + } +// System.err.println(" $$$ "); + return bestStart-originalStop; + } + + + /** Returns the number of additional bases away that should be searched for slow align. 
+ * This should probably be called between quickMap and slowAlign, only on + * sites where stop-start<=bases.length-1 */ + protected final int findTipDeletionsLeft(final byte[] bases, final int chrom, + final int originalStart, int searchDist, int tiplen){ + ChromosomeArray cha=Data.getChromosome(chrom); + byte[] ref=cha.array; + if(originalStart+tiplen>=ref.length){return 0;} //fail + + if(cha.minIndex>=originalStart){return 0;} //fail + + int minMismatches=tiplen; + int bestStart=originalStart; + + int lastMismatch=0; + int originalMismatches=0; + int contig=0; + for(int i=0; i1); + if(tiplen<4){return 0;} +// System.err.println("Tiplen="+tiplen+", mismatches="+originalMismatches); +// System.err.print("* "); + + searchDist=Tools.min(searchDist, 16+16*originalMismatches+8*tiplen); + int lastIndexToStart=Tools.max(cha.minIndex, originalStart-searchDist); + for(int start=originalStart-1; start>=lastIndexToStart && minMismatches>0; start--){ +// System.err.print("_"); + int mismatches=0; + for(int j=0; j2 || originalMismatches-minMismatches<2){ + return 0; + } +// System.err.println(" $$$ "); + return originalStart-bestStart; + } + + +// public abstract void rescue(Read anchor, Read loose, byte[] basesP, byte[] basesM, int searchDist); + + +// public abstract void slowRescue(final byte[] bases, SiteScore ss, final int maxScore, final int maxImperfectScore, +// boolean findTipDeletionsRight, boolean findTipDeletionsLeft); + + + /** Assumes bases/colors are already on the correct strand */ + public final SiteScore quickRescue(final byte[] bases, final int chrom, final byte strand, final int loc, final int searchDist, + final boolean searchRight, final int idealStart, final int maxAllowedMismatches, int POINTS_MATCH, int POINTS_MATCH2){ + if(bases==null || bases.length<10){return null;} + ChromosomeArray cha=Data.getChromosome(chrom); + byte[] ref=cha.array; + + int lowerBound, upperBound; + if(searchRight){ + lowerBound=Tools.max(cha.minIndex, loc); + 
upperBound=Tools.min(ref.length-bases.length, loc+searchDist); + }else{ + lowerBound=Tools.max(cha.minIndex, loc-searchDist); + upperBound=Tools.min(ref.length-bases.length, loc); + } + +// int minMismatches=(int)(bases.length*.6f); //Default: .75f. Lower numbers are faster with lower quality. + int minMismatches=maxAllowedMismatches+1; + //For situations like RNASEQ with lots of deletions, a higher value of at least .75 should be used. + + int maxContigMatches=0; + int bestScore=0; + int bestStart=-1; + int bestAbsdif=Integer.MAX_VALUE; + + if(searchRight){ + for(int start=lowerBound; start<=upperBound/* && minMismatches>0*/; start++){ + int mismatches=0; + int contig=0; + int currentContig=0; + for(int j=0; jbestScore || (score==bestScore && absdif=start && lowerBound<=start); +// assert(upperBound>=idealStart); +// assert(lowerBound<=idealStart); + } + } + }else{ + for(int start=upperBound; start>=lowerBound/* && minMismatches>0*/; start--){ + int mismatches=0; + int contig=0; + int currentContig=0; + for(int j=0; jbestScore || (score==bestScore && absdif=start && lowerBound<=start); +// assert(upperBound>=idealStart); +// assert(lowerBound<=idealStart); + } + } + } + + if(bestStart<0){return null;} + + //These scores are dummies and will not quite match the normally generated scores. + final int scoreOut; + if(USE_AFFINE_SCORE){ + scoreOut=POINTS_MATCH+(POINTS_MATCH2*(bases.length-1-minMismatches)); + }else{ + scoreOut=maxContigMatches+(BASE_HIT_SCORE*(bases.length-minMismatches)); + } + + SiteScore ss=new SiteScore(chrom, strand, bestStart, bestStart+bases.length-1, 0, scoreOut); + ss.setPerfect(bases); + ss.rescued=true; + ss.slowScore=minMismatches; //TODO: Clear this field later! 
+ return ss; + } + + + /** Assumes bases/colors are already on the correct strand */ + protected final int[] quickerRescue(final byte[] bases, final int chrom, int loc, final int searchDist){ + ChromosomeArray cha=Data.getChromosome(chrom); + byte[] ref=cha.array; + if(loc0; start++){ + int mismatches=0; + for(int j=0; j ssl=r.sites; + if(ssl==null){return 0;} + int initial=ssl.size(); + for(int i=0; imax){ + ssl.remove(i); + i--; + ss=null; + }else if(/*DONT_OUTPUT_UNMAPPED_READS && */SAM_OUT){ + if(!Data.isSingleScaffold(ss.chrom, ss.start, ss.stop)){ + //TODO: Attempt to trim instead of removing + ssl.remove(i); + i--; + ss=null; + } + } + if(ss!=null){ + int expectedLen=GapTools.calcGrefLen(ss); + if(expectedLen>=EXPECTED_LEN_LIMIT){ + //TODO: Alternately, I could kill the site. + ss.stop=ss.start+Tools.min(r.bases.length+40, EXPECTED_LEN_LIMIT); + if(ss.gaps!=null){GapTools.fixGaps(ss);} + } + } + } + +// System.out.println("Estimated greflen: "+GapTools.calcGrefLen(r.start, r.stop, r.gaps)); +// assert(false); + + return initial-ssl.size(); + } + + protected static final int forbidSelfMapping(ArrayList ssl, SiteScore original){ +// assert(original!=null); + if(ssl==null || ssl.isEmpty() || original==null){return 0;} + int removed=0; + for(int i=0; i0){Tools.condenseStrict(ssl);} + return removed; + } + + + /** Generate a score penalty based on the presence of errors near the read tips. 
*/ + public static int calcTipScorePenalty(final Read r, final int maxScore, final int tiplen){ + if(!r.mapped() || r.match==null || r.bases.length<2*tiplen){return 0;} + + int points=0; + final byte[] match=r.match; + final byte[] bases=r.bases; + final int last=r.bases.length-1; + byte prev='m'; + for(int i=0, cpos=0; cpos<=tiplen; i++){ + byte b=match[i]; + if(b=='m'){ + cpos++; + }else if(b=='D'){ + if(prev!='D'){points+=2*(tiplen+2-cpos);} + }else if(b=='N' || b=='C'){ + points+=(tiplen+2-cpos); + cpos++; + }else{ + if(Character.isDigit(b)){ + r.match=Read.toLongMatchString(r.match); + return calcTipScorePenalty(r, maxScore, tiplen); + } + assert(b=='I' || b=='S') : ((char)b)+"\n"+new String(match)+"\n"+new String(bases)+"\n"; + points+=2*(tiplen+2-cpos); + cpos++; + } + prev=b; + } + + prev='m'; + for(int i=match.length-1, cpos=0; cpos<=tiplen; i--){ + byte b=match[i]; + if(b=='m'){ + cpos++; + }else if(b=='D'){ + if(prev!='D'){points+=2*(tiplen+2-cpos);} + }else if(b=='N' || b=='C'){ + points+=(tiplen+2-cpos); + cpos++; + }else{ + assert(b=='I' || b=='S'); + points+=2*(tiplen+2-cpos); + cpos++; + } + prev=b; + } + + byte b=bases[0]; + //homopolymer tip penalty + if(b!='N' && b==bases[1]){ + for(int i=2; i<=tiplen && bases[i]==b; i++){points++;} + } + + //homopolymer tip penalty + b=bases[last]; + if(b!='N' && b==bases[last-1]){ + for(int i=last-2; i>=(last-tiplen) && bases[i]==b; i--){points++;} + } + + //Did not seem to help +// int hits=r.list.get(0).hits; +// float desired=Tools.min(6, bases.length/12f); +// if(hits0){ + r.mapScore-=penalty; + for(SiteScore ss : r.sites){ + ss.score-=penalty; + ss.slowScore-=penalty; + ss.pairedScore-=penalty; + } + } + } + + + /** {group of correct hit (or -1), size of correct group, number of groups, + * number of elements, correctScore, maxScore, size of top group, num correct, firstElementCorrect, + * firstElementCorrectLoose, firstGroupCorrectLoose} */ + protected int[] calcCorrectness(Read r, int thresh){ + //assume 
sorted. + ArrayList ssl=r.sites; + + if(ssl==null || ssl.isEmpty()){ + return new int[] {-1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + } + + SiteScore original=r.originalSite; + assert((original==null) != (r.synthetic())); + if(original==null){ + original=ssl.get(0); + } + + int group=0; + int correctGroup=-1; + int groupSize=0; + int correctGroupSize=-1; + int prevScore=Integer.MAX_VALUE; + int sizeOfTopGroup=0; + SiteScore correct=null; + + int firstElementCorrect=0; + int firstElementCorrectLoose=0; + int firstGroupCorrectLoose=0; + + int numCorrect=0; + + for(int i=0; iss.score || (AMBIGUOUS_RANDOM && r.ambiguous()) || r.mate!=null) : "i="+i+", r="+r; + + if(correctGroup==group){ + correctGroupSize=groupSize; + } + + group++; + groupSize=0; + prevScore=ss.score; + } + groupSize++; + + +// boolean b=isCorrectHit(ss, original.chrom, original.strand, original.start, 1, thresh); + boolean b=isCorrectHit(ss, original.chrom, original.strand, original.start, original.stop, thresh); + boolean b2=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, thresh+20); + if(b){ + if(i==0){firstElementCorrect=1;} + numCorrect++; + if(correct==null){ + correct=ss; + correctGroup=group; + } + } + if(b2){ + if(i==0){firstElementCorrectLoose=1;} + if(group==0){firstGroupCorrectLoose=1;} + } + } + if(correctGroup==group){ + correctGroupSize=groupSize; + } + + assert(correctGroup!=0 && correctGroup<=group); + assert(group<=ssl.size()); + assert(sizeOfTopGroup>0 && sizeOfTopGroup<=ssl.size()); + assert((correctGroup>0) == (correctGroupSize>0)); + return new int[] {correctGroup, correctGroupSize, group, ssl.size(), + correct==null ? 
0 : correct.score, ssl.get(0).score, sizeOfTopGroup, numCorrect, firstElementCorrect, + firstElementCorrectLoose, firstGroupCorrectLoose}; + } + + + public static final boolean isCorrectHit(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh){ +// boolean b=(ss.chrom==trueChrom && ss.strand==trueStrand); + if(ss.chrom!=trueChrom || ss.strand!=trueStrand){return false;} + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + + return (absdif(ss.start, trueStart)<=thresh && absdif(ss.stop, trueStop)<=thresh); +// return (absdif(ss.start, trueStart)<=thresh || absdif(ss.stop, trueStop)<=thresh); + +// if(absdif(ss.start, trueStart)<=thresh){return true;} +// if(absdif(ss.stop, trueStop)<=thresh){return true;} +// return false; + +// if(absdif(ss.start, trueStart)>thresh){return false;} +// if(absdif(ss.stop, trueStop)>thresh){return false;} +// return true; + } + + + public static final boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh){ +// boolean b=(ss.chrom==trueChrom && ss.strand==trueStrand); + if(ss.chrom!=trueChrom || ss.strand!=trueStrand){return false;} + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + + return (absdif(ss.start, trueStart)<=thresh || absdif(ss.stop, trueStop)<=thresh); + +// if(absdif(ss.start, trueStart)<=thresh){return true;} +// if(absdif(ss.stop, trueStop)<=thresh){return true;} +// return false; + +// if(absdif(ss.start, trueStart)>thresh){return false;} +// if(absdif(ss.stop, trueStop)>thresh){return false;} +// return true; + } + + protected static final byte[] makePerfectMatchString(int len){ + byte[] r=new byte[len]; + Arrays.fill(r, (byte)'m'); + return r; + } + + protected static final int absdif(int a, int b){ + return a>b ? 
a-b : b-a; + } + + /** Returns maximum read length supported by this mapper */ + public abstract int maxReadLength(); + + /** Ensure top site is congruent with read */ + protected static final boolean checkTopSite(Read r){ + if(!r.mapped()){return true;} + if(r.numSites()==0){return false;} + SiteScore ss=r.topSite(); + if(ss==null){return false;} + boolean b=(ss.start==r.start) && (ss.stop==r.stop) && (ss.strand==r.strand()) && (ss.chrom==r.chrom) && (ss.match==r.match); + assert(b) : "\nread="+r+"\nmate="+r.mate+"\nss="+ss+"\n"+(ss==null ? "ss is null" : + ((ss.start==r.start)+", "+(ss.stop==r.stop)+", "+(ss.strand==r.strand())+", "+(ss.chrom==r.chrom)+", "+(ss.match==r.match))+"\nlist="+r.sites); + return b; + } + + + protected static final int removeLongIndels(ArrayList list, int maxlen){ + if(list==null || list.size()<1){return 0;} + int removed=0; + for(int i=list.size()-1; i>=0; i--){ + SiteScore ss=list.get(i); + if(hasLongIndel(ss.match, maxlen)){ + list.remove(i); + removed++; + } + } + return removed; + } + + protected static final boolean hasLongIndel(byte[] match, int maxlen){ + if(match==null || match.lengthmaxlen){return true;} + }else{ + len=0; + } + prev=b; + } + return false; + } + + /** TODO */ + final void processReadSplit(Read r, byte[] basesM, int minlen, int maxlen){ + assert(minlen>=KEYLEN && maxlen>=minlen) : KEYLEN+", "+maxlen+", "+minlen; + int len=r.bases==null ? 
0 : r.bases.length; + if(len<=maxlen){ + processRead(r, basesM); + return; + } + ArrayList subreads=r.split(minlen, maxlen); + } + + public final synchronized boolean finished(){return finished;} + + public final synchronized boolean working(){return !finished;} + + final synchronized void finish(){ + assert(!finished); + finished=true; + notifyAll(); + } + + private boolean finished=false; + + private static final float[] CZ3_MULTS=new float[] {0f, 1f, .75f, 0.5f, 0.25f, 0.125f, 0.0625f}; + + /*--------------------------------------------------------------*/ + + /** Input read source. */ + protected final ConcurrentReadStreamInterface cris; + + + /** All reads go here.
+ * If outputunmapped=false, omit unmapped single reads and double-unmapped paired reads. */ + protected final RTextOutputStream3 outStream; + /** All mapped reads (and half-mapped pairs) go here except reads that only map to the blacklist. */ + protected final RTextOutputStream3 outStreamMapped; + /** All unmapped reads (and double-unmapped pairs) go here. */ + protected final RTextOutputStream3 outStreamUnmapped; + /** All reads (and half-mapped pairs) that map best to the blacklist go here. */ + protected final RTextOutputStream3 outStreamBlack; + + + /*--------------------------------------------------------------*/ + + + public final String MSA_TYPE; + final MSA msa; + final TranslateColorspaceRead tcr; + public final ReadStats readstats; + public final int POINTS_MATCH, POINTS_MATCH2; + public final int KEYLEN; + + protected final boolean PERFECTMODE; //Only look for perfect matches + protected final boolean SEMIPERFECTMODE; //Only look for perfect and semiperfect matches + protected final boolean FORBID_SELF_MAPPING; //Do not allow reads to map to their official origin. Allows you to find next-best matches (when supported) + protected final boolean RCOMP_MATE; //Reverse-complement mate prior to mapping + /** True if this thread should generate a match string for the best match */ + protected final boolean MAKE_MATCH_STRING; + + protected final boolean DONT_OUTPUT_UNMAPPED_READS; + protected final boolean DONT_OUTPUT_BLACKLISTED_READS; + protected final boolean PRINT_SECONDARY_ALIGNMENTS; + protected final boolean QUICK_MATCH_STRINGS; + protected final boolean USE_SS_MATCH_FOR_PRIMARY=true; + + protected final int MAX_SITESCORES_TO_PRINT; + + /** Scores below the (max possible alignment score)*(MINIMUM_ALIGNMENT_SCORE_RATIO) will be discarded. + * Default: 0.4 for synthetic data. 
*/ + protected final float MINIMUM_ALIGNMENT_SCORE_RATIO; + protected final float MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE; + protected final float MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED; + + protected final float keyDensity; + protected final float maxKeyDensity; + protected final float minKeyDensity; + protected final int maxDesiredKeys; + + /*--------------------------------------------------------------*/ + + final int CLEARZONE1e; + + /*--------------------------------------------------------------*/ + + final int MIN_APPROX_HITS_TO_KEEP; + final boolean USE_EXTENDED_SCORE; + public static final boolean GENERATE_BASE_SCORES_FROM_QUALITY=AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY; + final int BASE_HIT_SCORE; + final int BASE_KEY_HIT_SCORE; + final boolean USE_AFFINE_SCORE; + final int EXPECTED_LEN_LIMIT; + final int MAX_INDEL; + + final boolean TRIM_LIST; + final int TIP_DELETION_SEARCH_RANGE; + final boolean FIND_TIP_DELETIONS; + final int ALIGN_COLUMNS; + + /*--------------------------------------------------------------*/ + + + /** Deprecated. Must be set to false. Reads and index are in SOLiD colorspace. */ + protected final boolean colorspace; + /** Use dynamic programming slow-alignment phase to increase quality. Program may not run anymore if this is disabled. */ + protected final boolean SLOW_ALIGN; + /** Produce local alignments instead of global alignments */ + protected final boolean LOCAL_ALIGN; + /** Discard reads with ambiguous alignments (consider them unmapped). */ + protected final boolean AMBIGUOUS_TOSS; + /** Choose a random site for reads with ambiguous alignments. */ + protected final boolean AMBIGUOUS_RANDOM; + /** Output all sites for reads with ambiguous alignments. */ + protected final boolean AMBIGUOUS_ALL; + /** Quality-trim left side of reads before mapping. */ + protected final boolean TRIM_LEFT; + /** Quality-trim right side of reads before mapping. 
*/ + protected final boolean TRIM_RIGHT; + /** Undo quality trimming after mapping. */ + protected final boolean UNTRIM; + /** Trim until 2 consecutive bases are encountered with at least this quality. */ + protected final byte TRIM_QUAL; + /** Don't trim reads to be shorter than this */ + protected final int TRIM_MIN_LENGTH=30; + /** Distance cutoff for classifying a read as loosely correct */ + protected final int THRESH; + /** Semi-deprecated. Minimum chrom to index or load. */ + protected final int minChrom; + /** Semi-deprecated. Maximum chrom to index or load. */ + protected final int maxChrom; + /** Disallow sites that do not have at least k consecutive matching bases. */ + protected final int KFILTER; + + + /** When reads are not in valid pairing orientation, eliminate (mark unmapped) the lower-scoring read. */ + protected final boolean KILL_BAD_PAIRS; + /** For human genome, map ambiguous reads in the PAR to the X chromosome. */ + protected final boolean SAVE_AMBIGUOUS_XY; + /** Deprecated. Must be set to true. */ + protected final boolean GEN_MATCH_FAST=true; + /** For colorspace reads, translate to base space before outputting them. */ + protected final boolean translateToBaseSpace; + + /** Padding for dynamic-programming slow alignment. */ + protected final int SLOW_ALIGN_PADDING; + /** Padding for dynamic-programming slow alignment for rescued reads (which typically may need more padding). */ + protected final int SLOW_RESCUE_PADDING; + /** If a site is unpaired, search nearby for a possible site for the other read. 
*/ + protected final boolean DO_RESCUE; + /** Forbid alignments with indels longer than MAX_INDEL */ + protected final boolean STRICT_MAX_INDEL; + /** Bandwidth of banded MSA */ + protected final int BANDWIDTH; + + protected final boolean PAIRED; + protected final boolean REQUIRE_CORRECT_STRANDS_PAIRS; + protected final boolean SAME_STRAND_PAIRS; + + /*--------------------------------------------------------------*/ + + protected int AVERAGE_PAIR_DIST=100; + + /** Extra padding for when slow alignment fails. */ + protected int EXTRA_PADDING=10; + + protected final boolean GENERATE_KEY_SCORES_FROM_QUALITY; + + /*--------------------------------------------------------------*/ + + protected static boolean CALC_STATISTICS=true; + protected static int MIN_PAIR_DIST=-160; + protected static int MAX_PAIR_DIST=32000; + /** IMPORTANT!!!! This option causes non-deterministic output. */ + protected static final boolean DYNAMIC_INSERT_LENGTH=true; + /** Counts undefined bases. */ + protected static final boolean DISCARD_MOSTLY_UNDEFINED_READS=true; + + protected static final byte TIP_DELETION_MIN_QUALITY=6; + protected static final byte TIP_DELETION_AVG_QUALITY=14; + protected static final int TIP_DELETION_MAX_TIPLEN=8; + + protected static final int OUTER_DIST_MULT=14; +// protected static final int OUTER_DIST_MULT2=OUTER_DIST_MULT-1; + protected static final int OUTER_DIST_DIV=32; + + protected static long SKIP_INITIAL=0; + + protected static boolean OUTPUT_PAIRED_ONLY=false; + +// static{if(OUTER_DIST_MULT2<1){throw new RuntimeException();}} + + /*--------------------------------------------------------------*/ + + public int totalNumCorrect1=0; + public int totalNumIncorrect1=0; + public int totalNumIncorrectPrior1=0; + public int totalNumCapturedAllCorrect1=0; + public int totalNumCapturedAllCorrectTop1=0; + public int totalNumCapturedAllCorrectOnly1=0; + + public int totalNumCorrect2=0; + public int totalNumIncorrect2=0; + public int totalNumIncorrectPrior2=0; + public 
int totalNumCapturedAllCorrect2=0; + public int totalNumCapturedAllCorrectTop2=0; + public int totalNumCapturedAllCorrectOnly2=0; + + /*--------------------------------------------------------------*/ + + public boolean verbose=false; + public static final boolean verboseS=false; + + public long readsUsed=0; + public long readsUsed2=0; + public long numMated=0; + public long badPairs=0; + public long innerLengthSum=0; + public long outerLengthSum=0; + public long insertSizeSum=0; + public long keysUsed=0; + public long basesUsed=0; //basesUsed and basesAtQuickmap are identical + public long basesAtQuickmap=0; //basesUsed and basesAtQuickmap are identical + public long syntheticReads=0; + + public int mapped1=0; + public int mappedRetained1=0; + public int rescuedP1=0; + public int rescuedM1=0; + public int truePositiveP1=0; + public int truePositiveM1=0; + public int falsePositive1=0; + public int totalCorrectSites1=0; + + public int firstSiteCorrectP1=0; + public int firstSiteCorrectM1=0; + public int firstSiteIncorrect1=0; + public int firstSiteCorrectLoose1=0; + public int firstSiteIncorrectLoose1=0; + public int firstSiteCorrectPaired1=0; + public int firstSiteCorrectSolo1=0; + public int firstSiteCorrectRescued1=0; + + public long matchCountS1=0; + public long matchCountI1=0; + public long matchCountD1=0; + public long matchCountM1=0; + public long matchCountN1=0; + + + public int perfectHit1=0; //Highest quick score is max quick score + public int uniqueHit1=0; //Only one hit has highest score + public int correctUniqueHit1=0; //unique highest hit on answer site + public int correctMultiHit1=0; //non-unique highest hit on answer site + public int correctLowHit1=0; //hit on answer site, but not highest scorer + public int noHit1=0; + + /** Number of perfect hit sites found */ + public int perfectHitCount1=0; + /** Number of sites found that are perfect except for no-ref */ + public int semiPerfectHitCount1=0; + + + public int perfectMatch1=0; //Highest slow 
score is max slow score + public int semiperfectMatch1=0; + + public int ambiguousBestAlignment1=0; + + public long initialSiteSum1=0; + public long postTrimSiteSum1=0; + public long postRescueSiteSum1=0; + public long siteSum1=0; + public long topSiteSum1=0; + + public long lowQualityReadsDiscarded1=0; + + public int mapped2=0; + public int mappedRetained2=0; + public int rescuedP2=0; + public int rescuedM2=0; + public int truePositiveP2=0; + public int truePositiveM2=0; + public int falsePositive2=0; + public int totalCorrectSites2=0; + + public int firstSiteCorrectP2=0; + public int firstSiteCorrectM2=0; + public int firstSiteIncorrect2=0; + public int firstSiteCorrectLoose2=0; + public int firstSiteIncorrectLoose2=0; + public int firstSiteCorrectPaired2=0; + public int firstSiteCorrectSolo2=0; + public int firstSiteCorrectRescued2=0; + + public long matchCountS2=0; + public long matchCountI2=0; + public long matchCountD2=0; + public long matchCountM2=0; + public long matchCountN2=0; + + public int perfectHit2=0; //Highest quick score is max quick score + public int uniqueHit2=0; //Only one hit has highest score + public int correctUniqueHit2=0; //unique highest hit on answer site + public int correctMultiHit2=0; //non-unique highest hit on answer site + public int correctLowHit2=0; //hit on answer site, but not highest scorer + public int noHit2=0; + + /** Number of perfect hit sites found */ + public int perfectHitCount2=0; + /** Number of sites found that are perfect except for no-ref */ + public int semiPerfectHitCount2=0; + + public int perfectMatch2=0; //Highest slow score is max slow score + public int semiperfectMatch2=0; + + public int ambiguousBestAlignment2=0; + + public long initialSiteSum2=0; + public long postTrimSiteSum2=0; + public long postRescueSiteSum2=0; + public long siteSum2=0; + public long topSiteSum2=0; + + public long lowQualityReadsDiscarded2=0; + + /*--------------------------------------------------------------*/ + + int idmodulo; +} 
diff --git a/current/align2/AbstractMapper.java b/current/align2/AbstractMapper.java new file mode 100755 index 0000000..75280bd --- /dev/null +++ b/current/align2/AbstractMapper.java @@ -0,0 +1,2476 @@ +package align2; + +import java.io.File; +import java.io.PrintStream; +import java.lang.Thread.State; +import java.util.ArrayList; +import java.util.Arrays; + +import jgi.CalcTrueQuality; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.ConcurrentSolidInputStream; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.FastqReadInputStream; +import stream.RTextInputStream; +import stream.RTextOutputStream3; +import stream.RandomReadInputStream; +import stream.Read; +import stream.ReadStreamWriter; +import stream.SamLine; +import stream.SamReadInputStream; +import stream.SequentialReadInputStream; + +import dna.Data; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ReadWrite; +import fileIO.FileFormat; + +/** + * Abstract superclass created from BBMap variants. + * Handles argument parsing, I/O stream initialization and shutdown, + * thread management, statistics collection and formatting. + * @author Brian Bushnell + * @date Oct 15, 2013 + * + */ +public abstract class AbstractMapper { + + public AbstractMapper(String[] args){ + if(Shared.COMMAND_LINE==null){ + Shared.COMMAND_LINE=(args==null ? 
null : args.clone()); + Shared.BBMAP_CLASS=this.getClass().getName(); + int x=Shared.BBMAP_CLASS.lastIndexOf('.'); + if(x>=0){Shared.BBMAP_CLASS=Shared.BBMAP_CLASS.substring(x+1);} + } + setDefaults(); + preparse0(args); + String[] args2=preparse(args); + parse(args2); + postparse(args2); + setup(); + } + + void printOptions(){ + sysout.println("For help, please consult readme.txt or run the shellscript with no parameters."); + } + + final void abort(AbstractMapThread[] mtts, String message){ + closeStreams(cris, rosA, rosM, rosU, rosB); + if(mtts!=null){int x=shutDownThreads(mtts, true);} + if(message==null){throw new RuntimeException();} + throw new RuntimeException(message); + } + + /** In megabytes */ + final void adjustThreadsforMemory(long threadMem){ + Runtime rt=Runtime.getRuntime(); + long mmemory=rt.maxMemory()/1000000; + long tmemory=rt.totalMemory()/1000000; + long fmemory=rt.freeMemory()/1000000; + long umemory=tmemory-fmemory; + long amemory=mmemory-umemory-40; +// System.err.println("mmemory="+mmemory+", tmemory="+tmemory+", fmemory="+fmemory+", umemory="+umemory+", amemory="+amemory); + int maxThreads=(int)(amemory/threadMem); + if(Shared.THREADS>maxThreads){ + System.err.println("\nMax Memory = "+mmemory+" MB\nAvailable Memory = "+amemory+" MB"); + if(maxThreads<1){abort(null, "\n\nNot enough memory. 
Please run on a node with at least "+((long)((umemory+40+threadMem)*1.15))+" MB.\n");} + System.err.println("Reducing threads from "+Shared.THREADS+" to "+maxThreads+" due to low system memory."); + Shared.THREADS=maxThreads; + } + } + + abstract void setDefaults(); + + abstract String[] preparse(String[] args); + + abstract void postparse(String[] args); + + abstract void setup(); + + abstract void loadIndex(); + + abstract void processAmbig2(); + + abstract void testSpeed(String[] args); + + abstract void setSemiperfectMode(); + + abstract void setPerfectMode(); + + abstract void printSettings(int k); + + private final void parse(String[] args){ + + + sysout.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + sysout.println("BBMap version "+Shared.BBMAP_VERSION_STRING); + + if(Tools.parseHelp(args)){ + printOptions(); + System.exit(0); + } + + Timer t=new Timer(); + t.start(); + + for(int i=0; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} +// System.err.println("Processing "+arg); + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("printtoerr")){ + if(Tools.parseBoolean(b)){ + sysout=System.err; + Data.sysout=System.err; + } + }else if(a.equals("colorspace") || a.equals("cs")){ + colorspace=Tools.parseBoolean(b); + sysout.println("Set colorspace to "+colorspace); + }else if(a.equals("path") || a.equals("root")){ + Data.setPath(b); + }else if(a.equals("ref") || a.equals("reference") || a.equals("fasta")){ + reference=b; + }else if(a.equals("in") || a.equals("in1")){ + in1=b; + }else if(a.equals("in2")){ + in2=b; + }else if(a.equals("out")){ + if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")){ + outFile=null; + }else{ + outFile=b; +// outFile=b.replace('#', '1'); +// outFile2=(b.contains("#") ? 
b.replace('#', '2') : null); + } + }else if(a.equals("out1")){ + outFile=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + if(outFile==null){ + outFile=null; + } + }else if(a.equals("out2")){ + outFile2=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("outm") || a.equals("outm1") || a.equals("outmapped") || a.equals("outmapped1")){ + outFileM=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("outm2") || a.equals("outmapped2")){ + outFileM2=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("outu") || a.equals("outu1") || a.equals("outunmapped") || a.equals("outunmapped1")){ + outFileU=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("outu2") || a.equals("outunmapped2")){ + outFileU2=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("outb") || a.equals("outb1") || a.equals("outblack") || a.equals("outblack1") || a.equals("outblacklist") || a.equals("outblacklist1")){ + outFileB=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("outb2") || a.equals("outblack2") || a.equals("outblacklist2")){ + outFileB2=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? 
null : b; + }else if(a.equals("blacklist") && !Data.scaffoldPrefixes){ + if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")){blacklist=null;} + else{ + if(blacklist==null){blacklist=new ArrayList();} + if(b.indexOf(',')<0 || new File(b).exists()){blacklist.add(b);} + else{ + String[] temp=b.split(","); + for(String tmp : temp){blacklist.add(tmp);} + } + } + }else if(a.startsWith("out_") && b!=null){ + //ignore, it will be processed later + }else if(a.equals("qualityhistogram") || a.equals("qualityhist") || a.equals("qhist")){ + ReadStats.QUAL_HIST_FILE=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + ReadStats.COLLECT_QUALITY_STATS=(ReadStats.QUAL_HIST_FILE!=null); + if(ReadStats.COLLECT_QUALITY_STATS){sysout.println("Set quality histogram output to "+ReadStats.QUAL_HIST_FILE);} + }else if(a.equals("matchhistogram") || a.equals("matchhist") || a.equals("mhist")){ + ReadStats.MATCH_HIST_FILE=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? null : b; + ReadStats.COLLECT_MATCH_STATS=(ReadStats.MATCH_HIST_FILE!=null); + if(ReadStats.COLLECT_MATCH_STATS){sysout.println("Set match histogram output to "+ReadStats.MATCH_HIST_FILE);} + }else if(a.equals("inserthistogram") || a.equals("inserthist") || a.equals("ihist")){ + ReadStats.INSERT_HIST_FILE=(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")) ? 
null : b; + ReadStats.COLLECT_INSERT_STATS=(ReadStats.INSERT_HIST_FILE!=null); + if(ReadStats.COLLECT_INSERT_STATS){sysout.println("Set insert size histogram output to "+ReadStats.INSERT_HIST_FILE);} + }else if(a.equals("bamscript") || a.equals("bs")){ + bamscript=b; + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("trd") || a.equals("trc") || a.equals("trimreaddescription") || a.equals("trimreaddescriptions")){ + Shared.TRIM_READ_COMMENTS=Tools.parseBoolean(b); + }else if(a.equals("fakequal") || a.equals("fakequality")){ + if(b==null || b.length()<1){b="f";} + if(Character.isLetter(b.charAt(0))){ + FastaReadInputStream.FAKE_QUALITY=Tools.parseBoolean(b); + }else{ + int x=Integer.parseInt(b); + if(x<1){ + FastaReadInputStream.FAKE_QUALITY=false; + }else{ + FastaReadInputStream.FAKE_QUALITY=true; + FastaReadInputStream.FAKE_QUALITY_LEVEL=(byte)Tools.min(x, 50); + } + } + }else if(a.equals("keepnames")){ + SamLine.KEEP_NAMES=Tools.parseBoolean(b); + }else if(a.equals("local")){ + LOCAL_ALIGN=Tools.parseBoolean(b); + }else if(a.equals("idtag")){ + SamLine.MAKE_IDENTITY_TAG=Tools.parseBoolean(b); + }else if(a.equals("inserttag")){ + SamLine.MAKE_INSERT_TAG=Tools.parseBoolean(b); + }else if(a.equals("correctnesstag")){ + SamLine.MAKE_CORRECTNESS_TAG=Tools.parseBoolean(b); + }else if(a.equals("minidentity") || a.equals("minid")){ + if(b.lastIndexOf('%')==b.length()-1){minid=Double.parseDouble(b.substring(b.length()-1))/100;} + else{minid=Double.parseDouble(b);} + assert(minid>=0 && minid<=100) : "min identity must be between 0 and 1. 
Values from 1 to 100 will be assumed percent and divided by 100."; + }else if(a.equals("xmtag") || a.equals("xm")){ + SamLine.MAKE_XM_TAG=Tools.parseBoolean(b); + }else if(a.equals("stoptag")){ + SamLine.MAKE_STOP_TAG=Tools.parseBoolean(b); + }else if(a.equals("parsecustom") || a.equals("fastqparsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + sysout.println("Set FASTQ.PARSE_CUSTOM to "+FASTQ.PARSE_CUSTOM); + }else if(a.equals("reads")){ + reads=Long.parseLong(b); + }else if(a.equals("skipreads")){ + AbstractMapThread.SKIP_INITIAL=Long.parseLong(b); + }else if(a.equals("readlen") || a.equals("length") || a.equals("len")){ + readlen=Integer.parseInt(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ziplevel=Integer.parseInt(b); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("usegzip") || a.equals("gzip")){ + gzip=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){pigz=false;} + else{ + pigz=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{pigz=Tools.parseBoolean(b);} + + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + gunzip=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + unpigz=Tools.parseBoolean(b); + }else if(a.equals("kfilter")){ + KFILTER=Integer.parseInt(b); + }else if(a.equals("msa")){ + MSA_TYPE=b; + }else if(a.equals("bandwidth") || a.equals("bw")){ + int x=Tools.max(0, Integer.parseInt(b)); + MSA.bandwidth=x; + }else if(a.equals("bandwidthratio") || a.equals("bwr")){ + float x=Tools.max(0, Float.parseFloat(b)); + MSA.bandwidthRatio=x; + assert(x>=0) : "Bandwidth ratio should be at least 0."; + }else 
if(a.equals("trim") || a.equals("qtrim")){ + if(b==null){TRIM_RIGHT=TRIM_LEFT=true;} + else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){TRIM_LEFT=true;TRIM_RIGHT=false;} + else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){TRIM_LEFT=false;TRIM_RIGHT=true;} + else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){TRIM_LEFT=TRIM_RIGHT=true;} + else{TRIM_RIGHT=TRIM_LEFT=Tools.parseBoolean(b);} + }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){ + if(b!=null && (b.charAt(0)=='.' || Character.isDigit(b.charAt(0)))){ + TrimRead.optimalMode=true; + TrimRead.optimalBias=Float.parseFloat(b); + assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("trimright")){ + TRIM_RIGHT=Tools.parseBoolean(b); + }else if(a.equals("trimleft")){ + TRIM_LEFT=Tools.parseBoolean(b); + }else if(a.equals("trimq") || a.equals("trimquality")){ + TRIM_QUALITY=Byte.parseByte(b); + }else if(a.equals("q102matrix") || a.equals("q102m")){ + CalcTrueQuality.q102matrix=b; + }else if(a.equals("qbpmatrix") || a.equals("bqpm")){ + CalcTrueQuality.qbpmatrix=b; + }else if(a.equals("loadq102")){ + CalcTrueQuality.q102=Tools.parseBoolean(b); + }else if(a.equals("loadqbp")){ + CalcTrueQuality.qbp=Tools.parseBoolean(b); + }else if(a.equals("loadq10")){ + CalcTrueQuality.q10=Tools.parseBoolean(b); + }else if(a.equals("loadq12")){ + CalcTrueQuality.q12=Tools.parseBoolean(b); + }else if(a.equals("loadqb012")){ + CalcTrueQuality.qb012=Tools.parseBoolean(b); + }else if(a.equals("loadqb234")){ + CalcTrueQuality.qb234=Tools.parseBoolean(b); + }else if(a.equals("loadqp")){ + CalcTrueQuality.qp=Tools.parseBoolean(b); + }else if(a.equals("adjustquality") || a.equals("adjq")){ + TrimRead.ADJUST_QUALITY=Tools.parseBoolean(b); + }else if(a.equals("untrim") || a.equals("outputuntrimmed")){ + UNTRIM=Tools.parseBoolean(b); + }else if(a.equals("eono") || 
a.equals("erroronnooutput")){ + ERROR_ON_NO_OUTPUT=Tools.parseBoolean(b); + }else if(a.equals("log")){ + RefToIndex.LOG=Tools.parseBoolean(b); + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + sysout.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + sysout.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + sysout.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=ReadStats.OVERWRITE=Tools.parseBoolean(b); + sysout.println("Set OVERWRITE to "+OVERWRITE); + }else if(a.equals("sitesonly") || a.equals("outputsitesonly")){ + outputSitesOnly=Tools.parseBoolean(b); + sysout.println("Set outputSitesOnly to "+outputSitesOnly); + }else if(a.equals("discardambiguous") || a.equals("tossambiguous")){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=Tools.parseBoolean(b); + sysout.println("Set REMOVE_DUPLICATE_BEST_ALIGNMENTS to "+REMOVE_DUPLICATE_BEST_ALIGNMENTS); + }else if(a.equals("ambiguous") || a.equals("ambig")){ + if(b==null){ + throw new RuntimeException(arg); + }else if(b.equalsIgnoreCase("keep") || b.equalsIgnoreCase("best") || b.equalsIgnoreCase("first")){ + ambigMode=AMBIG_BEST; + }else if(b.equalsIgnoreCase("all")){ + ambigMode=AMBIG_ALL; + }else if(b.equalsIgnoreCase("random")){ + ambigMode=AMBIG_RANDOM; + }else if(b.equalsIgnoreCase("toss") || b.equalsIgnoreCase("discard") || b.equalsIgnoreCase("remove")){ + ambigMode=AMBIG_TOSS; + }else{ + throw new RuntimeException(arg); + } +// sysout.println("Set REMOVE_DUPLICATE_BEST_ALIGNMENTS to "+REMOVE_DUPLICATE_BEST_ALIGNMENTS); + }else if(a.equals("maxsites")){ + 
MAX_SITESCORES_TO_PRINT=Integer.parseInt(b); + }else if(a.equals("secondary")){ + PRINT_SECONDARY_ALIGNMENTS=Tools.parseBoolean(b); + ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=PRINT_SECONDARY_ALIGNMENTS; + }else if(a.equals("quickmatch")){ + QUICK_MATCH_STRINGS=Tools.parseBoolean(b); + }else if(a.equals("ambiguous2") || a.equals("ambig2")){ + if(b==null){ + throw new RuntimeException(arg); + }else if(b.equalsIgnoreCase("split") || b.equalsIgnoreCase("stream")){ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_SPLIT; + }else if(b.equalsIgnoreCase("keep") || b.equalsIgnoreCase("best") || b.equalsIgnoreCase("first")){ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST; + }else if(b.equalsIgnoreCase("toss") || b.equalsIgnoreCase("discard") || b.equalsIgnoreCase("remove")){ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_TOSS; + }else if(b.equalsIgnoreCase("random")){ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_RANDOM; + }else if(b.equalsIgnoreCase("all")){ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_ALL; + }else{ + throw new RuntimeException(arg); + } + }else if(a.equals("forbidselfmapping")){ + FORBID_SELF_MAPPING=Tools.parseBoolean(b); + sysout.println("Set FORBID_SELF_MAPPING to "+FORBID_SELF_MAPPING); + }else if(a.equals("threads") || a.equals("t")){ + if(b.equalsIgnoreCase("auto")){Shared.SET_THREADS(-1);} + else{Shared.THREADS=Integer.parseInt(b);} + sysout.println("Set threads to "+Shared.THREADS); + }else if(a.equals("samversion") || a.equals("samv") || a.equals("sam")){ + SamLine.VERSION=Float.parseFloat(b); + }else if(a.equals("match") || a.equals("cigar")){ + if(b!=null){b=b.toLowerCase();}else{b="true";} + if(b.equals("long") || b.equals("normal")){ + MAKE_MATCH_STRING=true; + Read.COMPRESS_MATCH_BEFORE_WRITING=false; +// sysout.println("Writing long match strings."); + }else if(b.equals("short") || b.equals("compressed")){ + MAKE_MATCH_STRING=true; + Read.COMPRESS_MATCH_BEFORE_WRITING=true; +// sysout.println("Writing short 
match strings."); + }else{ + MAKE_MATCH_STRING=Tools.parseBoolean(b); + } + + if(MAKE_MATCH_STRING){ + sysout.println("Cigar strings enabled."); + }else{ + sysout.println("Cigar strings disabled."); + } + }else if(a.equals("semiperfectmode")){ + SEMIPERFECTMODE=Tools.parseBoolean(b); + if(ziplevel==-1){ziplevel=2;} + }else if(a.equals("perfectmode")){ + PERFECTMODE=Tools.parseBoolean(b); + if(ziplevel==-1){ziplevel=2;} + }else if(a.equals("trimlist")){ + TRIM_LIST=Tools.parseBoolean(b); + }else if(a.equals("pairedrandom")){ + PAIRED_RANDOM_READS=Tools.parseBoolean(b); + }else if(a.equals("ordered") || a.equals("ord")){ + OUTPUT_ORDERED_READS=Tools.parseBoolean(b); + sysout.println("Set OUTPUT_ORDERED_READS to "+OUTPUT_ORDERED_READS); + }else if(a.equals("outputunmapped")){ + DONT_OUTPUT_UNMAPPED_READS=!Tools.parseBoolean(b); + sysout.println("Set DONT_OUTPUT_UNMAPPED_READS to "+DONT_OUTPUT_UNMAPPED_READS); + }else if(a.equals("outputblacklisted")){ + DONT_OUTPUT_BLACKLISTED_READS=!Tools.parseBoolean(b); + sysout.println("Set DONT_OUTPUT_BLACKLISTED_READS to "+DONT_OUTPUT_BLACKLISTED_READS); + }else if(a.equals("build") || a.equals("genome") || a.equals("index")){ + build=Integer.parseInt(b); + }else if(a.equals("minchrom")){ + minChrom=Integer.parseInt(b); + maxChrom=Tools.max(minChrom, maxChrom); + }else if(a.equals("maxchrom")){ + maxChrom=Byte.parseByte(b); + minChrom=Tools.min(minChrom, maxChrom); + }else if(a.equals("expectedsites")){ + expectedSites=Integer.parseInt(b); + }else if(a.equals("targetsize")){ + targetGenomeSize=Tools.parseKMG(b); + }else if(a.equals("fgte")){ + fractionGenomeToExclude=Float.parseFloat(b); + sysout.println("Set fractionGenomeToExclude to "+String.format("%.4f",fractionGenomeToExclude)); + }else if(a.equals("minratio")){ + MINIMUM_ALIGNMENT_SCORE_RATIO=Float.parseFloat(b); + sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO)); + }else if(a.equals("asciiin") || 
a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + if(b.equalsIgnoreCase("auto")){ + FASTQ.DETECT_QUALITY=true; + }else{ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else{x=Byte.parseByte(b);} + FASTQ.ASCII_OFFSET=x; + sysout.println("Set fastq input ASCII offset to "+FASTQ.ASCII_OFFSET); + FASTQ.DETECT_QUALITY=false; + } + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + if(b.equalsIgnoreCase("auto")){ + FASTQ.DETECT_QUALITY_OUT=true; + }else{ + byte ascii_offset=Byte.parseByte(b); + FASTQ.ASCII_OFFSET_OUT=ascii_offset; + sysout.println("Set fastq output ASCII offset to "+FASTQ.ASCII_OFFSET_OUT); + FASTQ.DETECT_QUALITY_OUT=false; + } + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("rcompmate") || a.equals("reversecomplementmate")){ + rcompMate=Tools.parseBoolean(b); + sysout.println("Set RCOMP_MATE to "+rcompMate); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + TranslateColorspaceRead.verbose=verbose; + AbstractIndex.verbose2=verbose; + }else if(a.equals("verbosestats")){ + if(Character.isDigit(b.charAt(0))){ + verbose_stats=Integer.parseInt(b); + }else{ + verbose_stats=Tools.parseBoolean(b) ? 
9 : 0; + } + }else if(a.equals("maxdellen")){ + maxDelLen=Integer.parseInt(b); + }else if(a.equals("maxinslen")){ + maxInsLen=Integer.parseInt(b); + }else if(a.equals("maxsublen")){ + maxSubLen=Integer.parseInt(b); + }else if(a.equals("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.equals("fastaminread") || a.equals("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("minqual")){ + minQuality=Byte.parseByte(b); + midQuality=Tools.max(minQuality, midQuality); + maxQuality=Tools.max(midQuality, maxQuality); + }else if(a.equals("midqual")){ + midQuality=Byte.parseByte(b); + maxQuality=Tools.max(midQuality, maxQuality); + minQuality=Tools.min(minQuality, midQuality); + }else if(a.equals("maxqual")){ + maxQuality=Byte.parseByte(b); + midQuality=Tools.min(maxQuality, midQuality); + minQuality=Tools.min(minQuality, midQuality); + }else if(a.equals("matelen") || a.equals("pairlen")){ + int x=Integer.parseInt(b); + RandomReads.mateLen=x; + AbstractMapThread.MAX_PAIR_DIST=Tools.max(x, AbstractMapThread.MAX_PAIR_DIST); + }else if(a.equals("s") || a.equals("snps")){ + maxSnps=Integer.parseInt(b); + baseSnpRate=1; + }else if(a.equals("u") || a.equals("subs")){ + maxInss=Integer.parseInt(b); + baseInsRate=1; + }else if(a.equals("d") || a.equals("dels")){ + maxDels=Integer.parseInt(b); + baseDelRate=1; + }else if(a.equals("i") || a.equals("inss")){ + maxSubs=Integer.parseInt(b); + baseSubRate=1; + }else if(a.equals("sequentialoverlap")){ + sequentialOverlap=Integer.parseInt(b); + }else if(a.equals("sequentialstrandalt")){ + sequentialStrandAlt=Tools.parseBoolean(b); + }else if(a.equals("k") || a.equals("keylen")){ + keylen=Integer.parseInt(b); + }else if(a.equals("genscaffoldinfo")){ + 
RefToIndex.genScaffoldInfo=Tools.parseBoolean(b); + }else if(a.equals("loadscaffolds")){ + Data.LOAD_SCAFFOLDS=Tools.parseBoolean(b); + }else if(a.equals("autoRefToIndex.chrombits")){ + if("auto".equalsIgnoreCase(b)){RefToIndex.AUTO_CHROMBITS=true;} + else{RefToIndex.AUTO_CHROMBITS=Tools.parseBoolean(b);} + }else if(a.equals("RefToIndex.chrombits") || a.equals("cbits")){ + if("auto".equalsIgnoreCase(b)){RefToIndex.AUTO_CHROMBITS=true;} + else{ + RefToIndex.AUTO_CHROMBITS=false; + RefToIndex.chrombits=Integer.parseInt(b); + } + }else if(a.equals("requirecorrectstrand") || a.equals("rcs")){ + REQUIRE_CORRECT_STRANDS_PAIRS=Tools.parseBoolean(b); + }else if(a.equals("samestrandpairs") || a.equals("ssp")){ + SAME_STRAND_PAIRS=Tools.parseBoolean(b); + if(SAME_STRAND_PAIRS){sysout.println("Warning! SAME_STRAND_PAIRS=true mode is not fully tested.");} + }else if(a.equals("killbadpairs") || a.equals("kbp")){ + KILL_BAD_PAIRS=Tools.parseBoolean(b); + }else if(a.equals("pairedonly") || a.equals("po")){ + AbstractMapThread.OUTPUT_PAIRED_ONLY=Tools.parseBoolean(b); + }else if(a.equals("mdtag") || a.equals("md")){ + SamLine.MAKE_MD_TAG=Tools.parseBoolean(b); + }else if(a.equals("tophat")){ + if(Tools.parseBoolean(b)){ + SamLine.MAKE_TOPHAT_TAGS=true; + FastaReadInputStream.FAKE_QUALITY=true; + FastaReadInputStream.FAKE_QUALITY_LEVEL=40; + SamLine.MAKE_MD_TAG=true; + } + }else if(a.equals("xstag") || a.equals("xs")){ + SamLine.MAKE_XS_TAG=true; + if(b!=null){ + b=b.toLowerCase(); + if(b.startsWith("fr-")){b=b.substring(3);} + if(b.equals("ss") || b.equals("secondstrand")){ + SamLine.XS_SECONDSTRAND=true; + }else if(b.equals("fs") || b.equals("firststrand")){ + SamLine.XS_SECONDSTRAND=false; + }else if(b.equals("us") || b.equals("unstranded")){ + SamLine.XS_SECONDSTRAND=false; + }else{ + SamLine.MAKE_XS_TAG=Tools.parseBoolean(b); + } + } + setxs=true; + }else if(a.equals("intronlen") || a.equals("intronlength")){ + SamLine.INTRON_LIMIT=Integer.parseInt(b); + setintron=true; + 
}else if(a.equals("sortscaffolds")){ + SamLine.SORT_SCAFFOLDS=Tools.parseBoolean(b); + }else if(a.equals("customtag")){ + SamLine.MAKE_CUSTOM_TAGS=Tools.parseBoolean(b); + }else if(a.equals("idmodulo") || a.equals("idmod")){ + idmodulo=Integer.parseInt(b); + }else if(a.equals("samplerate")){ + samplerate=Float.parseFloat(b); + assert(samplerate<=1f && samplerate>=0f) : "samplerate="+samplerate+"; should be between 0 and 1"; + }else if(a.equals("sampleseed")){ + sampleseed=Long.parseLong(b); + }else if(a.equals("minhits") || a.equals("minapproxhits")){ + minApproxHits=Integer.parseInt(b); + }else if(a.equals("maxindel")){ + maxIndel1=Tools.max(0, Integer.parseInt(b)); + maxIndel2=2*maxIndel1; + }else if(a.equals("maxindel1") || a.equals("maxindelsingle")){ + maxIndel1=Tools.max(0, Integer.parseInt(b)); + maxIndel2=Tools.max(maxIndel1, maxIndel2); + }else if(a.equals("maxindel2") || a.equals("maxindelsum")){ + maxIndel2=Tools.max(0, Integer.parseInt(b)); + maxIndel1=Tools.min(maxIndel1, maxIndel2); + }else if(a.equals("strictmaxindel")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + maxIndel1=Tools.max(0, Integer.parseInt(b)); + maxIndel2=2*maxIndel1; + STRICT_MAX_INDEL=true; + }else{ + STRICT_MAX_INDEL=Tools.parseBoolean(b); + } + }else if(a.equals("padding")){ + SLOW_ALIGN_PADDING=Integer.parseInt(b); + SLOW_RESCUE_PADDING=SLOW_ALIGN_PADDING; + }else if(a.equals("rescue")){ + RESCUE=Tools.parseBoolean(b); + }else if(a.equals("tipsearch")){ + TIP_SEARCH_DIST=Tools.max(0, Integer.parseInt(b)); + }else if(a.equals("dper") || a.equals("dprr")){ + DOUBLE_PRINT_ERROR_RATE=Tools.parseBoolean(b); + }else if(a.equals("chromc")){ + Data.CHROMC=Tools.parseBoolean(b); + }else if(a.equals("chromgz")){ + Data.CHROMGZ=Tools.parseBoolean(b); + }else if(a.equals("nodisk")){ + RefToIndex.NODISK=Tools.parseBoolean(b); + }else if(a.equals("maxchromlen")){ + RefToIndex.maxChromLen=Long.parseLong(b); + }else if(a.equals("minscaf") || a.equals("mincontig")){ + 
RefToIndex.minScaf=Integer.parseInt(b); + }else if(a.equals("midpad")){ + RefToIndex.midPad=Integer.parseInt(b); + }else if(a.equals("startpad")){ + RefToIndex.startPad=Integer.parseInt(b); + }else if(a.equals("stoppad")){ + RefToIndex.stopPad=Integer.parseInt(b); + }else if(a.equals("forceanalyze")){ + forceanalyze=Tools.parseBoolean(b); + }else if(a.equals("machineoutput") || a.equals("machineout")){ + MACHINE_OUTPUT=Tools.parseBoolean(b); + }else if(a.equals("showprogress")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + long x=Tools.max(1, Long.parseLong(b)); + ConcurrentGenericReadInputStream.PROGRESS_INCR=x; + ConcurrentGenericReadInputStream.SHOW_PROGRESS=(x>0); + }else{ + ConcurrentGenericReadInputStream.SHOW_PROGRESS=Tools.parseBoolean(b); + } + }else if(a.equals("scafstats") || a.equals("scaffoldstats")){ + if(b==null && arg.indexOf('=')<0){b="stdout";} + if(b==null || b.equalsIgnoreCase("false") || b.equalsIgnoreCase("f") || b.equalsIgnoreCase("none") || b.equalsIgnoreCase("null")){ + BBSplitter.TRACK_SCAF_STATS=false; + BBSplitter.SCAF_STATS_FILE=null; + sysout.println("No file specified; not tracking scaffold statistics."); + }else{ + BBSplitter.TRACK_SCAF_STATS=true; + BBSplitter.SCAF_STATS_FILE=b; + sysout.println("Scaffold statistics will be written to "+b); + } + }else if(a.equals("setstats") || a.equals("refstats")){ + if(b==null && arg.indexOf('=')<0){b="stdout";} + if(b==null || b.equalsIgnoreCase("false") || b.equalsIgnoreCase("f") || b.equalsIgnoreCase("none") || b.equalsIgnoreCase("null")){ + BBSplitter.TRACK_SET_STATS=false; + BBSplitter.SET_STATS_FILE=null; + sysout.println("No file specified; not tracking reference set statistics."); + }else{ + BBSplitter.TRACK_SET_STATS=true; + BBSplitter.SET_STATS_FILE=b; + sysout.println("Reference set statistics will be written to "+b); + } + }else if(a.equals("camelwalk")){ + AbstractIndex.USE_CAMELWALK=Tools.parseBoolean(b); + }else if(a.equals("usequality") || a.equals("uq")){ + 
AbstractIndex.GENERATE_KEY_SCORES_FROM_QUALITY=AbstractIndex.GENERATE_BASE_SCORES_FROM_QUALITY=Tools.parseBoolean(b); + }else if(a.equals("keepbadkeys") || a.equals("kbk")){ + KeyRing.KEEP_BAD_KEYS=Tools.parseBoolean(b); + }else if(i>1){ + throw new RuntimeException("Unknown parameter: "+arg); + } + } + + if(TrimRead.ADJUST_QUALITY){CalcTrueQuality.initializeMatrices();} + } + + /** First-pass scan of the command-line arguments. Redirects sysout and Data.sysout to System.err when a value is stdout or starts with stdout. (or when printtoerr parses as true), sets SYSIN when a value is stdin or starts with stdin., and handles the fast and slow flags, which switch each other off. */ private final void preparse0(String[] args){ + /* NOTE(review): the loop header and the code that splits each argument into key a and value b are missing from this copy (text beginning at a less-than sign was stripped); restore them from the upstream file before compiling. */ for(int i=0; i1 ? split[1].toLowerCase() : null; + /* A value spelled null in any letter case is treated as no value. */ if("null".equalsIgnoreCase(b)){b=null;} + /* The value is checked before the key here, so any argument whose value names stdout triggers the redirect. */ if(b!=null && (b.equals("stdout") || b.startsWith("stdout."))){ + sysout=System.err; + Data.sysout=System.err; + }else if(a.equals("printtoerr")){ + if(Tools.parseBoolean(b)){sysout=System.err; Data.sysout=System.err;} + }else if(b!=null && (b.equals("stdin") || b.startsWith("stdin."))){ + SYSIN=true; + }else if(a.equals("fast")){ + fast=Tools.parseBoolean(b); + if(fast){slow=false;} + /* The consumed flag is nulled out in the caller-owned array. */ args[i]=null; + }else if(a.equals("slow")){ + slow=Tools.parseBoolean(b); + if(slow){fast=false;} + args[i]=null; + } + } + } + + /** Formats value with the requested number of decimal places via String.format and computes a target width: 3 when places is below 1, otherwise 4 plus places. NOTE(review): the body of the padding loop and the return statement are missing from this copy, and the code that follows the truncated while condition belongs to a different method whose header was also lost. Presumably the formatted text is left-padded to the target width - confirm against the upstream file. */ static final String padPercent(double value, int places){ + String x=String.format("%."+places+"f", value); + int desired=3+(places<1 ? 0 : 1+places); + while(x.length()0){ + System.err.println("\n\n**************************************************************************\n\n" + + "Warning! "+broken+" mapping thread"+(broken==1 ? 
"" : "s")+" did not terminate normally.\n" + + "Please check the error log; the output may be corrupt or incomplete.\n\n" + + "**************************************************************************\n\n"); + } + return broken; + } + + /** Passes the input stream and the four output streams to ReadWrite.closeStreams, then passes every stream held in BBSplitter.streamTable and BBSplitter.streamTableAmbiguous to ReadWrite.closeStream. Each result is OR-ed into the shared errorState flag, so the returned value also reflects any error recorded before this call. @return the accumulated errorState */ static final boolean closeStreams(ConcurrentReadStreamInterface cris, RTextOutputStream3 rosA, RTextOutputStream3 rosM, RTextOutputStream3 rosU, RTextOutputStream3 rosB){ + errorState|=ReadWrite.closeStreams(cris, rosA, rosM, rosU, rosB); + /* Either table may be null, in which case it is skipped. */ if(BBSplitter.streamTable!=null){ + for(RTextOutputStream3 tros : BBSplitter.streamTable.values()){ + errorState|=ReadWrite.closeStream(tros); + } + } + if(BBSplitter.streamTableAmbiguous!=null){ + for(RTextOutputStream3 tros : BBSplitter.streamTableAmbiguous.values()){ + errorState|=ReadWrite.closeStream(tros); + } + } + return errorState; + } + + /** Builds the concurrent read input stream for in1 and, where the branch supports it, in2. The stream implementation is chosen from the FileFormat detected for in1. Asserts that in1 is non-null and does not equal in2 ignoring case, and copies the colorspace flag into BBIndex.COLORSPACE. Depending on the branch taken it may also overwrite the shared fields reads, colorspace and useRandomReads. @param in1 primary input name, must not be null @param in2 second input name, may be null @return the stream created by the matching branch @throws RuntimeException when the format of in1 matches none of the branches */ static final ConcurrentReadStreamInterface getReadInputStream(String in1, String in2){ + + assert(in1!=null); + assert(!in1.equalsIgnoreCase(in2)) : in1+", "+in2; + + BBIndex.COLORSPACE=colorspace; + final ConcurrentReadStreamInterface cris; + + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, 0, 0, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, 0, 0, true, true); + + /* Sequential source: a negative reads count is replaced by Long.MAX_VALUE before the stream is built. */ if(ff1.sequential()){ + if(reads<0){reads=Long.MAX_VALUE;} +// assert(false) : trials; + SequentialReadInputStream ris=new SequentialReadInputStream(reads, readlen, Tools.max(50, readlen/2), sequentialOverlap, sequentialStrandAlt); + cris=new ConcurrentReadInputStream(ris, reads); + + /* csfasta input switches colorspace on; the companion quality file name is derived by replacing .csfasta with .qual. */ }else if(ff1.csfasta()){ + colorspace=true; + BBIndex.COLORSPACE=colorspace; + + if(in2!=null){ + cris=new ConcurrentSolidInputStream(in1, in1.replace(".csfasta", ".qual"), in2, in2.replace(".csfasta", ".qual"), reads); + }else{ + cris=new ConcurrentSolidInputStream(in1, in1.replace(".csfasta", ".qual"), reads, null); + } + }else if(ff1.fastq()){ + FastqReadInputStream fris1=new FastqReadInputStream(ff1, colorspace); + FastqReadInputStream fris2=(ff2==null ? 
null : new FastqReadInputStream(ff2, colorspace)); + cris=new ConcurrentGenericReadInputStream(fris1, fris2, reads); + + /* SAM or BAM input: only one stream is opened and null is passed as the second stream, even if in2 was given. */ }else if(ff1.samOrBam()){ + + SamReadInputStream fris1=new SamReadInputStream(ff1, colorspace, false, FASTQ.FORCE_INTERLEAVED); + cris=new ConcurrentGenericReadInputStream(fris1, null, reads); + + /* Fasta input: the third constructor argument for the first stream is true only when FORCE_INTERLEAVED is set and there is no second file. NOTE(review): the first stream receives a literal false as its second argument where the second stream receives colorspace - confirm the asymmetry is intended. */ }else if(ff1.fasta()){ + + FastaReadInputStream fris1=new FastaReadInputStream(ff1, false, (FASTQ.FORCE_INTERLEAVED && ff2==null), ff2==null ? Shared.READ_BUFFER_MAX_DATA : -1); + FastaReadInputStream fris2=(ff2==null ? null : new FastaReadInputStream(ff2, colorspace, false, -1)); + cris=new ConcurrentGenericReadInputStream(fris1, fris2, reads); + + }else if(ff1.bread()){ + + RTextInputStream rtis=new RTextInputStream(in1, in2, reads); + cris=new ConcurrentReadInputStream(rtis, reads); + + + /* Random source: records that random reads are in use, asserts a positive readlen, and forwards the configured mutation limits, rates, chromosome range and quality bounds to the generator. */ }else if(ff1.random()){ + + useRandomReads=true; + assert(readlen>0); + + RandomReads.PERFECT_READ_RATIO=PERFECT_READ_RATIO; + + RandomReadInputStream ris=new RandomReadInputStream(reads, readlen, + maxSnps, maxInss, maxDels, maxSubs, + baseSnpRate, baseInsRate, baseDelRate, baseSubRate, + maxInsLen, maxDelLen, maxSubLen, + minChrom, maxChrom, colorspace, PAIRED_RANDOM_READS, + minQuality, midQuality, maxQuality); + cris=new ConcurrentReadInputStream(ris, reads); + }else{ + throw new RuntimeException("Can't determine read input source: ff1="+ff1+", ff2="+ff2); + } + return cris; + } + + + /** Prints mapping statistics gathered from the mapping threads. When MACHINE_OUTPUT is set the work is handed to printOutput_Machine and this method returns immediately; otherwise the counters declared below start at zero for accumulation. NOTE(review): the printOutput_Machine call passes a literal false as its last argument rather than SKIMMER - confirm this is intended. */ static void printOutput(final AbstractMapThread[] mtts, final Timer t, final int keylen, final boolean paired, final boolean SKIMMER){ + if(MACHINE_OUTPUT){ + printOutput_Machine(mtts, t, keylen, paired, false); + return; + } + long msaIterationsLimited=0; + long msaIterationsUnlimited=0; + + long basesUsed=0; + long basesAtQuickmap=0; + long keysUsed=0; + + long syntheticReads=0; + long numMated=0; + long badPairs=0; + long innerLengthSum=0; + long outerLengthSum=0; + long insertSizeSum=0; + + long callsToScore=0; + long callsToExtend=0; + long initialKeys=0; + long 
initialKeyIterations=0; + long usedKeys=0; + long usedKeyIterations=0; + + long[] hist_hits=new long[41]; + long[] hist_hits_score=new long[41]; + long[] hist_hits_extend=new long[41]; + + long initialSiteSum1=0; + long postTrimSiteSum1=0; + long postRescueSiteSum1=0; + long siteSum1=0; + long topSiteSum1=0; + + long matchCountS1=0; + long matchCountI1=0; + long matchCountD1=0; + long matchCountM1=0; + long matchCountN1=0; + + + long mapped1=0; + long mappedRetained1=0; + long rescuedP1=0; + long rescuedM1=0; + long truePositiveP1=0; + long truePositiveM1=0; + long falsePositive1=0; + long totalCorrectSites1=0; + long firstSiteCorrectP1=0; + long firstSiteCorrectM1=0; + long firstSiteIncorrect1=0; + long firstSiteCorrectLoose1=0; + long firstSiteIncorrectLoose1=0; + long firstSiteCorrectPaired1=0; + long firstSiteCorrectSolo1=0; + long firstSiteCorrectRescued1=0; + long perfectHit1=0; //Highest score is max score + long uniqueHit1=0; //Only one hit has highest score + long correctUniqueHit1=0; //unique highest hit on answer site + long correctMultiHit1=0; //non-unique highest hit on answer site (non-skimmer only) + long correctLowHit1=0; //hit on answer site, but not highest scorer + long noHit1=0; + long perfectMatch1=0; //Highest slow score is max slow score + long semiperfectMatch1=0; + long perfectHitCount1=0; + long semiPerfectHitCount1=0; + long duplicateBestAlignment1=0; + + long totalNumCorrect1=0; //Only for skimmer + long totalNumIncorrect1=0; //Only for skimmer + long totalNumIncorrectPrior1=0; //Only for skimmer + long totalNumCapturedAllCorrect1=0; //Only for skimmer + long totalNumCapturedAllCorrectTop1=0; //Only for skimmer + long totalNumCapturedAllCorrectOnly1=0; //Only for skimmer + + long initialSiteSum2=0; + long postTrimSiteSum2=0; + long postRescueSiteSum2=0; + long siteSum2=0; + long topSiteSum2=0; + + long mapped2=0; + long mappedRetained2=0; + long rescuedP2=0; + long rescuedM2=0; + long truePositiveP2=0; + long truePositiveM2=0; + long 
falsePositive2=0; + long totalCorrectSites2=0; + long firstSiteCorrectP2=0; + long firstSiteCorrectM2=0; + long firstSiteIncorrect2=0; + long firstSiteCorrectLoose2=0; + long firstSiteIncorrectLoose2=0; + long firstSiteCorrectPaired2=0; + long firstSiteCorrectSolo2=0; + long firstSiteCorrectRescued2=0; + long perfectHit2=0; //Highest score is max score + long perfectHitCount2=0; + long semiPerfectHitCount2=0; + + long uniqueHit2=0; //Only one hit has highest score + long correctUniqueHit2=0; //unique highest hit on answer site + long correctMultiHit2=0; //non-unique highest hit on answer site (non-skimmer only) + long correctLowHit2=0; //hit on answer site, but not highest scorer + long noHit2=0; + long perfectMatch2=0; //Highest slow score is max slow score + long semiperfectMatch2=0; + long duplicateBestAlignment2=0; + + long totalNumCorrect2=0; //Only for skimmer + long totalNumIncorrect2=0; //Only for skimmer + long totalNumIncorrectPrior2=0; //Only for skimmer + long totalNumCapturedAllCorrect2=0; //Only for skimmer + long totalNumCapturedAllCorrectTop2=0; //Only for skimmer + long totalNumCapturedAllCorrectOnly2=0; //Only for skimmer + + long matchCountS2=0; + long matchCountI2=0; + long matchCountD2=0; + long matchCountM2=0; + long matchCountN2=0; + + readsUsed=0; + for(int i=0; i "+falsePositive); + totalCorrectSites1+=mtt.totalCorrectSites1; + + firstSiteCorrectP1+=mtt.firstSiteCorrectP1; + firstSiteCorrectM1+=mtt.firstSiteCorrectM1; + firstSiteIncorrect1+=mtt.firstSiteIncorrect1; + firstSiteCorrectLoose1+=mtt.firstSiteCorrectLoose1; + firstSiteIncorrectLoose1+=mtt.firstSiteIncorrectLoose1; + firstSiteCorrectPaired1+=mtt.firstSiteCorrectPaired1; + firstSiteCorrectSolo1+=mtt.firstSiteCorrectSolo1; + firstSiteCorrectRescued1+=mtt.firstSiteCorrectRescued1; + + perfectHit1+=mtt.perfectHit1; //Highest score is max score + perfectHitCount1+=mtt.perfectHitCount1; + semiPerfectHitCount1+=mtt.semiPerfectHitCount1; + uniqueHit1+=mtt.uniqueHit1; //Only one hit has 
highest score + correctUniqueHit1+=mtt.correctUniqueHit1; //unique highest hit on answer site + correctMultiHit1+=mtt.correctMultiHit1; //non-unique highest hit on answer site + correctLowHit1+=mtt.correctLowHit1; //hit on answer site, but not highest scorer + noHit1+=mtt.noHit1; + + totalNumCorrect1+=mtt.totalNumCorrect1; //Skimmer only + totalNumIncorrect1+=mtt.totalNumIncorrect1; //Skimmer only + totalNumIncorrectPrior1+=mtt.totalNumIncorrectPrior1; //Skimmer only + totalNumCapturedAllCorrect1+=mtt.totalNumCapturedAllCorrect1; //Skimmer only + totalNumCapturedAllCorrectTop1+=mtt.totalNumCapturedAllCorrectTop1; //Skimmer only + totalNumCapturedAllCorrectOnly1+=mtt.totalNumCapturedAllCorrectOnly1; //Skimmer only + + perfectMatch1+=mtt.perfectMatch1; //Highest slow score is max slow score + semiperfectMatch1+=mtt.semiperfectMatch1; //A semiperfect mapping was found + + duplicateBestAlignment1+=mtt.ambiguousBestAlignment1; + + initialSiteSum1+=mtt.initialSiteSum1; + postTrimSiteSum1+=mtt.postTrimSiteSum1; + postRescueSiteSum1+=mtt.postRescueSiteSum1; + siteSum1+=mtt.siteSum1; + topSiteSum1+=mtt.topSiteSum1; + + AbstractIndex index=mtt.index(); + callsToScore+=index.callsToScore; + callsToExtend+=index.callsToExtendScore; + initialKeys+=index.initialKeys; + initialKeyIterations+=index.initialKeyIterations; + usedKeys+=index.usedKeys; + usedKeyIterations+=index.usedKeyIterations; + + for(int j=0; j "+falsePositive); + totalCorrectSites2+=mtt.totalCorrectSites2; + + firstSiteCorrectP2+=mtt.firstSiteCorrectP2; + firstSiteCorrectM2+=mtt.firstSiteCorrectM2; + firstSiteIncorrect2+=mtt.firstSiteIncorrect2; + firstSiteCorrectLoose2+=mtt.firstSiteCorrectLoose2; + firstSiteIncorrectLoose2+=mtt.firstSiteIncorrectLoose2; + firstSiteCorrectPaired2+=mtt.firstSiteCorrectPaired2; + firstSiteCorrectSolo2+=mtt.firstSiteCorrectSolo2; + firstSiteCorrectRescued2+=mtt.firstSiteCorrectRescued2; + + perfectHit2+=mtt.perfectHit2; //Highest score is max score + 
perfectHitCount2+=mtt.perfectHitCount2; + semiPerfectHitCount2+=mtt.semiPerfectHitCount2; + uniqueHit2+=mtt.uniqueHit2; //Only one hit has highest score + correctUniqueHit2+=mtt.correctUniqueHit2; //unique highest hit on answer site + correctMultiHit2+=mtt.correctMultiHit2; //non-unique highest hit on answer site + correctLowHit2+=mtt.correctLowHit2; //hit on answer site, but not highest scorer + noHit2+=mtt.noHit2; + + totalNumCorrect2+=mtt.totalNumCorrect2; //Skimmer only + totalNumIncorrect2+=mtt.totalNumIncorrect2; //Skimmer only + totalNumIncorrectPrior2+=mtt.totalNumIncorrectPrior2; //Skimmer only + totalNumCapturedAllCorrect2+=mtt.totalNumCapturedAllCorrect2; //Skimmer only + totalNumCapturedAllCorrectTop2+=mtt.totalNumCapturedAllCorrectTop2; //Skimmer only + totalNumCapturedAllCorrectOnly2+=mtt.totalNumCapturedAllCorrectOnly2; //Skimmer only + + perfectMatch2+=mtt.perfectMatch2; //Highest slow score is max slow score + semiperfectMatch2+=mtt.semiperfectMatch2; //A semiperfect mapping was found + + duplicateBestAlignment2+=mtt.ambiguousBestAlignment2; + + initialSiteSum2+=mtt.initialSiteSum2; + postTrimSiteSum2+=mtt.postTrimSiteSum2; + postRescueSiteSum2+=mtt.postRescueSiteSum2; + siteSum2+=mtt.siteSum2; + topSiteSum2+=mtt.topSiteSum2; + + matchCountS2+=mtt.matchCountS2; + matchCountI2+=mtt.matchCountI2; + matchCountD2+=mtt.matchCountD2; + matchCountM2+=mtt.matchCountM2; + matchCountN2+=mtt.matchCountN2; + + } + reads=readsUsed; + if(syntheticReads>0){SYNTHETIC=true;} + + t.stop(); + long nanos=t.elapsed; + + if(verbose_stats>1){ + StringBuilder sb=new StringBuilder(1000); + sb.append("\n\n###################\n#hits\tcount\tscore\textend\n"); + for(int i=0; i=1){sysout.println("MSA iterations: \t"+String.format("%.2fL + %.2fU = %.2f", milf,milu,milf+milu));} + + sysout.println(); + sysout.println("\nRead 1 data:"); + if(verbose_stats>=1){ + if(avgInitialKeys>0){sysout.println(String.format("Avg Initial Keys: \t"+(avgInitialKeys<100?" 
":"")+"%.3f", + avgInitialKeys));} + if(avgUsedKeys>0){sysout.println(String.format("Avg Used Keys: \t"+(avgUsedKeys<100?" ":"")+"%.3f", + avgUsedKeys));} + if(avgCallsToScore>0){sysout.println(String.format("Avg Calls to Score: \t"+(avgCallsToScore<100?" ":"")+"%.3f", + avgCallsToScore));} + if(avgCallsToExtendScore>0){sysout.println(String.format("Avg Calls to Extend:\t"+(avgCallsToExtendScore<100?" ":"")+"%.3f", + avgCallsToExtendScore));} + sysout.println(); + + sysout.println(String.format("Avg Initial Sites: \t"+(avgInitialSites<10?" ":"")+"%.3f", avgInitialSites)); + if(TRIM_LIST){sysout.println(String.format("Avg Post-Trim: \t"+(avgPostTrimSites<10?" ":"")+"%.3f", avgPostTrimSites));} + if(paired){sysout.println(String.format("Avg Post-Rescue: \t"+(avgPostRescueSites<10?" ":"")+"%.3f", avgPostRescueSites));} + sysout.println(String.format("Avg Final Sites: \t"+(avgSites<10?" ":"")+"%.3f", avgSites)); + sysout.println(String.format("Avg Top Sites: \t"+(avgTopSites<10?" ":"")+"%.3f", avgTopSites)); + if(verbose_stats>1){ + sysout.println(String.format("Avg Perfect Sites: \t"+(avgPerfectSites<10?" ":"")+"%.3f \t"+ + (perfectHitCountPercent<10?" ":"")+"%.3f%%", avgPerfectSites, perfectHitCountPercent)); + sysout.println(String.format("Avg Semiperfect Sites:\t"+(avgSemiPerfectSites<10?" ":"")+"%.3f \t"+ + (semiPerfectHitCountPercent<10?" ":"")+"%.3f%%", avgSemiPerfectSites, semiPerfectHitCountPercent)); + } + + if(SYNTHETIC){ + sysout.println(String.format("Avg Correct Sites: \t"+(avgNumCorrect<10?" ":"")+"%.3f", avgNumCorrect)); + if(SKIMMER){ + sysout.println(String.format("Avg Incorrect Sites:\t"+(avgNumIncorrect<10?" ":"")+"%.3f", avgNumIncorrect)); + sysout.println(String.format("Avg IncorrectP Sites:\t"+(avgNumIncorrectPrior<10?" 
":"")+"%.3f", avgNumIncorrectPrior)); + } + } + } + + sysout.println(); +// sysout.println(String.format("perfectHit: \t%.2f", perfectHitPercent)+"%"); +// sysout.println(String.format("uniqueHit: \t%.2f", uniqueHitPercent)+"%"); +// sysout.println(String.format("correctUniqueHit:\t%.2f", correctUniqueHitPercent)+"%"); +//// sysout.println(String.format("correctMultiHit: \t%.2f", correctMultiHitPercent)+"%"); +// sysout.println(String.format("correctHighHit: \t%.2f", correctHighHitPercent)+"%"); +// sysout.println(String.format("correctHit: \t%.2f", correctHitPercent)+"%"); + + //sysout.println(String.format("mapped: \t"+(mappedB<10?" ":"")+"%.3f", mappedB)+"%"); + if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){ + double x=ambiguousFound+mappedRetainedB; + sysout.println("mapped: \t"+padPercent(x,4)+"%"+"\t"+mappedReads+" reads"); + sysout.println("unambiguous: \t"+padPercent(mappedRetainedB,4)+"%"+"\t"+unambiguousReads+" reads"); + }else{ + double x=mappedRetainedB-ambiguousFound; + sysout.println("mapped: \t"+padPercent(mappedRetainedB,4)+"%"+"\t"+mappedReads+" reads"); + sysout.println("unambiguous: \t"+padPercent(x,4)+"%"+"\t"+unambiguousReads+" reads"); + } + if(SYNTHETIC){ + sysout.println(String.format("true positive: \t"+((truePositiveStrict)<10?" ":"")+"%.4f%%\t(loose: "+(truePositiveLoose<10?" ":"")+"%.4f%%)", + truePositiveStrict, truePositiveLoose)); + sysout.println(String.format("false positive: \t"+(falsePositiveB<10?" ":"")+"%.4f%%\t(loose: "+(falsePositiveLooseB<10?" ":"")+"%.4f%%)", + falsePositiveB, falsePositiveLooseB)); + sysout.println(String.format("SNR: \t"+(snrStrict<10 && snrStrict>=0 ?" ":"")+"%.4f \t(loose: "+(snrLoose<10&&snrLoose>=0?" ":"")+"%.4f)", + snrStrict, snrLoose)); + if(verbose_stats>0){sysout.println(String.format("Plus/Minus ratio:\t %1.4f", truePositivePMRatio));} + + if(SKIMMER){ + sysout.println(String.format("found all correct:\t"+(rateCapturedAllCorrect<10?" 
":"")+"%.3f", rateCapturedAllCorrect)+"%"); + sysout.println(String.format("all correct top: \t"+(rateCapturedAllTop<10?" ":"")+"%.3f", rateCapturedAllTop)+"%"); + sysout.println(String.format("all correct only: \t"+(rateCapturedAllOnly<10?" ":"")+"%.3f", rateCapturedAllOnly)+"%"); + } + } + + sysout.println(); + if(paired){ + sysout.println(String.format("Mated pairs: \t"+(matedPercent<10?" ":"")+"%.4f", matedPercent)+"%"); + if(SYNTHETIC){ + sysout.println(String.format("correct pairs: \t"+(truePositivePairedB<10?" ":"")+"%.3f", truePositivePairedB)+"% (of mated)"); + } + sysout.println(String.format("bad pairs: \t"+(badPairsPercent<10?" ":"")+"%.3f", badPairsPercent)+"% (of all reads)"); + } + if(SYNTHETIC){ + sysout.println(String.format("correct singles: \t"+(truePositiveSoloB<10?" ":"")+"%.4f", truePositiveSoloB)+"%"); + } + if(paired){ + sysout.println(String.format("rescued: \t"+(rescuedPB+rescuedMB<10?" ":"")+"%.3f", rescuedPB+rescuedMB)+"%"); +// sysout.println(String.format("rescued +: \t%.3f", rescuedPB)+"%"); +// sysout.println(String.format("rescued -: \t%.3f", rescuedMB)+"%"); + if(SYNTHETIC){ + sysout.println(String.format("correct rescued: \t"+(truePositiveRescuedB<10?" ":"")+"%.3f", truePositiveRescuedB)+"%"); + } + sysout.println(String.format("avg insert size: \t%.2f", insertSizeAvg)); + if(verbose_stats>=1){ + sysout.println(String.format("avg inner length:\t%.2f", innerLengthAvg)); + sysout.println(String.format("avg insert size: \t%.2f", outerLengthAvg)); + } + } + sysout.println(); + sysout.println(String.format("perfect best site:\t"+(perfectMatchPercent<10?" ":"")+"%.4f", perfectMatchPercent)+"%"); + sysout.println(String.format("semiperfect site:\t"+(semiperfectMatchPercent<10?" ":"")+"%.4f", semiperfectMatchPercent)+"%"); + sysout.println(String.format("ambiguousMapping:\t"+(ambiguousFound<10?" ":"")+"%.4f", ambiguousFound)+"%\t"+ + (REMOVE_DUPLICATE_BEST_ALIGNMENTS ? 
"(Removed)" : "(Kept)")); + sysout.println(String.format("low-Q discards: \t"+(lowQualityReadsDiscardedPercent<10?" ":"")+"%.4f", + lowQualityReadsDiscardedPercent)+"%"); + if(SYNTHETIC){ + sysout.println(String.format("false negative: \t"+(noHitPercent<10?" ":"")+"%.4f", noHitPercent)+"%"); + sysout.println(String.format("correctLowHit: \t"+(correctLowHitPercent<10?" ":"")+"%.4f", correctLowHitPercent)+"%"); + } + + if(MAKE_MATCH_STRING){ + sysout.println(); + sysout.println("Match Rate: \t"+padPercent(matchRate,4)+"% \t"+matchCountM1); + sysout.println("Error Rate: \t"+padPercent(errorRate,4)+"% \t"+matchErrors); + sysout.println("Sub Rate: \t"+padPercent(subRate,4)+"% \t"+matchCountS1); + sysout.println("Del Rate: \t"+padPercent(delRate,4)+"% \t"+matchCountD1); + sysout.println("Ins Rate: \t"+padPercent(insRate,4)+"% \t"+matchCountI1); + sysout.println("N Rate: \t"+padPercent(nRate,4)+"% \t"+matchCountN1); + + if(DOUBLE_PRINT_ERROR_RATE){ + System.err.println(); + System.err.println(String.format("Match Rate: \t"+(matchRate<10?" ":"")+"%.4f", matchRate)+"% \t"+matchCountM1); + System.err.println(String.format("Error Rate: \t"+(errorRate<10?" ":"")+"%.4f", errorRate)+"% \t"+matchErrors); + System.err.println(String.format("Sub Rate: \t"+(subRate<10?" ":"")+"%.4f", subRate)+"% \t"+matchCountS1); + System.err.println(String.format("Del Rate: \t"+(delRate<10?" ":"")+"%.4f", delRate)+"% \t"+matchCountD1); + System.err.println(String.format("Ins Rate: \t"+(insRate<10?" ":"")+"%.4f", insRate)+"% \t"+matchCountI1); + System.err.println(String.format("N Rate: \t"+(nRate<10?" 
":"")+"%.4f", nRate)+"% \t"+matchCountN1); + } + } + + if(paired){ + invSites100=100d/siteSum2; + + perfectHitPercent=perfectHit2*invTrials100; //Highest score is max score + perfectMatchPercent=perfectMatch2*invTrials100; + semiperfectMatchPercent=semiperfectMatch2*invTrials100; + + perfectHitCountPercent=perfectHitCount2*invSites100; + semiPerfectHitCountPercent=semiPerfectHitCount2*invSites100; + + uniqueHitPercent=uniqueHit2*invTrials100; //Only one hit has highest score + correctUniqueHitPercent=correctUniqueHit2*invTrials100; //unique highest hit on answer site + correctMultiHitPercent=correctMultiHit2*invTrials100; //non-unique highest hit on answer site + correctLowHitPercent=correctLowHit2*invTrials100; //hit on answer site, but not highest scorer + ambiguousFound=(duplicateBestAlignment2*invTrials100); + correctHighHitPercent=(correctMultiHit2+correctUniqueHit2)*invTrials100; + correctHitPercent=(correctLowHit2+correctMultiHit2+correctUniqueHit2)*invTrials100; + + mappedB=(mapped2*invTrials100); + mappedRetainedB=(mappedRetained2*invTrials100); + rescuedPB=(rescuedP2*invTrials100); + rescuedMB=(rescuedM2*invTrials100); + falsePositiveB=(firstSiteIncorrect2*invTrials100); + falsePositiveLooseB=(firstSiteIncorrectLoose2*invTrials100); + truePositivePB=(firstSiteCorrectP2*invTrials100); + truePositiveMB=(firstSiteCorrectM2*invTrials100); + truePositiveStrict=((firstSiteCorrectP2+firstSiteCorrectM2)*invTrials100); + truePositiveLoose=(firstSiteCorrectLoose2*invTrials100); + snrStrict=10*Math.log10((firstSiteCorrectM2+firstSiteCorrectP2+0.1)/(firstSiteIncorrect2+0.1)); + snrLoose=10*Math.log10((firstSiteCorrectLoose2+0.1)/(firstSiteIncorrectLoose2+0.1)); + truePositivePMRatio=(truePositivePB/truePositiveMB); + truePositivePairedB=(firstSiteCorrectPaired2*100d/numMated); + truePositiveSoloB=(firstSiteCorrectSolo2*100d/(mappedRetained2-numMated)); + truePositiveRescuedB=(firstSiteCorrectRescued2*100d/(rescuedP2+rescuedM2)); + 
avgNumCorrect=(totalCorrectSites2/(1d*(truePositiveP2+truePositiveM2))); + noHitPercent=noHit2*invTrials100; + + avgNumCorrect=(SKIMMER ? totalNumCorrect2*invTrials : (totalCorrectSites2/(1d*(truePositiveP2+truePositiveM2)))); + avgNumIncorrect=totalNumIncorrect1*invTrials; //Skimmer only + avgNumIncorrectPrior=totalNumIncorrectPrior1*invTrials; //Skimmer only + + rateCapturedAllCorrect=totalNumCapturedAllCorrect2*invTrials100; //Skimmer only + rateCapturedAllTop=totalNumCapturedAllCorrectTop2*invTrials100; //Skimmer only + rateCapturedAllOnly=totalNumCapturedAllCorrectOnly2*invTrials100; //Skimmer only + + if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){ + mappedReads=mappedRetained2+duplicateBestAlignment2; + unambiguousReads=mappedRetained2; + }else{ + mappedReads=mappedRetained2; + unambiguousReads=mappedRetained2-duplicateBestAlignment2; + } + + avgInitialSites=initialSiteSum2*invTrials; + avgPostTrimSites=postTrimSiteSum2*invTrials; + avgPostRescueSites=postRescueSiteSum2*invTrials; + avgSites=siteSum2*invTrials; + avgPerfectSites=(perfectHitCount1*invTrials); + avgSemiPerfectSites=(semiPerfectHitCount1*invTrials); + avgTopSites=topSiteSum2*invTrials; + lowQualityReadsDiscardedPercent=lowQualityReadsDiscarded2*invTrials100; + + matchErrors=matchCountS2+matchCountI2+matchCountD2; + baseLen=matchCountM2+matchCountI2+matchCountS2+matchCountN2; + matchLen=matchCountM2+matchCountI2+matchCountS2+matchCountN2+matchCountD2; + refLen=matchCountM2+matchCountS2+matchCountN2+matchCountD2; + errorRate=matchErrors*100d/matchLen; + matchRate=matchCountM2*100d/matchLen;//baseLen; + subRate=matchCountS2*100d/matchLen;//baseLen; + delRate=matchCountD2*100d/matchLen; + insRate=matchCountI2*100d/matchLen;//baseLen; + nRate=matchCountN2*100d/matchLen;//baseLen; + + sysout.println("\n\nRead 2 data:"); + if(verbose_stats>=1){ + sysout.println(String.format("Avg Initial Sites: \t"+(avgInitialSites<10?" 
":"")+"%.3f", avgInitialSites)); + if(TRIM_LIST){sysout.println(String.format("Avg Post-Trim: \t"+(avgPostTrimSites<10?" ":"")+"%.3f", avgPostTrimSites));} + sysout.println(String.format("Avg Post-Rescue: \t"+(avgPostRescueSites<10?" ":"")+"%.3f", avgPostRescueSites)); + sysout.println(String.format("Avg Final Sites: \t"+(avgSites<10?" ":"")+"%.3f", avgSites)); + sysout.println(String.format("Avg Top Sites: \t"+(avgTopSites<10?" ":"")+"%.3f", avgTopSites)); + sysout.println(String.format("Avg Perfect Sites: \t"+(avgPerfectSites<10?" ":"")+"%.3f \t"+ + (perfectHitCountPercent<10?" ":"")+"%.3f%%", avgPerfectSites, perfectHitCountPercent)); + sysout.println(String.format("Avg Semiperfect Sites:\t"+(avgSemiPerfectSites<10?" ":"")+"%.3f \t"+ + (semiPerfectHitCountPercent<10?" ":"")+"%.3f%%", avgSemiPerfectSites, semiPerfectHitCountPercent)); + + if(SYNTHETIC){ + sysout.println(String.format("Avg Correct Sites: \t"+(avgNumCorrect<10?" ":"")+"%.3f", avgNumCorrect)); + if(SKIMMER){ + sysout.println(String.format("Avg Incorrect Sites:\t"+(avgNumIncorrect<10?" ":"")+"%.3f", avgNumIncorrect)); + sysout.println(String.format("Avg IncorrectP Sites:\t"+(avgNumIncorrectPrior<10?" ":"")+"%.3f", avgNumIncorrectPrior)); + } + } + } + sysout.println(); +// sysout.println(String.format("perfectHit: \t%.2f", perfectHitPercent)+"%"); +// sysout.println(String.format("uniqueHit: \t%.2f", uniqueHitPercent)+"%"); +// sysout.println(String.format("correctUniqueHit:\t%.2f", correctUniqueHitPercent)+"%"); +// sysout.println(String.format("correctMultiHit: \t%.2f", correctMultiHitPercent)+"%"); +// sysout.println(String.format("correctHighHit: \t%.2f", correctHighHitPercent)+"%"); +// sysout.println(String.format("correctHit: \t%.2f", correctHitPercent)+"%"); + + //sysout.println(String.format("mapped: \t"+(mappedB<10?" 
":"")+"%.3f", mappedB)+"%"); + if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){ + double x=ambiguousFound+mappedRetainedB; + sysout.println("mapped: \t"+padPercent(x,4)+"%"+"\t"+mappedReads+" reads"); + sysout.println("unambiguous: \t"+padPercent(mappedRetainedB,4)+"%"+"\t"+unambiguousReads+" reads"); + }else{ + double x=mappedRetainedB-ambiguousFound; + sysout.println("mapped: \t"+padPercent(mappedRetainedB,4)+"%"+"\t"+mappedReads+" reads"); + sysout.println("unambiguous: \t"+padPercent(x,4)+"%"+"\t"+unambiguousReads+" reads"); + } + if(SYNTHETIC){ + sysout.println(String.format("true positive: \t"+((truePositiveStrict)<10?" ":"")+"%.4f%%\t(loose: "+(truePositiveLoose<10?" ":"")+"%.4f%%)", + truePositiveStrict, truePositiveLoose)); + sysout.println(String.format("false positive: \t"+(falsePositiveB<10?" ":"")+"%.4f%%\t(loose: "+(falsePositiveLooseB<10?" ":"")+"%.4f%%)", + falsePositiveB, falsePositiveLooseB)); + sysout.println(String.format("SNR: \t"+(snrStrict<10 && snrStrict>=0 ?" ":"")+"%.4f \t(loose: "+(snrLoose<10&&snrLoose>=0?" ":"")+"%.4f)", + snrStrict, snrLoose)); + if(verbose_stats>0){sysout.println(String.format("Plus/Minus ratio:\t %1.4f", truePositivePMRatio));} + } + sysout.println(); + if(paired){ +// sysout.println(String.format("Mated pairs: \t"+(matedPercent<10?" ":"")+"%.4f", matedPercent)+"%"); + if(SYNTHETIC){ + sysout.println(String.format("correct pairs: \t"+(truePositivePairedB<10?" ":"")+"%.4f", truePositivePairedB)+"%"); + } + } + if(SYNTHETIC){ + sysout.println(String.format("correct singles: \t"+(truePositiveSoloB<10?" ":"")+"%.4f", truePositiveSoloB)+"%"); + } + if(paired){ + sysout.println(String.format("rescued: \t"+(rescuedPB+rescuedMB<10?" ":"")+"%.3f", rescuedPB+rescuedMB)+"%"); + if(SYNTHETIC){ + sysout.println(String.format("correct rescued: \t"+(truePositiveRescuedB<10?" ":"")+"%.3f", truePositiveRescuedB)+"%"); + } + } + sysout.println(); + sysout.println(String.format("perfect best site:\t"+(perfectMatchPercent<10?" 
":"")+"%.4f", perfectMatchPercent)+"%"); + sysout.println(String.format("semiperfect site:\t"+(semiperfectMatchPercent<10?" ":"")+"%.4f", semiperfectMatchPercent)+"%"); + sysout.println(String.format("ambiguousMapping:\t"+(ambiguousFound<10?" ":"")+"%.4f", ambiguousFound)+"%\t"+ + (REMOVE_DUPLICATE_BEST_ALIGNMENTS ? "(Removed)" : "(Kept)")); + sysout.println(String.format("low-Q discards: \t"+(lowQualityReadsDiscardedPercent<10?" ":"")+"%.4f", + lowQualityReadsDiscardedPercent)+"%"); + if(SYNTHETIC){ + sysout.println(String.format("false negative: \t"+(noHitPercent<10?" ":"")+"%.4f", noHitPercent)+"%"); + sysout.println(String.format("correctLowHit: \t"+(correctLowHitPercent<10?" ":"")+"%.4f", correctLowHitPercent)+"%"); + } + + if(MAKE_MATCH_STRING){ + sysout.println(); + sysout.println("Match Rate: \t"+padPercent(matchRate,4)+"% \t"+matchCountM2); + sysout.println("Error Rate: \t"+padPercent(errorRate,4)+"% \t"+matchErrors); + sysout.println("Sub Rate: \t"+padPercent(subRate,4)+"% \t"+matchCountS2); + sysout.println("Del Rate: \t"+padPercent(delRate,4)+"% \t"+matchCountD2); + sysout.println("Ins Rate: \t"+padPercent(insRate,4)+"% \t"+matchCountI2); + sysout.println("N Rate: \t"+padPercent(nRate,4)+"% \t"+matchCountN2); + } + } + + if(BBSplitter.TRACK_SCAF_STATS){ + BBSplitter.printCounts(BBSplitter.SCAF_STATS_FILE, BBSplitter.scafCountTable, true, readsUsed+readsUsed2); + } + + if(BBSplitter.TRACK_SET_STATS){ + BBSplitter.printCounts(BBSplitter.SET_STATS_FILE, BBSplitter.setCountTable, true, readsUsed+readsUsed2); + } + + if(ReadStats.COLLECT_QUALITY_STATS || ReadStats.COLLECT_MATCH_STATS || ReadStats.COLLECT_INSERT_STATS){ + ReadStats rs=ReadStats.mergeAll(); + if(ReadStats.QUAL_HIST_FILE!=null){rs.writeQualityToFile(ReadStats.QUAL_HIST_FILE, paired);} + if(ReadStats.MATCH_HIST_FILE!=null){rs.writeMatchToFile(ReadStats.MATCH_HIST_FILE, paired);} + if(ReadStats.INSERT_HIST_FILE!=null){rs.writeInsertToFile(ReadStats.INSERT_HIST_FILE);} + } + + 
assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1+falsePositive1+noHit1+lowQualityReadsDiscarded1==reads) : + "\nThe number of reads out does not add up to the number of reads in.\nThis may indicate that a mapping thread crashed.\n"+ + truePositiveP1+"+"+truePositiveM1+"+"+falsePositive1+"+"+noHit1+"+"+lowQualityReadsDiscarded1+" = "+ + (truePositiveP1+truePositiveM1+falsePositive1+noHit1+lowQualityReadsDiscarded1)+" != "+reads; + if(!SKIMMER){ + assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1==correctLowHit1+correctMultiHit1+correctUniqueHit1); + }else{ + assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1==correctLowHit1+correctUniqueHit1); + } + } + + + static void printOutput_Machine(final AbstractMapThread[] mtts, final Timer t, final int keylen, final boolean paired, final boolean SKIMMER){ + long msaIterationsLimited=0; + long msaIterationsUnlimited=0; + + long basesUsed=0; + long basesAtQuickmap=0; + long keysUsed=0; + + long syntheticReads=0; + long numMated=0; + long badPairs=0; + long innerLengthSum=0; + long outerLengthSum=0; + long insertSizeSum=0; + + long callsToScore=0; + long callsToExtend=0; + long initialKeys=0; + long initialKeyIterations=0; + long usedKeys=0; + long usedKeyIterations=0; + + long[] hist_hits=new long[41]; + long[] hist_hits_score=new long[41]; + long[] hist_hits_extend=new long[41]; + + long initialSiteSum1=0; + long postTrimSiteSum1=0; + long postRescueSiteSum1=0; + long siteSum1=0; + long topSiteSum1=0; + + long matchCountS1=0; + long matchCountI1=0; + long matchCountD1=0; + long matchCountM1=0; + long matchCountN1=0; + + + long mapped1=0; + long mappedRetained1=0; + long rescuedP1=0; + long rescuedM1=0; + long truePositiveP1=0; + long truePositiveM1=0; + long falsePositive1=0; + long totalCorrectSites1=0; + long firstSiteCorrectP1=0; + long firstSiteCorrectM1=0; + long firstSiteIncorrect1=0; + long firstSiteCorrectLoose1=0; + long firstSiteIncorrectLoose1=0; + long firstSiteCorrectPaired1=0; + long 
firstSiteCorrectSolo1=0; + long firstSiteCorrectRescued1=0; + long perfectHit1=0; //Highest score is max score + long uniqueHit1=0; //Only one hit has highest score + long correctUniqueHit1=0; //unique highest hit on answer site + long correctMultiHit1=0; //non-unique highest hit on answer site (non-skimmer only) + long correctLowHit1=0; //hit on answer site, but not highest scorer + long noHit1=0; + long perfectMatch1=0; //Highest slow score is max slow score + long semiperfectMatch1=0; + long perfectHitCount1=0; + long semiPerfectHitCount1=0; + long duplicateBestAlignment1=0; + + long totalNumCorrect1=0; //Only for skimmer + long totalNumIncorrect1=0; //Only for skimmer + long totalNumIncorrectPrior1=0; //Only for skimmer + long totalNumCapturedAllCorrect1=0; //Only for skimmer + long totalNumCapturedAllCorrectTop1=0; //Only for skimmer + long totalNumCapturedAllCorrectOnly1=0; //Only for skimmer + + long initialSiteSum2=0; + long postTrimSiteSum2=0; + long postRescueSiteSum2=0; + long siteSum2=0; + long topSiteSum2=0; + + long mapped2=0; + long mappedRetained2=0; + long rescuedP2=0; + long rescuedM2=0; + long truePositiveP2=0; + long truePositiveM2=0; + long falsePositive2=0; + long totalCorrectSites2=0; + long firstSiteCorrectP2=0; + long firstSiteCorrectM2=0; + long firstSiteIncorrect2=0; + long firstSiteCorrectLoose2=0; + long firstSiteIncorrectLoose2=0; + long firstSiteCorrectPaired2=0; + long firstSiteCorrectSolo2=0; + long firstSiteCorrectRescued2=0; + long perfectHit2=0; //Highest score is max score + long perfectHitCount2=0; + long semiPerfectHitCount2=0; + + long uniqueHit2=0; //Only one hit has highest score + long correctUniqueHit2=0; //unique highest hit on answer site + long correctMultiHit2=0; //non-unique highest hit on answer site (non-skimmer only) + long correctLowHit2=0; //hit on answer site, but not highest scorer + long noHit2=0; + long perfectMatch2=0; //Highest slow score is max slow score + long semiperfectMatch2=0; + long 
duplicateBestAlignment2=0; + + long totalNumCorrect2=0; //Only for skimmer + long totalNumIncorrect2=0; //Only for skimmer + long totalNumIncorrectPrior2=0; //Only for skimmer + long totalNumCapturedAllCorrect2=0; //Only for skimmer + long totalNumCapturedAllCorrectTop2=0; //Only for skimmer + long totalNumCapturedAllCorrectOnly2=0; //Only for skimmer + + long matchCountS2=0; + long matchCountI2=0; + long matchCountD2=0; + long matchCountM2=0; + long matchCountN2=0; + + readsUsed=0; + for(int i=0; i "+falsePositive); + totalCorrectSites1+=mtt.totalCorrectSites1; + + firstSiteCorrectP1+=mtt.firstSiteCorrectP1; + firstSiteCorrectM1+=mtt.firstSiteCorrectM1; + firstSiteIncorrect1+=mtt.firstSiteIncorrect1; + firstSiteCorrectLoose1+=mtt.firstSiteCorrectLoose1; + firstSiteIncorrectLoose1+=mtt.firstSiteIncorrectLoose1; + firstSiteCorrectPaired1+=mtt.firstSiteCorrectPaired1; + firstSiteCorrectSolo1+=mtt.firstSiteCorrectSolo1; + firstSiteCorrectRescued1+=mtt.firstSiteCorrectRescued1; + + perfectHit1+=mtt.perfectHit1; //Highest score is max score + perfectHitCount1+=mtt.perfectHitCount1; + semiPerfectHitCount1+=mtt.semiPerfectHitCount1; + uniqueHit1+=mtt.uniqueHit1; //Only one hit has highest score + correctUniqueHit1+=mtt.correctUniqueHit1; //unique highest hit on answer site + correctMultiHit1+=mtt.correctMultiHit1; //non-unique highest hit on answer site + correctLowHit1+=mtt.correctLowHit1; //hit on answer site, but not highest scorer + noHit1+=mtt.noHit1; + + totalNumCorrect1+=mtt.totalNumCorrect1; //Skimmer only + totalNumIncorrect1+=mtt.totalNumIncorrect1; //Skimmer only + totalNumIncorrectPrior1+=mtt.totalNumIncorrectPrior1; //Skimmer only + totalNumCapturedAllCorrect1+=mtt.totalNumCapturedAllCorrect1; //Skimmer only + totalNumCapturedAllCorrectTop1+=mtt.totalNumCapturedAllCorrectTop1; //Skimmer only + totalNumCapturedAllCorrectOnly1+=mtt.totalNumCapturedAllCorrectOnly1; //Skimmer only + + perfectMatch1+=mtt.perfectMatch1; //Highest slow score is max slow score + 
semiperfectMatch1+=mtt.semiperfectMatch1; //A semiperfect mapping was found + + duplicateBestAlignment1+=mtt.ambiguousBestAlignment1; + + initialSiteSum1+=mtt.initialSiteSum1; + postTrimSiteSum1+=mtt.postTrimSiteSum1; + postRescueSiteSum1+=mtt.postRescueSiteSum1; + siteSum1+=mtt.siteSum1; + topSiteSum1+=mtt.topSiteSum1; + + AbstractIndex index=mtt.index(); + callsToScore+=index.callsToScore; + callsToExtend+=index.callsToExtendScore; + initialKeys+=index.initialKeys; + initialKeyIterations+=index.initialKeyIterations; + usedKeys+=index.usedKeys; + usedKeyIterations+=index.usedKeyIterations; + + for(int j=0; j "+falsePositive); + totalCorrectSites2+=mtt.totalCorrectSites2; + + firstSiteCorrectP2+=mtt.firstSiteCorrectP2; + firstSiteCorrectM2+=mtt.firstSiteCorrectM2; + firstSiteIncorrect2+=mtt.firstSiteIncorrect2; + firstSiteCorrectLoose2+=mtt.firstSiteCorrectLoose2; + firstSiteIncorrectLoose2+=mtt.firstSiteIncorrectLoose2; + firstSiteCorrectPaired2+=mtt.firstSiteCorrectPaired2; + firstSiteCorrectSolo2+=mtt.firstSiteCorrectSolo2; + firstSiteCorrectRescued2+=mtt.firstSiteCorrectRescued2; + + perfectHit2+=mtt.perfectHit2; //Highest score is max score + perfectHitCount2+=mtt.perfectHitCount2; + semiPerfectHitCount2+=mtt.semiPerfectHitCount2; + uniqueHit2+=mtt.uniqueHit2; //Only one hit has highest score + correctUniqueHit2+=mtt.correctUniqueHit2; //unique highest hit on answer site + correctMultiHit2+=mtt.correctMultiHit2; //non-unique highest hit on answer site + correctLowHit2+=mtt.correctLowHit2; //hit on answer site, but not highest scorer + noHit2+=mtt.noHit2; + + totalNumCorrect2+=mtt.totalNumCorrect2; //Skimmer only + totalNumIncorrect2+=mtt.totalNumIncorrect2; //Skimmer only + totalNumIncorrectPrior2+=mtt.totalNumIncorrectPrior2; //Skimmer only + totalNumCapturedAllCorrect2+=mtt.totalNumCapturedAllCorrect2; //Skimmer only + totalNumCapturedAllCorrectTop2+=mtt.totalNumCapturedAllCorrectTop2; //Skimmer only + 
totalNumCapturedAllCorrectOnly2+=mtt.totalNumCapturedAllCorrectOnly2; //Skimmer only + + perfectMatch2+=mtt.perfectMatch2; //Highest slow score is max slow score + semiperfectMatch2+=mtt.semiperfectMatch2; //A semiperfect mapping was found + + duplicateBestAlignment2+=mtt.ambiguousBestAlignment2; + + initialSiteSum2+=mtt.initialSiteSum2; + postTrimSiteSum2+=mtt.postTrimSiteSum2; + postRescueSiteSum2+=mtt.postRescueSiteSum2; + siteSum2+=mtt.siteSum2; + topSiteSum2+=mtt.topSiteSum2; + + matchCountS2+=mtt.matchCountS2; + matchCountI2+=mtt.matchCountI2; + matchCountD2+=mtt.matchCountD2; + matchCountM2+=mtt.matchCountM2; + matchCountN2+=mtt.matchCountN2; + + } + reads=readsUsed; + if(syntheticReads>0){SYNTHETIC=true;} + + t.stop(); + long nanos=t.elapsed; + + if(verbose_stats>1){ + StringBuilder sb=new StringBuilder(1000); + sb.append("\n\n###################\n#hits\tcount\tscore\textend\n"); + for(int i=0; i=1){sysout.println("MSA_iterations"+DELIMITER+String.format("%.2fL + %.2fU = %.2f", milf,milu,milf+milu));} + +// sysout.println(); +// sysout.println("\nRead 1 data:"); + + sysout.println(); + + if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){ + double x=ambiguousFound+mappedRetainedB; + sysout.println("R1_Mapped_Percent"+DELIMITER+padPercentMachine(x,4)+"%"); + sysout.println("R1_Unambiguous_Percent"+DELIMITER+padPercentMachine(mappedRetainedB,4)+"%"); + sysout.println("R1_Mapped_Reads"+DELIMITER+mappedReads); + sysout.println("R1_Unambiguous_Reads"+DELIMITER+unambiguousReads); + }else{ + double x=mappedRetainedB-ambiguousFound; + sysout.println("R1_Mapped_Percent"+DELIMITER+padPercentMachine(x,4)+"%"); + sysout.println("R1_Unambiguous_Percent"+DELIMITER+padPercentMachine(mappedRetainedB,4)+"%"); + sysout.println("R1_Mapped_Reads"+DELIMITER+mappedReads); + sysout.println("R1_Unambiguous_Reads"+DELIMITER+unambiguousReads); + } + + sysout.println(); + if(paired){ + sysout.println(String.format("Mated_Pairs"+DELIMITER+"%.4f%%", matedPercent)); + 
sysout.println(String.format("Bad_Pairs"+DELIMITER+"%.3f%%", badPairsPercent)); + } + if(paired){ + sysout.println(String.format("R1_Rescued"+DELIMITER+"%.3f", rescuedPB+rescuedMB)+"%"); + sysout.println(String.format("Avg_Insert_Size"+DELIMITER+"%.2f", insertSizeAvg)); + } + sysout.println(); + sysout.println(String.format("R1_Perfect_Best_Site"+DELIMITER+"%.4f", perfectMatchPercent)+"%"); + sysout.println(String.format("R1_Semiperfect_Site"+DELIMITER+"%.4f", semiperfectMatchPercent)+"%"); + sysout.println(String.format("R1_Ambiguous_Mapping"+DELIMITER+"%.4f", ambiguousFound)+"%"); +// +(REMOVE_DUPLICATE_BEST_ALIGNMENTS ? " (Removed)" : " (Kept)")); + sysout.println(String.format("R1_Low_Quality_Discards"+DELIMITER+"%.4f", lowQualityReadsDiscardedPercent)+"%"); + + if(MAKE_MATCH_STRING){ + sysout.println(); + sysout.println("R1_Match_Rate"+DELIMITER+padPercentMachine(matchRate,4)+"%"); + sysout.println("R1_Error_Rate"+DELIMITER+padPercentMachine(errorRate,4)+"%"); + sysout.println("R1_Sub_Rate"+DELIMITER+padPercentMachine(subRate,4)+"%"); + sysout.println("R1_Del_Rate"+DELIMITER+padPercentMachine(delRate,4)+"%"); + sysout.println("R1_Ins_Rate"+DELIMITER+padPercentMachine(insRate,4)+"%"); + sysout.println("R1_N_Rate"+DELIMITER+padPercentMachine(nRate,4)+"%"); + + sysout.println("R1_Match_Count"+DELIMITER+matchCountM1); + sysout.println("R1_Error_Count"+DELIMITER+matchErrors); + sysout.println("R1_Sub_Count"+DELIMITER+matchCountS1); + sysout.println("R1_Del_Count"+DELIMITER+matchCountD1); + sysout.println("R1_Ins_Count"+DELIMITER+matchCountI1); + sysout.println("R1_N_Count"+DELIMITER+matchCountN1); + } + + if(paired){ + invSites100=100d/siteSum2; + + perfectHitPercent=perfectHit2*invTrials100; //Highest score is max score + perfectMatchPercent=perfectMatch2*invTrials100; + semiperfectMatchPercent=semiperfectMatch2*invTrials100; + + perfectHitCountPercent=perfectHitCount2*invSites100; + semiPerfectHitCountPercent=semiPerfectHitCount2*invSites100; + + 
uniqueHitPercent=uniqueHit2*invTrials100; //Only one hit has highest score + correctUniqueHitPercent=correctUniqueHit2*invTrials100; //unique highest hit on answer site + correctMultiHitPercent=correctMultiHit2*invTrials100; //non-unique highest hit on answer site + correctLowHitPercent=correctLowHit2*invTrials100; //hit on answer site, but not highest scorer + ambiguousFound=(duplicateBestAlignment2*invTrials100); + correctHighHitPercent=(correctMultiHit2+correctUniqueHit2)*invTrials100; + correctHitPercent=(correctLowHit2+correctMultiHit2+correctUniqueHit2)*invTrials100; + + mappedB=(mapped2*invTrials100); + mappedRetainedB=(mappedRetained2*invTrials100); + rescuedPB=(rescuedP2*invTrials100); + rescuedMB=(rescuedM2*invTrials100); + falsePositiveB=(firstSiteIncorrect2*invTrials100); + falsePositiveLooseB=(firstSiteIncorrectLoose2*invTrials100); + truePositivePB=(firstSiteCorrectP2*invTrials100); + truePositiveMB=(firstSiteCorrectM2*invTrials100); + truePositiveStrict=((firstSiteCorrectP2+firstSiteCorrectM2)*invTrials100); + truePositiveLoose=(firstSiteCorrectLoose2*invTrials100); + snrStrict=10*Math.log10((firstSiteCorrectM2+firstSiteCorrectP2+0.1)/(firstSiteIncorrect2+0.1)); + snrLoose=10*Math.log10((firstSiteCorrectLoose2+0.1)/(firstSiteIncorrectLoose2+0.1)); + truePositivePMRatio=(truePositivePB/truePositiveMB); + truePositivePairedB=(firstSiteCorrectPaired2*100d/numMated); + truePositiveSoloB=(firstSiteCorrectSolo2*100d/(mappedRetained2-numMated)); + truePositiveRescuedB=(firstSiteCorrectRescued2*100d/(rescuedP2+rescuedM2)); + avgNumCorrect=(totalCorrectSites2/(1d*(truePositiveP2+truePositiveM2))); + noHitPercent=noHit2*invTrials100; + + avgNumCorrect=(SKIMMER ? 
totalNumCorrect2*invTrials : (totalCorrectSites2/(1d*(truePositiveP2+truePositiveM2)))); + avgNumIncorrect=totalNumIncorrect1*invTrials; //Skimmer only + avgNumIncorrectPrior=totalNumIncorrectPrior1*invTrials; //Skimmer only + + rateCapturedAllCorrect=totalNumCapturedAllCorrect2*invTrials100; //Skimmer only + rateCapturedAllTop=totalNumCapturedAllCorrectTop2*invTrials100; //Skimmer only + rateCapturedAllOnly=totalNumCapturedAllCorrectOnly2*invTrials100; //Skimmer only + + if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){ + mappedReads=mappedRetained2+duplicateBestAlignment2; + unambiguousReads=mappedRetained2; + }else{ + mappedReads=mappedRetained2; + unambiguousReads=mappedRetained2-duplicateBestAlignment2; + } + + avgInitialSites=initialSiteSum2*invTrials; + avgPostTrimSites=postTrimSiteSum2*invTrials; + avgPostRescueSites=postRescueSiteSum2*invTrials; + avgSites=siteSum2*invTrials; + avgPerfectSites=(perfectHitCount1*invTrials); + avgSemiPerfectSites=(semiPerfectHitCount1*invTrials); + avgTopSites=topSiteSum2*invTrials; + lowQualityReadsDiscardedPercent=lowQualityReadsDiscarded2*invTrials100; + + matchErrors=matchCountS2+matchCountI2+matchCountD2; + baseLen=matchCountM2+matchCountI2+matchCountS2+matchCountN2; + matchLen=matchCountM2+matchCountI2+matchCountS2+matchCountN2+matchCountD2; + refLen=matchCountM2+matchCountS2+matchCountN2+matchCountD2; + errorRate=matchErrors*100d/matchLen; + matchRate=matchCountM2*100d/matchLen;//baseLen; + subRate=matchCountS2*100d/matchLen;//baseLen; + delRate=matchCountD2*100d/matchLen; + insRate=matchCountI2*100d/matchLen;//baseLen; + nRate=matchCountN2*100d/matchLen;//baseLen; + +// sysout.println("\n\nRead 2 data:"); + sysout.println(); +// sysout.println(String.format("perfectHit"+DELIMITER+"%.2f", perfectHitPercent)+"%"); +// sysout.println(String.format("uniqueHit"+DELIMITER+"%.2f", uniqueHitPercent)+"%"); +// sysout.println(String.format("correctUniqueHit"+DELIMITER+"%.2f", correctUniqueHitPercent)+"%"); +// 
sysout.println(String.format("correctMultiHit"+DELIMITER+"%.2f", correctMultiHitPercent)+"%"); +// sysout.println(String.format("correctHighHit"+DELIMITER+"%.2f", correctHighHitPercent)+"%"); +// sysout.println(String.format("correctHit"+DELIMITER+"%.2f", correctHitPercent)+"%"); + + //sysout.println(String.format("mapped"+DELIMITER+(mappedB<10?" ":"")+"%.3f", mappedB)+"%"); + if(REMOVE_DUPLICATE_BEST_ALIGNMENTS){ + double x=ambiguousFound+mappedRetainedB; + sysout.println("R2_Mapped_Percent"+DELIMITER+padPercentMachine(x,4)+"%"); + sysout.println("R2_Unambiguous_Percent"+DELIMITER+padPercentMachine(mappedRetainedB,4)+"%"); + sysout.println("R2_Mapped_Reads"+DELIMITER+mappedReads); + sysout.println("R2_Unambiguous_Reads"+DELIMITER+unambiguousReads); + }else{ + double x=mappedRetainedB-ambiguousFound; + sysout.println("R2_Mapped_Percent"+DELIMITER+padPercentMachine(x,4)+"%"); + sysout.println("R2_Unambiguous_Percent"+DELIMITER+padPercentMachine(mappedRetainedB,4)+"%"); + sysout.println("R2_Mapped_Reads"+DELIMITER+mappedReads); + sysout.println("R2_Unambiguous_Reads"+DELIMITER+unambiguousReads); + } + sysout.println(); + if(paired){ + sysout.println(String.format("R2_Rescued"+DELIMITER+"%.3f", rescuedPB+rescuedMB)+"%"); + } + sysout.println(); + sysout.println(String.format("R2_Perfect_Best_Site"+DELIMITER+"%.4f", perfectMatchPercent)+"%"); + sysout.println(String.format("R2_Semiperfect_Site"+DELIMITER+"%.4f", semiperfectMatchPercent)+"%"); + sysout.println(String.format("R2_Ambiguous_Mapping"+DELIMITER+"%.4f", ambiguousFound)+"%"); + //(REMOVE_DUPLICATE_BEST_ALIGNMENTS ? 
"(Removed)" : "(Kept)")); + sysout.println(String.format("R2_Low_Quality_Discards"+DELIMITER+"%.4f", lowQualityReadsDiscardedPercent)+"%"); + + if(MAKE_MATCH_STRING){ + sysout.println(); + sysout.println("R2_Match_Rate"+DELIMITER+padPercentMachine(matchRate,4)+"%"); + sysout.println("R2_Error_Rate"+DELIMITER+padPercentMachine(errorRate,4)+"%"); + sysout.println("R2_Sub_Rate"+DELIMITER+padPercentMachine(subRate,4)+"%"); + sysout.println("R2_Del_Rate"+DELIMITER+padPercentMachine(delRate,4)+"%"); + sysout.println("R2_Ins_Rate"+DELIMITER+padPercentMachine(insRate,4)+"%"); + sysout.println("R2_N_Rate"+DELIMITER+padPercentMachine(nRate,4)+"%"); + + sysout.println("R2_Match_Count"+DELIMITER+matchCountM2); + sysout.println("R2_Error_Count"+DELIMITER+matchErrors); + sysout.println("R2_Sub_Count"+DELIMITER+matchCountS2); + sysout.println("R2_Del_Count"+DELIMITER+matchCountD2); + sysout.println("R2_Ins_Count"+DELIMITER+matchCountI2); + sysout.println("R2_N_Count"+DELIMITER+matchCountN2); + } + } + + if(BBSplitter.TRACK_SCAF_STATS){ + BBSplitter.printCounts(BBSplitter.SCAF_STATS_FILE, BBSplitter.scafCountTable, true, readsUsed+readsUsed2); + } + + if(BBSplitter.TRACK_SET_STATS){ + BBSplitter.printCounts(BBSplitter.SET_STATS_FILE, BBSplitter.setCountTable, true, readsUsed+readsUsed2); + } + + if(ReadStats.COLLECT_QUALITY_STATS || ReadStats.COLLECT_MATCH_STATS || ReadStats.COLLECT_INSERT_STATS){ + ReadStats rs=ReadStats.mergeAll(); + if(ReadStats.QUAL_HIST_FILE!=null){rs.writeQualityToFile(ReadStats.QUAL_HIST_FILE, paired);} + if(ReadStats.MATCH_HIST_FILE!=null){rs.writeMatchToFile(ReadStats.MATCH_HIST_FILE, paired);} + if(ReadStats.INSERT_HIST_FILE!=null){rs.writeInsertToFile(ReadStats.INSERT_HIST_FILE);} + } + + assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1+falsePositive1+noHit1+lowQualityReadsDiscarded1==reads) : + "\nThe number of reads out does not add up to the number of reads in.\nThis may indicate that a mapping thread crashed.\n"+ + 
truePositiveP1+"+"+truePositiveM1+"+"+falsePositive1+"+"+noHit1+"+"+lowQualityReadsDiscarded1+" = "+ + (truePositiveP1+truePositiveM1+falsePositive1+noHit1+lowQualityReadsDiscarded1)+" != "+reads; + if(!SKIMMER){ + assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1==correctLowHit1+correctMultiHit1+correctUniqueHit1); + }else{ + assert(!CALC_STATISTICS || truePositiveP1+truePositiveM1==correctLowHit1+correctUniqueHit1); + } + } + + static final void printSettings0(int k, int maxindel, float minratio){ + if(MACHINE_OUTPUT){ + sysout.println("Genome"+DELIMITER+Data.GENOME_BUILD); + sysout.println("Key_Length"+DELIMITER+k); + sysout.println("Max_Indel"+DELIMITER+maxindel); + sysout.println("Minimum_Score_Ratio"+DELIMITER+minratio); + sysout.println("Mapping_Mode"+DELIMITER+(PERFECTMODE ? "perfect" : SEMIPERFECTMODE ? "semiperfect" : "normal")); + }else{ + sysout.println("Genome: \t"+Data.GENOME_BUILD); + sysout.println("Key Length: \t"+k); + sysout.println("Max Indel: \t"+maxindel); + sysout.println("Minimum Score Ratio: \t"+minratio); + sysout.println("Mapping Mode: \t"+(PERFECTMODE ? "perfect" : SEMIPERFECTMODE ? "semiperfect" : "normal")); + } + } + + + static final int absdif(int a, int b){ + return a>b ? 
a-b : b-a; + } + + /* ------------ Non-static fields ----------- */ + + + ConcurrentReadStreamInterface cris; + RTextOutputStream3 rosA=null, rosM=null, rosU=null, rosB=null; + + float fractionGenomeToExclude=-1; + int maxIndel1=-1; + int maxIndel2=-1; + int minApproxHits=-1; + int expectedSites=-1; + int ambigMode=AMBIG_BEST; +// int ambigMode2=AMBIG_BEST; + boolean fast=false; + boolean slow=false; + boolean verbose=false; + boolean rcompMate=false; + boolean outputSitesOnly=false; + long targetGenomeSize=-1; + int ziplevel=-1; + int build=1; + String reference=null; + int keylen=13; + int idmodulo=1; + float samplerate=1f; + double minid=-1; + long sampleseed=1; + boolean ambiguousRandom=false, ambiguousAll=false; + boolean forceanalyze=false; + boolean gunzip=false; + boolean gzip=false; + boolean pigz=true; + boolean unpigz=ReadWrite.USE_UNPIGZ; + boolean setxs=false, setintron=false; + String bamscript=null; + String in1=null, in2=null; + String qfout=null, qfout2=null, qfoutM=null, qfoutM2=null, qfoutU=null, qfoutU2=null, qfoutB=null, qfoutB2=null; + + + + /** Scores below the (max possible alignment score)*(MINIMUM_ALIGNMENT_SCORE_RATIO) will be discarded. + * Default: 0.4 ~ 0.5 for clean data against raw PacBio data. + * Very sensitive! A value of 0.2 will potentially produce many false positives. */ + float MINIMUM_ALIGNMENT_SCORE_RATIO; + + float keyDensity;//Normal key density + float maxKeyDensity; //For situations where some of the read is too low quality, this is the max for the rest of the read. + float minKeyDensity; + int maxDesiredKeys; //Don't go above this number of keys except to maintain minKeyDensity. + + /** Additional ref bases on each end of site mapping location in alignment window. + * If there are no insertions or deletions, 0 is fine. 
*/ + int SLOW_ALIGN_PADDING; + int SLOW_RESCUE_PADDING; + int TIP_SEARCH_DIST; + + /** Class name of MSA to use */ + String MSA_TYPE; + int MAX_SITESCORES_TO_PRINT; + boolean PRINT_SECONDARY_ALIGNMENTS; + + + /* ------------ Static fields ----------- */ + + static final int AMBIG_BEST=0; + static final int AMBIG_TOSS=1; + static final int AMBIG_RANDOM=2; + static final int AMBIG_ALL=3; + + static int THRESH=0; //Threshold for calculating true positives on synthetic data, or something. + + static int readlen=100; + + static int maxInsLen=40; //Default 40 + static int maxSubLen=40; //Default 40 + static int maxDelLen=40; //Default 8000 + + static byte minQuality=3; + static byte midQuality=23; + static byte maxQuality=35; + + static int maxSnps=3;//4; + static int maxInss=3;//2; + static int maxDels=3; + static int maxSubs=3;//2; + + static float baseSnpRate=0.25f; + static float baseInsRate=0.25f; + static float baseDelRate=0.25f; + static float baseSubRate=0.25f;//0.3f; + static float PERFECT_READ_RATIO=0.0f;//0.2f;//0.8f + + //Extra work for rare cases in human only. 
+ static boolean SAVE_AMBIGUOUS_XY=false; + + static boolean colorspace=false; + + static boolean translateToBaseSpace=false; //Translate (colorspace) reads before outputting them + + + static boolean TRIM_LIST=true; //Increases speed many times; reduces accuracy a bit + + static boolean PAIRED_RANDOM_READS=false; + static boolean REQUIRE_CORRECT_STRANDS_PAIRS=true; + static boolean SAME_STRAND_PAIRS=false; + static boolean KILL_BAD_PAIRS=false; + + static final boolean SLOW_ALIGN=true; //Do a more accurate scoring pass with MSA + static boolean MAKE_MATCH_STRING=SLOW_ALIGN; + + /** Rescue paired reads by searching near mate */ + static boolean RESCUE=true; + + /** Generally should be set to false unless SLOW_ALIGN==true */ + static boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + + /** Forbid alignments with indels longer than MAX_INDEL */ + static boolean STRICT_MAX_INDEL=false; + /** Don't allow reads to map to their origin location in the reference. Useful for self-correcting reads. 
*/ + static boolean FORBID_SELF_MAPPING=false; + /** Only allow perfect and semiperfect mappings */ + static boolean SEMIPERFECTMODE=false; + /** Only allow perfect mappings */ + static boolean PERFECTMODE=false; + /** Only allow sites with at least this many contiguous matches */ + static int KFILTER=-1; + + /** Quality-trim left side of read before mapping */ + static boolean TRIM_LEFT=false; + /** Quality-trim right side of read before mapping */ + static boolean TRIM_RIGHT=false; + /** Restore read to untrimmed state after mapping (and destroy match string) */ + static boolean UNTRIM=false; + /** Trim bases with quality less than or equal to this value */ + static byte TRIM_QUALITY=7; + /** Produce local alignments instead of global alignments */ + static boolean LOCAL_ALIGN=false; + + static int minChrom=1; + static int maxChrom=Integer.MAX_VALUE; + + static long reads=-1; + static long readsUsed=0; + static long readsUsed2=0; + static long lowQualityReadsDiscarded1=0; + static long lowQualityReadsDiscarded2=0; + + protected static boolean CALC_STATISTICS=true; + + static boolean QUICK_MATCH_STRINGS=false; + static boolean OUTPUT_READS=false; + static boolean DONT_OUTPUT_UNMAPPED_READS=false; + static boolean DONT_OUTPUT_BLACKLISTED_READS=false; + + static boolean OUTPUT_ORDERED_READS=false; + static boolean DOUBLE_PRINT_ERROR_RATE=false; + + static String outputBaseName="readsOut_"+(System.nanoTime()&0x1FFFF); + static String outFile=null;//outputBaseName+"_1.txt"; + static String outFile2=null;//outputBaseName+"_2.txt"; + static String outFileM=null;//outputBaseName+"_mapped_1.txt"; + static String outFileM2=null;//outputBaseName+"_mapped_2.txt"; + static String outFileU=null;//outputBaseName+"_unmapped_1.txt"; + static String outFileU2=null;//outputBaseName+"_unmapped_2.txt"; + static String outFileB=null;//outputBaseName+"_blacklist_1.txt"; + static String outFileB2=null;//outputBaseName+"_blacklist_2.txt"; + static ArrayList blacklist=null; + + static 
boolean useRandomReads=false; + static int sequentialOverlap=5; + static boolean sequentialStrandAlt=false; + + static boolean OVERWRITE=false; + static boolean SYNTHETIC=false; + static boolean ERROR_ON_NO_OUTPUT=false; + static boolean MACHINE_OUTPUT=false; + final static String DELIMITER="="; + + static PrintStream sysout=System.err; + static boolean SYSIN=false; + static int verbose_stats=0; + static boolean waitForMemoryClear=false; + + public static boolean errorState=false; + +} diff --git a/current/align2/BBIndex.java b/current/align2/BBIndex.java new file mode 100755 index 0000000..fd8cb05 --- /dev/null +++ b/current/align2/BBIndex.java @@ -0,0 +1,3296 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + +import stream.SiteScore; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; + + +/** + * Based on Index11f + * Index stored in single array per block. + * + * + * + * @author Brian Bushnell + * @date Dec 22, 2012 + * + */ +public final class BBIndex extends AbstractIndex { + + + public static void main(String[] args){ + + int k=13; + + for(int i=0; iData.numChroms){maxChrom=Data.numChroms;} + assert(minChrom<=maxChrom); + Data.sysout.println("Loading index for chrom "+minChrom+"-"+maxChrom+", genome "+Data.GENOME_BUILD); + index=IndexMaker4.makeIndex(Data.GENOME_BUILD, minChrom, maxChrom, + k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, writeToDisk, diskInvalid, index); + } + + /** Calculate statistics of index, such as list lengths, and find clumpy keys */ + public static final synchronized void analyzeIndex(int minChrom, int maxChrom, boolean cs, float fractionToExclude, int k){ + assert(!cs) : "Re-enable old reverse complement mode."; + assert(lengthHistogram==null); + assert(COUNTS==null); + + int KEYSPACE=1<<(2*k); + COUNTS=new int[KEYSPACE]; + maxChrom=maxChrom(maxChrom); + + HashMap cmap=new HashMap(); + + for(int 
chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + Block b=index[chrom]; + final int[] sites=b.sites; + final int[] starts=b.starts; + + for(int key=0; key0 && dif<=CLUMPY_MAX_DIST){ + clumps++; + } + } + if(clumps>0){ + final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k)); + final Integer ko=x; + LongM lm=cmap.get(ko); + if(lm==null){ + lm=new LongM(0); + cmap.put(ko, lm); + } + lm.increment(clumps); + } + } + } + } + + for(int key=0; keyCLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){ + int rkey=AminoAcid.reverseComplementBinaryFast(key, k); + assert(key<=rkey); + assert(key==KeyRing.reverseComplementKey(rkey, k, cs)); + COUNTS[key]=0; + COUNTS[rkey]=0; + } + } + } + + lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); + + if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} + + if(REMOVE_FREQUENT_GENOME_FRACTION){ + + int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); + int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); + + MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); + MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); + + if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} + } + + Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); + if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} + if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} + assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; + } + + + /** Returns the filename for the block holding this chrom */ + public static final String fname(int chrom, int k){ + 
return IndexMaker4.fname(minChrom(chrom), maxChrom(chrom), k, NUM_CHROM_BITS, COLORSPACE); + } + + /** Ensure key offsets are strictly ascending. */ + private static boolean checkOffsets(int[] offsets){ + for(int i=1; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; ilimit || sum/finalHitCount>limit2; i--){ + Pointer p=ptrs[i]; + sum-=hits[p.key].length; + hits[p.key]=null; + finalHitCount--; + } + + return finalHitCount; + } + + /** Remove least useful keys to accelerate search */ + private final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){ + + float[] keyWeights=getKeyWeightArray(keyScores.length); + for(int i=0; ilimitS){hits[i]=null;} +// } + + final int[] lengths=getGenericArray(keys.length); + + for(int i=0; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; i=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ + final int[] lists=getGreedyListArray(hitsCount); + for(int i=0, j=0; j0){ + lists[j]=i; + j++; + } + } + + Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn); + int worstIndex=greedyReturn[0]; + int worst=lists[worstIndex]; + worstValue=greedyReturn[1]; + sum-=lengths[worst]; + +// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]0 || lengths[worst]=0){ + final int len=count(key); + if(len>0 && len0){ + starts[i]=b.starts[key]; + stops[i]=starts[i]+len2; + numHits++; + } + } + } + } + return numHits; + } + + + private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){ + int numHits=0; + for(int i=0; i=0){ + final int len=count(key); + if(len>0 && len findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){ + assert(minChrom<=maxChrom && minChrom>=0); + ArrayList result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id); + if(DOUBLE_SEARCH_NO_HIT && 
(result==null || result.isEmpty())){result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);} + + return result; + } + + + public final ArrayList find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){ + + assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); + final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN, COLORSPACE); + int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length); + + initialKeys+=offsetsP.length; + initialKeyIterations++; + + final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2); + + int numHits=0; + numHits=countHits(keysP, maxLen, true); + if(numHits>0){ //TODO: Change these to higher numbers + int trigger=(3*keysP.length)/4; + if(numHits<4 && numHitsMIN_APPROX_HITS_TO_KEEP){ + int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits); + + int zeroes=keysP.length-numHits; + int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1)); + cutoffIndex=Tools.max(cutoffIndex, altMinIndex); + + assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits; + + if(cutoffIndex<(keysP.length-1)){ + int[] lens=getGenericArray(keysP.length); + for(int i=0; icutoff){ + keysP[i]=-1; + removed++; + numHits--; + } + } + } + } +// assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + final ArrayList result=new ArrayList(8); + if(numHits\n"+Arrays.toString(offsetsM)); + } + final int[] keysM=(COLORSPACE ? 
KeyRing.makeKeys(basesM, offsetsM, KEYLEN, COLORSPACE) : KeyRing.reverseComplementKeys(keysP, KEYLEN, COLORSPACE)); + +// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); +// assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM); + + assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length))); + assert(keyScoresP!=null); + assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP); + final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1)); + final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1)); + final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); + + assert(offsetsM.length==offsetsP.length); + assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM)); + + /* + * bestScores: + * + * bestScores[0] currentTopScore + * bestScores[1] maxHits + * bestScores[2] qcutoff + * bestScores[3] bestqscore + * bestScores[4] maxQuickScore + * bestScores[5] perfectsFound + */ + final int[] bestScores=new int[6]; + + //This prevents filtering by qscore when a low-quality read only uses a few keys. + //In that case, extending is more important. 
+ final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5); + + int[][] prescanResults=null; + int[] precounts=null; + int[] prescores=null; + + int hitsCutoff=0; + int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore); + + boolean allBasesCovered=true; + { + if(offsetsP[0]!=0){allBasesCovered=false;} + else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;} + else{ + for(int i=1; ioffsetsP[i-1]+KEYLEN){ + allBasesCovered=false; + break; + } + } + } + } + + //TODO I don't understand this logic + final boolean pretendAllBasesAreCovered=//false; + (allBasesCovered || + keysP.length>=keysOriginal.length-4 || + (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f)))); + +// System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP)); +// assert(allBasesCovered); + + if(prescan_qscore){ + prescanResults=prescanAllBlocks(bestScores, + keysP, keyScoresP, offsetsP, + keysM, keyScoresM, offsetsM, + pretendAllBasesAreCovered); + + if(prescanResults!=null){ + precounts=prescanResults[0]; + prescores=prescanResults[1]; + } + + if(bestScores[1]=maxQuickScore && pretendAllBasesAreCovered){ + assert(bestScores[3]==maxQuickScore); + assert(bestScores[1]==numHits); + + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true); + qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT)); + }else{ + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false); + qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH)); + } + } + + final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true); + final boolean fullyDefined=AminoAcid.isFullyDefined(basesP); + assert(bestScores[2]<=0) : Arrays.toString(bestScores); + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + if(precounts==null || 
precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS, + offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + if(QUIT_AFTER_TWO_PERFECTS){ + if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human + } + cycle++; + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS, + offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + if(QUIT_AFTER_TWO_PERFECTS){ + if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human + } + cycle++; + } + +// assert(Read.CHECKSITES(result, basesP, basesM, id)); //TODO: Comment out once checked + + return result; + } + + /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. */ + private final int[][] prescanAllBlocks(int[] bestScores, + int[] keysP, int[] keyScoresP, int[] offsetsP, + int[] keysM, int[] keyScoresM, int[] offsetsM, + final boolean allBasesCovered){ + + int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; + + int bestqscore=0; + int maxHits=0; + int minHitsToScore=MIN_APPROX_HITS_TO_KEEP; + + final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); + + final int[] counts=precountArray; + final int[] scores=prescoreArray; + final int[][] ret=prescanReturn; + Arrays.fill(counts, keysP.length); + Arrays.fill(scores, maxQuickScore); + ret[0]=counts; + ret[1]=scores; + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + final int baseChrom=baseChrom(chrom); + for(int pmi=0; pmi<2; pmi++, cycle++){ + + int[] keys=pm[pmi][0]; + int[] keyScores=pm[pmi][1]; + int[] offsets=pm[pmi][2]; +// int[][] hits=getHitArray(offsets.length); + + int[] 
starts=startArray; + int[] stops=stopArray; + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + + if(numHits find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores, + final int chrom, final byte strand, + int[] offsets, final boolean obeyLimits, ArrayList ssl, int[] bestScores, + final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + //Index of first location of each key + int[] starts=startArray; + //Index of first location of next key (i.e., (last location of key)+1) + int[] stops=stopArray; + + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + if(numHits=0){numHits++;} + } + + if(numHits==offsets.length){ + return null; + }else{ + int[][] r=shrinkReturn3; + int[] starts2=startArray; + int[] stops2=stopArray; + int[] offsets2=getOffsetArray(numHits); + int[] keyScores2=new int[numHits]; + + for(int i=0, j=0; i=0){ + starts2[j]=starts[i]; + stops2[j]=stops[i]; + offsets2[j]=offsets[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + r[0]=starts2; + r[1]=stops2; + r[2]=offsets2; + r[4]=keyScores2; + return r; + } + } + + /** Removes "-1" keys. 
*/ + private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){ + + + int numHits=0; + for(int i=0; i=0){numHits++;} + } + + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + if(numHits==keys.length){ + return null; + }else{ + int[][] r=shrinkReturn2; + int[] offsets2=getOffsetArray(numHits); + assert(offsets2!=offsets); + assert(offsets2.length=0){ + offsets2[j]=offsets[i]; + keys2[j]=keys[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + assert(checkOffsets(offsets2)) : "\nnumHits="+numHits+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(offsets2)+"\n"+ + "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keys2)+"\n"; + r[0]=offsets2; + r[1]=keys2; + r[2]=keyScores2; + return r; + } + } + + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk2(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, final boolean fullyDefined){ + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. + final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true); +// final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets)); +// System.err.println("maxScore = "+maxScore); + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? 
(int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); +// final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f)); +// final int minScore=(int)(MIN_SCORE_MULT*maxScore); +// System.err.println("minScore = "+minScore); + + final int baseChrom=baseChrom(baseChrom_); + + + heap.clear(); + final Quad[] triples=tripleStorage; + + final Block b=index[baseChrom]; + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=-999999999; + + int cutoff=minScore; + + int maxHits=0; + int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println(); + + SiteScore prevSS=null; + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + int approxHits=0; + + {//Inner loop + final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? 
x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int score; + + int mapStart=site, mapStop=maxNearbySite; + + if(USE_EXTENDED_SCORE){ + final int chrom=numberToChrom(site, baseChrom); + score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); + if(true/*USE_AFFINE_SCORE*/){ + //Correct begin and end positions if they changed. + int min=Integer.MAX_VALUE; + int max=Integer.MIN_VALUE; + for(int i=0; i-1){ + if(xmax){max=x;} + } + } + +// assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? + if(min<0 || max<0){ + //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ + chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ + Arrays.toString(locArray)+"\n"+ + Arrays.toString(values)+"\n"+ + new String(bases)+"\nstrand="+strand+"\n"); + System.err.println(); + } + score=-99999; + } + + + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + + +// System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+ +// ", mapStart="+mapStart+", mapStop="+mapStop); + +// if(chrom==17 && absdif(min, 30354420)<2000){ +// System.err.println("\n*****\n"); +// System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+ +// " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+ +// ", mapStart="+mapStart+", mapStop="+mapStop); +// System.err.println(); +// System.err.println(Arrays.toString(locArray)); +// System.err.println(); +// System.err.println("chrom="+chrom); +// System.err.println("score="+score); +// } + 
} + }else{ + score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + score+=scoreZ; + } + } + + if(score>=cutoff){ + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + + // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH)); + if(USE_EXTENDED_SCORE && score>=maxScore){ + cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + + final int chrom=numberToChrom(mapStart, baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(NUM_CHROM_BITS==0 || site2=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", 
maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + } + if(heap.isEmpty()){break;} + } + + } + + return ssl; + } + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk3(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, + int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + assert(USE_EXTENDED_SCORE); + + final int numKeys=offsets.length; //Before shrink + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + usedKeys+=numHits; + usedKeyIterations++; + + final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); + final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); + + final int baseChrom=baseChrom(baseChrom_); + + heap.clear(); + + final Quad[] triples=tripleStorage; + + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? 
getLocArray(bases.length) : null); + final Block b=index[baseChrom]; + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=bestScores[0]; + int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); + + int qcutoff=Tools.max(bestScores[2], minQuickScore); + int bestqscore=bestScores[3]; + int maxHits=bestScores[1]; + int perfectsFound=bestScores[5]; + assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits+", "+new String(bases); + int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); + if(approxHitsCutoff>numHits){return ssl;} + + final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); + + + assert(USE_EXTENDED_SCORE); + + if(currentTopScore>=maxScore){ + assert(currentTopScore==maxScore); + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + } + + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); + + + SiteScore prevSS=null; + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; 
+ + + int approxHits=0; + + {//Inner loop + final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int score; + int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + int mapStart=site, mapStop=maxNearbySite; + + assert(USE_EXTENDED_SCORE); + + boolean locArrayValid=false; + if(qscore-1){ + if(xmax){max=x;} + } + } + + if(score>=maxScore){ + assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + + // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 
+ if(min<0 || max<0){ + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("\nAnomaly in "+getClass().getName()+".slowWalk:\n"+ + "chrom="+chrom+", mapStart="+mapStart+", mapStop="+mapStop+", centerIndex="+centerIndex+", strand="+strand+"\n"+ + "score="+score+", maxScore="+maxScore+", qscore="+qscore+", filter_by_qscore="+filter_by_qscore+"\n"+ + "numHits="+approxHits+", approxHits="+approxHits+"\n"+ + "min="+min+", max="+max+", (max-min)="+(max-min)+"\n"+ + "bases.length="+bases.length+"\n"+Arrays.toString(locArray)+"\n"+ + "locArray:\t"+Arrays.toString(locArray)+"\n"+ + "values:\t"+Arrays.toString(values)+"\n"+ + "bases:\t"+new String(bases)); + System.err.println(); + assert(false); + } + score=-99999; + } + + //mapStart and mapStop are indices + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + if(score>=maxScore){ + assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + } + + if(score==maxScore){ + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); + } + + if(score>=cutoff){ + qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH)); + bestqscore=Tools.max(qscore, bestqscore); + } + } + + if(score>=cutoff){ + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + if(score>=maxScore){ + assert(USE_EXTENDED_SCORE); + cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + + final int chrom=numberToChrom(mapStart, 
baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(site2!=site3) : site2+", "+site3+", "+mapStart+", "+mapStop; + + assert(NUM_CHROM_BITS==0 || site2 "+site2); +// System.err.println(mapStop+" -> "+site3); + + assert(gapArray[0]>=site2 && gapArray[0]-site2 "+site2+"\n"+ + mapStop+" -> "+site3+"\n\n"+ + Arrays.toString(gapArray)+"\n\n"+ +// Arrays.toString(clone)+"\n\n"+ + Arrays.toString(locArray)+"\n"+ + "numHits="+numHits+", "+ + "heap.size="+heap.size()+", "+ + "numHits="+numHits+", "+ + "approxHits="+approxHits+"\n"; + gapArray[0]=Tools.min(gapArray[0], site2); + gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); + } + if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} +// assert(false) : Arrays.toString(locArray); + } + + + //This block is optional, but tries to eliminate multiple identical alignments + + SiteScore ss=null; + final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; + final boolean inbounds=(site2>=0 && site3=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", 
maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + }else if(heap.size() camelWalk3(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, + int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + assert(USE_EXTENDED_SCORE); + + final int numKeys=offsets.length; //Before shrink + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + usedKeys+=numHits; + usedKeyIterations++; + + final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); + final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); + + final int baseChrom=baseChrom(baseChrom_); + + heap.clear(); + active.clear(); + + final Quad[] triples=tripleStorage; + + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? 
getLocArray(bases.length) : null); + final Block b=index[baseChrom]; + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=bestScores[0]; + int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); + + int qcutoff=Tools.max(bestScores[2], minQuickScore); + int bestqscore=bestScores[3]; + int maxHits=bestScores[1]; + int perfectsFound=bestScores[5]; + assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits; + int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); + if(approxHitsCutoff>numHits){return ssl;} + + final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); + + if(currentTopScore>=maxScore){ + assert(currentTopScore==maxScore); + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + } + + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + assert(numHits>0); + assert(heap.size()==numHits); + + /* Tracks largest element allowed in 'active' */ + +// System.err.println("\nEntering SS loop:"); +// System.err.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.err.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); + +// int iter=0; + SiteScore prevSS=null; + int maxNearbySite=0; + int site=0; + int horizon=0; + 
assert(active.isEmpty()); + while(!heap.isEmpty() || !active.isEmpty()){ +// iter++; + + do{ + while(!active.isEmpty() && active.peek().site==site){ //Remove all identical elements, and add subsequent elements + final Quad t2=active.poll(); + final int row=t2.row+1, col=t2.column; + + //This is called the "increment" operation. Very messy and slow due to rare cases at beginning of a chrom. + if(row=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + if(a2<=horizon){ + active.add(t2); + maxNearbySite=Tools.max(t2.site, maxNearbySite); + }else{heap.add(t2);} + }else if((heap.size()+active.size())=0; column++){ +// final int x=values[column]; +// assert(x==triples[column].site); +// if(x>=minsite && x<=maxsite){ +// maxNearbySite=(x>maxNearbySite ? 
x : maxNearbySite); +// approxHits++; +// }else{chances--;} +//// if(verbose){ +//// System.err.println("column="+column+", numHits="+numHits+", approxHits="+approxHits+ +//// ", approxHitsCutoff="+approxHitsCutoff+", chances="+chances); +//// } +// } +// //Invalid assertion due to loop early exit +//// assert(approxHits>0) : "\niter="+iter+", maxHits="+maxHits+", numHits="+numHits+", approxHitsCutoff="+approxHitsCutoff+ +//// "\nheap.size()="+heap.size()+", minsite="+minsite+", maxsite="+maxsite+", values[center]="+values[centerIndex]+", t="+t; +// } +// assert(approxHits<=active.size()) : "approxHits="+approxHits+", active.size()="+active.size()+", maxNearbySite="+maxNearbySite+"\nvalues="+Arrays.toString(values); + + hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + +// if(verbose){System.err.println("A");} + + int score; + int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + int mapStart=site, mapStop=maxNearbySite; + assert(mapStart<=mapStop); + + assert(USE_EXTENDED_SCORE); + + boolean locArrayValid=false; + if(qscore-1){ + if(xmax){max=x;} + } + } + + if(score>=maxScore){ + assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + + // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 
+ if(min<0 || max<0){ + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("\nAnomaly in "+getClass().getName()+".slowWalk:\n"+ + "chrom="+chrom+", mapStart="+mapStart+", mapStop="+mapStop+", centerIndex="+centerIndex+", strand="+strand+"\n"+ + "score="+score+", maxScore="+maxScore+", qscore="+qscore+", filter_by_qscore="+filter_by_qscore+"\n"+ + "numHits="+approxHits+", approxHits="+approxHits+"\n"+ + "min="+min+", max="+max+", (max-min)="+(max-min)+"\n"+ + "bases.length="+bases.length+"\n"+Arrays.toString(locArray)+"\n"+ + "locArray:\t"+Arrays.toString(locArray)+"\n"+ + "values:\t"+Arrays.toString(values)+"\n"+ + "bases:\t"+new String(bases)); + System.err.println(); + assert(false); + } + score=-99999; + } + + //mapStart and mapStop are indices + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + assert(mapStart<=mapStop); + + if(score>=maxScore){ + assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + } +// if(verbose){System.err.println("F");} + + if(score==maxScore){ + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); + } + + if(score>=cutoff){ + qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH)); + bestqscore=Tools.max(qscore, bestqscore); + } + } +// if(verbose){System.err.println("G");} + + if(score>=cutoff){ +// if(verbose){System.err.println("H");} + + if(score>currentTopScore){ +// if(verbose){System.err.println("I");} +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + if(score>=maxScore){ + assert(USE_EXTENDED_SCORE); + 
cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// System.err.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } +// if(verbose){System.err.println("J");} + + final int chrom=numberToChrom(mapStart, baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(NUM_CHROM_BITS==0 || site2 "+site2); +// System.err.println(mapStop+" -> "+site3); + + assert(gapArray[0]>=site2 && gapArray[0]-site2 "+site2+"\n"+ + mapStop+" -> "+site3+"\n\n"+ + Arrays.toString(gapArray)+"\n\n"+ +// Arrays.toString(clone)+"\n\n"+ + Arrays.toString(locArray)+"\n"+ + "numHits="+numHits+", "+ + "heap.size="+heap.size()+", "+ + "numHits="+numHits+", "+ + "approxHits="+approxHits+"\n"; + gapArray[0]=Tools.min(gapArray[0], site2); + gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); + } + if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} +// assert(false) : Arrays.toString(locArray); + } + + + //This block is optional, but tries to eliminate multiple identical alignments + + SiteScore ss=null; + final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; + final boolean inbounds=(site2>=0 && site3=offsets[col]){ +// a2=a-offsets[col]; +// +// assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : +// "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ +// ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; +// }else{ +// int ch=numberToChrom(a, baseChrom); +// int st=numberToSite(a); +// int st2=Tools.max(st-offsets[col], 0); +// a2=toNumber(st2, ch); +// +// assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : +// "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ +// ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", 
offsets["+col+"]="+offsets[col]; +// } +// +// assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : +// "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ +// ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; +// +// t2.site=a2; +// values[col]=a2; +// if() +// heap.add(t2); +// }else if((heap.size()+active.size())=prevMaxHits); + + final int baseChrom=baseChrom(baseChrom_); + final Block b=index[baseChrom]; + final int[] sizes=sizeArray; + + heap.clear(); + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + int topQscore=-999999999; + + int maxHits=0; +// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + + + int approxHitsCutoff; + final int indelCutoff; + if(perfectOnly){ + approxHitsCutoff=numHits; + indelCutoff=0; + }else{ + approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy + indelCutoff=MAX_INDEL2; + } + + + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + {//Inner loop + final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? 
x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); + + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + if(qscore>topQscore){ + +// maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan + + topQscore=qscore; + + if(qscore>=maxQuickScore){ + assert(qscore==maxQuickScore); + assert(approxHits==numHits); + if(earlyExit){ + return new int[] {topQscore, maxHits}; + } + } + } + } + + while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements + final Quad t2=heap.poll(); + final int row=t2.row+1, col=t2.column; + if(row=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", 
site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + }else if(earlyExit && (perfectOnly || heap.size()b ? a-b : b-a; + } + + + final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){ + + if(useQuality){ + //These lines apparently MUST be used if quality is used later on for slow align. + if(USE_AFFINE_SCORE){return msa.maxQuality(baseScores);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5)+Tools.sum(baseScores);} + }else{ + if(USE_AFFINE_SCORE){return msa.maxQuality(readlen);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);} + } + + return maxQuickScore(offsets, keyScores); + } + + + public final int maxQuickScore(int[] offsets, int[] keyScores){ + +// int x=offsets.length*BASE_KEY_HIT_SCORE; + int x=Tools.intSum(keyScores); + int y=Y_SCORE_MULT*(offsets[offsets.length-1]-offsets[0]); +// if(ADD_LIST_SIZE_BONUS){x+=(LIST_SIZE_BONUS[1]*offsets.length);} +// assert(!ADD_SCORE_Z) : "Need to make sure this is correct..."; + +// if(ADD_SCORE_Z){x+=((offsets[offsets.length-1]+CHUNKSIZE)*Z_SCORE_MULT);} + if(ADD_SCORE_Z){x+=maxScoreZ(offsets);} + + return x+y; +// int bonus=(2*(HIT_SCORE/2)); //For matching both ends +// return x+y+bonus; + } + + + private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[], + int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){ + + hist_hits_score[Tools.min(HIT_HIST_LEN, numApproxHits)]++; + if(numApproxHits==1){return keyScores[centerIndex];} + + //Done! + //Correct way to calculate score: + //Find the first chunk that exactly hits the center. + //Then, align leftward of it, and align rightward of it, and sum the scores. 
+ + //"-centerIndex" is a disambiguating term that, given otherwise identical match patterns + //(for example, a small indel will generate two valid site candidates), choose the lower site. + + int x=keyScores[centerIndex]+scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels)+ + scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits)-centerIndex; + + int y=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets); + if(ADD_LIST_SIZE_BONUS){x+=calcListSizeBonus(sizes[centerIndex]);} +// int z=scoreZ(locs, hits); + return x+y; + } + + +// /** Generates a term that increases score with how many bases in the read match the ref. */ +// private static final int scoreZ(int[] locs, int centerIndex, int offsets[]){ +// final int center=locs[centerIndex]; +// +// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE]; +// +// final int maxLoc=center+MAX_INDEL2; +// final int minLoc=Tools.max(0, center-MAX_INDEL); +// +// int score=0; +// +// for(int i=0; i=minLoc && loc<=maxLoc){ +//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); +// +// int offset=offsets[i]; +// int max=CHUNKSIZE+offset; +// +// for(int j=offset; jloc){ +// refLoc[j]=loc; +// score-=2; +// }else if(old==loc){ +// score-=1; +// //do nothing, perhaps, or add 1? +// }else{ +// score-=2; +// assert(old=0 && rloc>=0 && rloc=0){ + score+=BASE_HIT_SCORE+baseScores[i]; + if(loc==centerLoc){score+=centerBonus;} + if(loc!=lastLoc && lastLoc>=0){ + int dif=absdif(loc, lastLoc); + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; + } + lastLoc=loc; + } + } + +// System.err.println("Extended score: "+score); +// System.err.println(Arrays.toString(locArray)); + + + return score; + } + + + /** NOTE! This destroys the locArray, so use a copy if needed. 
*/ + private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){ + int gaps=0; + boolean doSort=false; + + if(locArray[0]<0){locArray[0]=minLoc;} + for(int i=1; i=0); + if(dif>minGap){ + gaps++; + } + } + if(gaps<1){return null;} + int[] out=new int[2+gaps*2]; + out[0]=locArray[0]; + out[out.length-1]=locArray[locArray.length-1]; + + for(int i=1, j=1; i=0); + if(dif>minGap){ + out[j]=locArray[i-1]; + out[j+1]=locArray[i]; + j+=2; + } + } + return out; + } + + + /** Generates a term that increases score with how many bases in the read match the ref. */ + private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){ + + if(numApproxHits==1){return SCOREZ_1KEY;} + + final int center=locs[centerIndex]; + + final int maxLoc=center+MAX_INDEL2; + final int minLoc=Tools.max(0, center-MAX_INDEL); + + int score=0; + + int a0=-1, b0=-1; + + for(int i=0; i=minLoc && loc<=maxLoc){ +// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); + int a=offsets[i]; + + if(b0=0){ + prev=loc; + loc=locs[i]; + + int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==locs.length-1){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; +// score-=(INDEL_PENALTY+Tools.min(INDEL_PENALTY_MULT*offset, 1+HIT_SCORE/4)); + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){ + + callsToScore++; + + int score=0; + + int prev, loc=locs[centerIndex]; + + for(int i=centerIndex-1; i>=0; i--){ + + if(locs[i]>=0){ + prev=loc; + loc=locs[i]; + + 
int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==0){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + /** Encode a (location, chrom) pair to an index */ + private static final int toNumber(int site, int chrom){ + int out=(chrom&CHROM_MASK_LOW); + out=out<=0 && f<1); + FRACTION_GENOME_TO_EXCLUDE=f; + MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + } + + + /** Default .75. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */ + static final float HIT_FRACTION_TO_RETAIN=0.85f; //default: .8 + /** Range: 0 to 1000. Lower should be faster and less accurate. */ + static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + /** Range: 2 to infinity. Lower should be faster and less accurate. 
*/ + static final int MIN_HIT_LISTS_TO_RETAIN=6; + + static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + //lower is faster + static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + //lower is faster + static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + //lower is faster + static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + + /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */ + public static final int SMALL_GENOME_LIST=20; + + static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";} + + static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy. + + /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. */ + static final int CLUMPY_MIN_LENGTH_INDEX=2000; + static final float CLUMPY_FRACTION=0.75f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy. 
+ + static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */ + public static final int MAX_HITS_REDUCTION1=0; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */ + public static int MAX_HITS_REDUCTION2=2; //default 1; higher is more accurate (more mapping and less FP) but slower + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */ + public static final int MAX_HITS_REDUCTION_PERFECT=0; + + public static int MAXIMUM_MAX_HITS_REDUCTION=3; + public static int HIT_REDUCTION_DIV=5; + + private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$ + assert(keys>=hits) : keys+", "+hits; + assert(hits>=0); + + int mahtk=MIN_APPROX_HITS_TO_KEEP; + if(SEMIPERFECTMODE || PERFECTMODE){ + if(keys==1){return 1;} + else if(MIN_APPROX_HITS_TO_KEEP=0); + int r=hits-reduction; + + r=Tools.max(mahtk, currentCutoff, r); + + if(perfect){ + r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT); + } + return r; + } + + public static final boolean USE_SLOWALK3=true && USE_EXTENDED_SCORE; + public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed + public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast. + public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.15f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT=0.025f; //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT2=0.1f; + static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.84f : USE_EXTENDED_SCORE ? 
.84f : 0.6f); //Default .85f; lower is more accurate + static final float DYNAMIC_QSCORE_THRESH=0.6f; //default .58f + static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.8f; //***$ + static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.95f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false + static{ + assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1); + assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1); + } + + +} diff --git a/current/align2/BBIndex5.java b/current/align2/BBIndex5.java new file mode 100755 index 0000000..eb861e7 --- /dev/null +++ b/current/align2/BBIndex5.java @@ -0,0 +1,2647 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; + +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; + + +/** + * Based on Index11f + * Index stored in single array per block. + * Supports 32-bit unsigned index. + * + * @author Brian Bushnell + * @date Jan 3, 2013 + * + */ +public final class BBIndex5 extends AbstractIndex { + + + public static void main(String[] args){ + + int k=13; + + for(int i=0; iData.numChroms){maxChrom=Data.numChroms;} + assert(minChrom<=maxChrom); + Data.sysout.println("Loading index for chrom "+minChrom+"-"+maxChrom+", genome "+Data.GENOME_BUILD); + index=IndexMaker5.makeIndex(Data.GENOME_BUILD, minChrom, maxChrom, + k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, writeToDisk, diskInvalid, index); + + } + + /** Calculate statistics of index, such as list lengths, and find clumpy keys */ + public static final synchronized void analyzeIndex(int minChrom, int maxChrom, boolean cs, float fractionToExclude, int k){ + + assert(lengthHistogram==null); + assert(COUNTS==null); + + int KEYSPACE=1<<(2*k); + COUNTS=new int[KEYSPACE]; + + maxChrom=maxChrom(maxChrom); + + for(int key=0; keyCLUMPY_MIN_LENGTH_INDEX && clumps>(CLUMPY_FRACTION*len)){ + COUNTS[key]=0; + 
COUNTS[rkey]=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + Block b=index[chrom]; + final int[] sites=b.sites; + sites[b.starts[key]]=-1; + sites[b.starts[rkey]]=-1; + } + } + +// System.err.println("COUNTS["+key+"] = "+COUNTS[key]+", COUNTS["+rkey+"] = "+COUNTS[rkey]); + } + } + + lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); + + if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} + + if(REMOVE_FREQUENT_GENOME_FRACTION){ + + int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); + int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); + + MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); + MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); + + if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} + } + + Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); + if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} + if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} + assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; + } + + + /** Returns the filename for the block holding this chrom */ + public static final String fname(int chrom, int k){ + return IndexMaker5.fname(minChrom(chrom), maxChrom(chrom), k, NUM_CHROM_BITS, COLORSPACE); + } + + /** Ensure key offsets are strictly ascending. 
*/ + private static boolean checkOffsets(int[] offsets){ + for(int i=1; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; ilimit || sum/finalHitCount>limit2; i--){ + Pointer p=ptrs[i]; + sum-=hits[p.key].length; + hits[p.key]=null; + finalHitCount--; + } + + return finalHitCount; + } + + /** Remove least useful keys to accelerate search */ + private final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){ + + float[] keyWeights=getKeyWeightArray(keyScores.length); + for(int i=0; ilimitS){hits[i]=null;} +// } + + final int[] lengths=getGenericArray(keys.length); + + for(int i=0; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; i=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ + final int[] lists=getGreedyListArray(hitsCount); + for(int i=0, j=0; j0){ + lists[j]=i; + j++; + } + } + + Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn); + int worstIndex=greedyReturn[0]; + int worst=lists[worstIndex]; + worstValue=greedyReturn[1]; + sum-=lengths[worst]; + +// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]0 || lengths[worst]=0){ + final int len=count(key); + if(len>0 && len0){ + starts[i]=b.starts[key]; + stops[i]=starts[i]+len2; + numHits++; + } + } + } + } + return numHits; + } + + + private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){ + int numHits=0; + for(int i=0; i=0){ + final int len=count(key); + if(len>0 && len findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){ + assert(minChrom<=maxChrom && minChrom>=0); + ArrayList result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id); + if(DOUBLE_SEARCH_NO_HIT && (result==null || result.isEmpty())){result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);} + + return result; + } + + + public 
final ArrayList find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){ + + assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); + final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN, COLORSPACE); + int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length); + + initialKeys+=offsetsP.length; + initialKeyIterations++; + + final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2); + + int numHits=0; + numHits=countHits(keysP, maxLen, true); + if(numHits>0){ //TODO: Change these to higher numbers + int trigger=(3*keysP.length)/4; + if(numHits<4 && numHitsMIN_APPROX_HITS_TO_KEEP){ + int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits); + + int zeroes=keysP.length-numHits; + int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1)); + cutoffIndex=Tools.max(cutoffIndex, altMinIndex); + + assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits; + + if(cutoffIndex<(keysP.length-1)){ + int[] lens=getGenericArray(keysP.length); + for(int i=0; icutoff){ + keysP[i]=-1; + removed++; + numHits--; + } + } + } + } +// assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + final ArrayList result=new ArrayList(8); + if(numHits=5); + + int[][] prescanResults=null; + int[] precounts=null; + int[] prescores=null; + + int hitsCutoff=0; + int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore); + + boolean allBasesCovered=true; + { + if(offsetsP[0]!=0){allBasesCovered=false;} + else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;} + else{ + for(int i=1; ioffsetsP[i-1]+KEYLEN){ + allBasesCovered=false; + break; + } + } + } + } + + //TODO I don't understand this logic + final boolean pretendAllBasesAreCovered=(allBasesCovered || + keysP.length>=keysOriginal.length-4 || + (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f)))); + +// 
System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP)); +// assert(allBasesCovered); + + if(prescan_qscore){ + prescanResults=prescanAllBlocks(bestScores, + keysP, keyScoresP, offsetsP, + keysM, keyScoresM, offsetsM, + pretendAllBasesAreCovered); + + if(prescanResults!=null){ + precounts=prescanResults[0]; + prescores=prescanResults[1]; + } + + if(bestScores[1]=maxQuickScore && pretendAllBasesAreCovered){ + assert(bestScores[3]==maxQuickScore); + assert(bestScores[1]==numHits); + + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true); + qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT)); + }else{ + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false); + qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH)); + } + } + + final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true); + final boolean fullyDefined=AminoAcid.isFullyDefined(basesP); + assert(bestScores[2]<=0) : Arrays.toString(bestScores); + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS, + offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + if(QUIT_AFTER_TWO_PERFECTS){ + if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human + } + cycle++; + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS, + offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + if(QUIT_AFTER_TWO_PERFECTS){ + if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human + } + cycle++; 
+ } + + assert(Read.CHECKSITES(result, basesP, basesM, id)); //TODO: Comment out once checked + + return result; + } + + /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. */ + private final int[][] prescanAllBlocks(int[] bestScores, + int[] keysP, int[] keyScoresP, int[] offsetsP, + int[] keysM, int[] keyScoresM, int[] offsetsM, + final boolean allBasesCovered){ + + int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; + + int bestqscore=0; + int maxHits=0; + int minHitsToScore=MIN_APPROX_HITS_TO_KEEP; + + final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); + + final int[] counts=precountArray; + final int[] scores=prescoreArray; + final int[][] ret=prescanReturn; + Arrays.fill(counts, keysP.length); + Arrays.fill(scores, maxQuickScore); + ret[0]=counts; + ret[1]=scores; + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + final int baseChrom=baseChrom(chrom); + for(int pmi=0; pmi<2; pmi++, cycle++){ + + int[] keys=pm[pmi][0]; + int[] keyScores=pm[pmi][1]; + int[] offsets=pm[pmi][2]; +// int[][] hits=getHitArray(offsets.length); + + int[] starts=startArray; + int[] stops=stopArray; + final int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + + if(numHits=maxQuickScore && allBasesCovered); + + scores[cycle]=temp[0]; + counts[cycle]=temp[1]; + + bestqscore=Tools.max(temp[0], bestqscore); + maxHits=Tools.max(maxHits, temp[1]); + if(bestqscore>=maxQuickScore && allBasesCovered){ + assert(bestqscore==maxQuickScore); + assert(maxHits==keysP.length) : + "\nTemp: \t"+Arrays.toString(temp)+", cycle="+cycle+"\n" + + "Scores: \t"+Arrays.toString(scores)+ + "Counts: \t"+Arrays.toString(counts)+ + "bestqscore: \t"+bestqscore+ + "maxHits: \t"+maxHits+ + "maxQuickScore: \t"+maxQuickScore+ + "numHits: \t"+numHits+ + "minHitsToScore: \t"+minHitsToScore+ + "keys.length: \t"+keys.length; + + 
minHitsToScore=Tools.max(minHitsToScore, maxHits); + + { + //This early exit is optional. Does not seem to impact speed much either way. + bestScores[1]=Tools.max(bestScores[1], maxHits); + bestScores[3]=Tools.max(bestScores[3], bestqscore); + return ret; + } + } + } + } + } + + bestScores[1]=Tools.max(bestScores[1], maxHits); + bestScores[3]=Tools.max(bestScores[3], bestqscore); + + if(!RETAIN_BEST_QCUTOFF){bestScores[2]=-9999;} + + return ret; + } + + + /** Search a single block and strand */ + public final ArrayList find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores, + final int chrom, final byte strand, + int[] offsets, final boolean obeyLimits, ArrayList ssl, int[] bestScores, + final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + int[] starts=startArray; + int[] stops=stopArray; + + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + if(numHits=0){numHits++;} + } + + if(numHits==offsets.length){ + return null; + }else{ + int[][] r=shrinkReturn3; + int[] starts2=startArray; + int[] stops2=stopArray; + int[] offsets2=getOffsetArray(numHits); + int[] keyScores2=new int[numHits]; + + for(int i=0, j=0; i=0){ + starts2[j]=starts[i]; + stops2[j]=stops[i]; + offsets2[j]=offsets[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + r[0]=starts2; + r[1]=stops2; + r[2]=offsets2; + r[4]=keyScores2; + return r; + } + } + + /** Removes "-1" keys. 
*/ + private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){ + + + int numHits=0; + for(int i=0; i=0){numHits++;} + } + + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + if(numHits==keys.length){ + return null; + }else{ + int[][] r=shrinkReturn2; + int[] offsets2=getOffsetArray(numHits); + assert(offsets2!=offsets); + assert(offsets2.length=0){ + offsets2[j]=offsets[i]; + keys2[j]=keys[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + assert(checkOffsets(offsets2)) : "\nnumHits="+numHits+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(offsets2)+"\n"+ + "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keys2)+"\n"; + r[0]=offsets2; + r[1]=keys2; + r[2]=keyScores2; + return r; + } + } + + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk2(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, final boolean fullyDefined){ + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + +//// System.out.println("After SHRINK_BEFORE_WALK: numHits = "+hits.length); +// Block b=index[baseChrom_]; +// int[][] hits=b.getHitLists(starts, stops); +// if(SHRINK_BEFORE_WALK){ +// Object[] r=shrink(hits, offsets, keyScores); +// if(r!=null){ +// hits=(int[][])r[0]; +// offsets=(int[])r[1]; +// keyScores=(int[])r[3]; +// } +// } +// +// final int numHits=hits.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and 
etc. + final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true); +// final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets)); +// System.err.println("maxScore = "+maxScore); + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); +// final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f)); +// final int minScore=(int)(MIN_SCORE_MULT*maxScore); +// System.err.println("minScore = "+minScore); + + final int baseChrom=baseChrom(baseChrom_); + + +// final PriorityQueue heap=new PriorityQueue(numHits); + heap.clear(); +// final Quad64[] triples=new Quad64[numHits]; + final Quad64[] triples=tripleStorage; + + final Block b=index[baseChrom]; + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad64 t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=((long)a2)&0xFFFFFFFFL; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=-999999999; + + int cutoff=minScore; + + int maxHits=0; + int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println(); + + SiteScore 
prevSS=null; + while(!heap.isEmpty()){ + Quad64 t=heap.peek(); + final int site=(int)t.site; //*** TODO + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + + {//Inner loop + final int minsite=subUnsigned(site, MAX_INDEL); + final int maxsite=addUnsigned(site, MAX_INDEL2); + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==(int)triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int score; + + int mapStart=site, mapStop=maxNearbySite; + + if(USE_EXTENDED_SCORE){ + final int chrom=numberToChrom(site, baseChrom); + score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); + if(true/*USE_AFFINE_SCORE*/){ + //Correct begin and end positions if they changed. + int min=Integer.MAX_VALUE; + int max=Integer.MIN_VALUE; + for(int i=0; i-1){ + if(xmax){max=x;} + } + } + +// assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 
+ if(min<0 || max<0){ + //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ + chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ + Arrays.toString(locArray)+"\n"+ + Arrays.toString(values)+"\n"+ + new String(bases)+"\nstrand="+strand+"\n"); + System.err.println(); + } + score=-99999; + } + + + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + + +// System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+ +// ", mapStart="+mapStart+", mapStop="+mapStop); + +// if(chrom==17 && absdif(min, 30354420)<2000){ +// System.err.println("\n*****\n"); +// System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+ +// " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+ +// ", mapStart="+mapStart+", mapStop="+mapStop); +// System.err.println(); +// System.err.println(Arrays.toString(locArray)); +// System.err.println(); +// System.err.println("chrom="+chrom); +// System.err.println("score="+score); +// } + } + }else{ + score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + score+=scoreZ; + } + } + + +// score=score(values, centerIndex, offsets, hits); +// if(ADD_SCORE_Z){ +// int scoreZ=scoreZ2(values, centerIndex, offsets); +// score+=scoreZ; +// } +// +// if(USE_EXTENDED_SCORE){ +// if(score>minQuickScore){ +//// System.out.println(score+" > "+minQuickScore); +// score=extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex, locArray); +// }else{ +//// System.out.print("."); +// score=-1; +// } +// } + + +// System.err.println("maxScore = "+maxScore); +// System.err.println("hits = "+approxHits+" / "+approxHitsCutoff); +// System.err.println("score = "+score+" / "+cutoff); + + 
if(score>=cutoff){ + +// System.err.println("Passed!"); + +// System.out.println("approxHits="+approxHits+" / "+approxHitsCutoff); +// System.out.println("score="+score+" / "+cutoff); +// System.out.println("strand="+Gene.strandCodes[strand]); +// System.out.println("center="+values[centerIndex]); +// System.out.println("values="+Arrays.toString(values)); +// extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex); +// System.out.println(); + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + + // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH)); + if(USE_EXTENDED_SCORE && score>=maxScore){ + cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + +// final int chrom=numberToChrom(site, baseChrom); +// final int site2=numberToSite(site); +// final int site3=numberToSite(maxNearbySite)+read.length; + + final int chrom=numberToChrom(mapStart, baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(NUM_CHROM_BITS==0 || site2=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + 
"baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=((long)a2)&0xFFFFFFFFL; + values[col]=a2; + heap.add(t2); + } + if(heap.isEmpty()){break;} + } + + } + + return ssl; + } + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk3(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, + int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + assert(USE_EXTENDED_SCORE); + + final int numKeys=offsets.length; //Before shrink + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + usedKeys+=numHits; + usedKeyIterations++; + + final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? 
(int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); + final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); + + final int baseChrom=baseChrom(baseChrom_); + + heap.clear(); + + final Quad64[] triples=tripleStorage; + + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); + final Block b=index[baseChrom]; + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=bestScores[0]; + int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); + + int qcutoff=Tools.max(bestScores[2], minQuickScore); + int bestqscore=bestScores[3]; + int maxHits=bestScores[1]; + int perfectsFound=bestScores[5]; + assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits; + int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); + if(approxHitsCutoff>numHits){return ssl;} + + final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); + + + assert(USE_EXTENDED_SCORE); + + if(currentTopScore>=maxScore){ + assert(currentTopScore==maxScore); + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + } + + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad64 t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=((long)a2)&0xFFFFFFFFL; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// 
"cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); + + + SiteScore prevSS=null; + while(!heap.isEmpty()){ + Quad64 t=heap.peek(); + final int site=(int)t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + + {//Inner loop + final int minsite=subUnsigned(site, MAX_INDEL); + final int maxsite=addUnsigned(site, MAX_INDEL2); + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==(int)triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int score; + int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + int mapStart=site, mapStop=maxNearbySite; + + assert(USE_EXTENDED_SCORE); + + boolean locArrayValid=false; + if(qscore-1){ + if(xmax){max=x;} + } + } + + if(score>=maxScore){ + assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + + // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 
+ if(min<0 || max<0){ + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ + chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ + Arrays.toString(locArray)+"\n"+ + Arrays.toString(values)+"\n"+ + new String(bases)+"\nstrand="+strand+"\n"); + System.err.println(); + } + score=-99999; + } + + //mapStart and mapStop are indices + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + if(score>=maxScore){ + assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + } + + if(score==maxScore){ + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); + } + + if(score>=cutoff){ + qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH)); + bestqscore=Tools.max(qscore, bestqscore); + } + } + +// System.err.println("maxScore = "+maxScore); +// System.err.println("hits = "+approxHits+" / "+approxHitsCutoff); +// System.err.println("score = "+score+" / "+cutoff); + + if(score>=cutoff){ + +// System.err.println("Passed!"); + +// System.out.println("approxHits="+approxHits+" / "+approxHitsCutoff); +// System.out.println("score="+score+" / "+cutoff); +// System.out.println("strand="+Gene.strandCodes[strand]); +// System.out.println("center="+values[centerIndex]); +// System.out.println("values="+Arrays.toString(values)); +// extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex); +// System.out.println(); + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + 
if(score>=maxScore){ + assert(USE_EXTENDED_SCORE); + cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + + final int chrom=numberToChrom(mapStart, baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(NUM_CHROM_BITS==0 || site2 "+site2); +// System.err.println(mapStop+" -> "+site3); + + assert(gapArray[0]>=site2 && gapArray[0]-site2 "+site2+"\n"+ + mapStop+" -> "+site3+"\n\n"+ + Arrays.toString(gapArray)+"\n\n"+ +// Arrays.toString(clone)+"\n\n"+ + Arrays.toString(locArray)+"\n"+ + "numHits="+numHits+", "+ + "heap.size="+heap.size()+", "+ + "numHits="+numHits+", "+ + "approxHits="+approxHits+"\n"; + gapArray[0]=Tools.min(gapArray[0], site2); + gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); + } + if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} +// assert(false) : Arrays.toString(locArray); + } + + + //This block is optional, but tries to eliminate multiple identical alignments + + SiteScore ss=null; + final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; + final boolean inbounds=(site2>=0 && site3=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } 
+ + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=((long)a2)&0xFFFFFFFFL; + values[col]=a2; + heap.add(t2); + }else if(heap.size()=prevMaxHits); + + final int baseChrom=baseChrom(baseChrom_); + final Block b=index[baseChrom]; + final int[] sizes=sizeArray; + + heap.clear(); + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad64 t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=((long)a2)&0xFFFFFFFFL; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + int topQscore=-999999999; + + int maxHits=0; +// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + + + int approxHitsCutoff; + final int indelCutoff; + if(perfectOnly){ + approxHitsCutoff=numHits; + indelCutoff=0; + }else{ + approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy + indelCutoff=MAX_INDEL2; + } + + + while(!heap.isEmpty()){ + Quad64 t=heap.peek(); + final int site=(int)t.site; //*** TODO + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + + {//Inner loop + final int minsite=subUnsigned(site, MAX_INDEL); + final int maxsite=addUnsigned(site, MAX_INDEL2); + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==(int)triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? 
x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); + + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + if(qscore>topQscore){ + +// maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan + + topQscore=qscore; + + if(qscore>=maxQuickScore){ + assert(qscore==maxQuickScore); + assert(approxHits==numHits); + if(earlyExit){ + return new int[] {topQscore, maxHits}; + } + } + } + } + + while(site==(int)heap.peek().site){ //Remove all identical elements, and add subsequent elements + final Quad64 t2=heap.poll(); + final int row=t2.row+1, col=t2.column; + if(row=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", 
a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=((long)a2)&0xFFFFFFFFL; + values[col]=a2; + heap.add(t2); + }else if(earlyExit && (perfectOnly || heap.size()b ? a-b : b-a; + return (a<0 == b<0) ? a>b ? a-b : b-a : Integer.MAX_VALUE; + } + + + final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){ + + if(useQuality){ + //These lines apparently MUST be used if quality is used later on for slow align. + if(USE_AFFINE_SCORE){return msa.maxQuality(baseScores);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5)+Tools.sum(baseScores);} + }else{ + if(USE_AFFINE_SCORE){return msa.maxQuality(readlen);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);} + } + + return maxQuickScore(offsets, keyScores); + } + + + public final int maxQuickScore(int[] offsets, int[] keyScores){ + +// int x=offsets.length*BASE_KEY_HIT_SCORE; + int x=Tools.intSum(keyScores); + int y=Y_SCORE_MULT*(offsets[offsets.length-1]-offsets[0]); +// if(ADD_LIST_SIZE_BONUS){x+=(LIST_SIZE_BONUS[1]*offsets.length);} +// assert(!ADD_SCORE_Z) : "Need to make sure this is correct..."; + +// if(ADD_SCORE_Z){x+=((offsets[offsets.length-1]+CHUNKSIZE)*Z_SCORE_MULT);} + if(ADD_SCORE_Z){x+=maxScoreZ(offsets);} + + return x+y; +// int bonus=(2*(HIT_SCORE/2)); //For matching both ends +// return x+y+bonus; + } + + + private final int quickScore(final int[] values, final int[] keyScores, final int centerIndex, final int offsets[], + int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){ + + if(numApproxHits==1){return keyScores[centerIndex];} + + //Done! + //Correct way to calculate score: + //Find the first chunk that exactly hits the center. + //Then, align leftward of it, and align rightward of it, and sum the scores. 
+ + //"-centerIndex" is a disambiguating term that, given otherwise identical match patterns + //(for example, a small indel will generate two valid site candidates), choose the lower site. + + int x=keyScores[centerIndex]+scoreLeft(values, keyScores, centerIndex, sizes, penalizeIndels)+ + scoreRight(values, keyScores, centerIndex, sizes, penalizeIndels, numHits)-centerIndex; + + int y=Y_SCORE_MULT*scoreY(values, centerIndex, offsets); + if(ADD_LIST_SIZE_BONUS){x+=calcListSizeBonus(sizes[centerIndex]);} +// int z=scoreZ(locs, hits); + return x+y; + } + + +// /** Generates a term that increases score with how many bases in the read match the ref. */ +// private static final int scoreZ(int[] locs, int centerIndex, int offsets[]){ +// final int center=locs[centerIndex]; +// +// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE]; +// +// final int maxLoc=center+MAX_INDEL2; +// final int minLoc=Tools.max(0, center-MAX_INDEL); +// +// int score=0; +// +// for(int i=0; i=minLoc && loc<=maxLoc){ +//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); +// +// int offset=offsets[i]; +// int max=CHUNKSIZE+offset; +// +// for(int j=offset; jloc){ +// refLoc[j]=loc; +// score-=2; +// }else if(old==loc){ +// score-=1; +// //do nothing, perhaps, or add 1? +// }else{ +// score-=2; +// assert(old=0 && rloc>=0 && rloc=0); + if(dif>minGap){ + gaps++; + } + } + if(gaps<1){return null;} + int[] out=new int[2+gaps*2]; + out[0]=locArray[0]; + out[out.length-1]=locArray[locArray.length-1]; + + for(int i=1, j=1; i=0); + if(dif>minGap){ + out[j]=locArray[i-1]; + out[j+1]=locArray[i]; + j+=2; + } + } + return out; + } + + + /** Generates a term that increases score with how many bases in the read match the ref. 
*/ + private final int scoreZ2(int[] values, int centerIndex, int offsets[], int numApproxHits, int numHits){ + + if(numApproxHits==1){return SCOREZ_1KEY;} + + final int center=values[centerIndex]; + + final int maxLoc=center+MAX_INDEL2; + final int minLoc=Tools.max(0, center-MAX_INDEL); + + + +// final int minVal=subUnsigned(centerVal, MAX_INDEL); +// final int maxVal=addUnsigned(centerVal, MAX_INDEL2); + + int score=0; + + int a0=-1, b0=-1; + + for(int i=0; i=minLoc && loc<=maxLoc){ +// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); + int a=offsets[i]; + + if(b0=0; i--){ + + if(values[i]!=-1){ + prev=loc; + loc=values[i]; + + int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==0){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + /** Encode a (location, chrom) pair to an index */ + private static final int toNumber(int site, int chrom){ + int out=(chrom&CHROM_MASK_LOW); + out=out<=0); //max is 3 for human; perhaps more for other organisms +// assert((1<<(NUM_CHROM_BITS))>=CHROMSPERBLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMSPERBLOCK; + assert((1<<(NUM_CHROM_BITS))==CHROMS_PER_BLOCK) : (1<<(NUM_CHROM_BITS))+" < "+CHROMS_PER_BLOCK; + assert(Integer.bitCount(CHROMS_PER_BLOCK)==1); + assert(Integer.numberOfLeadingZeros(SITE_MASK)==(NUM_CHROM_BITS)) : Integer.toHexString(SITE_MASK); + } + + private final int cycles; + + public static final int BASE_HIT_SCORE=100; + public static final int ALIGN_COLUMNS=3000; + public static int MAX_INDEL=16000; //Max indel length, min 0, default 400; longer is more accurate + public 
static int MAX_INDEL2=2*MAX_INDEL; + + private final float INV_BASE_KEY_HIT_SCORE; + private final int INDEL_PENALTY; //default (HIT_SCORE/2)-1 + private final int INDEL_PENALTY_MULT; //default 20; penalty for indel length + private final int MAX_PENALTY_FOR_MISALIGNED_HIT; + private final int SCOREZ_1KEY; + + public static final boolean GENERATE_KEY_SCORES_FROM_QUALITY=true; //True: Much faster and more accurate. + public static final boolean GENERATE_BASE_SCORES_FROM_QUALITY=true; //True: Faster, and at least as accurate. + public static final boolean ADD_SCORE_Z=true; //Increases quality, decreases speed + public static final int Z_SCORE_MULT=20; + public static final int Y_SCORE_MULT=10; + + + /** + * Return only sites that match completely or with partial no-reference + */ + public static void setSemiperfectMode() { + assert(!PERFECTMODE); + SEMIPERFECTMODE=true; + PRESCAN_QSCORE=false; +// MIN_APPROX_HITS_TO_KEEP++; + + + + MAX_INDEL=0; + MAX_INDEL2=0; + } + + /** + * Return only sites that match completely + */ + public static void setPerfectMode() { + assert(!SEMIPERFECTMODE); + PERFECTMODE=true; + PRESCAN_QSCORE=false; +// MIN_APPROX_HITS_TO_KEEP++; + + + + MAX_INDEL=0; + MAX_INDEL2=0; + } + + static float FRACTION_GENOME_TO_EXCLUDE=0.03f; //Default .03; lower is slower and more accurate. For perfect reads and small genomes, lower is FASTER. + + public static final void setFractionToExclude(float f){ + assert(f>=0 && f<1); + FRACTION_GENOME_TO_EXCLUDE=f; + MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + } + + + /** Default .75. 
Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */ + static final float HIT_FRACTION_TO_RETAIN=0.85f; //default: .8 + /** Range: 0 to 1000. Lower should be faster and less accurate. */ + static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + /** Range: 2 to infinity. Lower should be faster and less accurate. */ + static final int MIN_HIT_LISTS_TO_RETAIN=6; + + static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + //lower is faster + static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + //lower is faster + static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + //lower is faster + static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + + /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */ + public static final int SMALL_GENOME_LIST=20; + + static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";} + + static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy. + + /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. */ + static final int CLUMPY_MIN_LENGTH_INDEX=2000; + static final float CLUMPY_FRACTION=0.75f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy. 
+ + static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */ + public static final int MAX_HITS_REDUCTION1=0; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */ + public static int MAX_HITS_REDUCTION2=2; //default 1; higher is more accurate (more mapping and less FP) but slower + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */ + public static final int MAX_HITS_REDUCTION_PERFECT=0; + + public static int MAXIMUM_MAX_HITS_REDUCTION=3; + public static int HIT_REDUCTION_DIV=5; + + private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$ + assert(keys>=hits) : keys+", "+hits; + assert(hits>=0); + + int mahtk=MIN_APPROX_HITS_TO_KEEP; + if(SEMIPERFECTMODE || PERFECTMODE){ + if(keys==1){return 1;} + else if(MIN_APPROX_HITS_TO_KEEP=0); + int r=hits-reduction; + + r=Tools.max(mahtk, currentCutoff, r); + + if(perfect){ + r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT); + } + return r; + } + + public static final boolean USE_SLOWALK3=true && USE_EXTENDED_SCORE; + public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed + public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast. + public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.15f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT=0.025f; //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT2=0.1f; + static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.84f : USE_EXTENDED_SCORE ? 
.84f : 0.6f); //Default .85f; lower is more accurate + static final float DYNAMIC_QSCORE_THRESH=0.6f; //default .58f + static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.8f; //***$ + static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.95f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false + static{ + assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1); + assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1); + } + + +} diff --git a/current/align2/BBIndexAcc.java b/current/align2/BBIndexAcc.java new file mode 100755 index 0000000..00511cf --- /dev/null +++ b/current/align2/BBIndexAcc.java @@ -0,0 +1,2809 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; + + +/** + * Based on Index11a + * + * + * + * + * @author Brian Bushnell + * @date Jul 11, 2012 + * + */ +public final class BBIndexAcc extends AbstractIndex { + + + public static void main(String[] args){ + + int k=13; + + for(int i=0; iData.numChroms){maxChrom=Data.numChroms;} + assert(minChrom<=maxChrom); + Data.sysout.println("Loading index for chrom "+minChrom+"-"+maxChrom+", genome "+Data.GENOME_BUILD); + index=IndexMaker4.makeIndex(Data.GENOME_BUILD, minChrom, maxChrom, + k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, writeToDisk, diskInvalid, index); + } + + /** Calculate statistics of index, such as list lengths, and find clumpy keys */ + public static final synchronized void analyzeIndex(int minChrom, int maxChrom, boolean cs, float fractionToExclude, int k){ + assert(!cs) : "Re-enable old reverse complement mode."; + assert(lengthHistogram==null); + assert(COUNTS==null); + + int KEYSPACE=1<<(2*k); + COUNTS=new int[KEYSPACE]; + maxChrom=maxChrom(maxChrom); + + HashMap cmap=new HashMap(); + + for(int chrom=minChrom; chrom<=maxChrom; 
chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + Block b=index[chrom]; + final int[] sites=b.sites; + final int[] starts=b.starts; + + for(int key=0; key0 && dif<=CLUMPY_MAX_DIST){ + clumps++; + } + } + if(clumps>0){ + final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k)); + final Integer ko=x; + LongM lm=cmap.get(ko); + if(lm==null){ + lm=new LongM(0); + cmap.put(ko, lm); + } + lm.increment(clumps); + } + } + } + } + + for(int key=0; keyCLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){ + int rkey=AminoAcid.reverseComplementBinaryFast(key, k); + assert(key<=rkey); + assert(key==KeyRing.reverseComplementKey(rkey, k, cs)); + COUNTS[key]=0; + COUNTS[rkey]=0; + } + } + } + + lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); + + if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} + + if(REMOVE_FREQUENT_GENOME_FRACTION){ + + int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); + int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); + + MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); + MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); + + if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} + } + + Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); + if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} + if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} + assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; + } + +// /** Calculate statistics of index, such as list lengths, and find clumpy keys */ +// public static final synchronized void analyzeIndex(int minChrom, 
int maxChrom, boolean cs, float fractionToExclude, int k){ +// +// assert(lengthHistogram==null); +// assert(COUNTS==null); +// +// int KEYSPACE=1<<(2*k); +// COUNTS=new int[KEYSPACE]; +// +// maxChrom=maxChrom(maxChrom); +// +// for(int key=0; key0 && dif<=CLUMPY_MAX_DIST){ +// clumps++; +// } +// } +// +// for(int i=start2+1; i0 && dif<=CLUMPY_MAX_DIST){ +// clumps++; +// } +// } +// } +// +// } +// +// COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+len); +// if(key!=rkey){COUNTS[rkey]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[rkey]+len);} +// assert(COUNTS[key]==COUNTS[rkey]) : key+", "+rkey; +// +// if(REMOVE_CLUMPY && len>CLUMPY_MIN_LENGTH_INDEX && clumps>(CLUMPY_FRACTION*len)){ +// COUNTS[key]=0; +// COUNTS[rkey]=0; +// for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ +// Block b=index[chrom]; +// final int[] sites=b.sites; +// sites[b.starts[key]]=-1; +// sites[b.starts[rkey]]=-1; +// } +// } +// +//// System.err.println("COUNTS["+key+"] = "+COUNTS[key]+", COUNTS["+rkey+"] = "+COUNTS[rkey]); +// } +// } +// +// lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); +// +// if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} +// +// if(REMOVE_FREQUENT_GENOME_FRACTION){ +// +// int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); +// int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); +// +// MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); +// MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); +// +// if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} +// } +// +// Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); +// 
if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} +// if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} +// assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; +// } + + + /** Returns the filename for the block holding this chrom */ + public static final String fname(int chrom, int k){ + return IndexMaker4.fname(minChrom(chrom), maxChrom(chrom), k, NUM_CHROM_BITS, COLORSPACE); + } + + /** Ensure key offsets are strictly ascending. */ + private static boolean checkOffsets(int[] offsets){ + for(int i=1; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; ilimit || sum/finalHitCount>limit2; i--){ + Pointer p=ptrs[i]; + sum-=hits[p.key].length; + hits[p.key]=null; + finalHitCount--; + } + + return finalHitCount; + } + + /** Remove least useful keys to accelerate search */ + private final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){ + + float[] keyWeights=getKeyWeightArray(keyScores.length); + for(int i=0; ilimitS){hits[i]=null;} +// } + + final int[] lengths=getGenericArray(keys.length); + + for(int i=0; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; i=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ + final int[] lists=getGreedyListArray(hitsCount); + for(int i=0, j=0; j0){ + lists[j]=i; + j++; + } + } + + Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn); + int worstIndex=greedyReturn[0]; + int worst=lists[worstIndex]; + worstValue=greedyReturn[1]; + sum-=lengths[worst]; + +// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]0 || lengths[worst]=0){ + final int len=count(key); + if(len>0 && len0){ + starts[i]=b.starts[key]; + stops[i]=starts[i]+len2; + numHits++; + } + } + } + } + return numHits; + } + + + private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){ + int numHits=0; + for(int i=0; i=0){ + final 
int len=count(key); + if(len>0 && len findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){ + assert(minChrom<=maxChrom && minChrom>=0); + ArrayList result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id); + if(DOUBLE_SEARCH_NO_HIT && (result==null || result.isEmpty())){result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);} + + return result; + } + + + public final ArrayList find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){ + + assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); + final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN, COLORSPACE); + int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length); + + initialKeys+=offsetsP.length; + initialKeyIterations++; + + final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2); + + int numHits=0; + numHits=countHits(keysP, maxLen, true); + if(numHits>0){ //TODO: Change these to higher numbers + int trigger=(3*keysP.length)/4; + if(numHits<6 && numHitsMIN_APPROX_HITS_TO_KEEP){ + int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits); + + int zeroes=keysP.length-numHits; + int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1)); + cutoffIndex=Tools.max(cutoffIndex, altMinIndex); + + assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits; + + if(cutoffIndex<(keysP.length-1)){ + int[] lens=getGenericArray(keysP.length); + for(int i=0; icutoff){ + keysP[i]=-1; + removed++; + numHits--; + } + } + } + } +// assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + final ArrayList result=new ArrayList(8); + if(numHits\n"+Arrays.toString(offsetsM)); + } + final int[] keysM=(COLORSPACE ? 
KeyRing.makeKeys(basesM, offsetsM, KEYLEN, COLORSPACE) : KeyRing.reverseComplementKeys(keysP, KEYLEN, COLORSPACE)); + +// assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); +// assert(checkOffsets(offsetsM)) : Arrays.toString(offsetsM); + + assert(!USE_EXTENDED_SCORE || (baseScoresP!=null && (qual==null || baseScoresP.length==qual.length))); + assert(keyScoresP!=null); + assert(keyScoresP.length==offsetsP.length) : keyScoresP.length+", "+offsetsP.length+", "+Arrays.toString(keyScoresP); + final byte[] baseScoresM=Tools.reverseAndCopy(baseScoresP, getBaseScoreArray(baseScoresP.length, 1)); + final int[] keyScoresM=Tools.reverseAndCopy(keyScoresP, getKeyScoreArray(keyScoresP.length, 1)); + final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); + + assert(offsetsM.length==offsetsP.length); + assert(maxQuickScore==maxQuickScore(offsetsM, keyScoresM)); + + /* + * bestScores: + * + * bestScores[0] currentTopScore + * bestScores[1] maxHits + * bestScores[2] qcutoff + * bestScores[3] bestqscore + * bestScores[4] maxQuickScore + * bestScores[5] perfectsFound + */ + final int[] bestScores=new int[6]; + + //This prevents filtering by qscore when a low-quality read only uses a few keys. + //In that case, extending is more important. 
+ final boolean prescan_qscore=(PRESCAN_QSCORE && numHits>=5); + + int[][] prescanResults=null; + int[] precounts=null; + int[] prescores=null; + + int hitsCutoff=0; + int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore); + + boolean allBasesCovered=true; + { + if(offsetsP[0]!=0){allBasesCovered=false;} + else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;} + else{ + for(int i=1; ioffsetsP[i-1]+KEYLEN){ + allBasesCovered=false; + break; + } + } + } + } + + //TODO I don't understand this logic + final boolean pretendAllBasesAreCovered=//false; + (allBasesCovered || + keysP.length>=keysOriginal.length-4 || + (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f)))); + +// System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP)); +// assert(allBasesCovered); + + if(prescan_qscore){ + prescanResults=prescanAllBlocks(bestScores, + keysP, keyScoresP, offsetsP, + keysM, keyScoresM, offsetsM, + pretendAllBasesAreCovered); + + if(prescanResults!=null){ + precounts=prescanResults[0]; + prescores=prescanResults[1]; + } + + if(bestScores[1]=maxQuickScore && pretendAllBasesAreCovered){ + assert(bestScores[3]==maxQuickScore); + assert(bestScores[1]==numHits); + + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true); + qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT)); + }else{ + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false); + qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH)); + } + } + + final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true); + final boolean fullyDefined=AminoAcid.isFullyDefined(basesP); + assert(bestScores[2]<=0) : Arrays.toString(bestScores); //Note - I am not sure what this assertion does, or if it is valid for acc + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; 
chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS, + offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + if(QUIT_AFTER_TWO_PERFECTS){ + if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human + } + cycle++; + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS, + offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + if(QUIT_AFTER_TWO_PERFECTS){ + if(bestScores[5]>=2){break;} //if(bestScores[5]>=3 || (bestScores[5]>=2 && chrom<24)){break;} //for human + } + cycle++; + } + + assert(Read.CHECKSITES(result, basesP, basesM, id)); //TODO: Comment out once checked + + return result; + } + + /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. 
*/ + private final int[][] prescanAllBlocks(int[] bestScores, + int[] keysP, int[] keyScoresP, int[] offsetsP, + int[] keysM, int[] keyScoresM, int[] offsetsM, + final boolean allBasesCovered){ + + int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; + + int bestqscore=0; + int maxHits=0; + int minHitsToScore=MIN_APPROX_HITS_TO_KEEP; + + final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); + + final int[] counts=precountArray; + final int[] scores=prescoreArray; + final int[][] ret=prescanReturn; + Arrays.fill(counts, keysP.length); + Arrays.fill(scores, maxQuickScore); + ret[0]=counts; + ret[1]=scores; + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + final int baseChrom=baseChrom(chrom); + for(int pmi=0; pmi<2; pmi++, cycle++){ + + int[] keys=pm[pmi][0]; + int[] keyScores=pm[pmi][1]; + int[] offsets=pm[pmi][2]; +// int[][] hits=getHitArray(offsets.length); + + int[] starts=startArray; + int[] stops=stopArray; + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + + if(numHits find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores, + final int chrom, final byte strand, + int[] offsets, final boolean obeyLimits, ArrayList ssl, int[] bestScores, + final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + //Index of first location of each key + int[] starts=startArray; + //Index of first location of next key (i.e., (last location of key)+1) + int[] stops=stopArray; + + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + if(numHits=0){numHits++;} + } + + if(numHits==offsets.length){ + return null; + }else{ + int[][] r=shrinkReturn3; + int[] starts2=startArray; + int[] stops2=stopArray; + int[] offsets2=getOffsetArray(numHits); + int[] keyScores2=new int[numHits]; + + for(int i=0, j=0; i=0){ + starts2[j]=starts[i]; + 
stops2[j]=stops[i]; + offsets2[j]=offsets[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + r[0]=starts2; + r[1]=stops2; + r[2]=offsets2; + r[4]=keyScores2; + return r; + } + } + + /** Removes "-1" keys. */ + private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){ + + + int numHits=0; + for(int i=0; i=0){numHits++;} + } + + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + if(numHits==keys.length){ + return null; + }else{ + int[][] r=shrinkReturn2; + int[] offsets2=getOffsetArray(numHits); + assert(offsets2!=offsets); + assert(offsets2.length=0){ + offsets2[j]=offsets[i]; + keys2[j]=keys[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + assert(checkOffsets(offsets2)) : "\nnumHits="+numHits+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(offsets2)+"\n"+ + "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keys2)+"\n"; + r[0]=offsets2; + r[1]=keys2; + r[2]=keyScores2; + return r; + } + } + + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk2(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, final boolean fullyDefined){ + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. + final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true); +// final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets)); +// System.err.println("maxScore = "+maxScore); + +// final int minScore=(obeyLimits ? 
(int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); +// final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f)); +// final int minScore=(int)(MIN_SCORE_MULT*maxScore); +// System.err.println("minScore = "+minScore); + + final int baseChrom=baseChrom(baseChrom_); + + + heap.clear(); + final Quad[] triples=tripleStorage; + + final Block b=index[baseChrom]; + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=-999999999; + + int cutoff=minScore; + + int maxHits=0; + int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println(); + + SiteScore prevSS=null; + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + int approxHits=0; + + {//Inner loop + final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + 
if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + boolean locArrayValid=false; + if(approxHits>=approxHitsCutoff){ + + int score; + + int mapStart=site, mapStop=maxNearbySite; + + if(USE_EXTENDED_SCORE){ + final int chrom=numberToChrom(site, baseChrom); + if(verbose){ + System.err.println(new String(bases)); + System.err.println("numHits="+numHits+", approxHits="+approxHits+/*", keys="+numKeys+*/", centerIndex="+centerIndex); + System.err.println("Extending "+Arrays.toString(values)); + } + score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); + locArrayValid=true; + if(true/*USE_AFFINE_SCORE*/){ + //Correct begin and end positions if they changed. + int min=Integer.MAX_VALUE; + int max=Integer.MIN_VALUE; + for(int i=0; i-1){ + if(xmax){max=x;} + } + } + +// assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 
+ if(min<0 || max<0){ + //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ + chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ + Arrays.toString(locArray)+"\n"+ + Arrays.toString(values)+"\n"+ + new String(bases)+"\nstrand="+strand+"\n"); + System.err.println(); + } + score=-99999; + } + + + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + + +// System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+ +// ", mapStart="+mapStart+", mapStop="+mapStop); + +// if(chrom==17 && absdif(min, 30354420)<2000){ +// System.err.println("\n*****\n"); +// System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+ +// " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+ +// ", mapStart="+mapStart+", mapStop="+mapStop); +// System.err.println(); +// System.err.println(Arrays.toString(locArray)); +// System.err.println(); +// System.err.println("chrom="+chrom); +// System.err.println("score="+score); +// } + } + }else{ + score=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + score+=scoreZ; + } + } + + if(score>=cutoff){ + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + + // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH)); + if(USE_EXTENDED_SCORE && score>=maxScore){ + cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// 
System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + + final int chrom=numberToChrom(mapStart, baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(NUM_CHROM_BITS==0 || site2=MINGAP+bases.length){ +// assert(locArrayValid) : "Loc array was not filled."; +// gapArray=makeGapArray(locArray, site2, MINGAP); +// if(gapArray!=null){ +// gapArray[0]=Tools.min(gapArray[0], site2); +// gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); +// } +// if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} +// } +// + + if(gapArray==null && prevSS!=null && prevSS.gaps==null && + prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){ + + int betterScore=Tools.max(score, prevSS.score); + int minStart=Tools.min(prevSS.start, site2); + int maxStop=Tools.max(prevSS.stop, site3); + final boolean perfect2=USE_EXTENDED_SCORE && prevSS.score==maxScore && fullyDefined; + assert(!USE_EXTENDED_SCORE || perfect2==prevSS.perfect); + + boolean shortEnough=(!LIMIT_SUBSUMPTION_LENGTH_TO_2X || (maxStop-minStart<2*bases.length)); + + if(prevSS.start==site2 && prevSS.stop==site3){ + prevSS.score=prevSS.quickScore=betterScore; + }else if(SUBSUME_SAME_START_SITES && shortEnough && prevSS.start==site2){ + if(perfect2){ + //do nothing + }else if(perfect1){ + prevSS.stop=site3; + prevSS.perfect=true; + }else{ + prevSS.stop=maxStop; + } + prevSS.score=prevSS.quickScore=betterScore; + }else if(SUBSUME_SAME_STOP_SITES && shortEnough && prevSS.stop==site3){ + if(perfect2){ + //do nothing + }else if(perfect1){ + prevSS.start=site2; + prevSS.perfect=true; + }else{ + prevSS.start=minStart; + } + prevSS.score=prevSS.quickScore=betterScore; + }else if(SUBSUME_OVERLAPPING_SITES && shortEnough && (maxStop-minStart<=bases.length+MAX_SUBSUMPTION_LENGTH) + && !perfect1 && !perfect2){ + prevSS.start=minStart; + 
prevSS.stop=maxStop; + prevSS.score=prevSS.quickScore=betterScore; + }else{ + ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); + if(!perfect1){ss.setPerfect(bases);} + assert(!perfect1 || ss.stop-ss.start==bases.length-1); + } + assert(!perfect2 || prevSS.stop-prevSS.start==bases.length-1); + }else{ + ss=new SiteScore(chrom, strand, site2, site3, approxHits, score, false, perfect1); + if(!perfect1){ss.setPerfect(bases);} + ss.gaps=gapArray; + if(verbose && gapArray!=null){ + System.err.println(ss.toText()+"\t"+Arrays.toString(gapArray)+"\n"+Arrays.toString(locArray)+"\n"); + } + } + + if(ss!=null){ +// System.out.println("Added site "+ss.toText()); + ssl.add(ss); + prevSS=ss; + }else{ +// System.out.println("Subsumed site "+new SiteScore(chrom, strand, site2, site3, score).toText()); + } + +// if(prevSS!=null && prevSS.chrom==chrom && prevSS.strand==strand && overlap(prevSS.start, prevSS.stop, site2, site3)){ +// int betterScore=Tools.max(score, prevSS.score); +// if(prevSS.start==site2 && prevSS.stop==site3){ +// prevSS.score=prevSS.quickScore=betterScore; +// }else if(prevSS.start==site2 +// /*isWithin(prevSS.start, prevSS.stop, site2, site3) || +// isWithin(site2, site3, prevSS.start, prevSS.stop)*/){ +// prevSS.score=prevSS.quickScore=betterScore; +// assert(prevSS.start=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", 
offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + } + if(heap.isEmpty()){break;} + } + + } + + return ssl; + } + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk3(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, + int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + assert(USE_EXTENDED_SCORE); + + final int numKeys=offsets.length; //Before shrink + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + usedKeys+=numHits; + usedKeyIterations++; + + final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? 
(int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); + final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); + + final int baseChrom=baseChrom(baseChrom_); + + heap.clear(); + + final Quad[] triples=tripleStorage; + + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? getLocArray(bases.length) : null); + final Block b=index[baseChrom]; + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=bestScores[0]; + int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); + + int qcutoff=Tools.max(bestScores[2], minQuickScore); + int bestqscore=bestScores[3]; + int maxHits=bestScores[1]; + int perfectsFound=bestScores[5]; + assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits+", "+new String(bases); + int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); + if(approxHitsCutoff>numHits){return ssl;} + + final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); + + + assert(USE_EXTENDED_SCORE); + + if(currentTopScore>=maxScore){ + assert(currentTopScore==maxScore); + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + } + + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// 
"cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); + + + SiteScore prevSS=null; + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + + {//Inner loop + final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int score; + int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + int mapStart=site, mapStop=maxNearbySite; + + assert(USE_EXTENDED_SCORE); + + boolean locArrayValid=false; + if(qscore-1){ + if(xmax){max=x;} + } + } + + if(score>=maxScore){ + assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + + // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 
+ if(min<0 || max<0){ + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("\nAnomaly in "+getClass().getName()+".slowWalk:\n"+ + "chrom="+chrom+", mapStart="+mapStart+", mapStop="+mapStop+", centerIndex="+centerIndex+", strand="+strand+"\n"+ + "score="+score+", maxScore="+maxScore+", qscore="+qscore+", filter_by_qscore="+filter_by_qscore+"\n"+ + "numHits="+approxHits+", approxHits="+approxHits+"\n"+ + "min="+min+", max="+max+", (max-min)="+(max-min)+"\n"+ + "bases.length="+bases.length+"\n"+Arrays.toString(locArray)+"\n"+ + "locArray:\t"+Arrays.toString(locArray)+"\n"+ + "values:\t"+Arrays.toString(values)+"\n"+ + "bases:\t"+new String(bases)); + System.err.println(); + assert(false); + } + score=-99999; + } + + //mapStart and mapStop are indices + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + if(score>=maxScore){ + assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + } + + if(score==maxScore){ + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); + } + + if(score>=cutoff){ + qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH)); + bestqscore=Tools.max(qscore, bestqscore); + } + } + + if(score>=cutoff){ + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + if(score>=maxScore){ + assert(USE_EXTENDED_SCORE); + cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + + final int chrom=numberToChrom(mapStart, 
baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(site2!=site3) : site2+", "+site3+", "+mapStart+", "+mapStop; + + assert(NUM_CHROM_BITS==0 || site2 "+site2); +// System.err.println(mapStop+" -> "+site3); + + assert(gapArray[0]>=site2 && gapArray[0]-site2 "+site2+"\n"+ + mapStop+" -> "+site3+"\n\n"+ + Arrays.toString(gapArray)+"\n\n"+ +// Arrays.toString(clone)+"\n\n"+ + Arrays.toString(locArray)+"\n"+ + "numHits="+numHits+", "+ + "heap.size="+heap.size()+", "+ + "numHits="+numHits+", "+ + "approxHits="+approxHits+"\n"; + gapArray[0]=Tools.min(gapArray[0], site2); + gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); + } + if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} +// assert(false) : Arrays.toString(locArray); + } + + + //This block is optional, but tries to eliminate multiple identical alignments + + SiteScore ss=null; + final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; + final boolean inbounds=(site2>=0 && site3=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", 
maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + }else if(heap.size()=prevMaxHits); + + final int baseChrom=baseChrom(baseChrom_); + final Block b=index[baseChrom]; + final int[] sizes=sizeArray; + + heap.clear(); + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + int topQscore=-999999999; + + int maxHits=0; +// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + + + int approxHitsCutoff; + final int indelCutoff; + if(perfectOnly){ + approxHitsCutoff=numHits; + indelCutoff=0; + }else{ + approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy + indelCutoff=MAX_INDEL2; + } + + + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + {//Inner loop + final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? 
x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); + + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + if(qscore>topQscore){ + +// maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan + + topQscore=qscore; + + if(qscore>=maxQuickScore){ + assert(qscore==maxQuickScore); + assert(approxHits==numHits); + if(earlyExit){ + return new int[] {topQscore, maxHits}; + } + } + } + } + + while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements + final Quad t2=heap.poll(); + final int row=t2.row+1, col=t2.column; + if(row=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", 
site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + }else if(earlyExit && (perfectOnly || heap.size()b ? a-b : b-a; + } + + + final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){ + + if(useQuality){ + //These lines apparently MUST be used if quality is used later on for slow align. + if(USE_AFFINE_SCORE){return msa.maxQuality(baseScores);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5)+Tools.sum(baseScores);} + }else{ + if(USE_AFFINE_SCORE){return msa.maxQuality(readlen);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);} + } + + return maxQuickScore(offsets, keyScores); + } + + + public final int maxQuickScore(int[] offsets, int[] keyScores){ + +// int x=offsets.length*BASE_KEY_HIT_SCORE; + int x=Tools.intSum(keyScores); + int y=Y_SCORE_MULT*(offsets[offsets.length-1]-offsets[0]); +// if(ADD_LIST_SIZE_BONUS){x+=(LIST_SIZE_BONUS[1]*offsets.length);} +// assert(!ADD_SCORE_Z) : "Need to make sure this is correct..."; + +// if(ADD_SCORE_Z){x+=((offsets[offsets.length-1]+CHUNKSIZE)*Z_SCORE_MULT);} + if(ADD_SCORE_Z){x+=maxScoreZ(offsets);} + + return x+y; +// int bonus=(2*(HIT_SCORE/2)); //For matching both ends +// return x+y+bonus; + } + + + private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[], + int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){ + + hist_hits_score[Tools.min(HIT_HIST_LEN, numApproxHits)]++; + if(numApproxHits==1){return keyScores[centerIndex];} + + //Done! + //Correct way to calculate score: + //Find the first chunk that exactly hits the center. + //Then, align leftward of it, and align rightward of it, and sum the scores. 
+ + //"-centerIndex" is a disambiguating term that, given otherwise identical match patterns + //(for example, a small indel will generate two valid site candidates), choose the lower site. + + int x=keyScores[centerIndex]+scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels)+ + scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits)-centerIndex; + + int y=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets); + if(ADD_LIST_SIZE_BONUS){x+=calcListSizeBonus(sizes[centerIndex]);} +// int z=scoreZ(locs, hits); + return x+y; + } + + +// /** Generates a term that increases score with how many bases in the read match the ref. */ +// private static final int scoreZ(int[] locs, int centerIndex, int offsets[]){ +// final int center=locs[centerIndex]; +// +// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE]; +// +// final int maxLoc=center+MAX_INDEL2; +// final int minLoc=Tools.max(0, center-MAX_INDEL); +// +// int score=0; +// +// for(int i=0; i=minLoc && loc<=maxLoc){ +//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); +// +// int offset=offsets[i]; +// int max=CHUNKSIZE+offset; +// +// for(int j=offset; jloc){ +// refLoc[j]=loc; +// score-=2; +// }else if(old==loc){ +// score-=1; +// //do nothing, perhaps, or add 1? +// }else{ +// score-=2; +// assert(old=0 && rloc>=0 && rloc=0){ + score+=BASE_HIT_SCORE+baseScores[i]; + if(loc==centerLoc){score+=centerBonus;} + if(loc!=lastLoc && lastLoc>=0){ + int dif=absdif(loc, lastLoc); + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*dif, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; + } + lastLoc=loc; + } + } + +// System.err.println("Extended score: "+score); +// System.err.println(Arrays.toString(locArray)); + + + return score; + } + + + /** NOTE! This destroys the locArray, so use a copy if needed. 
*/ + private static final int[] makeGapArray(int[] locArray, int minLoc, int minGap){ + int gaps=0; + boolean doSort=false; + + if(locArray[0]<0){locArray[0]=minLoc;} + for(int i=1; i=0); + if(dif>minGap){ + gaps++; + } + } + if(gaps<1){return null;} + int[] out=new int[2+gaps*2]; + out[0]=locArray[0]; + out[out.length-1]=locArray[locArray.length-1]; + + for(int i=1, j=1; i=0); + if(dif>minGap){ + out[j]=locArray[i-1]; + out[j+1]=locArray[i]; + j+=2; + } + } + return out; + } + + + /** Generates a term that increases score with how many bases in the read match the ref. */ + private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){ + + if(numApproxHits==1){return SCOREZ_1KEY;} + + final int center=locs[centerIndex]; + + final int maxLoc=center+MAX_INDEL2; + final int minLoc=Tools.max(0, center-MAX_INDEL); + + int score=0; + + int a0=-1, b0=-1; + + for(int i=0; i=minLoc && loc<=maxLoc){ +// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); + int a=offsets[i]; + + if(b0=0){ + prev=loc; + loc=locs[i]; + + int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==locs.length-1){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; +// score-=(INDEL_PENALTY+Tools.min(INDEL_PENALTY_MULT*offset, 1+HIT_SCORE/4)); + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){ + + callsToScore++; + + int score=0; + + int prev, loc=locs[centerIndex]; + + for(int i=centerIndex-1; i>=0; i--){ + + if(locs[i]>=0){ + prev=loc; + loc=locs[i]; + + 
int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==0){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + /** Encode a (location, chrom) pair to an index */ + private static final int toNumber(int site, int chrom){ + int out=(chrom&CHROM_MASK_LOW); + out=out<=0 && f<1); + FRACTION_GENOME_TO_EXCLUDE=f; + MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + } + + + /** Default .75. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */ + static final float HIT_FRACTION_TO_RETAIN=0.85f; //default: .85 + /** Range: 0 to 1000. Lower should be faster and less accurate. */ + static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + /** Range: 2 to infinity. Lower should be faster and less accurate. 
*/ + static final int MIN_HIT_LISTS_TO_RETAIN=8; + + static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + //lower is faster + static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + //lower is faster + static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + //lower is faster + static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + + /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */ + public static final int SMALL_GENOME_LIST=80; + + static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";} + + static final int CLUMPY_MAX_DIST=4; //Keys repeating over intervals of this or less are clumpy. + + /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. */ + static final int CLUMPY_MIN_LENGTH_INDEX=6000; + static final float CLUMPY_FRACTION=0.8f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy. 
+ + static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */ + public static final int MAX_HITS_REDUCTION1=3; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */ + public static int MAX_HITS_REDUCTION2=3; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */ + public static int MAX_HITS_REDUCTION_PERFECT=5; + + public static int MAXIMUM_MAX_HITS_REDUCTION=7; + public static int HIT_REDUCTION_DIV=4; + + private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$ + assert(keys>=hits) : keys+", "+hits; + assert(hits>=0); + + int mahtk=MIN_APPROX_HITS_TO_KEEP; + if(SEMIPERFECTMODE || PERFECTMODE){ + if(keys==1){return 1;} + else if(MIN_APPROX_HITS_TO_KEEP=0); + int r=hits-reduction; + + r=Tools.max(mahtk, currentCutoff, r); + + if(perfect){ + r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT); + } + return r; + } + + public static final boolean USE_SLOWALK3=false && USE_EXTENDED_SCORE; + public static boolean PRESCAN_QSCORE=false && USE_EXTENDED_SCORE; //Decrease quality and increase speed + public static final boolean FILTER_BY_QSCORE=false; //Slightly lower quality, but very fast. + public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.08f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT=0.01f; //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT2=0.1f; + static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.5f : USE_EXTENDED_SCORE ? 
.84f : 0.6f); //Default .85f; lower is more accurate + static final float DYNAMIC_QSCORE_THRESH=0.40f; //default .58f + static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.6f; //***$ + static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.8f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false + static{ + assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1); + assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1); + } + + +} diff --git a/current/align2/BBIndexPacBio.java b/current/align2/BBIndexPacBio.java new file mode 100755 index 0000000..d05336d --- /dev/null +++ b/current/align2/BBIndexPacBio.java @@ -0,0 +1,2602 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; + + +/** + * Based on Index11f + * + * + * + * + * @author Brian Bushnell + * @date Jul 11, 2012 + * + */ +public final class BBIndexPacBio extends AbstractIndex { + + + public static void main(String[] args){ + + int k=12; + + for(int i=0; iData.numChroms){maxChrom=Data.numChroms;} + assert(minChrom<=maxChrom); + Data.sysout.println("Loading index for chrom "+minChrom+"-"+maxChrom+", genome "+Data.GENOME_BUILD); + index=IndexMaker4.makeIndex(Data.GENOME_BUILD, minChrom, maxChrom, + k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, writeToDisk, diskInvalid, index); + } + + /** Calculate statistics of index, such as list lengths, and find clumpy keys */ + public static final synchronized void analyzeIndex(int minChrom, int maxChrom, boolean cs, float fractionToExclude, int k){ + assert(!cs) : "Re-enable old reverse complement mode."; + assert(lengthHistogram==null); + assert(COUNTS==null); + + int KEYSPACE=1<<(2*k); + COUNTS=new int[KEYSPACE]; + maxChrom=maxChrom(maxChrom); + + HashMap cmap=new HashMap(); + + for(int chrom=minChrom; 
chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + Block b=index[chrom]; + final int[] sites=b.sites; + final int[] starts=b.starts; + + for(int key=0; key0 && dif<=CLUMPY_MAX_DIST){ + clumps++; + } + } + if(clumps>0){ + final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k)); + final Integer ko=x; + LongM lm=cmap.get(ko); + if(lm==null){ + lm=new LongM(0); + cmap.put(ko, lm); + } + lm.increment(clumps); + } + } + } + } + + for(int key=0; keyCLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){ + int rkey=AminoAcid.reverseComplementBinaryFast(key, k); + assert(key<=rkey); + assert(key==KeyRing.reverseComplementKey(rkey, k, cs)); + COUNTS[key]=0; + COUNTS[rkey]=0; + } + } + } + + lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); + + if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} + + if(REMOVE_FREQUENT_GENOME_FRACTION){ + + int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); + int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); + + MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); + MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); + + if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} + } + + Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); + if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} + if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} + assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; + } + + + /** Returns the filename for the block holding this chrom */ + public static final String fname(int chrom, int k){ + return 
IndexMaker4.fname(minChrom(chrom), maxChrom(chrom), k, NUM_CHROM_BITS, COLORSPACE); + } + + /** Ensure key offsets are strictly ascending. */ + private static boolean checkOffsets(int[] offsets){ + for(int i=1; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; ilimit || sum/finalHitCount>limit2; i--){ + Pointer p=ptrs[i]; + sum-=hits[p.key].length; + hits[p.key]=null; + finalHitCount--; + } + + return finalHitCount; + } + + /** Remove least useful keys to accelerate search */ + public final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){ + + float[] keyWeights=getKeyWeightArray(keyScores.length); + for(int i=0; ilimitS){hits[i]=null;} +// } + + final int[] lengths=getGenericArray(keys.length); + + for(int i=0; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; i=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ + final int[] lists=getGreedyListArray(hitsCount); + for(int i=0, j=0; j0){ + lists[j]=i; + j++; + } + } + + Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn); + int worstIndex=greedyReturn[0]; + int worst=lists[worstIndex]; + worstValue=greedyReturn[1]; + sum-=lengths[worst]; + +// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]0 || lengths[worst]=0){ + final int len=count(key); + if(len>0 && len0){ + starts[i]=b.starts[key]; + stops[i]=starts[i]+len2; + numHits++; + } + } + } + } + return numHits; + } + + + private final int countHits(final int[] keys, final int maxLen, boolean clearBadKeys){ + int numHits=0; + for(int i=0; i=0){ + final int len=count(key); + if(len>0 && len findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){ + assert(minChrom<=maxChrom && minChrom>=0); + ArrayList result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id); + if(DOUBLE_SEARCH_NO_HIT && (result==null || 
result.isEmpty())){result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);} + + return result; + } + + +public final ArrayList find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){ + + assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); + final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN, COLORSPACE); + int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length); + + initialKeys+=offsetsP.length; + initialKeyIterations++; + + final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2); + + int numHits=0; + numHits=countHits(keysP, maxLen, true); + if(numHits>0){ //TODO: Change these to higher numbers + int trigger=(3*keysP.length)/4; + if(numHits<20 && numHitsMIN_APPROX_HITS_TO_KEEP){ + int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits); + + int zeroes=keysP.length-numHits; + int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1)); + cutoffIndex=Tools.max(cutoffIndex, altMinIndex); + + assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits; + + if(cutoffIndex<(keysP.length-1)){ + int[] lens=getGenericArray(keysP.length); + for(int i=0; icutoff){ + keysP[i]=-1; + removed++; + numHits--; + } + } + } + } +// assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + final ArrayList result=new ArrayList(8); + if(numHits=5); + + int[][] prescanResults=null; + int[] precounts=null; + int[] prescores=null; + + int hitsCutoff=0; + int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore); + + boolean allBasesCovered=true; + { + if(offsetsP[0]!=0){allBasesCovered=false;} + else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;} + else{ + for(int i=1; ioffsetsP[i-1]+KEYLEN){ + allBasesCovered=false; + break; + } + } + } + } + + //TODO I don't understand this logic + final boolean pretendAllBasesAreCovered=(allBasesCovered || + keysP.length>=keysOriginal.length-4 || + 
(keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f)))); + +// System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP)); +// assert(allBasesCovered); + + if(prescan_qscore){ + prescanResults=prescanAllBlocks(bestScores, + keysP, keyScoresP, offsetsP, + keysM, keyScoresM, offsetsM, + pretendAllBasesAreCovered); + + if(prescanResults!=null){ + precounts=prescanResults[0]; + prescores=prescanResults[1]; + } + + if(bestScores[1]=maxQuickScore && pretendAllBasesAreCovered){ + assert(bestScores[3]==maxQuickScore); + assert(bestScores[1]==numHits); + + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true); + qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*DYNAMIC_QSCORE_THRESH_PERFECT)); + }else{ + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, false); + qscoreCutoff=Tools.max(qscoreCutoff, (int)(bestScores[3]*PRESCAN_QSCORE_THRESH)); + } + } + + final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true); + final boolean fullyDefined=AminoAcid.isFullyDefined(basesP); + assert(bestScores[2]<=0) : Arrays.toString(bestScores); + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS, + offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + if(QUIT_AFTER_TWO_PERFECTS){ + if(bestScores[5]>=2){break;} + } + cycle++; + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS, + offsetsM, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + if(QUIT_AFTER_TWO_PERFECTS){ + if(bestScores[5]>=2){break;} + } + cycle++; + } + + 
assert(Read.CHECKSITES(result, basesP, basesM, id)); //TODO: Comment out once checked + + return result; + } + + /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. */ + private final int[][] prescanAllBlocks(int[] bestScores, + int[] keysP, int[] keyScoresP, int[] offsetsP, + int[] keysM, int[] keyScoresM, int[] offsetsM, + final boolean allBasesCovered){ + + int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; + + int bestqscore=0; + int maxHits=0; + int minHitsToScore=MIN_APPROX_HITS_TO_KEEP; + + final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); + + final int[] counts=precountArray; + final int[] scores=prescoreArray; + final int[][] ret=prescanReturn; + Arrays.fill(counts, keysP.length); + Arrays.fill(scores, maxQuickScore); + ret[0]=counts; + ret[1]=scores; + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + final int baseChrom=baseChrom(chrom); + for(int pmi=0; pmi<2; pmi++, cycle++){ + + int[] keys=pm[pmi][0]; + int[] keyScores=pm[pmi][1]; + int[] offsets=pm[pmi][2]; +// int[][] hits=getHitArray(offsets.length); + + int[] starts=startArray; + int[] stops=stopArray; + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + + if(numHits find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores, + final int chrom, final byte strand, + int[] offsets, final boolean obeyLimits, ArrayList ssl, int[] bestScores, + final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + int[] starts=startArray; + int[] stops=stopArray; + + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + if(numHits=0){numHits++;} + } + + if(numHits==offsets.length){ + return null; + }else{ + int[][] r=shrinkReturn3; + int[] starts2=startArray; + int[] stops2=stopArray; + int[] 
offsets2=getOffsetArray(numHits); + int[] keyScores2=new int[numHits]; + + for(int i=0, j=0; i=0){ + starts2[j]=starts[i]; + stops2[j]=stops[i]; + offsets2[j]=offsets[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + r[0]=starts2; + r[1]=stops2; + r[2]=offsets2; + r[4]=keyScores2; + return r; + } + } + + /** Removes "-1" keys. */ + private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){ + + + int numHits=0; + for(int i=0; i=0){numHits++;} + } + + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + if(numHits==keys.length){ + return null; + }else{ + int[][] r=shrinkReturn2; + int[] offsets2=getOffsetArray(numHits); + assert(offsets2!=offsets); + assert(offsets2.length=0){ + offsets2[j]=offsets[i]; + keys2[j]=keys[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + assert(checkOffsets(offsets2)) : "\nnumHits="+numHits+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(offsets2)+"\n"+ + "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keys2)+"\n"; + r[0]=offsets2; + r[1]=keys2; + r[2]=keyScores2; + return r; + } + } + + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk2(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, final boolean fullyDefined){ + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + +//// System.out.println("After SHRINK_BEFORE_WALK: numHits = "+hits.length); +// Block b=index[baseChrom_]; +// int[][] hits=b.getHitLists(starts, stops); +// if(SHRINK_BEFORE_WALK){ +// Object[] r=shrink(hits, offsets, keyScores); +// if(r!=null){ +// hits=(int[][])r[0]; +// offsets=(int[])r[1]; 
+// keyScores=(int[])r[3]; +// } +// } +// +// final int numHits=hits.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. + final int maxScore=maxScore(offsets, baseScores, keyScores, bases.length, true); +// final int maxQuickScore=(!USE_EXTENDED_SCORE ? maxScore : maxQuickScore(offsets)); +// System.err.println("maxScore = "+maxScore); + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); +// final int minQuickScore=(!USE_EXTENDED_SCORE ? minScore : (int)(maxQuickScore*0.15f)); +// final int minScore=(int)(MIN_SCORE_MULT*maxScore); +// System.err.println("minScore = "+minScore); + + final int baseChrom=baseChrom(baseChrom_); + + +// final PriorityQueue heap=new PriorityQueue(numHits); + heap.clear(); +// final Quad[] triples=new Quad[numHits]; + final Quad[] triples=tripleStorage; + + final Block b=index[baseChrom]; + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? 
getLocArray(bases.length) : null); + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=-999999999; + + int cutoff=minScore; + + int maxHits=0; + int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println(); + + SiteScore prevSS=null; + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + + {//Inner loop + final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? 
x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int score; + + int mapStart=site, mapStop=maxNearbySite; + + if(USE_EXTENDED_SCORE){ + final int chrom=numberToChrom(site, baseChrom); + score=extendScore(bases, baseScores, offsets, values, chrom, centerIndex, locArray, numHits, approxHits); + if(true/*USE_AFFINE_SCORE*/){ + //Correct begin and end positions if they changed. + int min=Integer.MAX_VALUE; + int max=Integer.MIN_VALUE; + for(int i=0; i-1){ + if(xmax){max=x;} + } + } + +// assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? + if(min<0 || max<0){ + //Note: This error can trigger if minChrom and maxChrom do not align to block boundaries + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ + chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ + Arrays.toString(locArray)+"\n"+ + Arrays.toString(values)+"\n"+ + new String(bases)+"\nstrand="+strand+"\n"); + System.err.println(); + } + score=-99999; + } + + + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + + +// System.err.println("site="+site+", maxNearbySite="+maxNearbySite+", min="+min+", max="+max+ +// ", mapStart="+mapStart+", mapStop="+mapStop); + +// if(chrom==17 && absdif(min, 30354420)<2000){ +// System.err.println("\n*****\n"); +// System.err.println("site="+site+" ("+numberToSite(site)+"), maxNearbySite="+maxNearbySite+ +// " ("+numberToSite(maxNearbySite)+"), min="+min+", max="+max+ +// ", mapStart="+mapStart+", mapStop="+mapStop); +// System.err.println(); +// System.err.println(Arrays.toString(locArray)); +// System.err.println(); +// System.err.println("chrom="+chrom); +// System.err.println("score="+score); +// } + } + }else{ + score=quickScore(values, keyScores, 
centerIndex, offsets, sizes, true, approxHits, numHits); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + score+=scoreZ; + } + } + + +// score=score(values, centerIndex, offsets, hits); +// if(ADD_SCORE_Z){ +// int scoreZ=scoreZ2(values, centerIndex, offsets); +// score+=scoreZ; +// } +// +// if(USE_EXTENDED_SCORE){ +// if(score>minQuickScore){ +//// System.out.println(score+" > "+minQuickScore); +// score=extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex, locArray); +// }else{ +//// System.out.print("."); +// score=-1; +// } +// } + + +// System.err.println("maxScore = "+maxScore); +// System.err.println("hits = "+approxHits+" / "+approxHitsCutoff); +// System.err.println("score = "+score+" / "+cutoff); + + if(score>=cutoff){ + +// System.err.println("Passed!"); + +// System.out.println("approxHits="+approxHits+" / "+approxHitsCutoff); +// System.out.println("score="+score+" / "+cutoff); +// System.out.println("strand="+Gene.strandCodes[strand]); +// System.out.println("center="+values[centerIndex]); +// System.out.println("values="+Arrays.toString(values)); +// extendScore(read, offsets, values, numberToChrom(site, baseChrom), centerIndex); +// System.out.println(); + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits-1); //More sensitive, but slower + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + + // cutoff=Tools.max(cutoff, minScore+(int)((score-minScore)*DYNAMIC_SCORE_THRESH)); + if(USE_EXTENDED_SCORE && score>=maxScore){ + cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + +// final int chrom=numberToChrom(site, baseChrom); +// final int 
site2=numberToSite(site); +// final int site3=numberToSite(maxNearbySite)+read.length; + + final int chrom=numberToChrom(mapStart, baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(NUM_CHROM_BITS==0 || site2=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + } + if(heap.isEmpty()){break;} + } + + } + + return ssl; + } + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk3(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, + int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + assert(USE_EXTENDED_SCORE); + + final int numKeys=offsets.length; //Before shrink + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. 
+ final int maxQuickScore=maxQuickScore(offsets, keyScores); + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + usedKeys+=numHits; + usedKeyIterations++; + + final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); + final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); + + final int baseChrom=baseChrom(baseChrom_); + + heap.clear(); + + final Quad[] triples=tripleStorage; + + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? 
getLocArray(bases.length) : null); + final Block b=index[baseChrom]; + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=bestScores[0]; + int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); + + int qcutoff=Tools.max(bestScores[2], minQuickScore); + int bestqscore=bestScores[3]; + int maxHits=bestScores[1]; + int perfectsFound=bestScores[5]; + assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits; + int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); + if(approxHitsCutoff>numHits){return ssl;} + + final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); + + if(currentTopScore>=maxScore){ + assert(currentTopScore==maxScore); + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + } + + +// assert(false) : "numHits="+numHits+", maxHits="+maxHits+", MIN_APPROX_HITS_TO_KEEP="+MIN_APPROX_HITS_TO_KEEP+", approxHitsCutoff="+approxHitsCutoff+", maxHits="+maxHits; + + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); + + + SiteScore prevSS=null; + 
while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + + {//Inner loop + final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int score; + int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + int mapStart=site, mapStop=maxNearbySite; + + assert(USE_EXTENDED_SCORE); + + boolean locArrayValid=false; + if(qscore-1){ + if(xmax){max=x;} + } + } + + if(score>=maxScore){ + assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + + // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 
+ if(min<0 || max<0){ + if(!Shared.anomaly){ + Shared.anomaly=true; + System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ + chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ + Arrays.toString(locArray)+"\n"+ + Arrays.toString(values)+"\n"+ + new String(bases)+"\nstrand="+strand+"\n"); + System.err.println(); + } + score=-99999; + } + + //mapStart and mapStop are indices + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + if(score>=maxScore){ + assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + } + + if(score==maxScore){ + qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); + } + + if(score>=cutoff){ + qcutoff=Tools.max(qcutoff, (int)(qscore*DYNAMIC_QSCORE_THRESH)); + bestqscore=Tools.max(qscore, bestqscore); + } + } + + if(score>=cutoff){ + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); + + cutoff=Tools.max(cutoff, (int)(score*DYNAMIC_SCORE_THRESH)); + if(score>=maxScore){ + assert(USE_EXTENDED_SCORE); + cutoff=Tools.max(cutoff, (int)(score*0.95f)); + } + } + + currentTopScore=score; + +// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + + final int chrom=numberToChrom(mapStart, baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(NUM_CHROM_BITS==0 || site2 "+site2); +// System.err.println(mapStop+" -> "+site3); + + assert(gapArray[0]>=site2 && gapArray[0]-site2 "+site2+"\n"+ + mapStop+" -> "+site3+"\n\n"+ + Arrays.toString(gapArray)+"\n\n"+ +// Arrays.toString(clone)+"\n\n"+ + 
Arrays.toString(locArray)+"\n"+ + "numHits="+numHits+", "+ + "heap.size="+heap.size()+", "+ + "numHits="+numHits+", "+ + "approxHits="+approxHits+"\n"; + gapArray[0]=Tools.min(gapArray[0], site2); + gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); + } + if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} +// assert(false) : Arrays.toString(locArray); + } + + + //This block is optional, but tries to eliminate multiple identical alignments + + SiteScore ss=null; + final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; + final boolean inbounds=(site2>=0 && site3=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + }else if(heap.size()=prevMaxHits); + + final int baseChrom=baseChrom(baseChrom_); + final Block b=index[baseChrom]; + final int[] sizes=sizeArray; + + heap.clear(); + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); 
+ int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + int topQscore=-999999999; + + int maxHits=0; +// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + + + int approxHitsCutoff; + final int indelCutoff; + if(perfectOnly){ + approxHitsCutoff=numHits; + indelCutoff=0; + }else{ + approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy + indelCutoff=MAX_INDEL2; + } + + + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + {//Inner loop + final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? 
x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + hist_hits[Tools.min(HIT_HIST_LEN, approxHits)]++; + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); + + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + if(qscore>topQscore){ + +// maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan + + topQscore=qscore; + + if(qscore>=maxQuickScore){ + assert(qscore==maxQuickScore); + assert(approxHits==numHits); + if(earlyExit){ + return new int[] {topQscore, maxHits}; + } + } + } + } + + while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements + final Quad t2=heap.poll(); + final int row=t2.row+1, col=t2.column; + if(row=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", 
site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + }else if(earlyExit && (perfectOnly || heap.size()b ? a-b : b-a; + } + + + final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){ + + if(useQuality){ + //These lines apparently MUST be used if quality is used later on for slow align. + if(USE_AFFINE_SCORE){return msa.maxQuality(baseScores);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5)+Tools.sum(baseScores);} + }else{ + if(USE_AFFINE_SCORE){return msa.maxQuality(readlen);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);} + } + + return maxQuickScore(offsets, keyScores); + } + + + public final int maxQuickScore(int[] offsets, int[] keyScores){ + +// int x=offsets.length*BASE_KEY_HIT_SCORE; + int x=Tools.intSum(keyScores); + int y=Y_SCORE_MULT*(offsets[offsets.length-1]-offsets[0]); +// if(ADD_LIST_SIZE_BONUS){x+=(LIST_SIZE_BONUS[1]*offsets.length);} +// assert(!ADD_SCORE_Z) : "Need to make sure this is correct..."; + +// if(ADD_SCORE_Z){x+=((offsets[offsets.length-1]+CHUNKSIZE)*Z_SCORE_MULT);} + if(ADD_SCORE_Z){x+=maxScoreZ(offsets);} + + return x+y; +// int bonus=(2*(HIT_SCORE/2)); //For matching both ends +// return x+y+bonus; + } + + + private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[], + int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){ + + hist_hits_score[Tools.min(HIT_HIST_LEN, numApproxHits)]++; + if(numApproxHits==1){return keyScores[centerIndex];} + + //Done! + //Correct way to calculate score: + //Find the first chunk that exactly hits the center. + //Then, align leftward of it, and align rightward of it, and sum the scores. 
+ + //"-centerIndex" is a disambiguating term that, given otherwise identical match patterns + //(for example, a small indel will generate two valid site candidates), choose the lower site. + + int x=keyScores[centerIndex]+scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels)+ + scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits)-centerIndex; + + int y=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets); + if(ADD_LIST_SIZE_BONUS){x+=calcListSizeBonus(sizes[centerIndex]);} +// int z=scoreZ(locs, hits); + return x+y; + } + + +// /** Generates a term that increases score with how many bases in the read match the ref. */ +// public static final int scoreZ(int[] locs, int centerIndex, int offsets[]){ +// final int center=locs[centerIndex]; +// +// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE]; +// +// final int maxLoc=center+MAX_INDEL2; +// final int minLoc=Tools.max(0, center-MAX_INDEL); +// +// int score=0; +// +// for(int i=0; i=minLoc && loc<=maxLoc){ +//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); +// +// int offset=offsets[i]; +// int max=CHUNKSIZE+offset; +// +// for(int j=offset; jloc){ +// refLoc[j]=loc; +// score-=2; +// }else if(old==loc){ +// score-=1; +// //do nothing, perhaps, or add 1? 
+// }else{ +// score-=2; +// assert(old=minVal && value<=maxVal){ + final int refbase=numberToSite(value); + assert(refbase>=minLoc && refbase<=maxLoc); + +// System.out.println("numApproxHits="+numApproxHits+", numHits="+numHits+", i="+i+", minVal="+minVal+", value="+value+", maxVal="+maxVal+ +// ", refbase="+refbase+", minLoc="+minLoc+", maxLoc="+maxLoc+", keynum="+keynum); +// System.out.println("Reverse: Trying key "+refbase+" @ "+offsets[i]); +// System.out.println("Passed!"); +// +// System.out.println("Number: \t"+Long.toHexString(value|(1l<<63))); +// System.out.println("Mask: \t"+Long.toHexString(SITE_MASK|(1l<<63))); +// System.out.println("Both: \t"+Long.toHexString((value&SITE_MASK)|(1l<<63))); + + keynum++; + final int callbase=offsets[i]; + + int misses=0; + for(int cloc=callbase+KEYLEN-1, rloc=refbase+cloc; cloc>=0 && rloc>=0 && rloc=0); + if(dif>minGap){ + gaps++; + } + } + if(gaps<1){return null;} + int[] out=new int[2+gaps*2]; + out[0]=locArray[0]; + out[out.length-1]=locArray[locArray.length-1]; + + for(int i=1, j=1; i=0); + if(dif>minGap){ + out[j]=locArray[i-1]; + out[j+1]=locArray[i]; + j+=2; + } + } + return out; + } + + + /** Generates a term that increases score with how many bases in the read match the ref. 
*/ + private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){ + + if(numApproxHits==1){return SCOREZ_1KEY;} + + final int center=locs[centerIndex]; + + final int maxLoc=center+MAX_INDEL2; + final int minLoc=Tools.max(0, center-MAX_INDEL); + + int score=0; + + int a0=-1, b0=-1; + + for(int i=0; i=minLoc && loc<=maxLoc){ +// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); + int a=offsets[i]; + + if(b0=0){ + prev=loc; + loc=locs[i]; + + int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==locs.length-1){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; +// score-=(INDEL_PENALTY+Tools.min(INDEL_PENALTY_MULT*offset, 1+HIT_SCORE/4)); + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){ + + callsToScore++; + + int score=0; + + int prev, loc=locs[centerIndex]; + + for(int i=centerIndex-1; i>=0; i--){ + + if(locs[i]>=0){ + prev=loc; + loc=locs[i]; + + int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==0){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + /** Encode a (location, chrom) pair to an index */ + private static final int toNumber(int site, int chrom){ + int 
out=(chrom&CHROM_MASK_LOW); + out=out<=0 && f<1); + FRACTION_GENOME_TO_EXCLUDE=f; + MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + } + + + /** Default .75. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */ + static final float HIT_FRACTION_TO_RETAIN=.97f; //default: .85 + /** Range: 0 to 1000. Lower should be faster and less accurate. */ + static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + /** Range: 2 to infinity. Lower should be faster and less accurate. */ + static final int MIN_HIT_LISTS_TO_RETAIN=12; + + static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + //lower is faster + static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + //lower is faster + static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + //lower is faster + static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + + /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */ + public static final int SMALL_GENOME_LIST=80; + + static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";} + + static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy. + + /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. 
*/ + static final int CLUMPY_MIN_LENGTH_INDEX=2800; + static final float CLUMPY_FRACTION=0.8f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy. + + static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION1 when slowWalk3 is first entered */ + public static final int MAX_HITS_REDUCTION1=2; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION2 dynamically when best score is exceeded */ + public static int MAX_HITS_REDUCTION2=3; + + /** approxHitsCutoff=maxHits-MAX_HITS_REDUCTION_PERFECT when perfect score is found */ + public static final int MAX_HITS_REDUCTION_PERFECT=2; + + public static int MAXIMUM_MAX_HITS_REDUCTION=6; + public static int HIT_REDUCTION_DIV=4; + + private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$ + assert(keys>=hits) : keys+", "+hits; + assert(hits>=0); + + int mahtk=MIN_APPROX_HITS_TO_KEEP; + if(SEMIPERFECTMODE || PERFECTMODE){ + if(keys==1){return 1;} + else if(MIN_APPROX_HITS_TO_KEEP=0); + int r=hits-reduction; + + r=Tools.max(mahtk, currentCutoff, r); + + if(perfect){ + r=Tools.max(r, keys-MAX_HITS_REDUCTION_PERFECT); + } + return r; + } + + public static final boolean USE_SLOWALK3=true && USE_EXTENDED_SCORE; + public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed + public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast. + public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.02f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT=0.005f; //Fraction of max score to use as cutoff. Default 0.025, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT2=0.005f; + static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.64f : USE_EXTENDED_SCORE ? 
.74f : 0.6f); //Default .85f; lower is more accurate + static final float DYNAMIC_QSCORE_THRESH=0.6f; //default .58f + static final float DYNAMIC_QSCORE_THRESH_PERFECT=0.8f; //***$ + static final float PRESCAN_QSCORE_THRESH=DYNAMIC_QSCORE_THRESH*.95f; //default 1.0f; lower is more accurate and 0 essentially sets PRESCAN_QSCORE=false + static{ + assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1); + assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1); + } + + +} diff --git a/current/align2/BBIndexPacBioSkimmer.java b/current/align2/BBIndexPacBioSkimmer.java new file mode 100755 index 0000000..5b3879a --- /dev/null +++ b/current/align2/BBIndexPacBioSkimmer.java @@ -0,0 +1,2291 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; + +import stream.SiteScore; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; + + +/** + * Based on Index11f + * Designed to skim and retain all sites above a threshold. + * + * + * + * @author Brian Bushnell + * @date Jul 11, 2012 + * + */ +public final class BBIndexPacBioSkimmer extends AbstractIndex { + + + public static void main(String[] args){ + + int k=12; + + for(int i=0; iData.numChroms){maxChrom=Data.numChroms;} + assert(minChrom<=maxChrom); + Data.sysout.println("Loading index for chrom "+minChrom+"-"+maxChrom+", genome "+Data.GENOME_BUILD); + index=IndexMaker4.makeIndex(Data.GENOME_BUILD, minChrom, maxChrom, + k, NUM_CHROM_BITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, writeToDisk, diskInvalid, index); + + } + + /** Calculate statistics of index, such as list lengths, and find clumpy keys */ + public static final synchronized void analyzeIndex(int minChrom, int maxChrom, boolean cs, float fractionToExclude, int k){ + assert(!cs) : "Re-enable old reverse complement mode."; + assert(lengthHistogram==null); + assert(COUNTS==null); + + int KEYSPACE=1<<(2*k); + COUNTS=new int[KEYSPACE]; + maxChrom=maxChrom(maxChrom); + + 
HashMap cmap=new HashMap(); + + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + Block b=index[chrom]; + final int[] sites=b.sites; + final int[] starts=b.starts; + + for(int key=0; key0 && dif<=CLUMPY_MAX_DIST){ + clumps++; + } + } + if(clumps>0){ + final int x=Tools.min(key, AminoAcid.reverseComplementBinaryFast(key, k)); + final Integer ko=x; + LongM lm=cmap.get(ko); + if(lm==null){ + lm=new LongM(0); + cmap.put(ko, lm); + } + lm.increment(clumps); + } + } + } + } + + for(int key=0; keyCLUMPY_MIN_LENGTH_INDEX && clumps>CLUMPY_FRACTION*len)/* || (len>8*CLUMPY_MIN_LENGTH_INDEX && clumps>.75f*CLUMPY_FRACTION*len)*/){ + int rkey=AminoAcid.reverseComplementBinaryFast(key, k); + assert(key<=rkey); + assert(key==KeyRing.reverseComplementKey(rkey, k, cs)); + COUNTS[key]=0; + COUNTS[rkey]=0; + } + } + } + + lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); + + if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} + + if(REMOVE_FREQUENT_GENOME_FRACTION){ + + int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); + int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); + + MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); + MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); + + if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} + } + + Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); + if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} + if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} + assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; + } + +// /** Calculate statistics of index, such as list lengths, and find clumpy keys 
*/ +// public static final synchronized void analyzeIndex(int minChrom, int maxChrom, boolean cs, float fractionToExclude, int k){ +// +// assert(lengthHistogram==null); +// assert(COUNTS==null); +// +// int KEYSPACE=1<<(2*k); +// COUNTS=new int[KEYSPACE]; +// +// maxChrom=maxChrom(maxChrom); +// +// for(int key=0; key0 && dif<=CLUMPY_MAX_DIST){ +// clumps++; +// } +// } +// +// for(int i=start2+1; i0 && dif<=CLUMPY_MAX_DIST){ +// clumps++; +// } +// } +// } +// +// } +// +// COUNTS[key]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[key]+len); +// if(key!=rkey){COUNTS[rkey]=(int)Tools.min(Integer.MAX_VALUE, COUNTS[rkey]+len);} +// assert(COUNTS[key]==COUNTS[rkey]) : key+", "+rkey; +// +// if(REMOVE_CLUMPY && len>CLUMPY_MIN_LENGTH_INDEX && clumps>(CLUMPY_FRACTION*len)){ +// COUNTS[key]=0; +// COUNTS[rkey]=0; +// for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ +// Block b=index[chrom]; +// final int[] sites=b.sites; +// sites[b.starts[key]]=-1; +// sites[b.starts[rkey]]=-1; +// } +// } +// +//// System.err.println("COUNTS["+key+"] = "+COUNTS[key]+", COUNTS["+rkey+"] = "+COUNTS[rkey]); +// } +// } +// +// lengthHistogram=Tools.makeLengthHistogram3(COUNTS, 1000, verbose2); +// +// if(verbose2){System.err.println("lengthHistogram: "+Arrays.toString(lengthHistogram));} +// +// if(REMOVE_FREQUENT_GENOME_FRACTION){ +// +// int lengthLimitIndex=(int)((1-fractionToExclude)*(lengthHistogram.length-1)); +// int lengthLimitIndex2=(int)((1-fractionToExclude*DOUBLE_SEARCH_THRESH_MULT)*(lengthHistogram.length-1)); +// +// MAX_USABLE_LENGTH=Tools.max(2*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex]); +// MAX_USABLE_LENGTH2=Tools.max(6*SMALL_GENOME_LIST, lengthHistogram[lengthLimitIndex2]); +// +// if(verbose2){System.err.println("MAX_USABLE_LENGTH: "+MAX_USABLE_LENGTH+"\nMAX_USABLE_LENGTH2: "+MAX_USABLE_LENGTH2);} +// } +// +// Solver.POINTS_PER_SITE=(int)Math.floor((Solver.BASE_POINTS_PER_SITE*4000f)/Tools.max(2*SMALL_GENOME_LIST, 
lengthHistogram[MAX_AVERAGE_LIST_TO_SEARCH])); +// if(Solver.POINTS_PER_SITE==0){Solver.POINTS_PER_SITE=-1;} +// if(verbose2){System.err.println("POINTS_PER_SITE: "+Solver.POINTS_PER_SITE);} +// assert(Solver.POINTS_PER_SITE<0) : Solver.POINTS_PER_SITE; +// } + + + /** Returns the filename for the block holding this chrom */ + public static final String fname(int chrom, int k){ + return IndexMaker4.fname(minChrom(chrom), maxChrom(chrom), k, NUM_CHROM_BITS, COLORSPACE); + } + + /** Ensure key offsets are strictly ascending. */ + private static boolean checkOffsets(int[] offsets){ + for(int i=1; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; ilimit || sum/finalHitCount>limit2; i--){ + Pointer p=ptrs[i]; + sum-=hits[p.key].length; + hits[p.key]=null; + finalHitCount--; + } + + return finalHitCount; + } + + /** Remove least useful keys to accelerate search */ + public final int trimExcessHitListsByGreedy(int[] offsets, int[] keyScores, int maxHitLists, int[] keys){ + + float[] keyWeights=getKeyWeightArray(keyScores.length); + for(int i=0; ilimitS){hits[i]=null;} +// } + + final int[] lengths=getGenericArray(keys.length); + + for(int i=0; i0){ + if(x=shortest); + if(initialHitCountlimit3){ + for(int i=0; i=MIN_APPROX_HITS_TO_KEEP && (sum>limit || sum/initialHitCount>limit2 || hitsCount>maxHitLists/* || worstValue<0*/)){ + final int[] lists=getGreedyListArray(hitsCount); + for(int i=0, j=0; j0){ + lists[j]=i; + j++; + } + } + + Solver.findWorstGreedy(offsets, lengths, keyWeights, KEYLEN, lists, greedyReturn); + int worstIndex=greedyReturn[0]; + int worst=lists[worstIndex]; + worstValue=greedyReturn[1]; + sum-=lengths[worst]; + +// if(worstValue>0 && (hitsCount<=maxHitLists || lengths[worst]0 || lengths[worst]=0){ + final int len=count(key); + if(len>0 && len0){ + starts[i]=b.starts[key]; + stops[i]=starts[i]+len2; + numHits++; + } + } + } + } + return numHits; + } + + + private final int countHits(final int[] keys, final int maxLen, boolean 
clearBadKeys){ + int numHits=0; + for(int i=0; i=0){ + final int len=count(key); + if(len>0 && len findAdvanced(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsets, long id){ + assert(minChrom<=maxChrom && minChrom>=0); + ArrayList result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, true, id); + if(DOUBLE_SEARCH_NO_HIT && (result==null || result.isEmpty())){result=find(basesP, basesM, qual, baseScoresP, keyScoresP, offsets, false, id);} + + return result; + } + + + public final ArrayList find(byte[] basesP, byte[] basesM, byte[] qual, byte[] baseScoresP, int[] keyScoresP, int[] offsetsP, boolean obeyLimits, long id){ + + assert(checkOffsets(offsetsP)) : Arrays.toString(offsetsP); + final int[] keysOriginal=KeyRing.makeKeys(basesP, offsetsP, KEYLEN, COLORSPACE); + int[] keysP=Arrays.copyOf(keysOriginal, keysOriginal.length); + + initialKeys+=offsetsP.length; + initialKeyIterations++; + + final int maxLen=(obeyLimits ? MAX_USABLE_LENGTH : MAX_USABLE_LENGTH2); + + int numHits=0; + numHits=countHits(keysP, maxLen, true); + if(numHits>0){ //TODO: Change these to higher numbers + int trigger=(3*keysP.length)/4; + if(numHits<20 && numHitsMIN_APPROX_HITS_TO_KEEP){ + int cutoffIndex=((int) (HIT_FRACTION_TO_RETAIN*(keysP.length)-0.01f))+(keysP.length-numHits); + + int zeroes=keysP.length-numHits; + int altMinIndex=(zeroes+(MIN_HIT_LISTS_TO_RETAIN-1)); + cutoffIndex=Tools.max(cutoffIndex, altMinIndex); + + assert(cutoffIndex>0) : cutoffIndex+"\n"+numHits; + + if(cutoffIndex<(keysP.length-1)){ + int[] lens=getGenericArray(keysP.length); + for(int i=0; icutoff){ + keysP[i]=-1; + removed++; + numHits--; + } + } + } + } +// assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + final ArrayList result=new ArrayList(8); + if(numHits=5); + + int[][] prescanResults=null; + int[] precounts=null; + int[] prescores=null; + + int hitsCutoff=0; + int qscoreCutoff=(int)(MIN_QSCORE_MULT*maxQuickScore); + + boolean 
allBasesCovered=true; + { + if(offsetsP[0]!=0){allBasesCovered=false;} + else if(offsetsP[offsetsP.length-1]!=(basesP.length-KEYLEN)){allBasesCovered=false;} + else{ + for(int i=1; ioffsetsP[i-1]+KEYLEN){ + allBasesCovered=false; + break; + } + } + } + } + + //TODO I don't understand this logic + final boolean pretendAllBasesAreCovered=(allBasesCovered || + keysP.length>=keysOriginal.length-4 || + (keysP.length>=9 && (offsetsP[offsetsP.length-1]-offsetsP[0]+KEYLEN)>Tools.max(40, (int)(basesP.length*.75f)))); + +// System.err.println(allBasesCovered+"\t"+Arrays.toString(offsetsP)); +// assert(allBasesCovered); + + if(prescan_qscore){ + prescanResults=prescanAllBlocks(bestScores, + keysP, keyScoresP, offsetsP, + keysM, keyScoresM, offsetsM, + pretendAllBasesAreCovered); + + if(prescanResults!=null){ + precounts=prescanResults[0]; + prescores=prescanResults[1]; + } + + if(bestScores[1]=maxQuickScore && pretendAllBasesAreCovered){ + assert(bestScores[3]==maxQuickScore); + assert(bestScores[1]==numHits); + } + + hitsCutoff=calcApproxHitsCutoff(keysP.length, bestScores[1], MIN_APPROX_HITS_TO_KEEP, true); + qscoreCutoff=calcQScoreCutoff(maxQuickScore, bestScores[3]/2, qscoreCutoff); + } + + final int maxScore=maxScore(offsetsP, baseScoresP, keyScoresP, basesP.length, true); + final boolean fullyDefined=AminoAcid.isFullyDefined(basesP); + assert(bestScores[2]<=0) : Arrays.toString(bestScores); + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysP, basesP, baseScoresP, keyScoresP, chrom, Gene.PLUS, + offsetsP, obeyLimits, result, bestScores, allBasesCovered, maxScore, fullyDefined); + } + cycle++; + if(precounts==null || precounts[cycle]>=hitsCutoff || prescores[cycle]>=qscoreCutoff){ + find(keysM, basesM, baseScoresM, keyScoresM, chrom, Gene.MINUS, + offsetsM, obeyLimits, result, bestScores, allBasesCovered, 
maxScore, fullyDefined); + } + cycle++; + } + +// assert(Read.CHECKSITES(result, basesP)); + + return result; + } + + /** Search blocks rapidly to find max hits, and perfect sites. May indicate some blocks can be skipped. */ + private final int[][] prescanAllBlocks(int[] bestScores, + int[] keysP, int[] keyScoresP, int[] offsetsP, + int[] keysM, int[] keyScoresM, int[] offsetsM, + final boolean allBasesCovered){ + + int[][][] pm=new int[][][] {{keysP, keyScoresP, offsetsP}, {keysM, keyScoresM, offsetsM}}; + + int bestqscore=0; + int maxHits=0; + int minHitsToScore=MIN_APPROX_HITS_TO_KEEP; + + final int maxQuickScore=maxQuickScore(offsetsP, keyScoresP); + + final int[] counts=precountArray; + final int[] scores=prescoreArray; + final int[][] ret=prescanReturn; + Arrays.fill(counts, keysP.length); + Arrays.fill(scores, maxQuickScore); + ret[0]=counts; + ret[1]=scores; + + int cycle=0; + for(int chrom=minChrom; chrom<=maxChrom; chrom=((chrom&CHROM_MASK_HIGH)+CHROMS_PER_BLOCK)){ + final int baseChrom=baseChrom(chrom); + for(int pmi=0; pmi<2; pmi++, cycle++){ + + int[] keys=pm[pmi][0]; + int[] keyScores=pm[pmi][1]; + int[] offsets=pm[pmi][2]; +// int[][] hits=getHitArray(offsets.length); + + int[] starts=startArray; + int[] stops=stopArray; + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + + if(numHits find(int[] keys, final byte[] bases, final byte[] baseScores, int[] keyScores, + final int chrom, final byte strand, + int[] offsets, final boolean obeyLimits, ArrayList ssl, int[] bestScores, + final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + + int[] starts=startArray; + int[] stops=stopArray; + + int numHits=getHits(keys, chrom, Integer.MAX_VALUE, starts, stops); + if(numHits=0){numHits++;} + } + + if(numHits==offsets.length){ + return null; + }else{ + int[][] r=shrinkReturn3; + int[] starts2=startArray; + int[] stops2=stopArray; + int[] 
offsets2=getOffsetArray(numHits); + int[] keyScores2=new int[numHits]; + + for(int i=0, j=0; i=0){ + starts2[j]=starts[i]; + stops2[j]=stops[i]; + offsets2[j]=offsets[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + r[0]=starts2; + r[1]=stops2; + r[2]=offsets2; + r[4]=keyScores2; + return r; + } + } + + /** Removes "-1" keys. */ + private final int[][] shrink2(int[] offsets, int[] keys, int[] keyScores){ + + + int numHits=0; + for(int i=0; i=0){numHits++;} + } + + + assert(checkOffsets(offsets)) : Arrays.toString(offsets); + if(numHits==keys.length){ + return null; + }else{ + int[][] r=shrinkReturn2; + int[] offsets2=getOffsetArray(numHits); + assert(offsets2!=offsets); + assert(offsets2.length=0){ + offsets2[j]=offsets[i]; + keys2[j]=keys[i]; + keyScores2[j]=keyScores[i]; + j++; + } + } + assert(checkOffsets(offsets2)) : "\nnumHits="+numHits+"\n"+Arrays.toString(offsets)+" -> \n"+Arrays.toString(offsets2)+"\n"+ + "\n"+Arrays.toString(keys)+" -> \n"+Arrays.toString(keys2)+"\n"; + r[0]=offsets2; + r[1]=keys2; + r[2]=keyScores2; + return r; + } + } + + + /** This uses a heap to track next column to increment */ + private final ArrayList slowWalk3(int[] starts, int[] stops, final byte[] bases, + final byte[] baseScores, int[] keyScores, int[] offsets, + final int baseChrom_, final byte strand, final boolean obeyLimits, ArrayList ssl, + int[] bestScores, final boolean allBasesCovered, final int maxScore, final boolean fullyDefined){ + assert(USE_EXTENDED_SCORE); + + final int numKeys=offsets.length; //Before shrink + + //This can be done before or after shrinking, but the results will change depending on MIN_SCORE_MULT and etc. 
+ final int maxQuickScore=maxQuickScore(offsets, keyScores); + + if(SHRINK_BEFORE_WALK){ + int[][] r=shrink(starts, stops, offsets, keyScores, offsets.length); + if(r!=null){ + starts=r[0]; + stops=r[1]; + offsets=r[2]; + keyScores=r[4]; + } + } + + final int numHits=offsets.length; //After shrink + + + assert(numHits==offsets.length); + assert(numHits==keyScores.length); + + usedKeys+=numHits; + usedKeyIterations++; + + final boolean filter_by_qscore=(FILTER_BY_QSCORE && numKeys>=5); + + assert(!(!SHRINK_BEFORE_WALK && ADD_SCORE_Z)); + + +// final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*0.85f*maxScore)); + final int minScore=(obeyLimits ? (int)(MIN_SCORE_MULT*maxScore) : (int)(MIN_SCORE_MULT*1.25f*maxScore)); + final int minQuickScore=(int)(MIN_QSCORE_MULT*maxQuickScore); + + final int baseChrom=baseChrom(baseChrom_); + + heap.clear(); + + final Quad[] triples=tripleStorage; + + final int[] values=valueArray; + final int[] sizes=sizeArray; + final int[] locArray=(USE_EXTENDED_SCORE ? 
getLocArray(bases.length) : null); + final Block b=index[baseChrom]; + + if(ssl==null){ssl=new ArrayList(8);} + + int currentTopScore=bestScores[0]; + int cutoff=Tools.max(minScore, (int)(currentTopScore*DYNAMIC_SCORE_THRESH)); + + int qcutoff=Tools.max(bestScores[2], minQuickScore); + int bestqscore=bestScores[3]; + int maxHits=bestScores[1]; + int perfectsFound=bestScores[5]; + assert((currentTopScore>=maxScore) == (perfectsFound>0)) : currentTopScore+", "+maxScore+", "+perfectsFound+", "+maxHits+", "+numHits; + int approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, currentTopScore>=maxScore); + if(approxHitsCutoff>numHits){return ssl;} + + final boolean shortCircuit=(allBasesCovered && numKeys==numHits && filter_by_qscore); + + if(currentTopScore>=maxScore){ + assert(currentTopScore==maxScore); + + } + + + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + +// System.out.println("\nEntering SS loop:"); +// System.out.println("maxScore="+maxScore+"\tminScore="+minScore+"\tcurrentTopScore="+currentTopScore+"\n" + +// "cutoff="+cutoff+"\tmaxHits="+maxHits+"\tapproxHitsCutoff="+approxHitsCutoff); +// System.out.println("maxQuickScore="+maxQuickScore+"\tminQuickScore="+minQuickScore+"\tqcutoff="+qcutoff); + + + SiteScore prevSS=null; + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + + {//Inner loop + final int minsite=site-MAX_INDEL, maxsite=site+MAX_INDEL2; + for(int column=0, 
chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int score; + int qscore=(filter_by_qscore ? quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits) : qcutoff); + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + int mapStart=site, mapStop=maxNearbySite; + + assert(USE_EXTENDED_SCORE); + + boolean locArrayValid=false; + if(qscore-1){ + if(xmax){max=x;} + } + } + + if(score>=maxScore){ + assert(min==max && min>-1) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + + // assert(min>-1 && max>-1) : Arrays.toString(locArray); //TODO: How did this assertion trigger? 
+ if(min<0 || max<0){ + System.err.println("Anomaly in "+getClass().getName()+".slowWalk: "+ + chrom+", "+mapStart+", "+mapStop+", "+centerIndex+", "+ + Arrays.toString(locArray)+"\n"+ + Arrays.toString(values)+"\n"+ + new String(bases)+"\nstrand="+strand+"\n"); + System.err.println(); + score=-99999; + } + + //mapStart and mapStop are indices + mapStart=toNumber(min, chrom); + mapStop=toNumber(max, chrom); + + if(score>=maxScore){ + assert(mapStop-mapStart==0) : "\n"+score+", "+maxScore+", "+min+", "+max+ + ", "+(max-min)+", "+(mapStop-mapStart)+", "+bases.length+"\n"+Arrays.toString(locArray)+"\n"; + } + } + +// if(score==maxScore){//Disabled for Skimmer version +// qcutoff=Tools.max(qcutoff, (int)(maxQuickScore*DYNAMIC_QSCORE_THRESH_PERFECT)); +// approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, MIN_APPROX_HITS_TO_KEEP, true); +// } + + if(score>=cutoff){ + qcutoff=calcQScoreCutoff(maxQuickScore, qscore, qcutoff); + bestqscore=Tools.max(qscore, bestqscore); + } + } + + if(score>=cutoff){ + + if(score>currentTopScore){ +// System.err.println("New top score!"); + + if(DYNAMICALLY_TRIM_LOW_SCORES){ + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=calcApproxHitsCutoff(numKeys, maxHits, approxHitsCutoff, currentTopScore>=maxScore); + cutoff=calcScoreCutoff(maxScore, currentTopScore, cutoff); + } + + currentTopScore=score; + +// System.out.println("New top score: "+currentTopScore+" \t("+cutoff+")"); + } + + final int chrom=numberToChrom(mapStart, baseChrom); + final int site2=numberToSite(mapStart); + final int site3=numberToSite(mapStop)+bases.length-1; + + assert(NUM_CHROM_BITS==0 || site2 "+site2); +// System.err.println(mapStop+" -> "+site3); + + assert(gapArray[0]>=site2 && gapArray[0]-site2 "+site2+"\n"+ + mapStop+" -> "+site3+"\n\n"+ + Arrays.toString(gapArray)+"\n\n"+ +// Arrays.toString(clone)+"\n\n"+ + Arrays.toString(locArray)+"\n"+ + "numHits="+numHits+", "+ + "heap.size="+heap.size()+", "+ + "numHits="+numHits+", "+ + 
"approxHits="+approxHits+"\n"; + gapArray[0]=Tools.min(gapArray[0], site2); + gapArray[gapArray.length-1]=Tools.max(gapArray[gapArray.length-1], site3); + } + if(verbose){System.err.println("@ site "+site2+", made gap array: "+Arrays.toString(gapArray));} +// assert(false) : Arrays.toString(locArray); + } + + + //This block is optional, but tries to eliminate multiple identical alignments + + SiteScore ss=null; + final boolean perfect1=USE_EXTENDED_SCORE && score==maxScore && fullyDefined; + final boolean inbounds=(site2>=0 && site3=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", strand="+strand+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + }else if(heap.size()=prevMaxHits); + + final int baseChrom=baseChrom(baseChrom_); + final Block b=index[baseChrom]; + final int[] sizes=sizeArray; + + heap.clear(); + for(int i=0; i0); + + int a=sites[start]; + int a2; + if((a&SITE_MASK)>=offsets[i]){ + a2=a-offsets[i]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[i], 0); + a2=toNumber(st2, ch); + } + assert(numberToChrom(a, 
baseChrom) == numberToChrom(a2, baseChrom)); + + Quad t=triples[i]; + assert(t!=null) : "Should be using tripleStorage"; + assert(i==t.column); + t.row=start; + t.site=a2; + t.list=sites; + values[i]=a2; + + heap.add(t); + } + + final int maxQuickScore=maxQuickScore(offsets, keyScores); + + int topQscore=-999999999; + + int maxHits=0; +// int approxHitsCutoff=MIN_APPROX_HITS_TO_KEEP; + + + int approxHitsCutoff; + final int indelCutoff; + if(perfectOnly){ + approxHitsCutoff=numHits; + indelCutoff=0; + }else{ + approxHitsCutoff=Tools.max(prevMaxHits, Tools.min(MIN_APPROX_HITS_TO_KEEP, numHits-1)); //Faster, same accuracy + indelCutoff=MAX_INDEL2; + } + + + while(!heap.isEmpty()){ + Quad t=heap.peek(); + final int site=t.site; + final int centerIndex=t.column; + + int maxNearbySite=site; + + + int approxHits=0; + + {//Inner loop + final int minsite=site-Tools.min(MAX_INDEL, indelCutoff), maxsite=site+MAX_INDEL2; + for(int column=0, chances=numHits-approxHitsCutoff; column=0; column++){ + final int x=values[column]; + assert(x==triples[column].site); + if(x>=minsite && x<=maxsite){ + maxNearbySite=(x>maxNearbySite ? 
x : maxNearbySite); + approxHits++; + }else{chances--;} + } + } + + assert(centerIndex>=0) : centerIndex; + assert(approxHits>=1 || approxHitsCutoff>1) : approxHits+", "+approxHitsCutoff+", "+numHits+", "+t.column; + if(approxHits>=approxHitsCutoff){ + + int qscore=quickScore(values, keyScores, centerIndex, offsets, sizes, true, approxHits, numHits); + + if(ADD_SCORE_Z){ + int scoreZ=scoreZ2(values, centerIndex, offsets, approxHits, numHits); + qscore+=scoreZ; + } + + if(qscore>topQscore){ + +// maxHits=Tools.max(approxHits, maxHits); +// approxHitsCutoff=Tools.max(approxHitsCutoff, maxHits); //Best setting for pre-scan + + maxHits=Tools.max(approxHits, maxHits); + approxHitsCutoff=Tools.max(approxHitsCutoff, approxHits-1); //Best setting for pre-scan + + topQscore=qscore; + + if(qscore>=maxQuickScore){ + assert(qscore==maxQuickScore); + assert(approxHits==numHits); + if(earlyExit){ + return new int[] {topQscore, maxHits}; + } + } + } + } + + while(heap.peek().site==site){ //Remove all identical elements, and add subsequent elements + final Quad t2=heap.poll(); + final int row=t2.row+1, col=t2.column; + if(row=offsets[col]){ + a2=a-offsets[col]; + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + }else{ + int ch=numberToChrom(a, baseChrom); + int st=numberToSite(a); + int st2=Tools.max(st-offsets[col], 0); + a2=toNumber(st2, ch); + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", a2="+a2+", offsets["+col+"]="+offsets[col]; + } + + assert(numberToChrom(a, baseChrom) == numberToChrom(a2, baseChrom)) : + "baseChrom="+baseChrom+", chrom="+numberToChrom(a, baseChrom)+", site="+site+ + ", maxNearbySite="+maxNearbySite+", a="+a+", 
a2="+a2+", offsets["+col+"]="+offsets[col]; + + t2.site=a2; + values[col]=a2; + heap.add(t2); + }else if(earlyExit && (perfectOnly || heap.size()b ? a-b : b-a; + } + + + final int maxScore(int[] offsets, byte[] baseScores, int[] keyScores, int readlen, boolean useQuality){ + + if(useQuality){ + //These lines apparently MUST be used if quality is used later on for slow align. + if(USE_AFFINE_SCORE){return msa.maxQuality(baseScores);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5)+Tools.sum(baseScores);} + }else{ + if(USE_AFFINE_SCORE){return msa.maxQuality(readlen);} + if(USE_EXTENDED_SCORE){return readlen*(BASE_HIT_SCORE+BASE_HIT_SCORE/5);} + } + + return maxQuickScore(offsets, keyScores); + } + + + public final int maxQuickScore(int[] offsets, int[] keyScores){ + +// int x=offsets.length*BASE_KEY_HIT_SCORE; + int x=Tools.intSum(keyScores); + int y=(Y_SCORE_MULT+Y2_SCORE_MULT)*(offsets[offsets.length-1]-offsets[0]); +// if(ADD_LIST_SIZE_BONUS){x+=(LIST_SIZE_BONUS[1]*offsets.length);} +// assert(!ADD_SCORE_Z) : "Need to make sure this is correct..."; + +// if(ADD_SCORE_Z){x+=((offsets[offsets.length-1]+CHUNKSIZE)*Z_SCORE_MULT);} + if(ADD_SCORE_Z){x+=maxScoreZ(offsets);} + + return x+y; +// int bonus=(2*(HIT_SCORE/2)); //For matching both ends +// return x+y+bonus; + } + + + private final int quickScore(final int[] locs, final int[] keyScores, final int centerIndex, final int offsets[], + int[] sizes, final boolean penalizeIndels, final int numApproxHits, final int numHits){ + + if(numApproxHits==1){return keyScores[centerIndex];} + + //Done! + //Correct way to calculate score: + //Find the first chunk that exactly hits the center. + //Then, align leftward of it, and align rightward of it, and sum the scores. + + //"-centerIndex" is a disambiguating term that, given otherwise identical match patterns + //(for example, a small indel will generate two valid site candidates), choose the lower site. 
+ + int x=keyScores[centerIndex]+scoreLeft(locs, keyScores, centerIndex, sizes, penalizeIndels)+ + scoreRight(locs, keyScores, centerIndex, sizes, penalizeIndels, numHits)-centerIndex; + + int y=Y_SCORE_MULT*scoreY(locs, centerIndex, offsets)+Y2_SCORE_MULT*scoreY2(locs, centerIndex, offsets); + if(ADD_LIST_SIZE_BONUS){x+=calcListSizeBonus(sizes[centerIndex]);} +// int z=scoreZ(locs, hits); + return x+y; + } + + + /** Generates a term that increases score with how far apart the two farthest perfect (+- Y2_INDEL) matches are. + * Assumes that the centerIndex corresponds to the leftmost perfect match. */ + public final int scoreY2(int[] locs, int centerIndex, int offsets[]){ + int center=locs[centerIndex]; +// +// int leftIndex=centerIndex; +// for(int i=centerIndex-1; i>=0; i--){ +// if(absdif(locs[i], centerIndex)>Y2_INDEL){break;} +// leftIndex=i; +// } + + int leftIndex=centerIndex; + for(int i=0; iY2_INDEL){break;} + if(absdif(locs[i], center)<=Y2_INDEL){ + leftIndex=i; + break; + } + } + + int rightIndex=centerIndex; + for(int i=offsets.length-1; i>centerIndex; i--){ +// assert(locs[i]>=locs[centerIndex]); +// if(locs[i]-centerIndex>Y2_INDEL){break;} + if(absdif(locs[i], center)<=Y2_INDEL){ + rightIndex=i; + break; + } + } + + return offsets[rightIndex]-offsets[leftIndex]; + } + + +// /** Generates a term that increases score with how many bases in the read match the ref. 
*/ +// public static final int scoreZ(int[] locs, int centerIndex, int offsets[]){ +// final int center=locs[centerIndex]; +// +// final int[] refLoc=new int[offsets[offsets.length-1]+CHUNKSIZE]; +// +// final int maxLoc=center+MAX_INDEL2; +// final int minLoc=Tools.max(0, center-MAX_INDEL); +// +// int score=0; +// +// for(int i=0; i=minLoc && loc<=maxLoc){ +//// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +//// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); +// +// int offset=offsets[i]; +// int max=CHUNKSIZE+offset; +// +// for(int j=offset; jloc){ +// refLoc[j]=loc; +// score-=2; +// }else if(old==loc){ +// score-=1; +// //do nothing, perhaps, or add 1? +// }else{ +// score-=2; +// assert(old=0 && rloc>=0 && rloc=0); + if(dif>minGap){ + gaps++; + } + } + if(gaps<1){return null;} + int[] out=new int[2+gaps*2]; + out[0]=locArray[0]; + out[out.length-1]=locArray[locArray.length-1]; + + for(int i=1, j=1; i=0); + if(dif>minGap){ + out[j]=locArray[i-1]; + out[j+1]=locArray[i]; + j+=2; + } + } + return out; + } + + + /** Generates a term that increases score with how many bases in the read match the ref. 
*/ + private final int scoreZ2(int[] locs, int centerIndex, int offsets[], int numApproxHits, int numHits){ + + if(numApproxHits==1){return SCOREZ_1KEY;} + + final int center=locs[centerIndex]; + + final int maxLoc=center+MAX_INDEL2; + final int minLoc=Tools.max(0, center-MAX_INDEL); + + int score=0; + + int a0=-1, b0=-1; + + for(int i=0; i=minLoc && loc<=maxLoc){ +// assert(loc>=center) : "loc="+loc+"\ni="+i+"\ncenterIndex="+centerIndex+ +// "\nmaxLoc="+maxLoc+"\nlocs:\t"+Arrays.toString(locs)+"\noffsets:\t"+Arrays.toString(offsets); + int a=offsets[i]; + + if(b0=0){ + prev=loc; + loc=locs[i]; + + int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==locs.length-1){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; +// score-=(INDEL_PENALTY+Tools.min(INDEL_PENALTY_MULT*offset, 1+HIT_SCORE/4)); + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + private final int scoreLeft(int[] locs, int[] keyScores, int centerIndex, int[] sizes, boolean penalizeIndels){ + + callsToScore++; + + int score=0; + + int prev, loc=locs[centerIndex]; + + for(int i=centerIndex-1; i>=0; i--){ + + if(locs[i]>=0){ + prev=loc; + loc=locs[i]; + + int offset=absdif(loc, prev); + + if(offset<=MAX_INDEL){ + score+=keyScores[i]; + if(ADD_LIST_SIZE_BONUS){score+=calcListSizeBonus(sizes[i]);} + +// if(i==0){score+=HIT_SCORE/2;} //Adds a bonus for matching the first or last key + if(penalizeIndels && offset!=0){ + int penalty=Tools.min(INDEL_PENALTY+INDEL_PENALTY_MULT*offset, MAX_PENALTY_FOR_MISALIGNED_HIT); + score-=penalty; + } + }else{ + loc=prev; + } + } + + } + return score; + + } + + /** Encode a (location, chrom) pair to an index */ + private static final int toNumber(int site, int chrom){ + int 
out=(chrom&CHROM_MASK_LOW); + out=out<=0 && f<1); + FRACTION_GENOME_TO_EXCLUDE=f; + MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + } + + + /** Default .75. Range: 0 to 1 (but 0 will break everything). Lower is faster and less accurate. */ + static final float HIT_FRACTION_TO_RETAIN=.97f; //default: .85 + /** Range: 0 to 1000. Lower should be faster and less accurate. */ + static int MIN_INDEX_TO_DROP_LONG_HIT_LIST=(int)(1000*(1-3.5*FRACTION_GENOME_TO_EXCLUDE)); //default 810 + /** Range: 2 to infinity. Lower should be faster and less accurate. */ + static final int MIN_HIT_LISTS_TO_RETAIN=12; + + static int MAX_AVERAGE_LIST_TO_SEARCH=(int)(1000*(1-2.3*FRACTION_GENOME_TO_EXCLUDE)); //lower is faster, default 840 + //lower is faster + static int MAX_AVERAGE_LIST_TO_SEARCH2=(int)(1000*(1-1.4*FRACTION_GENOME_TO_EXCLUDE)); //default 910 + //lower is faster + static int MAX_SINGLE_LIST_TO_SEARCH=(int)(1000*(1-1.0*FRACTION_GENOME_TO_EXCLUDE)); //default 935 + //lower is faster + static int MAX_SHORTEST_LIST_TO_SEARCH=(int)(1000*(1-2.8*FRACTION_GENOME_TO_EXCLUDE)); //Default 860 + + /** To increase accuracy on small genomes, override greedy list dismissal when the list is at most this long. */ + public static final int SMALL_GENOME_LIST=80; + + static{assert(!(TRIM_BY_GREEDY && TRIM_BY_TOTAL_SITE_COUNT)) : "Pick one.";} + + static final int CLUMPY_MAX_DIST=5; //Keys repeating over intervals of this or less are clumpy. + + /** Minimum length of list before clumpiness is considered. This is an index in the length histogram, from 0 to 1000. 
*/ + static final int CLUMPY_MIN_LENGTH_INDEX=2800; + static final float CLUMPY_FRACTION=0.8f; //0 to 1; higher is slower but more stringent. 0.5 means the median distance is clumpy. + + static final int MAX_SUBSUMPTION_LENGTH=MAX_INDEL2; + + private static final int calcQScoreCutoff(final int max, final int score, final int currentCutoff){ + assert(max>=score) : max+", "+score; + assert(score>=0); + + assert(currentCutoff>0); + int r=Tools.max(currentCutoff, Tools.min((int)(SKIM_LEVEL_Q*max), (int)(DYNAMIC_SKIM_LEVEL_Q*score))); +// if(r>currentCutoff){ +// System.out.println("qcutoff: "+currentCutoff+"\t->\t"+r); +// } + return r; + } + + private static final int calcScoreCutoff(final int max, final int score, final int currentCutoff){ + assert(max>=score) : max+", "+score; + assert(score>=0); + + assert(currentCutoff>0); + int r=Tools.max(currentCutoff, Tools.min((int)(SKIM_LEVEL*max), (int)(DYNAMIC_SKIM_LEVEL*score))); + return r; + } + + private static final int calcApproxHitsCutoff(final int keys, final int hits, int currentCutoff, final boolean perfect){ //***$ + assert(keys>=hits) : keys+", "+hits; + assert(hits>=0); + + int mahtk=MIN_APPROX_HITS_TO_KEEP; + if(SEMIPERFECTMODE || PERFECTMODE){ + if(keys==1){return 1;} + else if(MIN_APPROX_HITS_TO_KEEP0); + return Tools.max(currentCutoff, Tools.min((int)(SKIM_LEVEL_H*keys), (int)(DYNAMIC_SKIM_LEVEL_H*hits))); + } + + public static boolean PRESCAN_QSCORE=true && USE_EXTENDED_SCORE; //Decrease quality and increase speed + public static final boolean FILTER_BY_QSCORE=true; //Slightly lower quality, but very fast. + public static final float MIN_SCORE_MULT=(USE_AFFINE_SCORE ? 0.03f : USE_EXTENDED_SCORE ? .3f : 0.10f); //Fraction of max score to use as cutoff. Default 0.15, max is 1; lower is more accurate + public static final float MIN_QSCORE_MULT=0.03f; //Fraction of max score to use as cutoff. Default 0.025, max is 1; lower is more accurate. VERY SENSITIVE. 
+ public static final float MIN_QSCORE_MULT2=0.03f; + static final float DYNAMIC_SCORE_THRESH=(USE_AFFINE_SCORE ? 0.55f : USE_EXTENDED_SCORE ? .74f : 0.6f); //Default .85f; lower is more accurate + static{ + assert(MIN_SCORE_MULT>=0 && MIN_SCORE_MULT<1); +// assert(DYNAMIC_SCORE_THRESH>=0 && DYNAMIC_SCORE_THRESH<1); + } + + //Skim Depth Settings + + /** Always retain sites with at least this fraction of max hits (to pass on to qscore) */ + public static float SKIM_LEVEL_H=0.098f; //.08 or .09 + /** Always retain sites with at least this fraction of best hits */ + public static final float DYNAMIC_SKIM_LEVEL_H=0.48f; //.45 + + /** Always retain sites with at least this fraction of max qscore (to pass on to extend) */ + public static float SKIM_LEVEL_Q=0.098f; //.09 + /** Always retain sites with at least this fraction of best qscore */ + public static final float DYNAMIC_SKIM_LEVEL_Q=0.78f; //.75 + + /** Always retain sites with at least this fraction of max score (to output) */ + public static float SKIM_LEVEL=0.105f; //.10 + /** Always retain sites with at least this fraction of best score */ + public static final float DYNAMIC_SKIM_LEVEL=0.78f; //.75 + + +} diff --git a/current/align2/BBMap.java b/current/align2/BBMap.java new file mode 100755 index 0000000..2aced97 --- /dev/null +++ b/current/align2/BBMap.java @@ -0,0 +1,528 @@ +package align2; + +import java.io.File; +import java.util.ArrayList; + +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.ReadStreamWriter; +import stream.SamLine; + +import dna.ChromosomeArray; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; + +/** + * Based on TestIndex11f + * + * @author Brian Bushnell + * @date Dec 22, 2012 + * + */ +public final class BBMap extends AbstractMapper { + + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + BBMap mapper=new BBMap(args); + mapper.loadIndex(); + if(Data.scaffoldPrefixes){mapper.processAmbig2();} + mapper.testSpeed(args); + 
ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("\nTotal time: \t"+t); + } + + public BBMap(String[] args){ + super(args); + } + + @Override + public void setDefaults(){ + ReadWrite.ZIPLEVEL=2; + MAKE_MATCH_STRING=true; + keylen=13; + + MINIMUM_ALIGNMENT_SCORE_RATIO=0.56f; + + keyDensity=1.9f;//2.3f; + maxKeyDensity=3f;//4f; + minKeyDensity=1.5f;//1.8f; + maxDesiredKeys=15; + + SLOW_ALIGN_PADDING=4; + SLOW_RESCUE_PADDING=4+SLOW_ALIGN_PADDING; + TIP_SEARCH_DIST=100; + + MSA_TYPE="MultiStateAligner11ts"; + MAX_SITESCORES_TO_PRINT=5; + PRINT_SECONDARY_ALIGNMENTS=false; + AbstractIndex.MIN_APPROX_HITS_TO_KEEP=1; + } + + @Override + public String[] preparse(String[] args){ + if(fast){ + ArrayList list=new ArrayList(); + list.add("tipsearch="+TIP_SEARCH_DIST/5); + list.add("maxindel=80"); + list.add("minhits=2"); + list.add("bwr=0.18"); + list.add("bw=40"); + list.add("minratio=0.65"); + list.add("midpad=150"); + list.add("minscaf=50"); + list.add("quickmatch=t"); +// list.add("k=13"); + + //TODO: Make these adjustable. 
+// MIN_TRIM_SITES_TO_RETAIN_SINGLE +// MIN_TRIM_SITES_TO_RETAIN_PAIRED +// MAX_TRIM_SITES_TO_RETAIN + //TODO: Make trimLists adjustable via an offset or multiplier + + BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*1.25f); + + for(String s : args){if(s!=null){list.add(s);}} + args=list.toArray(new String[list.size()]); + + keyDensity*=0.9f; + maxKeyDensity*=0.9f; + minKeyDensity*=0.9f; + }else if(slow){ + ArrayList list=new ArrayList(); + list.add("tipsearch="+(TIP_SEARCH_DIST*3)/2); +// list.add("maxindel=80"); + list.add("minhits=1"); +// list.add("bwr=0.18"); +// list.add("bw=40"); + list.add("minratio=0.45"); +// list.add("midpad=150"); +// list.add("minscaf=50"); +// list.add("k=13"); + + BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*0.4f); + + for(String s : args){if(s!=null){list.add(s);}} + args=list.toArray(new String[list.size()]); + + keyDensity*=1.2f; + maxKeyDensity*=1.2f; + minKeyDensity*=1.2f; + } + return args; + } + + @Override + void postparse(String[] args){ + + if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){ + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 3); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 6); + } + + if(maxIndel1>-1){ + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1); + BBIndex.MAX_INDEL=maxIndel1; + } + if(maxIndel2>-1){ + BBIndex.MAX_INDEL2=maxIndel2; + } + + if(minApproxHits>-1){ + BBIndex.MIN_APPROX_HITS_TO_KEEP=minApproxHits; + } + + if(expectedSites>-1){ + BBMapThread.setExpectedSites(expectedSites); + sysout.println("Set EXPECTED_SITES to "+expectedSites); + } + + if(fractionGenomeToExclude>=0){ + BBIndex.setFractionToExclude(fractionGenomeToExclude); + } + + { + final String a=(args.length>0 ? args[0] : null); + final String b=(args.length>1 ? 
args[1] : null); + if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;} + if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;} + if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);} + } + + assert(readlen0){ + int halfwidth=MSA.bandwidth/2; + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2); + BBIndex.MAX_INDEL=Tools.min(BBIndex.MAX_INDEL, halfwidth/2); + BBIndex.MAX_INDEL2=Tools.min(BBIndex.MAX_INDEL2, halfwidth); + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4); + } + + if(PRINT_SECONDARY_ALIGNMENTS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex.QUIT_AFTER_TWO_PERFECTS=false; + } + + if(ambigMode==AMBIG_BEST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + if(!PRINT_SECONDARY_ALIGNMENTS){BBIndex.QUIT_AFTER_TWO_PERFECTS=true;} + sysout.println("Retaining first best site only for ambiguous mappings."); + }else if(ambigMode==AMBIG_ALL){ + PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true; + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex.QUIT_AFTER_TWO_PERFECTS=false; + SamLine.MAKE_NH_TAG=true; + ambiguousAll=true; + sysout.println("Retaining all best sites for ambiguous mappings."); + }else if(ambigMode==AMBIG_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex.QUIT_AFTER_TWO_PERFECTS=false; + ambiguousRandom=true; + sysout.println("Choosing a site randomly for ambiguous mappings."); + }else if(ambigMode==AMBIG_TOSS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=true; + BBIndex.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Ambiguously mapped reads will be considered unmapped."); + }else{ + throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode); + } + + } + + @Override + public void setup(){ + + assert(!useRandomReads || reads>0 || (in1!=null && 
in1.equals("sequential"))) : "Please specify number of reads to use."; + + if(minid!=-1){ + MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE); + sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO)); + } + + if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);} + if(setxs && !setintron){SamLine.INTRON_LIMIT=10;} + + if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null && outFileB==null && outFileB2==null && BBSplitter.streamTable==null){ + sysout.println("No output file."); + OUTPUT_READS=false; + }else{ + OUTPUT_READS=true; + if(bamscript!=null){ + BBSplitter.makeBamScript(bamscript, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2); + } + } + + FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN); + assert(FastaReadInputStream.settingsOK()); + + if(build<0){throw new RuntimeException("Must specify a build number, e.g. 
build=1");} + else{Data.GENOME_BUILD=build;} + + if(blacklist!=null && blacklist.size()>0){ + Timer t=new Timer(); + t.start(); + for(String s : blacklist){ + Blacklist.addToBlacklist(s); + } + t.stop(); + sysout.println("Created blacklist:\t"+t); + t.start(); + } + + if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;} + if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);} + ReadWrite.USE_GZIP=gzip; + ReadWrite.USE_PIGZ=pigz; + //if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;} + } + + + @Override + void processAmbig2(){ + assert(Data.scaffoldPrefixes) : "Only process this block if there are multiple references."; + if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to special output streams."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to the first reference's stream only."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){ + BBIndex.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Reads that map to multiple references will be considered unmapped."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to a random stream."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to all relevant output streams."); + }else{ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST; + } + } + + @Override + void loadIndex(){ + Timer t=new 
Timer(); + t.start(); + + if(build>-1){ + Data.setGenome(build); + AbstractIndex.MINCHROM=1; + AbstractIndex.MAXCHROM=Data.numChroms; + if(minChrom<0){minChrom=1;} + if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;} + sysout.println("Set genome to "+Data.GENOME_BUILD); + + if(RefToIndex.AUTO_CHROMBITS){ + int maxLength=Tools.max(Data.chromLengths); + RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength)-1; + RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16); + } + if(RefToIndex.chrombits!=-1){ + BBIndex.setChromBits(RefToIndex.chrombits); + if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);} + } + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){sysout.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(OUTPUT_READS && !Tools.testOutputFiles(OVERWRITE, false, outFile, outFile2)){ + throw new RuntimeException("\n\nOVERWRITE="+OVERWRITE+"; Can't write to output files "+outFile+", "+outFile2+"\n"); + } + + if(reads>0 && reads=keylen); + assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) : + minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM; + AbstractIndex.MINCHROM=minChrom; + AbstractIndex.MAXCHROM=maxChrom; + + if(targetGenomeSize>0){ + long bases=Data.numDefinedBases; + long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize)); + BBMapThread.setExpectedSites((int)x); + sysout.println("Set EXPECTED_SITES to "+x); + } + + assert(!(PERFECTMODE && SEMIPERFECTMODE)); + if(PERFECTMODE){setPerfectMode();} + if(SEMIPERFECTMODE){setSemiperfectMode();} + + //Optional section for 
discrete timing of chrom array loading + if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){ + sysout.println(); + if(RefToIndex.chromlist==null){ + Data.loadChromosomes(minChrom, maxChrom); + }else{ + assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size(); + for(ChromosomeArray cha : RefToIndex.chromlist){ + Data.chromosomePlusMatrix[cha.chromosome]=cha; + } + } + if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();} + t.stop(); + sysout.println("Loaded Reference:\t"+t); + t.start(); + } + RefToIndex.chromlist=null; + + t.start(); + BBIndex.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK); + + { + long len=Data.numDefinedBases; + if(len<300000000){ + BBIndex.MAX_HITS_REDUCTION2+=1; + BBIndex.MAXIMUM_MAX_HITS_REDUCTION+=1; + if(len<30000000){ + BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*0.5f); + BBIndex.MAXIMUM_MAX_HITS_REDUCTION+=1; + BBIndex.HIT_REDUCTION_DIV=Tools.max(BBIndex.HIT_REDUCTION_DIV-1, 3); + }else if(len<100000000){ + BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*0.6f); + }else{ + BBIndex.setFractionToExclude(BBIndex.FRACTION_GENOME_TO_EXCLUDE*0.75f); + } + } + } + + t.stop(); + sysout.println("Generated Index:\t"+t); + t.start(); + + if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){ + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + Data.unload(chrom, true); + } + } + + if(ReadWrite.countActiveThreads()>0){ + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("Finished Writing:\t"+t); + t.start(); + } + + if(!forceanalyze && (in1==null || reads==0)){return;} + + BBIndex.analyzeIndex(minChrom, maxChrom, colorspace, BBIndex.FRACTION_GENOME_TO_EXCLUDE, keylen); + + t.stop(); + sysout.println("Analyzed Index: \t"+t); + t.start(); + } + + public void testSpeed(String[] args){ + + if(in1==null || reads==0){ + sysout.println("No reads to process; 
quitting."); + return; + } + + Timer t=new Timer(); + t.start(); + + final boolean paired=openStreams(t, args); + if(paired){BBIndex.QUIT_AFTER_TWO_PERFECTS=false;} + + t.start(); + + adjustThreadsforMemory(25); + + AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS; + AbstractMapThread[] mtts=new AbstractMapThread[Shared.THREADS]; + for(int i=0; i0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");} + } + + @Override + void setSemiperfectMode() { + assert(SEMIPERFECTMODE); + if(SEMIPERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f; + BBIndex.setSemiperfectMode(); + } + } + + @Override + void setPerfectMode() { + assert(PERFECTMODE); + if(PERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f; + BBIndex.setPerfectMode(); + } + } + + + @Override + void printSettings(int k){ + + printSettings0(k, BBIndex.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO); + + if(verbose_stats>=2){ + sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")"); + sysout.println("Max keys: \t"+maxDesiredKeys); + + sysout.println("Block Subsections: \t"+BBIndex.CHROMS_PER_BLOCK); + sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndex.REMOVE_FREQUENT_GENOME_FRACTION ? 
BBIndex.FRACTION_GENOME_TO_EXCLUDE : 0))); + // sysout.println("ADD_SCORE_Z: \t"+Index4.ADD_SCORE_Z); + sysout.println("Hits To Keep: \t"+BBIndex.MIN_APPROX_HITS_TO_KEEP); + } + + if(verbose_stats>=3){ + sysout.println("Remove Clumpy: \t"+BBIndex.REMOVE_CLUMPY); + if(BBIndex.REMOVE_CLUMPY){ + sysout.println("CLUMPY_MAX_DIST: \t"+BBIndex.CLUMPY_MAX_DIST); + sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndex.CLUMPY_MIN_LENGTH_INDEX); + sysout.println("CLUMPY_FRACTION: \t"+BBIndex.CLUMPY_FRACTION); + } + sysout.println("Remove Long Lists: \t"+BBIndex.TRIM_LONG_HIT_LISTS); + if(BBIndex.TRIM_LONG_HIT_LISTS){ + sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndex.HIT_FRACTION_TO_RETAIN); + } + sysout.println("Trim By Greedy: \t"+BBIndex.TRIM_BY_GREEDY); + sysout.println("Trim By Total Sites: \t"+BBIndex.TRIM_BY_TOTAL_SITE_COUNT); + if(BBIndex.TRIM_BY_TOTAL_SITE_COUNT){ + sysout.println("MAX_AVG_SITES: \t"+BBIndex.MAX_AVERAGE_LIST_TO_SEARCH); + sysout.println("MAX_AVG_SITES_2: \t"+BBIndex.MAX_AVERAGE_LIST_TO_SEARCH2); + sysout.println("MAX_SHORTEST_SITE: \t"+BBIndex.MAX_SHORTEST_LIST_TO_SEARCH); + } + sysout.println("Index Min Score: \t"+BBIndex.MIN_SCORE_MULT); + + sysout.println("Dynamic Trim: \t"+BBIndex.DYNAMICALLY_TRIM_LOW_SCORES); + if(BBIndex.DYNAMICALLY_TRIM_LOW_SCORES){ + sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndex.DYNAMIC_SCORE_THRESH); + } + } + + } + +} diff --git a/current/align2/BBMap5.java b/current/align2/BBMap5.java new file mode 100755 index 0000000..0a924d4 --- /dev/null +++ b/current/align2/BBMap5.java @@ -0,0 +1,501 @@ +package align2; + +import java.io.File; +import java.util.ArrayList; + +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.ReadStreamWriter; +import stream.SamLine; + +import dna.ChromosomeArray; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; + +/** + * Based on TestIndex11f + * + * @author Brian Bushnell + * @date Jan 3, 2013 + * + */ +public final class BBMap5 extends AbstractMapper { + + + 
public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + BBMap5 mapper=new BBMap5(args); + mapper.loadIndex(); + if(Data.scaffoldPrefixes){mapper.processAmbig2();} + mapper.testSpeed(args); + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("\nTotal time: \t"+t); + } + + public BBMap5(String[] args){ + super(args); + } + + @Override + public void setDefaults(){ + ReadWrite.ZIPLEVEL=2; + MAKE_MATCH_STRING=true; + keylen=13; + + MINIMUM_ALIGNMENT_SCORE_RATIO=0.56f; + + keyDensity=1.9f;//2.3f; + maxKeyDensity=3f;//4f; + minKeyDensity=1.5f;//1.8f; + maxDesiredKeys=15; + + SLOW_ALIGN_PADDING=4; + SLOW_RESCUE_PADDING=4+SLOW_ALIGN_PADDING; + TIP_SEARCH_DIST=100; + + MSA_TYPE="MultiStateAligner11ts"; + MAX_SITESCORES_TO_PRINT=5; + PRINT_SECONDARY_ALIGNMENTS=false; + AbstractIndex.MIN_APPROX_HITS_TO_KEEP=1; + } + + @Override + public String[] preparse(String[] args){ + if(fast){ + ArrayList list=new ArrayList(); + list.add("tipsearch="+TIP_SEARCH_DIST/5); + list.add("maxindel=80"); + list.add("minhits=2"); + list.add("bwr=0.18"); + list.add("bw=40"); + list.add("minratio=0.65"); + list.add("midpad=150"); + list.add("minscaf=50"); + list.add("quickmatch=t"); +// list.add("k=13"); + + BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*1.25f); + + for(String s : args){if(s!=null){list.add(s);}} + args=list.toArray(new String[list.size()]); + + keyDensity*=0.9f; + maxKeyDensity*=0.9f; + minKeyDensity*=0.9f; + } + return args; + } + + @Override + void postparse(String[] args){ + + if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){ + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 3); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 6); + } + + if(maxIndel1>-1){ + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1); + BBIndex5.MAX_INDEL=maxIndel1; + } + if(maxIndel2>-1){ + BBIndex5.MAX_INDEL2=maxIndel2; + } + + if(minApproxHits>-1){ + BBIndex5.MIN_APPROX_HITS_TO_KEEP=minApproxHits; + } + + if(expectedSites>-1){ + 
BBMapThread5.setExpectedSites(expectedSites); + sysout.println("Set EXPECTED_SITES to "+expectedSites); + } + + if(fractionGenomeToExclude>=0){ + BBIndex5.setFractionToExclude(fractionGenomeToExclude); + } + + { + final String a=(args.length>0 ? args[0] : null); + final String b=(args.length>1 ? args[1] : null); + if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;} + if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;} + if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);} + } + + assert(readlen0){ + int halfwidth=MSA.bandwidth/2; + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2); + BBIndex5.MAX_INDEL=Tools.min(BBIndex5.MAX_INDEL, halfwidth/2); + BBIndex5.MAX_INDEL2=Tools.min(BBIndex5.MAX_INDEL2, halfwidth); + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4); + } + + if(PRINT_SECONDARY_ALIGNMENTS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex5.QUIT_AFTER_TWO_PERFECTS=false; + } + + if(ambigMode==AMBIG_BEST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + if(!PRINT_SECONDARY_ALIGNMENTS){BBIndex5.QUIT_AFTER_TWO_PERFECTS=true;} + sysout.println("Retaining first best site only for ambiguous mappings."); + }else if(ambigMode==AMBIG_ALL){ + PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true; + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex5.QUIT_AFTER_TWO_PERFECTS=false; + SamLine.MAKE_NH_TAG=true; + ambiguousAll=true; + sysout.println("Retaining all best sites for ambiguous mappings."); + }else if(ambigMode==AMBIG_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex5.QUIT_AFTER_TWO_PERFECTS=false; + ambiguousRandom=true; + sysout.println("Choosing a site randomly for ambiguous mappings."); + }else if(ambigMode==AMBIG_TOSS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=true; + 
BBIndex5.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Ambiguously mapped reads will be considered unmapped."); + }else{ + throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode); + } + + } + + @Override + public void setup(){ + + assert(!useRandomReads || reads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use."; + + if(minid!=-1){ + MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE); + sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO)); + } + + if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);} + if(setxs && !setintron){SamLine.INTRON_LIMIT=10;} + + if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null && outFileB==null && outFileB2==null && BBSplitter.streamTable==null){ + sysout.println("No output file."); + OUTPUT_READS=false; + }else{ + OUTPUT_READS=true; + if(bamscript!=null){ + BBSplitter.makeBamScript(bamscript, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2); + } + } + + FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN); + assert(FastaReadInputStream.settingsOK()); + + if(build<0){throw new RuntimeException("Must specify a build number, e.g. build=1");} + else{Data.GENOME_BUILD=build;} + + if(blacklist!=null && blacklist.size()>0){ + Timer t=new Timer(); + t.start(); + for(String s : blacklist){ + Blacklist.addToBlacklist(s); + } + t.stop(); + sysout.println("Created blacklist:\t"+t); + t.start(); + } + + if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;} + if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);} + ReadWrite.USE_GZIP=gzip; + ReadWrite.USE_PIGZ=pigz; + } + + + @Override + void processAmbig2(){ + assert(false) : "TODO: Only process this block if there are multiple references."; //This information may not be available yet, though... 
+ if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex5.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to special output streams."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex5.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to the first reference's stream only."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){ + BBIndex5.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Reads that map to multiple references will be considered unmapped."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex5.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to a random stream."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndex5.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to all relevant output streams."); + }else{ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST; + } + } + + @Override + void loadIndex(){ + Timer t=new Timer(); + t.start(); + + if(build>-1){ + Data.setGenome(build); + AbstractIndex.MINCHROM=1; + AbstractIndex.MAXCHROM=Data.numChroms; + if(minChrom<0){minChrom=1;} + if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;} + sysout.println("Set genome to "+Data.GENOME_BUILD); + + if(RefToIndex.AUTO_CHROMBITS){ + int maxLength=Tools.max(Data.chromLengths); + RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength); //Different for v5! 
+ RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16); + } + if(RefToIndex.chrombits!=-1){ + BBIndex5.setChromBits(RefToIndex.chrombits); + if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);} + } + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){sysout.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(OUTPUT_READS && !Tools.testOutputFiles(OVERWRITE, false, outFile, outFile2)){ + throw new RuntimeException("\n\nOVERWRITE="+OVERWRITE+"; Can't write to output files "+outFile+", "+outFile2+"\n"); + } + + if(reads>0 && reads=keylen); + assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) : + minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM; + AbstractIndex.MINCHROM=minChrom; + AbstractIndex.MAXCHROM=maxChrom; + + if(targetGenomeSize>0){ + long bases=Data.numDefinedBases; + long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize)); + BBMapThread5.setExpectedSites((int)x); + sysout.println("Set EXPECTED_SITES to "+x); + } + + assert(!(PERFECTMODE && SEMIPERFECTMODE)); + if(PERFECTMODE){setPerfectMode();} + if(SEMIPERFECTMODE){setSemiperfectMode();} + + //Optional section for discrete timing of chrom array loading + if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){ + sysout.println(); + if(RefToIndex.chromlist==null){ + Data.loadChromosomes(minChrom, maxChrom); + }else{ + assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size(); + for(ChromosomeArray cha : RefToIndex.chromlist){ + Data.chromosomePlusMatrix[cha.chromosome]=cha; + } + 
} + if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();} + t.stop(); + sysout.println("Loaded Reference:\t"+t); + t.start(); + } + RefToIndex.chromlist=null; + + t.start(); + BBIndex5.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK); + + { + long len=Data.numDefinedBases; + if(len<300000000){ + BBIndex5.MAX_HITS_REDUCTION2+=1; + BBIndex5.MAXIMUM_MAX_HITS_REDUCTION+=1; + if(len<30000000){ + BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*0.5f); + BBIndex5.MAXIMUM_MAX_HITS_REDUCTION+=1; + BBIndex5.HIT_REDUCTION_DIV=Tools.max(BBIndex5.HIT_REDUCTION_DIV-1, 3); + }else if(len<100000000){ + BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*0.6f); + }else{ + BBIndex5.setFractionToExclude(BBIndex5.FRACTION_GENOME_TO_EXCLUDE*0.75f); + } + } + } + + t.stop(); + sysout.println("Generated Index:\t"+t); + t.start(); + + if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){ + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + Data.unload(chrom, true); + } + } + + if(ReadWrite.countActiveThreads()>0){ + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("Finished Writing:\t"+t); + t.start(); + } + + if(!forceanalyze && (in1==null || reads==0)){return;} + + BBIndex5.analyzeIndex(minChrom, maxChrom, colorspace, BBIndex5.FRACTION_GENOME_TO_EXCLUDE, keylen); + + t.stop(); + sysout.println("Analyzed Index: \t"+t); + t.start(); + } + + public void testSpeed(String[] args){ + + if(in1==null || reads==0){ + sysout.println("No reads to process; quitting."); + return; + } + + Timer t=new Timer(); + t.start(); + + final boolean paired=openStreams(t, args); + if(paired){BBIndex5.QUIT_AFTER_TWO_PERFECTS=false;} + + t.start(); + + adjustThreadsforMemory(25); + + AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS; + AbstractMapThread[] mtts=new AbstractMapThread[Shared.THREADS]; + for(int i=0; i0 || errorState){throw new RuntimeException("BBMap terminated in an error state; 
the output may be corrupt.");} + } + + @Override + void setSemiperfectMode() { + assert(SEMIPERFECTMODE); + if(SEMIPERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f; + BBIndex5.setSemiperfectMode(); + } + } + + @Override + void setPerfectMode() { + assert(PERFECTMODE); + if(PERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f; + BBIndex5.setPerfectMode(); + } + } + + + @Override + void printSettings(int k){ + + printSettings0(k, BBIndex5.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO); + + if(verbose_stats>=2){ + sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")"); + sysout.println("Max keys: \t"+maxDesiredKeys); + + sysout.println("Block Subsections: \t"+BBIndex5.CHROMS_PER_BLOCK); + sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndex5.REMOVE_FREQUENT_GENOME_FRACTION ? 
BBIndex5.FRACTION_GENOME_TO_EXCLUDE : 0))); + // sysout.println("ADD_SCORE_Z: \t"+Index5.ADD_SCORE_Z); + sysout.println("Hits To Keep: \t"+BBIndex5.MIN_APPROX_HITS_TO_KEEP); + } + + if(verbose_stats>=3){ + sysout.println("Remove Clumpy: \t"+BBIndex5.REMOVE_CLUMPY); + if(BBIndex5.REMOVE_CLUMPY){ + sysout.println("CLUMPY_MAX_DIST: \t"+BBIndex5.CLUMPY_MAX_DIST); + sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndex5.CLUMPY_MIN_LENGTH_INDEX); + sysout.println("CLUMPY_FRACTION: \t"+BBIndex5.CLUMPY_FRACTION); + } + sysout.println("Remove Long Lists: \t"+BBIndex5.TRIM_LONG_HIT_LISTS); + if(BBIndex5.TRIM_LONG_HIT_LISTS){ + sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndex5.HIT_FRACTION_TO_RETAIN); + } + sysout.println("Trim By Greedy: \t"+BBIndex5.TRIM_BY_GREEDY); + sysout.println("Trim By Total Sites: \t"+BBIndex5.TRIM_BY_TOTAL_SITE_COUNT); + if(BBIndex5.TRIM_BY_TOTAL_SITE_COUNT){ + sysout.println("MAX_AVG_SITES: \t"+BBIndex5.MAX_AVERAGE_LIST_TO_SEARCH); + sysout.println("MAX_AVG_SITES_2: \t"+BBIndex5.MAX_AVERAGE_LIST_TO_SEARCH2); + sysout.println("MAX_SHORTEST_SITE: \t"+BBIndex5.MAX_SHORTEST_LIST_TO_SEARCH); + } + sysout.println("Index Min Score: \t"+BBIndex5.MIN_SCORE_MULT); + + sysout.println("Dynamic Trim: \t"+BBIndex5.DYNAMICALLY_TRIM_LOW_SCORES); + if(BBIndex5.DYNAMICALLY_TRIM_LOW_SCORES){ + sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndex5.DYNAMIC_SCORE_THRESH); + } + } + + } + +} diff --git a/current/align2/BBMapAcc.java b/current/align2/BBMapAcc.java new file mode 100755 index 0000000..9874323 --- /dev/null +++ b/current/align2/BBMapAcc.java @@ -0,0 +1,501 @@ +package align2; + +import java.io.File; +import java.util.ArrayList; + +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.ReadStreamWriter; +import stream.SamLine; + +import dna.ChromosomeArray; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; + +/** + * Based on TestIndex11Ii + * + * @author Brian Bushnell + * @date Jul 10, 2012 + * + */ +public final class BBMapAcc 
extends AbstractMapper { + + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + BBMapAcc mapper=new BBMapAcc(args); + mapper.loadIndex(); + if(Data.scaffoldPrefixes){mapper.processAmbig2();} + mapper.testSpeed(args); + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("\nTotal time: \t"+t); + } + + public BBMapAcc(String[] args){ + super(args); + } + + @Override + public void setDefaults(){ + ReadWrite.ZIPLEVEL=2; + MAKE_MATCH_STRING=true; + keylen=13; + + MINIMUM_ALIGNMENT_SCORE_RATIO=0.56f; + + keyDensity=2.3f;//2.3f; + maxKeyDensity=3.2f;//4f; + minKeyDensity=1.8f;//1.8f; + maxDesiredKeys=20; + + SLOW_ALIGN_PADDING=20; + SLOW_RESCUE_PADDING=4+SLOW_ALIGN_PADDING; + TIP_SEARCH_DIST=200; + + MSA_TYPE="MultiStateAligner11ts"; + MAX_SITESCORES_TO_PRINT=8; + PRINT_SECONDARY_ALIGNMENTS=false; + AbstractIndex.MIN_APPROX_HITS_TO_KEEP=1; + } + + @Override + public String[] preparse(String[] args){ + if(fast){ + ArrayList list=new ArrayList(); + list.add("tipsearch="+TIP_SEARCH_DIST/2); + list.add("maxindel=80"); +// list.add("minhits=2"); + list.add("bwr=0.3"); +// list.add("bw=64"); + list.add("minratio=0.60"); + list.add("midpad=150"); + list.add("minscaf=50"); + list.add("quickmatch=t"); +// list.add("k=13"); + + BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*1.25f); + + for(String s : args){if(s!=null){list.add(s);}} + args=list.toArray(new String[list.size()]); + + keyDensity*=0.9f; + maxKeyDensity*=0.9f; + minKeyDensity*=0.9f; + } + return args; + } + + @Override + void postparse(String[] args){ + + if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){ + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 4); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 8); + } + + if(maxIndel1>-1){ + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1); + BBIndexAcc.MAX_INDEL=maxIndel1; + } + if(maxIndel2>-1){ + BBIndexAcc.MAX_INDEL2=maxIndel2; + } + + if(minApproxHits>-1){ + 
BBIndexAcc.MIN_APPROX_HITS_TO_KEEP=minApproxHits; + } + + if(expectedSites>-1){ + BBMapThreadAcc.setExpectedSites(expectedSites); + sysout.println("Set EXPECTED_SITES to "+expectedSites); + } + + if(fractionGenomeToExclude>=0){ + BBIndexAcc.setFractionToExclude(fractionGenomeToExclude); + } + + { + final String a=(args.length>0 ? args[0] : null); + final String b=(args.length>1 ? args[1] : null); + if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;} + if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;} + if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);} + } + + assert(readlen0){ + int halfwidth=MSA.bandwidth/2; + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2); + BBIndexAcc.MAX_INDEL=Tools.min(BBIndexAcc.MAX_INDEL, halfwidth/2); + BBIndexAcc.MAX_INDEL2=Tools.min(BBIndexAcc.MAX_INDEL2, halfwidth); + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4); + } + + if(PRINT_SECONDARY_ALIGNMENTS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false; + } + + if(ambigMode==AMBIG_BEST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + if(!PRINT_SECONDARY_ALIGNMENTS){BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=true;} + sysout.println("Retaining first best site only for ambiguous mappings."); + }else if(ambigMode==AMBIG_ALL){ + PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true; + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false; + SamLine.MAKE_NH_TAG=true; + ambiguousAll=true; + sysout.println("Retaining all best sites for ambiguous mappings."); + }else if(ambigMode==AMBIG_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false; + ambiguousRandom=true; + sysout.println("Choosing a site randomly for 
ambiguous mappings."); + }else if(ambigMode==AMBIG_TOSS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=true; + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Ambiguously mapped reads will be considered unmapped."); + }else{ + throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode); + } + + } + + @Override + public void setup(){ + + assert(!useRandomReads || reads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use."; + + if(minid!=-1){ + MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE); + sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO)); + } + + if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);} + if(setxs && !setintron){SamLine.INTRON_LIMIT=10;} + + if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null && outFileB==null && outFileB2==null && BBSplitter.streamTable==null){ + sysout.println("No output file."); + OUTPUT_READS=false; + }else{ + OUTPUT_READS=true; + if(bamscript!=null){ + BBSplitter.makeBamScript(bamscript, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2); + } + } + + FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN); + assert(FastaReadInputStream.settingsOK()); + + if(build<0){throw new RuntimeException("Must specify a build number, e.g. 
build=1");} + else{Data.GENOME_BUILD=build;} + + if(blacklist!=null && blacklist.size()>0){ + Timer t=new Timer(); + t.start(); + for(String s : blacklist){ + Blacklist.addToBlacklist(s); + } + t.stop(); + sysout.println("Created blacklist:\t"+t); + t.start(); + } + + if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;} + if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);} + ReadWrite.USE_GZIP=gzip; + ReadWrite.USE_PIGZ=pigz; + } + + + @Override + void processAmbig2(){ + assert(false) : "TODO: Only process this block if there are multiple references."; //This information may not be available yet, though... + if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to special output streams."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to the first reference's stream only."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){ + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Reads that map to multiple references will be considered unmapped."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to a random stream."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to all relevant output streams."); + }else{ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST; + } + } + + @Override + void loadIndex(){ + Timer t=new Timer(); + 
t.start(); + + if(build>-1){ + Data.setGenome(build); + AbstractIndex.MINCHROM=1; + AbstractIndex.MAXCHROM=Data.numChroms; + if(minChrom<0){minChrom=1;} + if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;} + sysout.println("Set genome to "+Data.GENOME_BUILD); + + if(RefToIndex.AUTO_CHROMBITS){ + int maxLength=Tools.max(Data.chromLengths); + RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength)-1; + RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16); + } + if(RefToIndex.chrombits!=-1){ + BBIndexAcc.setChromBits(RefToIndex.chrombits); + if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);} + } + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){sysout.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(OUTPUT_READS && !Tools.testOutputFiles(OVERWRITE, false, outFile, outFile2)){ + throw new RuntimeException("\n\nOVERWRITE="+OVERWRITE+"; Can't write to output files "+outFile+", "+outFile2+"\n"); + } + + if(reads>0 && reads=keylen); + assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) : + minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM; + AbstractIndex.MINCHROM=minChrom; + AbstractIndex.MAXCHROM=maxChrom; + + if(targetGenomeSize>0){ + long bases=Data.numDefinedBases; + long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize)); + BBMapThreadAcc.setExpectedSites((int)x); + sysout.println("Set EXPECTED_SITES to "+x); + } + + assert(!(PERFECTMODE && SEMIPERFECTMODE)); + if(PERFECTMODE){setPerfectMode();} + if(SEMIPERFECTMODE){setSemiperfectMode();} + + //Optional section for 
discrete timing of chrom array loading + if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){ + sysout.println(); + if(RefToIndex.chromlist==null){ + Data.loadChromosomes(minChrom, maxChrom); + }else{ + assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size(); + for(ChromosomeArray cha : RefToIndex.chromlist){ + Data.chromosomePlusMatrix[cha.chromosome]=cha; + } + } + if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();} + t.stop(); + sysout.println("Loaded Reference:\t"+t); + t.start(); + } + RefToIndex.chromlist=null; + + t.start(); + BBIndexAcc.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK); + + { + long len=Data.numDefinedBases; + if(len<300000000){ + BBIndexAcc.MAX_HITS_REDUCTION2+=1; + BBIndexAcc.MAXIMUM_MAX_HITS_REDUCTION+=1; + if(len<30000000){ + BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*0.5f); + BBIndexAcc.MAXIMUM_MAX_HITS_REDUCTION+=1; + BBIndexAcc.HIT_REDUCTION_DIV=Tools.max(BBIndexAcc.HIT_REDUCTION_DIV-1, 3); + }else if(len<100000000){ + BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*0.6f); + }else{ + BBIndexAcc.setFractionToExclude(BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE*0.75f); + } + } + } + + t.stop(); + sysout.println("Generated Index:\t"+t); + t.start(); + + if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){ + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + Data.unload(chrom, true); + } + } + + if(ReadWrite.countActiveThreads()>0){ + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("Finished Writing:\t"+t); + t.start(); + } + + if(!forceanalyze && (in1==null || reads==0)){return;} + + BBIndexAcc.analyzeIndex(minChrom, maxChrom, colorspace, BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE, keylen); + + t.stop(); + sysout.println("Analyzed Index: \t"+t); + t.start(); + } + + public void testSpeed(String[] args){ + + if(in1==null || reads==0){ + 
sysout.println("No reads to process; quitting."); + return; + } + + Timer t=new Timer(); + t.start(); + + final boolean paired=openStreams(t, args); + if(paired){BBIndexAcc.QUIT_AFTER_TWO_PERFECTS=false;} + + t.start(); + + adjustThreadsforMemory(25); + + AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS; + AbstractMapThread[] mtts=new AbstractMapThread[Shared.THREADS]; + for(int i=0; i0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");} + } + + @Override + void setSemiperfectMode() { + assert(SEMIPERFECTMODE); + if(SEMIPERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f; + BBIndexAcc.setSemiperfectMode(); + } + } + + @Override + void setPerfectMode() { + assert(PERFECTMODE); + if(PERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f; + BBIndexAcc.setPerfectMode(); + } + } + + + @Override + void printSettings(int k){ + + printSettings0(k, BBIndexAcc.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO); + + if(verbose_stats>=2){ + sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")"); + sysout.println("Max keys: \t"+maxDesiredKeys); + + sysout.println("Block Subsections: \t"+BBIndexAcc.CHROMS_PER_BLOCK); + sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndexAcc.REMOVE_FREQUENT_GENOME_FRACTION ? 
BBIndexAcc.FRACTION_GENOME_TO_EXCLUDE : 0))); + // sysout.println("ADD_SCORE_Z: \t"+Indexi.ADD_SCORE_Z); + sysout.println("Hits To Keep: \t"+BBIndexAcc.MIN_APPROX_HITS_TO_KEEP); + } + + if(verbose_stats>=3){ + sysout.println("Remove Clumpy: \t"+BBIndexAcc.REMOVE_CLUMPY); + if(BBIndexAcc.REMOVE_CLUMPY){ + sysout.println("CLUMPY_MAX_DIST: \t"+BBIndexAcc.CLUMPY_MAX_DIST); + sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndexAcc.CLUMPY_MIN_LENGTH_INDEX); + sysout.println("CLUMPY_FRACTION: \t"+BBIndexAcc.CLUMPY_FRACTION); + } + sysout.println("Remove Long Lists: \t"+BBIndexAcc.TRIM_LONG_HIT_LISTS); + if(BBIndexAcc.TRIM_LONG_HIT_LISTS){ + sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndexAcc.HIT_FRACTION_TO_RETAIN); + } + sysout.println("Trim By Greedy: \t"+BBIndexAcc.TRIM_BY_GREEDY); + sysout.println("Trim By Total Sites: \t"+BBIndexAcc.TRIM_BY_TOTAL_SITE_COUNT); + if(BBIndexAcc.TRIM_BY_TOTAL_SITE_COUNT){ + sysout.println("MAX_AVG_SITES: \t"+BBIndexAcc.MAX_AVERAGE_LIST_TO_SEARCH); + sysout.println("MAX_AVG_SITES_2: \t"+BBIndexAcc.MAX_AVERAGE_LIST_TO_SEARCH2); + sysout.println("MAX_SHORTEST_SITE: \t"+BBIndexAcc.MAX_SHORTEST_LIST_TO_SEARCH); + } + sysout.println("Index Min Score: \t"+BBIndexAcc.MIN_SCORE_MULT); + + sysout.println("Dynamic Trim: \t"+BBIndexAcc.DYNAMICALLY_TRIM_LOW_SCORES); + if(BBIndexAcc.DYNAMICALLY_TRIM_LOW_SCORES){ + sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndexAcc.DYNAMIC_SCORE_THRESH); + } + } + + } + +} diff --git a/current/align2/BBMapPacBio.java b/current/align2/BBMapPacBio.java new file mode 100755 index 0000000..6cfd552 --- /dev/null +++ b/current/align2/BBMapPacBio.java @@ -0,0 +1,500 @@ +package align2; + +import java.io.File; +import java.util.ArrayList; + +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.ReadStreamWriter; +import stream.SamLine; + +import dna.ChromosomeArray; +import dna.Data; +import dna.FastaToChromArrays; +import dna.Timer; +import fileIO.ReadWrite; + +/** + * Based on TestIndex11f + * + * @author 
Brian Bushnell + * @date Jul 10, 2012 + * + */ +public final class BBMapPacBio extends AbstractMapper { + + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + BBMapPacBio mapper=new BBMapPacBio(args); + mapper.loadIndex(); + if(Data.scaffoldPrefixes){mapper.processAmbig2();} + mapper.testSpeed(args); + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("\nTotal time: \t"+t); + } + + public BBMapPacBio(String[] args){ + super(args); + } + + @Override + public void setDefaults(){ + FastaToChromArrays.MID_PADDING=2000; + ReadWrite.ZIPLEVEL=2; + MAKE_MATCH_STRING=true; + keylen=12; + + MINIMUM_ALIGNMENT_SCORE_RATIO=0.46f; + + keyDensity=3.5f;//2.3f; + maxKeyDensity=4.5f;//4f; + minKeyDensity=2.8f;//1.8f; + maxDesiredKeys=63; + + SLOW_ALIGN_PADDING=8; + SLOW_RESCUE_PADDING=8+SLOW_ALIGN_PADDING; + TIP_SEARCH_DIST=15; + + MSA_TYPE="MultiStateAligner9PacBio"; + MAX_SITESCORES_TO_PRINT=100; + PRINT_SECONDARY_ALIGNMENTS=false; + AbstractIndex.MIN_APPROX_HITS_TO_KEEP=1; + } + + @Override + public String[] preparse(String[] args){ + if(fast){ + ArrayList list=new ArrayList(); + list.add("tipsearch="+TIP_SEARCH_DIST/5); +// list.add("maxindel=100"); +// list.add("minhits=2"); + list.add("bwr=0.16"); +// list.add("minratio=0.5"); +// list.add("k=13"); + list.add("quickmatch=t"); + + BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*1.25f); + + for(String s : args){if(s!=null){list.add(s);}} + args=list.toArray(new String[list.size()]); + + keyDensity*=0.9f; + maxKeyDensity*=0.9f; + minKeyDensity*=0.9f; + } + return args; + } + + @Override + void postparse(String[] args){ + + if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){ + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 5); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 10); + } + + if(maxIndel1>-1){ + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1); + BBIndexPacBio.MAX_INDEL=maxIndel1; + } + if(maxIndel2>-1){ + 
BBIndexPacBio.MAX_INDEL2=maxIndel2; + } + + if(minApproxHits>-1){ + BBIndexPacBio.MIN_APPROX_HITS_TO_KEEP=minApproxHits; + } + + if(expectedSites>-1){ + BBMapThreadPacBio.setExpectedSites(expectedSites); + sysout.println("Set EXPECTED_SITES to "+expectedSites); + } + + if(fractionGenomeToExclude>=0){ + BBIndexPacBio.setFractionToExclude(fractionGenomeToExclude); + } + + { + final String a=(args.length>0 ? args[0] : null); + final String b=(args.length>1 ? args[1] : null); + if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;} + if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;} + if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);} + } + + assert(readlen0){ + int halfwidth=MSA.bandwidth/2; + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2); + BBIndexPacBio.MAX_INDEL=Tools.min(BBIndexPacBio.MAX_INDEL, halfwidth/2); + BBIndexPacBio.MAX_INDEL2=Tools.min(BBIndexPacBio.MAX_INDEL2, halfwidth); + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4); + } + + if(PRINT_SECONDARY_ALIGNMENTS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false; + } + + if(ambigMode==AMBIG_BEST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + if(!PRINT_SECONDARY_ALIGNMENTS){BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=true;} + sysout.println("Retaining first best site only for ambiguous mappings."); + }else if(ambigMode==AMBIG_ALL){ + PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true; + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false; + SamLine.MAKE_NH_TAG=true; + ambiguousAll=true; + sysout.println("Retaining all best sites for ambiguous mappings."); + }else if(ambigMode==AMBIG_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + 
BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false; + ambiguousRandom=true; + sysout.println("Choosing a site randomly for ambiguous mappings."); + }else if(ambigMode==AMBIG_TOSS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=true; + BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Ambiguously mapped reads will be considered unmapped."); + }else{ + throw new RuntimeException("Unknown ambiguous mapping mode: "+ambigMode); + } + + } + + @Override + public void setup(){ + + assert(!useRandomReads || reads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use."; + + if(minid!=-1){ + MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE); + sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO)); + } + + if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);} + if(setxs && !setintron){SamLine.INTRON_LIMIT=10;} + + if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null && outFileB==null && outFileB2==null && BBSplitter.streamTable==null){ + sysout.println("No output file."); + OUTPUT_READS=false; + }else{ + OUTPUT_READS=true; + if(bamscript!=null){ + BBSplitter.makeBamScript(bamscript, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2); + } + } + + FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN); + assert(FastaReadInputStream.settingsOK()); + + if(build<0){throw new RuntimeException("Must specify a build number, e.g. 
build=1");} + else{Data.GENOME_BUILD=build;} + + if(blacklist!=null && blacklist.size()>0){ + Timer t=new Timer(); + t.start(); + for(String s : blacklist){ + Blacklist.addToBlacklist(s); + } + t.stop(); + sysout.println("Created blacklist:\t"+t); + t.start(); + } + + if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;} + if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);} + ReadWrite.USE_GZIP=gzip; + ReadWrite.USE_PIGZ=pigz; + } + + + @Override + void processAmbig2(){ + assert(false) : "TODO: Only process this block if there are multiple references."; //This information may not be available yet, though... + if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to special output streams."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to the first reference's stream only."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){ + BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Reads that map to multiple references will be considered unmapped."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to a random stream."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; + BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to all relevant output streams."); + }else{ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST; + } + } + + @Override + void loadIndex(){ + Timer t=new 
Timer(); + t.start(); + + if(build>-1){ + Data.setGenome(build); + AbstractIndex.MINCHROM=1; + AbstractIndex.MAXCHROM=Data.numChroms; + if(minChrom<0){minChrom=1;} + if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;} + sysout.println("Set genome to "+Data.GENOME_BUILD); + + if(RefToIndex.AUTO_CHROMBITS){ + int maxLength=Tools.max(Data.chromLengths); + RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength)-1; + RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16); + } + if(RefToIndex.chrombits!=-1){ + BBIndexPacBio.setChromBits(RefToIndex.chrombits); + if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);} + } + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){sysout.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(OUTPUT_READS && !Tools.testOutputFiles(OVERWRITE, false, outFile, outFile2)){ + throw new RuntimeException("\n\nOVERWRITE="+OVERWRITE+"; Can't write to output files "+outFile+", "+outFile2+"\n"); + } + + if(reads>0 && reads=keylen); + assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) : + minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM; + AbstractIndex.MINCHROM=minChrom; + AbstractIndex.MAXCHROM=maxChrom; + + if(targetGenomeSize>0){ + long bases=Data.numDefinedBases; + long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize)); + BBMapThreadPacBio.setExpectedSites((int)x); + sysout.println("Set EXPECTED_SITES to "+x); + } + + assert(!(PERFECTMODE && SEMIPERFECTMODE)); + if(PERFECTMODE){setPerfectMode();} + if(SEMIPERFECTMODE){setSemiperfectMode();} + + //Optional 
section for discrete timing of chrom array loading + if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){ + sysout.println(); + if(RefToIndex.chromlist==null){ + Data.loadChromosomes(minChrom, maxChrom); + }else{ + assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size(); + for(ChromosomeArray cha : RefToIndex.chromlist){ + Data.chromosomePlusMatrix[cha.chromosome]=cha; + } + } + if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();} + t.stop(); + sysout.println("Loaded Reference:\t"+t); + t.start(); + } + RefToIndex.chromlist=null; + + t.start(); + BBIndexPacBio.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK); + + { + long len=Data.numDefinedBases; + if(len<300000000){ + BBIndexPacBio.MAX_HITS_REDUCTION2+=1; + BBIndexPacBio.MAXIMUM_MAX_HITS_REDUCTION+=1; + if(len<30000000){ + BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*0.5f); + BBIndexPacBio.MAXIMUM_MAX_HITS_REDUCTION+=1; + BBIndexPacBio.HIT_REDUCTION_DIV=Tools.max(BBIndexPacBio.HIT_REDUCTION_DIV-1, 3); + }else if(len<100000000){ + BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*0.6f); + }else{ + BBIndexPacBio.setFractionToExclude(BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE*0.75f); + } + } + } + + t.stop(); + sysout.println("Generated Index:\t"+t); + t.start(); + + if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){ + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + Data.unload(chrom, true); + } + } + + if(ReadWrite.countActiveThreads()>0){ + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("Finished Writing:\t"+t); + t.start(); + } + + if(!forceanalyze && (in1==null || reads==0)){return;} + + BBIndexPacBio.analyzeIndex(minChrom, maxChrom, colorspace, BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE, keylen); + + t.stop(); + sysout.println("Analyzed Index: \t"+t); + t.start(); + } + + public void 
testSpeed(String[] args){ + + if(in1==null || reads==0){ + sysout.println("No reads to process; quitting."); + return; + } + + Timer t=new Timer(); + t.start(); + + final boolean paired=openStreams(t, args); + if(paired){BBIndexPacBio.QUIT_AFTER_TWO_PERFECTS=false;} + + t.start(); + + adjustThreadsforMemory(680); + + AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS; + AbstractMapThread[] mtts=new AbstractMapThread[Shared.THREADS]; + for(int i=0; i0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");} + } + + @Override + void setSemiperfectMode() { + assert(SEMIPERFECTMODE); + if(SEMIPERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f; + BBIndexPacBio.setSemiperfectMode(); + } + } + + @Override + void setPerfectMode() { + assert(PERFECTMODE); + if(PERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f; + BBIndexPacBio.setPerfectMode(); + } + } + + + @Override + void printSettings(int k){ + + printSettings0(k, BBIndexPacBio.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO); + + if(verbose_stats>=2){ + sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")"); + sysout.println("Max keys: \t"+maxDesiredKeys); + + sysout.println("Block Subsections: \t"+BBIndexPacBio.CHROMS_PER_BLOCK); + sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndexPacBio.REMOVE_FREQUENT_GENOME_FRACTION ? 
BBIndexPacBio.FRACTION_GENOME_TO_EXCLUDE : 0))); + // sysout.println("ADD_SCORE_Z: \t"+IndexPacBio.ADD_SCORE_Z); + sysout.println("Hits To Keep: \t"+BBIndexPacBio.MIN_APPROX_HITS_TO_KEEP); + } + + if(verbose_stats>=3){ + sysout.println("Remove Clumpy: \t"+BBIndexPacBio.REMOVE_CLUMPY); + if(BBIndexPacBio.REMOVE_CLUMPY){ + sysout.println("CLUMPY_MAX_DIST: \t"+BBIndexPacBio.CLUMPY_MAX_DIST); + sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndexPacBio.CLUMPY_MIN_LENGTH_INDEX); + sysout.println("CLUMPY_FRACTION: \t"+BBIndexPacBio.CLUMPY_FRACTION); + } + sysout.println("Remove Long Lists: \t"+BBIndexPacBio.TRIM_LONG_HIT_LISTS); + if(BBIndexPacBio.TRIM_LONG_HIT_LISTS){ + sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndexPacBio.HIT_FRACTION_TO_RETAIN); + } + sysout.println("Trim By Greedy: \t"+BBIndexPacBio.TRIM_BY_GREEDY); + sysout.println("Trim By Total Sites: \t"+BBIndexPacBio.TRIM_BY_TOTAL_SITE_COUNT); + if(BBIndexPacBio.TRIM_BY_TOTAL_SITE_COUNT){ + sysout.println("MAX_AVG_SITES: \t"+BBIndexPacBio.MAX_AVERAGE_LIST_TO_SEARCH); + sysout.println("MAX_AVG_SITES_2: \t"+BBIndexPacBio.MAX_AVERAGE_LIST_TO_SEARCH2); + sysout.println("MAX_SHORTEST_SITE: \t"+BBIndexPacBio.MAX_SHORTEST_LIST_TO_SEARCH); + } + sysout.println("Index Min Score: \t"+BBIndexPacBio.MIN_SCORE_MULT); + + sysout.println("Dynamic Trim: \t"+BBIndexPacBio.DYNAMICALLY_TRIM_LOW_SCORES); + if(BBIndexPacBio.DYNAMICALLY_TRIM_LOW_SCORES){ + sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndexPacBio.DYNAMIC_SCORE_THRESH); + } + } + + } + +} diff --git a/current/align2/BBMapPacBioSkimmer.java b/current/align2/BBMapPacBioSkimmer.java new file mode 100755 index 0000000..c1af215 --- /dev/null +++ b/current/align2/BBMapPacBioSkimmer.java @@ -0,0 +1,500 @@ +package align2; + +import java.io.File; +import java.util.ArrayList; + +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.ReadStreamWriter; +import stream.SamLine; + +import dna.ChromosomeArray; +import dna.Data; +import dna.FastaToChromArrays; +import 
dna.Timer; +import fileIO.ReadWrite; + +/** + * Based on TestIndex11f + * Designed to skim and retain all sites above a threshold. + * @author Brian Bushnell + * @date Jul 10, 2012 + * + */ +public final class BBMapPacBioSkimmer extends AbstractMapper { + + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + BBMapPacBioSkimmer mapper=new BBMapPacBioSkimmer(args); + mapper.loadIndex(); + if(Data.scaffoldPrefixes){mapper.processAmbig2();} + mapper.testSpeed(args); + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("\nTotal time: \t"+t); + } + + public BBMapPacBioSkimmer(String[] args){ + super(args); + } + + @Override + public void setDefaults(){ + FastaToChromArrays.MID_PADDING=2000; + ReadWrite.ZIPLEVEL=2; + MAKE_MATCH_STRING=true; + keylen=12; + + MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f; + + keyDensity=3.3f;//2.3f; //Normal key density + maxKeyDensity=4.3f;//4f; //For situations where some of the read is too low quality, this is the max for the rest of the read. + minKeyDensity=1.8f;//1.8f; + maxDesiredKeys=63; //Don't go above this number of keys except to maintain minKeyDensity. 
+ + SLOW_ALIGN_PADDING=8; + SLOW_RESCUE_PADDING=8+SLOW_ALIGN_PADDING; + TIP_SEARCH_DIST=15; + + MSA_TYPE="MultiStateAligner9PacBio"; + MAX_SITESCORES_TO_PRINT=500; + PRINT_SECONDARY_ALIGNMENTS=true; + AbstractIndex.MIN_APPROX_HITS_TO_KEEP=2; + } + + @Override + public String[] preparse(String[] args){ + if(fast){ + ArrayList list=new ArrayList(); + list.add("tipsearch="+TIP_SEARCH_DIST/5); +// list.add("maxindel=100"); +// list.add("minhits=2"); + list.add("bwr=0.16"); +// list.add("minratio=0.5"); +// list.add("k=13"); + list.add("quickmatch=t"); + +// BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*1.25f); + + for(String s : args){if(s!=null){list.add(s);}} + args=list.toArray(new String[list.size()]); + + keyDensity*=0.9f; + maxKeyDensity*=0.9f; + minKeyDensity*=0.9f; + } + return args; + } + + @Override + void postparse(String[] args){ + + if(MSA.bandwidthRatio>0 && MSA.bandwidthRatio<.2){ + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, 5); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, 10); + } + + if(maxIndel1>-1){ + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, maxIndel1); + BBIndexPacBioSkimmer.MAX_INDEL=maxIndel1; + } + if(maxIndel2>-1){ + BBIndexPacBioSkimmer.MAX_INDEL2=maxIndel2; + } + + if(minApproxHits>-1){ + BBIndexPacBioSkimmer.MIN_APPROX_HITS_TO_KEEP=minApproxHits; + } + + if(expectedSites>-1){ + BBMapThreadPacBioSkimmer.setExpectedSites(expectedSites); + sysout.println("Set EXPECTED_SITES to "+expectedSites); + } + + if(fractionGenomeToExclude>=0){ + BBIndexPacBioSkimmer.setFractionToExclude(fractionGenomeToExclude); + } + + { + final String a=(args.length>0 ? args[0] : null); + final String b=(args.length>1 ? 
args[1] : null); + if(in1==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in1=a;} + if(in2==null && b!=null && b.indexOf('=')<0 && new File(b).exists()){in2=b;} + if(ERROR_ON_NO_OUTPUT && !OUTPUT_READS && in1!=null){throw new RuntimeException("Error: no output file, and ERROR_ON_NO_OUTPUT="+ERROR_ON_NO_OUTPUT);} + } + + assert(readlen0){ + int halfwidth=MSA.bandwidth/2; + TIP_SEARCH_DIST=Tools.min(TIP_SEARCH_DIST, halfwidth/2); + BBIndexPacBioSkimmer.MAX_INDEL=Tools.min(BBIndexPacBioSkimmer.MAX_INDEL, halfwidth/2); + BBIndexPacBioSkimmer.MAX_INDEL2=Tools.min(BBIndexPacBioSkimmer.MAX_INDEL2, halfwidth); + SLOW_ALIGN_PADDING=Tools.min(SLOW_ALIGN_PADDING, halfwidth/4); + SLOW_RESCUE_PADDING=Tools.min(SLOW_RESCUE_PADDING, halfwidth/4); + } + + if(PRINT_SECONDARY_ALIGNMENTS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false; + } + + if(ambigMode==AMBIG_BEST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; +// if(!PRINT_SECONDARY_ALIGNMENTS){BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=true;} + sysout.println("Retaining first best site only for ambiguous mappings."); + }else if(ambigMode==AMBIG_ALL){ + PRINT_SECONDARY_ALIGNMENTS=ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS=true; + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false; + SamLine.MAKE_NH_TAG=true; + ambiguousAll=true; + sysout.println("Retaining all best sites for ambiguous mappings."); + }else if(ambigMode==AMBIG_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false; + ambiguousRandom=true; + sysout.println("Choosing a site randomly for ambiguous mappings."); + }else if(ambigMode==AMBIG_TOSS){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=true; +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Ambiguously mapped reads will be considered unmapped."); + }else{ + throw new RuntimeException("Unknown ambiguous mapping 
mode: "+ambigMode); + } + + } + + @Override + public void setup(){ + + assert(!useRandomReads || reads>0 || (in1!=null && in1.equals("sequential"))) : "Please specify number of reads to use."; + + if(minid!=-1){ + MINIMUM_ALIGNMENT_SCORE_RATIO=MSA.minIdToMinRatio(minid, MSA_TYPE); + sysout.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+String.format("%.3f",MINIMUM_ALIGNMENT_SCORE_RATIO)); + } + + if(!setxs){SamLine.MAKE_XS_TAG=(SamLine.INTRON_LIMIT<1000000000);} + if(setxs && !setintron){SamLine.INTRON_LIMIT=10;} + + if(outFile==null && outFile2==null && outFileM==null && outFileM2==null && outFileU==null && outFileU2==null && outFileB==null && outFileB2==null && BBSplitter.streamTable==null){ + sysout.println("No output file."); + OUTPUT_READS=false; + }else{ + OUTPUT_READS=true; + if(bamscript!=null){ + BBSplitter.makeBamScript(bamscript, outFile, outFile2, outFileM, outFileM2, outFileU, outFileU2, outFileB, outFileB2); + } + } + + FastaReadInputStream.MIN_READ_LEN=Tools.max(keylen+2, FastaReadInputStream.MIN_READ_LEN); + assert(FastaReadInputStream.settingsOK()); + + if(build<0){throw new RuntimeException("Must specify a build number, e.g. build=1");} + else{Data.GENOME_BUILD=build;} + + if(blacklist!=null && blacklist.size()>0){ + Timer t=new Timer(); + t.start(); + for(String s : blacklist){ + Blacklist.addToBlacklist(s); + } + t.stop(); + sysout.println("Created blacklist:\t"+t); + t.start(); + } + + if(ziplevel!=-1){ReadWrite.ZIPLEVEL=ziplevel;} + if(reference!=null){RefToIndex.makeIndex(reference, build, sysout, keylen);} + ReadWrite.USE_GZIP=gzip; + ReadWrite.USE_PIGZ=pigz; + } + + + @Override + void processAmbig2(){ + assert(false) : "TODO: Only process this block if there are multiple references."; //This information may not be available yet, though... 
+ if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_SPLIT){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to special output streams."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_FIRST){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to the first reference's stream only."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_TOSS){ +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=true; + sysout.println("Reads that map to multiple references will be considered unmapped."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_RANDOM){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to a random stream."); + }else if(BBSplitter.AMBIGUOUS2_MODE==BBSplitter.AMBIGUOUS2_ALL){ + REMOVE_DUPLICATE_BEST_ALIGNMENTS=false; +// BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false; + sysout.println("Reads that map to multiple references will be written to all relevant output streams."); + }else{ + BBSplitter.AMBIGUOUS2_MODE=BBSplitter.AMBIGUOUS2_FIRST; + } + } + + @Override + void loadIndex(){ + Timer t=new Timer(); + t.start(); + + if(build>-1){ + Data.setGenome(build); + AbstractIndex.MINCHROM=1; + AbstractIndex.MAXCHROM=Data.numChroms; + if(minChrom<0){minChrom=1;} + if(maxChrom<0 || maxChrom>Data.numChroms){maxChrom=Data.numChroms;} + sysout.println("Set genome to "+Data.GENOME_BUILD); + + if(RefToIndex.AUTO_CHROMBITS){ + int maxLength=Tools.max(Data.chromLengths); + RefToIndex.chrombits=Integer.numberOfLeadingZeros(maxLength)-1; + RefToIndex.chrombits=Tools.min(RefToIndex.chrombits, 16); + } + if(RefToIndex.chrombits!=-1){ + 
BBIndexPacBioSkimmer.setChromBits(RefToIndex.chrombits); + if(verbose_stats>0){sysout.println("Set CHROMBITS to "+RefToIndex.chrombits);} + } + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){sysout.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(OUTPUT_READS && !Tools.testOutputFiles(OVERWRITE, false, outFile, outFile2)){ + throw new RuntimeException("\n\nOVERWRITE="+OVERWRITE+"; Can't write to output files "+outFile+", "+outFile2+"\n"); + } + + if(reads>0 && reads=keylen); + assert(minChrom>=AbstractIndex.MINCHROM && maxChrom<=AbstractIndex.MAXCHROM) : + minChrom+", "+maxChrom+", "+AbstractIndex.MINCHROM+", "+AbstractIndex.MAXCHROM; + AbstractIndex.MINCHROM=minChrom; + AbstractIndex.MAXCHROM=maxChrom; + + if(targetGenomeSize>0){ + long bases=Data.numDefinedBases; + long x=Tools.max(1, Math.round(0.25f+bases*1d/targetGenomeSize)); + BBMapThreadPacBioSkimmer.setExpectedSites((int)x); + sysout.println("Set EXPECTED_SITES to "+x); + } + + assert(!(PERFECTMODE && SEMIPERFECTMODE)); + if(PERFECTMODE){setPerfectMode();} + if(SEMIPERFECTMODE){setSemiperfectMode();} + + //Optional section for discrete timing of chrom array loading + if(SLOW_ALIGN || AbstractIndex.USE_EXTENDED_SCORE || useRandomReads || MAKE_MATCH_STRING){ + sysout.println(); + if(RefToIndex.chromlist==null){ + Data.loadChromosomes(minChrom, maxChrom); + }else{ + assert(RefToIndex.chromlist.size()==maxChrom-minChrom+1) : RefToIndex.chromlist.size(); + for(ChromosomeArray cha : RefToIndex.chromlist){ + Data.chromosomePlusMatrix[cha.chromosome]=cha; + } + } + if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();} + t.stop(); + 
sysout.println("Loaded Reference:\t"+t); + t.start(); + } + RefToIndex.chromlist=null; + + t.start(); + BBIndexPacBioSkimmer.loadIndex(minChrom, maxChrom, keylen, !RefToIndex.NODISK, RefToIndex.NODISK); + + { + long len=Data.numDefinedBases; + if(len<300000000){ +// BBIndexPacBioSkimmer.MAX_HITS_REDUCTION2+=1; +// BBIndexPacBioSkimmer.MAXIMUM_MAX_HITS_REDUCTION+=1; + if(len<30000000){ + BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*0.5f); +// BBIndexPacBioSkimmer.MAXIMUM_MAX_HITS_REDUCTION+=1; +// BBIndexPacBioSkimmer.HIT_REDUCTION_DIV=Tools.max(BBIndexPacBioSkimmer.HIT_REDUCTION_DIV-1, 3); + }else if(len<100000000){ + BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*0.6f); + }else{ + BBIndexPacBioSkimmer.setFractionToExclude(BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE*0.75f); + } + } + } + + t.stop(); + sysout.println("Generated Index:\t"+t); + t.start(); + + if(!SLOW_ALIGN && !AbstractIndex.USE_EXTENDED_SCORE && !useRandomReads && !MAKE_MATCH_STRING){ + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + Data.unload(chrom, true); + } + } + + if(ReadWrite.countActiveThreads()>0){ + ReadWrite.waitForWritingToFinish(); + t.stop(); + sysout.println("Finished Writing:\t"+t); + t.start(); + } + + if(!forceanalyze && (in1==null || reads==0)){return;} + + BBIndexPacBioSkimmer.analyzeIndex(minChrom, maxChrom, colorspace, BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE, keylen); + + t.stop(); + sysout.println("Analyzed Index: \t"+t); + t.start(); + } + + public void testSpeed(String[] args){ + + if(in1==null || reads==0){ + sysout.println("No reads to process; quitting."); + return; + } + + Timer t=new Timer(); + t.start(); + + final boolean paired=openStreams(t, args); +// if(paired){BBIndexPacBioSkimmer.QUIT_AFTER_TWO_PERFECTS=false;} + + t.start(); + + adjustThreadsforMemory(680); + + AbstractMapThread.CALC_STATISTICS=CALC_STATISTICS; + AbstractMapThread[] mtts=new 
AbstractMapThread[Shared.THREADS]; + for(int i=0; i0 || errorState){throw new RuntimeException("BBMap terminated in an error state; the output may be corrupt.");} + } + + @Override + void setSemiperfectMode() { + assert(SEMIPERFECTMODE); + if(SEMIPERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=0.45f; //To allow semiperfect reads + BBIndexPacBioSkimmer.setSemiperfectMode(); + } + } + + @Override + void setPerfectMode() { + assert(PERFECTMODE); + if(PERFECTMODE){ + TRIM_LIST=false; + keyDensity/=2; + maxKeyDensity/=2; + minKeyDensity=1.1f; + maxDesiredKeys/=2; + MINIMUM_ALIGNMENT_SCORE_RATIO=1.0f; + BBIndexPacBioSkimmer.setPerfectMode(); + } + } + + + @Override + void printSettings(int k){ + + printSettings0(k, BBIndexPacBioSkimmer.MAX_INDEL, MINIMUM_ALIGNMENT_SCORE_RATIO); + + if(verbose_stats>=2){ + sysout.println("Key Density: \t"+keyDensity+" ("+minKeyDensity+" ~ "+maxKeyDensity+")"); + sysout.println("Max keys: \t"+maxDesiredKeys); + + sysout.println("Block Subsections: \t"+BBIndexPacBioSkimmer.CHROMS_PER_BLOCK); + sysout.println("Fraction To Remove: \t"+String.format("%.4f", (BBIndexPacBioSkimmer.REMOVE_FREQUENT_GENOME_FRACTION ? 
BBIndexPacBioSkimmer.FRACTION_GENOME_TO_EXCLUDE : 0))); + // sysout.println("ADD_SCORE_Z: \t"+IndexPacBioSkimmer.ADD_SCORE_Z); + sysout.println("Hits To Keep: \t"+BBIndexPacBioSkimmer.MIN_APPROX_HITS_TO_KEEP); + } + + if(verbose_stats>=3){ + sysout.println("Remove Clumpy: \t"+BBIndexPacBioSkimmer.REMOVE_CLUMPY); + if(BBIndexPacBioSkimmer.REMOVE_CLUMPY){ + sysout.println("CLUMPY_MAX_DIST: \t"+BBIndexPacBioSkimmer.CLUMPY_MAX_DIST); + sysout.println("CLUMPY_MIN_LENGTH: \t"+BBIndexPacBioSkimmer.CLUMPY_MIN_LENGTH_INDEX); + sysout.println("CLUMPY_FRACTION: \t"+BBIndexPacBioSkimmer.CLUMPY_FRACTION); + } + sysout.println("Remove Long Lists: \t"+BBIndexPacBioSkimmer.TRIM_LONG_HIT_LISTS); + if(BBIndexPacBioSkimmer.TRIM_LONG_HIT_LISTS){ + sysout.println("HIT_FRACTION_TO_RETAIN:\t"+BBIndexPacBioSkimmer.HIT_FRACTION_TO_RETAIN); + } + sysout.println("Trim By Greedy: \t"+BBIndexPacBioSkimmer.TRIM_BY_GREEDY); + sysout.println("Trim By Total Sites: \t"+BBIndexPacBioSkimmer.TRIM_BY_TOTAL_SITE_COUNT); + if(BBIndexPacBioSkimmer.TRIM_BY_TOTAL_SITE_COUNT){ + sysout.println("MAX_AVG_SITES: \t"+BBIndexPacBioSkimmer.MAX_AVERAGE_LIST_TO_SEARCH); + sysout.println("MAX_AVG_SITES_2: \t"+BBIndexPacBioSkimmer.MAX_AVERAGE_LIST_TO_SEARCH2); + sysout.println("MAX_SHORTEST_SITE: \t"+BBIndexPacBioSkimmer.MAX_SHORTEST_LIST_TO_SEARCH); + } + sysout.println("Index Min Score: \t"+BBIndexPacBioSkimmer.MIN_SCORE_MULT); + + sysout.println("Dynamic Trim: \t"+BBIndexPacBioSkimmer.DYNAMICALLY_TRIM_LOW_SCORES); + if(BBIndexPacBioSkimmer.DYNAMICALLY_TRIM_LOW_SCORES){ + sysout.println("DYNAMIC_SCORE_THRESH: \t"+BBIndexPacBioSkimmer.DYNAMIC_SCORE_THRESH); + } + } + + } + +} diff --git a/current/align2/BBMapThread.java b/current/align2/BBMapThread.java new file mode 100755 index 0000000..a327e4b --- /dev/null +++ b/current/align2/BBMapThread.java @@ -0,0 +1,1552 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import stream.ConcurrentReadStreamInterface; 
+import stream.RTextOutputStream3; +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; + +/** + * Based on MapTestThread11f + * + * @author Brian Bushnell + * @date Dec 22, 2012 + * + */ +public final class BBMapThread extends AbstractMapThread{ + + static final int ALIGN_COLUMNS=BBIndex.ALIGN_COLUMNS; + static final int ALIGN_ROWS=501; + + + + /** Don't trim for local alignments unless at least this many bases will be clipped */ + private final int LOCAL_ALIGN_TIP_LENGTH=8; + /** Range is 0-1; a lower number makes trimming more aggressive */ + private final float LOCAL_ALIGN_MATCH_POINT_RATIO=1f; + + /** Ratio of the points for a match of a single base needed to declare unambiguous. 1 SNP is currently about 2.57 */ + public final float CLEARZONE_RATIOP=1.6f; //default 1.3f, which makes read ambiguous if there is 1 N in an alternate site. + public final float CLEARZONE_RATIO1=2.0f; + public final float CLEARZONE_RATIO1b=2.6f; + public final float CLEARZONE_RATIO1c=4.6f; + public final float CLEARZONE_RATIO3=8.0f; + /** Max allowed number of sites within 1 edit (excluding primary site) */ + public final int CLEARZONE_LIMIT1e=40; //Needs to be redone to assign a quality penalty rather than simply marking as ambiguous + public final int CLEARZONEP; + public final int CLEARZONE1; + public final int CLEARZONE1b; + public final int CLEARZONE1c; + //public final int CLEARZONE1e; + public final int CLEARZONE3; + public final float INV_CLEARZONE3; + public final float CLEARZONE1b_CUTOFF_FLAT_RATIO=12;//3f; + public final float CLEARZONE1b_CUTOFF_FLAT; + public final float CLEARZONE1b_CUTOFF_SCALE=0.97f; + public final float CLEARZONE1c_CUTOFF_FLAT_RATIO=26;//7f; + public final float CLEARZONE1c_CUTOFF_FLAT; + public final float CLEARZONE1c_CUTOFF_SCALE=0.92f; + + public final BBIndex index; + + + private final int MIN_TRIM_SITES_TO_RETAIN_SINGLE=3; + private final int MIN_TRIM_SITES_TO_RETAIN_PAIRED=2; + private int 
MAX_TRIM_SITES_TO_RETAIN=800; + + public static void setExpectedSites(int x){ + System.err.println("Warning: EXPECTED_SITES is not valid for "+(new Object() { }.getClass().getEnclosingClass().getName())); + } + + @Override + public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;} + @Override + public final int ALIGN_ROWS(){return ALIGN_ROWS;} + @Override + public final int maxReadLength(){return ALIGN_ROWS-1;} + @Override + final AbstractIndex index(){return index;} + @Override + final int CLEARZONE1(){return CLEARZONE1;} + + public BBMapThread(ConcurrentReadStreamInterface cris_, int keylen_, + boolean colorspace_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_, + int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_, + boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_, + float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_, + RTextOutputStream3 outStream_, RTextOutputStream3 outStreamMapped_, RTextOutputStream3 outStreamUnmapped_, RTextOutputStream3 outStreamBlack_, + boolean translateToBaseSpace_, int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_, + int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_, + boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_, + boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_, + boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, + boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){ + + super(cris_, + outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_, + colorspace_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_, + AMBIGUOUS_RANDOM_, 
AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, THRESH_, + minChrom_, maxChrom_, KFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_, + translateToBaseSpace_, REQUIRE_CORRECT_STRANDS_PAIRS_, + SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_, + MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_, + MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_, + QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_, + keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_, + BBIndex.MIN_APPROX_HITS_TO_KEEP, BBIndex.USE_EXTENDED_SCORE, + BBIndex.BASE_HIT_SCORE, BBIndex.USE_AFFINE_SCORE, BBIndex.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_); + + assert(SLOW_ALIGN_PADDING>=0); + assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO"; + + if(SLOW_ALIGN || MAKE_MATCH_STRING){ +// msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, colorspace, MSA_TYPE); +// POINTS_MATCH=msa.POINTS_MATCH(); +// POINTS_MATCH2=msa.POINTS_MATCH2(); + CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2); + CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2); + CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2); + CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2); + CLEARZONE3=(int)(CLEARZONE_RATIO3*POINTS_MATCH2); +// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1; + }else{ +// POINTS_MATCH=70; +// POINTS_MATCH2=100; +// msa=null; + CLEARZONE1=0; + CLEARZONE1b=0; + CLEARZONE1c=0; + CLEARZONEP=0; + CLEARZONE3=0; +// CLEARZONE1e=0; + } + + CLEARZONE1b_CUTOFF_FLAT=CLEARZONE1b_CUTOFF_FLAT_RATIO*POINTS_MATCH2; + CLEARZONE1c_CUTOFF_FLAT=CLEARZONE1c_CUTOFF_FLAT_RATIO*POINTS_MATCH2; + INV_CLEARZONE3=(CLEARZONE3==0 ? 
0 : 1f/CLEARZONE3); + + index=new BBIndex(KEYLEN, minChrom, maxChrom, KFILTER, msa); + } + + + public int trimList(ArrayList list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){ + if(list==null || list.size()==0){return -99999;} + if(list.size()==1){return list.get(0).score;} + + final int highestScore; + if(USE_AFFINE_SCORE){ + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(highestScore==maxScore && specialCasePerfect){ + Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + return highestScore; + } + + final int mstr2=(minSitesToRetain<=1 ? 1 : minSitesToRetain+1); + +// if(list.size()>6){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>10){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>14){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>18){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// //// System.out.print(", "+list.size()); +// if(list.size()>22){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// //// System.out.print(", "+list.size()); +// if(list.size()>26){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>34){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +// if(list.size()>42){Tools.trimSiteList(list, .98f, 
retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +// if(list.size()>50){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +//// if(list.size()>64){Tools.trimSiteList(list, .995f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + + if(list.size()>4){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>8){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>12){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// //// System.out.print(", "+list.size()); + if(list.size()>20){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// //// System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .97f, retainPaired, true, mstr2, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .99f, retainPaired, true, mstr2, maxSitesToRetain);} + // System.out.print(", "+list.size()); +// if(list.size()>64){Tools.trimSiteList(list, .995f, retainPaired, true, mstr2, maxSitesToRetain);} + // System.out.print(", "+list.size()); + + + }else if(USE_EXTENDED_SCORE){ + highestScore=Tools.trimSiteList(list, .75f, 
retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>8){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>36){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>64){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>80){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + + + }else{ + // System.out.print("\n\nSize:\t"+list.size()); + + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>12){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .74f, retainPaired, true, minSitesToRetain, 
maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>28){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>40){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + } + + return highestScore; + } + + + public void scoreSlow(final ArrayList list, final byte[] basesP, final byte[] basesM, + final int maxSwScore, final int maxImperfectSwScore){ + + int minMsaLimit; + if(PAIRED){ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore); + }else{ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore); + } + assert(Read.CHECKSITES(list, basesP, basesM, -1)); + + int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string + for(int i=0; ibases.length-1) : bases.length+", "+ss.toText(); + assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n"; + ss.slowScore=-1; + ss.semiperfect=false; + ss.perfect=false; + } + + final int swscoreNoIndel=ss.slowScore; + int[] swscoreArray=null; + + if(swscoreNoIndel4000){ + System.err.println(ss.toText()); + System.err.println(list.size()); + 
System.err.println(); + } + + int expectedLen=GapTools.calcGrefLen(ss); + if(expectedLen>=EXPECTED_LEN_LIMIT){ + //TODO: Alternately, I could kill the site. + ss.stop=ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT); + if(ss.gaps!=null){GapTools.fixGaps(ss);} + } + + int pad=SLOW_ALIGN_PADDING; + int minscore=Tools.max(swscoreNoIndel, minMsaLimit); + if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));} + swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore); + if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));} + + if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen="+minscore+", "+PRINT_SECONDARY_ALIGNMENTS+", "+USE_SS_MATCH_FOR_PRIMARY+", "+minMatch); +// } + if(QUICK_MATCH_STRINGS && swscoreArray!=null && swscoreArray.length==6 && swscoreArray[0]>=minscore && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){ + assert(swscoreArray.length==6) : swscoreArray.length; + assert(swscoreArray[0]>=minscore) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch; + ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null); + ss.fixXY(bases, true, msa); +// System.err.println(i+": "+(ss.match==null ? 
"null" : new String(ss.match))); + }else{ + ss.match=null; + } + } + if(swscoreArray!=null){ + if(verbose){System.err.println("msa returned "+Arrays.toString(swscoreArray));} + ss.slowScore=swscoreArray[0]; + ss.start=swscoreArray[1]; + ss.stop=swscoreArray[2]; + if(ss.gaps!=null){ + if(verbose){System.err.println("GapTools.fixGaps("+ss.start+", "+ss.stop+", "+Arrays.toString(ss.gaps)+", "+Shared.MINGAP);} + ss.gaps=GapTools.fixGaps(ss.start, ss.stop, ss.gaps, Shared.MINGAP); + } + }else{ + assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP); + assert(swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) : + swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+ + ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow + } + ss.score=ss.slowScore; + minMatch=Tools.max(minMatch, ss.slowScore); + minMsaLimit=Tools.max(minMsaLimit, ss.slowScore-CLEARZONE3); + assert(ss.slowScore<=maxSwScore); + assert(!(ss.perfect && ss.slowScore "+ss);} + } + } + + + public void calcStatistics1(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(OUTPUT_PAIRED_ONLY && r.mate!=null && !r.paired() && (r.mapped() || r.mate.mapped())){r.clearPairMapping();} + if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){ + ambiguousBestAlignment1++; + } + + int[] correctness=calcCorrectness(r, THRESH); + int correctGroup=correctness[0]; + int correctGroupSize=correctness[1]; + int numGroups=correctness[2]; + int elements=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + int sizeOfTopGroup=correctness[6]; + int numCorrect=correctness[7]; + boolean firstElementCorrect=(correctness[8]==1); + boolean firstElementCorrectLoose=(correctness[9]==1); + boolean firstGroupCorrectLoose=(correctness[10]==1); + + assert(elements>0 == r.mapped()); + + if(elements>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + 
matchCountM1+=errors[0]; + matchCountS1+=errors[1]; + matchCountD1+=errors[2]; + matchCountI1+=errors[3]; + matchCountN1+=errors[4]; + } + + + mappedRetained1++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP1++; + }else{ + rescuedM1++; + } + } + if(r.paired()){ + numMated++; + int inner; + int outer; + if(r.start<=r.mate.start){ + inner=r.mate.start-r.stop; + outer=r.mate.stop-r.start; + }else{ + inner=r.start-r.mate.stop; + outer=r.stop-r.mate.start; + } + + inner=Tools.min(MAX_PAIR_DIST, inner); + inner=Tools.max(MIN_PAIR_DIST, inner); + innerLengthSum+=inner; + outerLengthSum+=outer; + insertSizeSum+=(inner+r.bases.length+r.mate.bases.length); + }else if(r.mate!=null && r.mate.mapped()/*&& r.list!=null && r.list.size()>0*/){ + badPairs++; + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch1++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP1++;} + else{truePositiveM1++;} + totalCorrectSites1+=numCorrect; + + if(correctGroup==1){ + if(sizeOfTopGroup==1){ + correctUniqueHit1++; + }else{ + correctMultiHit1++; + } + }else{ + correctLowHit1++; + } + + }else{ + + falsePositive1++; +// System.out.println("********"); +// System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + }else{ + noHit1++; + } + } + + + public void calcStatistics2(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){ + ambiguousBestAlignment2++; + } + + int[] correctness=calcCorrectness(r, THRESH); + int correctGroup=correctness[0]; + int correctGroupSize=correctness[1]; + int numGroups=correctness[2]; + int elements=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + int sizeOfTopGroup=correctness[6]; + int numCorrect=correctness[7]; + boolean 
firstElementCorrect=(correctness[8]==1); + boolean firstElementCorrectLoose=(correctness[9]==1); + boolean firstGroupCorrectLoose=(correctness[10]==1); + + if(elements>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM2+=errors[0]; + matchCountS2+=errors[1]; + matchCountD2+=errors[2]; + matchCountI2+=errors[3]; + matchCountN2+=errors[4]; + } + + mappedRetained2++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP2++; + }else{ + rescuedM2++; + } + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch2++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP2++;} + else{truePositiveM2++;} + totalCorrectSites2+=numCorrect; + + if(correctGroup==1){ + if(sizeOfTopGroup==1){ + correctUniqueHit2++; + }else{ + correctMultiHit2++; + } + }else{ + correctLowHit2++; + } + + }else{ + + falsePositive2++; +// System.out.println("********"); +// System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded2++; + }else{ + noHit2++; + } + } + + public void processRead(final Read r, final byte[] basesM){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final byte[] basesP=r.bases; + +// System.err.print(" rd#"+r.numericID+" "); +// if(r.numericID==25967){ +// verbose=true; +// msa.verbose=true; +// GapTools.verbose=true; +// index.verbose=true; +// tcr.verbose=true; +// } + + if(verbose){System.err.println("\nProcessing "+r);} + readsUsed++; + + final int maxPossibleQuickScore=quickMap(r, basesM); + if(verbose){System.err.println("\nQuick Map: \t"+r.sites);} + + if(maxPossibleQuickScore<0){ + r.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + return; + } + initialSiteSum1+=r.numSites(); + if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);} + + int maxSwScore=0; + int maxImperfectSwScore=0; + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + 
maxSwScore=msa.maxQuality(r.bases.length); + maxImperfectSwScore=msa.maxImperfectScore(r.bases.length); + } + + if(TRIM_LIST && r.numSites()>1){ + if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);} + int highestQuickScore=trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + if(verbose){System.err.println("\nAfter trim: \t"+r.sites);} + + assert(Read.CHECKSITES(r, basesM)); + + + if(SLOW_ALIGN && r.numSites()>0){ + + int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore); + + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + +// int numPerfectScores=0; +// if(numNearPerfectScores>0){ +// for(SiteScore ss : r.list){ +// if(ss.perfect){numPerfectScores++;} +// else{break;} +// } +// } + + if(verbose){ + System.err.println("\nAfter scoreNoIndels: \t"+r.sites); + } + + if(numNearPerfectScores<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);} + } + + if(verbose){ + System.err.println("\nAfter findTipDeletions: \t"+r.sites); + } + + //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length + //***Above note should be resolved now, but needs to be verified. 
+ + if(numNearPerfectScores<1){ + scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore); + } + + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){r.clearMapping();} + } + + if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);} + assert(Read.CHECKSITES(r, basesM)); + } + + + if(r.numSites()>0){ + mapped1++; + try { + Tools.mergeDuplicateSites(r.sites, true, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("\n\n"+r.toText(false)+"\n\n"); + } + Collections.sort(r.sites); + } + + if(r.numSites()>1){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r.sites.get(1); + //Ensure no duplicates + assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false); + } + assert(Read.CHECKSITES(r, basesM)); + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);} + + if(r.numSites()>1){ + + final int clearzone; + final int score=r.topSite().score; + if(r.perfect()){clearzone=CLEARZONEP;} + else{ + assert(scorecz1blimit){ +// clearzone=CLEARZONE1; + clearzone=(int)(((maxSwScore-score)*CLEARZONE1b+(score-cz1blimit)*CLEARZONE1)/(maxSwScore-cz1blimit)); + }else if(score>cz1climit){ +// clearzone=CLEARZONE1b; + clearzone=(int)(((cz1blimit-score)*CLEARZONE1c+(score-cz1climit)*CLEARZONE1b)/(cz1blimit-cz1climit)); + }else{ + clearzone=CLEARZONE1c; + } +// assert(false) : x+", "+cz1blimit+", "+cz1climit+", "+CLEARZONE1b_CUTOFF_FLAT+", "+clearzone; + } + + +// final int clearzone=r.perfect() ? CLEARZONEP : +// r.list.get(0).score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : +// (r.list.get(0).score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? 
(CLEARZONE1b_CUTOFF-)CLEARZONE1b : CLEARZONE1c); + int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r.setAmbiguous(b); + }else{ + final int lim=(r.perfect() ? (int)(4f*CLEARZONE_LIMIT1e) : score+CLEARZONE1e>=maxSwScore ? 2*CLEARZONE_LIMIT1e : CLEARZONE_LIMIT1e)+1; + if(r.sites.size()>lim && clearzonelim){ + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r.setAmbiguous(b); + } + } + } + } + + if(verbose){System.err.println("A: "+r);} + + if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){ + int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.topSite().score1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("C: "+r);} + + //***$ + if(MAKE_MATCH_STRING && r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + if(r.sites.size()>1){ + assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n"; + } + int mapScore=r.mapScore; + + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true); + if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){ + SiteScore ss=r.topSite(); + r.mapScore=ss.score=ss.slowScore=ss.pairedScore=Tools.min(ss.score, -9999); + } + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + 
if(verbose){System.err.println("D: "+r);} + + //TODO: Fix this + // if(mapScore>r.mapScore){ + // System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID); + // } + r.topSite().score=r.topSite().slowScore; + while(r.sites.size()>1 && r.topSite().score1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("E: "+r);} + } + } + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + removeDuplicateBestSites(r); + } + if(r.numSites()>0){r.topSite().match=r.match;} + + + + if(r.sites!=null && r.mapScore<=0){//This came from BBMapThreadPacBio; not sure if needed for other modes + if(!Shared.anomaly){ + System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped.\t"+(r.match==null)+"\t"+r.mapScore+"\t"+r.topSite()); + if(MSA.bandwidth>0 || MSA.bandwidthRatio>0){Shared.anomaly=true;} + } + r.mapScore=0; + r.setMapped(false); + r.sites=null; + } + + + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + }else if(r.mapScore<=0 && r.sites!=null){ + if(BANDWIDTH<1){ + if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + } + r.clearMapping(); + } + assert(r.sites==null || r.mapScore>0) : + "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+ + "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? 
r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+ + "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+ + "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n"; + +// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString(); + + if((CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP) && r.sites!=null && !r.ambiguous()){ + + assert(r.mapScore>0); + float cz3v2=(CLEARZONE3*Tools.min(1.25f, (maxSwScore/(float)r.mapScore))); + +// boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3); + boolean changed=applyClearzone3(r, (int)cz3v2, 1/cz3v2); + if(changed){ + int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.mapScore100){ +// float penalty2=0.0004f*(500f*delta)/(500f+delta); +// r.mapScore=(int)(r.mapScore*(1-penalty2)); +// } +// } + + if(CALC_STATISTICS){ + calcStatistics1(r, maxSwScore, maxPossibleQuickScore); + } + } + + + /** Returns number of perfect pairs */ + public int pairSiteScoresInitial(Read r, Read r2, boolean trim){ + + if(r.numSites()<1 || r2.numSites()<1){return 0;} + + SiteScore.PCOMP.sort(r.sites); + SiteScore.PCOMP.sort(r2.sites); + + for(SiteScore ss : r.sites){ss.pairedScore=0;} + for(SiteScore ss : r2.sites){ss.pairedScore=0;} + +// ArrayList pairs=new ArrayList(Tools.min(8, Tools.min(r.list.size(), r2.list.size()))); + + int maxPairedScore1=-1; + int maxPairedScore2=-1; + + +// for(SiteScore ss : r.list){ +// System.out.println(ss.toText()); +// } + +// int i=0, j=0; + final int ilimit=r.sites.size()-1; + final int jlimit=r2.sites.size()-1; + final int maxReadLen=Tools.max(r.bases.length, r2.bases.length); + +// final int outerDistLimit=MIN_PAIR_DIST+r.bases.length+r2.bases.length; + 
final int outerDistLimit=(Tools.max(r.bases.length, r2.bases.length)*(OUTER_DIST_MULT))/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 100 : 0); + final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? TIP_DELETION_SEARCH_RANGE : 0); + final int expectedFragLength=AVERAGE_PAIR_DIST+r.bases.length+r2.bases.length; + + int numPerfectPairs=0; + + for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){ + SiteScore ss1=r.sites.get(i); + SiteScore ss2=r2.sites.get(j); + + while(jinnerDistLimit))){ + j++; + ss2=r2.sites.get(j); + } + + for(int k=j; k<=jlimit; k++){ + ss2=r2.sites.get(k); + + if(ss2.chrom>ss1.chrom){break;} + if(ss2.start-ss1.stop>innerDistLimit){break;} + +// int dist=0; +// +// if(ss1.start<=ss2.start){ +// dist=ss2.start-ss1.stop; +// }else if(ss1.start>ss2.start){ +// dist=ss1.start-ss2.stop; +// } + + +// int innerdist=0; +// int outerdist=0; +// +// if(ss1.start<=ss2.start){ +// innerdist=ss2.start-ss1.stop; +// outerdist=ss2.stop-ss1.start; +// }else if(ss1.start>ss2.start){ +// innerdist=ss1.start-ss2.stop; +// outerdist=ss1.stop-ss2.start; +// } + + final int innerdist, outerdist; + //assert(!SAME_STRAND_PAIRS) : "TODO"; + + if(REQUIRE_CORRECT_STRANDS_PAIRS){ + if(ss1.strand!=ss2.strand){ + if(ss1.strand==Gene.PLUS){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + + assert(outerdist>=innerdist); + + if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){ + + boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS); + + if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){ + + boolean paired1=false, 
paired2=false; + + int deviation=absdif(AVERAGE_PAIR_DIST, innerdist); + + final int pairedScore1; + final int pairedScore2; + if(strandOK){ +// pairedScore1=ss1.score+ss2.score/2; +// pairedScore2=ss2.score+ss1.score/2; + + pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-(((deviation)*ss2.score)/(32*expectedFragLength+100))); + pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-(((deviation)*ss1.score)/(32*expectedFragLength+100))); + }else{//e.g. a junction + pairedScore1=ss1.score+Tools.max(0, ss2.score/16); + pairedScore2=ss2.score+Tools.max(0, ss1.score/16); + } + + if(pairedScore1>ss1.pairedScore){ + paired1=true; + ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); + maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); + // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText()); + }else{ + // System.out.println(ss1.toText()+" already paired."); + } + if(pairedScore2>ss2.pairedScore){ + paired2=true; + ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); + maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + + if(paired1 && paired2 && outerdist>=maxReadLen && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){ + numPerfectPairs++; //Lower bound. Some perfect pairs may be the same. 
+ } + +// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); +// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); +// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); +// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + } + } + + } + + + + for(SiteScore ss : r.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + for(SiteScore ss : r2.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + + if(trim){ + if(numPerfectPairs>0){ +// System.out.print("."); + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN); + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN); + }else{ + if(r.sites.size()>4){ + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN); + } + if(r2.sites.size()>4){ + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN); + } + } + } + +// if(pairs.isEmpty()){return null;} +// +// ArrayList temp=new ArrayList(Tools.max(r.list.size(), r2.list.size())); +// +// for(SiteScore ss : r.list){ +// if(ss.score>maxPairedScore1){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.a); +// } +// r.list.clear(); +// r.list.addAll(temp); +// +// for(SiteScore ss : r2.list){ +// if(ss.score>maxPairedScore2){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.b); +// } +// r2.list.clear(); +// r2.list.addAll(temp); +// +// return pairs; + + return numPerfectPairs; + } + + + public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){ +// if(r.numericID==2660){ +// verbose=msa.verbose=SiteScore.verbose=true; +// }else{ +// 
verbose=msa.verbose=SiteScore.verbose=false; +// } + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final Read r2=r.mate; + assert(r2!=null); + final byte[] basesP1=r.bases, basesP2=r2.bases; + + readsUsed++; + readsUsed2++; + + final int maxPossibleQuickScore1=quickMap(r, basesM1); + final int maxPossibleQuickScore2=quickMap(r2, basesM2); + + if(verbose){ + System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate); + } + + if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){ + r.sites=null; + r2.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + lowQualityReadsDiscarded2++; + r2.setDiscarded(true); + return; + } + + //Not really needed due to subsumption +// Tools.mergeDuplicateSites(r.list); +// Tools.mergeDuplicateSites(r2.list); + + initialSiteSum1+=r.numSites(); + initialSiteSum2+=r2.numSites(); + + //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used. + //Discards need to be tracked separately for each end. 
+// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;} + + final int maxSwScore1=msa.maxQuality(r.bases.length); + final int maxImperfectSwScore1=msa.maxImperfectScore(r.bases.length); + final int maxSwScore2=msa.maxQuality(r2.bases.length); + final int maxImperfectSwScore2=msa.maxImperfectScore(r2.bases.length); + + pairSiteScoresInitial(r, r2, TRIM_LIST); + if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(TRIM_LIST){ + + if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){ + if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r.sites);} + if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r2.sites);} + } + + trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + postTrimSiteSum2+=r2.numSites(); + + {//Reset score to non-paired score + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + } + + if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);} + + if(SLOW_ALIGN){ + + if(r.numSites()>0){ + + int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores1<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);} + } + + //TODO: + //Note scoreSlow can be skipped under this circumstance: + //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites. 
+ scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){r.clearMapping();} + } + Tools.mergeDuplicateSites(r.sites, true, true); + } + + if(r2.numSites()>0){ + int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores2<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);} + } + + scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r2.sites, index.MAX_INDEL); + if(r2.numSites()<1){r2.clearMapping();} + } + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + + if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(DO_RESCUE){ + int unpaired1=0; + int unpaired2=0; + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + "\n"+ss.toText()+"\n"+r.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired1++;} + } + } + if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + "\n"+ss.toText()+"\n"+r2.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired2++;} + } + } + + if(unpaired1>0 && r.numSites()>0){ + Collections.sort(r.sites); + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + if(unpaired2>0 && r2.numSites()>0){ + Collections.sort(r2.sites); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, 
MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r.sites, true, true); + } + + postRescueSiteSum1+=r.numSites(); + postRescueSiteSum2+=r2.numSites(); + +// if(r.list!=null){Collections.sort(r.list);} +// if(r2.list!=null){Collections.sort(r2.list);} +// +// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); +// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + + if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + }else{ + Tools.mergeDuplicateSites(r.sites, true, false); + Tools.mergeDuplicateSites(r2.sites, true, false); + if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + + if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!) 
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, true, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + if(verbose){System.err.println("\nAfter final pairing:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0){ + mapped1++; + Collections.sort(r.sites); + } + if(r2.numSites()>0){ + mapped2++; + Collections.sort(r2.sites); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + r.setPerfectFlag(maxSwScore1); + r2.setPerfectFlag(maxSwScore2); + } + + + if(r.numSites()>1){ + final int clearzone=r.perfect() ? CLEARZONEP : + r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT) ? CLEARZONE1 : + (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r.setAmbiguous(b); + } + } + + if(r2.numSites()>1){ + final int clearzone=r2.perfect() ? 
CLEARZONEP : + r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT) ? CLEARZONE1 : + (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites2=Tools.countTopScores(r2.sites, clearzone); + if(numBestSites2>1){ + //Ambiguous alignment + assert(r2.sites.size()>1); + + boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r2.setAmbiguous(b); + } + } + if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0 && r2.numSites()>0){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r2.topSite(); + if(canPair(ss1, ss2, r.bases.length, r2.bases.length, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n"+ + r.mapped()+", "+r.paired()+", "+r.strand()+", "+r.ambiguous()+"\n\n"+r2.mapped()+", "+r2.paired()+", "+r2.strand()+", "+r2.ambiguous()+"\n\n"; + assert(SLOW_ALIGN ? 
ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n\n"; + r.setPaired(true); + r.mate.setPaired(true); + } + } + + if(r.numSites()==0){r.sites=null;r.mapScore=0;} + if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;} + +// assert(Read.CHECKSITES(r, basesM));//***123 +// assert(Read.CHECKSITES(r2));//***123 + + r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + assert(checkTopSite(r)); // TODO remove this + if(KILL_BAD_PAIRS){ + if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + int x=r.mapScore/r.bases.length; + int y=r2.mapScore/r2.bases.length; + if(x>=y){ + r2.clearAnswers(false); + }else{ + r.clearAnswers(false); + } + } + } + if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(checkTopSite(r)); // TODO remove this + if(MAKE_MATCH_STRING){ + if(r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + assert(checkTopSite(r)); // TODO remove this + genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false); + assert(checkTopSite(r)); // TODO remove this + } + } + if(r2.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){ + r2.match=r2.topSite().match; + }else{ + genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false); + } + } + } + + assert(checkTopSite(r)); // TODO remove this + if(verbose){ + System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2); + if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);} + if(r2.match!=null && 
r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);} + } + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + }else if(r.mapScore<=0 && r.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + } + assert(checkTopSite(r)); // TODO remove this + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r2.mapScore>0 && r2.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + }else if(r2.mapScore<=0 && r2.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + } + + assert(r.sites==null || r.mapScore>0) : + r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? 
r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + assert(r2.sites==null || r2.mapScore>0) : + r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + + assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails."; + assert(checkTopSite(r)); // TODO remove this + removeDuplicateBestSites(r); + removeDuplicateBestSites(r2); + + if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){ + AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated); + } + assert(checkTopSite(r)); // TODO remove this + if(r.ambiguous() && AMBIGUOUS_TOSS){ + if(r.sites!=null){r.sites=null;} + r.clearSite(); + r.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + if(r2.ambiguous() && AMBIGUOUS_TOSS){ + if(r2.sites!=null){r2.sites=null;} + r2.clearSite(); + r2.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + + assert(checkTopSite(r)); + if(r.mapped() && (LOCAL_ALIGN || r.containsXY2())){ + final SiteScore ss=r.topSite(); + ss.match=r.match; + msa.toLocalAlignment(r, ss, r.containsXY2() ? 
1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + assert(checkTopSite(r2)); + if(r2.mapped() && (LOCAL_ALIGN || r2.containsXY2())){ + final SiteScore ss=r2.topSite(); + ss.match=r2.match; + msa.toLocalAlignment(r2, ss, r2.containsXY2() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + if(CALC_STATISTICS){ + calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1); + calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2); + } + } + +} diff --git a/current/align2/BBMapThread5.java b/current/align2/BBMapThread5.java new file mode 100755 index 0000000..419cf1a --- /dev/null +++ b/current/align2/BBMapThread5.java @@ -0,0 +1,1471 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import stream.ConcurrentReadStreamInterface; +import stream.RTextOutputStream3; +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.Data; +import dna.Gene; + +/** + * Based on MapTestThread11f + * + * @author Brian Bushnell + * @date Jan 3, 2013 + * + */ +public final class BBMapThread5 extends AbstractMapThread { + + static final int ALIGN_COLUMNS=BBIndex5.ALIGN_COLUMNS; + static final int ALIGN_ROWS=501; + + + + /** Don't trim for local alignments unless at least this many bases will be clipped */ + private final int LOCAL_ALIGN_TIP_LENGTH=8; + /** Range is 0-1; a lower number makes trimming more aggressive */ + private final float LOCAL_ALIGN_MATCH_POINT_RATIO=1f; + + /** Ratio of the points for a match of a single base needed to declare unambiguous. 1 SNP is currently about 2.57 */ + public final float CLEARZONE_RATIOP=1.6f; //default 1.3f, which makes read ambiguous if there is 1 N in an alternate site. 
+ public final float CLEARZONE_RATIO1=2.0f; + public final float CLEARZONE_RATIO1b=2.6f; + public final float CLEARZONE_RATIO1c=4.6f; + public final float CLEARZONE_RATIO3=8.0f; + /** Max allowed number of sites within 1 edit (excluding primary site) */ + public final int CLEARZONE_LIMIT1e=40; //Needs to be redone to assign a quality penalty rather than simply marking as ambiguous + public final int CLEARZONEP; + public final int CLEARZONE1; + public final int CLEARZONE1b; + public final int CLEARZONE1c; + //public final int CLEARZONE1e; + public final int CLEARZONE3; + public final float INV_CLEARZONE3; + public final float CLEARZONE1b_CUTOFF=0.92f; + public final float CLEARZONE1c_CUTOFF=0.82f; + + public final BBIndex5 index; + + + private final int MIN_TRIM_SITES_TO_RETAIN_SINGLE=3; + private final int MIN_TRIM_SITES_TO_RETAIN_PAIRED=2; + private int MAX_TRIM_SITES_TO_RETAIN=800; + + public static void setExpectedSites(int x){ + System.err.println("Warning: EXPECTED_SITES is not valid for "+(new Object() { }.getClass().getEnclosingClass().getName())); + } + + @Override + public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;} + @Override + public final int ALIGN_ROWS(){return ALIGN_ROWS;} + @Override + public final int maxReadLength(){return ALIGN_ROWS-1;} + @Override + final AbstractIndex index(){return index;} + @Override + final int CLEARZONE1(){return CLEARZONE1;} + + public BBMapThread5(ConcurrentReadStreamInterface cris_, int keylen_, + boolean colorspace_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_, + int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_, + boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_, + float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_, + RTextOutputStream3 outStream_, RTextOutputStream3 outStreamMapped_, RTextOutputStream3 outStreamUnmapped_, RTextOutputStream3 outStreamBlack_, + boolean 
translateToBaseSpace_, int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_, + int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_, + boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_, + boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_, + boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, + boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){ + + super(cris_, + outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_, + colorspace_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_, + AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, THRESH_, + minChrom_, maxChrom_, KFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_, + translateToBaseSpace_, REQUIRE_CORRECT_STRANDS_PAIRS_, + SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_, + MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_, + MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_, + QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_, + keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_, + BBIndex5.MIN_APPROX_HITS_TO_KEEP, BBIndex5.USE_EXTENDED_SCORE, + BBIndex5.BASE_HIT_SCORE, BBIndex5.USE_AFFINE_SCORE, BBIndex5.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_); + + assert(SLOW_ALIGN_PADDING>=0); + assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO"; + + if(SLOW_ALIGN || MAKE_MATCH_STRING){ +// msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, colorspace, MSA_TYPE); +// POINTS_MATCH=msa.POINTS_MATCH(); +// POINTS_MATCH2=msa.POINTS_MATCH2(); + 
CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2); + CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2); + CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2); + CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2); + CLEARZONE3=(int)(CLEARZONE_RATIO3*POINTS_MATCH2); +// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1; + }else{ +// POINTS_MATCH=70; +// POINTS_MATCH2=100; +// msa=null; + CLEARZONE1=0; + CLEARZONE1b=0; + CLEARZONE1c=0; + CLEARZONEP=0; + CLEARZONE3=0; +// CLEARZONE1e=0; + } + INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3); + + index=new BBIndex5(KEYLEN, minChrom, maxChrom, KFILTER, msa); + } + + + public int trimList(ArrayList list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){ + if(list==null || list.size()==0){return -99999;} + if(list.size()==1){return list.get(0).score;} + + final int highestScore; + if(USE_AFFINE_SCORE){ + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(highestScore==maxScore && specialCasePerfect){ + Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + return highestScore; + } + + final int mstr2=(minSitesToRetain<=1 ? 
1 : minSitesToRetain+1); + +// if(list.size()>6){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>10){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>14){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>18){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// //// System.out.print(", "+list.size()); +// if(list.size()>22){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// //// System.out.print(", "+list.size()); +// if(list.size()>26){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>34){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +// if(list.size()>42){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +// if(list.size()>50){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +//// if(list.size()>64){Tools.trimSiteList(list, .995f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + + if(list.size()>4){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>8){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>12){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, 
maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// //// System.out.print(", "+list.size()); + if(list.size()>20){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// //// System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .97f, retainPaired, true, mstr2, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .99f, retainPaired, true, mstr2, maxSitesToRetain);} + // System.out.print(", "+list.size()); +// if(list.size()>64){Tools.trimSiteList(list, .995f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + + + }else if(USE_EXTENDED_SCORE){ + highestScore=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>8){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>36){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", 
"+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>64){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>80){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + + + }else{ + // System.out.print("\n\nSize:\t"+list.size()); + + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>12){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .74f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>28){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>40){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", 
"+list.size()); + // if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + } + + return highestScore; + } + + + public void scoreSlow(final ArrayList list, final byte[] basesP, final byte[] basesM, + final int maxSwScore, final int maxImperfectSwScore){ + + int minMsaLimit; + if(PAIRED){ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore); + }else{ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore); + } + assert(Read.CHECKSITES(list, basesP, basesM, -1)); + + int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string + for(int i=0; ibases.length-1) : bases.length+", "+ss.toText(); + assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n"; + ss.slowScore=-1; + ss.semiperfect=false; + ss.perfect=false; + } + + final int swscoreNoIndel=ss.slowScore; + int[] swscoreArray=null; + + if(swscoreNoIndel4000){ + System.err.println(ss.toText()); + System.err.println(list.size()); + System.err.println(); + } + + int expectedLen=GapTools.calcGrefLen(ss); + if(expectedLen>=EXPECTED_LEN_LIMIT){ + //TODO: Alternately, I could kill the site. 
+ ss.stop=ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT); + if(ss.gaps!=null){GapTools.fixGaps(ss);} + } + + int pad=SLOW_ALIGN_PADDING; + int minscore=Tools.max(swscoreNoIndel, minMsaLimit); + if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));} + swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore); + if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));} + + if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen=minscore && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){ + assert(swscoreArray.length==6) : swscoreArray.length; + assert(swscoreArray[0]>=minscore) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch; + ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null); + ss.fixXY(bases, true, msa); + }else{ss.match=null;} + } + if(swscoreArray!=null){ + if(verbose){System.err.println("msa returned "+Arrays.toString(swscoreArray));} + ss.slowScore=swscoreArray[0]; + ss.start=swscoreArray[1]; + ss.stop=swscoreArray[2]; + if(ss.gaps!=null){ + if(verbose){System.err.println("GapTools.fixGaps("+ss.start+", "+ss.stop+", "+Arrays.toString(ss.gaps)+", "+Shared.MINGAP);} + ss.gaps=GapTools.fixGaps(ss.start, ss.stop, ss.gaps, Shared.MINGAP); + } + }else{ + assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP); + assert(swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) : + swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+ + ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow + } + ss.score=ss.slowScore; + minMatch=Tools.max(minMatch, ss.slowScore); + minMsaLimit=Tools.max(minMsaLimit, 
ss.slowScore-CLEARZONE3); + assert(ss.slowScore<=maxSwScore); + assert(!(ss.perfect && ss.slowScore "+ss);} + } + } + + + public void calcStatistics1(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(OUTPUT_PAIRED_ONLY && r.mate!=null && !r.paired() && (r.mapped() || r.mate.mapped())){r.clearPairMapping();} + if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){ + ambiguousBestAlignment1++; + } + + int[] correctness=calcCorrectness(r, THRESH); + int correctGroup=correctness[0]; + int correctGroupSize=correctness[1]; + int numGroups=correctness[2]; + int elements=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + int sizeOfTopGroup=correctness[6]; + int numCorrect=correctness[7]; + boolean firstElementCorrect=(correctness[8]==1); + boolean firstElementCorrectLoose=(correctness[9]==1); + boolean firstGroupCorrectLoose=(correctness[10]==1); + + assert(elements>0 == r.mapped()); + + if(elements>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM1+=errors[0]; + matchCountS1+=errors[1]; + matchCountD1+=errors[2]; + matchCountI1+=errors[3]; + matchCountN1+=errors[4]; + } + + + mappedRetained1++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP1++; + }else{ + rescuedM1++; + } + } + if(r.paired()){ + numMated++; + int inner; + int outer; + if(r.start<=r.mate.start){ + inner=r.mate.start-r.stop; + outer=r.mate.stop-r.start; + }else{ + inner=r.start-r.mate.stop; + outer=r.stop-r.mate.start; + } + + inner=Tools.min(MAX_PAIR_DIST, inner); + inner=Tools.max(MIN_PAIR_DIST, inner); + innerLengthSum+=inner; + outerLengthSum+=outer; + insertSizeSum+=(inner+r.bases.length+r.mate.bases.length); + }else if(r.mate!=null && r.mate.mapped()/*&& r.list!=null && r.list.size()>0*/){ + badPairs++; + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch1++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP1++;} + 
else{truePositiveM1++;} + totalCorrectSites1+=numCorrect; + + if(correctGroup==1){ + if(sizeOfTopGroup==1){ + correctUniqueHit1++; + }else{ + correctMultiHit1++; + } + }else{ + correctLowHit1++; + } + + }else{ + + falsePositive1++; +// System.out.println("********"); +// System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + }else{ + noHit1++; + } + } + + + public void calcStatistics2(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){ + ambiguousBestAlignment2++; + } + + int[] correctness=calcCorrectness(r, THRESH); + int correctGroup=correctness[0]; + int correctGroupSize=correctness[1]; + int numGroups=correctness[2]; + int elements=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + int sizeOfTopGroup=correctness[6]; + int numCorrect=correctness[7]; + boolean firstElementCorrect=(correctness[8]==1); + boolean firstElementCorrectLoose=(correctness[9]==1); + boolean firstGroupCorrectLoose=(correctness[10]==1); + + if(elements>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM2+=errors[0]; + matchCountS2+=errors[1]; + matchCountD2+=errors[2]; + matchCountI2+=errors[3]; + matchCountN2+=errors[4]; + } + + mappedRetained2++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP2++; + }else{ + rescuedM2++; + } + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch2++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP2++;} + else{truePositiveM2++;} + totalCorrectSites2+=numCorrect; + + if(correctGroup==1){ + if(sizeOfTopGroup==1){ + correctUniqueHit2++; + }else{ + correctMultiHit2++; + } + }else{ + correctLowHit2++; + } + + }else{ + + falsePositive2++; +// System.out.println("********"); +// 
System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded2++; + }else{ + noHit2++; + } + } + + public void processRead(final Read r, final byte[] basesM){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final byte[] basesP=r.bases; + +// System.err.print(" rd#"+r.numericID+" "); +// if(r.numericID==25967){ +// verbose=true; +// msa.verbose=true; +// GapTools.verbose=true; +// index.verbose=true; +// tcr.verbose=true; +// } + + if(verbose){System.err.println("\nProcessing "+r);} + readsUsed++; + + final int maxPossibleQuickScore=quickMap(r, basesM); + if(verbose){System.err.println("\nQuick Map: \t"+r.sites);} + + if(maxPossibleQuickScore<0){ + r.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + return; + } + initialSiteSum1+=r.numSites(); + if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);} + + int maxSwScore=0; + int maxImperfectSwScore=0; + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + maxSwScore=msa.maxQuality(r.bases.length); + maxImperfectSwScore=msa.maxImperfectScore(r.bases.length); + } + + if(TRIM_LIST && r.numSites()>1){ + if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);} + int highestQuickScore=trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + if(verbose){System.err.println("\nAfter trim: \t"+r.sites);} + + assert(Read.CHECKSITES(r, basesM)); + + + if(SLOW_ALIGN && r.numSites()>0){ + + int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore); + + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + +// int numPerfectScores=0; +// if(numNearPerfectScores>0){ +// for(SiteScore ss : r.list){ +// if(ss.perfect){numPerfectScores++;} +// else{break;} +// } +// } + + if(verbose){ + System.err.println("\nAfter scoreNoIndels: 
\t"+r.sites); + } + + if(numNearPerfectScores<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);} + } + + if(verbose){ + System.err.println("\nAfter findTipDeletions: \t"+r.sites); + } + + //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length + //***Above note should be resolved now, but needs to be verified. + + if(numNearPerfectScores<1){ + scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore); + } + + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){r.clearMapping();} + } + + if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);} + assert(Read.CHECKSITES(r, basesM)); + } + + + if(r.numSites()>0){ + mapped1++; + try { + Tools.mergeDuplicateSites(r.sites, true, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("\n\n"+r.toText(false)+"\n\n"); + } + Collections.sort(r.sites); + } + + if(r.numSites()>1){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r.sites.get(1); + //Ensure no duplicates + assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false); + } + assert(Read.CHECKSITES(r, basesM)); + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);} + + if(r.numSites()>1){ + final int clearzone=r.perfect() ? CLEARZONEP : + r.topSite().score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : + (r.topSite().score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? 
CLEARZONE1b : CLEARZONE1c); + final int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r.setAmbiguous(b); + } + } + + if(verbose){System.err.println("A: "+r);} + + if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){ + int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.topSite().score0) : r.sites+", "+r.mapScore+"\n"+r; + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("C: "+r);} + + //***$ + if(MAKE_MATCH_STRING && r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + if(r.sites.size()>1){ + assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n"; + } + int mapScore=r.mapScore; + + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true); + if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){ + SiteScore ss=r.topSite(); + r.mapScore=ss.score=ss.slowScore=ss.pairedScore=Tools.min(ss.score, -9999); + } + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("D: "+r);} + + //TODO: Fix this + // if(mapScore>r.mapScore){ + // System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID); + // } + r.topSite().score=r.topSite().slowScore; + 
while(r.sites.size()>1 && r.topSite().score1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("E: "+r);} + } + } + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + removeDuplicateBestSites(r); + } + if(r.numSites()>0){r.topSite().match=r.match;} + + + + if(r.sites!=null && r.mapScore<=0){//This came from BBMapThreadPacBio; not sure if needed for other modes + if(!Shared.anomaly){ + System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped."); + if(MSA.bandwidth>0 || MSA.bandwidthRatio>0){Shared.anomaly=true;} + } + r.mapScore=0; + r.setMapped(false); + r.sites=null; + } + + + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + }else if(r.mapScore<=0 && r.sites!=null){ + if(BANDWIDTH<1){ + if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + } + r.clearMapping(); + } + assert(r.sites==null || r.mapScore>0) : + "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+ + "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+ + "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? 
r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+ + "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n"; + +// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString(); + + if(CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP){ + boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3); + if(changed){ + int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.mapScore pairs=new ArrayList(Tools.min(8, Tools.min(r.list.size(), r2.list.size()))); + + int maxPairedScore1=-1; + int maxPairedScore2=-1; + + +// for(SiteScore ss : r.list){ +// System.out.println(ss.toText()); +// } + +// int i=0, j=0; + final int ilimit=r.sites.size()-1; + final int jlimit=r2.sites.size()-1; + +// final int outerDistLimit=MIN_PAIR_DIST+r.bases.length+r2.bases.length; + final int outerDistLimit=(Tools.max(r.bases.length, r2.bases.length)*OUTER_DIST_MULT)/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 100 : 0); + final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? 
TIP_DELETION_SEARCH_RANGE : 0); + final int expectedFragLength=AVERAGE_PAIR_DIST+r.bases.length+r2.bases.length; + + int numPerfectPairs=0; + + for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){ + SiteScore ss1=r.sites.get(i); + SiteScore ss2=r2.sites.get(j); + + while(jinnerDistLimit))){ + j++; + ss2=r2.sites.get(j); + } + + for(int k=j; k<=jlimit; k++){ + ss2=r2.sites.get(k); + + if(ss2.chrom>ss1.chrom){break;} + if(ss2.start-ss1.stop>innerDistLimit){break;} + +// int dist=0; +// +// if(ss1.start<=ss2.start){ +// dist=ss2.start-ss1.stop; +// }else if(ss1.start>ss2.start){ +// dist=ss1.start-ss2.stop; +// } + + +// int innerdist=0; +// int outerdist=0; +// +// if(ss1.start<=ss2.start){ +// innerdist=ss2.start-ss1.stop; +// outerdist=ss2.stop-ss1.start; +// }else if(ss1.start>ss2.start){ +// innerdist=ss1.start-ss2.stop; +// outerdist=ss1.stop-ss2.start; +// } + + final int innerdist, outerdist; + //assert(!SAME_STRAND_PAIRS) : "TODO"; + + if(REQUIRE_CORRECT_STRANDS_PAIRS){ + if(ss1.strand!=ss2.strand){ + if(ss1.strand==Gene.PLUS){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + + assert(outerdist>=innerdist); + + if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){ + + boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS); + + if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){ + + boolean paired1=false, paired2=false; + + int deviation=absdif(AVERAGE_PAIR_DIST, innerdist); + + final int pairedScore1; + final int pairedScore2; + if(strandOK){ +// pairedScore1=ss1.score+ss2.score/2; +// 
pairedScore2=ss2.score+ss1.score/2; + + pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-((deviation*ss2.score)/(32*expectedFragLength+100))); + pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-((deviation*ss1.score)/(32*expectedFragLength+100))); + }else{//e.g. a junction + pairedScore1=ss1.score+ss2.score/16; + pairedScore2=ss2.score+ss1.score/16; + } + + if(pairedScore1>ss1.pairedScore){ + paired1=true; + ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); + maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); + // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText()); + }else{ + // System.out.println(ss1.toText()+" already paired."); + } + if(pairedScore2>ss2.pairedScore){ + paired2=true; + ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); + maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + + if(paired1 && paired2 && innerdist>0 && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){ + numPerfectPairs++; //Lower bound. Some perfect pairs may be the same. 
+ } + +// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); +// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); +// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); +// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + } + } + + } + + + + for(SiteScore ss : r.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + for(SiteScore ss : r2.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + + if(trim){ + if(numPerfectPairs>0){ +// System.out.print("."); + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN); + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN); + }else{ + if(r.sites.size()>4){ + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN); + } + if(r2.sites.size()>4){ + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN); + } + } + } + +// if(pairs.isEmpty()){return null;} +// +// ArrayList temp=new ArrayList(Tools.max(r.list.size(), r2.list.size())); +// +// for(SiteScore ss : r.list){ +// if(ss.score>maxPairedScore1){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.a); +// } +// r.list.clear(); +// r.list.addAll(temp); +// +// for(SiteScore ss : r2.list){ +// if(ss.score>maxPairedScore2){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.b); +// } +// r2.list.clear(); +// r2.list.addAll(temp); +// +// return pairs; + + return numPerfectPairs; + } + + + public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final Read r2=r.mate; + assert(r2!=null); + 
final byte[] basesP1=r.bases, basesP2=r2.bases; + + readsUsed++; + readsUsed2++; + + final int maxPossibleQuickScore1=quickMap(r, basesM1); + final int maxPossibleQuickScore2=quickMap(r2, basesM2); + + if(verbose){ + System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate); + } + + if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){ + r.sites=null; + r2.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + lowQualityReadsDiscarded2++; + r2.setDiscarded(true); + return; + } + + //Not really needed due to subsumption +// Tools.mergeDuplicateSites(r.list); +// Tools.mergeDuplicateSites(r2.list); + + initialSiteSum1+=r.numSites(); + initialSiteSum2+=r2.numSites(); + + //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used. + //Discards need to be tracked separately for each end. +// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;} + + final int maxSwScore1=msa.maxQuality(r.bases.length); + final int maxImperfectSwScore1=msa.maxImperfectScore(r.bases.length); + final int maxSwScore2=msa.maxQuality(r2.bases.length); + final int maxImperfectSwScore2=msa.maxImperfectScore(r2.bases.length); + + pairSiteScoresInitial(r, r2, TRIM_LIST); + if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(TRIM_LIST){ + + if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){ + if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r.sites);} + if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r2.sites);} + } + + trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + postTrimSiteSum2+=r2.numSites(); + + {//Reset score to non-paired score + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + 
if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + } + + if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);} + + if(SLOW_ALIGN){ + + if(r.numSites()>0){ + + int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores1<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);} + } + + //TODO: + //Note scoreSlow can be skipped under this circumstance: + //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites. + scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){r.clearMapping();} + } + Tools.mergeDuplicateSites(r.sites, true, true); + } + + if(r2.numSites()>0){ + int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores2<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);} + } + + scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r2.sites, index.MAX_INDEL); + if(r2.numSites()<1){r2.clearMapping();} + } + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + + if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(DO_RESCUE){ + int unpaired1=0; + int unpaired2=0; + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + 
"\n"+ss.toText()+"\n"+r.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired1++;} + } + } + if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + "\n"+ss.toText()+"\n"+r2.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired2++;} + } + } + + if(unpaired1>0 && r.numSites()>0){ + Collections.sort(r.sites); + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + if(unpaired2>0 && r2.numSites()>0){ + Collections.sort(r2.sites); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r.sites, true, true); + } + + postRescueSiteSum1+=r.numSites(); + postRescueSiteSum2+=r2.numSites(); + +// if(r.list!=null){Collections.sort(r.list);} +// if(r2.list!=null){Collections.sort(r2.list);} +// +// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); +// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + + if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + }else{ + Tools.mergeDuplicateSites(r.sites, true, false); + Tools.mergeDuplicateSites(r2.sites, true, false); + if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + + if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!) 
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, true, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + if(verbose){System.err.println("\nAfter final pairing:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0){ + mapped1++; + Collections.sort(r.sites); + } + if(r2.numSites()>0){ + mapped2++; + Collections.sort(r2.sites); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + r.setPerfectFlag(maxSwScore1); + r2.setPerfectFlag(maxSwScore2); + } + + + if(r.numSites()>1){ + final int clearzone=r.perfect() ? CLEARZONEP : + r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : + (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r.setAmbiguous(b); + } + } + + if(r2.numSites()>1){ + final int clearzone=r2.perfect() ? 
CLEARZONEP : + r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : + (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites2=Tools.countTopScores(r2.sites, clearzone); + if(numBestSites2>1){ + //Ambiguous alignment + assert(r2.sites.size()>1); + + boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r2.setAmbiguous(b); + } + } + if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0 && r2.numSites()>0){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r2.topSite(); + if(canPair(ss1, ss2, r.bases.length, r2.bases.length, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n"; + assert(SLOW_ALIGN ? ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n"; + r.setPaired(true); + r.mate.setPaired(true); + } + } + + if(r.numSites()==0){r.sites=null;r.mapScore=0;} + if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;} + +// assert(Read.CHECKSITES(r, basesM));//***123 +// assert(Read.CHECKSITES(r2));//***123 + + r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + assert(checkTopSite(r)); // TODO remove this + if(KILL_BAD_PAIRS){ + if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + int x=r.mapScore/r.bases.length; + int y=r2.mapScore/r2.bases.length; + if(x>=y){ + r2.clearAnswers(false); + }else{ + r.clearAnswers(false); + } + } + } + if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + 
assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(checkTopSite(r)); // TODO remove this + if(MAKE_MATCH_STRING){ + if(r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + assert(checkTopSite(r)); // TODO remove this + genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false); + assert(checkTopSite(r)); // TODO remove this + } + } + if(r2.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){ + r2.match=r2.topSite().match; + }else{ + genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false); + } + } + } + + assert(checkTopSite(r)); // TODO remove this + if(verbose){ + System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2); + if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);} + if(r2.match!=null && r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);} + } + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + }else if(r.mapScore<=0 && r.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + } + assert(checkTopSite(r)); // TODO remove this + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. 
+ if(r2.mapScore>0 && r2.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + }else if(r2.mapScore<=0 && r2.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + } + + assert(r.sites==null || r.mapScore>0) : + r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + assert(r2.sites==null || r2.mapScore>0) : + r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? 
r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + + assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails."; + assert(checkTopSite(r)); // TODO remove this + removeDuplicateBestSites(r); + removeDuplicateBestSites(r2); + + if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){ + AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated); + } + assert(checkTopSite(r)); // TODO remove this + if(r.ambiguous() && AMBIGUOUS_TOSS){ + if(r.sites!=null){r.sites=null;} + r.clearSite(); + r.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + if(r2.ambiguous() && AMBIGUOUS_TOSS){ + if(r2.sites!=null){r2.sites=null;} + r2.clearSite(); + r2.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + + assert(checkTopSite(r)); + if(r.mapped() && (LOCAL_ALIGN || r.containsXY2())){ + final SiteScore ss=r.topSite(); + ss.match=r.match; + msa.toLocalAlignment(r, ss, r.containsXY2() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + assert(checkTopSite(r2)); + if(r2.mapped() && (LOCAL_ALIGN || r2.containsXY2())){ + final SiteScore ss=r2.topSite(); + ss.match=r2.match; + msa.toLocalAlignment(r2, ss, r2.containsXY2() ? 
1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + if(CALC_STATISTICS){ + calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1); + calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2); + } + } + +} diff --git a/current/align2/BBMapThreadAcc.java b/current/align2/BBMapThreadAcc.java new file mode 100755 index 0000000..93d5227 --- /dev/null +++ b/current/align2/BBMapThreadAcc.java @@ -0,0 +1,1576 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import stream.ConcurrentReadStreamInterface; +import stream.RTextOutputStream3; +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.Data; +import dna.Gene; + +/** + * Based on MapTestThread11i + * + * @author Brian Bushnell + * @date Jul 10, 2012 + * + */ +public final class BBMapThreadAcc extends AbstractMapThread{ + + static final int ALIGN_COLUMNS=BBIndexAcc.ALIGN_COLUMNS; + static final int ALIGN_ROWS=501; + + + + /** Don't trim for local alignments unless at least this many bases will be clipped */ + private final int LOCAL_ALIGN_TIP_LENGTH=8; + /** Range is 0-1; a lower number makes trimming more aggressive */ + private final float LOCAL_ALIGN_MATCH_POINT_RATIO=1f; + + /** Ratio of the points for a match of a single base needed to declare unambiguous. 1 SNP is currently about 2.57 */ + public final float CLEARZONE_RATIOP=1.6f; //default 1.3f, which makes read ambiguous if there is 1 N in an alternate site. 
+ public final float CLEARZONE_RATIO1=2.0f; + public final float CLEARZONE_RATIO1b=2.6f; + public final float CLEARZONE_RATIO1c=4.8f; + public final float CLEARZONE_RATIO3=9.5f; + /** Max allowed number of sites within 1 edit (excluding primary site) */ + public final int CLEARZONE_LIMIT1e=50; + public final int CLEARZONEP; + public final int CLEARZONE1; + public final int CLEARZONE1b; + public final int CLEARZONE1c; + //public final int CLEARZONE1e; + public final int CLEARZONE3; + public final float INV_CLEARZONE3; + public final float CLEARZONE1b_CUTOFF_FLAT_RATIO=12;//3f; + public final float CLEARZONE1b_CUTOFF_FLAT; + public final float CLEARZONE1b_CUTOFF_SCALE=0.97f; + public final float CLEARZONE1c_CUTOFF_FLAT_RATIO=26;//7f; + public final float CLEARZONE1c_CUTOFF_FLAT; + public final float CLEARZONE1c_CUTOFF_SCALE=0.92f; + + public final BBIndexAcc index; + + + private final int MIN_TRIM_SITES_TO_RETAIN_SINGLE=3; + private final int MIN_TRIM_SITES_TO_RETAIN_PAIRED=2; + private int MAX_TRIM_SITES_TO_RETAIN=800; + + public static void setExpectedSites(int x){ + System.err.println("Warning: EXPECTED_SITES is not valid for "+(new Object() { }.getClass().getEnclosingClass().getName())); + } + + @Override + public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;} + @Override + public final int ALIGN_ROWS(){return ALIGN_ROWS;} + @Override + public final int maxReadLength(){return ALIGN_ROWS-1;} + @Override + final AbstractIndex index(){return index;} + @Override + final int CLEARZONE1(){return CLEARZONE1;} + + public BBMapThreadAcc(ConcurrentReadStreamInterface cris_, int keylen_, + boolean colorspace_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_, + int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_, + boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_, + float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_, + RTextOutputStream3 
outStream_, RTextOutputStream3 outStreamMapped_, RTextOutputStream3 outStreamUnmapped_, RTextOutputStream3 outStreamBlack_, + boolean translateToBaseSpace_, int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_, + int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_, + boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_, + boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_, + boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, + boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){ + + super(cris_, + outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_, + colorspace_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_, + AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, THRESH_, + minChrom_, maxChrom_, KFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_, + translateToBaseSpace_, REQUIRE_CORRECT_STRANDS_PAIRS_, + SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_, + MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_, + MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_, + QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_, + keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_, + BBIndexAcc.MIN_APPROX_HITS_TO_KEEP, BBIndexAcc.USE_EXTENDED_SCORE, + BBIndexAcc.BASE_HIT_SCORE, BBIndexAcc.USE_AFFINE_SCORE, BBIndexAcc.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_); + + assert(SLOW_ALIGN_PADDING>=0); + assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO"; + + if(SLOW_ALIGN || MAKE_MATCH_STRING){ +// 
msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, colorspace, MSA_TYPE); +// POINTS_MATCH=msa.POINTS_MATCH(); +// POINTS_MATCH2=msa.POINTS_MATCH2(); + CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2); + CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2); + CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2); + CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2); + CLEARZONE3=(int)(CLEARZONE_RATIO3*POINTS_MATCH2); +// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1; + }else{ +// POINTS_MATCH=70; +// POINTS_MATCH2=100; +// msa=null; + CLEARZONE1=0; + CLEARZONE1b=0; + CLEARZONE1c=0; + CLEARZONEP=0; + CLEARZONE3=0; +// CLEARZONE1e=0; + } + + CLEARZONE1b_CUTOFF_FLAT=CLEARZONE1b_CUTOFF_FLAT_RATIO*POINTS_MATCH2; + CLEARZONE1c_CUTOFF_FLAT=CLEARZONE1c_CUTOFF_FLAT_RATIO*POINTS_MATCH2; + INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3); + + index=new BBIndexAcc(KEYLEN, minChrom, maxChrom, KFILTER, msa); + } + + + public int trimList(ArrayList list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){ + if(list==null || list.size()==0){return -99999;} + if(list.size()==1){return list.get(0).score;} + + final int highestScore; + if(USE_AFFINE_SCORE){ + + highestScore=Tools.trimSiteList(list, .35f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + +// System.err.println("\nTrimming list of length "+list.size()+" vs highestScore "+highestScore+", maxScore "+maxScore+", specialcasePerfect="+specialCasePerfect); + + final int mstr2=(minSitesToRetain<=1 ? 
1 : minSitesToRetain+1); + if(highestScore==maxScore && specialCasePerfect){ + Tools.trimSiteList(list, .9f, retainPaired, true, mstr2, maxSitesToRetain); + if(list.size()>30){Tools.trimSiteList(list, .92f, retainPaired, true, mstr2, maxSitesToRetain);} + if(list.size()>60){Tools.trimSiteList(list, .94f, retainPaired, true, mstr2, maxSitesToRetain);} + if(list.size()>80){Tools.trimSiteList(list, .96f, retainPaired, true, mstr2, maxSitesToRetain);} + if(list.size()>120){Tools.trimSiteList(list, .97f, retainPaired, true, mstr2, maxSitesToRetain);} + if(list.size()>160){Tools.trimSiteList(list, .99f, retainPaired, true, mstr2, maxSitesToRetain);} + return highestScore; + } + + if(list.size()>4){Tools.trimSiteList(list, .4f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>6){Tools.trimSiteList(list, .45f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>8){Tools.trimSiteList(list, .5f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>12){Tools.trimSiteList(list, .55f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>20){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// 
System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>56){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>64){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>80){Tools.trimSiteList(list, .94f, retainPaired, true, mstr2, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>100){Tools.trimSiteList(list, .95f, retainPaired, true, mstr2, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>120){Tools.trimSiteList(list, .96f, retainPaired, true, mstr2, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>160){Tools.trimSiteList(list, .97f, retainPaired, true, mstr2, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>200){Tools.trimSiteList(list, .98f, retainPaired, true, mstr2, maxSitesToRetain);} +// System.out.print(", "+list.size()); + if(list.size()>240){Tools.trimSiteList(list, .99f, retainPaired, true, mstr2, maxSitesToRetain);} +// System.out.print(", "+list.size()); + + +// if(list.size()>4){Tools.trimSiteList(list, .4f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>8){Tools.trimSiteList(list, .45f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>12){Tools.trimSiteList(list, .5f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>16){Tools.trimSiteList(list, .55f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>20){Tools.trimSiteList(list, .6f, 
retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>24){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>32){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>48){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>64){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>128){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>256){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>512){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>2048){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>4096){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>8192){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>16000){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// if(list.size()>32000){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); +// 
if(list.size()>32000){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// System.out.print(", "+list.size()); + + + }else if(BBIndexAcc.USE_EXTENDED_SCORE){ + highestScore=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>8){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>36){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>64){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>80){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + + + }else{ + // System.out.print("\n\nSize:\t"+list.size()); + + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>12){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + 
if(list.size()>16){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .74f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>28){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>40){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + } + + return highestScore; + } + + + public void scoreSlow(final ArrayList list, final byte[] basesP, final byte[] basesM, + final int maxSwScore, final int maxImperfectSwScore){ + + int minMsaLimit; + if(PAIRED){ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore); + }else{ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore); + } + assert(Read.CHECKSITES(list, basesP, basesM, -1)); + + int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string + for(SiteScore ss : list){ + + final byte[] bases=(ss.strand==Gene.PLUS ? 
basesP : basesM); + + if(SEMIPERFECTMODE){ + assert(ss.stop-ss.start==bases.length-1); + assert(ss.semiperfect); + } + + if(verbose){System.err.println("Slow-scoring "+ss);} + if(ss.stop-ss.start!=bases.length-1){ + assert(ss.stop-ss.start>bases.length-1) : bases.length+", "+ss.toText(); + assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n"; + ss.slowScore=-1; + ss.semiperfect=false; + ss.perfect=false; + } + + final int swscoreNoIndel=ss.slowScore; + int[] swscoreArray=null; + + if(swscoreNoIndel4000){ + System.err.println(ss.toText()); + System.err.println(list.size()); + System.err.println(); + } + + int expectedLen=GapTools.calcGrefLen(ss); + if(expectedLen>=EXPECTED_LEN_LIMIT){ + //TODO: Alternately, I could kill the site. + ss.stop=ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT); + if(ss.gaps!=null){GapTools.fixGaps(ss);} + } + + int pad=SLOW_ALIGN_PADDING; + int minscore=Tools.max(swscoreNoIndel, minMsaLimit); + if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));} + swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore); + if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));} + + if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen=minscore && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){ + assert(swscoreArray.length==6) : swscoreArray.length; + assert(swscoreArray[0]>=minscore) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch; + ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null); + ss.fixXY(bases, true, msa); + }else{ss.match=null;} + } + if(swscoreArray!=null){ + if(verbose){System.err.println("msa returned 
"+Arrays.toString(swscoreArray));} + ss.slowScore=swscoreArray[0]; + ss.start=swscoreArray[1]; + ss.stop=swscoreArray[2]; + if(ss.gaps!=null){ + if(verbose){System.err.println("GapTools.fixGaps("+ss.start+", "+ss.stop+", "+Arrays.toString(ss.gaps)+", "+Shared.MINGAP);} + ss.gaps=GapTools.fixGaps(ss.start, ss.stop, ss.gaps, Shared.MINGAP); + } + }else{ + assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP); + assert(swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) : + swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+ + ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow + } + ss.score=ss.slowScore; + minMatch=Tools.max(minMatch, ss.slowScore); + minMsaLimit=Tools.max(minMsaLimit, ss.slowScore-CLEARZONE3); + assert(ss.slowScore<=maxSwScore); + assert(!(ss.perfect && ss.slowScore "+ss);} + } + } + + + public void calcStatistics1(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(OUTPUT_PAIRED_ONLY && r.mate!=null && !r.paired() && (r.mapped() || r.mate.mapped())){r.clearPairMapping();} + if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){ + ambiguousBestAlignment1++; + } + + int[] correctness=calcCorrectness(r, THRESH); + int correctGroup=correctness[0]; + int correctGroupSize=correctness[1]; + int numGroups=correctness[2]; + int elements=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + int sizeOfTopGroup=correctness[6]; + int numCorrect=correctness[7]; + boolean firstElementCorrect=(correctness[8]==1); + boolean firstElementCorrectLoose=(correctness[9]==1); + boolean firstGroupCorrectLoose=(correctness[10]==1); + + assert(elements>0 == r.mapped()); + + if(elements>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM1+=errors[0]; + matchCountS1+=errors[1]; + matchCountD1+=errors[2]; + matchCountI1+=errors[3]; + matchCountN1+=errors[4]; + } + + + 
mappedRetained1++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP1++; + }else{ + rescuedM1++; + } + } + if(r.paired()){ + numMated++; + int inner; + int outer; + if(r.start<=r.mate.start){ + inner=r.mate.start-r.stop; + outer=r.mate.stop-r.start; + }else{ + inner=r.start-r.mate.stop; + outer=r.stop-r.mate.start; + } + + inner=Tools.min(MAX_PAIR_DIST, inner); + inner=Tools.max(MIN_PAIR_DIST, inner); + innerLengthSum+=inner; + outerLengthSum+=outer; + insertSizeSum+=(inner+r.bases.length+r.mate.bases.length); + }else if(r.mate!=null && r.mate.mapped()/*&& r.list!=null && r.list.size()>0*/){ + badPairs++; + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch1++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP1++;} + else{truePositiveM1++;} + totalCorrectSites1+=numCorrect; + + if(correctGroup==1){ + if(sizeOfTopGroup==1){ + correctUniqueHit1++; + }else{ + correctMultiHit1++; + } + }else{ + correctLowHit1++; + } + + }else{ + + falsePositive1++; +// System.out.println("********"); +// System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + }else{ + noHit1++; + } + } + + + public void calcStatistics2(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){ + ambiguousBestAlignment2++; + } + + int[] correctness=calcCorrectness(r, THRESH); + int correctGroup=correctness[0]; + int correctGroupSize=correctness[1]; + int numGroups=correctness[2]; + int elements=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + int sizeOfTopGroup=correctness[6]; + int numCorrect=correctness[7]; + boolean firstElementCorrect=(correctness[8]==1); + boolean firstElementCorrectLoose=(correctness[9]==1); + boolean firstGroupCorrectLoose=(correctness[10]==1); + + 
if(elements>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM2+=errors[0]; + matchCountS2+=errors[1]; + matchCountD2+=errors[2]; + matchCountI2+=errors[3]; + matchCountN2+=errors[4]; + } + + mappedRetained2++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP2++; + }else{ + rescuedM2++; + } + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch2++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP2++;} + else{truePositiveM2++;} + totalCorrectSites2+=numCorrect; + + if(correctGroup==1){ + if(sizeOfTopGroup==1){ + correctUniqueHit2++; + }else{ + correctMultiHit2++; + } + }else{ + correctLowHit2++; + } + + }else{ + + falsePositive2++; +// System.out.println("********"); +// System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded2++; + }else{ + noHit2++; + } + } + + public void processRead(final Read r, final byte[] basesM){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final byte[] basesP=r.bases; + +// System.err.print(" rd#"+r.numericID+" "); +// if(r.numericID==25967){ +// verbose=true; +// msa.verbose=true; +// GapTools.verbose=true; +// index.verbose=true; +// tcr.verbose=true; +// } + + if(verbose){System.err.println("\nProcessing "+r);} + readsUsed++; + + final int maxPossibleQuickScore=quickMap(r, basesM); + if(verbose){System.err.println("\nQuick Map: \t"+r.sites);} + + if(maxPossibleQuickScore<0){ + r.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + return; + } + initialSiteSum1+=r.numSites(); + if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);} + + int maxSwScore=0; + int maxImperfectSwScore=0; + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + maxSwScore=msa.maxQuality(r.bases.length); + maxImperfectSwScore=msa.maxImperfectScore(r.bases.length); + } + + if(TRIM_LIST && r.numSites()>1){ + 
if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);} + int highestQuickScore=trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + if(verbose){System.err.println("\nAfter trim: \t"+r.sites);} + + assert(Read.CHECKSITES(r, basesM)); + + + if(SLOW_ALIGN && r.numSites()>0){ + + int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore); + + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + int numPerfectScores=0; + if(numNearPerfectScores>0){ + for(SiteScore ss : r.sites){ + if(ss.perfect){numPerfectScores++;} + else{break;} + } + } + + if(verbose){ + System.err.println("\nAfter scoreNoIndels: \t"+r.sites); + } + + if(numPerfectScores<2 && numNearPerfectScores<3){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);} + } + + if(verbose){ + System.err.println("\nAfter findTipDeletions: \t"+r.sites); + } + + //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length + //***Above note should be resolved now, but needs to be verified. 
+ + if(numPerfectScores<2 /*&& numNearPerfectScores<3*/){ + scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore); + } + + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){r.clearMapping();} + } + + if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);} + assert(Read.CHECKSITES(r, basesM)); + } + + + if(r.numSites()>0){ + mapped1++; + try { + Tools.mergeDuplicateSites(r.sites, true, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("\n\n"+r.toText(false)+"\n\n"); + } + Collections.sort(r.sites); + } + + if(r.numSites()>1){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r.sites.get(1); + //Ensure no duplicates + assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false); + } + assert(Read.CHECKSITES(r, basesM)); + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore); + } + + if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){ + int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.topSite().score1){ + + final int clearzone; + final int score=r.topSite().score; + if(r.perfect()){clearzone=CLEARZONEP;} + else{ + assert(scorecz1blimit){ +// clearzone=CLEARZONE1; + clearzone=(int)(((maxSwScore-score)*CLEARZONE1b+(score-cz1blimit)*CLEARZONE1)/(maxSwScore-cz1blimit)); + }else if(score>cz1climit){ +// clearzone=CLEARZONE1b; + clearzone=(int)(((cz1blimit-score)*CLEARZONE1c+(score-cz1climit)*CLEARZONE1b)/(cz1blimit-cz1climit)); + }else{ + clearzone=CLEARZONE1c; + } +// assert(false) : x+", "+cz1blimit+", "+cz1climit+", "+CLEARZONE1b_CUTOFF_FLAT+", "+clearzone; + } + + +// final int clearzone=r.perfect() ? CLEARZONEP : +// r.list.get(0).score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : +// (r.list.get(0).score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? 
(CLEARZONE1b_CUTOFF-)CLEARZONE1b : CLEARZONE1c); + int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r.setAmbiguous(b); + }else{ + final int lim=(r.perfect() ? 3*CLEARZONE_LIMIT1e : score+CLEARZONE1e>=maxSwScore ? 2*CLEARZONE_LIMIT1e : CLEARZONE_LIMIT1e)+1; + if(r.sites.size()>lim && clearzonelim){ + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r.setAmbiguous(b); + } + } + } + } + + if(verbose){System.err.println("A: "+r);} + + if((SLOW_ALIGN || BBIndex.USE_AFFINE_SCORE) && r.numSites()>0){ + int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.topSite().score1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("C: "+r);} + + //***$ + if(MAKE_MATCH_STRING && r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + if(r.sites.size()>1){ + assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n"; + } + int mapScore=r.mapScore; + + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true); + if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){ + SiteScore ss=r.topSite(); + r.mapScore=ss.score=ss.slowScore=ss.pairedScore=Tools.min(ss.score, -9999); + } + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + 
if(verbose){System.err.println("D: "+r);} + + //TODO: Fix this + // if(mapScore>r.mapScore){ + // System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID); + // } + r.topSite().score=r.topSite().slowScore; + while(r.sites.size()>1 && r.topSite().score1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("E: "+r);} + } + } + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + removeDuplicateBestSites(r); + } + if(r.numSites()>0){r.topSite().match=r.match;} + + + + if(r.sites!=null && r.mapScore<=0){//This came from BBMapThreadPacBio; not sure if needed for other modes + if(!Shared.anomaly){ + System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped."); + if(MSA.bandwidth>0 || MSA.bandwidthRatio>0){Shared.anomaly=true;} + } + r.mapScore=0; + r.setMapped(false); + r.sites=null; + } + + + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + }else if(r.mapScore<=0 && r.sites!=null){ + if(BANDWIDTH<1){ + if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + } + r.clearMapping(); + } + assert(r.sites==null || r.mapScore>0) : + "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+ + "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? 
r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+ + "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+ + "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n"; + +// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString(); + + if((CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP) && r.sites!=null && !r.ambiguous()){ + + assert(r.mapScore>0); + float cz3v2=(CLEARZONE3*Tools.min(1.1f, (maxSwScore/(float)r.mapScore))); + +// boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3); + boolean changed=applyClearzone3(r, (int)cz3v2, 1/cz3v2); + if(changed){ + int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.mapScoreCLEARZONE1 || CLEARZONE3>CLEARZONEP){ +// boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3); +// if(changed){ +// int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); +// if(r.mapScore pairs=new ArrayList(Tools.min(8, Tools.min(r.list.size(), r2.list.size()))); + + int maxPairedScore1=-1; + int maxPairedScore2=-1; + + +// for(SiteScore ss : r.list){ +// System.out.println(ss.toText()); +// } + +// int i=0, j=0; + final int ilimit=r.sites.size()-1; + final int jlimit=r2.sites.size()-1; + +// final int outerDistLimit=MIN_PAIR_DIST+r.bases.length+r2.bases.length; + final int outerDistLimit=(Tools.max(r.bases.length, r2.bases.length)*OUTER_DIST_MULT)/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 100 : 0); + final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? 
TIP_DELETION_SEARCH_RANGE : 0); + final int expectedFragLength=AVERAGE_PAIR_DIST+r.bases.length+r2.bases.length; + + int numPerfectPairs=0; + + for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){ + SiteScore ss1=r.sites.get(i); + SiteScore ss2=r2.sites.get(j); + + while(jinnerDistLimit))){ + j++; + ss2=r2.sites.get(j); + } + + for(int k=j; k<=jlimit; k++){ + ss2=r2.sites.get(k); + + if(ss2.chrom>ss1.chrom){break;} + if(ss2.start-ss1.stop>innerDistLimit){break;} + +// int dist=0; +// +// if(ss1.start<=ss2.start){ +// dist=ss2.start-ss1.stop; +// }else if(ss1.start>ss2.start){ +// dist=ss1.start-ss2.stop; +// } + + +// int innerdist=0; +// int outerdist=0; +// +// if(ss1.start<=ss2.start){ +// innerdist=ss2.start-ss1.stop; +// outerdist=ss2.stop-ss1.start; +// }else if(ss1.start>ss2.start){ +// innerdist=ss1.start-ss2.stop; +// outerdist=ss1.stop-ss2.start; +// } + + final int innerdist, outerdist; + //assert(!SAME_STRAND_PAIRS) : "TODO"; + + if(REQUIRE_CORRECT_STRANDS_PAIRS){ + if(ss1.strand!=ss2.strand){ + if(ss1.strand==Gene.PLUS){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + + assert(outerdist>=innerdist); + + if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){ + + boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS); + + if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){ + + boolean paired1=false, paired2=false; + + int deviation=absdif(AVERAGE_PAIR_DIST, innerdist); + + final int pairedScore1; + final int pairedScore2; + if(strandOK){ +// pairedScore1=ss1.score+ss2.score/2; +// 
pairedScore2=ss2.score+ss1.score/2; + + pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-((deviation*ss2.score)/(32*expectedFragLength+100))); + pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-((deviation*ss1.score)/(32*expectedFragLength+100))); + }else{//e.g. a junction + pairedScore1=ss1.score+ss2.score/16; + pairedScore2=ss2.score+ss1.score/16; + } + + if(pairedScore1>ss1.pairedScore){ + paired1=true; + ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); + maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); + // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText()); + }else{ + // System.out.println(ss1.toText()+" already paired."); + } + if(pairedScore2>ss2.pairedScore){ + paired2=true; + ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); + maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + + if(paired1 && paired2 && innerdist>0 && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){ + numPerfectPairs++; //Lower bound. Some perfect pairs may be the same. 
+ } + +// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); +// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); +// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); +// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + } + } + + } + + + + for(SiteScore ss : r.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + for(SiteScore ss : r2.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + + if(trim){ + if(numPerfectPairs>0){ +// System.out.print("."); + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN); + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN); + }else{ + if(r.sites.size()>4){ + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN); + } + if(r2.sites.size()>4){ + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN); + } + } + } + +// if(pairs.isEmpty()){return null;} +// +// ArrayList temp=new ArrayList(Tools.max(r.list.size(), r2.list.size())); +// +// for(SiteScore ss : r.list){ +// if(ss.score>maxPairedScore1){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.a); +// } +// r.list.clear(); +// r.list.addAll(temp); +// +// for(SiteScore ss : r2.list){ +// if(ss.score>maxPairedScore2){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.b); +// } +// r2.list.clear(); +// r2.list.addAll(temp); +// +// return pairs; + + return numPerfectPairs; + } + + + public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final Read r2=r.mate; + assert(r2!=null); + 
final byte[] basesP1=r.bases, basesP2=r2.bases; + + readsUsed++; + readsUsed2++; + + final int maxPossibleQuickScore1=quickMap(r, basesM1); + final int maxPossibleQuickScore2=quickMap(r2, basesM2); + + if(verbose){ + System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate); + } + + if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){ + r.sites=null; + r2.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + lowQualityReadsDiscarded2++; + r2.setDiscarded(true); + return; + } + + //Not really needed due to subsumption +// Tools.mergeDuplicateSites(r.list); +// Tools.mergeDuplicateSites(r2.list); + + initialSiteSum1+=r.numSites(); + initialSiteSum2+=r2.numSites(); + + //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used. + //Discards need to be tracked separately for each end. +// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;} + + final int maxSwScore1=msa.maxQuality(r.bases.length); + final int maxImperfectSwScore1=msa.maxImperfectScore(r.bases.length); + final int maxSwScore2=msa.maxQuality(r2.bases.length); + final int maxImperfectSwScore2=msa.maxImperfectScore(r2.bases.length); + + pairSiteScoresInitial(r, r2, TRIM_LIST); + if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(TRIM_LIST){ + + if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){ + if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r.sites);} + if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r2.sites);} + } + + trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + postTrimSiteSum2+=r2.numSites(); + + {//Reset score to non-paired score + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + 
if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + } + + if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);} + + if(SLOW_ALIGN){ + + if(r.numSites()>0){ + + int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores1<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);} + } + + //TODO: + //Note scoreSlow can be skipped under this circumstance: + //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites. + scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){r.clearMapping();} + } + Tools.mergeDuplicateSites(r.sites, true, true); + } + + if(r2.numSites()>0){ + int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores2<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);} + } + + scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r2.sites, index.MAX_INDEL); + if(r2.numSites()<1){r2.clearMapping();} + } + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + + if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(DO_RESCUE){ + int unpaired1=0; + int unpaired2=0; + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + 
"\n"+ss.toText()+"\n"+r.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired1++;} + } + } + if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + "\n"+ss.toText()+"\n"+r2.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired2++;} + } + } + + if(unpaired1>0 && r.numSites()>0){ + Collections.sort(r.sites); + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + if(unpaired2>0 && r2.numSites()>0){ + Collections.sort(r2.sites); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r.sites, true, true); + } + + postRescueSiteSum1+=r.numSites(); + postRescueSiteSum2+=r2.numSites(); + +// if(r.list!=null){Collections.sort(r.list);} +// if(r2.list!=null){Collections.sort(r2.list);} +// +// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); +// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + + if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + }else{ + Tools.mergeDuplicateSites(r.sites, true, false); + Tools.mergeDuplicateSites(r2.sites, true, false); + if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + + if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!) 
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, true, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + if(verbose){System.err.println("\nAfter final pairing:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0){ + mapped1++; + Collections.sort(r.sites); + } + if(r2.numSites()>0){ + mapped2++; + Collections.sort(r2.sites); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + r.setPerfectFlag(maxSwScore1); + r2.setPerfectFlag(maxSwScore2); + } + + + if(r.numSites()>1){ + final int clearzone=r.perfect() ? CLEARZONEP : + r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT) ? CLEARZONE1 : + (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r.setAmbiguous(b); + } + } + + if(r2.numSites()>1){ + final int clearzone=r2.perfect() ? 
CLEARZONEP : + r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF_SCALE-CLEARZONE1b_CUTOFF_FLAT) ? CLEARZONE1 : + (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF_SCALE-CLEARZONE1c_CUTOFF_FLAT) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites2=Tools.countTopScores(r2.sites, clearzone); + if(numBestSites2>1){ + //Ambiguous alignment + assert(r2.sites.size()>1); + + boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, SAVE_AMBIGUOUS_XY); + r2.setAmbiguous(b); + } + } + if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0 && r2.numSites()>0){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r2.topSite(); + if(canPair(ss1, ss2, r.bases.length, r2.bases.length, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n"; + assert(SLOW_ALIGN ? 
ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n"; + r.setPaired(true); + r.mate.setPaired(true); + } + } + + if(r.numSites()==0){r.sites=null;r.mapScore=0;} + if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;} + +// assert(Read.CHECKSITES(r, basesM));//***123 +// assert(Read.CHECKSITES(r2));//***123 + + r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + assert(checkTopSite(r)); // TODO remove this + if(KILL_BAD_PAIRS){ + if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + int x=r.mapScore/r.bases.length; + int y=r2.mapScore/r2.bases.length; + if(x>=y){ + r2.clearAnswers(false); + }else{ + r.clearAnswers(false); + } + } + } + if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(checkTopSite(r)); // TODO remove this + if(MAKE_MATCH_STRING){ + if(r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + assert(checkTopSite(r)); // TODO remove this + genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false); + assert(checkTopSite(r)); // TODO remove this + } + } + if(r2.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){ + r2.match=r2.topSite().match; + }else{ + genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false); + } + } + } + + assert(checkTopSite(r)); // TODO remove this + if(verbose){ + System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2); + if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);} + if(r2.match!=null && 
r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);} + } + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + }else if(r.mapScore<=0 && r.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + } + assert(checkTopSite(r)); // TODO remove this + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r2.mapScore>0 && r2.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + }else if(r2.mapScore<=0 && r2.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + } + + assert(r.sites==null || r.mapScore>0) : + r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? 
r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + assert(r2.sites==null || r2.mapScore>0) : + r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + + assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails."; + assert(checkTopSite(r)); // TODO remove this + removeDuplicateBestSites(r); + removeDuplicateBestSites(r2); + + if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){ + AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated); + } + assert(checkTopSite(r)); // TODO remove this + if(r.ambiguous() && AMBIGUOUS_TOSS){ + if(r.sites!=null){r.sites=null;} + r.clearSite(); + r.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + if(r2.ambiguous() && AMBIGUOUS_TOSS){ + if(r2.sites!=null){r2.sites=null;} + r2.clearSite(); + r2.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + + assert(checkTopSite(r)); + if(r.mapped() && (LOCAL_ALIGN || r.containsXY2())){ + final SiteScore ss=r.topSite(); + ss.match=r.match; + msa.toLocalAlignment(r, ss, r.containsXY2() ? 
1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + assert(checkTopSite(r2)); + if(r2.mapped() && (LOCAL_ALIGN || r2.containsXY2())){ + final SiteScore ss=r2.topSite(); + ss.match=r2.match; + msa.toLocalAlignment(r2, ss, r2.containsXY2() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + if(CALC_STATISTICS){ + calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1); + calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2); + } + } + +} diff --git a/current/align2/BBMapThreadPacBio.java b/current/align2/BBMapThreadPacBio.java new file mode 100755 index 0000000..9ce6ab8 --- /dev/null +++ b/current/align2/BBMapThreadPacBio.java @@ -0,0 +1,1477 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import stream.ConcurrentReadStreamInterface; +import stream.RTextOutputStream3; +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; + +/** + * Based on MapTestThread11f + * + * @author Brian Bushnell + * @date Jul 10, 2012 + * + */ +public final class BBMapThreadPacBio extends AbstractMapThread{ + + static final int ALIGN_COLUMNS=BBIndexPacBio.ALIGN_COLUMNS; + static final int ALIGN_ROWS=6020; + + + + /** Don't trim for local alignments unless at least this many bases will be clipped */ + private final int LOCAL_ALIGN_TIP_LENGTH=8; + /** Range is 0-1; a lower number makes trimming more aggressive */ + private final float LOCAL_ALIGN_MATCH_POINT_RATIO=0.75f; + + /** Ratio of the points for a match of a single base needed to declare unambiguous */ + public final float CLEARZONE_RATIOP=1.5f; + public final float CLEARZONE_RATIO1=2.2f; + public final float CLEARZONE_RATIO1b=2.8f; + public final float CLEARZONE_RATIO1c=4.8f; + public final float CLEARZONE_RATIO3=8f; + /** Max allowed number of sites within 1 edit (excluding primary site) */ + public final int CLEARZONE_LIMIT1e=4; + //public final int CLEARZONE1e; + public final int 
CLEARZONEP; + public final int CLEARZONE1; + public final int CLEARZONE1b; + public final int CLEARZONE1c; + public final int CLEARZONE3; + public final float INV_CLEARZONE3; + public final float CLEARZONE1b_CUTOFF=0.92f; + public final float CLEARZONE1c_CUTOFF=0.82f; + + public final BBIndexPacBio index; + + + private final int MIN_TRIM_SITES_TO_RETAIN_SINGLE=3; + private final int MIN_TRIM_SITES_TO_RETAIN_PAIRED=2; + private int MAX_TRIM_SITES_TO_RETAIN=800; + + public static void setExpectedSites(int x){ + System.err.println("Warning: EXPECTED_SITES is not valid for "+(new Object() { }.getClass().getEnclosingClass().getName())); + } + + @Override + public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;} + @Override + public final int ALIGN_ROWS(){return ALIGN_ROWS;} + @Override + public final int maxReadLength(){return ALIGN_ROWS-1;} + @Override + final AbstractIndex index(){return index;} + @Override + final int CLEARZONE1(){return CLEARZONE1;} + + public BBMapThreadPacBio(ConcurrentReadStreamInterface cris_, int keylen_, + boolean colorspace_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_, + int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_, + boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_, + float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_, + RTextOutputStream3 outStream_, RTextOutputStream3 outStreamMapped_, RTextOutputStream3 outStreamUnmapped_, RTextOutputStream3 outStreamBlack_, + boolean translateToBaseSpace_, int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_, + int MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_, + boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_, + boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int 
TIP_DELETION_SEARCH_RANGE_, + boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, + boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){ + + super(cris_, + outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_, + colorspace_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_, + AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, THRESH_, + minChrom_, maxChrom_, KFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_, + translateToBaseSpace_, REQUIRE_CORRECT_STRANDS_PAIRS_, + SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_, + MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_, + MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_, + QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_, + keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_, + BBIndexPacBio.MIN_APPROX_HITS_TO_KEEP, BBIndexPacBio.USE_EXTENDED_SCORE, + BBIndexPacBio.BASE_HIT_SCORE, BBIndexPacBio.USE_AFFINE_SCORE, BBIndexPacBio.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_); + + assert(SLOW_ALIGN_PADDING>=0); + assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO"; + + if(SLOW_ALIGN || MAKE_MATCH_STRING){ +// msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, colorspace, MSA_TYPE); +// POINTS_MATCH=msa.POINTS_MATCH(); +// POINTS_MATCH2=msa.POINTS_MATCH2(); + CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2); + CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2); + CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2); + CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2); + CLEARZONE3=(int)(CLEARZONE_RATIO3*POINTS_MATCH2); +// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1; + }else{ +// POINTS_MATCH=70; +// POINTS_MATCH2=100; +// msa=null; + 
CLEARZONE1=0; + CLEARZONE1b=0; + CLEARZONE1c=0; + CLEARZONEP=0; + CLEARZONE3=0; +// CLEARZONE1e=0; + } + INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3); + + index=new BBIndexPacBio(KEYLEN, minChrom, maxChrom, KFILTER, msa); + } + + + public int trimList(ArrayList list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){ + if(list==null || list.size()==0){return -99999;} + if(list.size()==1){return list.get(0).score;} + + final int highestScore; + if(USE_AFFINE_SCORE){ + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(highestScore==maxScore && specialCasePerfect){ + Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + return highestScore; + } + + final int mstr2=(minSitesToRetain<=1 ? 1 : minSitesToRetain+1); + +// if(list.size()>6){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>10){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>14){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>18){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// //// System.out.print(", "+list.size()); +// if(list.size()>22){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// //// System.out.print(", "+list.size()); +// if(list.size()>26){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +//// // System.out.print(", "+list.size()); +// if(list.size()>34){Tools.trimSiteList(list, .95f, retainPaired, 
true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +// if(list.size()>42){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +// if(list.size()>50){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); +//// if(list.size()>64){Tools.trimSiteList(list, .995f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + + if(list.size()>4){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>8){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>12){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// //// System.out.print(", "+list.size()); + if(list.size()>20){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// //// System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .9f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .97f, retainPaired, true, mstr2, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .99f, retainPaired, true, mstr2, maxSitesToRetain);} + // System.out.print(", "+list.size()); +// if(list.size()>64){Tools.trimSiteList(list, .995f, retainPaired, true, minSitesToRetain, 
maxSitesToRetain);} + // System.out.print(", "+list.size()); + + + }else if(USE_EXTENDED_SCORE){ + highestScore=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>8){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>36){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>64){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>80){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + + + }else{ + // System.out.print("\n\nSize:\t"+list.size()); + + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>12){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + 
// System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .74f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>28){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>40){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + } + + return highestScore; + } + + + public void scoreSlow(final ArrayList list, final byte[] basesP, final byte[] basesM, + final int maxSwScore, final int maxImperfectSwScore){ + + int minMsaLimit; + if(PAIRED){ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore); + }else{ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore); + } + assert(Read.CHECKSITES(list, basesP, basesM, -1)); + + int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string + for(SiteScore ss : list){ + + final byte[] bases=(ss.strand==Gene.PLUS ? 
basesP : basesM); + + if(SEMIPERFECTMODE){ + assert(ss.stop-ss.start==bases.length-1); + assert(ss.semiperfect); + } + + if(verbose){System.err.println("Slow-scoring "+ss);} + if(ss.stop-ss.start!=bases.length-1){ + assert(ss.stop-ss.start>bases.length-1) : bases.length+", "+ss.toText(); + assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n"; + ss.slowScore=-1; + ss.semiperfect=false; + ss.perfect=false; + } + + final int swscoreNoIndel=ss.slowScore; + int[] swscoreArray=null; + + if(swscoreNoIndel4000){ + System.err.println(ss.toText()); + System.err.println(list.size()); + System.err.println(); + } + + int expectedLen=GapTools.calcGrefLen(ss); + if(expectedLen>=EXPECTED_LEN_LIMIT){ + //TODO: Alternately, I could kill the site. + ss.stop=ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT); + if(ss.gaps!=null){GapTools.fixGaps(ss);} + } + + int pad=SLOW_ALIGN_PADDING; + int minscore=Tools.max(swscoreNoIndel, minMsaLimit); + if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));} + swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore); + if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));} + + if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen=minscore && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){ + assert(swscoreArray.length==6) : swscoreArray.length; + assert(swscoreArray[0]>=minscore) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch; + ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null); + ss.fixXY(bases, true, msa); + }else{ss.match=null;} + } + if(swscoreArray!=null){ + if(verbose){System.err.println("msa returned 
"+Arrays.toString(swscoreArray));} + ss.slowScore=swscoreArray[0]; + ss.start=swscoreArray[1]; + ss.stop=swscoreArray[2]; + if(ss.gaps!=null){ + if(verbose){System.err.println("GapTools.fixGaps("+ss.start+", "+ss.stop+", "+Arrays.toString(ss.gaps)+", "+Shared.MINGAP);} + ss.gaps=GapTools.fixGaps(ss.start, ss.stop, ss.gaps, Shared.MINGAP); + } + }else{ + assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP); + assert(swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) : + swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+ + ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow + } + ss.score=ss.slowScore; + minMatch=Tools.max(minMatch, ss.slowScore); + minMsaLimit=Tools.max(minMsaLimit, ss.slowScore-CLEARZONE3); + assert(ss.slowScore<=maxSwScore); + assert(!(ss.perfect && ss.slowScore "+ss);} + } + } + + + public void calcStatistics1(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(OUTPUT_PAIRED_ONLY && r.mate!=null && !r.paired() && (r.mapped() || r.mate.mapped())){r.clearPairMapping();} + if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){ + ambiguousBestAlignment1++; + } + + int[] correctness=calcCorrectness(r, THRESH); + int correctGroup=correctness[0]; + int correctGroupSize=correctness[1]; + int numGroups=correctness[2]; + int elements=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + int sizeOfTopGroup=correctness[6]; + int numCorrect=correctness[7]; + boolean firstElementCorrect=(correctness[8]==1); + boolean firstElementCorrectLoose=(correctness[9]==1); + boolean firstGroupCorrectLoose=(correctness[10]==1); + + assert(elements>0 == r.mapped()); + + if(elements>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM1+=errors[0]; + matchCountS1+=errors[1]; + matchCountD1+=errors[2]; + matchCountI1+=errors[3]; + matchCountN1+=errors[4]; + } + + + 
mappedRetained1++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP1++; + }else{ + rescuedM1++; + } + } + if(r.paired()){ + numMated++; + int inner; + int outer; + if(r.start<=r.mate.start){ + inner=r.mate.start-r.stop; + outer=r.mate.stop-r.start; + }else{ + inner=r.start-r.mate.stop; + outer=r.stop-r.mate.start; + } + + inner=Tools.min(MAX_PAIR_DIST, inner); + inner=Tools.max(MIN_PAIR_DIST, inner); + innerLengthSum+=inner; + outerLengthSum+=outer; + insertSizeSum+=(inner+r.bases.length+r.mate.bases.length); + }else if(r.mate!=null && r.mate.mapped()/*&& r.list!=null && r.list.size()>0*/){ + badPairs++; + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch1++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP1++;} + else{truePositiveM1++;} + totalCorrectSites1+=numCorrect; + + if(correctGroup==1){ + if(sizeOfTopGroup==1){ + correctUniqueHit1++; + }else{ + correctMultiHit1++; + } + }else{ + correctLowHit1++; + } + + }else{ + + falsePositive1++; +// System.out.println("********"); +// System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + }else{ + noHit1++; + } + } + + + public void calcStatistics2(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(r.ambiguous() && (AMBIGUOUS_TOSS || r.mapped())){ + ambiguousBestAlignment2++; + } + + int[] correctness=calcCorrectness(r, THRESH); + int correctGroup=correctness[0]; + int correctGroupSize=correctness[1]; + int numGroups=correctness[2]; + int elements=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + int sizeOfTopGroup=correctness[6]; + int numCorrect=correctness[7]; + boolean firstElementCorrect=(correctness[8]==1); + boolean firstElementCorrectLoose=(correctness[9]==1); + boolean firstGroupCorrectLoose=(correctness[10]==1); + + 
if(elements>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM2+=errors[0]; + matchCountS2+=errors[1]; + matchCountD2+=errors[2]; + matchCountI2+=errors[3]; + matchCountN2+=errors[4]; + } + + mappedRetained2++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP2++; + }else{ + rescuedM2++; + } + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch2++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP2++;} + else{truePositiveM2++;} + totalCorrectSites2+=numCorrect; + + if(correctGroup==1){ + if(sizeOfTopGroup==1){ + correctUniqueHit2++; + }else{ + correctMultiHit2++; + } + }else{ + correctLowHit2++; + } + + }else{ + + falsePositive2++; +// System.out.println("********"); +// System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded2++; + }else{ + noHit2++; + } + } + + public void processRead(final Read r, final byte[] basesM){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final byte[] basesP=r.bases; + +// System.err.print(" rd#"+r.numericID+" "); +// if(r.numericID==25967){ +// verbose=true; +// msa.verbose=true; +// GapTools.verbose=true; +// index.verbose=true; +// tcr.verbose=true; +// } + + if(verbose){System.err.println("\nProcessing "+r);} + readsUsed++; + + final int maxPossibleQuickScore=quickMap(r, basesM); + if(verbose){System.err.println("\nQuick Map: \t"+r.sites);} + + if(maxPossibleQuickScore<0){ + r.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + return; + } + initialSiteSum1+=r.numSites(); + if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);} + + int maxSwScore=0; + int maxImperfectSwScore=0; + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + maxSwScore=msa.maxQuality(r.bases.length); + maxImperfectSwScore=msa.maxImperfectScore(r.bases.length); + } + + if(TRIM_LIST && r.numSites()>1){ + 
if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);} + int highestQuickScore=trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + if(verbose){System.err.println("\nAfter trim: \t"+r.sites);} + + assert(Read.CHECKSITES(r, basesM)); + + + if(SLOW_ALIGN && r.numSites()>0){ + + int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore); + + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + +// int numPerfectScores=0; +// if(numNearPerfectScores>0){ +// for(SiteScore ss : r.list){ +// if(ss.perfect){numPerfectScores++;} +// else{break;} +// } +// } + + if(verbose){ + System.err.println("\nAfter scoreNoIndels: \t"+r.sites); + } + + if(numNearPerfectScores<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);} + } + + if(verbose){ + System.err.println("\nAfter findTipDeletions: \t"+r.sites); + } + + //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length + //***Above note should be resolved now, but needs to be verified. 
+ + if(numNearPerfectScores<1){ + scoreSlow(r.sites, basesP, basesM, maxSwScore, maxImperfectSwScore); + } + + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){r.clearMapping();} + } + + if(verbose){System.err.println("\nAfter scoreSlow: \t"+r.sites);} + assert(Read.CHECKSITES(r, basesM)); + } + + + if(r.numSites()>0){ + mapped1++; + try { + Tools.mergeDuplicateSites(r.sites, true, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("\n\n"+r.toText(false)+"\n\n"); + } + Collections.sort(r.sites); + } + + if(r.numSites()>1){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r.sites.get(1); + //Ensure no duplicates + assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false); + } + assert(Read.CHECKSITES(r, basesM)); + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);} + + if(r.numSites()>1){ + final int clearzone=r.perfect() ? CLEARZONEP : + r.topSite().score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : + (r.topSite().score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? 
CLEARZONE1b : CLEARZONE1c); + final int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, false); + r.setAmbiguous(b); + } + } + + if(verbose){System.err.println("A: "+r);} + + if((SLOW_ALIGN || USE_AFFINE_SCORE) && r.numSites()>0){ + int lim=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.topSite().score0) : r.sites+", "+r.mapScore+"\n"+r; + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("C: "+r);} + + //***$ + if(MAKE_MATCH_STRING && r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + if(r.sites.size()>1){ + assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n"; + } + int mapScore=r.mapScore; + + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true); + if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){ + SiteScore ss=r.topSite(); + r.mapScore=ss.score=ss.slowScore=ss.pairedScore=Tools.min(ss.score, -9999); + } + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("D: "+r);} + + //TODO: Fix this + // if(mapScore>r.mapScore){ + // System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID); + // } + r.topSite().score=r.topSite().slowScore; + 
while(r.sites.size()>1 && r.topSite().score1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("E: "+r);} + } + } + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + removeDuplicateBestSites(r); + } + if(r.numSites()>0){r.topSite().match=r.match;} + + + + if(r.sites!=null && r.mapScore<=0){ + if(!Shared.anomaly){ + System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped."); + if(MSA.bandwidth>0 || MSA.bandwidthRatio>0){Shared.anomaly=true;} + } + r.mapScore=0; + r.setMapped(false); + r.sites=null; + } + + + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + }else if(r.mapScore<=0 && r.sites!=null){ + if(BANDWIDTH<1){ + if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + } + r.clearMapping(); + } + assert(r.sites==null || r.mapScore>0) : + "\n\n**************************************** ERROR ***********************************************\n\n" + + "mapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+ + "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+ + "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? 
r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+ + "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n"; + +// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString(); + + + + if(CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP){ + boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3); + if(changed){ + int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.mapScore pairs=new ArrayList(Tools.min(8, Tools.min(r.list.size(), r2.list.size()))); + + int maxPairedScore1=-1; + int maxPairedScore2=-1; + + +// for(SiteScore ss : r.list){ +// System.out.println(ss.toText()); +// } + +// int i=0, j=0; + final int ilimit=r.sites.size()-1; + final int jlimit=r2.sites.size()-1; + +// final int outerDistLimit=MIN_PAIR_DIST+r.bases.length+r2.bases.length; + final int outerDistLimit=(Tools.max(r.bases.length, r2.bases.length)*OUTER_DIST_MULT)/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 100 : 0); + final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? 
TIP_DELETION_SEARCH_RANGE : 0); + final int expectedFragLength=AVERAGE_PAIR_DIST+r.bases.length+r2.bases.length; + + int numPerfectPairs=0; + + for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){ + SiteScore ss1=r.sites.get(i); + SiteScore ss2=r2.sites.get(j); + + while(jinnerDistLimit))){ + j++; + ss2=r2.sites.get(j); + } + + for(int k=j; k<=jlimit; k++){ + ss2=r2.sites.get(k); + + if(ss2.chrom>ss1.chrom){break;} + if(ss2.start-ss1.stop>innerDistLimit){break;} + +// int dist=0; +// +// if(ss1.start<=ss2.start){ +// dist=ss2.start-ss1.stop; +// }else if(ss1.start>ss2.start){ +// dist=ss1.start-ss2.stop; +// } + + +// int innerdist=0; +// int outerdist=0; +// +// if(ss1.start<=ss2.start){ +// innerdist=ss2.start-ss1.stop; +// outerdist=ss2.stop-ss1.start; +// }else if(ss1.start>ss2.start){ +// innerdist=ss1.start-ss2.stop; +// outerdist=ss1.stop-ss2.start; +// } + + final int innerdist, outerdist; + //assert(!SAME_STRAND_PAIRS) : "TODO"; + + if(REQUIRE_CORRECT_STRANDS_PAIRS){ + if(ss1.strand!=ss2.strand){ + if(ss1.strand==Gene.PLUS){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + + assert(outerdist>=innerdist); + + if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){ + + boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS); + + if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){ + + boolean paired1=false, paired2=false; + + int deviation=absdif(AVERAGE_PAIR_DIST, innerdist); + + final int pairedScore1; + final int pairedScore2; + if(strandOK){ +// pairedScore1=ss1.score+ss2.score/2; +// 
pairedScore2=ss2.score+ss1.score/2; + + pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-((deviation*ss2.score)/(32*expectedFragLength+100))); + pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-((deviation*ss1.score)/(32*expectedFragLength+100))); + }else{//e.g. a junction + pairedScore1=ss1.score+ss2.score/16; + pairedScore2=ss2.score+ss1.score/16; + } + + if(pairedScore1>ss1.pairedScore){ + paired1=true; + ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); + maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); + // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText()); + }else{ + // System.out.println(ss1.toText()+" already paired."); + } + if(pairedScore2>ss2.pairedScore){ + paired2=true; + ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); + maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + + if(paired1 && paired2 && innerdist>0 && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){ + numPerfectPairs++; //Lower bound. Some perfect pairs may be the same. 
+ } + +// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); +// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); +// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); +// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + } + } + + } + + + + for(SiteScore ss : r.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + for(SiteScore ss : r2.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + + if(trim){ + if(numPerfectPairs>0){ +// System.out.print("."); + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN); + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.94f), false, true, 1, MAX_TRIM_SITES_TO_RETAIN); + }else{ + if(r.sites.size()>4){ + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN); + } + if(r2.sites.size()>4){ + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.9f), true, true, 1, MAX_TRIM_SITES_TO_RETAIN); + } + } + } + +// if(pairs.isEmpty()){return null;} +// +// ArrayList temp=new ArrayList(Tools.max(r.list.size(), r2.list.size())); +// +// for(SiteScore ss : r.list){ +// if(ss.score>maxPairedScore1){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.a); +// } +// r.list.clear(); +// r.list.addAll(temp); +// +// for(SiteScore ss : r2.list){ +// if(ss.score>maxPairedScore2){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.b); +// } +// r2.list.clear(); +// r2.list.addAll(temp); +// +// return pairs; + + return numPerfectPairs; + } + + + public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final Read r2=r.mate; + assert(r2!=null); + 
final byte[] basesP1=r.bases, basesP2=r2.bases; + + readsUsed++; + readsUsed2++; + + final int maxPossibleQuickScore1=quickMap(r, basesM1); + final int maxPossibleQuickScore2=quickMap(r2, basesM2); + + if(verbose){ + System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate); + } + + if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){ + r.sites=null; + r2.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + lowQualityReadsDiscarded2++; + r2.setDiscarded(true); + return; + } + + //Not really needed due to subsumption +// Tools.mergeDuplicateSites(r.list); +// Tools.mergeDuplicateSites(r2.list); + + initialSiteSum1+=r.numSites(); + initialSiteSum2+=r2.numSites(); + + //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used. + //Discards need to be tracked separately for each end. +// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;} + + final int maxSwScore1=msa.maxQuality(r.bases.length); + final int maxImperfectSwScore1=msa.maxImperfectScore(r.bases.length); + final int maxSwScore2=msa.maxQuality(r2.bases.length); + final int maxImperfectSwScore2=msa.maxImperfectScore(r2.bases.length); + + pairSiteScoresInitial(r, r2, TRIM_LIST); + if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(TRIM_LIST){ + + if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){ + if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r.sites);} + if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){Collections.sort(r2.sites);} + } + + trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + postTrimSiteSum2+=r2.numSites(); + + {//Reset score to non-paired score + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + 
if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + } + + if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);} + + if(SLOW_ALIGN){ + + if(r.numSites()>0){ + + int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores1<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1);} + } + + //TODO: + //Note scoreSlow can be skipped under this circumstance: + //When rescue is disabled, numNearPerfectScores>0, and there are no paired sites. + scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){ + r.clearMapping(); + } + } + Tools.mergeDuplicateSites(r.sites, true, true); + } + + if(r2.numSites()>0){ + int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores2<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2);} + } + + scoreSlow(r2.sites, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r2.sites, index.MAX_INDEL); + if(r2.numSites()<1){r2.clearMapping();} + } + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + + if(verbose){System.err.println("\nAfter slow align:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(DO_RESCUE){ + int unpaired1=0; + int unpaired2=0; + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + 
"\n"+ss.toText()+"\n"+r.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired1++;} + } + } + if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + "\n"+ss.toText()+"\n"+r2.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired2++;} + } + } + + if(unpaired1>0 && r.numSites()>0){ + Collections.sort(r.sites); + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + if(unpaired2>0 && r2.numSites()>0){ + Collections.sort(r2.sites); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r.sites, true, true); + } + + postRescueSiteSum1+=r.numSites(); + postRescueSiteSum2+=r2.numSites(); + +// if(r.list!=null){Collections.sort(r.list);} +// if(r2.list!=null){Collections.sort(r2.list);} +// +// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); +// Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + + if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + }else{ + Tools.mergeDuplicateSites(r.sites, true, false); + Tools.mergeDuplicateSites(r2.sites, true, false); + if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + + if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!) 
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, true, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + if(verbose){System.err.println("\nAfter final pairing:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0){ + mapped1++; + Collections.sort(r.sites); + } + if(r2.numSites()>0){ + mapped2++; + Collections.sort(r2.sites); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + r.setPerfectFlag(maxSwScore1); + r2.setPerfectFlag(maxSwScore2); + } + + + if(r.numSites()>1){ + final int clearzone=r.perfect() ? CLEARZONEP : + r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : + (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, false); + r.setAmbiguous(b); + } + } + + if(r2.numSites()>1){ + final int clearzone=r2.perfect() ? CLEARZONEP : + r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF) ? 
CLEARZONE1 : + (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites2=Tools.countTopScores(r2.sites, clearzone); + if(numBestSites2>1){ + //Ambiguous alignment + assert(r2.sites.size()>1); + + boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, false); + r2.setAmbiguous(b); + } + } + if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0 && r2.numSites()>0){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r2.topSite(); + if(canPair(ss1, ss2, r.bases.length, r2.bases.length, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n"; + assert(SLOW_ALIGN ? ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n"; + r.setPaired(true); + r.mate.setPaired(true); + } + } + + if(r.numSites()==0){r.sites=null;r.mapScore=0;} + if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;} + +// assert(Read.CHECKSITES(r, basesM));//***123 +// assert(Read.CHECKSITES(r2));//***123 + + r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + assert(checkTopSite(r)); // TODO remove this + if(KILL_BAD_PAIRS){ + if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + int x=r.mapScore/r.bases.length; + int y=r2.mapScore/r2.bases.length; + if(x>=y){ + r2.clearAnswers(false); + }else{ + r.clearAnswers(false); + } + } + } + if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(r2.sites==null || r2.mapScore>0) : 
r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(checkTopSite(r)); // TODO remove this + if(MAKE_MATCH_STRING){ + if(r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + assert(checkTopSite(r)); // TODO remove this + genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false); + assert(checkTopSite(r)); // TODO remove this + } + } + if(r2.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){ + r2.match=r2.topSite().match; + }else{ + genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false); + } + } + } + + assert(checkTopSite(r)); // TODO remove this + if(verbose){ + System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2); + if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);} + if(r2.match!=null && r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);} + } + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + }else if(r.mapScore<=0 && r.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + } + assert(checkTopSite(r)); // TODO remove this + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. 
+ if(r2.mapScore>0 && r2.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + }else if(r2.mapScore<=0 && r2.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + } + + assert(r.sites==null || r.mapScore>0) : + r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + assert(r2.sites==null || r2.mapScore>0) : + r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? 
r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + + assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails."; + assert(checkTopSite(r)); // TODO remove this + removeDuplicateBestSites(r); + removeDuplicateBestSites(r2); + + if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){ + AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated); + } + assert(checkTopSite(r)); // TODO remove this + if(r.ambiguous() && AMBIGUOUS_TOSS){ + if(r.sites!=null){r.sites=null;} + r.clearSite(); + r.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + if(r2.ambiguous() && AMBIGUOUS_TOSS){ + if(r2.sites!=null){r2.sites=null;} + r2.clearSite(); + r2.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + + assert(checkTopSite(r)); + if(r.mapped() && (LOCAL_ALIGN || r.containsXY2())){ + final SiteScore ss=r.topSite(); + ss.match=r.match; + msa.toLocalAlignment(r, ss, r.containsXY2() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + assert(checkTopSite(r2)); + if(r2.mapped() && (LOCAL_ALIGN || r2.containsXY2())){ + final SiteScore ss=r2.topSite(); + ss.match=r2.match; + msa.toLocalAlignment(r2, ss, r2.containsXY2() ? 
1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + if(CALC_STATISTICS){ + calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1); + calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2); + } + } + +} diff --git a/current/align2/BBMapThreadPacBioSkimmer.java b/current/align2/BBMapThreadPacBioSkimmer.java new file mode 100755 index 0000000..20aa402 --- /dev/null +++ b/current/align2/BBMapThreadPacBioSkimmer.java @@ -0,0 +1,1653 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import stream.ConcurrentReadStreamInterface; +import stream.RTextOutputStream3; +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.Data; +import dna.Gene; + +/** + * Based on MapTestThread11f + * Designed to skim and retain all sites above a threshold. + * @author Brian Bushnell + * @date Jul 10, 2012 + * + */ +public final class BBMapThreadPacBioSkimmer extends AbstractMapThread{ + + static final int ALIGN_COLUMNS=BBIndexPacBioSkimmer.ALIGN_COLUMNS; + static final int ALIGN_ROWS=4020; + + + + /** Don't trim for local alignments unless at least this many bases will be clipped */ + private final int LOCAL_ALIGN_TIP_LENGTH=8; + /** Range is 0-1; a lower number makes trimming more aggressive */ + private final float LOCAL_ALIGN_MATCH_POINT_RATIO=0.75f; + + /** Ratio of the points for a match of a single base needed to declare unambiguous */ + public final float CLEARZONE_RATIOP=1.5f; + public final float CLEARZONE_RATIO1=2.2f; + public final float CLEARZONE_RATIO1b=2.8f; + public final float CLEARZONE_RATIO1c=4.8f; + public final float CLEARZONE_RATIO3=8f; + /** Max allowed number of sites within 1 edit (excluding primary site) */ + public final int CLEARZONE_LIMIT1e=4; + //public final int CLEARZONE1e; + public final int CLEARZONEP; + public final int CLEARZONE1; + public final int CLEARZONE1b; + public final int CLEARZONE1c; + public final int 
CLEARZONE3; + public final float INV_CLEARZONE3; + public final float CLEARZONE1b_CUTOFF=0.92f; + public final float CLEARZONE1c_CUTOFF=0.82f; + + public final BBIndexPacBioSkimmer index; + + + private static int MIN_TRIM_SITES_TO_RETAIN_SINGLE=2; + private static int MIN_TRIM_SITES_TO_RETAIN_PAIRED=1; + private static int MAX_TRIM_SITES_TO_RETAIN=400; + + /** TODO - perhaps I can rewrite cz3 to penalize reads that map similarly to more than the expected number of places */ + public static final boolean USE_CLEARZONE3=false; + + private static int EXPECTED_SITES=1; + public static void setExpectedSites(int x){ + EXPECTED_SITES=x; + MIN_TRIM_SITES_TO_RETAIN_SINGLE=Tools.max((int)(EXPECTED_SITES*4)+1, MIN_TRIM_SITES_TO_RETAIN_SINGLE); + MIN_TRIM_SITES_TO_RETAIN_PAIRED=Tools.max((int)(EXPECTED_SITES*4)+1, MIN_TRIM_SITES_TO_RETAIN_PAIRED); + MAX_TRIM_SITES_TO_RETAIN=EXPECTED_SITES*40+80; + } + + @Override + public final int ALIGN_COLUMNS(){return ALIGN_COLUMNS;} + @Override + public final int ALIGN_ROWS(){return ALIGN_ROWS;} + @Override + public final int maxReadLength(){return ALIGN_ROWS-1;} + @Override + final AbstractIndex index(){return index;} + @Override + final int CLEARZONE1(){return CLEARZONE1;} + + public BBMapThreadPacBioSkimmer(ConcurrentReadStreamInterface cris_, int keylen_, + boolean colorspace_, boolean SMITH_WATERMAN_, int THRESH_, int minChrom_, + int maxChrom_, float keyDensity_, float maxKeyDensity_, float minKeyDensity_, int maxDesiredKeys_, + boolean REMOVE_DUPLICATE_BEST_ALIGNMENTS_, boolean SAVE_AMBIGUOUS_XY_, + float MINIMUM_ALIGNMENT_SCORE_RATIO_, boolean TRIM_LIST_, boolean MAKE_MATCH_STRING_, boolean QUICK_MATCH_STRINGS_, + RTextOutputStream3 outStream_, RTextOutputStream3 outStreamMapped_, RTextOutputStream3 outStreamUnmapped_, RTextOutputStream3 outStreamBlack_, + boolean translateToBaseSpace_, int SLOW_ALIGN_PADDING_, int SLOW_RESCUE_PADDING_, boolean DONT_OUTPUT_UNMAPPED_READS_, boolean DONT_OUTPUT_BLACKLISTED_READS_, + int 
MAX_SITESCORES_TO_PRINT_, boolean PRINT_SECONDARY_ALIGNMENTS_, + boolean REQUIRE_CORRECT_STRANDS_PAIRS_, boolean SAME_STRAND_PAIRS_, boolean KILL_BAD_PAIRS_, boolean RCOMP_MATE_, + boolean PERFECTMODE_, boolean SEMIPERFECTMODE_, boolean FORBID_SELF_MAPPING_, int TIP_DELETION_SEARCH_RANGE_, + boolean AMBIGUOUS_RANDOM_, boolean AMBIGUOUS_ALL_, int KFILTER_, boolean TRIM_LEFT_, boolean TRIM_RIGHT_, boolean UNTRIM_, byte TRIM_QUAL_, + boolean LOCAL_ALIGN_, boolean RESCUE_, boolean STRICT_MAX_INDEL_, String MSA_TYPE_){ + + super(cris_, + outStream_, outStreamMapped_, outStreamUnmapped_, outStreamBlack_, + colorspace_, SMITH_WATERMAN_, LOCAL_ALIGN_, REMOVE_DUPLICATE_BEST_ALIGNMENTS_, + AMBIGUOUS_RANDOM_, AMBIGUOUS_ALL_, TRIM_LEFT_, TRIM_RIGHT_, UNTRIM_, TRIM_QUAL_, THRESH_, + minChrom_, maxChrom_, KFILTER_, KILL_BAD_PAIRS_, SAVE_AMBIGUOUS_XY_, + translateToBaseSpace_, REQUIRE_CORRECT_STRANDS_PAIRS_, + SAME_STRAND_PAIRS_, RESCUE_, STRICT_MAX_INDEL_, SLOW_ALIGN_PADDING_, SLOW_RESCUE_PADDING_, + MSA_TYPE_, keylen_, PERFECTMODE_, SEMIPERFECTMODE_, FORBID_SELF_MAPPING_, RCOMP_MATE_, + MAKE_MATCH_STRING_, DONT_OUTPUT_UNMAPPED_READS_, DONT_OUTPUT_BLACKLISTED_READS_, PRINT_SECONDARY_ALIGNMENTS_, + QUICK_MATCH_STRINGS_, MAX_SITESCORES_TO_PRINT_, MINIMUM_ALIGNMENT_SCORE_RATIO_, + keyDensity_, maxKeyDensity_, minKeyDensity_, maxDesiredKeys_, + BBIndexPacBioSkimmer.MIN_APPROX_HITS_TO_KEEP, BBIndexPacBioSkimmer.USE_EXTENDED_SCORE, + BBIndexPacBioSkimmer.BASE_HIT_SCORE, BBIndexPacBioSkimmer.USE_AFFINE_SCORE, BBIndexPacBioSkimmer.MAX_INDEL, TRIM_LIST_, TIP_DELETION_SEARCH_RANGE_); + + assert(SLOW_ALIGN_PADDING>=0); + assert(!(RCOMP_MATE/* || FORBID_SELF_MAPPING*/)) : "RCOMP_MATE: TODO"; + + if(SLOW_ALIGN || MAKE_MATCH_STRING){ +// msa=MSA.makeMSA(ALIGN_ROWS, ALIGN_COLUMNS, colorspace, MSA_TYPE); +// POINTS_MATCH=msa.POINTS_MATCH(); +// POINTS_MATCH2=msa.POINTS_MATCH2(); + CLEARZONE1=(int)(CLEARZONE_RATIO1*POINTS_MATCH2); + CLEARZONE1b=(int)(CLEARZONE_RATIO1b*POINTS_MATCH2); + 
CLEARZONE1c=(int)(CLEARZONE_RATIO1c*POINTS_MATCH2); + CLEARZONEP=(int)(CLEARZONE_RATIOP*POINTS_MATCH2); + CLEARZONE3=(int)(CLEARZONE_RATIO3*POINTS_MATCH2); +// CLEARZONE1e=(int)(2*POINTS_MATCH2-POINTS_MATCH-msa.POINTS_SUB())+1; + }else{ +// POINTS_MATCH=70; +// POINTS_MATCH2=100; +// msa=null; + CLEARZONE1=0; + CLEARZONE1b=0; + CLEARZONE1c=0; + CLEARZONEP=0; + CLEARZONE3=0; +// CLEARZONE1e=0; + } + INV_CLEARZONE3=(CLEARZONE3==0 ? 0 : 1f/CLEARZONE3); + + index=new BBIndexPacBioSkimmer(KEYLEN, minChrom, maxChrom, KFILTER, msa); + } + + + public int trimList(ArrayList list, boolean retainPaired, int maxScore, boolean specialCasePerfect, int minSitesToRetain, int maxSitesToRetain){ + if(list==null || list.size()==0){return -99999;} + if(list.size()==1){return list.get(0).score;} + + boolean b=(list.size()>=minSitesToRetain); + + final int highestScore; + if(USE_AFFINE_SCORE){ + + Tools.trimSiteList(list, .10f, retainPaired, true, EXPECTED_SITES<3 ? EXPECTED_SITES : EXPECTED_SITES*2, maxSitesToRetain); + + highestScore=Tools.trimSiteList(list, .25f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(highestScore==maxScore && specialCasePerfect){ + Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + if(list.size()>8){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + return highestScore; + } + + final int mstr2=(minSitesToRetain<=1 ? 
1 : minSitesToRetain+1); + final int mstr3=(minSitesToRetain+1)/2; + + assert(!b || list.size()>=EXPECTED_SITES); + + int N=(EXPECTED_SITES+3)/4; + + if(list.size()>2*N){ + Tools.trimSiteListByMax(list, (int)(.12f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } + + if(list.size()>3*N){ + Tools.trimSiteList(list, .28f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.14f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } + + if(list.size()>4*N){ + Tools.trimSiteList(list, .32f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.16f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } + + if(list.size()>5*N){ + Tools.trimSiteList(list, .36f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.17f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } + + if(list.size()>6*N){ + Tools.trimSiteList(list, .38f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.18f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } + + if(list.size()>8*N){ + Tools.trimSiteList(list, .40f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.20f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } +// // System.out.print(", "+list.size()); + if(list.size()>12*N){ + Tools.trimSiteList(list, .45f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.22f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } +// // System.out.print(", "+list.size()); + if(list.size()>16*N){ + Tools.trimSiteList(list, .50f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.24f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } +// // System.out.print(", "+list.size()); + if(list.size()>20*N){ + Tools.trimSiteList(list, .55f, 
retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.26f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } + + if(list.size()>24*N){ + Tools.trimSiteList(list, .60f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.28f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } +// //// System.out.print(", "+list.size()); + if(list.size()>32*N){ + Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + Tools.trimSiteListByMax(list, (int)(.30f*maxScore), retainPaired, true, mstr3, maxSitesToRetain); + } +// //// System.out.print(", "+list.size()); + if(list.size()>40*N){Tools.trimSiteList(list, .70f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} +// // System.out.print(", "+list.size()); + if(list.size()>48*N){Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>64*N){Tools.trimSiteList(list, .80f, retainPaired, true, mstr2, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>80*N){Tools.trimSiteList(list, .85f, retainPaired, true, mstr2, maxSitesToRetain);} + if(list.size()>96*N){Tools.trimSiteList(list, .90f, retainPaired, true, mstr2, maxSitesToRetain);} + + assert(!b || list.size()>=EXPECTED_SITES); + + + }else if(BBIndexPacBioSkimmer.USE_EXTENDED_SCORE){ + highestScore=Tools.trimSiteList(list, .75f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>8){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", 
"+list.size()); + if(list.size()>36){Tools.trimSiteList(list, .92f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>40){Tools.trimSiteList(list, .94f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>64){Tools.trimSiteList(list, .98f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>80){Tools.trimSiteList(list, .99f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + + + }else{ + // System.out.print("\n\nSize:\t"+list.size()); + + + highestScore=Tools.trimSiteList(list, .6f, retainPaired, true, minSitesToRetain, maxSitesToRetain); + + if(list.size()>12){Tools.trimSiteList(list, .65f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>16){Tools.trimSiteList(list, .7f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>24){Tools.trimSiteList(list, .74f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>28){Tools.trimSiteList(list, .8f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>32){Tools.trimSiteList(list, .85f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + if(list.size()>48){Tools.trimSiteList(list, .90f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", 
"+list.size()); + // if(list.size()>40){Tools.trimSiteList(list, .95f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>48){Tools.trimSiteList(list, .96f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + //// System.out.print(", "+list.size()); + // if(list.size()>56){Tools.trimSiteList(list, .97f, retainPaired, true, minSitesToRetain, maxSitesToRetain);} + // System.out.print(", "+list.size()); + } + + return highestScore; + } + + + /** + * Slow-scores the sites in {@code list}. A site sent to {@code msa.fillAndScoreLimited} that gets a non-null result has its + * {@code slowScore}, {@code start} and {@code stop} replaced from the returned array; afterwards {@code score} is set to {@code slowScore}. + * A site whose span differs from the read length first has {@code slowScore} reset to -1 and its perfect/semiperfect flags cleared. + * The score floor {@code minMsaLimit} is {@code -CLEARZONE1e} plus {@code maxSwScore} scaled by + * {@code MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE} when {@code PAIRED}, otherwise by {@code MINIMUM_ALIGNMENT_SCORE_RATIO}. + * NOTE(review): the condition guarding the aligner call (it begins with swscoreNoIndel) looks truncated in this copy; restore it from upstream before relying on this description. + * @param list sites to score; modified in place + * @param basesP bases used for sites on {@code Gene.PLUS} + * @param basesM bases used for sites on any other strand + * @param maxSwScore scaled to derive the score floor, and asserted as an upper bound on each slow score + * @param maxImperfectSwScore in the visible text only echoed in an assertion message -- TODO confirm against upstream + */ + public void scoreSlow(final ArrayList list, final byte[] basesP, final byte[] basesM, + final int maxSwScore, final int maxImperfectSwScore){ + + final int minMsaLimit; + if(PAIRED){ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE*maxSwScore); + }else{ + minMsaLimit=-CLEARZONE1e+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore); + } + assert(Read.CHECKSITES(list, basesP, basesM, -1)); + + int minMatch=Tools.max(-300, minMsaLimit-CLEARZONE3); //Score must exceed this to generate quick match string + for(SiteScore ss : list){ + + final byte[] bases=(ss.strand==Gene.PLUS ? basesP : basesM); + + if(SEMIPERFECTMODE){ + assert(ss.stop-ss.start==bases.length-1); + assert(ss.semiperfect); + } + + if(verbose){System.err.println("Slow-scoring "+ss);} + if(ss.stop-ss.start!=bases.length-1){ + assert(ss.stop-ss.start>bases.length-1) : bases.length+", "+ss.toText(); + assert(!ss.semiperfect) : "\n"+bases.length+", "+ss.toText()+", "+ss.perfect+", "+ss.semiperfect+", "+maxSwScore+"\n"+new String(basesP)+"\n"; + ss.slowScore=-1; + ss.semiperfect=false; + ss.perfect=false; + } + + final int swscoreNoIndel=ss.slowScore; + int[] swscoreArray=null; + + if(swscoreNoIndel4000){ + System.err.println(ss.toText()); + System.err.println(list.size()); + System.err.println(); + } + + int expectedLen=GapTools.calcGrefLen(ss); + if(expectedLen>=EXPECTED_LEN_LIMIT){ + //TODO: Alternately, I could kill the site.
+ ss.stop=ss.start+Tools.min(basesP.length+40, EXPECTED_LEN_LIMIT); + if(ss.gaps!=null){GapTools.fixGaps(ss);} + } + + int pad=SLOW_ALIGN_PADDING; + int minscore=Tools.max(swscoreNoIndel, minMsaLimit); + if(verbose){System.err.println("Sent to msa with start="+ss.start+", stop="+ss.stop+", pad="+pad+", limit="+minscore+", gaps="+GapTools.toString(ss.gaps));} + swscoreArray=msa.fillAndScoreLimited(bases, ss, pad, minscore); + if(verbose){System.err.println("Received "+Arrays.toString(swscoreArray));} + + if(swscoreArray!=null && swscoreArray.length>6 && (swscoreArray[3]+swscoreArray[4]+expectedLen=minscore && (PRINT_SECONDARY_ALIGNMENTS || (USE_SS_MATCH_FOR_PRIMARY && swscoreArray[0]>minMatch))){ + assert(swscoreArray.length==6) : swscoreArray.length; + assert(swscoreArray[0]>=minscore) : "\n"+Arrays.toString(swscoreArray)+"\n"+minscore+"\n"+minMatch; + ss.match=msa.traceback(bases, Data.getChromosome(ss.chrom).array, ss.start-pad, ss.stop+pad, swscoreArray[3], swscoreArray[4], swscoreArray[5], ss.gaps!=null); + ss.fixXY(bases, true, msa); + }else{ss.match=null;} + } + if(swscoreArray!=null){ + if(verbose){System.err.println("msa returned "+Arrays.toString(swscoreArray));} + ss.slowScore=swscoreArray[0]; + ss.start=swscoreArray[1]; + ss.stop=swscoreArray[2]; + if(ss.gaps!=null){ + if(verbose){System.err.println("GapTools.fixGaps("+ss.start+", "+ss.stop+", "+Arrays.toString(ss.gaps)+", "+Shared.MINGAP);} + ss.gaps=GapTools.fixGaps(ss.start, ss.stop, ss.gaps, Shared.MINGAP); + } + }else{ + assert(swscoreNoIndel<=maxSwScore) : swscoreNoIndel+", "+maxImperfectSwScore+", "+maxSwScore+", "+new String(basesP); + assert(swscoreNoIndel==-1 || msa.scoreNoIndels(bases, ss.chrom, ss.start)==swscoreNoIndel) : + swscoreNoIndel+" != "+msa.scoreNoIndels(bases, ss.chrom, ss.start)+"\n"+ + ss.toText()+"\n"+(ss.stop-ss.start)+", "+bases.length; //Slow + } + ss.score=ss.slowScore; + minMatch=Tools.max(minMatch, ss.slowScore); +// minMsaLimit=Tools.max(minMsaLimit, 
ss.slowScore-CLEARZONE3); + assert(ss.slowScore<=maxSwScore); + assert(!(ss.perfect && ss.slowScore "+ss);} + } + } + + + /** + * Accumulates mapping statistics for {@code r} into the counters suffixed with 1 ({@code totalNumCorrect1}, {@code mappedRetained1}, {@code perfectMatch1}, ...). + * When {@code OUTPUT_PAIRED_ONLY} is set and {@code r} has a mate, is not paired, and either read is mapped, the pair mapping is cleared first. + * For a read with sites it also updates the match counts from {@code r.countErrors()}, the rescue counters and, for paired reads, the pair-distance sums; + * a read without sites appears to be counted as {@code lowQualityReadsDiscarded1} (and marked discarded) when {@code maxPossibleQuickScore==-1}, otherwise as {@code noHit1}. + * NOTE(review): the array layout originally documented here -- {group of correct hit (or -1), size of correct group, number of groups, + * number of elements, correctScore, maxScore, size of top group, num correct, firstElementCorrect, + * firstElementCorrectLoose, firstGroupCorrectLoose} -- does not match the array this method reads; the layout actually consumed is listed in the block comment inside the body. + * @param r read whose sites are evaluated via {@code calcCorrectnessSkimmer(r, THRESH)} + * @param maxSwScore when positive, a top-site {@code slowScore} equal to this value is counted as a perfect match + * @param maxPossibleQuickScore see the no-site case above + */ + + public void calcStatistics1(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + if(OUTPUT_PAIRED_ONLY && r.mate!=null && !r.paired() && (r.mapped() || r.mate.mapped())){r.clearPairMapping();} + + /* {number of correct (loose) sites, number of incorrect (loose) sites, number incorrect sites before last correct site, + * number of sites, correctScore, maxScore, firstElementCorrect, firstElementCorrectLoose, position of first correct element (or -1), + * sizeOfTopGroup, numTopCorrect} */ + int[] correctness=calcCorrectnessSkimmer(r, THRESH); + + int numCorrect=correctness[0]; + int numIncorrect=correctness[1]; + int numIncorrectPrior=correctness[2]; + int numSites=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + boolean firstElementCorrect=(correctness[6]==1); + boolean firstElementCorrectLoose=(correctness[7]==1); + int positionOfFirstCorrect=correctness[8]; + int sizeOfTopGroup=correctness[9]; + int numTopCorrect=correctness[10]; + + assert(numSites==numCorrect+numIncorrect) : numSites+", "+numCorrect+", "+numIncorrect+", "+r.numSites(); + assert(numSites==r.numSites()); + + totalNumCorrect1+=numCorrect; + totalNumIncorrect1+=numIncorrect; + totalNumIncorrectPrior1+=numIncorrectPrior; + if(numCorrect>=EXPECTED_SITES){ + totalNumCapturedAllCorrect1++; +// assert(numCorrect==EXPECTED_CORRECT_SITES) : numCorrect +", "+EXPECTED_CORRECT_SITES+", "+r.list; + assert(r.sites.size()>=EXPECTED_SITES) : numCorrect; + if(numTopCorrect==numCorrect){ + totalNumCapturedAllCorrectTop1++; + if(numCorrect==numSites){ + totalNumCapturedAllCorrectOnly1++; + } + } + } + +
assert(numSites>0 == r.mapped()); + if(numSites>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM1+=errors[0]; + matchCountS1+=errors[1]; + matchCountD1+=errors[2]; + matchCountI1+=errors[3]; + matchCountN1+=errors[4]; + } + + + mappedRetained1++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP1++; + }else{ + rescuedM1++; + } + } + if(r.paired()){ + numMated++; + int inner; + int outer; + if(r.start<=r.mate.start){ + inner=r.mate.start-r.stop; + outer=r.mate.stop-r.start; + }else{ + inner=r.start-r.mate.stop; + outer=r.stop-r.mate.start; + } + + inner=Tools.min(MAX_PAIR_DIST, inner); + inner=Tools.max(MIN_PAIR_DIST, inner); + innerLengthSum+=inner; + outerLengthSum+=outer; + insertSizeSum+=(inner+r.bases.length+r.mate.bases.length); + }else if(r.mate!=null && r.mate.mapped()/*&& r.list!=null && r.list.size()>0*/){ + badPairs++; + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch1++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP1++;} + else{truePositiveM1++;} + totalCorrectSites1+=numCorrect; + + if(positionOfFirstCorrect==0){ + correctUniqueHit1++; + }else{ + correctLowHit1++; + } + + }else{ + + falsePositive1++; +// System.out.println("********"); +// System.out.println(r.toText(false)); +// System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + }else{ + noHit1++; + } + } + + + /** + * Accumulates mapping statistics for {@code r} into the counters suffixed with 2 ({@code totalNumCorrect2}, {@code mappedRetained2}, {@code perfectMatch2}, ...). + * Follows the same outline as {@code calcStatistics1}, but in the visible text it does not call {@code clearPairMapping()}, + * does not update the pair-distance sums, and does not mark low-quality reads as discarded. + * NOTE(review): the assertion after {@code else if(SLOW_ALIGN)} looks truncated in this copy; restore it from upstream. + * @param r read whose sites are evaluated via {@code calcCorrectnessSkimmer(r, THRESH)} + * @param maxSwScore when positive, a top-site {@code slowScore} equal to this value is counted as a perfect match + * @param maxPossibleQuickScore -1 appears to make a read without sites count as {@code lowQualityReadsDiscarded2} instead of {@code noHit2} + */ + public void calcStatistics2(final Read r, final int maxSwScore, final int maxPossibleQuickScore){ + + int[] correctness=calcCorrectnessSkimmer(r, THRESH); + + int numCorrect=correctness[0]; + int numIncorrect=correctness[1]; + int numIncorrectPrior=correctness[2]; + int numSites=correctness[3]; + int correctScore=correctness[4]; + int topScore=correctness[5]; + boolean firstElementCorrect=(correctness[6]==1); + boolean firstElementCorrectLoose=(correctness[7]==1); + int
positionOfFirstCorrect=correctness[8]; + int sizeOfTopGroup=correctness[9]; + int numTopCorrect=correctness[10]; + + totalNumCorrect2+=numCorrect; + totalNumIncorrect2+=numIncorrect; + totalNumIncorrectPrior2+=numIncorrectPrior; + if(numCorrect>=EXPECTED_SITES){ + totalNumCapturedAllCorrect2++; + if(numTopCorrect==numCorrect){ + totalNumCapturedAllCorrectTop2++; + if(numCorrect==numSites){ + totalNumCapturedAllCorrectOnly2++; + } + } + } + + if(numSites>0){ + + if(r.match!=null){ + int[] errors=r.countErrors(); + matchCountM2+=errors[0]; + matchCountS2+=errors[1]; + matchCountD2+=errors[2]; + matchCountI2+=errors[3]; + matchCountN2+=errors[4]; + } + + mappedRetained2++; + if(r.rescued()){ + if(r.strand()==Gene.PLUS){ + rescuedP2++; + }else{ + rescuedM2++; + } + } + + if(r.perfect() || (maxSwScore>0 && r.topSite().slowScore==maxSwScore)){ + perfectMatch2++; + }else if(SLOW_ALIGN){ + assert(r.topSite().slowScore0){ + + if(r.strand()==Gene.PLUS){truePositiveP2++;} + else{truePositiveM2++;} + totalCorrectSites2+=numCorrect; + + if(positionOfFirstCorrect==0){ + correctUniqueHit2++; + }else{ + correctLowHit2++; + } + + }else{ + + falsePositive2++; + // System.out.println("********"); + // System.out.println(r.toText(false)); + // System.out.println(r.mate.toText(false)); + } + }else if(maxPossibleQuickScore==-1){ + lowQualityReadsDiscarded2++; + }else{ + noHit2++; + } + } + + /** + * Maps a single read and records the result on {@code r}. + * Visible outline: skip the read when {@code idmodulo>1} and {@code r.numericID%idmodulo!=1}; run {@code quickMap}; + * discard the read if the quick score is negative; optionally {@code trimList}; when {@code SLOW_ALIGN}, run {@code scoreNoIndels} + * and (if no near-perfect score was found and {@code FIND_TIP_DELETIONS} is set) {@code findTipDeletions}; merge duplicate and remove overlapping sites; + * flag ambiguity via {@code processAmbiguous}; drop low-quality sites; call {@code setFromTopSite}; and, when {@code MAKE_MATCH_STRING}, generate the match string. + * NOTE(review): several spans of this method look truncated in this copy (for example the condition beginning with numNearPerfectScores), so this outline is incomplete. + * @param r read to map; its {@code sites}, {@code mapScore}, {@code match} and flags are updated in place + * @param basesM passed to {@code quickMap} and {@code scoreNoIndels} alongside {@code r.bases} -- presumably the minus-strand bases; TODO confirm + */ + public void processRead(final Read r, final byte[] basesM){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final byte[] basesP=r.bases; + +// System.err.print(" rd#"+r.numericID+" "); +// if(r.numericID==25967){ +// verbose=true; +// msa.verbose=true; +// GapTools.verbose=true; +// index.verbose=true; +// tcr.verbose=true; +// } + + if(verbose){System.err.println("\nProcessing "+r);} + readsUsed++; + + final int maxPossibleQuickScore=quickMap(r, basesM); + if(verbose){System.err.println("\nQuick Map: \t"+r.sites);} + + if(maxPossibleQuickScore<0){ + r.sites=null; +
lowQualityReadsDiscarded1++; + r.setDiscarded(true); + return; + } + initialSiteSum1+=r.numSites(); + if(verbose){System.err.println("\ninitialSiteSum1: "+initialSiteSum1);} + + int maxSwScore=0; + int maxImperfectSwScore=0; + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + maxSwScore=msa.maxQuality(r.bases.length); + maxImperfectSwScore=msa.maxImperfectScore(r.bases.length); + } + + if(TRIM_LIST && r.numSites()>1){ + if(MIN_TRIM_SITES_TO_RETAIN_SINGLE>1){Collections.sort(r.sites);} + int highestQuickScore=trimList(r.sites, false, maxSwScore, true, MIN_TRIM_SITES_TO_RETAIN_SINGLE, MAX_TRIM_SITES_TO_RETAIN); + } + postTrimSiteSum1+=r.numSites(); + if(verbose){System.err.println("\nAfter trim: \t"+r.sites);} + + assert(Read.CHECKSITES(r, basesM)); + + + if(SLOW_ALIGN && r.numSites()>0){ + Tools.subsumeOverlappingSites(r.sites, true, false); + int numNearPerfectScores=scoreNoIndels(r, basesP, basesM, maxSwScore, maxImperfectSwScore); + + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + +// int numPerfectScores=0; +// if(numNearPerfectScores>0){ +// for(SiteScore ss : r.list){ +// if(ss.perfect){numPerfectScores++;} +// else{break;} +// } +// } + + if(verbose){ + System.err.println("\nAfter scoreNoIndels: \t"+r.sites); + } + + if(numNearPerfectScores<1){ + if(FIND_TIP_DELETIONS){findTipDeletions(r, basesP, basesM, maxSwScore, maxImperfectSwScore);} + } + + if(verbose){ + System.err.println("\nAfter findTipDeletions: \t"+r.sites); + } + + //TODO: This causes problems with perfect matches that are mapped to areas longer than the read length + //***Above note should be resolved now, but needs to be verified. 
+ + if(numNearPerfectScores0){ + mapped1++; + try { + Tools.mergeDuplicateSites(r.sites, true, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("\n\n"+r.toText(false)+"\n\n"); + } + try { + Tools.removeOverlappingSites(r.sites, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("\n\n"+r.toText(false)+"\n\n"); + } + Collections.sort(r.sites); + assert(Read.CHECKSITES(r, basesM)); + } + if(verbose){System.err.println("\nAfter merge: \t"+r.sites);} + + if(r.numSites()>1){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r.sites.get(1); + //Ensure no duplicates + assert(ss1.chrom!=ss2.chrom || ss1.strand!=ss2.strand || ss1.start!=ss2.start || ss1.stop!=ss2.stop) : r.toText(false); + } + assert(Read.CHECKSITES(r, basesM)); + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){r.setPerfectFlag(maxSwScore);} + + if(r.numSites()>1){ + final int clearzone=r.perfect() ? CLEARZONEP : + r.topSite().score>=(int)(maxSwScore*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : + (r.topSite().score>=(int)(maxSwScore*CLEARZONE1c_CUTOFF) ? 
CLEARZONE1b : CLEARZONE1c); + final int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, false); + r.setAmbiguous(b); + } + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesUnpaired2(r.sites, maxSwScore, MINIMUM_ALIGNMENT_SCORE_RATIO, EXPECTED_SITES); + if(r.numSites()>1){ + SiteScore a=r.topSite(); + SiteScore b=r.sites.get(1); + assert(a.score>=b.score); + assert(a.score>=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)) : a; + assert(b.score>=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO)) : "\n"+a+"\t"+b+"\n"+(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + } + } + if(verbose){System.err.println("\nAfter removal: \t"+r.sites);} + + if(r.numSites()==0){r.sites=null;r.mapScore=0;} + r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + assert(Read.CHECKSITES(r, basesM)); + + assert(r.gaps==null || r.gaps[0]==r.start && r.gaps[r.gaps.length-1]==r.stop); + assert(r.sites==null || r.mapScore>0) : r.sites+", "+r.mapScore+"\n"+r; + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("C: "+r);} + + //***$ + if(MAKE_MATCH_STRING && r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + if(r.sites.size()>1){ + assert(r.topSite().score>=r.sites.get(1).score) : "\n"+r.topSite().toText()+"\t<\t"+r.sites.get(1).toText()+"\n"+r.toText(false)+"\n"; + } + int mapScore=r.mapScore; + + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + genMatchString(r, basesP, basesM, maxImperfectSwScore, maxSwScore, true, true); + if(STRICT_MAX_INDEL && hasLongIndel(r.match, index.MAX_INDEL)){ + SiteScore ss=r.topSite(); + 
r.mapScore=ss.score=ss.slowScore=ss.pairedScore=Tools.min(ss.score, -9999); + } + assert(r.mate!=null || r.numSites()==0 || r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + } + + if(verbose){System.err.println("D: "+r);} + + //TODO: Fix this + // if(mapScore>r.mapScore){ + // System.err.println("genMatchString reduced mapping score: "+mapScore+" -> "+r.mapScore+" in read "+r.numericID); + // } + r.topSite().score=r.topSite().slowScore; + while(r.sites.size()>1 && r.topSite().score1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + assert(Read.CHECKSITES(r, basesM)); + } + + if(verbose){System.err.println("E: "+r);} + } + } + if(verbose){System.err.println("\nAfter match: \t"+r.sites);} + + if(r.numSites()>1){ + assert(r.topSite().score==r.topSite().slowScore) : "\n"+r.toText(false)+"\n"; + assert(r.topSite().score==r.mapScore) : "\n"+r.toText(false)+"\n"; + removeDuplicateBestSites(r); + } + if(r.numSites()>0){r.topSite().match=r.match;} + + + + if(r.sites!=null && r.mapScore<=0){ + if(!Shared.anomaly){ + System.err.println("Note: Read "+r.id+" failed cigar string generation and will be marked as unmapped."); + if(MSA.bandwidth>0 || MSA.bandwidthRatio>0){Shared.anomaly=true;} + } + r.mapScore=0; + r.setMapped(false); + r.sites=null; + } + + + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. 
+ if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + }else if(r.mapScore<=0 && r.sites!=null){ + if(BANDWIDTH<1){ + if(!Shared.anomaly){System.err.println("Anomaly1: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + } + r.clearMapping(); + } + assert(r.sites==null || r.mapScore>0) : + "\nmapScore = "+r.mapScore+"\nread = "+r.toText(false)+"\nscore thresh = "+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))+"\n"+ + "msa unlimited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), 0))+"\n"+ + "msa limited return = "+Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 10), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO*maxSwScore))))+"\n\n"+ + "msa vert limit: "+msa.showVertLimit()+"\n\nmsa horz limit: "+msa.showHorizLimit()+"\n\n"; + +// assert(r.list==null || r.mapScore>0) : r.mapScore+"\n"+r.list==null ? "null" : r.list.toString(); + + if(USE_CLEARZONE3 && (CLEARZONE3>CLEARZONE1 || CLEARZONE3>CLEARZONEP)){ + boolean changed=applyClearzone3(r, CLEARZONE3, INV_CLEARZONE3); + if(changed){ + int minScore=(int)(maxSwScore*MINIMUM_ALIGNMENT_SCORE_RATIO); + if(r.mapScore pairs=new ArrayList(Tools.min(8, Tools.min(r.list.size(), r2.list.size()))); + + int maxPairedScore1=-1; + int maxPairedScore2=-1; + + +// for(SiteScore ss : r.list){ +// System.out.println(ss.toText()); +// } + +// int i=0, j=0; + final int ilimit=r.sites.size()-1; + final int jlimit=r2.sites.size()-1; + +// final int outerDistLimit=MIN_PAIR_DIST+r.bases.length+r2.bases.length; + final int outerDistLimit=(Tools.max(r.bases.length, r2.bases.length)*OUTER_DIST_MULT)/OUTER_DIST_DIV;//-(SLOW_ALIGN ? 
100 : 0); + final int innerDistLimit=MAX_PAIR_DIST;//+(FIND_TIP_DELETIONS ? TIP_DELETION_SEARCH_RANGE : 0); + final int expectedFragLength=AVERAGE_PAIR_DIST+r.bases.length+r2.bases.length; + + int numPerfectPairs=0; + + for(int i=0, j=0; i<=ilimit && j<=jlimit; i++){ + SiteScore ss1=r.sites.get(i); + SiteScore ss2=r2.sites.get(j); + + while(jinnerDistLimit))){ + j++; + ss2=r2.sites.get(j); + } + + for(int k=j; k<=jlimit; k++){ + ss2=r2.sites.get(k); + + if(ss2.chrom>ss1.chrom){break;} + if(ss2.start-ss1.stop>innerDistLimit){break;} + +// int dist=0; +// +// if(ss1.start<=ss2.start){ +// dist=ss2.start-ss1.stop; +// }else if(ss1.start>ss2.start){ +// dist=ss1.start-ss2.stop; +// } + + +// int innerdist=0; +// int outerdist=0; +// +// if(ss1.start<=ss2.start){ +// innerdist=ss2.start-ss1.stop; +// outerdist=ss2.stop-ss1.start; +// }else if(ss1.start>ss2.start){ +// innerdist=ss1.start-ss2.stop; +// outerdist=ss1.stop-ss2.start; +// } + + final int innerdist, outerdist; + //assert(!SAME_STRAND_PAIRS) : "TODO"; + + if(REQUIRE_CORRECT_STRANDS_PAIRS){ + if(ss1.strand!=ss2.strand){ + if(ss1.strand==Gene.PLUS){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + }else{ + if(ss1.start<=ss2.start){ + innerdist=ss2.start-ss1.stop; + outerdist=ss2.stop-ss1.start; + }else{ + innerdist=ss1.start-ss2.stop; + outerdist=ss1.stop-ss2.start; + } + } + + assert(outerdist>=innerdist); + + if(outerdist>=outerDistLimit && innerdist<=innerDistLimit){ + + boolean strandOK=((ss1.strand==ss2.strand)==SAME_STRAND_PAIRS); + + if(strandOK || !REQUIRE_CORRECT_STRANDS_PAIRS){ + + boolean paired1=false, paired2=false; + + int deviation=absdif(AVERAGE_PAIR_DIST, innerdist); + + final int pairedScore1; + final int pairedScore2; + 
if(strandOK){ +// pairedScore1=ss1.score+ss2.score/2; +// pairedScore2=ss2.score+ss1.score/2; + + pairedScore1=ss1.score+1+Tools.max(1, ss2.score/2-((deviation*ss2.score)/(32*expectedFragLength+100))); + pairedScore2=ss2.score+1+Tools.max(1, ss1.score/2-((deviation*ss1.score)/(32*expectedFragLength+100))); + }else{//e.g. a junction + pairedScore1=ss1.score+ss2.score/16; + pairedScore2=ss2.score+ss1.score/16; + } + + if(pairedScore1>ss1.pairedScore){ + paired1=true; + ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); + maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); + // System.out.println("Paired "+ss1.toText()+" with "+ss2.toText()); + }else{ + // System.out.println(ss1.toText()+" already paired."); + } + if(pairedScore2>ss2.pairedScore){ + paired2=true; + ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); + maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + + if(paired1 && paired2 && innerdist>0 && deviation<=expectedFragLength && ss1.perfect && ss2.perfect){ + numPerfectPairs++; //Lower bound. Some perfect pairs may be the same. 
+ } + +// ss1.pairedScore=Tools.max(ss1.pairedScore, pairedScore1); +// ss2.pairedScore=Tools.max(ss2.pairedScore, pairedScore2); +// maxPairedScore1=Tools.max(ss1.score, maxPairedScore1); +// maxPairedScore2=Tools.max(ss2.score, maxPairedScore2); + } + } + } + + } + + + + for(SiteScore ss : r.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + for(SiteScore ss : r2.sites){ + if(ss.pairedScore>ss.score){ss.score=ss.pairedScore;} + else{assert(ss.pairedScore==0);} +// ss.score=ss.pairedScore=Tools.max(ss.pairedScore, ss.score); + } + + if(trim){ + if(numPerfectPairs>MIN_TRIM_SITES_TO_RETAIN_PAIRED){ +// System.out.print("."); + Collections.sort(r.sites); + Collections.sort(r2.sites); + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.1f), true, true, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.1f), true, true, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + }else{ + if(r.sites.size()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){ + Collections.sort(r.sites); + Tools.trimSitesBelowCutoff(r.sites, (int)(maxPairedScore1*.1f), true, true, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + } + if(r2.sites.size()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){ + Collections.sort(r2.sites); + Tools.trimSitesBelowCutoff(r2.sites, (int)(maxPairedScore2*.1f), true, true, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + } + } + } + +// if(pairs.isEmpty()){return null;} +// +// ArrayList temp=new ArrayList(Tools.max(r.list.size(), r2.list.size())); +// +// for(SiteScore ss : r.list){ +// if(ss.score>maxPairedScore1){temp.add(ss);} +// } +// for(SiteScorePair ssp : pairs){ +// temp.add(ssp.a); +// } +// r.list.clear(); +// r.list.addAll(temp); +// +// for(SiteScore ss : r2.list){ +// if(ss.score>maxPairedScore2){temp.add(ss);} +// } +// for(SiteScorePair ssp : 
pairs){ +// temp.add(ssp.b); +// } +// r2.list.clear(); +// r2.list.addAll(temp); +// +// return pairs; + + return numPerfectPairs; + } + + + public void processReadPair(final Read r, final byte[] basesM1, final byte[] basesM2){ + if(idmodulo>1 && r.numericID%idmodulo!=1){return;} + final Read r2=r.mate; + assert(r2!=null); + final byte[] basesP1=r.bases, basesP2=r2.bases; + + readsUsed++; + readsUsed2++; + + final int maxPossibleQuickScore1=quickMap(r, basesM1); + final int maxPossibleQuickScore2=quickMap(r2, basesM2); + + if(verbose){ + System.err.println("\nAfter quick map:\nRead1:\t"+r+"\nRead2:\t"+r.mate); + } + + if(maxPossibleQuickScore1<0 && maxPossibleQuickScore2<0){ + r.sites=null; + r2.sites=null; + lowQualityReadsDiscarded1++; + r.setDiscarded(true); + lowQualityReadsDiscarded2++; + r2.setDiscarded(true); + return; + } + + //Not really needed due to subsumption + Tools.mergeDuplicateSites(r.sites, true, false); + Tools.mergeDuplicateSites(r2.sites, true, false); + + initialSiteSum1+=r.numSites(); + initialSiteSum2+=r2.numSites(); + + //TODO: Fix this. This is a workaround for an assertion error counting the number of reads used. + //Discards need to be tracked separately for each end. +// if(maxPossibleQuickScore2<0){lowQualityReadsDiscarded--;} + + final int maxSwScore1=msa.maxQuality(r.bases.length); + final int maxImperfectSwScore1=msa.maxImperfectScore(r.bases.length); + final int maxSwScore2=msa.maxQuality(r2.bases.length); + final int maxImperfectSwScore2=msa.maxImperfectScore(r2.bases.length); + + //TODO: POSSIBLY block pairing across N blocks. 
+ pairSiteScoresInitial(r, r2, TRIM_LIST); + if(verbose){System.err.println("\nAfter initial pair:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(TRIM_LIST){ + + if(MIN_TRIM_SITES_TO_RETAIN_PAIRED>1){ + if(r.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){ + Collections.sort(r.sites); + trimList(r.sites, true, maxSwScore1, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + } + if(r2.numSites()>MIN_TRIM_SITES_TO_RETAIN_PAIRED){ + Collections.sort(r2.sites); + trimList(r2.sites, true, maxSwScore2, false, MIN_TRIM_SITES_TO_RETAIN_PAIRED, MAX_TRIM_SITES_TO_RETAIN); + } + } + } + postTrimSiteSum1+=r.numSites(); + postTrimSiteSum2+=r2.numSites(); + + {//Reset score to non-paired score + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.slowScore<=ss.quickScore); + ss.score=ss.quickScore; + } + } + } + + if(verbose){System.err.println("\nAfter trim:\nRead1:\t"+r.sites+"\nRead2:\t"+r2.sites);} + + if(SLOW_ALIGN){ + + if(r.numSites()>0){ + Tools.subsumeOverlappingSites(r.sites, true, false); + int numNearPerfectScores1=scoreNoIndels(r, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + Collections.sort(r.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores10, and there are no paired sites. 
+ scoreSlow(r.sites, basesP1, basesM1, maxSwScore1, maxImperfectSwScore1); + if(STRICT_MAX_INDEL){ + int removed=removeLongIndels(r.sites, index.MAX_INDEL); + if(r.numSites()==0){r.clearMapping();} + } + Tools.mergeDuplicateSites(r.sites, true, true); + } + + if(r2.numSites()>0){ + Tools.subsumeOverlappingSites(r2.sites, true, false); + int numNearPerfectScores2=scoreNoIndels(r2, basesP2, basesM2, maxSwScore2, maxImperfectSwScore2); + Collections.sort(r2.sites); //Puts higher scores first to better trigger the early exit based on perfect scores + + if(numNearPerfectScores2ss.quickScore || ss.pairedScore>ss.slowScore) : + "\n"+ss.toText()+"\n"+r.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired1++;} + } + } + if(r2.sites!=null){ + for(SiteScore ss : r2.sites){ + assert(ss.pairedScore<1 || ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : + "\n"+ss.toText()+"\n"+r2.toText(false)+"\n"; + if(ss.pairedScore==0){unpaired2++;} + } + } + + if(unpaired1>0 && r.numSites()>0){ + Collections.sort(r.sites); + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r, r2, basesP2, basesM2, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r2.sites, true, true); + } + + if(unpaired2>0 && r2.numSites()>0){ + Collections.sort(r2.sites); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + rescue(r2, r, basesP1, basesM1, Tools.min(MAX_PAIR_DIST, 2*AVERAGE_PAIR_DIST+100)); + Tools.mergeDuplicateSites(r.sites, true, true); + } + + postRescueSiteSum1+=r.numSites(); + postRescueSiteSum2+=r2.numSites(); + +// if(r.list!=null){Collections.sort(r.list);} +// if(r2.list!=null){Collections.sort(r2.list);} +// +// Tools.removeLowQualitySites(r.list, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); +// 
Tools.removeLowQualitySites(r2.list, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE, MINIMUM_ALIGNMENT_SCORE_RATIO_PRE_RESCUE); + + if(verbose){System.err.println("\nAfter rescue:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + }else{ + Tools.mergeDuplicateSites(r.sites, true, false); + Tools.mergeDuplicateSites(r2.sites, true, false); + if(verbose){System.err.println("\nAfter merge:\nRead1:\t"+r+"\nRead2:\t"+r2);} + } + + try { + Tools.removeOverlappingSites(r.sites, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("\n\n"+r.toText(false)+"\n\n"); + } + try { + Tools.removeOverlappingSites(r2.sites, true); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + throw new RuntimeException("\n\n"+r2.toText(false)+"\n\n"); + } + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + + if(false){//This block is optional, but increases correctness by a tiny bit. (or maybe not!) 
+ if(SLOW_ALIGN || USE_AFFINE_SCORE){ + Tools.removeLowQualitySitesPaired(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + Tools.removeLowQualitySitesPaired(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED); + } + + pairSiteScoresFinal(r, r2, false, false, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + + if(r.numSites()>1){Collections.sort(r.sites);} + if(r2.numSites()>1){Collections.sort(r2.sites);} + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ +// assert(false) : "Change to removeLowQualitySitesPaired2"; + //TODO Verify correctness + Tools.removeLowQualitySitesPaired2(r.sites, maxSwScore1, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, EXPECTED_SITES); + Tools.removeLowQualitySitesPaired2(r2.sites, maxSwScore2, MINIMUM_ALIGNMENT_SCORE_RATIO, MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED, EXPECTED_SITES); + } + + //TODO +// assert(false) : "Only verified up to this point."; + pairSiteScoresFinal(r, r2, false, true, MAX_PAIR_DIST, AVERAGE_PAIR_DIST, SAME_STRAND_PAIRS, REQUIRE_CORRECT_STRANDS_PAIRS, MAX_TRIM_SITES_TO_RETAIN); + + if(r.numSites()>0){ + mapped1++; + Collections.sort(r.sites); + } + if(r2.numSites()>0){ + mapped2++; + Collections.sort(r2.sites); + } + + if(SLOW_ALIGN || USE_AFFINE_SCORE){ + r.setPerfectFlag(maxSwScore1); + r2.setPerfectFlag(maxSwScore2); + } + + + if(r.numSites()>1){ + final int clearzone=r.perfect() ? CLEARZONEP : + r.topSite().score>=(int)(maxSwScore1*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : + (r.topSite().score>=(int)(maxSwScore1*CLEARZONE1c_CUTOFF) ? 
CLEARZONE1b : CLEARZONE1c); + int numBestSites1=Tools.countTopScores(r.sites, clearzone); + if(numBestSites1>1){ + //Ambiguous alignment + assert(r.sites.size()>1); + + boolean b=processAmbiguous(r.sites, true, AMBIGUOUS_TOSS, clearzone, false); + r.setAmbiguous(b); + } + } + + if(r2.numSites()>1){ + final int clearzone=r2.perfect() ? CLEARZONEP : + r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1b_CUTOFF) ? CLEARZONE1 : + (r2.topSite().score>=(int)(maxSwScore2*CLEARZONE1c_CUTOFF) ? CLEARZONE1b : CLEARZONE1c); + int numBestSites2=Tools.countTopScores(r2.sites, clearzone); + if(numBestSites2>1){ + //Ambiguous alignment + assert(r2.sites.size()>1); + + boolean b=processAmbiguous(r2.sites, false, AMBIGUOUS_TOSS, clearzone, false); + r2.setAmbiguous(b); + } + } + if(verbose){System.err.println("\nAfter ambiguous removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + if(r.numSites()>0 && r2.numSites()>0){ + SiteScore ss1=r.topSite(); + SiteScore ss2=r2.topSite(); + if(canPair(ss1, ss2, r.bases.length, r2.bases.length, REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + assert(SLOW_ALIGN ? ss1.pairedScore>ss1.slowScore : ss1.pairedScore>ss1.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n"; + assert(SLOW_ALIGN ? 
ss2.pairedScore>ss2.slowScore : ss2.pairedScore>ss2.quickScore) : + "\n"+ss1.toText()+"\n"+ss2.toText()+"\n"+r.toText(false)+"\n"+r2.toText(false)+"\n"; + r.setPaired(true); + r.mate.setPaired(true); + } + } + + if(r.numSites()==0){r.sites=null;r.mapScore=0;} + if(r2.numSites()==0){r2.sites=null;r2.mapScore=0;} + +// assert(Read.CHECKSITES(r, basesM));//***123 +// assert(Read.CHECKSITES(r2));//***123 + + r.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + r2.setFromTopSite(AMBIGUOUS_RANDOM, true, MAX_PAIR_DIST); + assert(checkTopSite(r)); // TODO remove this + if(KILL_BAD_PAIRS){ + if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, MAX_PAIR_DIST)){ + int x=r.mapScore/r.bases.length; + int y=r2.mapScore/r2.bases.length; + if(x>=y){ + r2.clearAnswers(false); + }else{ + r.clearAnswers(false); + } + } + } + if(verbose){System.err.println("\nAfter bad pair removal:\nRead1:\t"+r+"\nRead2:\t"+r2);} + + assert(r.sites==null || r.mapScore>0) : r.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(r2.sites==null || r2.mapScore>0) : r2.mapScore+"\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n"; + assert(checkTopSite(r)); // TODO remove this + if(MAKE_MATCH_STRING){ + if(r.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r.topSite().match!=null){ + r.match=r.topSite().match; + }else{ + assert(checkTopSite(r)); // TODO remove this + genMatchString(r, basesP1, basesM1, maxImperfectSwScore1, maxSwScore1, false, false); + assert(checkTopSite(r)); // TODO remove this + } + } + if(r2.numSites()>0){ + if(USE_SS_MATCH_FOR_PRIMARY && r2.topSite().match!=null){ + r2.match=r2.topSite().match; + }else{ + genMatchString(r2, basesP2, basesM2, maxImperfectSwScore2, maxSwScore2, false, false); + } + } + } + + assert(checkTopSite(r)); // TODO remove this + if(verbose){ + System.err.println("\nFinal:\nRead1:\t"+r+"\nRead2:\t"+r2); + if(r.match!=null && r.shortmatch()){r.match=Read.toLongMatchString(r.match); r.setShortMatch(false);} + if(r2.match!=null && 
r2.shortmatch()){r2.match=Read.toLongMatchString(r2.match); r2.setShortMatch(false);} + } + + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r.mapScore>0 && r.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + }else if(r.mapScore<=0 && r.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly2: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r.clearMapping(); + r2.setPaired(false); + } + assert(checkTopSite(r)); // TODO remove this + //Block to prevent assertion from firing. Generally caused by alignment being lost during match generation. TODO: Fix cause. + if(r2.mapScore>0 && r2.sites==null){ + if(!Shared.anomaly){System.err.println("Anomaly: mapScore>0 and list==null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + }else if(r2.mapScore<=0 && r2.sites!=null){ + if(!Shared.anomaly){System.err.println("Anomaly3: mapScore<=0 and list!=null.\n"+r+"\n");} + Shared.anomaly=true; + r2.clearMapping(); + r.setPaired(false); + } + + assert(r.sites==null || r.mapScore>0) : + r.mapScore+"\t"+r.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r.strand()==Gene.PLUS ? 
r.bases : + AminoAcid.reverseComplementBases(r.bases), r.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore1))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + assert(r2.sites==null || r2.mapScore>0) : + r2.mapScore+"\t"+r2.sites+"\n"+(-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), 0))+"\n"+ + Arrays.toString(msa.fillAndScoreLimited(r2.strand()==Gene.PLUS ? r2.bases : + AminoAcid.reverseComplementBases(r2.bases), r2.topSite(), Tools.max(SLOW_ALIGN_PADDING, 80), (-100+(int)(MINIMUM_ALIGNMENT_SCORE_RATIO_PAIRED*maxSwScore2))))+"\n\n"+ + msa.showVertLimit()+"\n\n"+msa.showHorizLimit()+"\n\n"+r+"\n\n"+r2+"\n\n"; + + assert(!r.mapped() || !MAKE_MATCH_STRING || r.match!=null) : "Note that sometimes, VERY RARELY, match string generation fails."; + assert(checkTopSite(r)); // TODO remove this + removeDuplicateBestSites(r); + removeDuplicateBestSites(r2); + + if(DYNAMIC_INSERT_LENGTH && numMated>1000 && r.paired()){ + AVERAGE_PAIR_DIST=(int)(innerLengthSum*1f/numMated); + } + assert(checkTopSite(r)); // TODO remove this + if(r.ambiguous() && AMBIGUOUS_TOSS){ + if(r.sites!=null){r.sites=null;} + r.clearSite(); + r.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + if(r2.ambiguous() && AMBIGUOUS_TOSS){ + if(r2.sites!=null){r2.sites=null;} + r2.clearSite(); + r2.setMapped(false); + //TODO: Unclear if I should set paired to false here + } + + assert(checkTopSite(r)); + if(r.mapped() && (LOCAL_ALIGN || r.containsXY2())){ + final SiteScore ss=r.topSite(); + ss.match=r.match; + msa.toLocalAlignment(r, ss, r.containsXY2() ? 
1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + assert(checkTopSite(r2)); + if(r2.mapped() && (LOCAL_ALIGN || r2.containsXY2())){ + final SiteScore ss=r2.topSite(); + ss.match=r2.match; + msa.toLocalAlignment(r2, ss, r2.containsXY2() ? 1 : LOCAL_ALIGN_TIP_LENGTH, LOCAL_ALIGN_MATCH_POINT_RATIO); + } + + if(CALC_STATISTICS){ + calcStatistics1(r, maxSwScore1, maxPossibleQuickScore1); + calcStatistics2(r2, maxSwScore2, maxPossibleQuickScore2); + } + } + + /** {number of correct (loose) sites, number of incorrect (loose) sites, number incorrect sites before last correct site, + * number of sites, correctScore, maxScore, firstElementCorrect, firstElementCorrectLoose, position of first correct element (or -1), + * sizeOfTopGroup, numTopCorrect} */ + protected int[] calcCorrectnessSkimmer(Read r, int thresh){ + //assume sorted. + ArrayList ssl=r.sites; + + if(ssl==null || ssl.isEmpty()){ + return new int[] {0, 0, 0, 0, 0, 0, 0, 0, -1, 0, 0, 0}; + } + + SiteScore original=r.originalSite; + assert((original==null) != (r.synthetic())); + if(original==null){ + original=ssl.get(0); + } + + int numCorrect=0; + int numIncorrect=0; + int numIncorrectPrior=0; //Prior to expected number of correct sites being found + int numIncorrectPrior2=0; //Prior to last correct element actually found + int numIncorrectPriorTemp=0; + + int correctScore=0; + int maxScore=ssl.get(0).score; + + int positionOfFirstCorrect=-1; + int firstElementCorrect=0; + int firstElementCorrectLoose=0; + + int sizeOfTopGroup=0; + + int numTopCorrect=0; + + for(int i=0; i0 && sizeOfTopGroup<=ssl.size()); + return new int[] {numCorrect, numIncorrect, numIncorrectPrior2, + ssl.size(), correctScore, maxScore, firstElementCorrect, firstElementCorrectLoose, positionOfFirstCorrect, sizeOfTopGroup, numTopCorrect}; + } + + +} diff --git a/current/align2/BBSplitter.java b/current/align2/BBSplitter.java new file mode 100755 index 0000000..5ec8152 --- /dev/null +++ b/current/align2/BBSplitter.java @@ -0,0 
+1,1166 @@ +package align2; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.HashMap; +import java.util.LinkedHashSet; + +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; +import stream.SiteScore; + + +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.FileFormat; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Mar 19, 2013 + * + */ +public class BBSplitter { + + public static void main(String[] args){ + if(Shared.COMMAND_LINE==null){ + Shared.COMMAND_LINE=(args==null ? null : args.clone()); + Shared.BBMAP_CLASS="BBSplitter"; + } + Timer t=new Timer(); + t.start(); + String[] margs=processArgs(args); + ReadWrite.waitForWritingToFinish(); + t.stop(); + Data.sysout.println("Ref merge time: \t"+t); + Data.scaffoldPrefixes=true; + if(MAP_MODE==MAP_NORMAL){ + BBMap.main(margs); + }else if(MAP_MODE==MAP_ACC){ + BBMapAcc.main(margs); + }else if(MAP_MODE==MAP_PACBIO){ + BBMapPacBio.main(margs); + }else if(MAP_MODE==MAP_PACBIOSKIMMER){ + BBMapPacBioSkimmer.main(margs); + }else{ + throw new RuntimeException(); + } +// Data.sysout.println("\nTotal time: \t"+t); + } + + public static String[] processArgs(String[] args){ + for(String s : args){ + if(s.endsWith("=stdout") || s.contains("=stdout.")){Data.sysout=System.err;} + } + Data.sysout.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); +// if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;} //Should be fine for a realistic number of threads, except in perfect mode with lots of sites and a small index. 
+ String[] oldargs=args; + args=remakeArgs(args); + if(args!=oldargs){ + Data.sysout.println("Converted arguments to "+Arrays.toString(args)); + } + + ReadWrite.ZIPLEVEL=2; + + Timer t=new Timer(); + t.start(); + + + int ziplevel=-1; + int build=1; + + LinkedHashSet nameSet=new LinkedHashSet(); + HashMap> table=new HashMap>(); + + ArrayList unparsed=new ArrayList(); + + String basename=null; + + for(int i=0; i1 ? split[1] : null; + if(b!=null && b.equalsIgnoreCase("null")){b=null;} + + if(a.equals("blacklist") || a.equals("ref_blacklist")){a="ref_blacklist";} + if(a.equals("whitelist") || a.equals("ref_whitelist")){a="ref_whitelist";} + if(a.equals("ref") || a.equals("reference")){a="ref_ref";} + + if(b!=null && (a.startsWith("ref_"))){ + String setName=a.substring(4); + if(setName.indexOf(',')>=0){setName=setName.replace(',', '_');} + if(setName.indexOf('$')>=0){setName=setName.replace('$', '_');} + nameSet.add(setName); + if(!table.containsKey(setName)){table.put(setName, new LinkedHashSet());} + LinkedHashSet set=table.get(setName); + + File f; + if((f=new File(b)).exists()){ + try { + String s=f.getCanonicalPath(); + set.add(s); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + }else{ + for(String x : b.split(",")){ + f=new File(x); + if(f.exists()){ + try { + set.add(f.getCanonicalPath()); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + }else{ + assert(x.startsWith("stdin")) : "Can't find file "+x; + set.add(x); + } + } + } + }else{ + if(a.startsWith("-xmx") || a.startsWith("-xms")){ + //jvm argument; do nothing + }else if(a.equals("path") || a.equals("root")){ + Data.setPath(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + unparsed.add(args[i]); + }else if(a.equals("build")){ + build=Integer.parseInt(b); + unparsed.add(args[i]); + }else if(a.equals("basename")){ + basename=b; + assert(b==null || (b.indexOf('%')>=0 && 
(b.indexOf('%') outnames=gatherLists(nameSet, basename); +// unparsed.add("scaffoldprefixes=true"); + unparsed.add("ref="+refname); + + String[] margs=new String[unparsed.size()+(outnames==null ? 0 : outnames.size())]; + int idx=0; + for(int i=0; i set=new LinkedHashSet(); + HashMap> map=new HashMap>(); + int removed=0; + + for(int i=0; i1 ? split[1] : null; + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("mapmode") && b!=null){ + args[i]=null; + removed++; + if(b.equalsIgnoreCase("normal")){MAP_MODE=MAP_NORMAL;} + else if(b.equalsIgnoreCase("accurate") || b.equalsIgnoreCase("acc")){MAP_MODE=MAP_ACC;} + else if(b.equalsIgnoreCase("pacbio") || b.equalsIgnoreCase("pb") || b.equalsIgnoreCase("bp")){MAP_MODE=MAP_PACBIO;} + else if(b.equalsIgnoreCase("pacbioskimmer") || b.equalsIgnoreCase("pbs") || b.equalsIgnoreCase("bps")){MAP_MODE=MAP_PACBIOSKIMMER;} + else{throw new RuntimeException("Unknown mode: "+b);} + }else if(a.equals("ref") && b!=null){ + args[i]=null; + removed++; + String[] files=b.split(","); + for(String file : files){ + String name=file.replace('\\', '/'); + int x=name.lastIndexOf('/'); + if(x>=0){name=name.substring(x+1);} + while(name.endsWith(".zip") || name.endsWith(".bz2") || name.endsWith(".gz") || name.endsWith(".gzip")){ + name=name.substring(0, name.lastIndexOf('.')); + } + if(name.lastIndexOf('.')>=0){ + name=name.substring(0, name.lastIndexOf('.')); + } + set.add(name); + LinkedHashSet list=map.get(name); + if(list==null){ + list=new LinkedHashSet(); + map.put(name, list); + } + list.add(file); + } + } + } + if(set.isEmpty() && removed==0){return args;} + if(MAP_MODE==MAP_ACC){removed--;} + String[] args2=new String[args.length+set.size()-removed]; + + int i=0, j=0; + if(MAP_MODE==MAP_ACC){ + args2[j]="minratio=0.4"; //Increase sensitivity in accurate mode + j++; + } + for(; i list=map.get(key); + StringBuilder sb=new StringBuilder(200); + 
sb.append("ref_"+key+"="); + String comma=""; + for(String s : list){ + sb.append(comma); + sb.append(s); + comma=","; + } + args2[j]=sb.toString(); + j++; + } + return args2; + } + + + public static ArrayList gatherLists(LinkedHashSet nameSet, String basename){ + if(basename==null){return null;} + ArrayList args=new ArrayList(); + for(String name : nameSet){ + if(basename!=null){ + args.add("out_"+name+"="+(basename.replaceFirst("%", name))); + } + } + return args; + } + + + public static String mergeReferences(LinkedHashSet nameSet, HashMap> nameToFileTable, int build){ + LinkedHashSet fnames=new LinkedHashSet(); +// nameSet.remove("blacklist"); +// nameSet.remove("whitelist"); + addNames(fnames, nameToFileTable, "whitelist"); + for(String s : nameSet){ + if(!s.equalsIgnoreCase("blacklist") && !s.equalsIgnoreCase("whitelist")){ + addNames(fnames, nameToFileTable, s); + } + } + addNames(fnames, nameToFileTable, "blacklist"); + + final HashMap> fileToNameTable=new HashMap>(); + for(String name : nameSet){ + LinkedHashSet files=nameToFileTable.get(name); + if(files!=null){ + for(String f : files){ + LinkedHashSet names=fileToNameTable.get(f); + if(names==null){ + names=new LinkedHashSet(); + fileToNameTable.put(f, names); + } + names.add(name); + } + } + } + + final String root=Data.ROOT_GENOME+build; + { + File f=new File(root); + if(!f.exists()){f.mkdirs();} + } + + { + final String reflist=root+"/reflist.txt"; + final String namelist=root+"/namelist.txt"; + final boolean reflistExists=new File(reflist).exists(); + boolean writeReflist=false; + String[] oldrefs=null; + String[] oldnames=null; + if(reflistExists){ + TextFile tf=new TextFile(reflist, false, false); + oldrefs=tf.toStringLines(); + tf.close(); + + tf=new TextFile(namelist, false, false); + oldnames=tf.toStringLines(); + tf.close(); + } + if(fnames.size()>0){ + writeReflist=true; + ArrayList fl=new ArrayList(fnames.size()); + fl.addAll(fnames); + ArrayList nl=new ArrayList(nameSet.size()); + 
nl.addAll(nameSet); + //TODO: Compare old to new + }else{ + assert(oldrefs!=null) : "No reference specified, and none exists. Please regenerate the index."; + for(String s : oldrefs){fnames.add(s);} + + assert(oldnames!=null) : "No reference specified, and none exists. Please regenerate the index."; + for(String s : oldnames){nameSet.add(s);} + + writeReflist=false; + } + if(writeReflist){ + { +// assert(false) : fnames; +// assert(fnames.size()>0); + TextStreamWriter tsw=new TextStreamWriter(reflist, OVERWRITE, false, false); + tsw.start(); + for(String s : fnames){tsw.println(s);} + tsw.poisonAndWait(); + assert(new File(reflist).exists()) : reflist+".exists? "+new File(reflist).exists(); + } + { +// assert(nameSet.size()>0); + TextStreamWriter tsw=new TextStreamWriter(namelist, OVERWRITE, false, false); + tsw.start(); + for(String s : nameSet){tsw.println(s);} + tsw.poisonAndWait(); + } + } + } + + if(fnames.size()<1){ + assert(false) : "No references specified." + + "\nTODO: This is really annoying; I need to include reference names in some auxillary file."; + return null; + }else if(fnames.size()==1){ +// Data.sysout.println("Only one reference file; skipping merge."); +// String refname=fnames.iterator().next(); +// return refname; + } + + long key=0; + for(String s : nameSet){ + key=Long.rotateLeft(key, 21); + key=key^s.hashCode(); +// System.err.println("Hashed nameSet "+nameSet+" -> "+key); + } + key=(key&Long.MAX_VALUE); + String refname0="merged_ref_"+key+".fa.gz"; + String refname=root+"/"+refname0; + + { + File f=new File(refname); + if(f.exists()){ + // Data.sysout.println("Merged reference file /ref/genome/"+build+"/"+refname0+" already exists; skipping merge."); + Data.sysout.println("Merged reference file "+refname+" already exists; skipping merge."); + return refname; + } +// else{ +// f=new File(root); +// if(!f.exists()){f.mkdirs();} +// } + } + // Data.sysout.println("Creating merged reference file /ref/genome/"+build+"/"+refname0); + 
Data.sysout.println("Creating merged reference file "+refname); + + TextStreamWriter tsw=new TextStreamWriter(refname, OVERWRITE, false, true); + tsw.start(); + for(String fname : fnames){ + TextFile tf=new TextFile(fname, false, false); + LinkedHashSet listnames=fileToNameTable.get(fname); +// assert(false) : "\n\n"+fname+"\n\n"+listnames+"\n\n"+fileToNameTable+"\n\n"+nameSet+"\n\n"+nameToFileTable+"\n\n"; + String prefix=null; + { + StringBuilder sb=new StringBuilder(100); + sb.append('>'); + if(listnames!=null){ + String sep=""; + for(String s : listnames){ + sb.append(sep); + sb.append(s); + sep=","; + } + } + sb.append('$'); + prefix=sb.toString(); + } +// assert(false) : prefix; +// System.err.println(prefix); + for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ + if(prefix!=null && line.charAt(0)=='>'){ + + tsw.print(prefix); + tsw.println(line.substring(1)); + }else{ + tsw.println(line); + } + } + tf.close(); + } + tsw.poison(); + tsw.waitForFinish(); + + return refname; + } + + /** Returns the set of scaffold name prefixes or suffixes. 
+ * + * @param prefix True to return prefixes (set names), false to return suffixes (scaffold names) + * @return + */ + public static HashSet getScaffoldAffixes(boolean getPrefixes){ + final byte[][][] b3=Data.scaffoldNames; + + int size=(int)Tools.min((10+Data.numScaffolds*3)/2, Integer.MAX_VALUE); + HashSet set=new HashSet(size); + + assert(b3!=null); + for(byte[][] b2 : b3){ + if(b2!=null){ + for(byte[] bname : b2){ + if(bname!=null){ + int idx=Tools.indexOf(bname, (byte)'$'); + String prefix=null, suffix=null; + if(idx>=0){ + if(getPrefixes){prefix=new String(bname, 0, idx);} + else{suffix=new String(bname, idx+1, bname.length-idx-1);} + }else{ + if(!getPrefixes){suffix=new String(bname);} + } + + if(getPrefixes){ + if(prefix!=null){ + if(prefix.indexOf(',')>=0){ + for(String s : prefix.split(",")){ + set.add(s); + } + }else{ + set.add(prefix); + } + } + }else{ + if(suffix!=null){ + set.add(suffix); + } + } + } + } + } + } + return set; + } + + public static synchronized HashMap makeOutputStreams(String[] args, boolean OUTPUT_READS, boolean OUTPUT_ORDERED_READS, + int buff, boolean paired, boolean overwrite_, boolean ambiguous){ + + HashMap table=new HashMap(); + for(String arg : args){ + String[] split=arg.split("="); + String a=split[0].toLowerCase(); + String b=split.length>1 ? split[1] : null; + if(b!=null && b.equalsIgnoreCase("null")){b=null;} + + if(arg.indexOf('=')>0 && a.startsWith("out_")){ + String name=a.substring(4).replace('\\', '/'); + + String fname1, fname2; + + if(ambiguous){ + if(b.indexOf('/')>=0){ + int x=b.lastIndexOf('/'); + b=b.substring(0, x+1)+"AMBIGUOUS_"+b.substring(x+1); + }else{ + b="AMBIGUOUS_"+b; + } + } + + if(b.endsWith("#")){ + fname1=b.replace('#', '1'); + fname2=b.replace('#', '2'); + }else{ + fname1=b; + fname2=null; + } +// assert(!ambiguous) : fname1+", "+fname2+", "+b+", "+ambiguous; + + FileFormat ff1=FileFormat.testOutput(fname1, FileFormat.SAM, null, true, overwrite_, OUTPUT_ORDERED_READS); + FileFormat ff2=paired ? 
FileFormat.testOutput(fname2, FileFormat.SAM, null, true, overwrite_, OUTPUT_ORDERED_READS) : null; + RTextOutputStream3 ros=new RTextOutputStream3(ff1, ff2, null, null, buff, null, false); + ros.start(); +// Data.sysout.println("Started output stream:\t"+t); + table.put(name, ros); + } + } + return table.isEmpty() ? null : table; + } + + + public static synchronized HashMap makeSetCountTable(){ + assert(setCountTable==null); + HashSet names=getScaffoldAffixes(true); + setCountTable=new HashMap(); + for(String s : names){setCountTable.put(s, new SetCount(s));} + return setCountTable; + } + + + public static synchronized HashMap makeScafCountTable(){ + assert(scafCountTable==null); + HashSet names=getScaffoldAffixes(false); + scafCountTable=new HashMap(); + for(String s : names){scafCountTable.put(s, new SetCount(s));} +// System.out.println("Made table "+scafCountTable); + return scafCountTable; + } + + + /** + * @param readlist List of reads to print + * @param listID ID of read list, from ReadInputStream + * @param splitTable A temporary structure to hold sets of reads that go to the different output streams + * @param clearzone Min distance between best and next-best site to be considered unambiguous + */ + public static void printReads(ArrayList readlist, long listID, HashMap> splitTable, int clearzone){ + if(clearzone>=0 || TRACK_SET_STATS || TRACK_SCAF_STATS){ + printReadsAndProcessAmbiguous(readlist, listID, splitTable, null, clearzone); + return; + } + assert((streamTable!=null && streamTable.size()>0) || (setCountTable!=null && setCountTable.size()>0) || (scafCountTable!=null && scafCountTable.size()>0)); + boolean clear=true; + if(splitTable==null){ + splitTable=new HashMap>(); + clear=false; + } + for(Read r : readlist){ + if(r!=null){ + HashSet lists=toListNames(r); + if(lists!=null){ + for(String s : lists){ + ArrayList alr=splitTable.get(s); + if(alr==null){ + alr=new ArrayList(); + splitTable.put(s, alr); + } + alr.add(r); + } + } + } + } + 
for(String s : streamTable.keySet()){ + ArrayList alr=splitTable.get(s); + if(alr==null){alr=blank;} + RTextOutputStream3 tros=streamTable.get(s); + tros.add(alr, listID); + } + if(clear){splitTable.clear();} + } + + + /** + * @param readlist List of reads to print + * @param listID ID of read list, from ReadInputStream + * @param splitTable A temporary structure to hold sets of reads that go to the different output streams + * @param clearzone Min distance between best and next-best site to be considered unambiguous + */ + public static void printReadsAndProcessAmbiguous(ArrayList readlist, long listID, HashMap> splitTable, + HashMap> splitTableA, int clearzone){ + assert(clearzone>=0 || TRACK_SET_STATS || TRACK_SCAF_STATS); + assert((streamTable!=null && streamTable.size()>0) || (setCountTable!=null && setCountTable.size()>0) || (scafCountTable!=null && scafCountTable.size()>0)); + boolean clear=streamTable!=null, clearA=streamTableAmbiguous!=null; + if(splitTable==null && streamTable!=null){ + splitTable=new HashMap>(); + clear=false; + } + if(splitTableA==null && streamTableAmbiguous!=null){ + splitTableA=new HashMap>(); + clearA=false; + } + + final HashSet hss0, hss1, hss2, hss3, hsspr, hssam; + final HashSet[] hssa; + if(TRACK_SET_STATS || streamTable!=null){ + hss0=new HashSet(16); + hss1=new HashSet(16); + hss2=new HashSet(16); + hss3=new HashSet(16); + hsspr=new HashSet(16); + hssam=new HashSet(16); + hssa=(HashSet[])new HashSet[] {hss0, hss1, hss2, hss3}; + }else if(TRACK_SCAF_STATS){ + hss0=new HashSet(16); + hss1=null; hss2=null; hss3=null; hsspr=null; hssam=null; hssa=null; + }else{ + hss0=null; hss1=null; hss2=null; hss3=null; hsspr=null; hssam=null; hssa=null; + } + + for(Read r1 : readlist){ +// System.out.println("\nProcessing read "+r1.numericID); + if(r1!=null){ + final Read r2=r1.mate; + assert((scafCountTable!=null)==TRACK_SCAF_STATS) : TRACK_SCAF_STATS; + if(scafCountTable!=null){ + HashSet set=getScaffolds(r1, clearzone, hss0); + 
if(set!=null && !set.isEmpty()){ + int incrRM=0; + int incrRA=0; + int incrBM=0; + int incrBA=0; + if(r1!=null){ + if(r1.ambiguous()){ + incrRA+=1; + incrBA+=r1.bases.length; + }else{ + incrRM+=1; + incrBM+=r1.bases.length; + } + } + if(r2!=null){ + if(r2.ambiguous()){ + incrRA+=1; + incrBA+=r2.bases.length; + }else{ + incrRM+=1; + incrBM+=r2.bases.length; + } + } + for(String s : set){ + SetCount sc=scafCountTable.get(s); + assert(sc!=null) : "Can't find "+s+"\nin\n"+scafCountTable.keySet()+"\n"; + +// System.out.println(sc); +// System.out.println("+ "+incrRM+", "+incrRA+", "+incrBM+", "+incrBA); + synchronized(sc){ + // System.out.println("Incrementing scaf "+sc); + sc.mappedReads+=incrRM; + sc.mappedBases+=incrBM; + sc.ambiguousReads+=incrRA; + sc.ambiguousBases+=incrBA; + } +// System.out.println(sc); +// System.out.println(); +// assert(false) : "\n"+incrRM+", "+incrRA+", "+incrBM+", "+incrBA+"\n"+set; + } + set.clear(); + } + +// byte[] scafb=r1.getScaffoldName(false); +// if(scafb==null && r2!=null){scafb=r2.getScaffoldName(false);} +// if(scafb!=null){ +// final String s; +// if(Data.scaffoldPrefixes){ +// int idx=Tools.indexOf(scafb, (byte)'$'); +// assert(idx>=0) : idx+", "+new String(scafb); +// s=(idx>=0 ? 
new String(scafb, idx+1, scafb.length-idx-1) : new String(scafb)); +// }else{ +// s=new String(scafb); +// } +// SetCount sc=scafCountTable.get(s); +// assert(sc!=null) : "Can't find "+s+"\nin\n"+scafCountTable.keySet()+"\n"; +// +// int incrRM=0; +// int incrRA=0; +// int incrBM=0; +// int incrBA=0; +// if(r1!=null){ +// if(r1.ambiguous()){ +// incrRA+=1; +// incrBA+=r1.bases.length; +// }else{ +// incrRM+=1; +// incrBM+=r1.bases.length; +// } +// } +// if(r2!=null){ +// if(r2.ambiguous()){ +// incrRA+=1; +// incrBA+=r2.bases.length; +// }else{ +// incrRM+=1; +// incrBM+=r2.bases.length; +// } +// } +// synchronized(sc){ +//// System.out.println("Incrementing scaf "+sc); +// sc.mappedReads+=incrRM; +// sc.mappedBases+=incrBM; +// sc.ambiguousReads+=incrRA; +// sc.ambiguousBases+=incrBA; +// } +// } + } + + + final HashSet[] sets=(TRACK_SET_STATS || streamTable!=null) ? getSets(r1, clearzone, hssa) : null; + boolean ambiguous=false; + if(sets!=null){ + final HashSet p1=(sets[0].isEmpty() ? null : sets[0]), s1=(sets[1].isEmpty() ? null : sets[1]), + p2=(sets[2].isEmpty() ? null : sets[2]), s2=(sets[3].isEmpty() ? 
null : sets[3]); + assert(sets==hssa); +// assert(p1!=null); +// assert(s1!=null); +// assert(p2!=null); +// assert(s2!=null); + + if(p1!=null && p2!=null && !p1.equals(p2)){ambiguous=true;} + else if(p1!=null && s1!=null && !p1.containsAll(s1)){ambiguous=true;} + else if(p2!=null && s2!=null && !p2.containsAll(s2)){ambiguous=true;} + +// System.out.println("\nambiguous="+ambiguous); +// System.out.println(p1); +// System.out.println(s1); + + HashSet primarySet=hsspr, ambigSet=hssam; + primarySet.clear(); + ambigSet.clear(); + if(AMBIGUOUS2_MODE==AMBIGUOUS2_FIRST || AMBIGUOUS2_MODE==AMBIGUOUS2_UNSET){//pick one + if(r2==null || r1.mapScore>=r2.mapScore){ + if(p1!=null){primarySet.addAll(p1);} + }else{ + if(p2!=null){primarySet.addAll(p2);} + } + }else{//merge + if(p1!=null){primarySet.addAll(p1);} + if(p2!=null){primarySet.addAll(p2);} + } + + + if(ambiguous){ + if(AMBIGUOUS2_MODE==AMBIGUOUS2_SPLIT){ + if(primarySet!=null && s1!=null){primarySet.addAll(s1);} + if(primarySet!=null && s2!=null){primarySet.addAll(s2);} + ambigSet=primarySet; + primarySet=null; + }else if(AMBIGUOUS2_MODE==AMBIGUOUS2_ALL){ + if(primarySet!=null && s1!=null){primarySet.addAll(s1);} + if(primarySet!=null && s2!=null){primarySet.addAll(s2);} + ambigSet=null; + }else if(AMBIGUOUS2_MODE==AMBIGUOUS2_RANDOM){ + throw new RuntimeException("AMBIGUOUS2_RANDOM: Not yet implemented."); + }else if(AMBIGUOUS2_MODE==AMBIGUOUS2_TOSS){ + primarySet=null; + } + } + + if(primarySet!=null && splitTable!=null){ + for(String s : primarySet){ + ArrayList alr=splitTable.get(s); + if(alr==null){ + alr=new ArrayList(); + splitTable.put(s, alr); + } + alr.add(r1); + } + } + + if(ambigSet!=null && splitTableA!=null){ + for(String s : ambigSet){ + ArrayList alr=splitTableA.get(s); + if(alr==null){ + alr=new ArrayList(); + splitTableA.put(s, alr); + } + alr.add(r1); + } + } + + if(setCountTable!=null){ + + primarySet=hsspr; + primarySet.clear(); + if(p1!=null){primarySet.addAll(p1);} + 
if(p2!=null){primarySet.addAll(p2);} + if(ambiguous){ + if(s1!=null){primarySet.addAll(s1);} + if(s2!=null){primarySet.addAll(s2);} + } +// System.out.println(primarySet); + final int incrR=1+(r2==null ? 0 : 1); + final int incrB=r1.bases.length+(r2==null ? 0 : r2.bases.length); + + for(String s : primarySet){ + SetCount sc=setCountTable.get(s); + assert(sc!=null) : s; + if(ambiguous){ + synchronized(sc){ + // System.out.println("Incrementing set "+sc); + sc.ambiguousReads+=incrR; + sc.ambiguousBases+=incrB; + } + }else{ + synchronized(sc){ + // System.out.println("Incrementing set "+sc); + sc.mappedReads+=incrR; + sc.mappedBases+=incrB; + } + + } + } + } + for(HashSet set : sets){set.clear();} + } + } + } + if(streamTable!=null){ + for(String s : streamTable.keySet()){ + ArrayList alr=splitTable.get(s); + if(alr==null){alr=blank;} + RTextOutputStream3 tros=streamTable.get(s); + tros.add(alr, listID); + } + } + if(streamTableAmbiguous!=null){ + for(String s : streamTableAmbiguous.keySet()){ + ArrayList alr=splitTableA.get(s); + if(alr==null){alr=blank;} + RTextOutputStream3 tros=streamTableAmbiguous.get(s); + tros.add(alr, listID); + } + } + if(clear){splitTable.clear();} + if(clearA){splitTableA.clear();} + } + + + public static HashSet[] getSets(Read r1, int clearzone, HashSet[] sets){ + Read r2=r1.mate; + if(!r1.mapped() && (r2==null || !r2.mapped())){return null;} + + if(sets==null){ + assert(false); + sets=new HashSet[4]; + }else{ + for(HashSet set : sets){ + assert(set==null || set.isEmpty()); + } + } + + HashSet primary1=sets[0], other1=sets[1], primary2=sets[2], other2=sets[3]; + if(r1.mapped()){ +// System.out.println(r1.list.size()); + SiteScore s0=r1.topSite(); + primary1=toListNames(s0, primary1); + for(int i=1; i getScaffolds(Read r1, int clearzone, HashSet set){ + Read r2=r1.mate; + if(!r1.mapped() && (r2==null || !r2.mapped())){return null;} + assert(set==null || set.isEmpty()); + + if(!r1.ambiguous() && (r2==null || !r2.ambiguous())){ + byte[] 
scafb1=r1.getScaffoldName(false); + byte[] scafb2=(r2==null ? null : r2.getScaffoldName(false)); + if(scafb1==null){scafb1=scafb2;} + if(scafb1==null){ + assert(false) : r1; + return null; + } + if(scafb2==null || scafb1==scafb2){ + final String s; + if(Data.scaffoldPrefixes){ + int idx=Tools.indexOf(scafb1, (byte)'$'); + assert(idx>=0) : idx+", "+new String(scafb1); + s=(idx>=0 ? new String(scafb1, idx+1, scafb1.length-idx-1) : new String(scafb1)); + }else{ + s=new String(scafb1); + } + if(set==null){set=new HashSet(1);} +// assert(!s.contains("$")) : s+", "+Data.scaffoldPrefixes+", "+Tools.indexOf(scafb1, (byte)'$'); + set.add(s); + return set; + } + } + + if(set==null){set=new HashSet(4);} + if(r1.mapped()){ + SiteScore s0=r1.topSite(); + for(SiteScore ss : r1.sites){ + if(ss.score+clearzone=0) : idx+", "+new String(b); + s=(idx>=0 ? new String(b, idx+1, b.length-idx-1) : new String(b)); + }else{ + s=new String(b); + } + set.add(s); + } + } + if(r2!=null && r2.mapped()){ + SiteScore s0=r2.topSite(); + for(SiteScore ss : r2.sites){ + if(ss.score+clearzone=0) : idx+", "+new String(b); + s=(idx>=0 ? new String(b, idx+1, b.length-idx-1) : new String(b)); + }else{ + s=new String(b); + } + set.add(s); + } + } + assert(set.size()>0); + return set; + } + + + /** + * @param r + * @return A set of names of reference lists containing this read or its mate. + */ + public static HashSet toListNames(Read r) { + if(r==null){return null;} + byte[] scaf1=r.getScaffoldName(false); + byte[] scaf2=(r.mate==null ? null : r.mate.getScaffoldName(false)); + if(scaf1==null && scaf2==null){return null;} + HashSet set=new HashSet(8); + int x=scaf1==null ? -1 : Tools.indexOf(scaf1, (byte)'$'); + if(x>=0){ + String s=new String(scaf1, 0, x); + if(s.indexOf(',')<0){ + set.add(s); + }else{ + for(String s2 : s.split(",")){set.add(s2);} + } + } + + x=(scaf2==null || scaf2==scaf1) ? 
-1 : Tools.indexOf(scaf2, (byte)'$'); + if(x>=0){ + String s=new String(scaf2, 0, x); + if(s.indexOf(',')<0){ + set.add(s); + }else{ + for(String s2 : s.split(",")){set.add(s2);} + } + } + + return set; + } + + + /** + * @param r + * @return A set of names of reference lists containing this site. + */ + public static HashSet toListNames(SiteScore r, HashSet set) { + if(r==null){return set;} + byte[] scaf1=r.getScaffoldName(false); + if(scaf1==null){return set;} + if(set==null){set=new HashSet(8);} + int x=scaf1==null ? -1 : Tools.indexOf(scaf1, (byte)'$'); + if(x>=0){ + String s=new String(scaf1, 0, x); + if(s.indexOf(',')<0){ + set.add(s); + }else{ + for(String s2 : s.split(",")){set.add(s2);} + } + } + return set; + } + + private static void addNames(LinkedHashSet fnames, HashMap> table, String setName){ + LinkedHashSet set=table.get(setName); + if(set==null){return;} + for(String s : set){fnames.add(s);} + } + + public static void makeBamScript(String outname, String...sams){ + LinkedHashSet set=new LinkedHashSet(); + if(sams!=null){ + for(String s : sams){ + if(s!=null && (s.endsWith(".sam") || s.endsWith(".sam.gz"))){ + set.add(s); + } + } + } + if(streamTable!=null){ + for(RTextOutputStream3 ros : streamTable.values()){ + String s=ros.fname(); + if(s.endsWith(".sam") || s.endsWith(".sam.gz")){ + set.add(s); + } + } + } + TextStreamWriter tsw=new TextStreamWriter(outname, OVERWRITE, false, false); + tsw.start(); + for(String sam : set){ + String bam; + if(sam.endsWith(".sam.gz")){bam=sam.substring(0, sam.length()-6)+"bam";} + else{bam=sam.substring(0, sam.length()-3)+"bam";} + String bam2=bam.substring(0, bam.length()-4)+"_sorted"; + + boolean pipe=true; + tsw.println("#!/bin/bash"); + tsw.println("module unload samtools"); + tsw.println("module load samtools/0.1.19"); + + String memstring; + long mem=Runtime.getRuntime().maxMemory()/3400000; + mem=Tools.min(100000, mem); + if(mem<2048){memstring=mem+"M";} + else{memstring=(mem/1024)+"G";} + + 
tsw.println("echo \"Note: This script is designed to run with the amount of memory detected by BBMap.\""); + tsw.println("echo \" If Samtools crashes, please ensure you are running on the same platform as BBMap,\""); + tsw.println("echo \" or reduce Samtools' memory setting (the -m flag).\""); + if(pipe){ + tsw.println("echo \"Note: Please ignore any warnings about 'EOF marker is absent'; " + + "this is a bug in samtools that occurs when using piped input.\""); + tsw.println("samtools view -bSh1 "+sam+" | samtools sort -m "+memstring+" -@ 3 - "+bam2); + }else{ + tsw.println("samtools view -bSh1 -o "+bam+" "+sam); + tsw.println("samtools sort -m "+memstring+" -@ 3 "+bam+" "+bam2); + } + + tsw.println("samtools index "+bam2+".bam"); + } + tsw.poison(); + tsw.waitForFinish(); + } + + public static class SetCount implements Comparable{ + + public SetCount(String s){ + name=s; + } + + public boolean equals(Object other){return equals((SetCount)other);} + public boolean equals(SetCount other){return compareTo(other)==0;} + + @Override + public int compareTo(SetCount o) { + if(mappedReads!=o.mappedReads){return mappedReads>o.mappedReads ? 1 : -1;} + if(ambiguousReads!=o.ambiguousReads){return ambiguousReads>o.ambiguousReads ? 
1 : -1;} + return name.compareTo(o.name); + } + + public String toString(){ + return name+", "+mappedReads+", "+ambiguousReads+", "+mappedBases+", "+ambiguousBases; + } + + public final String name; + public long mappedReads; + public long ambiguousReads; + public long mappedBases; + public long ambiguousBases; + + } + + public static void printCounts(String fname, HashMap map, boolean header, long totalReads){ + final ArrayList list=new ArrayList(map.size()); + list.addAll(map.values()); + final TextStreamWriter tsw=new TextStreamWriter(fname, OVERWRITE, false, false); + tsw.start(); + Collections.sort(list); + Collections.reverse(list); +// long ur=0, ub=0, ar=0, ab=0; + if(header){ + tsw.print("#name\t%unambiguous reads\tunambiguous MB\t%ambiguous reads\tambiguous MB\tunambiguous reads\tambiguous reads\n"); + } + final StringBuilder sb=new StringBuilder(1024); + final double divR=100.0/(totalReads); + final double divB=1.0/1000000; + for(SetCount sc : list){ + if(sc.mappedReads<1 && sc.ambiguousReads<1){break;} + sb.append(sc.name).append('\t'); + sb.append(String.format("%.5f\t", sc.mappedReads*divR)); + sb.append(String.format("%.5f\t", sc.mappedBases*divB)); + sb.append(String.format("%.5f\t", sc.ambiguousReads*divR)); + sb.append(String.format("%.5f\t", sc.ambiguousBases*divB)); + sb.append(sc.mappedReads).append('\t'); + sb.append(sc.ambiguousReads).append('\n'); + tsw.print(sb.toString()); + sb.setLength(0); + } + tsw.poison(); + } + + public static HashMap setCountTable=null; + public static HashMap scafCountTable=null; + + /** + * Holds named output streams. + */ + public static HashMap streamTable=null; + + /** + * Holds named output streams for ambiguous (across different references) reads. 
+ */ + public static HashMap streamTableAmbiguous=null; + public static final int AMBIGUOUS2_UNSET=0; + public static final int AMBIGUOUS2_FIRST=1; + public static final int AMBIGUOUS2_SPLIT=2; + public static final int AMBIGUOUS2_TOSS=3; + public static final int AMBIGUOUS2_RANDOM=4; + public static final int AMBIGUOUS2_ALL=5; + public static int AMBIGUOUS2_MODE=AMBIGUOUS2_UNSET; + public static boolean TRACK_SET_STATS=false; + public static boolean TRACK_SCAF_STATS=false; + public static String SCAF_STATS_FILE=null; + public static String SET_STATS_FILE=null; + public static boolean OVERWRITE=true; + public static boolean verbose=false; + private static final ArrayList blank=new ArrayList(0); + + public static final int MAP_NORMAL=1; + public static final int MAP_ACC=2; + public static final int MAP_PACBIO=3; + public static final int MAP_PACBIOSKIMMER=4; + public static int MAP_MODE=MAP_NORMAL; + +} diff --git a/current/align2/BandedAligner.java b/current/align2/BandedAligner.java new file mode 100755 index 0000000..5c46bd8 --- /dev/null +++ b/current/align2/BandedAligner.java @@ -0,0 +1,573 @@ +package align2; + +import java.util.Arrays; + +import dna.AminoAcid; + +/** + * @author Brian Bushnell + * @date Aug 5, 2013 + * + */ +public class BandedAligner { + + + public static void main(String[] args){ + byte[] query=args[0].getBytes(); + byte[] ref=args[1].getBytes(); + int qstart=-1; + int rstart=-1; + int maxedits=big; + int width=5; + if(args.length>2){qstart=Integer.parseInt(args[2]);} + if(args.length>3){rstart=Integer.parseInt(args[3]);} + if(args.length>4){maxedits=Integer.parseInt(args[4]);} + if(args.length>4){width=Integer.parseInt(args[5]);} + + BandedAligner ba=new BandedAligner(width); + + int edits; + + edits=ba.alignForward(query, ref, (qstart==-1 ? 0 : qstart), (rstart==-1 ? 
0 : rstart), maxedits, true); + System.out.println("Forward: \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score()); + System.out.println("***********************\n"); +// +// edits=ba.alignForwardRC(query, ref, (qstart==-1 ? query.length-1 : qstart), (rstart==-1 ? 0 : rstart), maxedits, true); +// System.out.println("ForwardRC: \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score()); +// System.out.println("***********************\n"); + + edits=ba.alignReverse(query, ref, (qstart==-1 ? query.length-1 : qstart), (rstart==-1 ? ref.length-1 : rstart), maxedits, true); + System.out.println("Reverse: \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score()); + System.out.println("***********************\n"); + +// edits=ba.alignReverseRC(query, ref, (qstart==-1 ? 0 : qstart), (rstart==-1 ? ref.length-1 : rstart), maxedits, true); +// System.out.println("ReverseRC: \tedits="+edits+", lastRow="+ba.lastRow+", score="+ba.score()); +// System.out.println("***********************\n"); + } + + + public BandedAligner(int width_){ + maxWidth=Tools.max(width_, 3)|1; + assert(maxWidth>=3) : "width<3 : "+width_+" -> "+maxWidth; + array1=new int[maxWidth+2]; + array2=new int[maxWidth+2]; + Arrays.fill(array1, big); + Arrays.fill(array2, big); +// for(int i=2; imaxWidth/2); + } + + /** + * @param query + * @param ref + * @param qstart + * @param rstart + * @return Edit distance + */ + public int alignForward(final byte[] query, final byte[] ref, final int qstart, final int rstart, final int maxEdits, final boolean exact){ + assert(big>maxEdits); + if(verbose){System.err.println("alignForward("+new String(query)+", "+new String(ref)+", "+qstart+", "+rstart+", "+maxEdits+")");} + if(query.length-qstart>ref.length-rstart){ + int x=alignForward(ref, query, rstart, qstart, maxEdits, exact); + int temp=lastQueryLoc; + lastQueryLoc=lastRefLoc; + lastRefLoc=temp; + if(verbose){ + System.out.println("Reversed."); + System.out.println("Final state: lastRow="+lastRow+", 
lastEdits="+lastEdits+", lastOffset="+lastOffset+ + ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n"); + } + return x; + } + int edits=0, row=0; + lastRow=-1; + lastEdits=0; + lastOffset=0; + + final int width=Tools.min(maxWidth, (maxEdits*2)+1); + final int halfWidth=width/2; + final boolean inexact=!exact; + + int qloc=qstart; + int rsloc=rstart-halfWidth; + final int xlines=query.length-qstart; + final int ylines=ref.length-rstart; + final int len=Tools.min(xlines, ylines); + if(verbose){System.err.println("xlines="+xlines+", ylines="+ylines+", len="+len);} + if(len<1){ + if(false){ + throw new RuntimeException("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length); + } + assert(false) : ("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length); + return 0; + } + + Arrays.fill(array1, big); + Arrays.fill(array2, big); + arrayCurrent=array1; + arrayPrev=array2; + + { + if(verbose){System.err.println("\nFirst row.");} + final byte q=query[qloc]; + final int colStart=Tools.max(0, rsloc); + final int colLimit=Tools.min(rsloc+width, ref.length); + edits=big; + int mloc=1+(colStart-rsloc); + if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);} +// assert(false) : mloc+", "+colStart+", "+rsloc; + for(int col=colStart; colmaxEdits); + if(verbose){System.err.println("alignForwardRC("+new String(query)+", "+new String(ref)+", "+qstart+", "+rstart+", "+maxEdits+")");} + if(qstart+1>ref.length-rstart){ + int x=alignReverseRC(ref, query, rstart, qstart, maxEdits, exact); + int temp=lastQueryLoc; + lastQueryLoc=lastRefLoc; + lastRefLoc=temp; + if(verbose){ + System.out.println("Reversed."); + System.out.println("Final state: lastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+ + ", 
lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n"); + } + return x; + } + int edits=0, row=0; + lastRow=-1; + lastEdits=0; + lastOffset=0; + + final int width=Tools.min(maxWidth, (maxEdits*2)+1); + final int halfWidth=width/2; + final boolean inexact=!exact; + + int qloc=qstart; + int rsloc=rstart-halfWidth; + final int xlines=qstart+1; + final int ylines=ref.length-rstart; + final int len=Tools.min(xlines, ylines); + if(verbose){System.err.println("xlines="+xlines+", ylines="+ylines+", len="+len);} + if(len<1){ + if(false){ + throw new RuntimeException("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length); + } + assert(false) : ("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length); + return 0; + } + + Arrays.fill(array1, big); + Arrays.fill(array2, big); + arrayCurrent=array1; + arrayPrev=array2; + + { + if(verbose){System.err.println("\nFirst row.");} + final byte q=AminoAcid.baseToComplementExtended[query[qloc]]; + final int colStart=Tools.max(0, rsloc); + final int colLimit=Tools.min(rsloc+width, ref.length); + edits=big; + int mloc=1+(colStart-rsloc); + if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);} + for(int col=colStart; colmaxEdits); + if(verbose){System.err.println("alignReverse("+new String(query)+", "+new String(ref)+", "+qstart+", "+rstart+", "+maxEdits+")");} + if(qstart>rstart){ + int x=alignReverse(ref, query, rstart, qstart, maxEdits, exact); + int temp=lastQueryLoc; + lastQueryLoc=lastRefLoc; + lastRefLoc=temp; + if(verbose){ + System.out.println("Reversed."); + System.out.println("Final state: lastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+ + ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? 
", query="+new String(query)+", ref="+new String(ref) : "")+"\n"); + } + return x; + } +// if(true){return big;} + int edits=0, row=0; + lastRow=-1; + lastEdits=0; + lastOffset=0; + + final int width=Tools.min(maxWidth, (maxEdits*2)+1); + final int halfWidth=width/2; + final boolean inexact=!exact; + + int qloc=qstart; + int rsloc=rstart-halfWidth; + final int xlines=qstart+1; + final int ylines=rstart+1; + final int len=Tools.min(xlines, ylines); + if(verbose){System.err.println("xlines="+xlines+", ylines="+ylines+", len="+len);} + if(len<1){ + if(false){ + throw new RuntimeException("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length); + } + assert(false) : ("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length); + return 0; + } + + Arrays.fill(array1, big); + Arrays.fill(array2, big); + arrayCurrent=array1; + arrayPrev=array2; + + { + if(verbose){System.err.println("\nFirst row.");} + final byte q=query[qloc]; + final int colStart=Tools.max(0, rsloc); + final int colLimit=Tools.min(rsloc+width, ref.length); + edits=big; + int mloc=1+width-(colLimit-rsloc); + if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);} + for(int col=colLimit-1; col>=colStart; mloc++, col--){ + if(verbose){System.err.println("col="+col+", mloc="+mloc);} + final byte r=ref[col]; + final int score=(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 
0 : 1); + arrayCurrent[mloc]=score; + edits=Tools.min(edits, score); + if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; prev=0; score="+score+"; scores = "+Arrays.toString(arrayCurrent));} + } + row++; qloc--; rsloc--; + } + if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);} + + for(row=1; rowmaxEdits); + if(verbose){System.err.println("alignReverseRC("+new String(query)+", "+new String(ref)+", "+qstart+", "+rstart+", "+maxEdits+")");} + if(query.length-qstart>rstart+1){ + int x=alignForwardRC(ref, query, rstart, qstart, maxEdits, exact); + int temp=lastQueryLoc; + lastQueryLoc=lastRefLoc; + lastRefLoc=temp; + if(verbose){ + System.out.println("Reversed."); + System.out.println("Final state: lastRow="+lastRow+", lastEdits="+lastEdits+", lastOffset="+lastOffset+ + ", lastQueryLoc="+lastQueryLoc+", lastRefLoc="+lastRefLoc+(query.length<30 ? ", query="+new String(query)+", ref="+new String(ref) : "")+"\n"); + } + return x; + } + int edits=0, row=0; + lastRow=-1; + lastEdits=0; + lastOffset=0; + + final int width=Tools.min(maxWidth, (maxEdits*2)+1); + final int halfWidth=width/2; + final boolean inexact=!exact; + + int qloc=qstart; + int rsloc=rstart-halfWidth; + final int xlines=query.length-qstart; + final int ylines=rstart+1; + final int len=Tools.min(xlines, ylines); + if(verbose){System.err.println("xlines="+xlines+", ylines="+ylines+", len="+len);} + if(len<1){ + if(false){ + throw new RuntimeException("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length); + } + assert(false) : ("No overlap: qstart="+qstart+", rstart="+rstart+", qlen="+query.length+", rlen="+ref.length); + return 0; + } + + Arrays.fill(array1, big); + Arrays.fill(array2, big); + arrayCurrent=array1; + arrayPrev=array2; + + { + if(verbose){System.err.println("\nFirst row.");} + final byte q=AminoAcid.baseToComplementExtended[query[qloc]]; + final int colStart=Tools.max(0, rsloc); + final int 
colLimit=Tools.min(rsloc+width, ref.length); + edits=big; + int mloc=1+width-(colLimit-rsloc); + if(verbose){System.err.println("q="+(char)q+", qloc="+qloc+", rsloc="+rsloc+", colStart="+colStart+", colLimit="+colLimit+", mloc="+mloc);} + for(int col=colLimit-1; col>=colStart; mloc++, col--){ + if(verbose){System.err.println("col="+col+", mloc="+mloc);} + final byte r=ref[col]; + final int score=(q==r || (inexact && (!AminoAcid.isFullyDefined(q) || !AminoAcid.isFullyDefined(r))) ? 0 : 1); + arrayCurrent[mloc]=score; + edits=Tools.min(edits, score); + if(verbose){System.err.println("Comparing "+(char)q+" to "+(char)r+"; prev=0; score="+score+"; scores = "+Arrays.toString(arrayCurrent));} + } + row++; qloc++; rsloc--; + } + if(penalizeOffCenter){edits=penalizeOffCenter(arrayCurrent, halfWidth);} + + for(row=1; row set; + int added=0, overwritten=0; + if(black){ + if(blacklist==null){blacklist=new HashSet(4001);} + set=blacklist; + }else{ + if(whitelist==null){whitelist=new HashSet(4001);} + set=whitelist; + } + TextFile tf=new TextFile(fname, false, false); + String line=tf.nextLine(); + if(line==null){return 0;} + final boolean fasta=(line.charAt(0)=='>'); + System.err.println("Detected "+(black ? "black" : "white")+"list file "+fname+" as "+(fasta ? "" : "non-")+"fasta-formatted."); + while(line!=null){ + String key=null; + if(fasta){ + if(line.charAt(0)=='>'){key=new String(line.substring(1));} + }else{ + key=line; + } + if(key!=null){ + boolean b=set.add(key); + added++; + if(!b){ + if(overwritten==0){ + System.err.println("Duplicate "+(black ? 
"black" : "white")+"list key "+key); + System.err.println("Subsequent duplicates from this file will not be mentioned."); + } + overwritten++; + } + } + line=tf.nextLine(); + } + if(overwritten>0){ + System.err.println("Added "+overwritten+" duplicate keys."); + } + return added-overwritten; + } + + public static boolean hasBlacklist(){return blacklist!=null && !blacklist.isEmpty();} + public static boolean hasWhitelist(){return whitelist!=null && !whitelist.isEmpty();} + + public static void clearBlacklist(){blacklist=null;} + public static void clearWhitelist(){whitelist=null;} + + private static HashSet blacklist=null; + private static HashSet whitelist=null; + +} diff --git a/current/align2/Block.java b/current/align2/Block.java new file mode 100755 index 0000000..0494093 --- /dev/null +++ b/current/align2/Block.java @@ -0,0 +1,171 @@ +package align2; + +import java.io.File; +import java.io.Serializable; +import java.util.Arrays; + +import fileIO.LoadThread; +import fileIO.ReadWrite; + +/** + * @author Brian Bushnell + * @date Dec 23, 2012 + * + */ +public class Block implements Serializable{ + + /** + * + */ + private static final long serialVersionUID = -1638122096023589384L; + + public Block(int numSites_, int numStarts_){ + numSites=numSites_; + numStarts=numStarts_; + sites=new int[numSites]; + starts=new int[numStarts+1]; + assert(Integer.bitCount(numStarts)==1 && Integer.bitCount(starts.length)==2) : numStarts; + } + + public Block(int[] sites_, int[] starts_){ + sites=sites_; + starts=starts_; + numSites=sites.length; + numStarts=starts.length-1; + assert(Integer.bitCount(numStarts)==1 && Integer.bitCount(starts.length)==2) : numStarts; + } + + /** For legacy support */ + public int[] getHitList(int key){ + int len=length(key); + if(len==0){return null;} + int start=starts[key]; + int[] r=Arrays.copyOfRange(sites, start, start+len); + return r; + } + + /** For legacy support */ + public int[] getHitList(int start, int stop){ + int len=length(start, 
stop); + if(len==0){return null;} + assert(len>0) : len+", "+start+", "+stop; + int[] r=Arrays.copyOfRange(sites, start, start+len); + return r; + } + + /** For legacy support */ + public int[][] getHitLists(int[] start, int[] stop){ + int[][] r=new int[start.length][]; + for(int i=0; i0; i--){ + x[i]=x[i]-x[i-1]; + } + } + + private static void decompress(int[] x){ + int sum=x[0]; + for(int i=1; i lta=LoadThread.load(fname, int[].class); + b=ReadWrite.read(int[].class, fname2); + lta.waitForThisToFinish(); + a=lta.output; + } +// { +// LoadThread lta=LoadThread.load(fname, int[].class); +// LoadThread ltb=LoadThread.load(fname2, int[].class); +// lta.waitForThisToFinish(); +// ltb.waitForThisToFinish(); +// a=lta.output; +// b=ltb.output; +// } + +// int[] a=ReadWrite.read(int[].class, fname); +// int[] b=ReadWrite.read(int[].class, fname2); + + assert(a!=null && b!=null) : a+", "+b; + if(compress){ + int sum=b[0]; + for(int i=1; i=max+1); + + int pound=pattern.lastIndexOf('#'); + String a=pattern.substring(0, pound); + String b=pattern.substring(pound+1); + + ChromLoadThread[] clta=new ChromLoadThread[max]; + for(int i=min; i=min){ //Load last element in this thread instead of making a new thread. 
+ increment(1); + r[max]=ChromosomeArray.read(a+max+b); + increment(-1); + } + + for(int i=min; i0){ + try { + lock.wait(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + lock.notify(); + } + } + } + } + + return r; + } + + @Override + public void run(){ + try { + array[id]=ChromosomeArray.read(fname); + } catch (Exception e) { + increment(-1); + throw new RuntimeException(e); + } + increment(-1); + } + + private static final int increment(int i){ + int r; + synchronized(lock){ + if(i<=0){ + lock[0]+=i; + lock.notify(); + }else{ + while(lock[0]>=MAX_CONCURRENT){ + try { + lock.wait(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + } + r=lock[0]; + } + return r; + } + + private final int id; + private final String fname; + private final ChromosomeArray[] array; + + public static final int[] lock=new int[1]; + public static int MAX_CONCURRENT=Shared.THREADS; + +} diff --git a/current/align2/CompareSamFiles.java b/current/align2/CompareSamFiles.java new file mode 100755 index 0000000..fba3ff8 --- /dev/null +++ b/current/align2/CompareSamFiles.java @@ -0,0 +1,382 @@ +package align2; + +import java.io.File; +import java.util.Arrays; +import java.util.BitSet; + +import stream.Read; +import stream.SamLine; +import stream.SiteScore; + +import dna.Data; + +import fileIO.TextFile; + +/** Generate a file containing reads mapped correctly in one file and incorrectly in another file. */ +public class CompareSamFiles { + + + public static void main(String[] args){ + + String in1=null; + String in2=null; + long reads=-1; + + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} +// System.err.println("Processing "+args[i]); + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("path") || a.equals("root")){ + Data.setPath(b); + }else if(a.equals("in") || a.equals("in1")){ + in1=b; + }else if(a.equals("in2")){ + in2=b; + }else if(a.equals("parsecustom")){ + parsecustom=Tools.parseBoolean(b); + }else if(a.equals("thresh")){ + THRESH2=Integer.parseInt(b); + }else if(a.equals("printerr")){ + printerr=Tools.parseBoolean(b); + }else if(a.equals("ssaha2") || a.equals("subtractleadingclip")){ + SamLine.SUBTRACT_LEADING_SOFT_CLIP=Tools.parseBoolean(b); + }else if(a.equals("blasr")){ + BLASR=Tools.parseBoolean(b); + }else if(a.equals("q") || a.equals("quality") || a.startsWith("minq")){ + minQuality=Integer.parseInt(b); + }else if(in1==null && i==0 && args[i].indexOf('=')<0 && (a.startsWith("stdin") || new File(args[i]).exists())){ + in1=args[i]; + }else if(in2==null && i==1 && args[i].indexOf('=')<0 && (a.startsWith("stdin") || new File(args[i]).exists())){ + in2=args[i]; + }else if(a.equals("reads")){ + reads=Long.parseLong(b); + }else if(i==2 && args[i].indexOf('=')<0 && Character.isDigit(a.charAt(0))){ + reads=Long.parseLong(a); + } + } + + assert(in1!=null) : args[0]+".exists() ? "+new File(args[0]).exists(); +// assert(in2!=null) : args[1]+".exists() ? "+new File(args[1]).exists(); + + if(reads<1){ +// assert(false) : "Number of expected reads was not specified. 
Please add a parameter reads= or disable assertions."; + reads=100000; + System.err.println("Warning - number of expected reads was not specified."); + } + + TextFile tf1=new TextFile(in1, false, false); + TextFile tf2=null; + if(in2!=null){tf2=new TextFile(in2, false, false);} + + BitSet truePos1=new BitSet((int)reads); + BitSet falsePos1=new BitSet((int)reads); + BitSet truePos2=new BitSet((int)reads); + BitSet falsePos2=new BitSet((int)reads); + + String s=null; + + TextFile tf; + { + tf=tf1; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + char c=s.charAt(0); + if(c!='@'/* && c!=' ' && c!='\t'*/){ + SamLine sl=new SamLine(s); + if(sl.primary()){ + Read r=sl.toRead(parsecustom); + if(parsecustom && r.originalSite==null){ + assert(false); + System.err.println("Turned off custom parsing."); + parsecustom=false; + } + //System.out.println(r); + int type=type(r, sl); + int id=(int)r.numericID; + if(type==2){truePos1.set(id);} + else if(type>2){falsePos1.set(id);} + } + } + } + tf.close(); + } + if(tf2!=null){ + tf=tf2; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + char c=s.charAt(0); + if(c!='@'/* && c!=' ' && c!='\t'*/){ + SamLine sl=new SamLine(s); + if(sl.primary()){ + Read r=sl.toRead(parsecustom); + if(parsecustom && r.originalSite==null){ + assert(false); + System.err.println("Turned off custom parsing."); + parsecustom=false; + } + //System.out.println(r); + int type=type(r, sl); + int id=(int)r.numericID; + if(type==2){truePos2.set(id);} + else if(type>2){falsePos2.set(id);} + } + } + } + tf.close(); + } + + + + BitSet added=new BitSet((int)reads); + { + tf=tf1; + tf.reset(); + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + char c=s.charAt(0); + if(c!='@'/* && c!=' ' && c!='\t'*/){ + SamLine sl=new SamLine(s); +// assert(false) : s+", "+truePos1.cardinality()+", "+truePos2.cardinality()+", "+falsePos1.cardinality()+", "+falsePos2.cardinality()+", "; + if(sl.primary()){ + Read r=sl.toRead(parsecustom); + int id=(int)r.numericID; + 
if(!added.get(id)){ +// if(truePos1.get(id)!=truePos2.get(id) || falsePos1.get(id)!=falsePos2.get(id)){ +// System.out.println(s); +// added.set(id); +// } +// if(falsePos1.get(id) && truePos2.get(id)){ +// System.out.println(s); +// added.set(id); +// } + if(falsePos1.get(id) && !falsePos2.get(id)){ + System.out.println(s); + added.set(id); + } + } + } + } + } + tf.close(); + } + if(tf2!=null){ + tf=tf2; + tf.reset(); + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + char c=s.charAt(0); + if(c!='@'/* && c!=' ' && c!='\t'*/){ + SamLine sl=new SamLine(s); + if(sl.primary()){ + Read r=sl.toRead(parsecustom); + int id=(int)r.numericID; + if(!added.get(id)){ +// if(truePos1.get(id)!=truePos2.get(id) || falsePos1.get(id)!=falsePos2.get(id)){ +// System.out.println(s); +// added.set(id); +// } +// if(falsePos2.get(id) && truePos1.get(id)){ +// System.out.println(s); +// added.set(id); +// } + if(falsePos2.get(id) && !falsePos1.get(id)){ + System.out.println(s); + added.set(id); + } + } + } + } + } + tf.close(); + } + } + + + public static void calcStatistics1(final Read r, SamLine sl){ + + int THRESH=0; + primary++; + + if(r.discarded()/* || r.mapScore==0*/){ + discarded++; + unmapped++; + }else if(r.ambiguous()){ +// assert(r.mapped()) : "\n"+r+"\n"+sl+"\n"; + if(r.mapped()){mapped++;} + ambiguous++; + }else if(r.mapScore<1){ + unmapped++; + }else if(r.mapScore<=minQuality){ + if(r.mapped()){mapped++;} + ambiguous++; + }else{ + if(!r.mapped()){ + unmapped++; + }else{ + mapped++; + mappedRetained++; + + if(parsecustom){ + SiteScore os=r.originalSite; + assert(os!=null); + if(os!=null){ + int trueChrom=os.chrom; + byte trueStrand=os.strand; + int trueStart=os.start; + int trueStop=os.stop; + SiteScore ss=new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0, 0); + byte[] originalContig=sl.originalContig(); + if(BLASR){ + originalContig=(originalContig==null || Tools.indexOf(originalContig, (byte)'/')<0 ? 
originalContig : + Arrays.copyOfRange(originalContig, 0, Tools.lastIndexOf(originalContig, (byte)'/'))); + } + int cstart=sl.originalContigStart(); + + boolean strict=isCorrectHit(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH, originalContig, sl.rname(), cstart); + boolean loose=isCorrectHitLoose(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH+THRESH2, originalContig, sl.rname(), cstart); + + // if(!strict){ + // System.out.println(ss+", "+new String(originalContig)+", "+new String(sl.rname())); + // assert(false); + // } + + // System.out.println("loose = "+loose+" for "+r.toText()); + + if(loose){ + // System.err.println("TPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t" + // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop); + truePositiveLoose++; + }else{ + // System.err.println("FPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t" + // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop); + falsePositiveLoose++; + } + + if(strict){ + // System.err.println("TPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop); + truePositiveStrict++; + }else{ + // System.err.println("FPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop); + falsePositiveStrict++; + } + } + } + } + } + } + + + + public static int type(final Read r, SamLine sl){ + + int THRESH=0; + primary++; + + if(r.discarded()/* || r.mapScore==0*/){ + return 0; + }else if(r.ambiguous()){ + return 1; + }else if(r.mapScore<1){ + return 0; + }else if(r.mapScore<=minQuality){ + return 1; + }else{ + if(!r.mapped()){ + return 0; + }else{ + + if(parsecustom){ + SiteScore os=r.originalSite; + assert(os!=null); + if(os!=null){ + int trueChrom=os.chrom; + byte trueStrand=os.strand; + int trueStart=os.start; + int trueStop=os.stop; + SiteScore ss=new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0, 0); + byte[] originalContig=sl.originalContig(); + if(BLASR){ + originalContig=(originalContig==null || Tools.indexOf(originalContig, (byte)'/')<0 ? 
originalContig : + Arrays.copyOfRange(originalContig, 0, Tools.lastIndexOf(originalContig, (byte)'/'))); + } + int cstart=sl.originalContigStart(); + + boolean strict=isCorrectHit(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH, originalContig, sl.rname(), cstart); + boolean loose=isCorrectHitLoose(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH+THRESH2, originalContig, sl.rname(), cstart); + + if(strict){return 2;} + if(loose){return 3;} + return 4; + } + } + } + } + return 0; + } + + + + public static boolean isCorrectHit(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, + byte[] originalContig, byte[] contig, int cstart){ + if(ss.strand!=trueStrand){return false;} + if(originalContig!=null){ + if(!Arrays.equals(originalContig, contig)){return false;} + }else{ + if(ss.chrom!=trueChrom){return false;} + } + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + int cstop=cstart+trueStop-trueStart; +// return (absdif(ss.start, trueStart)<=thresh && absdif(ss.stop, trueStop)<=thresh); + return (absdif(ss.start, cstart)<=thresh && absdif(ss.stop, cstop)<=thresh); + } + + + public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, + byte[] originalContig, byte[] contig, int cstart){ + if(ss.strand!=trueStrand){return false;} + if(originalContig!=null){ + if(!Arrays.equals(originalContig, contig)){return false;} + }else{ + if(ss.chrom!=trueChrom){return false;} + } + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + int cstop=cstart+trueStop-trueStart; +// return (absdif(ss.start, trueStart)<=thresh || absdif(ss.stop, trueStop)<=thresh); + return (absdif(ss.start, cstart)<=thresh || absdif(ss.stop, cstop)<=thresh); + } + + private static final int absdif(int a, int 
b){ + return a>b ? a-b : b-a; + } + + public static int truePositiveStrict=0; + public static int falsePositiveStrict=0; + + public static int truePositiveLoose=0; + public static int falsePositiveLoose=0; + + public static int mapped=0; + public static int mappedRetained=0; + public static int unmapped=0; + + public static int discarded=0; + public static int ambiguous=0; + + public static long lines=0; + public static long primary=0; + public static long secondary=0; + + public static int minQuality=3; + + public static boolean parsecustom=true; + public static boolean printerr=false; + + public static int THRESH2=20; + public static boolean BLASR=false; + +} diff --git a/current/align2/CompressString.java b/current/align2/CompressString.java new file mode 100755 index 0000000..37f1d7a --- /dev/null +++ b/current/align2/CompressString.java @@ -0,0 +1,269 @@ +package align2; + +import dna.ChromosomeArray; +import dna.Data; + +public class CompressString { + + public static void main(String[] args){ + + String s; + + s=compressRepeats(args[0].getBytes(), 1); + s=compress(args[0]); + s=compressRepeatsUltra(args[0].getBytes(), 1, 3, null); + System.out.println(args[0]+"\n"+s); + + System.exit(0); + + ChromosomeArray cha=Data.getChromosome(1); + byte[] bytes=cha.array; + + int letters=0; + for(int i=0; i0 && log<=31); + + + //Append + for(int i=1; i=minPeriod; x--){ +// int temp=countRepeats(array, base, x); +// if(temp>1){ +// repeats=temp; +// period=x; +// break; +// } +// } + for(int x=minPeriod; x<=maxPeriod; x++){ + int temp=countRepeats(array, base, x); + if(temp>1){ + repeats=temp; + period=x; + break; + } + } + int occurances=repeats+1; + +// System.out.println("base = "+base+"\t, repeats = "+repeats+"\t, period = "+period); + + if(repeats==0){ + //Advance pointer by 1 + sb.append((char)array[base]); + if(list!=null){list.add(base);} + }else if(repeats==1){ + //Still advance pointer by 1 + sb.append((char)array[base]); + if(list!=null){list.add(base);} + } 
+// else if(repeats==2){ +// for(int j=0; j0 && log<=31); + + + //Append + for(int i=0; i1){ + repeats=temp; + period=x; + break; + } + } +// System.err.println(repeats); + if(repeats==0){ + //Advance pointer by 1 + sb.append((char)array[base]); + if(list!=null){list.add(base);} + }else if(repeats==1){ + //Still advance pointer by 1 + sb.append((char)array[base]); + if(list!=null){list.add(base);} + +// System.err.println(base); + base=base+(period*(repeats))-1; +// System.err.println(base); + } +// else if(repeats==2){ +// for(int j=0; j0){sb.append('~');} + sb.append(gaps[i]); + } + return sb.toString(); + } + + public static int[] fixGaps(int a, int b, int[] gaps, int minGap){ +// System.err.println("fixGaps Input: "+a+", "+b+", "+Arrays.toString(gaps)+", "+minGap); +// assert(false) : "fixGaps called!"; + if(verbose){System.err.println("fixGaps a: "+Arrays.toString(gaps));} + assert(b>a); + if(gaps==null){return null;} + assert(gaps.length>=4); + if(verbose){System.err.println("fixGaps b: "+Arrays.toString(gaps));} + + int g0=gaps[0]; + int gN=gaps[gaps.length-1]; + if(a==g0 && b==gN){return gaps;} + + if(!Tools.overlap(a, b, g0, gN)){return null;} + if(verbose){System.err.println("fixGaps c: "+Arrays.toString(gaps));} + + gaps[0]=a; + gaps[gaps.length-1]=b; + if(verbose){System.err.println("fixGaps d: "+Arrays.toString(gaps));} + + int remove=0; + for(int i=0; i0) : a+", "+b+", "+Arrays.toString(gaps); + return total; + } + + /** TODO: Verify. */ + public static final int calcBufferNeeded(int a, int b, int[] gaps){ + int total=b-a+1; + if(gaps==null){return total;} + for(int i=2; i0) : a+", "+b+", "+Arrays.toString(gaps); + return total; + } + + /** TODO: Verify. 
*/ + public static int calcGapLen(int a, int b){ + assert(b>a); + int gap=b-a; + if(gapa); + int gap=b-a-Shared.GAPBUFFER2; + return Tools.max(0, gap/Shared.GAPLEN); + } + + public static final int[] fixGaps2(int a, int b, int[] gaps, int minGap){ + if(verbose){System.err.println("Input: "+a+", "+b+", "+Arrays.toString(gaps)+", "+minGap);} + ArrayList list=toList(gaps); + if(verbose){System.err.println("Before fixing: "+list);} + assert(list.size()>1); + for(int i=1; i"); + System.err.println(list.get(i-1)); + System.err.println(list.get(i)); + } + + } + if(verbose){System.err.println("After fixing: "+list);} + Tools.condenseStrict(list); + if(verbose){System.err.println("After condensing: "+list);} + + if(list.size()<2){return null;} + + int[] gaps2; + if(gaps.length==list.size()*2){ + gaps2=gaps; + }else{ + gaps2=new int[list.size()*2]; + } + for(int i=0, j=0; i toList(int[] gaps){ + ArrayList list=new ArrayList(gaps.length/2); + for(int i=0; i{ + + public Range(int a_, int b_){ + assert(b_>=a_); + a=a_; + b=b_; + } + + public int compareTo(Range r){ + int x; + x=a-r.a; + if(x!=0){return x;} + return b-r.b; + } + + public String toString(){ + return "("+a+","+b+")"; + } + + public boolean equals(Object other){return equals((Range)other);} + public boolean equals(Range other){return compareTo(other)==0;} + + public int a; + public int b; + } + + public static boolean verbose=false; + +} diff --git a/current/align2/GradeSamFile.java b/current/align2/GradeSamFile.java new file mode 100755 index 0000000..26a89ec --- /dev/null +++ b/current/align2/GradeSamFile.java @@ -0,0 +1,318 @@ +package align2; + +import java.io.File; +import java.util.Arrays; +import java.util.BitSet; + +import stream.Read; +import stream.SamLine; +import stream.SiteScore; + +import fileIO.TextFile; + +public class GradeSamFile { + + + public static void main(String[] args){ + + String in=null; + long reads=-1; + + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in") || a.equals("in1")){ + in=b; + }else if(a.equals("reads")){ + reads=Long.parseLong(b); + }else if(a.equals("parsecustom")){ + parsecustom=Tools.parseBoolean(b); + }else if(a.equals("thresh")){ + THRESH2=Integer.parseInt(b); + }else if(a.equals("printerr")){ + printerr=Tools.parseBoolean(b); + }else if(a.equals("ssaha2") || a.equals("subtractleadingclip")){ + SamLine.SUBTRACT_LEADING_SOFT_CLIP=Tools.parseBoolean(b); + }else if(a.equals("blasr")){ + BLASR=Tools.parseBoolean(b); + }else if(a.equals("q") || a.equals("quality") || a.startsWith("minq")){ + minQuality=Integer.parseInt(b); + }else if(a.equals("bitset")){ + USE_BITSET=Tools.parseBoolean(b); + }else if(i==0 && args[i].indexOf('=')<0 && (a.startsWith("stdin") || new File(args[0]).exists())){ + in=args[0]; + }else if(i==1 && args[i].indexOf('=')<0 && Character.isDigit(a.charAt(0))){ + reads=Long.parseLong(a); + } + } + + if(USE_BITSET){ + int x=400000; + if(reads>0 && reads<=Integer.MAX_VALUE){x=(int)reads;} + try { + seen=new BitSet(x); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + System.out.println("Did not have enough memory to allocate bitset; duplicate mappings will not be detected."); + } + } + + assert(in!=null) : args[0]+".exists() ? "+new File(args[0]).exists(); + + if(reads<1){ + assert(false) : "Number of expected reads was not specified. 
Please add a parameter reads= or disable assertions."; + System.err.println("Warning - number of expected reads was not specified."); + } + + TextFile tf=new TextFile(in, false, false); + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + char c=s.charAt(0); +// System.out.println(s); + if(c!='@'/* && c!=' ' && c!='\t'*/){ + SamLine sl=new SamLine(s); + lines++; + int id=((((int)sl.parseNumericId())<<1)|sl.pairnum()); +// System.out.println(sl.parseNumericId()+", "+sl.pairnum()+", "+id+""); +// if(id%500==10){assert(false);} + if(sl.primary() && (!parsecustom || seen==null || !seen.get(id))){ + Read r=sl.toRead(parsecustom); + if(seen!=null){seen.set(id);} + if(parsecustom && r.originalSite==null){ + assert(false); + System.err.println("Turned off custom parsing."); + parsecustom=false; + } + //System.out.println(r); + calcStatistics1(r, sl); + }else{ + secondary++; + } + } + } + if(reads<-1){reads=primary;} + + double tmult=100d/reads; + + double mappedB=mapped*tmult; + double retainedB=mappedRetained*tmult; + double truePositiveStrictB=truePositiveStrict*tmult; + double falsePositiveStrictB=falsePositiveStrict*tmult; + double truePositiveLooseB=truePositiveLoose*tmult; + double falsePositiveLooseB=falsePositiveLoose*tmult; + double falseNegativeB=(reads-mapped)*tmult; + double discardedB=discarded*tmult; + double ambiguousB=ambiguous*tmult; + + System.out.println(); + System.out.println("Mapping Statistics for "+args[0]+":"); + System.out.println("primary alignments: \t"+primary+" found of "+reads+" expected"); + System.out.println("secondary alignments: \t"+secondary+" found"); + System.out.println(String.format("mapped: \t"+(mappedB<10?" ":"")+"%.3f", mappedB)+"%"); + System.out.println(String.format("retained: \t"+(retainedB<10?" ":"")+"%.3f", retainedB)+"%"); + System.out.println(String.format("discarded: \t"+(discardedB<10?" ":"")+"%.3f", discardedB)+"%"); + System.out.println(String.format("ambiguous: \t"+(ambiguousB<10?" 
":"")+"%.3f", ambiguousB)+"%"); + if(parsecustom){ + System.out.println(); + System.out.println("Strict correctness (both ends exactly correct):"); + System.out.println(String.format("true positive: \t"+(truePositiveStrictB<10?" ":"")+"%.3f", truePositiveStrictB)+"%"); + System.out.println(String.format("false positive: \t"+(falsePositiveStrictB<10?" ":"")+"%.3f", falsePositiveStrictB)+"%"); + System.out.println(); + System.out.println("Loose correctness (one end approximately correct):"); + System.out.println(String.format("true positive: \t"+(truePositiveLooseB<10?" ":"")+"%.3f", truePositiveLooseB)+"%"); + System.out.println(String.format("false positive: \t"+(falsePositiveLooseB<10?" ":"")+"%.3f", falsePositiveLooseB)+"%"); + } + System.out.println(); + System.out.println(String.format("false negative: \t"+(falseNegativeB<10?" ":"")+"%.3f", falseNegativeB)+"%"); + + if(printerr){ + System.err.println(); + System.err.println("Mapping Statistics for "+args[0]+":"); + System.err.println("primary alignments: \t"+primary+" found of "+reads+" expected"); + System.err.println("secondary alignments: \t"+secondary+" found"); + System.err.println(String.format("mapped: \t"+(mappedB<10?" ":"")+"%.3f", mappedB)+"%"); + System.err.println(String.format("retained: \t"+(retainedB<10?" ":"")+"%.3f", retainedB)+"%"); + System.err.println(String.format("discarded: \t"+(discardedB<10?" ":"")+"%.3f", discardedB)+"%"); + System.err.println(String.format("ambiguous: \t"+(ambiguousB<10?" ":"")+"%.3f", ambiguousB)+"%"); + if(parsecustom){ + System.err.println(); + System.err.println("Strict correctness (both ends exactly correct):"); + System.err.println(String.format("true positive: \t"+(truePositiveStrictB<10?" ":"")+"%.3f", truePositiveStrictB)+"%"); + System.err.println(String.format("false positive: \t"+(falsePositiveStrictB<10?" 
":"")+"%.3f", falsePositiveStrictB)+"%"); + System.err.println(); + System.err.println("Loose correctness (one end approximately correct):"); + System.err.println(String.format("true positive: \t"+(truePositiveLooseB<10?" ":"")+"%.3f", truePositiveLooseB)+"%"); + System.err.println(String.format("false positive: \t"+(falsePositiveLooseB<10?" ":"")+"%.3f", falsePositiveLooseB)+"%"); + } + System.err.println(); + System.err.println(String.format("false negative: \t"+(falseNegativeB<10?" ":"")+"%.3f", falseNegativeB)+"%"); + } + + + } + + + public static void calcStatistics1(final Read r, SamLine sl){ + + int THRESH=0; + primary++; + + if(r.discarded()/* || r.mapScore==0*/){ + discarded++; + unmapped++; + }else if(r.ambiguous()){ +// assert(r.mapped()) : "\n"+r+"\n"+sl+"\n"; + if(r.mapped()){mapped++;} + ambiguous++; + }else if(r.mapScore<1){ + unmapped++; + }else if(r.mapScore<=minQuality){ + if(r.mapped()){mapped++;} + ambiguous++; + }else{ + if(!r.mapped()){ + unmapped++; + }else{ + mapped++; + mappedRetained++; + + if(parsecustom){ + SiteScore os=r.originalSite; +// System.out.println("A1: "+os); + assert(os!=null); + if(os!=null){ + final int trueChrom=os.chrom; + final byte trueStrand=os.strand; + final int trueStart=os.start; + final int trueStop=os.stop; +// System.err.println(); +// System.err.println(sl); +// System.err.println(); +// System.err.println(r); +// System.err.println(); + SiteScore ss=new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0, 0); + byte[] originalContig=sl.originalContig(); + if(BLASR){ + originalContig=(originalContig==null || Tools.indexOf(originalContig, (byte)'/')<0 ? 
originalContig : + Arrays.copyOfRange(originalContig, 0, Tools.lastIndexOf(originalContig, (byte)'/'))); + } + int cstart=sl.originalContigStart(); + +// System.out.println("A2: "+trueStart+", "+cstart); + boolean strict=isCorrectHit(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH, originalContig, sl.rname(), cstart, r); + boolean loose=isCorrectHitLoose(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH+THRESH2, originalContig, sl.rname(), cstart); + + // if(!strict){ + // System.out.println(ss+", "+new String(originalContig)+", "+new String(sl.rname())); + // assert(false); + // } + + // System.out.println("loose = "+loose+" for "+r.toText()); + + if(loose){ + // System.err.println("TPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t" + // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop); + truePositiveLoose++; + }else{ + // System.err.println("FPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t" + // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop); + falsePositiveLoose++; + } + + if(strict){ + // System.err.println("TPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop); + truePositiveStrict++; + }else{ + // System.err.println("FPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop); + falsePositiveStrict++; + } + } + } + } + } + + } + + + + public static boolean isCorrectHit(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, + byte[] originalContig, byte[] contig, int cstart, Read r){ + + final int cstop=cstart+trueStop-trueStart; + +// System.out.println("\n"+r.id); +// System.out.println(" \tstrand"+/*"\tchrom"+*/"\tstart\tstop\t");//+"scaf"); +// System.out.println("Original:\t"+trueStrand+/*"\t"+trueChrom+*/"\t"+trueStart+"\t"+trueStop+"\t");//+new String(originalContig)); +// System.out.println("Mapped: \t"+ss.strand+/*"\t"+ss.chrom+*/"\t"+ss.start+"\t"+ss.stop+"\t");//+new String(contig)); +// System.out.println("cs: 
\t"+trueStrand+/*"\t"+trueChrom+*/"\t"+cstart+"\t"+cstop+"\t");//+new String(contig)); + + if(ss.strand!=trueStrand){return false;} + if(originalContig!=null){ + if(!Arrays.equals(originalContig, contig)){return false;} + }else{ + if(ss.chrom!=trueChrom){return false;} + } + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; +// return (absdif(ss.start, trueStart)<=thresh && absdif(ss.stop, trueStop)<=thresh); + return (absdif(ss.start, cstart)<=thresh && absdif(ss.stop, cstop)<=thresh); + } + + + public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, + byte[] originalContig, byte[] contig, int cstart){ + if(ss.strand!=trueStrand){return false;} + if(originalContig!=null){ + if(!Arrays.equals(originalContig, contig)){return false;} + }else{ + if(ss.chrom!=trueChrom){return false;} + } + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + int cstop=cstart+trueStop-trueStart; +// return (absdif(ss.start, trueStart)<=thresh || absdif(ss.stop, trueStop)<=thresh); + return (absdif(ss.start, cstart)<=thresh || absdif(ss.stop, cstop)<=thresh); + } + + private static final int absdif(int a, int b){ + return a>b ? 
/**
 * Array-backed binary min-heap with a capacity fixed at construction.
 * Elements live in slots 1..size of the backing array; slot 0 is unused.
 * The smallest element according to {@code compareTo} is at slot 1.
 *
 * NOTE(review): the generic bound below is the most permissive one the code needs - confirm.
 */
public final class Heap<T extends Comparable<? super T>> {

	/**
	 * @param maxSize number of elements the heap is sized for
	 */
	public Heap(int maxSize){

		int len=maxSize+1;
		if((len&1)==1){len++;} //Array size is always even, so a right-child slot always exists.

		CAPACITY=maxSize;
		// Safe: the array is only ever populated through add(T).
		@SuppressWarnings("unchecked")
		final T[] storage=(T[])new Comparable[len];
		array=storage;
	}

	/**
	 * Inserts an element.
	 * @return always true
	 * @throws ArrayIndexOutOfBoundsException if the backing array is full; the heap is left unchanged
	 */
	public boolean add(T t){
		assert(size==0 || array[size]!=null);
		// Check before touching size, so a rejected add cannot corrupt the heap.
		if(size+1>=array.length){
			throw new ArrayIndexOutOfBoundsException("Heap is full: size="+size+", capacity="+CAPACITY);
		}
		size++;
		array[size]=t;
		percDown(size);
		return true;
	}

	/** @return the smallest element without removing it, or null if the heap is empty */
	public T peek(){
		if(size==0){return null;}
		return array[1];
	}

	/** @return the smallest element after removing it, or null if the heap is empty */
	public T poll(){
		if(size==0){return null;}
		T t=array[1];
		// Move the last element to the root and restore heap order.
		array[1]=array[size];
		array[size]=null;
		size--;
		if(size>0){percUp(1);}
		return t;
	}

	/** Moves the element at loc toward the root while it is smaller than its parent. */
	private void percDown(int loc){
		assert(loc>0);
		while(loc>1){
			final int next=loc/2;
			final T a=array[loc];
			final T b=array[next];
			assert(a!=b);
			if(a.compareTo(b)>=0){return;}
			array[next]=a;
			array[loc]=b;
			loc=next;
		}
	}

	/** Moves the element at loc away from the root while it is larger than its smaller child. */
	private void percUp(int loc){
		assert(loc>0 && loc<=size) : loc+", "+size;
		while(true){
			final int next1=loc*2;
			final int next2=next1+1;
			if(next1>size){return;}
			final T a=array[loc];
			final T b=array[next1];
			final T c=array[next2]; //null when there is no right child
			assert(a!=b);
			assert(b!=c);
			assert(b!=null);
			final int target=(c==null || b.compareTo(c)<1) ? next1 : next2;
			final T child=array[target];
			if(a.compareTo(child)<=0){return;}
			array[target]=a;
			array[loc]=child;
			loc=target;
		}
	}

	public boolean isEmpty(){
		return size==0;
	}

	/** Removes all elements. */
	public void clear(){
		for(int i=1; i<=size; i++){array[i]=null;}
		size=0;
	}

	public int size(){
		return size;
	}

	/** @return 31 minus the number of leading zero bits of x (the 0-based tree level of 1-based slot x) */
	public static int tier(int x){
		int leading=Integer.numberOfLeadingZeros(x);
		return 31-leading;
	}

	/**
	 * Debugging check intended for use inside assertions.
	 * NOTE(review): body reconstructed - returns false if the same object occupies two slots; confirm.
	 */
	public boolean testForDuplicates(){
		for(int i=0; i<array.length; i++){
			if(array[i]==null){continue;}
			for(int j=i+1; j<array.length; j++){
				if(array[i]==array[j]){return false;}
			}
		}
		return true;
	}

	// NOTE(review): field modifiers reconstructed; confirm visibility against other users of this class.
	private final T[] array;
	private final int CAPACITY;
	private int size=0;

}
1 : Tools.max(1, Shared.THREADS/4)) : Tools.max(1, Shared.THREADS/4)); + + minChrom=Tools.max(1, minChrom); + if(genome>=0 && Data.GENOME_BUILD!=genome){ + Data.setGenome(genome); + maxChrom=Tools.min(Data.numChroms, maxChrom); + } + + assert(minChrom<=maxChrom); + + if(index==null){index=new Block[maxChrom+1];} + + ArrayList list=new ArrayList(); + + for(int i=1; i<=maxChrom;){ + if(i>=minChrom){ + int a=minChrom(i, minChrom, CHROM_MASK_HIGH); + int b=maxChrom(i, minChrom, maxChrom, CHROM_MASK_LOW); + assert(b>=i); + + BlockMaker idm=new BlockMaker(a, b, k, CHROMBITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, WRITE, DISK_INVALID, index); + list.add(idm); + incrementActiveBlocks(1); + idm.start(); + + while(idm.getState()==State.NEW){}//wait + + i=b+1; + }else{i++;} + } + + for(BlockMaker cm : list){ + while(cm.getState()!=State.TERMINATED){ + try { + cm.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + t.stop(); +// Data.sysout.println("Index gen time: \t"+t); + + return index; + } + + public static Block makeBlock(int minChrom, int maxChrom, int k, int CHROMBITS, int MAX_ALLOWED_CHROM_INDEX, + int CHROM_MASK_LOW, int CHROM_MASK_HIGH, int SITE_MASK, int SHIFT_LENGTH, boolean COLORSPACE, boolean WRITE, boolean DISK_INVALID, Block[] matrix){ + assert(false) : maxChrom+", "+MAX_ALLOWED_CHROM_INDEX; + BlockMaker idm=new BlockMaker(minChrom, maxChrom, k, CHROMBITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, WRITE, DISK_INVALID, matrix); + Block block=idm.makeArrays(); + + assert(false) : maxChrom+", "+MAX_ALLOWED_CHROM_INDEX; + + if(verbose){ + for(int i=0; i=0; i--){ + sizes[i+1]=sizes[i]; + } + sizes[0]=0; + + if(matrix!=null){ + for(int i=baseChrom(minChrom); i<=maxChrom; i++){ + matrix[i]=indexHolder[0]; + } + } + + if(WRITE_TO_DISK){ + String fname=fname(minChrom, maxChrom, KEYLEN, 
CHROMBITS, COLORSPACE); +// File f=new File(fname); +// assert(!f.exists()) : "Tried to overwrite file "+f.getAbsolutePath(); + indexHolder[0].write(fname, true); + } + + return indexHolder[0]; + } + + + private class CountThread extends Thread{ + + public CountThread(int id_, int[] sizes_, int[] intercom_, Block[] indexHolder_){ + id=id_; + idb=AminoAcid.numberToBase[id]; + sizes=sizes_; + indexHolder=indexHolder_; + intercom=intercom_; + + minIndex=(id<<(2*KEYLEN-2)); + maxIndex=(int)(((id+1L)<<(2*KEYLEN-2))-1); + //Data.sysout.println("Thread "+id+" range is "+minIndex+", "+maxIndex); + + if(ALLOW_POLYMERS){ + banned=-1; + banmask=-1; //poly-A still slips through + }else{ + int b=0; + for(int i=0; i=4){intercom.notify();} + intercom.wait(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + } + + //Data.sysout.println("Thread "+id+" filling arrays for ("+minChrom+", "+maxChrom+")"); + + if(COLORSPACE){ + for(int i=minChrom; i<=maxChrom; i++){fillArraysColorspace(i);} + }else{ + for(int i=minChrom; i<=maxChrom; i++){fillArrays(i);} + } + //Data.sysout.println("Thread "+id+" finished."); + } + + private void countSizes(final int chrom){ + + // System.err.println("Thread "+id+" using chr"+chrom+" for countSizes"); + ChromosomeArray ca=dna.Data.getChromosome(chrom); + assert(!ca.colorspace); + + // int baseChrom=baseChrom(chrom); + + if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){ + throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX); + } + + final int max=ca.maxIndex-KEYLEN+1; + final int skip=KEYLEN-1; + assert(skip>0) : "\n*** The key length is too short. 
For the flag set 'k=X', X should be between 8 and 15; it was set to "+KEYLEN+" ***\n"; + + + int start=ca.minIndex; + while(start=0 && key!=banned){ + if(key>=0 && (key>>banshift)!=(key&banmask)){ + assert(key>=minIndex && key<=maxIndex) : "\n"+id+", "+ca.getNumber(a)+", "+(char)ca.get(a)+", "+key+", "+Integer.toHexString(key)+ + ", "+ca.getString(a, b)+"\n"+minIndex+", "+maxIndex+"\n"; + sizes[key]++; + } +// else{ +// assert(key==banned) : "\n"+Integer.toBinaryString(key)+" != "+Integer.toBinaryString(banned)+"\n"; +// } + } + // Data.sysout.println("a="+a+", b="+b+", max="+max); + } + + // Data.sysout.println("Left hash loop."); + + } + + private void fillArrays(final int chrom){ + + // System.err.println("Thread "+id+" using chr"+chrom+" for fillArrays"); + ChromosomeArray ca=dna.Data.getChromosome(chrom); + assert(!ca.colorspace); + + int baseChrom=baseChrom(chrom); + + if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){ + throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX); + } + + final int max=ca.maxIndex-KEYLEN+1; + final int skip=KEYLEN-1; + assert(skip>0); + + + int start=ca.minIndex; + while(start=0){ +// keyB=((keyB<<2)|c); +// len++; +// }else{ +// len=0; +// } +// int key=keyB&mask; +// if(len>=KEYLEN && /* array[a]==idb*/ key>=minIndex && key<=maxIndex){ +//// int key=keyB&mask; +// assert(key>=minIndex && key<=maxIndex); +// int number=toNumber(a, chrom); +// assert(numberToChrom(number, baseChrom)==chrom); +// assert(numberToSite(number)==a); +// index[key][sizes[key]]=number; +// sizes[key]++; +// } +// // Data.sysout.println("a="+a+", b="+b+", max="+max); +// } + + + // Data.sysout.println("Entering hash loop."); + // "a" is site start, "b" is site end + + int[] sites=indexHolder[0].sites; + + for(int a=start, b=start+skip; a=0 && key!=banned){ + if(key>=0 && (key>>banshift)!=(key&banmask)){ + assert(key>=minIndex && key<=maxIndex); + int number=toNumber(a, chrom); + assert(numberToChrom(number, baseChrom)==chrom); + 
assert(numberToSite(number)==a); + int loc=sizes[key]; + assert(sites[loc]==0); + sites[loc]=number; + sizes[key]++; + } + } + // Data.sysout.println("a="+a+", b="+b+", max="+max); + } + // Data.sysout.println("Left hash loop."); + + } + + private void countSizesColorspace(final int chrom){ + + //System.err.println("Thread "+id+" using chr"+chrom+" for countSizes"); + ChromosomeArray ca=dna.Data.getChromosome(chrom); + assert(ca.colorspace==COLORSPACE); + + // int baseChrom=baseChrom(chrom); + + if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){ + throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX); + } + + final int max=ca.maxIndex-KEYLEN+1; + final int skip=KEYLEN-1; + assert(skip>0); + + + int start=ca.minIndex; + while(start=0 && key!=banned){ + if(key>=0 && (key>>banshift)!=(key&banmask)){ + assert(key>=minIndex && key<=maxIndex) : "\n"+id+", "+ca.getNumber(a)+", "+(char)ca.get(a)+", "+key+", "+Integer.toHexString(key)+", "+ca.getString(a, b)+"\n" + +minIndex+", "+maxIndex+"\n"; + sizes[key]++; + } + } + } + + Data.sysout.println("Left hash loop."); + + } + + private void fillArraysColorspace(final int chrom){ + + //System.err.println("Thread "+id+" using chr"+chrom+" for fillArrays"); + ChromosomeArray ca=dna.Data.getChromosome(chrom); + assert(ca.colorspace==COLORSPACE); + + int baseChrom=baseChrom(chrom); + + if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){ + throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX); + } + + final int max=ca.maxIndex-KEYLEN+1; + final int skip=KEYLEN-1; + assert(skip>0); + + + int start=ca.minIndex; + while(start=0 && key!=banned){ + if(key>=0 && (key>>banshift)!=(key&banmask)){ + assert(key>=minIndex && key<=maxIndex); + int number=toNumber(a, chrom); + assert(numberToChrom(number, baseChrom)==chrom); + assert(numberToSite(number)==a); + int loc=sizes[key]; + assert(sites[loc]==0); + sites[loc]=number; + sizes[key]++; + } + } + // Data.sysout.println("a="+a+", b="+b+", 
max="+max); + } + + Data.sysout.println("Left hash loop."); + + } + + } + + + /** Encode a (location, chrom) pair to an index */ + public final int toNumber(int site, int chrom){ + int out=(chrom&CHROM_MASK_LOW); + out=out<=0); + assert(ACTIVE_BLOCKS<=MAX_CONCURRENT_BLOCKS); + + while(i>0 && ACTIVE_BLOCKS>0 && ACTIVE_BLOCKS>=MAX_CONCURRENT_BLOCKS){ + try { + THREAD_SYNC.wait(10000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + ACTIVE_BLOCKS+=i; + if(ACTIVE_BLOCKS=0); + assert(ACTIVE_BLOCKS<=MAX_CONCURRENT_BLOCKS); + } + } + + public static boolean verbose=false; + + public static boolean USE_ALLOC_SYNC=false; + private static final String ALLOC_SYNC=new String("ALLOC_SYNC"); + private static final String THREAD_SYNC=new String("THREAD_SYNC"); + + public static int MAX_CONCURRENT_BLOCKS=(Data.WINDOWS ? 1 : Tools.max(1, Shared.THREADS/4)); + private static int ACTIVE_BLOCKS=0; + + public static boolean ALLOW_POLYMERS=false; + +} diff --git a/current/align2/IndexMaker5.java b/current/align2/IndexMaker5.java new file mode 100755 index 0000000..7141e3b --- /dev/null +++ b/current/align2/IndexMaker5.java @@ -0,0 +1,617 @@ +package align2; + +import java.io.File; +import java.lang.Thread.State; +import java.util.ArrayList; +import java.util.Arrays; + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; + + +/** + * @author Brian Bushnell + * @date Jan 3, 2013 + * + */ +public class IndexMaker5 { + + + public static Block[] makeIndex(final int genome, int minChrom, int maxChrom, int k, int CHROMBITS, + int MAX_ALLOWED_CHROM_INDEX, int CHROM_MASK_LOW, int CHROM_MASK_HIGH, int SITE_MASK, int SHIFT_LENGTH, boolean COLORSPACE, boolean WRITE, boolean DISK_INVALID, Block[] index){ + Timer t=new Timer(); + t.start(); + + MAX_CONCURRENT_BLOCKS=(Data.WINDOWS ? 
1 : Tools.max(1, Shared.THREADS/4)); + + minChrom=Tools.max(1, minChrom); + if(genome>=0 && Data.GENOME_BUILD!=genome){ + Data.setGenome(genome); + maxChrom=Tools.min(Data.numChroms, maxChrom); + } + + assert(minChrom<=maxChrom); + + if(index==null){index=new Block[maxChrom+1];} + + ArrayList list=new ArrayList(); + + for(int i=1; i<=maxChrom;){ + if(i>=minChrom){ + int a=minChrom(i, minChrom, CHROM_MASK_HIGH); + int b=maxChrom(i, minChrom, maxChrom, CHROM_MASK_LOW); + assert(b>=i); + + BlockMaker idm=new BlockMaker(a, b, k, CHROMBITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, WRITE, DISK_INVALID, index); + list.add(idm); + incrementActiveBlocks(1); + idm.start(); + + while(idm.getState()==State.NEW){}//wait + + i=b+1; + }else{i++;} + } + + for(BlockMaker cm : list){ + while(cm.getState()!=State.TERMINATED){ + try { + cm.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + t.stop(); +// Data.sysout.println("Index gen time: \t"+t); + + return index; + } + + public static Block makeBlock(int minChrom, int maxChrom, int k, int CHROMBITS, int MAX_ALLOWED_CHROM_INDEX, + int CHROM_MASK_LOW, int CHROM_MASK_HIGH, int SITE_MASK, int SHIFT_LENGTH, boolean COLORSPACE, boolean WRITE, boolean DISK_INVALID, Block[] matrix){ + assert(false) : maxChrom+", "+MAX_ALLOWED_CHROM_INDEX; + BlockMaker idm=new BlockMaker(minChrom, maxChrom, k, CHROMBITS, MAX_ALLOWED_CHROM_INDEX, CHROM_MASK_LOW, CHROM_MASK_HIGH, SITE_MASK, SHIFT_LENGTH, COLORSPACE, WRITE, DISK_INVALID, matrix); + Block block=idm.makeArrays(); + + assert(false) : maxChrom+", "+MAX_ALLOWED_CHROM_INDEX; + + if(verbose){ + for(int i=0; i=0; i--){ + sizes[i+1]=sizes[i]; + } + sizes[0]=0; + + if(matrix!=null){ + for(int i=baseChrom(minChrom); i<=maxChrom; i++){ + matrix[i]=indexHolder[0]; + } + } + + if(WRITE_TO_DISK){ + String fname=fname(minChrom, maxChrom, KEYLEN, CHROMBITS, COLORSPACE); +// File f=new 
File(fname); +// assert(!f.exists()) : "Tried to overwrite file "+f.getAbsolutePath(); + indexHolder[0].write(fname, true); + } + + return indexHolder[0]; + } + + + private class CountThread extends Thread{ + + public CountThread(int id_, int[] sizes_, int[] intercom_, Block[] indexHolder_){ + id=id_; + idb=AminoAcid.numberToBase[id]; + sizes=sizes_; + indexHolder=indexHolder_; + intercom=intercom_; + + minIndex=(id<<(2*KEYLEN-2)); + maxIndex=(int)(((id+1L)<<(2*KEYLEN-2))-1); + //Data.sysout.println("Thread "+id+" range is "+minIndex+", "+maxIndex); + + if(ALLOW_POLYMERS){ + banned=-1; + banmask=-1; //poly-A still slips through + }else{ + int b=0; + for(int i=0; i=4){intercom.notify();} + intercom.wait(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + } + + //Data.sysout.println("Thread "+id+" filling arrays for ("+minChrom+", "+maxChrom+")"); + + if(COLORSPACE){ + for(int i=minChrom; i<=maxChrom; i++){fillArraysColorspace(i);} + }else{ + for(int i=minChrom; i<=maxChrom; i++){fillArrays(i);} + } + //Data.sysout.println("Thread "+id+" finished."); + } + + private void countSizes(final int chrom){ + + // System.err.println("Thread "+id+" using chr"+chrom+" for countSizes"); + ChromosomeArray ca=dna.Data.getChromosome(chrom); + assert(!ca.colorspace); + + // int baseChrom=baseChrom(chrom); + + if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){ + throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX); + } + + final int max=ca.maxIndex-KEYLEN+1; + final int skip=KEYLEN-1; + assert(skip>0); + + + int start=ca.minIndex; + while(start=0 && key!=banned){ + if(key>=0 && (key>>banshift)!=(key&banmask)){ + assert(key>=minIndex && key<=maxIndex) : "\n"+id+", "+ca.getNumber(a)+", "+(char)ca.get(a)+", "+key+", "+Integer.toHexString(key)+ + ", "+ca.getString(a, b)+"\n"+minIndex+", "+maxIndex+"\n"; + sizes[key]++; + } + } + // Data.sysout.println("a="+a+", b="+b+", max="+max); + } + + // 
Data.sysout.println("Left hash loop."); + + } + + private void fillArrays(final int chrom){ + + // System.err.println("Thread "+id+" using chr"+chrom+" for fillArrays"); + ChromosomeArray ca=dna.Data.getChromosome(chrom); + assert(!ca.colorspace); + + int baseChrom=baseChrom(chrom); + + if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){ + throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX); + } + + final int max=ca.maxIndex-KEYLEN+1; + final int skip=KEYLEN-1; + assert(skip>0); + + + int start=ca.minIndex; + while(start=0){ +// keyB=((keyB<<2)|c); +// len++; +// }else{ +// len=0; +// } +// int key=keyB&mask; +// if(len>=KEYLEN && /* array[a]==idb*/ key>=minIndex && key<=maxIndex){ +//// int key=keyB&mask; +// assert(key>=minIndex && key<=maxIndex); +// int number=toNumber(a, chrom); +// assert(numberToChrom(number, baseChrom)==chrom); +// assert(numberToSite(number)==a); +// index[key][sizes[key]]=number; +// sizes[key]++; +// } +// // Data.sysout.println("a="+a+", b="+b+", max="+max); +// } + + + // Data.sysout.println("Entering hash loop."); + // "a" is site start, "b" is site end + + int[] sites=indexHolder[0].sites; + + for(int a=start, b=start+skip; a=0 && key!=banned){ + if(key>=0 && (key>>banshift)!=(key&banmask)){ + assert(key>=minIndex && key<=maxIndex); + int number=toNumber(a, chrom); + assert(numberToChrom(number, baseChrom)==chrom); + assert(numberToSite(number)==a); + int loc=sizes[key]; + assert(sites[loc]==0); + sites[loc]=number; + sizes[key]++; + } + } + // Data.sysout.println("a="+a+", b="+b+", max="+max); + } + // Data.sysout.println("Left hash loop."); + + } + + private void countSizesColorspace(final int chrom){ + + //System.err.println("Thread "+id+" using chr"+chrom+" for countSizes"); + ChromosomeArray ca=dna.Data.getChromosome(chrom); + assert(ca.colorspace==COLORSPACE); + + // int baseChrom=baseChrom(chrom); + + if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){ + throw new RuntimeException("Chrom "+chrom+": 
"+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX); + } + + final int max=ca.maxIndex-KEYLEN+1; + final int skip=KEYLEN-1; + assert(skip>0); + + + int start=ca.minIndex; + while(start=0 && key!=banned){ + if(key>=0 && (key>>banshift)!=(key&banmask)){ + assert(key>=minIndex && key<=maxIndex) : "\n"+id+", "+ca.getNumber(a)+", "+(char)ca.get(a)+", "+key+", "+Integer.toHexString(key)+", "+ca.getString(a, b)+"\n" + +minIndex+", "+maxIndex+"\n"; + sizes[key]++; + } + } + } + + Data.sysout.println("Left hash loop."); + + } + + private void fillArraysColorspace(final int chrom){ + + //System.err.println("Thread "+id+" using chr"+chrom+" for fillArrays"); + ChromosomeArray ca=dna.Data.getChromosome(chrom); + assert(ca.colorspace==COLORSPACE); + + int baseChrom=baseChrom(chrom); + + if(ca.maxIndex>MAX_ALLOWED_CHROM_INDEX){ + throw new RuntimeException("Chrom "+chrom+": "+ca.maxIndex+" > "+MAX_ALLOWED_CHROM_INDEX); + } + + final int max=ca.maxIndex-KEYLEN+1; + final int skip=KEYLEN-1; + assert(skip>0); + + + int start=ca.minIndex; + while(start=0 && key!=banned){ + if(key>=0 && (key>>banshift)!=(key&banmask)){ + assert(key>=minIndex && key<=maxIndex); + int number=toNumber(a, chrom); + assert(numberToChrom(number, baseChrom)==chrom); + assert(numberToSite(number)==a); + int loc=sizes[key]; + assert(sites[loc]==0); + sites[loc]=number; + sizes[key]++; + } + } + // Data.sysout.println("a="+a+", b="+b+", max="+max); + } + + Data.sysout.println("Left hash loop."); + + } + + } + + + /** Encode a (location, chrom) pair to an index */ + public final int toNumber(int site, int chrom){ + int out=(chrom&CHROM_MASK_LOW); + out=out<=0); + assert(ACTIVE_BLOCKS<=MAX_CONCURRENT_BLOCKS); + + while(i>0 && ACTIVE_BLOCKS>0 && ACTIVE_BLOCKS>=MAX_CONCURRENT_BLOCKS){ + try { + THREAD_SYNC.wait(10000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + ACTIVE_BLOCKS+=i; + if(ACTIVE_BLOCKS=0); + assert(ACTIVE_BLOCKS<=MAX_CONCURRENT_BLOCKS); + } + } + 
+ public static boolean verbose=false; + + public static boolean USE_ALLOC_SYNC=false; + private static final String ALLOC_SYNC=new String("ALLOC_SYNC"); + private static final String THREAD_SYNC=new String("THREAD_SYNC"); + + public static int MAX_CONCURRENT_BLOCKS=(Data.WINDOWS ? 1 : 2); + private static int ACTIVE_BLOCKS=0; + + public static boolean ALLOW_POLYMERS=false; + +} diff --git a/current/align2/IntList.java b/current/align2/IntList.java new file mode 100755 index 0000000..d99dc74 --- /dev/null +++ b/current/align2/IntList.java @@ -0,0 +1,90 @@ +package align2; + +import java.util.Arrays; + + + +public final class IntList{ + + public IntList(){this(256);} + + public IntList(int initial){ + assert(initial>0); + array=new int[initial]; + } + + public final void set(int loc, int value){ + if(loc>=array.length){ + resize((loc+1)*2); + } + array[loc]=value; + size=max(size, loc+1); + } + + public final void increment(int loc, int value){ + if(loc>=array.length){ + resize((loc+1)*2); + } + array[loc]+=value; + size=max(size, loc+1); + } + + public final int get(int loc){ + return(loc>=size ? 0 : array[loc]); + } + + + + public final void add(int x){ + if(size>=array.length){ + resize(max(size*2, 1)); + } + array[size]=x; + size++; + } + + public final void resize(int size2){ + assert(size2>size); + array=Arrays.copyOf(array, size2); + } + + public final void shrink(){ + if(size==array.length){return;} + array=Arrays.copyOf(array, size); + } + + public final void shrinkToUnique(){ + //Assumes sorted. + if(size<=0){ + shrink(); + return; + } + + int unique=1; + + for(int i=1; i=array[i-1]); + if(array[i]!=array[i-1]){unique++;} + } + if(unique==array.length){return;} + int[] alt=new int[unique]; + + alt[0]=array[0]; + for(int i=1, j=1; jy ? 
x : y;} + + public int[] array; + public int size=0; + +} diff --git a/current/align2/KeyRing.java b/current/align2/KeyRing.java new file mode 100755 index 0000000..131d2a8 --- /dev/null +++ b/current/align2/KeyRing.java @@ -0,0 +1,574 @@ +package align2; + +import java.util.Arrays; + +import dna.AminoAcid; +import dna.ChromosomeArray; + +public final class KeyRing { + + public static final void main(String[] args){ + int len=Integer.parseInt(args[0]); + float density=(float) Double.parseDouble(args[1]); + int chunksize=13; + if(args.length>2){chunksize=Integer.parseInt(args[2]);} + + byte[] qual=new byte[len]; + Arrays.fill(qual, (byte)20); + + int[] offsets=KeyRing.makeOffsets(qual, chunksize, density, 2); + System.out.println(Arrays.toString(offsets)); + } + + public static final int[] makeKeys(String s, int[] offsets, int chunksize){ + if(offsets==null){return null;} + assert(chunksize>0 && chunksize<16); + assert(offsets!=null) : s.length()+", "+s; + int[] keys=new int[offsets.length]; + +// System.out.println(Arrays.toString(offsets)); + + for(int i=0; i0 && chunksize<16); + assert(offsets!=null) : s.length+", "+new String(s); + int[] keys=new int[offsets.length]; + +// System.out.println(Arrays.toString(offsets)); + + for(int i=0; i>(2*(chunksize-i-1))); + temp=(temp&3); + sb.append((char)AminoAcid.numberToBase[temp]); + } + + String s=sb.toString(); + + assert(key==ChromosomeArray.toNumber(0, s.length()-1, s)) : + Integer.toHexString(key)+" -> "+s+" != "+Integer.toHexString(ChromosomeArray.toNumber(0, s.length()-1, s)); + + return sb.toString(); + } + + /* + public static final int[] makeOffsets(int readlen, int blocksize, int overlap, int minKeysDesired){ + assert(blocksize>0); + assert(overlap=minKeysDesired){ +// while(middles+20); +// middles=(midslots/spacing); +// } +// } + + int middles=(midslots/spacing); + if(middles0); //due to the escape conditions + +// float fspacing=midslots/(float)(middles+1); + float fspacing=midslots/(float)(middles); + 
assert(fspacing>=1); + + int[] offsets=new int[middles+2]; + offsets[0]=0; + offsets[offsets.length-1]=slots-1; + + for(int i=1; i<=middles; i++){ + offsets[i]=Math.round(fspacing*i); + } + +// System.out.println("readlen = \t"+readlen); +// System.out.println("blocksize = \t"+blocksize); +// System.out.println("overlap = \t"+overlap); +// System.out.println("slots = \t"+slots); +// System.out.println("midslots = \t"+midslots); +// System.out.println("spacing = \t"+spacing); +// System.out.println("middles = \t"+middles); +// System.out.println("fspacing = \t"+fspacing); +// System.out.println("Offsets = \t"+Arrays.toString(offsets)); + return offsets; + + }*/ + + /** This is only useful for low-quality reads, with no-calls. Otherwise it just wastes time... */ + public static final int[] reverseOffsets(final int[] offsetsP, final int k, final int readlen){ + int[] offsetsM=new int[offsetsP.length]; + for(int i=0; i=0); + assert(x+k<=readlen); + x=readlen-(x+k); + assert(x>=0); + assert(x+k<=readlen) : "\n"+Arrays.toString(offsetsP)+"\n"+Arrays.toString(offsetsM)+"\n"+i+"\n"+x+"\n"+readlen; + offsetsM[i]=x; + } + return offsetsM; + } + + public static final int[] makeOffsetsWithDensity(int readlen, int blocksize, float density, int minKeysDesired){ + assert(blocksize>0); + assert(density0); + assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired; + + int slots=readlen-blocksize+1; + int midslots=slots-2; + + int desired=(int)Math.ceil((readlen*density)/blocksize); + assert(desired>=0); + desired=Tools.max(minKeysDesired, desired); + desired=Tools.min(slots, desired); + + if(slots==1 || desired==1){return new int[] {0};} + if(desired==2){return new int[] {0, slots-1};} + + int middles=desired-2; + + assert(middles>0); //due to the escape conditions + +// float fspacing=midslots/(float)(middles+1); + float fspacing=midslots/(float)(middles); + assert(fspacing>=1); + + int[] offsets=new int[desired]; + offsets[0]=0; + 
offsets[offsets.length-1]=slots-1; + + for(int i=1; i<=middles; i++){ + offsets[i]=Math.round(fspacing*i); + } + +// System.out.println("readlen = \t"+readlen); +// System.out.println("blocksize = \t"+blocksize); +// System.out.println("overlap = \t"+overlap); +// System.out.println("slots = \t"+slots); +// System.out.println("midslots = \t"+midslots); +// System.out.println("spacing = \t"+spacing); +// System.out.println("middles = \t"+middles); +// System.out.println("fspacing = \t"+fspacing); +// System.out.println("Offsets = \t"+Arrays.toString(offsets)); + return offsets; + + } + + + public static final int[] makeOffsetsWithNumberOfKeys(int readlen, int blocksize, int maxKeys){ + assert(maxKeys>0); +// System.err.println("readlen, blocksize, maxKeys = "+readlen+","+blocksize+","+maxKeys); + if(blocksize>readlen){return null;} + int slots=readlen-blocksize+1; +// System.err.println("slots = "+slots); + if(slots==1 || maxKeys==1){return new int[] {slots/2};} + if(slots==2 || maxKeys==2){return new int[] {0, slots-1};} + if(slots==3 || maxKeys==3){return new int[] {0, slots/2, slots-1};} + + int midslots=slots-2; + maxKeys=Tools.min(maxKeys, slots); + int middles=Tools.min(maxKeys-2, midslots); +// System.err.println("midslots = "+midslots); +// System.err.println("middles = "+middles); + + assert(middles>0); //due to the escape conditions + +// float fspacing=midslots/(float)(middles+0); //Bad - leaves 2 adjacent keys at the end. 
+ float fspacing=midslots/(float)(middles+1f); + fspacing=Tools.max(1f, fspacing); + assert(fspacing>=1); + + int[] offsets=new int[middles+2]; + offsets[0]=0; + offsets[offsets.length-1]=slots-1; + + +// for(int i=1; i<=middles; i++){ +// offsets[i]=Math.round(fspacing*i); +// } + + + + for(int i=1; i<=middles; i++){ + offsets[i]=Math.round(fspacing*i); + } + if(middles>2){ + offsets[1]=(int)fspacing; + offsets[middles]=(int) Math.ceil(fspacing*middles); + } + +// System.out.println("readlen = \t"+readlen); +// System.out.println("blocksize = \t"+blocksize); +//// System.out.println("overlap = \t"+overlap); +// System.out.println("slots = \t"+slots); +// System.out.println("midslots = \t"+midslots); +//// System.out.println("spacing = \t"+spacing); +// System.out.println("middles = \t"+middles); +// System.out.println("fspacing = \t"+fspacing); +// System.out.println("Offsets = \t"+Arrays.toString(offsets)); + + for(int i=1; i0); +// assert(overlap=2); +// +// int slots=readlen-blocksize+1; +// int midslots=slots-2; +// int spacing=blocksize-overlap; +// +// if(slots<=minKeysDesired){return slots;} +// if(slots<=spacing+1){return Tools.min(3, slots);} +// +// int middles=(midslots/spacing); +// if(middles0); //due to the escape conditions +// return middles+2; +// } + + public static final int desiredKeysFromDensity(int readlen, int blocksize, float density, int minKeysDesired){ + assert(blocksize>0); + assert(density<=blocksize) : density+", "+blocksize; + assert(density>0); + assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired; + + int slots=readlen-blocksize+1; + + int desired=(int)Math.ceil((readlen*density)/blocksize); + assert(desired>=0); + desired=Tools.max(minKeysDesired, desired); + desired=Tools.min(slots, desired); + return desired; + } + + public static final int[] makeOffsets(final int readlen, int blocksize, float density, int minKeysDesired){ + assert(blocksize>0); + assert(blocksize<=readlen) : readlen+", 
"+blocksize+", "+density+", "+minKeysDesired; + + if(readlen0) : readlen+","+blocksize+","+density+","+minKeysDesired+","+desiredKeys; + + int[] offsets=makeOffsetsWithNumberOfKeys(readlen, blocksize, desiredKeys); +// System.out.println("desiredKeys="+desiredKeys+", actual="+(offsets==null ? 0 : offsets.length)); + assert(offsets!=null) :readlen+","+blocksize+","+density+","+minKeysDesired+","+desiredKeys; + return offsets; + } + + public static final int[] makeOffsets(byte[] qual, int blocksize, float density, int minKeysDesired){ + int readlen=qual.length; + assert(blocksize>0); + assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired; + + int left=0, right=readlen-1; + + for(int i=left, cntr=0; i=0 && cntr0){ + for(int i=0; i=density); + assert(blocksize>0); + assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired; + + int left=0, right=readlen-blocksize; + + //This can be set as low as .90 for long reads, if qualities are accurate. + final float errorLimit=KEEP_BAD_KEYS ? 2f : 0.94f; //Default: .95f + + while(left<=right && keyErrorProb[left]>errorLimit){left++;} + while(right>=left && keyErrorProb[right]>errorLimit){right--;} + +// System.out.println("left="+left+", right="+right+", readlen="+readlen+", " + +// "blocksize="+blocksize+", density="+density+", minKeysDesired="+minKeysDesired); + + if(right0){ + for(int i=0; i=density); + assert(blocksize>0); + assert(blocksize<=readlen) : readlen+", "+blocksize+", "+density+", "+minKeysDesired; + + final int maxProbIndex=readlen-blocksize; +// assert(maxProbIndex==keyErrorProb.length-1); + assert(maxProbIndex<=keyErrorProb.length-1) : maxProbIndex+", "+keyErrorProb.length; + int left=0, right=maxProbIndex; + + final float errorLimit2=KEEP_BAD_KEYS ? 2f : 0.9999f; //Default: .95f + + //This can be set as low as .90 for long reads, if qualities are accurate. + final float errorLimit1=KEEP_BAD_KEYS ? 2f : (semiperfectmode ? 
0.99f : 0.94f); //Default: .95f + + while(left<=right && keyErrorProb[left]>=errorLimit1){left++;} + while(right>=left && keyErrorProb[right]>=errorLimit1){right--;} + +// System.out.println("Left="+left+", right="+right); + + int potentialKeys=0; + for(int i=left; i<=right; i++){ + if(keyErrorProb[i] { + + public ListNum(ArrayList list_, long id_){ + list=list_; + id=id_; + if(GEN_RANDOM_NUMBERS && list!=null){ + for(K k : list){ + if(k!=null){ + ((Read)k).rand=randy.nextDouble(); + } + } + } + } + + public final ArrayList list; + public final long id; + + public static synchronized void setDeterministicRandom(boolean b){ + GEN_RANDOM_NUMBERS=b; + if(b){ + randy=new Random(seed); + seed++; + } + } + public static boolean deterministicRandom(){ + return GEN_RANDOM_NUMBERS; + } + + private static boolean GEN_RANDOM_NUMBERS=false; + private static Random randy; + private static long seed=0; + +} diff --git a/current/align2/LongList.java b/current/align2/LongList.java new file mode 100755 index 0000000..e0f769d --- /dev/null +++ b/current/align2/LongList.java @@ -0,0 +1,106 @@ +package align2; + +import java.util.Arrays; + + + +public final class LongList{ + + public LongList(){this(256);} + + public LongList(int initial){ + assert(initial>0); + array=new long[initial]; + } + + public final void set(int loc, long value){ + if(loc>=array.length){ + resize((loc+1L)*2); + } + array[loc]=value; + size=max(size, loc+1); + } + + public final void increment(int loc, long value){ + if(loc>=array.length){ + resize((loc+1L)*2); + } + array[loc]+=value; + size=max(size, loc+1); + } + + public final long get(int loc){ + return(loc>=size ? 
0 : array[loc]); + } + + public final void add(long x){ + if(size>=array.length){ + resize((size+1L)*2); + } + array[size]=x; + size++; + } + + public final void resize(long x){ + int size2=(int)min(x, Integer.MAX_VALUE); + assert(size2>size); + array=Arrays.copyOf(array, size2); + } + + public final void shrink(){ + if(size==array.length){return;} + array=Arrays.copyOf(array, size); + } + + public final void shrinkToUnique(){ + //Assumes sorted. + if(size<=0){ + shrink(); + return; + } + + int unique=1; + + for(int i=1; i=array[i-1]); + if(array[i]!=array[i-1]){unique++;} + } + if(unique==array.length){return;} + long[] alt=new long[unique]; + + alt[0]=array[0]; + for(int i=1, j=1; jy ? x : y;} + + private static final int min(int x, int y){return xy ? x : y;} + + public long[] array; + public int size=0; + +} diff --git a/current/align2/LongM.java b/current/align2/LongM.java new file mode 100755 index 0000000..92d48f0 --- /dev/null +++ b/current/align2/LongM.java @@ -0,0 +1,62 @@ +package align2; + +/** + * A mutable long object + * @author Brian Bushnell + * @date Feb 8, 2013 + * + */ +public class LongM implements Comparable { + public LongM(){this(0L);} + public LongM(long v){value=v;} + + /** + * @param key + * @param b + */ + public LongM(long v, boolean mut) { + value=v; + mutable=mut; + } + + public LongM iCopy(){ + if(!mutable){return this;} + return new LongM(value, false); + } + + public long value(){return value;} +// public long longValue(){return value;} + public void lock(){mutable=false;} + + public long set(long v){ + if(!mutable){throw new RuntimeException("Mutating a locked LongM");} + return (value=v); + } + public long increment(){return set(value+1);} + public long increment(long x){return set(value+x);} + + @Override + public int hashCode(){ + return (int)((value^(value>>>32))&0xFFFFFFFFL); + } + + @Override + public int compareTo(LongM b){ + return value==b.value ? 
0 : value=a); + + int[] score; + + if(gaps==null){ + if(verbose){ + System.err.println("no gaps"); + } + if(b-a>=maxColumns){ + System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); + assert(false) : refStartLoc+", "+refEndLoc; + b=Tools.min(ref.length-1, a+maxColumns-1); + } + int[] max=fillLimited(read, ref, a, b, minScore, gaps); + score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], false)); + }else{ + if(verbose){System.err.println("\ngaps: "+Arrays.toString(gaps)+"\n"+new String(read)+"\ncoords: "+refStartLoc+", "+refEndLoc);} + int[] max=fillLimited(read, ref, a, b, minScore, gaps); + if(verbose){System.err.println("max: "+Arrays.toString(max));} +// score=(max==null ? null : score(read, grefbuffer, 0, greflimit, max[0], max[1], max[2], true)); + score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], true)); + } + return score; + } + + public final int[] fillAndScoreLimited(byte[] read, SiteScore ss, int thresh, int minScore){ + return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore, ss.gaps); + } + +// public final int[] translateScoreFromGappedCoordinate(int[] score) + + public final int[] fillAndScoreLimited(byte[] read, int chrom, int start, int stop, int thresh, int minScore, int[] gaps){ + return fillAndScoreLimited(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, minScore, gaps); + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, byte[] baseScores){ + int a=Tools.max(0, refStartLoc); + int b=Tools.min(ref.length-1, refEndLoc); + assert(b>=a); + if(b-a>=maxColumns){ + System.err.println("Warning: Max alignment columns exceeded; restricting range. 
"+(b-a+1)+" > "+maxColumns); + b=Tools.min(ref.length-1, a+maxColumns-1); + } + int[] max=fillQ(read, ref, baseScores, a, b); +// int[] score=score(read, ref, a, b, max[0], max[1], max[2]); +// return score; + return null; + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, SiteScore ss, int thresh, byte[] baseScores){ + return fillAndScoreQ(read, ss.chrom, ss.start, ss.stop, thresh, baseScores); + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, int chrom, int start, int stop, int thresh, byte[] baseScores){ + return fillAndScoreQ(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, baseScores); + } + +// public final int scoreNoIndels(byte[] read, SiteScore ss){ +// +// ChromosomeArray cha=Data.getChromosome(ss.chrom); +// final int refStart=ss.start; +// +// int score=0; +// int mode=MODE_START; +// int timeInMode=0; +// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //TODO: Partial match +// +// for(int i=0; i"+(char)c+"; rpos="+rpos);} + prevMode=mode; + prevStreak=current; + mode=c; + current=1; + } + } + if(current>0){ + assert(mode==match[match.length-1]); + if(mode=='m'){ + if(score<=0){ + score=0; + lastZeroC=cpos; + lastZeroM=match.length-current; + lastZeroR=rpos; + } + int add=calcMatchScore(current); + score+=(matchPointsMult*add); +// if(prevMode=='N' || prevMode=='R'){score=score+POINTS_MATCH2()-POINTS_MATCH();} //Don't penalize first match after N + cpos+=current; + rpos+=current; + if(score>maxScore){ + maxScore=score; + startLocC=lastZeroC; + startLocM=lastZeroM; + startLocR=lastZeroR; + stopLocC=cpos-1; + stopLocM=match.length-1; + stopLocR=rpos-1; + } + }else if(mode=='S'){ + score+=calcSubScore(current); + if(prevMode=='N' || prevMode=='R'){score=score+POINTS_SUB2()-POINTS_SUB();} //Don't penalize first sub after N + else if(prevMode=='m' && prevStreak<2){score=score+POINTS_SUBR()-POINTS_SUB();} + cpos+=current; + rpos+=current; + }else if(mode=='D'){ + 
score+=calcDelScore(current, true); + rpos+=current; + }else if(mode=='I'){ + score+=calcInsScore(current); + cpos+=current; + }else if(mode=='C'){ + cpos+=current; + }else if(mode=='X' || mode=='Y'){ + score+=calcInsScore(current); + cpos+=current; + }else if(mode=='N'){ + score+=calcNocallScore(current); + cpos+=current; + rpos+=current; + }else if(mode=='R'){ + score+=calcNorefScore(current); + cpos+=current; + rpos+=current; + }else if(mode!=0){ + assert(false) : "Unhandled symbol "+mode+"\n"+(char)mode+"\n"+new String(match)+"\n"+new String(bases); + } + if(verbose){System.err.println("mode "+(char)mode+"->end; rpos="+rpos);} + } + + if(startLocC<0 || stopLocC<0){ + assert(false) : "Failed."; + return false; + } + + + if(verbose){System.err.println("A: r.start="+r.start+", r.stop="+r.stop+"; rpos="+rpos+"; len="+bases.length+"; reflen="+(r.stop-r.start+1));} + + assert(rpos==r.stop+1) : rpos+"!="+r.start+"\n"+r; + + if(verbose){System.err.println("B: rpos="+rpos+", startLocR="+startLocR+", stopLocR="+stopLocR);} + + int headTrimR=startLocC; + int headTrimM=startLocM; + int tailTrimR=bases.length-stopLocC-1; + int tailTrimM=match.length-stopLocM-1; + + if(verbose){System.err.println("C: headTrimR="+headTrimR+", headTrimM="+headTrimM+", tailTrimR="+tailTrimR+", tailTrimM="+tailTrimM);} + + if(headTrimR<=minToClip && headTrimM<=minToClip){ + headTrimR=headTrimM=0; + } + if(tailTrimR<=minToClip && tailTrimM<=minToClip){ + tailTrimR=tailTrimM=0; + } + if(headTrimR==0 && headTrimM==0 && tailTrimR==0 && tailTrimM==0){ + return false; + } + //Do trimming + final int headDelta=headTrimR-headTrimM; + final int tailDelta=tailTrimR-tailTrimM; + final byte[] match2; + + if(verbose){System.err.println("D: headTrimR="+headTrimR+", headTrimM="+headTrimM+", tailTrimR="+tailTrimR+", tailTrimM="+tailTrimM);} + if(verbose){System.err.println("D: headDelta="+headDelta+", tailDelta="+tailDelta);} + + if(headDelta==0 && tailDelta==0){ + //Length-neutral trimming + match2=match; + 
for(int i=0; i0 ? Tools.max(ss.pairedScore+(maxScore-ss.slowScore), 0) : 0; + } + + return true; + } + + + /** Assumes match string is in long format. */ + public final int score(byte[] match){ + if(match==null || match.length<1){return 0;} + + byte mode=match[0], prevMode='0'; + int current=0, prevStreak=0; + int score=0; + + for(int mpos=0; mpos"+(char)c+"\tcurrent="+current+"\tscore="+score);} + prevMode=mode; + prevStreak=current; + mode=c; + current=1; + } + } + if(current>0){ + assert(mode==match[match.length-1]); + if(mode=='m'){ + score+=calcMatchScore(current); +// if(prevMode=='N' || prevMode=='R'){score=score+POINTS_MATCH2()-POINTS_MATCH();} //Don't penalize first match after N + }else if(mode=='S'){ + score+=calcSubScore(current); + if(prevMode=='N' || prevMode=='R'){score=score+POINTS_SUB2()-POINTS_SUB();} //Don't penalize first sub after N + else if(prevMode=='m' && prevStreak<2){score=score+POINTS_SUBR()-POINTS_SUB();} + }else if(mode=='D'){ + score+=calcDelScore(current, true); + }else if(mode=='I'){ + score+=calcInsScore(current); + }else if(mode=='C'){ + //do nothing + }else if(mode=='X' || mode=='Y'){ + score+=calcInsScore(current); + }else if(mode=='N'){ + score+=calcNocallScore(current); + }else if(mode=='R'){ + score+=calcNorefScore(current); + }else if(mode!=0){ + assert(false) : "Unhandled symbol "+mode+"\n"+(char)mode+"\n"+new String(match); + } + if(verbose){System.err.println("mode "+(char)mode+"->end; score="+score);} + } + + return score; + } + +// //TODO +// public final byte[] softClipBoundsShortmatch(byte[] match, byte[] bases, int minToClip){ +// if(match==null || match.length<1){return null;} +// int[] score=new int[bases.length]; +// +// byte mode='0', c='0'; +// int current=0; +// int rpos=0; +// long currentScore; +// for(int i=0; i0 || !Character.isDigit(c)){ +// current=Tools.max(current, 1); +// if(mode=='m'){ +// msdicn[0]+=current; +// }else if(mode=='S'){ +// msdicn[1]+=current; +// }else if(mode=='D'){ +// 
msdicn[2]+=current; +// }else if(mode=='I'){ +// msdicn[3]+=current; +// }else if(mode=='C' || mode=='X' || mode=='Y'){ +// msdicn[4]+=current; +// }else if(mode=='N' || mode=='R'){ +// msdicn[5]+=current; +// } +// } +// return msdicn; +// } + + public abstract int maxQuality(int numBases); + + public abstract int maxQuality(byte[] baseScores); + + public abstract int maxImperfectScore(int numBases); + + public abstract int maxImperfectScore(byte[] baseScores); + + public final static String toString(int[] a){ + + int width=7; + + StringBuilder sb=new StringBuilder((a.length+1)*width+2); + for(int num : a){ + String s=" "+num; + int spaces=width-s.length(); + assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i>SCOREOFFSET; + String s=" "+num; + if(s.length()>width){s=num>0 ? maxString : minString;} + int spaces=width-s.length(); + assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0); + for(int i=0; i0) : len; + return POINTS_MATCH()+(len-1)*POINTS_MATCH2(); + } + + public final int calcSubScore(int len){ + assert(len>0) : len; + final int lim3=LIMIT_FOR_COST_3(); + int score=POINTS_SUB(); + if(len>lim3){ + score+=(len-lim3)*POINTS_SUB3(); + len=lim3; + } + if(len>1){ + score+=(len-1)*POINTS_SUB2(); + } + return score; + } + + public final int calcNorefScore(int len){return len*POINTS_NOREF();} + + public final int calcNocallScore(int len){return len*POINTS_NOCALL();} + + public abstract int calcDelScore(int len, boolean approximateGaps); + +// private static int calcDelScoreOffset(int len){ +// if(len<=0){return 0;} +// int score=POINTSoff_DEL; +// +// if(len>LIMIT_FOR_COST_5){ +// score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5; +// len=LIMIT_FOR_COST_5; +// } +// if(len>LIMIT_FOR_COST_4){ +// score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4; +// len=LIMIT_FOR_COST_4; +// } +// if(len>LIMIT_FOR_COST_3){ +// 
score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3; +// len=LIMIT_FOR_COST_3; +// } +// if(len>1){ +// score+=(len-1)*POINTSoff_DEL2; +// } +// return score; +// } + + public abstract int calcInsScore(int len); + +// private static int calcInsScoreOffset(int len){ +// if(len<=0){return 0;} +// int score=POINTSoff_INS; +// if(len>LIMIT_FOR_COST_4){ +// score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4; +// len=LIMIT_FOR_COST_4; +// } +// if(len>LIMIT_FOR_COST_3){ +// score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3; +// len=LIMIT_FOR_COST_3; +// } +// if(len>1){ +// score+=(len-1)*POINTSoff_INS2; +// } +// return score; +// } + + static final int GAPBUFFER=Shared.GAPBUFFER; + static final int GAPBUFFER2=Shared.GAPBUFFER2; + static final int GAPLEN=Shared.GAPLEN; + static final int MINGAP=Shared.MINGAP; + static final int GAPCOST=Shared.GAPCOST; + static final byte GAPC=Shared.GAPC; + + /** Seemingly to clear out prior data from the gref. Not sure what else it's used for. */ + static final int GREFLIMIT2_CUSHION=128; //Tools.max(GAPBUFFER2, GAPLEN); + + + /**DO NOT MODIFY*/ + public abstract byte[] getGrefbuffer(); + +// public final int[] vertLimit; +// public final int[] horizLimit; + + public abstract CharSequence showVertLimit(); + public abstract CharSequence showHorizLimit(); + +//// public static final int MODEBITS=2; +// public static final int TIMEBITS=11; +// public static final int SCOREBITS=32-TIMEBITS; +// public static final int MAX_TIME=((1<1 ? 
args[1] : null); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + + long maxReads=0; + RTextInputStream rtis=new RTextInputStream(fname1, fname2, maxReads); + ConcurrentReadInputStream cris=new ConcurrentReadInputStream(rtis, maxReads); + + int[][] errors=process(cris); + printHistogram(errors); +// System.out.println("*** main() finished ***"); + } + + public static void printHistogram(int[][] errors){ + System.out.println("#Error Count Histogram: Number of Reads with X Mismatches"); + System.out.println("Errors\tRead 1\tRead 2\tPair\tRead 1 %\tRead 2 %\tPair %\t"); + + long sum1=Tools.sum(errors[0]), sum2=Tools.sum(errors[1]), sum3=Tools.sum(errors[2]); + + for(int i=0; i ln=cris.nextList(); + ArrayList readlist=ln.list; + while(!readlist.isEmpty()){ + + processList(readlist, counts); + + cris.returnList(ln, readlist.isEmpty()); + //System.err.println("Waiting on a list..."); + ln=cris.nextList(); + readlist=ln.list; + } + + //System.err.println("Returning a list... 
(final)"); + assert(readlist.isEmpty()); + cris.returnList(ln, readlist.isEmpty()); + ReadWrite.closeStream(cris); + + return counts; + } + + private static void processList(ArrayList list, int[][] counts) { + for(Read r : list){ + processRead(r, counts); + } + } + + private static void processRead(Read r, int[][] counts) { + +// if(!r.paired()){return;} + +// if(r.containsIndels()){return;} + +// if(r.countMismatches()>6){return;} +// if(r.avgQuality()<8){return;} + + + int n1=-1, s1=-1, n2=-1, s2=-1, sum1=-1, sum2=-1; + Read r2=r.mate; + + if(r.mapped() && r.valid() && r.match!=null){ + n1=count(r.match, 'N'); + s1=count(r.match, 'S'); + sum1=n1+s1; + } + if(r2!=null && r2.mapped() && r2.valid() && r2.match!=null){ + n2=count(r2.match, 'N'); + s2=count(r2.match, 'S'); + sum2=n2+s2; + } + + if(sum1>-1){counts[0][sum1]++;} + if(sum2>-1){counts[1][sum2]++;} + if(sum1>-1 && sum2>-1){counts[2][sum1+sum2]++;} + + } + + + + public static int count(byte[] match, char symbol){ + assert(match!=null); + int x=0; + for(byte b : match){ + if(b==symbol){x++;} + } + return x; + } + +} diff --git a/current/align2/MakeErrorQualityHistogram.java b/current/align2/MakeErrorQualityHistogram.java new file mode 100755 index 0000000..23cb2a2 --- /dev/null +++ b/current/align2/MakeErrorQualityHistogram.java @@ -0,0 +1,124 @@ +package align2; + +import java.util.ArrayList; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.SiteScore; + +import dna.Gene; +import fileIO.ReadWrite; + +public class MakeErrorQualityHistogram { + + public static void main(String[] args){ + + String fname1=args[0]; + String fname2=(args.length>1 ? 
args[1] : null); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + + long maxReads=0; + RTextInputStream rtis=new RTextInputStream(fname1, fname2, maxReads); + ConcurrentReadInputStream cris=new ConcurrentReadInputStream(rtis, maxReads); + + int[][] errors=process(cris); + printHistogram(errors); +// System.out.println("*** main() finished ***"); + } + + public static void printHistogram(int[][] errors){ + System.out.println("#Error Quality Histogram"); + System.out.println("Quality\tErrors\tMatches\tPercent Errors"); + for(int i=0; i ln=cris.nextList(); + ArrayList readlist=ln.list; + while(!readlist.isEmpty()){ + + processList(readlist, counts); + + cris.returnList(ln, readlist.isEmpty()); + //System.err.println("Waiting on a list..."); + ln=cris.nextList(); + readlist=ln.list; + } + + //System.err.println("Returning a list... (final)"); + assert(readlist.isEmpty()); + cris.returnList(ln, readlist.isEmpty()); + ReadWrite.closeStream(cris); + + return counts; + } + + private static void processList(ArrayList list, int[][] counts) { + for(Read r : list){ + processRead(r, counts); + if(r.mate!=null){ + processRead(r.mate, counts); + } + } + } + + private static void processRead(Read r, int[][] counts) { + if(!r.mapped() || r.invalid()){return;} + if(r.mate!=null){ + if(!r.mate.mapped() || r.mate.invalid() || r.mate.match==null || r.mate.containsIndels()){return;} + int len=Tools.max(r.stop, r.mate.stop)-Tools.min(r.start, r.mate.start); + if(len(2+r.bases.length/10)){return;} + if(r.avgQuality()<8){return;} + + if(r.chrom<1 && r.numSites()>0){ + assert(false) : r.toText(false); + SiteScore ss=r.topSite(); //Should not be necessary + r.start=ss.start; + r.stop=ss.stop; + r.chrom=ss.chrom; + r.setStrand(ss.strand); + } + + if(r.strand()==Gene.MINUS){Tools.reverseInPlace(r.match);} + + for(int i=0; i1 ? 
args[1] : null); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + + long maxReads=0; + RTextInputStream rtis=new RTextInputStream(fname1, fname2, maxReads); + ConcurrentReadInputStream cris=new ConcurrentReadInputStream(rtis, maxReads); + + int[] counts=process(cris); + printMappedHistogram(counts); +// System.out.println("*** main() finished ***"); + } + + public static void printMappedHistogram(int[] counts){ + System.out.println("#Insert Length Histogram"); + System.out.println("#Reads: \t"+totalReads); + System.out.println("#Used: \t"+used+String.format("\t%.3f", (used*100d/totalReads))); + + long wtSum=0; + for(int i=0; i ln=cris.nextList(); + ArrayList readlist=ln.list; + while(!readlist.isEmpty()){ + + processList(readlist, counts); + + cris.returnList(ln, readlist.isEmpty()); + //System.err.println("Waiting on a list..."); + ln=cris.nextList(); + readlist=ln.list; + } + + //System.err.println("Returning a list... (final)"); + assert(readlist.isEmpty()); + cris.returnList(ln, readlist.isEmpty()); + ReadWrite.closeStream(cris); + + return counts; + } + + private static void processList(ArrayList list, int[] counts) { + for(Read r : list){ + processRead(r, counts); +// if(r.mate!=null){ +// processRead(r.mate, mapped, paired); +// } + } + } + + private static void processRead(Read r, int[]counts) { + totalReads++; + if(!r.paired()){return;} + Read r2=r.mate; + if(r.match==null || r2.match==null || r.invalid() || r2.invalid()){return;} + + if(r.containsIndels()){return;} + if(r2.containsIndels()){return;} +// + if(r.countMismatches()>5){return;} + if(r.avgQuality()<12){return;} + if(r2.countMismatches()>5){return;} + if(r2.avgQuality()<12){return;} + + if(r.chrom<1 && r.numSites()>0){ + assert(false) : r.toText(false); + SiteScore ss=r.topSite(); //Should not be necessary + r.start=ss.start; + r.stop=ss.stop; + r.chrom=ss.chrom; + r.setStrand(ss.strand); + } + + if(r2.chrom<1 && r2.numSites()>0){ + assert(false) : 
r2.toText(false); + SiteScore ss=r2.topSite(); //Should not be necessary + r2.start=ss.start; + r2.stop=ss.stop; + r2.chrom=ss.chrom; + r2.setStrand(ss.strand); + } + + if(r.chrom!=r2.chrom || Tools.absdif(r.start, r2.start)>2000){ + return; + } + + if(r.chrom<1 || r2.chrom<1){return;} + +// int insert; +// if(r.start<=r2.start){ +// insert=r2.start-r.stop; +// }else{ +// insert=r.start-r2.stop; +// } + +// int insert=Tools.max(r.stop, r2.stop)-Tools.min(r.start, r2.start); + int insert; + if(r.strand()==Gene.PLUS){ + insert=r2.stop-r.start; + }else{ + insert=r.stop-r2.start; + } + + if(insert<0){insert=0;} + if(insert>=counts.length){insert=counts.length-1;} + counts[insert]++; + used++; + } + + public static long totalReads=0; + public static long used=0; + +} diff --git a/current/align2/MakeQualityHistogram.java b/current/align2/MakeQualityHistogram.java new file mode 100755 index 0000000..fea6d87 --- /dev/null +++ b/current/align2/MakeQualityHistogram.java @@ -0,0 +1,120 @@ +package align2; + +import java.util.ArrayList; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.SiteScore; + +import fileIO.ReadWrite; + +public class MakeQualityHistogram { + + public static void main(String[] args){ + + String fname1=args[0]; + String fname2=(args.length>1 ? 
args[1] : null); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + + long maxReads=0; + RTextInputStream rtis=new RTextInputStream(fname1, fname2, maxReads); + ConcurrentReadInputStream cris=new ConcurrentReadInputStream(rtis, maxReads); + + int[][][] counts=process(cris); + printMappedHistogram(counts[0]); + System.out.println(); + printPairedHistogram(counts[1]); +// System.out.println("*** main() finished ***"); + } + + public static void printMappedHistogram(int[][] mapped){ + System.out.println("#Error Quality Histogram"); + System.out.println("Quality\tMapped\tUnmapped\tPercent Mapped"); + for(int i=0; i ln=cris.nextList(); + ArrayList readlist=ln.list; + while(!readlist.isEmpty()){ + + processList(readlist, mapped, paired); + + cris.returnList(ln, readlist.isEmpty()); + //System.err.println("Waiting on a list..."); + ln=cris.nextList(); + readlist=ln.list; + } + + //System.err.println("Returning a list... (final)"); + assert(readlist.isEmpty()); + cris.returnList(ln, readlist.isEmpty()); + ReadWrite.closeStream(cris); + + return new int[][][] {mapped, paired}; + } + + private static void processList(ArrayList list, int[][] mapped, int[][] paired) { + for(Read r : list){ + processRead(r, mapped, paired); +// if(r.mate!=null){ +// processRead(r.mate, mapped, paired); +// } + } + } + + private static void processRead(Read r, int[][] mapped, int[][] paired) { + +// if(!r.paired()){return;} + +// if(TranslateColorspaceRead.containsIndels(r.match)){return;} +// +// if(r.countMismatches()>4){return;} +// if(r.avgQuality()<8){return;} + + if(r.chrom<1 && r.numSites()>0){ + SiteScore ss=r.topSite(); //Should not be necessary + r.start=ss.start; + r.stop=ss.stop; + r.chrom=ss.chrom; + r.setStrand(ss.strand); + } + + int avgQ=r.avgQuality(); + if(r.chrom>0){ + mapped[0][avgQ]++; + }else{ + mapped[1][avgQ]++; + } + if(r.paired()){ + paired[0][avgQ]++; + }else{ + paired[1][avgQ]++; + } + + } + +} diff --git 
a/current/align2/MakeRocCurve.java b/current/align2/MakeRocCurve.java new file mode 100755 index 0000000..8fc5c84 --- /dev/null +++ b/current/align2/MakeRocCurve.java @@ -0,0 +1,327 @@ +package align2; + +import java.io.File; +import java.util.Arrays; +import java.util.BitSet; + +import stream.Read; +import stream.SamLine; +import stream.SiteScore; + +import dna.Timer; + +import fileIO.TextFile; + +public class MakeRocCurve { + + + public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + String in=null; + long reads=-1; + + for(int i=0; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in") || a.equals("in1")){ + in=b; + }else if(a.equals("reads")){ + reads=Long.parseLong(b); + }else if(a.equals("parsecustom")){ + parsecustom=Tools.parseBoolean(b); + }else if(a.equals("ssaha2") || a.equals("subtractleadingclip")){ + SamLine.SUBTRACT_LEADING_SOFT_CLIP=Tools.parseBoolean(b); + }else if(a.equals("blasr")){ + BLASR=Tools.parseBoolean(b); + }else if(a.equals("bitset")){ + USE_BITSET=Tools.parseBoolean(b); + }else if(a.equals("thresh")){ + THRESH2=Integer.parseInt(b); + }else if(a.equals("outputerrors")){ +// OUTPUT_ERRORS=true; + }else if(i==0 && args[i].indexOf('=')<0 && (a.startsWith("stdin") || new File(args[0]).exists())){ + in=args[0]; + }else if(i==1 && args[i].indexOf('=')<0 && Character.isDigit(a.charAt(0))){ + reads=Long.parseLong(a); + } + } + + if(USE_BITSET){ + int x=400000; + if(reads>0 && reads<=Integer.MAX_VALUE){x=(int)reads;} + try { + seen=new BitSet(x); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + System.out.println("Did not have enough memory to allocate bitset; duplicate mappings will not be detected."); + } + } + + process(in); + + System.out.println("ROC Curve for "+in); + System.out.println(header()); + gradeList(reads); 
+ t.stop(); + System.err.println("Time: \t"+t); + + } + + public static void process(String samfile){ + TextFile tf=new TextFile(samfile, false, false); + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + char c=s.charAt(0); + if(c!='@'/* && c!=' ' && c!='\t'*/){ + SamLine sl=new SamLine(s); + final int id=((((int)sl.parseNumericId())<<1)|sl.pairnum()); + assert(sl!=null); + Read r=sl.toRead(true); + if(r!=null){ + r.obj=sl; + if(sl.primary() && (seen==null || !seen.get(id))){ + if(seen!=null){seen.set(id);} + calcStatistics1(r, (SamLine) r.obj); + } + }else{ + assert(false) : "'"+"'"; + System.err.println("Bad read from line '"+s+"'"); + } +// calcStatistics1(r); + } + } + tf.close(); + } + + public static String header(){ + return "minScore\tmapped\tretained\ttruePositiveStrict\tfalsePositiveStrict\ttruePositiveLoose" + + "\tfalsePositiveLoose\tfalseNegative\tdiscarded\tambiguous"; + } + + public static void gradeList(long reads){ + + int truePositiveStrict=0; + int falsePositiveStrict=0; + + int truePositiveLoose=0; + int falsePositiveLoose=0; + + int mapped=0; + int mappedRetained=0; + int unmapped=0; + + int discarded=0; + int ambiguous=0; + + int primary=0; + + + for(int q=truePositiveStrictA.length-1; q>=0; q--){ + if(mappedA[q]>0 || unmappedA[q]>0){ + truePositiveStrict+=truePositiveStrictA[q]; + falsePositiveStrict+=falsePositiveStrictA[q]; + truePositiveLoose+=truePositiveLooseA[q]; + falsePositiveLoose+=falsePositiveLooseA[q]; + mapped+=mappedA[q]; + mappedRetained+=mappedRetainedA[q]; + unmapped+=unmappedA[q]; + discarded+=discardedA[q]; + ambiguous+=ambiguousA[q]; + primary+=primaryA[q]; + + double tmult=100d/reads; + + double mappedB=mapped*tmult; + double retainedB=mappedRetained*tmult; + double truePositiveStrictB=truePositiveStrict*tmult; + double falsePositiveStrictB=falsePositiveStrict*tmult; + double truePositiveLooseB=truePositiveLoose*tmult; + double falsePositiveLooseB=falsePositiveLoose*tmult; + double 
falseNegativeB=(reads-mapped)*tmult; + double discardedB=discarded*tmult; + double ambiguousB=ambiguous*tmult; + + StringBuilder sb=new StringBuilder(); + sb.append(q); + sb.append('\t'); + sb.append(String.format("%.4f", mappedB)); + sb.append('\t'); + sb.append(String.format("%.4f", retainedB)); + sb.append('\t'); + sb.append(String.format("%.4f", truePositiveStrictB)); + sb.append('\t'); + sb.append(String.format("%.4f", falsePositiveStrictB)); + sb.append('\t'); + sb.append(String.format("%.4f", truePositiveLooseB)); + sb.append('\t'); + sb.append(String.format("%.4f", falsePositiveLooseB)); + sb.append('\t'); + sb.append(String.format("%.4f", falseNegativeB)); + sb.append('\t'); + sb.append(String.format("%.4f", discardedB)); + sb.append('\t'); + sb.append(String.format("%.4f", ambiguousB)); + + System.out.println(sb); + }else{ + assert(truePositiveStrictA[q]==0) : q; + assert(falsePositiveStrictA[q]==0) : q; + assert(truePositiveLooseA[q]==0) : q; + assert(falsePositiveLooseA[q]==0) : q; + } + + } + } + + public static void calcStatistics1(final Read r, SamLine sl){ + + int q=r.mapScore; + + int THRESH=0; + primaryA[q]++; + if(q<0){q=0;} + if(q>=discardedA.length){q=discardedA.length-1;} + + if(r.discarded()/* || r.mapScore==0*/){ + discardedA[q]++; + unmappedA[q]++; + }else if(r.ambiguous()){ +// assert(r.mapped()) : "\n"+r+"\n"+sl+"\n"; + if(r.mapped()){mappedA[q]++;} + ambiguousA[q]++; + }else if(r.mapScore<1){ + unmappedA[q]++; + } +// else if(r.mapScore<=minQuality){ +// if(r.mapped()){mappedA[q]++;} +// ambiguousA[q]++; +// } + else{ + if(!r.mapped()){ + unmappedA[q]++; + }else{ + mappedA[q]++; + mappedRetainedA[q]++; + + if(parsecustom){ + SiteScore os=r.originalSite; + assert(os!=null); + if(os!=null){ + int trueChrom=os.chrom; + byte trueStrand=os.strand; + int trueStart=os.start; + int trueStop=os.stop; + SiteScore ss=new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0, 0); + byte[] originalContig=sl.originalContig(); + if(BLASR){ + 
originalContig=(originalContig==null || Tools.indexOf(originalContig, (byte)'/')<0 ? originalContig : + Arrays.copyOfRange(originalContig, 0, Tools.lastIndexOf(originalContig, (byte)'/'))); + } + int cstart=sl.originalContigStart(); + + boolean strict=isCorrectHit(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH, originalContig, sl.rname(), cstart); + boolean loose=isCorrectHitLoose(ss, trueChrom, trueStrand, trueStart, trueStop, THRESH+THRESH2, originalContig, sl.rname(), cstart); + + // if(!strict){ + // System.out.println(ss+", "+new String(originalContig)+", "+new String(sl.rname())); + // assert(false); + // } + + // System.out.println("loose = "+loose+" for "+r.toText()); + + if(loose){ + // System.err.println("TPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t" + // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop); + truePositiveLooseA[q]++; + }else{ + // System.err.println("FPL\t"+trueChrom+", "+trueStrand+", "+trueStart+", "+trueStop+"\tvs\t" + // +ss.chrom+", "+ss.strand+", "+ss.start+", "+ss.stop); + falsePositiveLooseA[q]++; + } + + if(strict){ + // System.err.println("TPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop); + truePositiveStrictA[q]++; + }else{ + // System.err.println("FPS\t"+trueStart+", "+trueStop+"\tvs\t"+ss.start+", "+ss.stop); + falsePositiveStrictA[q]++; + } + } + } + } + } + + } + + + + public static boolean isCorrectHit(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, + byte[] originalContig, byte[] contig, int cstart){ + if(ss.strand!=trueStrand){return false;} + if(originalContig!=null){ + if(!Arrays.equals(originalContig, contig)){return false;} + }else{ + if(ss.chrom!=trueChrom){return false;} + } + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + int cstop=cstart+trueStop-trueStart; +// return (absdif(ss.start, trueStart)<=thresh && absdif(ss.stop, 
trueStop)<=thresh); + return (absdif(ss.start, cstart)<=thresh && absdif(ss.stop, cstop)<=thresh); + } + + + public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, + byte[] originalContig, byte[] contig, int cstart){ + if(ss.strand!=trueStrand){return false;} + if(originalContig!=null){ + if(!Arrays.equals(originalContig, contig)){return false;} + }else{ + if(ss.chrom!=trueChrom){return false;} + } + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + int cstop=cstart+trueStop-trueStart; +// return (absdif(ss.start, trueStart)<=thresh || absdif(ss.stop, trueStop)<=thresh); + return (absdif(ss.start, cstart)<=thresh || absdif(ss.stop, cstop)<=thresh); + } + + private static final int absdif(int a, int b){ + return a>b ? a-b : b-a; + } + + public static int truePositiveStrictA[]=new int[1000]; + public static int falsePositiveStrictA[]=new int[1000]; + + public static int truePositiveLooseA[]=new int[1000]; + public static int falsePositiveLooseA[]=new int[1000]; + + public static int mappedA[]=new int[1000]; + public static int mappedRetainedA[]=new int[1000]; + public static int unmappedA[]=new int[1000]; + + public static int discardedA[]=new int[1000]; + public static int ambiguousA[]=new int[1000]; + + public static int primaryA[]=new int[1000]; + + public static boolean parsecustom=true; + + public static int THRESH2=20; + public static boolean BLASR=false; + public static boolean USE_BITSET=true; + public static BitSet seen=null; + +} diff --git a/current/align2/MultiStateAligner10ts.java b/current/align2/MultiStateAligner10ts.java new file mode 100755 index 0000000..3868efd --- /dev/null +++ b/current/align2/MultiStateAligner10ts.java @@ -0,0 +1,3453 @@ +package align2; + +import java.util.Arrays; + +import dna.AminoAcid; + +/** + * "P" for "Packed".
+ * Same as MSA2P, but the "prevState" field was removed. + * Yields identical results to MSA2, but is faster. + * For very long reads (over 2000bp) the score may overflow, so MSA2 should be used instead, + * or the time field should be shrunk. */ +public final class MultiStateAligner10ts extends MSA{ + + + public static void main(String[] args){ + byte[] read=args[0].getBytes(); + byte[] ref=args[1].getBytes(); + + byte[] original=ref; + + boolean colorspace=false; + + if(args.length>2 && args[2].equalsIgnoreCase("cs")){ + colorspace=true; + read=AminoAcid.toColorspace(read); + ref=AminoAcid.toColorspace(ref); + } + + MultiStateAligner10ts msa=new MultiStateAligner10ts(read.length, ref.length, colorspace); + + System.out.println("Initial: "); + for(int mode=0; mode0 && bandwidth0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(170, read.length+20)){ +// assert(false) : minScore; +// assert(minScore>0) : minScore; +// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length); + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + +// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + minScore-=120; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", 
"+maxColumns+"\n"+new String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. + assert(subfloor=0; i--){ + byte c=read[i]; + if(AminoAcid.isFullyDefined(c)){ + vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor); + prevDefined=false; + } + } + + horizLimit[columns]=minScore_off; + prevDefined=false; + for(int i=columns-1; i>=0; i--){ + byte c=ref[refStartLoc+i]; + if(AminoAcid.isFullyDefined(c)){ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor); + prevDefined=false; + } + } + +// vertLimit[rows]=minScore_off; +// for(int i=rows-1; i>=0; i--){ +// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); +// } +// +// horizLimit[columns]=minScore_off; +// for(int i=columns-1; i>=0; i--){ +// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); +// } + + for(int row=1; row<=rows; row++){ + + int colStart=minGoodCol; + int colStop=maxGoodCol; + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MS][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' 
: ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + + final boolean gap=(ref1==GAPC); + assert(call1!=GAPC); + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 
streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + 
assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + +// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ + if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int 
scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS : + streak=colStop){ + if(col>colStop && maxGoodCol1){ + packed[MODE_MS][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** + * Like fillLimitedX but additionally restricted to a band. + * return new int[] {rows, maxC, maxS, max}; + * Will not fill areas that cannot match minScore */ + private final int[] fillBanded1(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ +// minScore=0; +// assert(minScore>0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + final int halfband=bandwidth/2; + + if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(170, read.length+20)){ +// assert(false) : minScore; +// assert(minScore>0) : minScore; +// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length); + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + +// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + minScore-=120; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: 
"+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. + assert(subfloor=0; i--){ + byte c=read[i]; + if(AminoAcid.isFullyDefined(c)){ + vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor); + prevDefined=false; + } + } + + horizLimit[columns]=minScore_off; + prevDefined=false; + for(int i=columns-1; i>=0; i--){ + byte c=ref[refStartLoc+i]; + if(AminoAcid.isFullyDefined(c)){ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? 
POINTSoff_DEL : POINTSoff_NOREF), floor); + prevDefined=false; + } + } + +// vertLimit[rows]=minScore_off; +// for(int i=rows-1; i>=0; i--){ +// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); +// } +// +// horizLimit[columns]=minScore_off; +// for(int i=columns-1; i>=0; i--){ +// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); +// } + + for(int row=1; row<=rows; row++){ + + int colStart=Tools.max(minGoodCol, row-halfband); + int colStop=Tools.min(maxGoodCol, row+halfband); + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MS][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + + final boolean gap=(ref1==GAPC); + assert(call1!=GAPC); + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? 
limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + +// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ + if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : + streak=colStop){ + if(col>colStop && maxGoodCol1){ + packed[MODE_MS][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** return new int[] {rows, maxC, maxS, max, maxStart}; + * Will not fill areas that cannot match minScore */ + private final int[] fillBanded(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ + assert(false) : "TODO"; +// minScore=0; +// assert(minScore>0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + minScore-=120; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. 
+ assert(subfloor=0; i--){ + byte c=read[i]; + if(AminoAcid.isFullyDefined(c)){ + vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor); + prevDefined=false; + } + } + + horizLimit[columns]=minScore_off; + prevDefined=false; + for(int i=columns-1; i>=0; i--){ + byte c=ref[refStartLoc+i]; + if(AminoAcid.isFullyDefined(c)){ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor); + prevDefined=false; + } + } + +// vertLimit[rows]=minScore_off; +// for(int i=rows-1; i>=0; i--){ +// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); +// } +// +// horizLimit[columns]=minScore_off; +// for(int i=columns-1; i>=0; i--){ +// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); +// } + + for(int row=1; row<=rows; row++){ + + int colStart=minGoodCol; + int colStop=maxGoodCol; + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + bpacked[MODE_MS][row][colStart-1]=subfloor; + bpacked[MODE_INS][row][colStart-1]=subfloor; + bpacked[MODE_DEL][row][colStart-1]=subfloor; + }else{ + bpacked[MODE_MS][row-1][0]=bpacked[MODE_INS][row-1][0]=bpacked[MODE_DEL][row-1][0]=col0score[row-1]; + bpacked[MODE_MS][row][0]=bpacked[MODE_INS][row][0]=bpacked[MODE_DEL][row][0]=col0score[row]; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' 
: read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + + final boolean gap=(ref1==GAPC); + assert(call1!=GAPC); + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=bpacked[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=bpacked[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=bpacked[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=bpacked[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=bpacked[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=bpacked[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=bpacked[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 
streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// bpacked[MODE_MS][row][col]=(score|prevState|time); +// bpacked[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// bpacked[MODE_MS][row][col]=(score|prevState|time); +// bpacked[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + start=startmatrix[prevState][row-1][col-1]; + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || 
score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// bpacked[MODE_MS][row][col]=(score|prevState|time); + bpacked[MODE_MS][row][col]=(score|time); + startmatrix[MODE_MS][row][col]=start; + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + bpacked[MODE_DEL][row][col]=subfloor; + startmatrix[MODE_MS][row][col]=-1; + }else{//Calculate DEL score + + final int streak=bpacked[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + start=startmatrix[prevState][row][col-1]; + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// bpacked[MODE_DEL][row][col]=(score|prevState|time); + bpacked[MODE_DEL][row][col]=(score|time); + startmatrix[MODE_MS][row][col]=start; + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + +// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ 
+ if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ + bpacked[MODE_INS][row][col]=subfloor; + startmatrix[MODE_MS][row][col]=-1; + }else{//Calculate INS score + + final int streak=bpacked[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? POINTSoff_INS : + streak=colStop){ + if(col>colStop && maxGoodCol1){ + bpacked[MODE_MS][row-1][col+1]=subfloor; + bpacked[MODE_INS][row-1][col+1]=subfloor; + bpacked[MODE_DEL][row-1][col+1]=subfloor; + startmatrix[MODE_MS][row-1][col+1]=-1; + startmatrix[MODE_INS][row-1][col+1]=-1; + startmatrix[MODE_DEL][row-1][col+1]=-1; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxStart=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxStart=startmatrix[state][rows][col]; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore, maxStart}; + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Will not fill areas that cannot match minScore */ + public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ + if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} + else{ + byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); + assert(gref!=null) : "Excessively long read:\n"+new String(read); + return fillUnlimited(read, gref, 0, greflimit); + } + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Does not require a min score (ie, same as old method) */ + private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int 
maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; + final int subfloor=0-2*maxGain; + assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more. +// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + //temporary, for finding a bug + if(rows>maxRows || columns>maxColumns){ + throw new RuntimeException(); + } + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + if(rowBARRIER_D2){ + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + //Calculate INS score +// if(gap || colBARRIER_I2){ + if(gap || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Deprecated + /** return new int[] {rows, maxC, maxS, max}; */ + public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ + assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** @return {score, bestRefStart, bestRefStop} */ + /** Generates the match string */ + public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){ + if(gapped){ + final byte[] gref=grefbuffer; + int gstart=translateToGappedCoordinate(refStartLoc, gref); + int gstop=translateToGappedCoordinate(refEndLoc, gref); + byte[] out=traceback2(read, gref, gstart, gstop, row, col, state); + return out; + }else{ + return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); + } + } + + + /** Generates the match string */ + public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ +// assert(false); + assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; + assert(row==rows); + + byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". + int outPos=0; + + int gaps=0; + + if(state==MODE_INS){ + //TODO ? Maybe not needed. 
+ } + + while(row>0 && col>0){ + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + +// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + + byte c=read[row-1]; + byte r=ref[refStartLoc+col-1]; + if(c==r){ + out[outPos]='m'; + }else{ + if(!AminoAcid.isFullyDefined(c, colorspace)){ + out[outPos]='N'; + }else if(!AminoAcid.isFullyDefined(r, colorspace)){ +// out[outPos]='X'; + out[outPos]='N'; + }else{ + out[outPos]='S'; + } + } + + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + + byte r=ref[refStartLoc+col-1]; + if(r==GAPC){ + out[outPos]='-'; + gaps++; + }else{ + out[outPos]='D'; + } + col--; + }else{ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + + assert(state==MODE_INS) : state; + if(col==0){ + out[outPos]='X'; + }else if(col>=columns){ + out[outPos]='Y'; + }else{ + out[outPos]='I'; + } + row--; + } + +// assert(prev==prev0); + state=prev; + outPos++; + } + + assert(row==0 || col==0); + if(col!=row){ + while(row>0){ + out[outPos]='X'; + outPos++; + row--; + col--; + 
} + if(col>0){ + //do nothing + } + } + + + //Shrink and reverse the string + byte[] out2=new byte[outPos]; + for(int i=0; i "+translateFromGappedCoordinate(out[1], gref)+" -> "+ + translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref); + assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref)); + + out[1]=translateFromGappedCoordinate(out[1], gref); + out[2]=translateFromGappedCoordinate(out[2], gref); + if(verbose){System.err.println("returning score "+Arrays.toString(out));} + return out; + }else{ + return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); + } + } + + /** @return {score, bestRefStart, bestRefStop}, or {score, bestRefStart, bestRefStop, padLeft, padRight} if more padding is needed */ + public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, + final int maxRow, final int maxCol, final int maxState){ + + int row=maxRow; + int col=maxCol; + int state=maxState; + + assert(maxState>=0 && maxState=0 && maxRow=0 && maxColdifC){ + score+=POINTSoff_NOREF; + difR--; + } + + row+=difR; + col+=difR; + + } + + assert(refStartLoc<=refEndLoc); + assert(row==rows); + + + final int bestRefStop=refStartLoc+col-1; + + while(row>0 && col>0){ +// System.err.println("state="+state+", row="+row+", col="+col); + + + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + 
else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + col--; + }else{ + assert(state==MODE_INS); + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + row--; + } + + if(col<0){ + System.err.println(row); + break; //prevents an out of bounds access + + } + +// assert(prev==prev0); + state=prev; + +// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); + } +// assert(false) : row+", "+col; + if(row>col){ + col-=row; + } + + final int bestRefStart=refStartLoc+col; + + score>>=SCOREOFFSET; + int[] rvec; + if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow + int padLeft=Tools.max(0, refStartLoc-bestRefStart); + int padRight=Tools.max(0, bestRefStop-refEndLoc); + rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}; + }else{ + rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}; + } + return rvec; + } + + /** + * Fills grefbuffer + * @param ref + * @param a + * @param b + * @param gaps + * @return gref + */ + private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){ + assert(gaps!=null && gaps.length>0); + + assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps); + assert(refEndLoc>=gaps[gaps.length-1]); + + final int g0_old=gaps[0]; + final int gN_old=gaps[gaps.length-1]; + gaps[0]=Tools.min(gaps[0], refStartLoc); + gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc); + grefRefOrigin=gaps[0]; + + if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));} + +// grefRefOrigin=Tools.min(gaps[0], refStartLoc); + +// //This block is no longer needed since the 
array is preallocated. +// int len=0; +// final int gb2=GAPBUFFER*2; +// for(int i=0; iy); +// int gap=z-y-1; +// if(gapy); + int gap=z-y-1; + assert(gap>=MINGAP) : gap+"\t"+MINGAP; + if(gapgref.length){ + System.err.println("gref buffer overflow: "+lim+" > "+gref.length); + return null; + } + for(int i=greflimit, r=refEndLoc+1; i "+j);} + return j; + } + + j+=(c==GAPC ? GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + private final int translateToGappedCoordinate(int point, byte[] gref){ + if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);} + if(point<=grefRefOrigin){return point-grefRefOrigin;} + for(int i=0, j=grefRefOrigin; i "+i);} + return i; + } + + j+=(c==GAPC ? GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + +// public final int scoreNoIndels(byte[] read, SiteScore ss){ +// +// ChromosomeArray cha=Data.getChromosome(ss.chrom); +// final int refStart=ss.start; +// +// int score=0; +// int mode=MODE_START; +// int timeInMode=0; +// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //TODO: Partial match +// +// for(int i=0; icha.maxIndex+1){ +// int dif=(cha.maxIndex+1-refStop); +// readStop-=dif; +// score+=POINTSoff_NOREF*dif; +// } +// +//// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //No longer needed. 
+// +// for(int i=readStart; i0){//match + if(loc==lastValue){//contiguous match + score+=POINTS_MATCH2; + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=POINTS_MATCH; + }else if(loc=0); + score+=POINTS_MATCH; + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=POINTS_MATCH; + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else{//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInMode0){//match + if(loc==lastValue){//contiguous match + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + 
score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else if(loc==-1){//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInMode1) : minContig; + + int contig=0; + int maxContig=0; + + int score=0; + int lastLoc=-3; //Last true location + int lastValue=-1; + int timeInMode=0; + + for(int i=0; i0){//match + if(loc==lastValue){//contiguous match + contig++; + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + maxContig=Tools.max(maxContig, contig); + contig=1; + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } 
+ if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + maxContig=Tools.max(maxContig, contig); + contig=0; + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else if(loc==-1){//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInModeref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; i=ref.length) ? (byte)'N' : ref[j]; + + if(c=='N' || r=='N'){match[i]='N';} + else if(c==r){match[i]='m';} + else{match[i]='S';} + + } + + return match; + } + + @Override + public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){ + + int score=0; + int mode=-1; + int timeInMode=0; + + //This block handles cases where the read runs outside the reference + //Of course, padding the reference with 'N' would be better, but... + int readStart=0; + int readStop=read.length; + final int refStop=refStart+read.length; + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 
+ + for(int i=readStart; iref.length){return -99999;} + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].lengthref.length){return -99999;} + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].length>SCOREOFFSET).append(",");} + return sb; + } + + public static float minIdToMinRatio(double minid){ + if(minid>1){minid=minid/100;} + assert(minid>0 && minid<=1) : "Min identity should be between 0 and 1. 
Values above 1 will be assumed to be percent and divided by 100."; + double matchdif=POINTS_MATCH-POINTS_MATCH2; + double match=POINTS_MATCH2; + double sub=-POINTS_MATCH2+0.5*(matchdif+POINTS_SUB)+0.5*POINTS_SUB2; + double del=0.1*(matchdif+POINTS_DEL)+0.2*POINTS_DEL2+0.4*POINTS_DEL3+0.3*POINTS_DEL4; + double ins=-POINTS_MATCH2+0.4*(matchdif+POINTS_INS)+0.3*(POINTS_INS2)+0.3*(POINTS_INS3); + double badAvg=.7*sub+.2*del+.1*ins; + double badFraction=1-minid; + double minratio=(match+badFraction*badAvg)/match; + assert(minratio<=1); + minratio=Tools.max(0.05, minratio); + return (float)minratio; + } + + public static final int TIMEBITS=11; + public static final int SCOREBITS=32-TIMEBITS; + public static final int MAX_TIME=((1<2 && args[2].equalsIgnoreCase("cs")){ + colorspace=true; + read=AminoAcid.toColorspace(read); + ref=AminoAcid.toColorspace(ref); + } + + MultiStateAligner11ts msa=new MultiStateAligner11ts(read.length, ref.length, colorspace); + + System.out.println("Initial: "); + for(int mode=0; mode0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 : + Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 
9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2; + + if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){ +// assert(false) : minScore; +// assert(minScore>0) : minScore; +// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length); + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + +// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + minScore-=120; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. + assert(subfloor=0; i--){ + byte c=read[i]; + if(AminoAcid.isFullyDefined(c)){ + vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor); + prevDefined=false; + } + } + + horizLimit[columns]=minScore_off; + prevDefined=false; + for(int i=columns-1; i>=0; i--){ + byte c=ref[refStartLoc+i]; + if(AminoAcid.isFullyDefined(c)){ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? 
POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor); + prevDefined=false; + } + } + +// vertLimit[rows]=minScore_off; +// for(int i=rows-1; i>=0; i--){ +// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); +// } +// +// horizLimit[columns]=minScore_off; +// for(int i=columns-1; i>=0; i--){ +// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); +// } + + for(int row=1; row<=rows; row++){ + + final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband)); + final int colStop=(halfband<1 ? maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1)); + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MS][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + + final boolean gap=(ref1==GAPC); + assert(call1!=GAPC); + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? 
false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + POINTSoff_SUB_ARRAY[streak+1]); +// scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? 
POINTSoff_SUBR : POINTSoff_SUB) : +// (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + +// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ + if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+POINTSoff_INS_ARRAY[streak+1]; +// int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : +// streak=colStop){ + if(col>colStop && (maxGoodCol0)){break;} + if(row>1){ + packed[MODE_MS][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Override + public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ + if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} + else{ + byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); + assert(gref!=null) : "Excessively long read:\n"+new String(read); + return fillUnlimited(read, gref, 0, greflimit); + } + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Does not require a min score (ie, same as old method) */ + private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; + final int subfloor=0-2*maxGain; + assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n" + +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more. 
+// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + //temporary, for finding a bug + if(rows>maxRows || columns>maxColumns){ + throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); + } + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + POINTSoff_SUB_ARRAY[streak+1]); +// scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : +// (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + if(rowBARRIER_D2){ + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + //Calculate INS score +// if(gap || colBARRIER_I2){ + if(gap || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+POINTSoff_INS_ARRAY[streak+1]; +// int 
scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : +// streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Deprecated + /** return new int[] {rows, maxC, maxS, max}; */ + public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ + assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + POINTSoff_SUB_ARRAY[streak+1]); +// int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : +// (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? 
(streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+POINTSoff_INS_ARRAY[streak+1]; +// int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : +// streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Override + /** @return {score, bestRefStart, bestRefStop} */ + /** Generates the match string */ + public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){ + if(gapped){ + final byte[] gref=grefbuffer; + int gstart=translateToGappedCoordinate(refStartLoc, gref); + int gstop=translateToGappedCoordinate(refEndLoc, gref); + byte[] out=traceback2(read, gref, gstart, gstop, row, col, state); + return out; + }else{ + return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); + } + } + + @Override + /** Generates the match string */ + public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ +// assert(false); + assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; + assert(row==rows); + + byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". + int outPos=0; + + int gaps=0; + + if(state==MODE_INS){ + //TODO ? Maybe not needed. 
+ } + + while(row>0 && col>0){ + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + +// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + + byte c=read[row-1]; + byte r=ref[refStartLoc+col-1]; + if(c==r){ + out[outPos]='m'; + }else{ + if(!AminoAcid.isFullyDefined(c, colorspace)){ + out[outPos]='N'; + }else if(!AminoAcid.isFullyDefined(r, colorspace)){ +// out[outPos]='X'; + out[outPos]='N'; + }else{ + out[outPos]='S'; + } + } + + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + + byte r=ref[refStartLoc+col-1]; + if(r==GAPC){ + out[outPos]='-'; + gaps++; + }else{ + out[outPos]='D'; + } + col--; + }else{ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + + assert(state==MODE_INS) : state; + if(col==0){ + out[outPos]='X'; + }else if(col>=columns){ + out[outPos]='Y'; + }else{ + out[outPos]='I'; + } + row--; + } + +// assert(prev==prev0); + state=prev; + outPos++; + } + + assert(row==0 || col==0); + if(col!=row){ + while(row>0){ + out[outPos]='X'; + outPos++; + row--; + col--; + 
} + if(col>0){ + //do nothing + } + } + + + //Shrink and reverse the string + byte[] out2=new byte[outPos]; + for(int i=0; i "+translateFromGappedCoordinate(out[1], gref)+" -> "+ + translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref); + assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref)); + + out[1]=translateFromGappedCoordinate(out[1], gref); + out[2]=translateFromGappedCoordinate(out[2], gref); + if(verbose){System.err.println("returning score "+Arrays.toString(out));} + return out; + }else{ + return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); + } + } + + @Override + /** @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState},
+ * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}
+ * if more padding is needed */ + public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, + final int maxRow, final int maxCol, final int maxState){ + + int row=maxRow; + int col=maxCol; + int state=maxState; + + assert(maxState>=0 && maxState=0 && maxRow=0 && maxColdifC){ + score+=POINTSoff_NOREF; + difR--; + } + + row+=difR; + col+=difR; + + } + + assert(refStartLoc<=refEndLoc); + assert(row==rows); + + + final int bestRefStop=refStartLoc+col-1; + + while(row>0 && col>0){ +// System.err.println("state="+state+", row="+row+", col="+col); + + + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + col--; + }else{ + assert(state==MODE_INS); + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + row--; + } + + if(col<0){ + System.err.println(row); + break; //prevents an out of bounds access + + } + +// assert(prev==prev0); + state=prev; + +// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); + } +// assert(false) : row+", 
"+col; + if(row>col){ + col-=row; + } + + final int bestRefStart=refStartLoc+col; + + score>>=SCOREOFFSET; + int[] rvec; + if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow + int padLeft=Tools.max(0, refStartLoc-bestRefStart); + int padRight=Tools.max(0, bestRefStop-refEndLoc); + rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}; + }else{ + rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}; + } + return rvec; + } + + /** + * Fills grefbuffer + * @param ref + * @param a + * @param b + * @param gaps + * @return gref + */ + private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){ + assert(gaps!=null && gaps.length>0); + + assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps); + assert(refEndLoc>=gaps[gaps.length-1]); + + final int g0_old=gaps[0]; + final int gN_old=gaps[gaps.length-1]; + gaps[0]=Tools.min(gaps[0], refStartLoc); + gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc); + grefRefOrigin=gaps[0]; + + if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));} + +// grefRefOrigin=Tools.min(gaps[0], refStartLoc); + +// //This block is no longer needed since the array is preallocated. +// int len=0; +// final int gb2=GAPBUFFER*2; +// for(int i=0; iy); +// int gap=z-y-1; +// if(gapy); + int gap=z-y-1; + assert(gap>=MINGAP) : gap+"\t"+MINGAP; + if(gapgref.length){ + System.err.println("gref buffer overflow: "+lim+" > "+gref.length); + return null; + } + for(int i=greflimit, r=refEndLoc+1; i "+j);} + return j; + } + + j+=(c==GAPC ? 
GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + private final int translateToGappedCoordinate(int point, byte[] gref){ + if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);} + if(point<=grefRefOrigin){return point-grefRefOrigin;} + for(int i=0, j=grefRefOrigin; i "+i);} + return i; + } + + j+=(c==GAPC ? GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + + /** Calculates score based on an array from Index */ + private final int calcAffineScore(int[] locArray){ + int score=0; + int lastLoc=-2; //Last true location + int lastValue=-1; + int timeInMode=0; + + for(int i=0; i0){//match + if(loc==lastValue){//contiguous match + score+=POINTS_MATCH2; + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=POINTS_MATCH; + }else if(loc=0); + score+=POINTS_MATCH; + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=(POINTS_MATCH+POINTS_INS_ARRAY_C[lastLoc-loc]); +// score+=POINTS_MATCH; +// 
score+=POINTS_INS; +// int dif=lastLoc-loc+1; +// if(dif>LIMIT_FOR_COST_4){ +// score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; +// dif=LIMIT_FOR_COST_4; +// } +// if(dif>LIMIT_FOR_COST_3){ +// score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; +// dif=LIMIT_FOR_COST_3; +// } +// if(dif>1){ +// score+=(dif-1)*POINTS_INS2; +// } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else{//substitution + if(lastValue<0 && timeInMode>0){//contiguous +// if(timeInMode0){//match + if(loc==lastValue){//contiguous match + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]+POINTS_INS_ARRAY_C[Tools.min(loc-lastLoc, 5)]); +// score+=(POINTS_MATCH+baseScores[i]); +// score+=POINTS_INS; +// int dif=lastLoc-loc+1; +// if(dif>LIMIT_FOR_COST_4){ +// score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; +// dif=LIMIT_FOR_COST_4; +// } +// if(dif>LIMIT_FOR_COST_3){ +// score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; +// dif=LIMIT_FOR_COST_3; +// } +// if(dif>1){ +// score+=(dif-1)*POINTS_INS2; +// } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else if(loc==-1){//substitution + if(lastValue<0 && 
timeInMode>0){//contiguous +// if(timeInMode1) : minContig; + + int contig=0; + int maxContig=0; + + int score=0; + int lastLoc=-3; //Last true location + int lastValue=-1; + int timeInMode=0; + + for(int i=0; i0){//match + if(loc==lastValue){//contiguous match + contig++; + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + maxContig=Tools.max(maxContig, contig); + contig=1; + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + maxContig=Tools.max(maxContig, contig); + contig=0; + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]+POINTS_INS_ARRAY_C[Tools.min(loc-lastLoc, 5)]); +// score+=(POINTS_MATCH+baseScores[i]); +// score+=POINTS_INS; +// int dif=lastLoc-loc+1; +// if(dif>LIMIT_FOR_COST_4){ +// score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; +// dif=LIMIT_FOR_COST_4; +// } +// if(dif>LIMIT_FOR_COST_3){ +// score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; +// dif=LIMIT_FOR_COST_3; +// } +// if(dif>1){ +// score+=(dif-1)*POINTS_INS2; +// } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else if(loc==-1){//substitution + if(lastValue<0 && timeInMode>0){//contiguous +// if(timeInModeref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } 
+ +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; i=ref.length) ? (byte)'N' : ref[j]; + + if(c=='N' || r=='N'){match[i]='N';} + else if(c==r){match[i]='m';} + else{match[i]='S';} + + } + + return match; + } + + @Override + public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){ + return scoreNoIndels(read, ref, baseScores, refStart, null); + } + @Override + public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){ + + int score=0; + int mode=-1; + int timeInMode=0; + int norefs=0; + + //This block handles cases where the read runs outside the reference + //Of course, padding the reference with 'N' would be better, but... + int readStart=0; + int readStop=read.length; + final int refStop=refStart+read.length; + boolean semiperfect=true; + + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + norefs+=readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 
+ + for(int i=readStart; iref.length){return -99999;} + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].lengthref.length){return -99999;} + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].length>SCOREOFFSET).append(",");} + return sb; + } + + public static float minIdToMinRatio(double minid){ + if(minid>1){minid=minid/100;} + assert(minid>0 && minid<=1) : "Min identity should be between 0 and 1. 
Values above 1 will be assumed to be percent and divided by 100."; + double matchdif=POINTS_MATCH-POINTS_MATCH2; + double match=POINTS_MATCH2; + double sub=-POINTS_MATCH2+0.5*(matchdif+POINTS_SUB)+0.5*POINTS_SUB2; + double del=0.1*(matchdif+POINTS_DEL)+0.2*POINTS_DEL2+0.4*POINTS_DEL3+0.3*POINTS_DEL4; + double ins=-POINTS_MATCH2+0.4*(matchdif+POINTS_INS)+0.3*(POINTS_INS2)+0.3*(POINTS_INS3); + double badAvg=.7*sub+.2*del+.1*ins; + double badFraction=1-minid; + double minratio=(match+badFraction*badAvg)/match; + assert(minratio<=1); + minratio=Tools.max(0.1, minratio); + return (float)minratio; + } + + public static final int TIMEBITS=11; + public static final int SCOREBITS=32-TIMEBITS; + public static final int MAX_TIME=((1<LIMIT_FOR_COST_4){ + pts=POINTS_INS4; + ptsoff=POINTSoff_INS4; + }else if(i>LIMIT_FOR_COST_3){ + pts=POINTS_INS3; + ptsoff=POINTSoff_INS3; + }else if(i>1){ + pts=POINTS_INS2; + ptsoff=POINTSoff_INS2; + }else{ + pts=POINTS_INS; + ptsoff=POINTSoff_INS; + } + POINTS_INS_ARRAY[i]=pts; + POINTSoff_INS_ARRAY[i]=ptsoff; + POINTS_INS_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_INS_ARRAY_C[i-1]); + POINTSoff_INS_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_INS_ARRAY_C[i-1]); + } + + + POINTS_SUB_ARRAY=new int[504]; + POINTSoff_SUB_ARRAY=new int[504]; + POINTS_SUB_ARRAY_C=new int[504]; + POINTSoff_SUB_ARRAY_C=new int[504]; + + for(int i=1; iLIMIT_FOR_COST_3){ + pts=POINTS_SUB3; + ptsoff=POINTSoff_SUB3; + }else if(i>1){ + pts=POINTS_SUB2; + ptsoff=POINTSoff_SUB2; + }else{ + pts=POINTS_SUB; + ptsoff=POINTSoff_SUB; + } + POINTS_SUB_ARRAY[i]=pts; + POINTSoff_SUB_ARRAY[i]=ptsoff; + POINTS_SUB_ARRAY_C[i]=Tools.max(MIN_SCORE, pts+POINTS_SUB_ARRAY_C[i-1]); + POINTSoff_SUB_ARRAY_C[i]=Tools.max(MINoff_SCORE, ptsoff+POINTSoff_SUB_ARRAY_C[i-1]); + } + } + + public final int POINTS_NOREF(){return POINTS_NOREF;} + public final int POINTS_NOCALL(){return POINTS_NOCALL;} + public final int POINTS_MATCH(){return POINTS_MATCH;} + public final int POINTS_MATCH2(){return 
POINTS_MATCH2;} + public final int POINTS_COMPATIBLE(){return POINTS_COMPATIBLE;} + public final int POINTS_SUB(){return POINTS_SUB;} + public final int POINTS_SUBR(){return POINTS_SUBR;} + public final int POINTS_SUB2(){return POINTS_SUB2;} + public final int POINTS_SUB3(){return POINTS_SUB3;} + public final int POINTS_MATCHSUB(){return POINTS_MATCHSUB;} + public final int POINTS_INS(){return POINTS_INS;} + public final int POINTS_INS2(){return POINTS_INS2;} + public final int POINTS_INS3(){return POINTS_INS3;} + public final int POINTS_INS4(){return POINTS_INS4;} + public final int POINTS_DEL(){return POINTS_DEL;} + public final int POINTS_DEL2(){return POINTS_DEL2;} + public final int POINTS_DEL3(){return POINTS_DEL3;} + public final int POINTS_DEL4(){return POINTS_DEL4;} + public final int POINTS_DEL5(){return POINTS_DEL5;} + public final int POINTS_DEL_REF_N(){return POINTS_DEL_REF_N;} + public final int POINTS_GAP(){return POINTS_GAP;} + + public final int TIMESLIP(){return TIMESLIP;} + public final int MASK5(){return MASK5;} + /** Returns the SCOREOFFSET constant, i.e. the shift amount used as score>>=SCOREOFFSET elsewhere in this class; must read the field, not call this method, or it recurses forever. */ + public final int SCOREOFFSET(){return SCOREOFFSET;} + + final int BARRIER_I1(){return BARRIER_I1;} + final int BARRIER_D1(){return BARRIER_D1;} + + public final int LIMIT_FOR_COST_3(){return LIMIT_FOR_COST_3;} + public final int LIMIT_FOR_COST_4(){return LIMIT_FOR_COST_4;} + public final int LIMIT_FOR_COST_5(){return LIMIT_FOR_COST_5;} + + public final int BAD(){return BAD;} + + + private int rows; + private int columns; + +} diff --git a/current/align2/MultiStateAligner9PacBio.java b/current/align2/MultiStateAligner9PacBio.java new file mode 100755 index 0000000..ba67e3a --- /dev/null +++ b/current/align2/MultiStateAligner9PacBio.java @@ -0,0 +1,2461 @@ +package align2; + +import java.util.Arrays; + +import stream.Read; +import stream.SiteScore; + +import dna.AminoAcid; + +/** + * Based on MSA9ts, with transform scores tweaked for PacBio. 
*/ +public final class MultiStateAligner9PacBio extends MSA{ + + + public static void main(String[] args){ + byte[] read=args[0].getBytes(); + byte[] ref=args[1].getBytes(); + + byte[] original=ref; + + boolean colorspace=false; + + if(args.length>2 && args[2].equalsIgnoreCase("cs")){ + colorspace=true; + read=AminoAcid.toColorspace(read); + ref=AminoAcid.toColorspace(ref); + } + + MultiStateAligner9PacBio msa=new MultiStateAligner9PacBio(read.length, ref.length, colorspace); + + System.out.println("Initial: "); + for(int mode=0; mode0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 0 : + Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2; + + if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){ +// assert(false) : minScore; +// assert(minScore>0) : minScore; +// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length); + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + +// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + minScore-=120; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", 
"+maxColumns+"\n"+new String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. + assert(subfloor=0; i--){ + byte c=read[i]; + if(AminoAcid.isFullyDefined(c)){ + vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor); + prevDefined=false; + } + } + + horizLimit[columns]=minScore_off; + prevDefined=false; + for(int i=columns-1; i>=0; i--){ + byte c=ref[refStartLoc+i]; + if(AminoAcid.isFullyDefined(c)){ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor); + prevDefined=false; + } + } + +// vertLimit[rows]=minScore_off; +// for(int i=rows-1; i>=0; i--){ +// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); +// } +// +// horizLimit[columns]=minScore_off; +// for(int i=columns-1; i>=0; i--){ +// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); +// } + + for(int row=1; row<=rows; row++){ + + final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband)); + final int colStop=(halfband<1 ? 
maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1)); + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MS][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + + final boolean gap=(ref1==GAPC); + assert(call1!=GAPC); + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? 
limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + +// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ + if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : + streak=colStop){ + if(col>colStop && (maxGoodCol0)){break;} + if(row>1){ + packed[MODE_MS][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Override + /** return new int[] {rows, maxC, maxS, max}; + * Will not fill areas that cannot match minScore */ + public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ + if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} + else{ + byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); + assert(gref!=null) : "Excessively long read:\n"+new String(read); + return fillUnlimited(read, gref, 0, greflimit); + } + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Does not require a min score (ie, same as old method) */ + private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; + final int subfloor=0-2*maxGain; + assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n" + +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more. 
+// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + //temporary, for finding a bug + if(rows>maxRows || columns>maxColumns){ + throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); + } + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + if(rowBARRIER_D2){ + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + //Calculate INS score +// if(gap || colBARRIER_I2){ + if(gap || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Deprecated + /** return new int[] {rows, maxC, maxS, max}; */ + public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ + assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Override + /** @return {score, bestRefStart, bestRefStop} */ + /** Generates the match string */ + public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){ + if(gapped){ + final byte[] gref=grefbuffer; + int gstart=translateToGappedCoordinate(refStartLoc, gref); + int gstop=translateToGappedCoordinate(refEndLoc, gref); + byte[] out=traceback2(read, gref, gstart, gstop, row, col, state); + return out; + }else{ + return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); + } + } + + @Override + /** Generates the match string */ + public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ +// assert(false); + assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; + assert(row==rows); + + byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". + int outPos=0; + + int gaps=0; + + if(state==MODE_INS){ + //TODO ? Maybe not needed. 
+ } + + while(row>0 && col>0){ + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + +// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + + byte c=read[row-1]; + byte r=ref[refStartLoc+col-1]; + if(c==r){ + out[outPos]='m'; + }else{ + if(!AminoAcid.isFullyDefined(c, colorspace)){ + out[outPos]='N'; + }else if(!AminoAcid.isFullyDefined(r, colorspace)){ +// out[outPos]='X'; + out[outPos]='N'; + }else{ + out[outPos]='S'; + } + } + + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + + byte r=ref[refStartLoc+col-1]; + if(r==GAPC){ + out[outPos]='-'; + gaps++; + }else{ + out[outPos]='D'; + } + col--; + }else{ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + + assert(state==MODE_INS) : state; + if(col==0){ + out[outPos]='X'; + }else if(col>=columns){ + out[outPos]='Y'; + }else{ + out[outPos]='I'; + } + row--; + } + +// assert(prev==prev0); + state=prev; + outPos++; + } + + assert(row==0 || col==0); + if(col!=row){ + while(row>0){ + out[outPos]='X'; + outPos++; + row--; + col--; + 
} + if(col>0){ + //do nothing + } + } + + + //Shrink and reverse the string + byte[] out2=new byte[outPos]; + for(int i=0; i "+translateFromGappedCoordinate(out[1], gref)+" -> "+ + translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref); + assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref)); + + out[1]=translateFromGappedCoordinate(out[1], gref); + out[2]=translateFromGappedCoordinate(out[2], gref); + if(verbose){System.err.println("returning score "+Arrays.toString(out));} + return out; + }else{ + return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); + } + } + + @Override + /** @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState},
+ * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}
+ * if more padding is needed */ + public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, + final int maxRow, final int maxCol, final int maxState){ + + int row=maxRow; + int col=maxCol; + int state=maxState; + + assert(maxState>=0 && maxState=0 && maxRow=0 && maxColdifC){ + score+=POINTSoff_NOREF; + difR--; + } + + row+=difR; + col+=difR; + + } + + assert(refStartLoc<=refEndLoc); + assert(row==rows); + + + final int bestRefStop=refStartLoc+col-1; + + while(row>0 && col>0){ +// System.err.println("state="+state+", row="+row+", col="+col); + + + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + col--; + }else{ + assert(state==MODE_INS); + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + row--; + } + + if(col<0){ + System.err.println(row); + break; //prevents an out of bounds access + + } + +// assert(prev==prev0); + state=prev; + +// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); + } +// assert(false) : row+", 
"+col; + if(row>col){ + col-=row; + } + + final int bestRefStart=refStartLoc+col; + + score>>=SCOREOFFSET; + int[] rvec; + if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow + int padLeft=Tools.max(0, refStartLoc-bestRefStart); + int padRight=Tools.max(0, bestRefStop-refEndLoc); + rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}; + }else{ + rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}; + } + return rvec; + } + + /** + * Fills grefbuffer + * @param ref + * @param a + * @param b + * @param gaps + * @return gref + */ + private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){ + assert(gaps!=null && gaps.length>0); + + assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps); + assert(refEndLoc>=gaps[gaps.length-1]); + + final int g0_old=gaps[0]; + final int gN_old=gaps[gaps.length-1]; + gaps[0]=Tools.min(gaps[0], refStartLoc); + gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc); + grefRefOrigin=gaps[0]; + + if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));} + +// grefRefOrigin=Tools.min(gaps[0], refStartLoc); + +// //This block is no longer needed since the array is preallocated. +// int len=0; +// final int gb2=GAPBUFFER*2; +// for(int i=0; iy); +// int gap=z-y-1; +// if(gapy); + int gap=z-y-1; + assert(gap>=MINGAP) : gap+"\t"+MINGAP; + if(gapgref.length){ + System.err.println("gref buffer overflow: "+lim+" > "+gref.length); + return null; + } + for(int i=greflimit, r=refEndLoc+1; i "+j);} + return j; + } + + j+=(c==GAPC ? 
GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + private final int translateToGappedCoordinate(int point, byte[] gref){ + if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);} + if(point<=grefRefOrigin){return point-grefRefOrigin;} + for(int i=0, j=grefRefOrigin; i "+i);} + return i; + } + + j+=(c==GAPC ? GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + + /** Calculates score based on an array from Index */ + private final int calcAffineScore(int[] locArray){ + int score=0; + int lastLoc=-2; //Last true location + int lastValue=-1; + int timeInMode=0; + + for(int i=0; i0){//match + if(loc==lastValue){//contiguous match + score+=POINTS_MATCH2; + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=POINTS_MATCH; + }else if(loc=0); + score+=POINTS_MATCH; + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=POINTS_MATCH; + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + 
if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else{//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInMode0){//match + if(loc==lastValue){//contiguous match + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else if(loc==-1){//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInMode1) : minContig; + + int contig=0; + int maxContig=0; + + int score=0; + int lastLoc=-3; //Last true location + int lastValue=-1; 
+ int timeInMode=0; + + for(int i=0; i0){//match + if(loc==lastValue){//contiguous match + contig++; + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + maxContig=Tools.max(maxContig, contig); + contig=1; + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + maxContig=Tools.max(maxContig, contig); + contig=0; + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else if(loc==-1){//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInModeref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; i=ref.length) ? 
(byte)'N' : ref[j]; + + if(c=='N' || r=='N'){match[i]='N';} + else if(c==r){match[i]='m';} + else{match[i]='S';} + + } + + return match; + } + + @Override + public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){ + return scoreNoIndels(read, ref, baseScores, refStart, null); + } + @Override + public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){ + + int score=0; + int mode=-1; + int timeInMode=0; + int norefs=0; + + //This block handles cases where the read runs outside the reference + //Of course, padding the reference with 'N' would be better, but... + int readStart=0; + int readStop=read.length; + final int refStop=refStart+read.length; + boolean semiperfect=true; + + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + norefs+=readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 
+ + for(int i=readStart; iref.length){return -99999;} + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].lengthref.length){return -99999;} + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].length>SCOREOFFSET).append(",");} + return sb; + } + + public static float minIdToMinRatio(double minid){ + if(minid>1){minid=minid/100;} + assert(minid>0 && minid<=1) : "Min identity should be between 0 and 1. 
Values above 1 will be assumed to be percent and divided by 100."; + double matchdif=POINTS_MATCH-POINTS_MATCH2; + double match=POINTS_MATCH2; + double sub=-POINTS_MATCH2+0.5*(matchdif+POINTS_SUB)+0.5*POINTS_SUB2; + double del=0.8*(matchdif+POINTS_DEL)+0.1*POINTS_DEL2+0.05*POINTS_DEL3+0.05*POINTS_DEL4; + double ins=-POINTS_MATCH2+0.8*(matchdif+POINTS_INS)+0.15*(POINTS_INS2)+0.05*(POINTS_INS3); + double badAvg=.2*sub+.3*del+.5*ins; + double badFraction=1-minid; + double minratio=(match+badFraction*badAvg)/match; + assert(minratio<=1); + minratio=Tools.max(0.1, minratio); + return (float)minratio; + } + + public static final int TIMEBITS=9; + public static final int SCOREBITS=32-TIMEBITS; + public static final int MAX_TIME=((1<=200); +// assert(maxRows_>=200); + maxRows=maxRows_; + maxColumns=maxColumns_; + packed=new int[3][maxRows+1][maxColumns+1]; + + vertLimit=new int[maxRows+1]; + horizLimit=new int[maxColumns+1]; + Arrays.fill(vertLimit, BADoff); + Arrays.fill(horizLimit, BADoff); + +// for(int i=0; i0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){ +// assert(false) : minScore; + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + + minScore-=100; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new 
String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. + assert(subfloor=0; i--){ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); + } + + horizLimit[columns]=minScore_off; + for(int i=columns-1; i>=0; i--){ + horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); + } + + for(int row=1; row<=rows; row++){ + + int colStart=minGoodCol; + int colStop=maxGoodCol; + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MS][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? 
limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + if(scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : + streak=colStop){ + if(col>colStop && maxGoodCol1){ + packed[MODE_MS][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Does not require a min score (ie, same as old method) */ + private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; + final int subfloor=0-2*maxGain; + assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more. + + //temporary, for finding a bug + if(rows>maxRows || columns>maxColumns){ + throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); + } + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 
streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Deprecated + /** return new int[] {rows, maxC, maxS, max}; */ + public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ + assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 
streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** Generates the match string */ + public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ +// assert(false); + assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; + assert(row==rows); + + byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". + int outPos=0; + + if(state==MODE_INS){ + //TODO ? Maybe not needed. 
+ } + + while(row>0 && col>0){ + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + +// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + + byte c=read[row-1]; + byte r=ref[refStartLoc+col-1]; + if(c==r){ + out[outPos]='m'; + }else{ + if(!AminoAcid.isFullyDefined(c)){ + out[outPos]='N'; + }else if(!AminoAcid.isFullyDefined(r)){ +// out[outPos]='X'; + out[outPos]='N'; + }else{ + out[outPos]='S'; + } + } + + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + + byte r=ref[refStartLoc+col-1]; + out[outPos]='D'; + + col--; + }else{ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + + assert(state==MODE_INS) : state; + if(col==0){ + out[outPos]='X'; + }else if(col>=columns){ + out[outPos]='Y'; + }else{ + out[outPos]='I'; + } + row--; + } + +// assert(prev==prev0); + state=prev; + outPos++; + } + + assert(row==0 || col==0); + if(col!=row){ + while(row>0){ + out[outPos]='X'; + outPos++; + row--; + col--; + } + if(col>0){ + //do nothing + } + } + + + //Shrink and reverse the string + 
byte[] out2=new byte[outPos]; + for(int i=0; i=0 && maxState=0 && maxRow=0 && maxColdifC){ + score+=POINTSoff_NOREF; + difR--; + } + + row+=difR; + col+=difR; + + } + + assert(refStartLoc<=refEndLoc); + assert(row==rows); + + + final int bestRefStop=refStartLoc+col-1; + + while(row>0 && col>0){ +// System.err.println("state="+state+", row="+row+", col="+col); + + + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + col--; + }else{ + assert(state==MODE_INS); + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + row--; + } + + if(col<0){ + System.err.println(row); + break; //prevents an out of bounds access + + } + +// assert(prev==prev0); + state=prev; + +// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); + } +// assert(false) : row+", "+col; + if(row>col){ + col-=row; + } + + final int bestRefStart=refStartLoc+col; + + score>>=SCOREOFFSET; + int[] rvec; + if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow + int padLeft=Tools.max(0, 
refStartLoc-bestRefStart); + int padRight=Tools.max(0, bestRefStop-refEndLoc); + rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; + }else{ + rvec=new int[] {score, bestRefStart, bestRefStop}; + } + return rvec; + } + + + /** Will not fill areas that cannot match minScore. + * @return {score, bestRefStart, bestRefStop} */ + public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ + int a=Tools.max(0, refStartLoc); + int b=Tools.min(ref.length-1, refEndLoc); + assert(b>=a); + + int[] score; + + if(b-a>=maxColumns){ + System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); + assert(false) : refStartLoc+", "+refEndLoc; + b=Tools.min(ref.length-1, a+maxColumns-1); + } + int[] max=fillLimited(read, ref, a, b, minScore); + score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2])); + + return score; + } + + + + public final int scoreNoIndels(byte[] read, SiteScore ss){ + ChromosomeArray cha=Data.getChromosome(ss.chrom); + return scoreNoIndels(read, cha.array, ss.start, ss); + } + + public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){ + ChromosomeArray cha=Data.getChromosome(chrom); + return scoreNoIndels(read, cha.array, refStart, null); + } + + public final int scoreNoIndels(byte[] read, SiteScore ss, byte[] baseScores){ + ChromosomeArray cha=Data.getChromosome(ss.chrom); + return scoreNoIndels(read, cha.array, baseScores, ss.start, ss); + } + + public final int scoreNoIndels(byte[] read, final int chrom, final int refStart, byte[] baseScores){ + ChromosomeArray cha=Data.getChromosome(chrom); + return scoreNoIndels(read, cha.array, baseScores, refStart, null); + } + + + + public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){ + + int score=0; + int mode=-1; + int timeInMode=0; + + //This block handles cases where the read runs outside the reference + //Of 
course, padding the reference with 'N' would be better, but... + int readStart=0; + int readStop=read.length; + final int refStop=refStart+read.length; + boolean semiperfect=true; + int norefs=0; + + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + norefs+=readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; iref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; iref.length){ + int dif=(refStop-ref.length); + System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].lengthref.length){ + int dif=(refStop-ref.length); + System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].length=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + 
for(int i=0; i>SCOREOFFSET; + String s=" "+num; + if(s.length()>width){s=num>0 ? maxString : minString;} + int spaces=width-s.length(); + assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0); + for(int i=0; iLIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTS_DEL4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTS_DEL3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTS_DEL2; + } + return score; + } + + private static int calcDelScoreOffset(int len){ + if(len<=0){return 0;} + int score=POINTSoff_DEL; + + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTSoff_DEL2; + } + return score; + } + + public static int calcInsScore(int len){ + if(len<=0){return 0;} + int score=POINTS_INS; + + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTS_INS4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTS_INS3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTS_INS2; + } + return score; + } + + private static int calcInsScoreOffset(int len){ + if(len<=0){return 0;} + int score=POINTSoff_INS; + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTSoff_INS2; + } + return score; + } + + + public final int maxRows; + public final int maxColumns; + + private final int[][][] packed; + + public final int[] vertLimit; + public final int[] horizLimit; + + CharSequence showVertLimit(){ + StringBuilder sb=new StringBuilder(); + for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");} + return sb; + } + CharSequence 
showHorizLimit(){ + StringBuilder sb=new StringBuilder(); + for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");} + return sb; + } + +// public static final int MODEBITS=2; + public static final int TIMEBITS=12; + public static final int SCOREBITS=32-TIMEBITS; + public static final int MAX_TIME=((1<=200); +// assert(maxRows_>=200); + maxRows=maxRows_; + maxColumns=maxColumns_; + packed=new int[3][maxRows+1][maxColumns+1]; + + vertLimit=new int[maxRows+1]; + horizLimit=new int[maxColumns+1]; + Arrays.fill(vertLimit, BADoff); + Arrays.fill(horizLimit, BADoff); + +// for(int i=0; i0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){ +// assert(false) : minScore; + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + + minScore-=100; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. 
+ assert(subfloor=0; i--){ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); + } + + horizLimit[columns]=minScore_off; + for(int i=columns-1; i>=0; i--){ + horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); + } + + for(int row=1; row<=rows; row++){ + + int colStart=minGoodCol; + int colStop=maxGoodCol; + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MS][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? 
limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + if(scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : + streak=colStop){ + if(col>colStop && maxGoodCol1){ + packed[MODE_MS][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Does not require a min score (ie, same as old method) */ + private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; + final int subfloor=0-2*maxGain; + assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more. + + //temporary, for finding a bug + if(rows>maxRows || columns>maxColumns){ + throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); + } + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 
streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Deprecated + /** return new int[] {rows, maxC, maxS, max}; */ + public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ + assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 
streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + assert(time<=MAX_TIME);//if(time>MAX_TIME){time=MAX_TIME-3;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** Generates the match string */ + public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ +// assert(false); + assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; + assert(row==rows); + + byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". + int outPos=0; + + if(state==MODE_INS){ + //TODO ? Maybe not needed. 
+ } + + while(row>0 && col>0){ + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + +// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + + byte c=read[row-1]; + byte r=ref[refStartLoc+col-1]; + if(c==r){ + out[outPos]='m'; + }else{ + if(!AminoAcid.isFullyDefined(c)){ + out[outPos]='N'; + }else if(!AminoAcid.isFullyDefined(r)){ +// out[outPos]='X'; + out[outPos]='N'; + }else{ + out[outPos]='S'; + } + } + + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + + byte r=ref[refStartLoc+col-1]; + out[outPos]='D'; + + col--; + }else{ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + + assert(state==MODE_INS) : state; + if(col==0){ + out[outPos]='X'; + }else if(col>=columns){ + out[outPos]='Y'; + }else{ + out[outPos]='I'; + } + row--; + } + +// assert(prev==prev0); + state=prev; + outPos++; + } + + assert(row==0 || col==0); + if(col!=row){ + while(row>0){ + out[outPos]='X'; + outPos++; + row--; + col--; + } + if(col>0){ + //do nothing + } + } + + + //Shrink and reverse the string + 
byte[] out2=new byte[outPos]; + for(int i=0; i=0 && maxState=0 && maxRow=0 && maxColdifC){ + score+=POINTSoff_NOREF; + difR--; + } + + row+=difR; + col+=difR; + + } + + assert(refStartLoc<=refEndLoc); + assert(row==rows); + + + final int bestRefStop=refStartLoc+col-1; + + while(row>0 && col>0){ +// System.err.println("state="+state+", row="+row+", col="+col); + + + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + col--; + }else{ + assert(state==MODE_INS); + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + row--; + } + + if(col<0){ + System.err.println(row); + break; //prevents an out of bounds access + + } + +// assert(prev==prev0); + state=prev; + +// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); + } +// assert(false) : row+", "+col; + if(row>col){ + col-=row; + } + + final int bestRefStart=refStartLoc+col; + + score>>=SCOREOFFSET; + int[] rvec; + if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow + int padLeft=Tools.max(0, 
refStartLoc-bestRefStart); + int padRight=Tools.max(0, bestRefStop-refEndLoc); + rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; + }else{ + rvec=new int[] {score, bestRefStart, bestRefStop}; + } + return rvec; + } + + + /** Will not fill areas that cannot match minScore. + * @return {score, bestRefStart, bestRefStop} */ + public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore){ + int a=Tools.max(0, refStartLoc); + int b=Tools.min(ref.length-1, refEndLoc); + assert(b>=a); + + int[] score; + + if(b-a>=maxColumns){ + System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); + assert(false) : refStartLoc+", "+refEndLoc; + b=Tools.min(ref.length-1, a+maxColumns-1); + } + int[] max=fillLimited(read, ref, a, b, minScore); + score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2])); + + return score; + } + + + + public final int scoreNoIndels(byte[] read, SiteScore ss){ + ChromosomeArray cha=Data.getChromosome(ss.chrom); + return scoreNoIndels(read, cha.array, ss.start, ss); + } + + public final int scoreNoIndels(byte[] read, final int chrom, final int refStart){ + ChromosomeArray cha=Data.getChromosome(chrom); + return scoreNoIndels(read, cha.array, refStart, null); + } + + public final int scoreNoIndels(byte[] read, SiteScore ss, byte[] baseScores){ + ChromosomeArray cha=Data.getChromosome(ss.chrom); + return scoreNoIndels(read, cha.array, baseScores, ss.start, ss); + } + + public final int scoreNoIndels(byte[] read, final int chrom, final int refStart, byte[] baseScores){ + ChromosomeArray cha=Data.getChromosome(chrom); + return scoreNoIndels(read, cha.array, baseScores, refStart, null); + } + + + + public final int scoreNoIndels(byte[] read, byte[] ref, final int refStart, final SiteScore ss){ + + int score=0; + int mode=-1; + int timeInMode=0; + + //This block handles cases where the read runs outside the reference + //Of 
course, padding the reference with 'N' would be better, but... + int readStart=0; + int readStop=read.length; + final int refStop=refStart+read.length; + boolean semiperfect=true; + int norefs=0; + + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + norefs+=readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; iref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; iref.length){ + int dif=(refStop-ref.length); + System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].lengthref.length){ + int dif=(refStop-ref.length); + System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].length=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + 
for(int i=0; i>SCOREOFFSET; + String s=" "+num; + if(s.length()>width){s=num>0 ? maxString : minString;} + int spaces=width-s.length(); + assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0); + for(int i=0; iLIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTS_DEL4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTS_DEL3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTS_DEL2; + } + return score; + } + + private static int calcDelScoreOffset(int len){ + if(len<=0){return 0;} + int score=POINTSoff_DEL; + + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTSoff_DEL2; + } + return score; + } + + public static int calcInsScore(int len){ + if(len<=0){return 0;} + int score=POINTS_INS; + + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTS_INS4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTS_INS3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTS_INS2; + } + return score; + } + + private static int calcInsScoreOffset(int len){ + if(len<=0){return 0;} + int score=POINTSoff_INS; + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTSoff_INS2; + } + return score; + } + + + public final int maxRows; + public final int maxColumns; + + private final int[][][] packed; + + public final int[] vertLimit; + public final int[] horizLimit; + + CharSequence showVertLimit(){ + StringBuilder sb=new StringBuilder(); + for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");} + return sb; + } + CharSequence 
showHorizLimit(){ + StringBuilder sb=new StringBuilder(); + for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");} + return sb; + } + +// public static final int MODEBITS=2; + public static final int TIMEBITS=12; + public static final int SCOREBITS=32-TIMEBITS; + public static final int MAX_TIME=((1<2 && args[2].equalsIgnoreCase("cs")){ + colorspace=true; + read=AminoAcid.toColorspace(read); + ref=AminoAcid.toColorspace(ref); + } + + MultiStateAligner9PacBioAdapter_WithBarriers msa=new MultiStateAligner9PacBioAdapter_WithBarriers(read.length, ref.length, colorspace); + + System.out.println("Initial: "); + for(int mode=0; mode=200); +// assert(maxRows_>=200); + maxRows=maxRows_; + maxColumns=maxColumns_; + colorspace=colorspace_; + packed=new int[3][maxRows+1][maxColumns+1]; + grefbuffer=new byte[maxColumns+2]; + + vertLimit=new int[maxRows+1]; + horizLimit=new int[maxColumns+1]; + Arrays.fill(vertLimit, BADoff); + Arrays.fill(horizLimit, BADoff); + +// for(int i=0; i0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){ +// assert(false) : minScore; + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + +// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + minScore-=100; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: 
"+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. + assert(subfloor=0; i--){ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); + } + + horizLimit[columns]=minScore_off; + for(int i=columns-1; i>=0; i--){ + horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); + } + + for(int row=1; row<=rows; row++){ + + int colStart=minGoodCol; + int colStop=maxGoodCol; + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MS][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + + final boolean gap=(ref1==GAPC); + assert(call1!=GAPC); + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? 
limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + +// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ + if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : + streak=colStop){ + if(col>colStop && maxGoodCol1){ + packed[MODE_MS][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Will not fill areas that cannot match minScore */ + public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ + if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} + else{ + byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); + assert(gref!=null) : "Excessively long read:\n"+new String(read); + return fillUnlimited(read, gref, 0, greflimit); + } + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Does not require a min score (ie, same as old method) */ + private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; + final int subfloor=0-2*maxGain; + assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more. 
+// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + //temporary, for finding a bug + if(rows>maxRows || columns>maxColumns){ + throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); + } + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + if(rowBARRIER_D2){ + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + //Calculate INS score +// if(gap || colBARRIER_I2){ + if(gap || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Deprecated + /** return new int[] {rows, maxC, maxS, max}; */ + public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ + assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** @return {score, bestRefStart, bestRefStop} */ + /** Generates the match string */ + public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){ + if(gapped){ + final byte[] gref=grefbuffer; + int gstart=translateToGappedCoordinate(refStartLoc, gref); + int gstop=translateToGappedCoordinate(refEndLoc, gref); + byte[] out=traceback2(read, gref, gstart, gstop, row, col, state); + return out; + }else{ + return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); + } + } + + + /** Generates the match string */ + public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ +// assert(false); + assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; + assert(row==rows); + + byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". + int outPos=0; + + int gaps=0; + + if(state==MODE_INS){ + //TODO ? Maybe not needed. 
+ } + + while(row>0 && col>0){ + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + +// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + + byte c=read[row-1]; + byte r=ref[refStartLoc+col-1]; + if(c==r){ + out[outPos]='m'; + }else{ + if(!AminoAcid.isFullyDefined(c, colorspace)){ + out[outPos]='N'; + }else if(!AminoAcid.isFullyDefined(r, colorspace)){ +// out[outPos]='X'; + out[outPos]='N'; + }else{ + out[outPos]='S'; + } + } + + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + + byte r=ref[refStartLoc+col-1]; + if(r==GAPC){ + out[outPos]='-'; + gaps++; + }else{ + out[outPos]='D'; + } + col--; + }else{ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + + assert(state==MODE_INS) : state; + if(col==0){ + out[outPos]='X'; + }else if(col>=columns){ + out[outPos]='Y'; + }else{ + out[outPos]='I'; + } + row--; + } + +// assert(prev==prev0); + state=prev; + outPos++; + } + + assert(row==0 || col==0); + if(col!=row){ + while(row>0){ + out[outPos]='X'; + outPos++; + row--; + col--; + 
} + if(col>0){ + //do nothing + } + } + + + //Shrink and reverse the string + byte[] out2=new byte[outPos]; + for(int i=0; i "+translateFromGappedCoordinate(out[1], gref)+" -> "+ + translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref); + assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref)); + + out[1]=translateFromGappedCoordinate(out[1], gref); + out[2]=translateFromGappedCoordinate(out[2], gref); + if(verbose){System.err.println("returning score "+Arrays.toString(out));} + return out; + }else{ + return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); + } + } + + /** @return {score, bestRefStart, bestRefStop} */ + public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, + final int maxRow, final int maxCol, final int maxState){ + + int row=maxRow; + int col=maxCol; + int state=maxState; + + assert(maxState>=0 && maxState=0 && maxRow=0 && maxColdifC){ + score+=POINTSoff_NOREF; + difR--; + } + + row+=difR; + col+=difR; + + } + + assert(refStartLoc<=refEndLoc); + assert(row==rows); + + + final int bestRefStop=refStartLoc+col-1; + + while(row>0 && col>0){ +// System.err.println("state="+state+", row="+row+", col="+col); + + + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int 
scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + col--; + }else{ + assert(state==MODE_INS); + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + row--; + } + + if(col<0){ + System.err.println(row); + break; //prevents an out of bounds access + + } + +// assert(prev==prev0); + state=prev; + +// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); + } +// assert(false) : row+", "+col; + if(row>col){ + col-=row; + } + + final int bestRefStart=refStartLoc+col; + + score>>=SCOREOFFSET; + int[] rvec; + if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow + int padLeft=Tools.max(0, refStartLoc-bestRefStart); + int padRight=Tools.max(0, bestRefStop-refEndLoc); + rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; + }else{ + rvec=new int[] {score, bestRefStart, bestRefStop}; + } + return rvec; + } + + + /** Will not fill areas that cannot match minScore. + * @return {score, bestRefStart, bestRefStop} */ + public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){ + int a=Tools.max(0, refStartLoc); + int b=Tools.min(ref.length-1, refEndLoc); + assert(b>=a); + + int[] score; + + if(gaps==null){ + if(verbose){ + System.err.println("no gaps"); + } + if(b-a>=maxColumns){ + System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); + assert(false) : refStartLoc+", "+refEndLoc; + b=Tools.min(ref.length-1, a+maxColumns-1); + } + int[] max=fillLimited(read, ref, a, b, minScore, gaps); + score=(max==null ? 
null : score(read, ref, a, b, max[0], max[1], max[2], false)); + }else{ + if(verbose){System.err.println("\ngaps: "+Arrays.toString(gaps)+"\n"+new String(read)+"\ncoords: "+refStartLoc+", "+refEndLoc);} + int[] max=fillLimited(read, ref, a, b, minScore, gaps); + if(verbose){System.err.println("max: "+Arrays.toString(max));} +// score=(max==null ? null : score(read, grefbuffer, 0, greflimit, max[0], max[1], max[2], true)); + score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], true)); + } + return score; + } + + public final int[] fillAndScoreLimited(byte[] read, SiteScore ss, int thresh, int minScore){ + return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore, ss.gaps); + } + /* + public final int[] fillAndScoreLimited_Gapped(byte[] read, SiteScore ss, int thresh, int minScore){ + if(ss.gaps==null){return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore);} + int[] gaps=ss.gaps; + final int bound1=gaps[0]=Tools.min(ss.start, gaps[0]); + final int bound2=gaps[gaps.length-1]=Tools.max(ss.stop, gaps[gaps.length-1]); + + //This block is no longer needed since the array is preallocated. 
+ int len=0; + final int gb2=GAPBUFFER*2; + for(int i=0; iy); + int gap=z-y-1; + if(gap=len) : ss+"\t"+len+"\t"+gref.length; + + ChromosomeArray cha=Data.getChromosome(ss.chrom); + + for(int i=0, j=0; iy); + int gap=z-y-1; + assert(gap>=MINGAP); + if(gap0); + int rem=gap%GAPLEN; + int lim=y+GAPBUFFER; + + for(int r=y+1; r<=lim; r++, j++){ + gref[j]=cha.get(r); + } + for(int g=0; g-9999); + break; + } + + if(refc!=GAPC){ + j++; + }else{ + j+=GAPLEN; + } + } + assert(rstart2>-9999 && rstop2>-9999); + scoreArray[1]=rstart2; + scoreArray[2]=rstop2; + + return scoreArray; + }*/ + + /** + * Fills grefbuffer + * @param ref + * @param a + * @param b + * @param gaps + * @return gref + */ + public final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){ + assert(gaps!=null && gaps.length>0); + + assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps); + assert(refEndLoc>=gaps[gaps.length-1]); + + final int g0_old=gaps[0]; + final int gN_old=gaps[gaps.length-1]; + gaps[0]=Tools.min(gaps[0], refStartLoc); + gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc); + grefRefOrigin=gaps[0]; + + if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));} + +// grefRefOrigin=Tools.min(gaps[0], refStartLoc); + +// //This block is no longer needed since the array is preallocated. +// int len=0; +// final int gb2=GAPBUFFER*2; +// for(int i=0; iy); +// int gap=z-y-1; +// if(gapy); + int gap=z-y-1; + assert(gap>=MINGAP) : gap+"\t"+MINGAP; + if(gap "+gref.length); + return null; + } + for(int i=greflimit, r=refEndLoc+1; i "+j);} + return j; + } + + j+=(c==GAPC ? 
GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + private final int translateToGappedCoordinate(int point, byte[] gref){ + if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);} + if(point<=grefRefOrigin){return point-grefRefOrigin;} + for(int i=0, j=grefRefOrigin; i "+i);} + return i; + } + + j+=(c==GAPC ? GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + public final int[] fillAndScoreLimited(byte[] read, int chrom, int start, int stop, int thresh, int minScore, int[] gaps){ + return fillAndScoreLimited(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, minScore, gaps); + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, byte[] baseScores){ + int a=Tools.max(0, refStartLoc); + int b=Tools.min(ref.length-1, refEndLoc); + assert(b>=a); + if(b-a>=maxColumns){ + System.err.println("Warning: Max alignment columns exceeded; restricting range. 
"+(b-a+1)+" > "+maxColumns); + b=Tools.min(ref.length-1, a+maxColumns-1); + } + int[] max=fillQ(read, ref, baseScores, a, b); +// int[] score=score(read, ref, a, b, max[0], max[1], max[2]); +// return score; + return null; + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, SiteScore ss, int thresh, byte[] baseScores){ + return fillAndScoreQ(read, ss.chrom, ss.start, ss.stop, thresh, baseScores); + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, int chrom, int start, int stop, int thresh, byte[] baseScores){ + return fillAndScoreQ(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, baseScores); + } + +// public final int scoreNoIndels(byte[] read, SiteScore ss){ +// +// ChromosomeArray cha=Data.getChromosome(ss.chrom); +// final int refStart=ss.start; +// +// int score=0; +// int mode=MODE_START; +// int timeInMode=0; +// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //TODO: Partial match +// +// for(int i=0; icha.maxIndex+1){ +// int dif=(cha.maxIndex+1-refStop); +// readStop-=dif; +// score+=POINTSoff_NOREF*dif; +// } +// +//// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //No longer needed. 
+// +// for(int i=readStart; i0){//match + if(loc==lastValue){//contiguous match + score+=POINTS_MATCH2; + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=POINTS_MATCH; + }else if(loc=0); + score+=POINTS_MATCH; + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=POINTS_MATCH; + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else{//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInMode0){//match + if(loc==lastValue){//contiguous match + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + 
score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else{//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInModeref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; iref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 
+ + for(int i=readStart; iref.length){ + int dif=(refStop-ref.length); + System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].lengthref.length){ + int dif=(refStop-ref.length); + System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].length=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i>SCOREOFFSET; + String s=" "+num; + if(s.length()>width){s=num>0 ? 
maxString : minString;} + int spaces=width-s.length(); + assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0); + for(int i=0; iLIMIT_FOR_COST_5){ + score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + len=LIMIT_FOR_COST_5; + } + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTS_DEL4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTS_DEL3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTS_DEL2; + } + return score; + } + + private static int calcDelScoreOffset(int len){ + if(len<=0){return 0;} + int score=POINTSoff_DEL; + + if(len>LIMIT_FOR_COST_5){ + score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5; + len=LIMIT_FOR_COST_5; + } + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTSoff_DEL2; + } + return score; + } + + public static int calcInsScore(int len){ + if(len<=0){return 0;} + int score=POINTS_INS; + + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTS_INS4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTS_INS3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTS_INS2; + } + return score; + } + + private static int calcInsScoreOffset(int len){ + if(len<=0){return 0;} + int score=POINTSoff_INS; + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTSoff_INS2; + } + return score; + } + + + public final int maxRows; + public final int maxColumns; + + private final int[][][] packed; + private final byte[] grefbuffer; + private int greflimit=-1; + private int greflimit2=-1; + private int 
grefRefOrigin=-1; + + public static final int GAPBUFFER=Shared.GAPBUFFER; + public static final int GAPBUFFER2=Shared.GAPBUFFER2; + public static final int GAPLEN=Shared.GAPLEN; + public static final int MINGAP=Shared.MINGAP; + public static final int GAPCOST=Shared.GAPCOST*2; + public static final byte GAPC=Shared.GAPC; + + private static final int GREFLIMIT2_CUSHION=128; //Tools.max(GAPBUFFER2, GAPLEN); + + + /**DO NOT MODIFY*/ + public final byte[] getGrefbuffer(){ + return grefbuffer; + } + + public final int[] vertLimit; + public final int[] horizLimit; + + CharSequence showVertLimit(){ + StringBuilder sb=new StringBuilder(); + for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");} + return sb; + } + CharSequence showHorizLimit(){ + StringBuilder sb=new StringBuilder(); + for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");} + return sb; + } + +// public static final int MODEBITS=2; + public static final int TIMEBITS=12; + public static final int SCOREBITS=32-TIMEBITS; + public static final int MAX_TIME=((1< + * Same as MSA2P, but the "prevState" field was removed. + * Yields identical results to MSA2, but is faster. + * For very long reads (over 2000bp) the score may overflow, so MSA2 should be used instead, + * or the time field should be shrunk. 
*/ +public final class MultiStateAligner9fs { + + + public static void main(String[] args){ + byte[] read=args[0].getBytes(); + byte[] ref=args[1].getBytes(); + + byte[] original=ref; + + boolean colorspace=false; + + if(args.length>2 && args[2].equalsIgnoreCase("cs")){ + colorspace=true; + read=AminoAcid.toColorspace(read); + ref=AminoAcid.toColorspace(ref); + } + + MultiStateAligner9fs msa=new MultiStateAligner9fs(read.length, ref.length, colorspace); + + System.out.println("Initial: "); + for(int mode=0; mode0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + if(/*read.length<40 || */false || minScore<=0 || columns>read.length+Tools.min(100, read.length)){ +// assert(false) : minScore; + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + +// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + minScore-=100; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. 
+ assert(subfloor=0; i--){ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); + } + + horizLimit[columns]=minScore_off; + for(int i=columns-1; i>=0; i--){ + horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); + } + + for(int row=1; row<=rows; row++){ + + int colStart=minGoodCol; + int colStop=maxGoodCol; + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MATCH][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + packed[MODE_SUB][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + + final boolean gap=(ref1==GAPC); + assert(call1!=GAPC); + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? 
limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromMatch_MS=packed[MODE_MATCH][row-1][col-1]&SCOREMASK; + final int scoreFromSub_MS=packed[MODE_SUB][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=Tools.max(packed[MODE_MATCH][row][col-1]&SCOREMASK, packed[MODE_SUB][row][col-1]&SCOREMASK); + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=Tools.max(packed[MODE_MATCH][row-1][col]&SCOREMASK, packed[MODE_SUB][row-1][col]&SCOREMASK); + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 
streak+1 : 1); + prevState=MODE_MATCH; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + // if(time>MAX_TIME){time=MAX_TIME-MASK5;} + // assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + // assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; + //// packed[MODE_MS][row][col]=(score|prevState|time); + // packed[MODE_MS][row][col]=(score|time); + // assert((score&SCOREMASK)==score); + //// assert((prevState&MODEMASK)==prevState); + // assert((time&TIMEMASK)==time); + + + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MATCH][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + + + if(gap || (scoreFromMatch_MS<=limit3 && scoreFromSub_MS<=limit3 && scoreFromDel_MS<=limit3 && scoreFromIns_MS<=limit3)){ + packed[MODE_SUB][row][col]=subfloor; + }else{//Calculate match and sub scores + final int streak=(packed[MODE_SUB][row-1][col-1]&TIMEMASK); + + {//Calculate match/sub score + + int score; + int time; + byte prevState; + + + int scoreM=match ? subfloor : scoreFromMatch_MS+POINTSoff_MATCHSUB; + int scoreS=scoreFromSub_MS+(streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); + // time=(prevMatch ? 
(streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MATCH; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + // if(time>MAX_TIME){time=MAX_TIME-MASK5;} + // assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + // assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; + //// packed[MODE_MS][row][col]=(score|prevState|time); + // packed[MODE_MS][row][col]=(score|time); + // assert((score&SCOREMASK)==score); + //// assert((prevState&MODEMASK)==prevState); + // assert((time&TIMEMASK)==time); + + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MATCH][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + */ + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MATCH; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + +// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ + if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : + streak=colStop){ + if(col>colStop && maxGoodCol1){ + packed[MODE_MATCH][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Will not fill areas that cannot match minScore */ + public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ + if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} + else{ + byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); + assert(gref!=null) : "Excessively long read:\n"+new String(read); + return fillUnlimited(read, gref, 0, greflimit); + } + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Does not require a min score (ie, same as old method) */ + private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; + final int subfloor=0-2*maxGain; + assert(subfloor>BADoff && subfloor*2>BADoff); //TODO: Actually, it needs to be substantially more. 
+// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MATCH][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MATCH; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MATCH][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + if(rowBARRIER_D2){ + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MATCH][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MATCH; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + //Calculate INS score +// if(gap || colBARRIER_I2){ + if(gap || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MATCH][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Deprecated + /** return new int[] {rows, maxC, maxS, max}; */ + public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ + assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MATCH][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MATCH; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MATCH][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MATCH][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MATCH; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MATCH][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + + /** @return {score, bestRefStart, bestRefStop} */ + /** Generates the match string */ + public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){ + if(gapped){ + final byte[] gref=grefbuffer; + int gstart=translateToGappedCoordinate(refStartLoc, gref); + int gstop=translateToGappedCoordinate(refEndLoc, gref); + byte[] out=traceback2(read, gref, gstart, gstop, row, col, state); + return out; + }else{ + return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); + } + } + + + /** Generates the match string */ + public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ +// assert(false); + assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; + assert(row==rows); + + byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". + int outPos=0; + + int gaps=0; + + if(state==MODE_INS){ + //TODO ? Maybe not needed. 
+ } + + while(row>0 && col>0){ + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + +// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); + + if(state==MODE_MATCH){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MATCH][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MATCH;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + + byte c=read[row-1]; + byte r=ref[refStartLoc+col-1]; + if(c==r){ + out[outPos]='m'; + }else{ + if(!AminoAcid.isFullyDefined(c, colorspace)){ + out[outPos]='N'; + }else if(!AminoAcid.isFullyDefined(r, colorspace)){ +// out[outPos]='X'; + out[outPos]='N'; + }else{ + out[outPos]='S'; + } + } + + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MATCH][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MATCH;} + else{prev=MODE_DEL;} + } + + byte r=ref[refStartLoc+col-1]; + if(r==GAPC){ + out[outPos]='-'; + gaps++; + }else{ + out[outPos]='D'; + } + col--; + }else{ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MATCH][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MATCH;} + else{prev=MODE_INS;} + } + + assert(state==MODE_INS) : state; + if(col==0){ + out[outPos]='X'; + }else if(col>=columns){ + out[outPos]='Y'; + }else{ + out[outPos]='I'; + } + row--; + } + +// assert(prev==prev0); + state=prev; + outPos++; + } + + assert(row==0 || col==0); + if(col!=row){ + while(row>0){ + out[outPos]='X'; + 
outPos++; + row--; + col--; + } + if(col>0){ + //do nothing + } + } + + + //Shrink and reverse the string + byte[] out2=new byte[outPos]; + for(int i=0; i "+translateFromGappedCoordinate(out[1], gref)+" -> "+ + translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref); + assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref)); + + out[1]=translateFromGappedCoordinate(out[1], gref); + out[2]=translateFromGappedCoordinate(out[2], gref); + if(verbose){System.err.println("returning score "+Arrays.toString(out));} + return out; + }else{ + return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); + } + } + + /** @return {score, bestRefStart, bestRefStop} */ + public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, + final int maxRow, final int maxCol, final int maxState){ + + int row=maxRow; + int col=maxCol; + int state=maxState; + + assert(maxState>=0 && maxState=0 && maxRow=0 && maxColdifC){ + score+=POINTSoff_NOREF; + difR--; + } + + row+=difR; + col+=difR; + + } + + assert(refStartLoc<=refEndLoc); + assert(row==rows); + + + final int bestRefStop=refStartLoc+col-1; + + while(row>0 && col>0){ +// System.err.println("state="+state+", row="+row+", col="+col); + + + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + + if(state==MODE_MATCH){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MATCH][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MATCH;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int 
scoreFromDiag=packed[MODE_MATCH][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MATCH;} + else{prev=MODE_DEL;} + } + col--; + }else{ + assert(state==MODE_INS); + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MATCH][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MATCH;} + else{prev=MODE_INS;} + } + row--; + } + + if(col<0){ + System.err.println(row); + break; //prevents an out of bounds access + + } + +// assert(prev==prev0); + state=prev; + +// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); + } +// assert(false) : row+", "+col; + if(row>col){ + col-=row; + } + + final int bestRefStart=refStartLoc+col; + + score>>=SCOREOFFSET; + int[] rvec; + if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow + int padLeft=Tools.max(0, refStartLoc-bestRefStart); + int padRight=Tools.max(0, bestRefStop-refEndLoc); + rvec=new int[] {score, bestRefStart, bestRefStop, padLeft, padRight}; + }else{ + rvec=new int[] {score, bestRefStart, bestRefStop}; + } + return rvec; + } + + + /** Will not fill areas that cannot match minScore. + * @return {score, bestRefStart, bestRefStop} */ + public final int[] fillAndScoreLimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int minScore, int[] gaps){ + int a=Tools.max(0, refStartLoc); + int b=Tools.min(ref.length-1, refEndLoc); + assert(b>=a); + + int[] score; + + if(gaps==null){ + if(verbose){ + System.err.println("no gaps"); + } + if(b-a>=maxColumns){ + System.err.println("Warning: Max alignment columns exceeded; restricting range. "+(b-a+1)+" > "+maxColumns); + assert(false) : refStartLoc+", "+refEndLoc; + b=Tools.min(ref.length-1, a+maxColumns-1); + } + int[] max=fillLimited(read, ref, a, b, minScore, gaps); + score=(max==null ? 
null : score(read, ref, a, b, max[0], max[1], max[2], false)); + }else{ + if(verbose){System.err.println("\ngaps: "+Arrays.toString(gaps)+"\n"+new String(read)+"\ncoords: "+refStartLoc+", "+refEndLoc);} + int[] max=fillLimited(read, ref, a, b, minScore, gaps); + if(verbose){System.err.println("max: "+Arrays.toString(max));} +// score=(max==null ? null : score(read, grefbuffer, 0, greflimit, max[0], max[1], max[2], true)); + score=(max==null ? null : score(read, ref, a, b, max[0], max[1], max[2], true)); + } + return score; + } + + public final int[] fillAndScoreLimited(byte[] read, SiteScore ss, int thresh, int minScore){ + return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore, ss.gaps); + } + /* + public final int[] fillAndScoreLimited_Gapped(byte[] read, SiteScore ss, int thresh, int minScore){ + if(ss.gaps==null){return fillAndScoreLimited(read, ss.chrom, ss.start, ss.stop, thresh, minScore);} + int[] gaps=ss.gaps; + final int bound1=gaps[0]=Tools.min(ss.start, gaps[0]); + final int bound2=gaps[gaps.length-1]=Tools.max(ss.stop, gaps[gaps.length-1]); + + //This block is no longer needed since the array is preallocated. 
+ int len=0; + final int gb2=GAPBUFFER*2; + for(int i=0; iy); + int gap=z-y-1; + if(gap=len) : ss+"\t"+len+"\t"+gref.length; + + ChromosomeArray cha=Data.getChromosome(ss.chrom); + + for(int i=0, j=0; iy); + int gap=z-y-1; + assert(gap>=MINGAP); + if(gap0); + int rem=gap%GAPLEN; + int lim=y+GAPBUFFER; + + for(int r=y+1; r<=lim; r++, j++){ + gref[j]=cha.get(r); + } + for(int g=0; g-9999); + break; + } + + if(refc!=GAPC){ + j++; + }else{ + j+=GAPLEN; + } + } + assert(rstart2>-9999 && rstop2>-9999); + scoreArray[1]=rstart2; + scoreArray[2]=rstop2; + + return scoreArray; + }*/ + + /** + * Fills grefbuffer + * @param ref + * @param a + * @param b + * @param gaps + * @return gref + */ + public final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){ + assert(gaps!=null && gaps.length>0); + + assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps); + assert(refEndLoc>=gaps[gaps.length-1]); + + final int g0_old=gaps[0]; + final int gN_old=gaps[gaps.length-1]; + gaps[0]=Tools.min(gaps[0], refStartLoc); + gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc); + grefRefOrigin=gaps[0]; + + if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));} + +// grefRefOrigin=Tools.min(gaps[0], refStartLoc); + +// //This block is no longer needed since the array is preallocated. +// int len=0; +// final int gb2=GAPBUFFER*2; +// for(int i=0; iy); +// int gap=z-y-1; +// if(gapy); + int gap=z-y-1; + assert(gap>=MINGAP) : gap+"\t"+MINGAP; + if(gap "+gref.length); + return null; + } + for(int i=greflimit, r=refEndLoc+1; i "+j);} + return j; + } + + j+=(c==GAPC ? 
GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + private final int translateToGappedCoordinate(int point, byte[] gref){ + if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);} + if(point<=grefRefOrigin){return point-grefRefOrigin;} + for(int i=0, j=grefRefOrigin; i "+i);} + return i; + } + + j+=(c==GAPC ? GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + public final int[] fillAndScoreLimited(byte[] read, int chrom, int start, int stop, int thresh, int minScore, int[] gaps){ + return fillAndScoreLimited(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, minScore, gaps); + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, byte[] baseScores){ + int a=Tools.max(0, refStartLoc); + int b=Tools.min(ref.length-1, refEndLoc); + assert(b>=a); + if(b-a>=maxColumns){ + System.err.println("Warning: Max alignment columns exceeded; restricting range. 
"+(b-a+1)+" > "+maxColumns); + b=Tools.min(ref.length-1, a+maxColumns-1); + } + int[] max=fillQ(read, ref, baseScores, a, b); +// int[] score=score(read, ref, a, b, max[0], max[1], max[2]); +// return score; + return null; + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, SiteScore ss, int thresh, byte[] baseScores){ + return fillAndScoreQ(read, ss.chrom, ss.start, ss.stop, thresh, baseScores); + } + + @Deprecated + public final int[] fillAndScoreQ(byte[] read, int chrom, int start, int stop, int thresh, byte[] baseScores){ + return fillAndScoreQ(read, Data.getChromosome(chrom).array, start-thresh, stop+thresh, baseScores); + } + +// public final int scoreNoIndels(byte[] read, SiteScore ss){ +// +// ChromosomeArray cha=Data.getChromosome(ss.chrom); +// final int refStart=ss.start; +// +// int score=0; +// int mode=MODE_START; +// int timeInMode=0; +// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //TODO: Partial match +// +// for(int i=0; icha.maxIndex+1){ +// int dif=(cha.maxIndex+1-refStop); +// readStop-=dif; +// score+=POINTSoff_NOREF*dif; +// } +// +//// if(refStart<0 || refStart+read.length>cha.maxIndex+1){return -99999;} //No longer needed. 
+// +// for(int i=readStart; i0){//match + if(loc==lastValue){//contiguous match + score+=POINTS_MATCH2; + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=POINTS_MATCH; + }else if(loc=0); + score+=POINTS_MATCH; + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=POINTS_MATCH; + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else{//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInMode0){//match + if(loc==lastValue){//contiguous match + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + 
score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else{//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInModeref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; iref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 
+ + for(int i=readStart; iref.length){ + int dif=(refStop-ref.length); + System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].lengthref.length){ + int dif=(refStop-ref.length); + System.err.println("dif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].length=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i>SCOREOFFSET; + String s=" "+num; + if(s.length()>width){s=num>0 ? 
maxString : minString;} + int spaces=width-s.length(); + assert(spaces>=0) : width+", "+s.length()+", "+s+", "+num+", "+spaces; + for(int i=0; i=0); + for(int i=0; iLIMIT_FOR_COST_5){ + score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + len=LIMIT_FOR_COST_5; + } + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTS_DEL4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTS_DEL3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTS_DEL2; + } + return score; + } + + private static int calcDelScoreOffset(int len){ + if(len<=0){return 0;} + int score=POINTSoff_DEL; + + if(len>LIMIT_FOR_COST_5){ + score+=((len-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTSoff_DEL5; + len=LIMIT_FOR_COST_5; + } + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTSoff_DEL4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTSoff_DEL3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTSoff_DEL2; + } + return score; + } + + public static int calcInsScore(int len){ + if(len<=0){return 0;} + int score=POINTS_INS; + + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTS_INS4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTS_INS3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTS_INS2; + } + return score; + } + + private static int calcInsScoreOffset(int len){ + if(len<=0){return 0;} + int score=POINTSoff_INS; + if(len>LIMIT_FOR_COST_4){ + score+=(len-LIMIT_FOR_COST_4)*POINTSoff_INS4; + len=LIMIT_FOR_COST_4; + } + if(len>LIMIT_FOR_COST_3){ + score+=(len-LIMIT_FOR_COST_3)*POINTSoff_INS3; + len=LIMIT_FOR_COST_3; + } + if(len>1){ + score+=(len-1)*POINTSoff_INS2; + } + return score; + } + + + public final int maxRows; + public final int maxColumns; + + private final int[][][] packed; + private final byte[] grefbuffer; + private int greflimit=-1; + private int greflimit2=-1; + private int 
grefRefOrigin=-1; + + public static final int GAPBUFFER=Shared.GAPBUFFER; + public static final int GAPBUFFER2=Shared.GAPBUFFER2; + public static final int GAPLEN=Shared.GAPLEN; + public static final int MINGAP=Shared.MINGAP; + public static final int GAPCOST=Shared.GAPCOST; + public static final byte GAPC=Shared.GAPC; + + private static final int GREFLIMIT2_CUSHION=128; //Tools.max(GAPBUFFER2, GAPLEN); + + + /**DO NOT MODIFY*/ + public final byte[] getGrefbuffer(){ + return grefbuffer; + } + + public final int[] vertLimit; + public final int[] horizLimit; + + CharSequence showVertLimit(){ + StringBuilder sb=new StringBuilder(); + for(int i=0; i<=rows; i++){sb.append(vertLimit[i]>>SCOREOFFSET).append(",");} + return sb; + } + CharSequence showHorizLimit(){ + StringBuilder sb=new StringBuilder(); + for(int i=0; i<=columns; i++){sb.append(horizLimit[i]>>SCOREOFFSET).append(",");} + return sb; + } + +// public static final int MODEBITS=2; + public static final int TIMEBITS=12; + public static final int SCOREBITS=32-TIMEBITS; + public static final int MAX_TIME=((1< + * Same as MSA2P, but the "prevState" field was removed. + * Yields identical results to MSA2, but is faster. + * For very long reads (over 2000bp) the score may overflow, so MSA2 should be used instead, + * or the time field should be shrunk. */ +public final class MultiStateAligner9ts extends MSA{ + + + public static void main(String[] args){ + byte[] read=args[0].getBytes(); + byte[] ref=args[1].getBytes(); + + byte[] original=ref; + + boolean colorspace=false; + + if(args.length>2 && args[2].equalsIgnoreCase("cs")){ + colorspace=true; + read=AminoAcid.toColorspace(read); + ref=AminoAcid.toColorspace(ref); + } + + MultiStateAligner9ts msa=new MultiStateAligner9ts(read.length, ref.length, colorspace); + + System.out.println("Initial: "); + for(int mode=0; mode0); + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int halfband=(bandwidth<1 && bandwidthRatio<=0) ? 
0 : + Tools.max(Tools.min(bandwidth<1 ? 9999999 : bandwidth, bandwidthRatio<=0 ? 9999999 : 8+(int)(rows*bandwidthRatio)), (columns-rows+8))/2; + + if(minScore<1 || (columns+rows<90) || ((halfband<1 || halfband*3>columns) && (columns>read.length+Tools.min(170, read.length+20)))){ +// assert(false) : minScore; +// assert(minScore>0) : minScore; +// assert(false) : +minScore+", "+columns+", "+read.length+", "+Tools.min(100, read.length); + return fillUnlimited(read, ref, refStartLoc, refEndLoc); + } + +// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + minScore-=120; //Increases quality trivially + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc+"\n"+ + refStartLoc+", "+refEndLoc+", "+rows+", "+maxRows+", "+columns+", "+maxColumns+"\n"+new String(read)+"\n"; + assert(refEndLocBADoff); //TODO: Actually, it needs to be substantially more. + assert(subfloor=0; i--){ + byte c=read[i]; + if(AminoAcid.isFullyDefined(c)){ + vertLimit[i]=Tools.max(vertLimit[i+1]-(prevDefined ? POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_NOCALL, floor); + prevDefined=false; + } + } + + horizLimit[columns]=minScore_off; + prevDefined=false; + for(int i=columns-1; i>=0; i--){ + byte c=ref[refStartLoc+i]; + if(AminoAcid.isFullyDefined(c)){ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined ? 
POINTSoff_MATCH2 : POINTSoff_MATCH), floor); + prevDefined=true; + }else{ + horizLimit[i]=Tools.max(horizLimit[i+1]-(prevDefined && c==GAPC ? POINTSoff_DEL : POINTSoff_NOREF), floor); + prevDefined=false; + } + } + +// vertLimit[rows]=minScore_off; +// for(int i=rows-1; i>=0; i--){ +// vertLimit[i]=Tools.max(vertLimit[i+1]-POINTSoff_MATCH2, floor); +// } +// +// horizLimit[columns]=minScore_off; +// for(int i=columns-1; i>=0; i--){ +// horizLimit[i]=Tools.max(horizLimit[i+1]-POINTSoff_MATCH2, floor); +// } + + for(int row=1; row<=rows; row++){ + + final int colStart=(halfband<1 ? minGoodCol : Tools.max(minGoodCol, row-halfband)); + final int colStop=(halfband<1 ? maxGoodCol : Tools.min(maxGoodCol, row+halfband*2-1)); + + minGoodCol=-1; + maxGoodCol=-2; + + final int vlimit=vertLimit[row]; + + if(verbose2){ + System.out.println(); + System.out.println("row="+row); + System.out.println("colStart="+colStart); + System.out.println("colStop="+colStop); + System.out.println("vlimit="+(vlimit>>SCOREOFFSET)+"\t"+(vlimit)); + } + + if(colStart<0 || colStop1){ + assert(row>0); + packed[MODE_MS][row][colStart-1]=subfloor; + packed[MODE_INS][row][colStart-1]=subfloor; + packed[MODE_DEL][row][colStart-1]=subfloor; + } + + + for(int col=colStart; col<=columns; col++){ + + + if(verbose2){ + System.out.println("\ncol "+col); + } + + final byte call0=(row<2 ? (byte)'?' : read[row-2]); + final byte call1=read[row-1]; + final byte ref0=(col<2 ? (byte)'!' : ref[refStartLoc+col-2]); + final byte ref1=ref[refStartLoc+col-1]; + + final boolean gap=(ref1==GAPC); + assert(call1!=GAPC); + +// final boolean match=(read[row-1]==ref[refStartLoc+col-1]); +// final boolean prevMatch=(row<2 || col<2 ? 
false : read[row-2]==ref[refStartLoc+col-2]); + final boolean match=(call1==ref1 && ref1!='N'); + final boolean prevMatch=(call0==ref0 && ref0!='N'); + +// System.err.println("") + + iterationsLimited++; + final int limit=Tools.max(vlimit, horizLimit[col]); + final int limit3=Tools.max(floor, (match ? limit-POINTSoff_MATCH2 : limit-POINTSoff_SUB3)); + + final int delNeeded=Tools.max(0, row-col-1); + final int insNeeded=Tools.max(0, (rows-row)-(columns-col)-1); + + final int delPenalty=calcDelScoreOffset(delNeeded); + final int insPenalty=calcInsScoreOffset(insNeeded); + + + final int scoreFromDiag_MS=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel_MS=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns_MS=packed[MODE_INS][row-1][col-1]&SCOREMASK; + + final int scoreFromDiag_DEL=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel_DEL=packed[MODE_DEL][row][col-1]&SCOREMASK; + + final int scoreFromDiag_INS=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns_INS=packed[MODE_INS][row-1][col]&SCOREMASK; + +// if(scoreFromDiag_MS=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag_MS+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 
1 : streak+1); +// time=(prevMatch ? (streak==1 ? 3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + +// if(time>MAX_TIME){time=MAX_TIME-MASK5;} +// assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; +// assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +//// packed[MODE_MS][row][col]=(score|prevState|time); +// packed[MODE_MS][row][col]=(score|time); +// assert((score&SCOREMASK)==score); +//// assert((prevState&MODEMASK)==prevState); +// assert((time&TIMEMASK)==time); + } + + final int limit2; + if(delNeeded>0){ + limit2=limit-delPenalty; + }else if(insNeeded>0){ + limit2=limit-insPenalty; + }else{ + limit2=limit; + } + assert(limit2>=limit); + + if(verbose2){System.err.println("MS: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + + if((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit) || rowBARRIER_D2){ +// assert((scoreFromDiag_DEL<=limit && scoreFromDel_DEL<=limit)) : scoreFromDiag_DEL+", "+row; + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + int scoreMS=scoreFromDiag_DEL+POINTSoff_DEL; + int scoreD=scoreFromDel_DEL+(streak==0 ? 
POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + final int limit2; + if(insNeeded>0){ + limit2=limit-insPenalty; + }else if(delNeeded>0){ + limit2=limit-calcDelScoreOffset(time+delNeeded)+calcDelScoreOffset(time); + }else{ + limit2=limit; + } + assert(limit2>=limit); + if(verbose2){System.err.println("DEL: \tlimit2="+(limit2>>SCOREOFFSET)+"\t, score="+(score>>SCOREOFFSET));} + + if(score>=limit2){ + maxGoodCol=col; + if(minGoodCol<0){minGoodCol=col;} + }else{ + score=subfloor; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE || score==BADoff) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + +// if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || colBARRIER_I2){ + if(gap || (scoreFromDiag_INS<=limit && scoreFromIns_INS<=limit) || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + int scoreMS=scoreFromDiag_INS+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns_INS+(streak==0 ? 
POINTSoff_INS : + streak=colStop){ + if(col>colStop && (maxGoodCol0)){break;} + if(row>1){ + packed[MODE_MS][row-1][col+1]=subfloor; + packed[MODE_INS][row-1][col+1]=subfloor; + packed[MODE_DEL][row-1][col+1]=subfloor; + } + } + } + } + + + int maxCol=-1; + int maxState=-1; + int maxScore=Integer.MIN_VALUE; + + for(int state=0; statemaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + + assert(maxScore>=BADoff); +// if(maxScore==BADoff){ +// return null; +// } +// if(maxScore>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Override + public final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int[] gaps){ + if(gaps==null){return fillUnlimited(read, ref, refStartLoc, refEndLoc);} + else{ + byte[] gref=makeGref(ref, gaps, refStartLoc, refEndLoc); + assert(gref!=null) : "Excessively long read:\n"+new String(read); + return fillUnlimited(read, gref, 0, greflimit); + } + } + + + /** return new int[] {rows, maxC, maxS, max}; + * Does not require a min score (ie, same as old method) */ + private final int[] fillUnlimited(byte[] read, byte[] ref, int refStartLoc, int refEndLoc){ + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + final int maxGain=(read.length-1)*POINTSoff_MATCH2+POINTSoff_MATCH; + final int subfloor=0-2*maxGain; + assert(subfloor>BADoff && subfloor*2>BADoff) : (read.length-1)+", "+maxGain+", "+subfloor+", "+(subfloor*2)+", "+BADoff+"\n" + +rows+", "+columns+", "+POINTSoff_MATCH2+", "+SCOREOFFSET+"\n"+new String(read)+"\n"; //TODO: Actually, it needs to be substantially more. 
+// final int BARRIER_I2=columns-BARRIER_I1; + final int BARRIER_I2=rows-BARRIER_I1; + final int BARRIER_D2=rows-BARRIER_D1; + + //temporary, for finding a bug + if(rows>maxRows || columns>maxColumns){ + throw new RuntimeException("rows="+rows+", maxRows="+maxRows+", cols="+columns+", maxCols="+maxColumns+"\n"+new String(read)+"\n"); + } + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS; + if(ref1!='N' && call1!='N'){ + scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + if(rowBARRIER_D2){ + packed[MODE_DEL][row][col]=subfloor; + }else{//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + //Calculate INS score +// if(gap || colBARRIER_I2){ + if(gap || rowBARRIER_I2){ + packed[MODE_INS][row][col]=subfloor; + }else{//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Deprecated + /** return new int[] {rows, maxC, maxS, max}; */ + public final int[] fillQ(byte[] read, byte[] ref, byte[] baseScores, int refStartLoc, int refEndLoc){ + assert(false) : "Needs to be redone to work with score cutoffs. Not difficult."; + rows=read.length; + columns=refEndLoc-refStartLoc+1; + + assert(rows<=maxRows) : "Check that values are in-bounds before calling this function: "+rows+", "+maxRows; + assert(columns<=maxColumns) : "Check that values are in-bounds before calling this function: "+columns+", "+maxColumns; + + assert(refStartLoc>=0) : "Check that values are in-bounds before calling this function: "+refStartLoc; + assert(refEndLoc=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? streak+1 : 1); +// prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; +// prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; +// prevState=MODE_INS; + } + score+=(((int)baseScores[row-1])<MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + + }else{ + + int scoreMS=scoreFromDiag+(prevMatch ? (streak<=1 ? POINTSoff_SUBR : POINTSoff_SUB) : + (streak==0 ? POINTSoff_SUB : streak=scoreD && scoreMS>=scoreI){ + score=scoreMS; + time=(prevMatch ? 1 : streak+1); +// time=(prevMatch ? (streak==1 ? 
3 : 1) : streak+1); + prevState=MODE_MS; + }else if(scoreD>=scoreI){ + score=scoreD; + time=1; + prevState=MODE_DEL; + }else{ + score=scoreI; + time=1; + prevState=MODE_INS; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_MS][row][col]=(score|prevState|time); + packed[MODE_MS][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + } + } + + {//Calculate DEL score + + final int streak=packed[MODE_DEL][row][col-1]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_DEL; + int scoreD=scoreFromDel+(streak==0 ? POINTSoff_DEL : + streak=scoreD){ + score=scoreMS; + time=1; + prevState=MODE_MS; + }else{ + score=scoreD; + time=streak+1; + prevState=MODE_DEL; + } + + if(time>MAX_TIME){time=MAX_TIME-MASK5;} + assert(score>=MINoff_SCORE) : "Score overflow - use MSA2 instead"; + assert(score<=MAXoff_SCORE) : "Score overflow - use MSA2 instead"; +// packed[MODE_DEL][row][col]=(score|prevState|time); + packed[MODE_DEL][row][col]=(score|time); + assert((score&SCOREMASK)==score); +// assert((prevState&MODEMASK)==prevState); + assert((time&TIMEMASK)==time); + } + + {//Calculate INS score + + final int streak=packed[MODE_INS][row-1][col]&TIMEMASK; + + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + + int scoreMS=scoreFromDiag+POINTSoff_INS; +// int scoreD=scoreFromDel+POINTSoff_INS; + int scoreI=scoreFromIns+(streak==0 ? 
POINTSoff_INS : + streakmaxScore){ + maxScore=x; + maxCol=col; + maxState=state; + } + } + } + maxScore>>=SCOREOFFSET; + +// System.err.println("Returning "+rows+", "+maxCol+", "+maxState+", "+maxScore); + return new int[] {rows, maxCol, maxState, maxScore}; + } + + @Override + /** @return {score, bestRefStart, bestRefStop} */ + /** Generates the match string */ + public final byte[] traceback(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state, boolean gapped){ + if(gapped){ + final byte[] gref=grefbuffer; + int gstart=translateToGappedCoordinate(refStartLoc, gref); + int gstop=translateToGappedCoordinate(refEndLoc, gref); + byte[] out=traceback2(read, gref, gstart, gstop, row, col, state); + return out; + }else{ + return traceback2(read, ref, refStartLoc, refEndLoc, row, col, state); + } + } + + @Override + /** Generates the match string */ + public final byte[] traceback2(byte[] read, byte[] ref, int refStartLoc, int refEndLoc, int row, int col, int state){ +// assert(false); + assert(refStartLoc<=refEndLoc) : refStartLoc+", "+refEndLoc; + assert(row==rows); + + byte[] out=new byte[row+col-1]; //TODO if an out of bound crash occurs, try removing the "-1". + int outPos=0; + + int gaps=0; + + if(state==MODE_INS){ + //TODO ? Maybe not needed. 
+ } + + while(row>0 && col>0){ + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + +// System.err.println("state="+state+", prev="+prev+", row="+row+", col="+col+", score="+scores[state][row][col]); + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + + byte c=read[row-1]; + byte r=ref[refStartLoc+col-1]; + if(c==r){ + out[outPos]='m'; + }else{ + if(!AminoAcid.isFullyDefined(c, colorspace)){ + out[outPos]='N'; + }else if(!AminoAcid.isFullyDefined(r, colorspace)){ +// out[outPos]='X'; + out[outPos]='N'; + }else{ + out[outPos]='S'; + } + } + + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + + byte r=ref[refStartLoc+col-1]; + if(r==GAPC){ + out[outPos]='-'; + gaps++; + }else{ + out[outPos]='D'; + } + col--; + }else{ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + + assert(state==MODE_INS) : state; + if(col==0){ + out[outPos]='X'; + }else if(col>=columns){ + out[outPos]='Y'; + }else{ + out[outPos]='I'; + } + row--; + } + +// assert(prev==prev0); + state=prev; + outPos++; + } + + assert(row==0 || col==0); + if(col!=row){ + while(row>0){ + out[outPos]='X'; + outPos++; + row--; + col--; + 
} + if(col>0){ + //do nothing + } + } + + + //Shrink and reverse the string + byte[] out2=new byte[outPos]; + for(int i=0; i "+translateFromGappedCoordinate(out[1], gref)+" -> "+ + translateToGappedCoordinate(translateFromGappedCoordinate(out[1], gref), gref); + assert(out[2]==translateToGappedCoordinate(translateFromGappedCoordinate(out[2], gref), gref)); + + out[1]=translateFromGappedCoordinate(out[1], gref); + out[2]=translateFromGappedCoordinate(out[2], gref); + if(verbose){System.err.println("returning score "+Arrays.toString(out));} + return out; + }else{ + return score2(read, ref, refStartLoc, refEndLoc, maxRow, maxCol, maxState); + } + } + + @Override + /** @return {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState},
+ * or {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}
+ * if more padding is needed */ + public final int[] score2(final byte[] read, final byte[] ref, final int refStartLoc, final int refEndLoc, + final int maxRow, final int maxCol, final int maxState){ + + int row=maxRow; + int col=maxCol; + int state=maxState; + + assert(maxState>=0 && maxState=0 && maxRow=0 && maxColdifC){ + score+=POINTSoff_NOREF; + difR--; + } + + row+=difR; + col+=difR; + + } + + assert(refStartLoc<=refEndLoc); + assert(row==rows); + + + final int bestRefStop=refStartLoc+col-1; + + while(row>0 && col>0){ +// System.err.println("state="+state+", row="+row+", col="+col); + + + +// byte prev0=(byte)(packed[state][row][col]&MODEMASK); + + final int time=packed[state][row][col]&TIMEMASK; + final byte prev; + + if(state==MODE_MS){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row-1][col-1]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel && scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else if(scoreFromDel>=scoreFromIns){prev=MODE_DEL;} + else{prev=MODE_INS;} + } + row--; + col--; + }else if(state==MODE_DEL){ + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row][col-1]&SCOREMASK; + final int scoreFromDel=packed[MODE_DEL][row][col-1]&SCOREMASK; + if(scoreFromDiag>=scoreFromDel){prev=MODE_MS;} + else{prev=MODE_DEL;} + } + col--; + }else{ + assert(state==MODE_INS); + if(time>1){prev=(byte)state;} + else{ + final int scoreFromDiag=packed[MODE_MS][row-1][col]&SCOREMASK; + final int scoreFromIns=packed[MODE_INS][row-1][col]&SCOREMASK; + if(scoreFromDiag>=scoreFromIns){prev=MODE_MS;} + else{prev=MODE_INS;} + } + row--; + } + + if(col<0){ + System.err.println(row); + break; //prevents an out of bounds access + + } + +// assert(prev==prev0); + state=prev; + +// System.err.println("state2="+state+", row2="+row+", col2="+col+"\n"); + } +// assert(false) : row+", 
"+col; + if(row>col){ + col-=row; + } + + final int bestRefStart=refStartLoc+col; + + score>>=SCOREOFFSET; + int[] rvec; + if(bestRefStartrefEndLoc){ //Suggest extra padding in cases of overflow + int padLeft=Tools.max(0, refStartLoc-bestRefStart); + int padRight=Tools.max(0, bestRefStop-refEndLoc); + rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState, padLeft, padRight}; + }else{ + rvec=new int[] {score, bestRefStart, bestRefStop, maxRow, maxCol, maxState}; + } + return rvec; + } + + /** + * Fills grefbuffer + * @param ref + * @param a + * @param b + * @param gaps + * @return gref + */ + private final byte[] makeGref(byte[] ref, int[] gaps, int refStartLoc, int refEndLoc){ + assert(gaps!=null && gaps.length>0); + + assert(refStartLoc<=gaps[0]) : refStartLoc+", "+refEndLoc+", "+Arrays.toString(gaps); + assert(refEndLoc>=gaps[gaps.length-1]); + + final int g0_old=gaps[0]; + final int gN_old=gaps[gaps.length-1]; + gaps[0]=Tools.min(gaps[0], refStartLoc); + gaps[gaps.length-1]=Tools.max(gN_old, refEndLoc); + grefRefOrigin=gaps[0]; + + if(verbose){System.err.println("\ngaps2: "+Arrays.toString(gaps));} + +// grefRefOrigin=Tools.min(gaps[0], refStartLoc); + +// //This block is no longer needed since the array is preallocated. +// int len=0; +// final int gb2=GAPBUFFER*2; +// for(int i=0; iy); +// int gap=z-y-1; +// if(gapy); + int gap=z-y-1; + assert(gap>=MINGAP) : gap+"\t"+MINGAP; + if(gapgref.length){ + System.err.println("gref buffer overflow: "+lim+" > "+gref.length); + return null; + } + for(int i=greflimit, r=refEndLoc+1; i "+j);} + return j; + } + + j+=(c==GAPC ? 
GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + private final int translateToGappedCoordinate(int point, byte[] gref){ + if(verbose){System.err.println("translateToGappedCoordinate("+point+"), gro="+grefRefOrigin+", grl="+greflimit);} + if(point<=grefRefOrigin){return point-grefRefOrigin;} + for(int i=0, j=grefRefOrigin; i "+i);} + return i; + } + + j+=(c==GAPC ? GAPLEN : 1); +// if(c!=GAPC){j++;} +// else{j+=GAPLEN;} + } + + System.err.println(grefRefOrigin); + System.err.println(point); + System.err.println(new String(gref)); + + throw new RuntimeException("Out of bounds."); + } + + + /** Calculates score based on an array from Index */ + private final int calcAffineScore(int[] locArray){ + int score=0; + int lastLoc=-2; //Last true location + int lastValue=-1; + int timeInMode=0; + + for(int i=0; i0){//match + if(loc==lastValue){//contiguous match + score+=POINTS_MATCH2; + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=POINTS_MATCH; + }else if(loc=0); + score+=POINTS_MATCH; + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=POINTS_MATCH; + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + 
if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else{//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInMode0){//match + if(loc==lastValue){//contiguous match + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else if(loc==-1){//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInMode1) : minContig; + + int contig=0; + int maxContig=0; + + int score=0; + int lastLoc=-3; //Last true location + int lastValue=-1; 
+ int timeInMode=0; + + for(int i=0; i0){//match + if(loc==lastValue){//contiguous match + contig++; + score+=(POINTS_MATCH2+baseScores[i]); + }else if(loc==lastLoc || lastLoc<0){//match after a sub, or first match + maxContig=Tools.max(maxContig, contig); + contig=1; + score+=(POINTS_MATCH+baseScores[i]); + }else if(loc=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_DEL; + int dif=lastLoc-loc+1; + if(dif>MINGAP){ + int rem=dif%GAPLEN; + int div=(dif-GAPBUFFER2)/GAPLEN; + score+=(div*POINTS_GAP); + assert(rem+GAPBUFFER2LIMIT_FOR_COST_4); //and probably LIMIT_FOR_COST_5 +// assert(false) : div; + } + if(dif>LIMIT_FOR_COST_5){ + score+=((dif-LIMIT_FOR_COST_5+MASK5)/TIMESLIP)*POINTS_DEL5; + dif=LIMIT_FOR_COST_5; + } + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_DEL4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_DEL3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_DEL2; + } + timeInMode=1; + }else if(loc>lastLoc){//insertion + maxContig=Tools.max(maxContig, contig); + contig=0; + assert(lastLoc>=0); + score+=(POINTS_MATCH+baseScores[i]); + score+=POINTS_INS; + int dif=Tools.min(loc-lastLoc+1, 5); + assert(dif>0); + if(dif>LIMIT_FOR_COST_4){ + score+=(dif-LIMIT_FOR_COST_4)*POINTS_INS4; + dif=LIMIT_FOR_COST_4; + } + if(dif>LIMIT_FOR_COST_3){ + score+=(dif-LIMIT_FOR_COST_3)*POINTS_INS3; + dif=LIMIT_FOR_COST_3; + } + if(dif>1){ + score+=(dif-1)*POINTS_INS2; + } + timeInMode=1; + }else{ + assert(false); + } + lastLoc=loc; + }else if(loc==-1){//substitution + if(lastValue<0 && timeInMode>0){//contiguous + if(timeInModeref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. + + for(int i=readStart; i=ref.length) ? 
(byte)'N' : ref[j]; + + if(c=='N' || r=='N'){match[i]='N';} + else if(c==r){match[i]='m';} + else{match[i]='S';} + + } + + return match; + } + + @Override + public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart){ + return scoreNoIndels(read, ref, baseScores, refStart, null); + } + @Override + public final int scoreNoIndels(byte[] read, byte[] ref, byte[] baseScores, final int refStart, SiteScore ss){ + + int score=0; + int mode=-1; + int timeInMode=0; + int norefs=0; + + //This block handles cases where the read runs outside the reference + //Of course, padding the reference with 'N' would be better, but... + int readStart=0; + int readStop=read.length; + final int refStop=refStart+read.length; + boolean semiperfect=true; + + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + norefs+=readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + score+=POINTS_NOREF*dif; + norefs+=dif; + } + +// if(refStart<0 || refStart+read.length>ref.length){return -99999;} //No longer needed. 
+ + for(int i=readStart; iref.length){return -99999;} + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].lengthref.length){return -99999;} + if(refStart<0){ + readStart=0-refStart; + score+=POINTS_NOREF*readStart; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + System.err.println(new String(read)+"\ndif="+dif+", ref.length="+ref.length+", refStop="+refStop); + readStop-=dif; + score+=POINTS_NOREF*dif; + } + assert(refStart+readStop<=ref.length) : "readStart="+readStart+", readStop="+readStop+ + ", refStart="+refStart+", refStop="+refStop+", ref.length="+ref.length+", read.length="+read.length; + + assert(matchReturn!=null); + assert(matchReturn.length==1); + if(matchReturn[0]==null || matchReturn[0].length!=read.length){ + assert(matchReturn[0]==null || matchReturn[0].length>SCOREOFFSET).append(",");} + return sb; + } + + public static float minIdToMinRatio(double minid){ + if(minid>1){minid=minid/100;} + assert(minid>0 && minid<=1) : "Min identity should be between 0 and 1. 
Values above 1 will be assumed to be percent and divided by 100."; + double matchdif=POINTS_MATCH-POINTS_MATCH2; + double match=POINTS_MATCH2; + double sub=-POINTS_MATCH2+0.5*(matchdif+POINTS_SUB)+0.5*POINTS_SUB2; + double del=0.1*(matchdif+POINTS_DEL)+0.2*POINTS_DEL2+0.4*POINTS_DEL3+0.3*POINTS_DEL4; + double ins=-POINTS_MATCH2+0.4*(matchdif+POINTS_INS)+0.3*(POINTS_INS2)+0.3*(POINTS_INS3); + double badAvg=.7*sub+.2*del+.1*ins; + double badFraction=1-minid; + double minratio=(match+badFraction*badAvg)/match; + assert(minratio<=1); + minratio=Tools.max(0.1, minratio); + return (float)minratio; + } + + public static final int TIMEBITS=11; + public static final int SCOREBITS=32-TIMEBITS; + public static final int MAX_TIME=((1<0){percUp(1);} +// assert(queue.size()==size); +// assert(queue.peek()==peek()); + //assert(testForDuplicates()); + return t; + } + + private void percDown(int loc){ + //assert(testForDuplicates()); + assert(loc>0); + if(loc==1){return;} + + int next=loc/2; + final long a=array[loc]; + long b=array[next]; + +// while(loc>1 && (a.site1 && a0 && loc<=size) : loc+", "+size; + int next1=loc*2; + int next2=next1+1; + if(next1>size){return;} + long a=array[loc]; + long b=array[next1]; + long c=array[next2]; + assert(a!=b); + assert(b!=c); + assert(b!=-1L); + //assert(testForDuplicates()); + if(c==-1L || b<=c){ + if(a>b){ +// if((a.site>b.site || (a.site==b.site && a.column>b.column))){ + array[next1]=a; + array[loc]=b; + //assert(testForDuplicates()); + percUp(next1); + } + }else{ + if(a>c){ +// if((a.site>c.site || (a.site==c.site && a.column>c.column))){ + array[next2]=a; + array[loc]=c; + //assert(testForDuplicates()); + percUp(next2); + } + } + } + + private void percUpIter(int loc){ + //assert(testForDuplicates()); + assert(loc>0 && loc<=size) : loc+", "+size; + final long a=array[loc]; + //assert(testForDuplicates()); + + int next1=loc*2; + int next2=next1+1; + + while(next1<=size){ + + long b=array[next1]; + long c=array[next2]; + assert(a!=b); + 
assert(b!=c); + assert(b!=-1L); + + if(c==-1L || b<=c){ +// if(c==-1L || (b.siteb){ +// if((a.site>b.site || (a.site==b.site && a.column>b.column))){ +// array[next1]=a; + array[loc]=b; + loc=next1; + }else{ + break; + } + }else{ + if(a>c){ +// if((a.site>c.site || (a.site==c.site && a.column>c.column))){ +// array[next2]=a; + array[loc]=c; + loc=next2; + }else{ + break; + } + } + next1=loc*2; + next2=next1+1; + } + array[loc]=a; + } + + public boolean isEmpty(){ +// assert((size==0) == queue.isEmpty()); + return size==0; + } + + public void clear(){ +// queue.clear(); +// for(int i=1; i<=size; i++){array[i]=-1L;} + size=0; + } + + public int size(){ + return size; + } + + public static int tier(int x){ + int leading=Integer.numberOfLeadingZeros(x); + return 31-leading; + } + + public boolean testForDuplicates(){ + for(int i=0; i{ + + public static Pointer[] loadMatrix(int[][] matrix){ + Pointer[] out=new Pointer[matrix.length]; + for(int i=0; i0){ + File f=new File(args[0]); + if(f.exists()){ + String s=ReadWrite.readString(args[0]); +// TextFile tf=new TextFile(args[0], false, false); +// String s=tf.nextLine(); +// tf.close(); + long old=Long.parseLong(s); + long elapsed=millis-old; + if(args.length<2 || Tools.parseBoolean(args[1])){ + System.out.println("Elapsed:\t"+String.format("%.2f", elapsed/1000d)); + if(true){ + System.err.println("Elapsed:\t"+String.format("%.2f", elapsed/1000d)); + } + } + } + f=null; + ReadWrite.writeString(millis+"", args[0]); + } + } + +} diff --git a/current/align2/Quad.java b/current/align2/Quad.java new file mode 100755 index 0000000..3801ef3 --- /dev/null +++ b/current/align2/Quad.java @@ -0,0 +1,33 @@ +package align2; + +public class Quad implements Comparable{ + + public Quad(int col_, int row_, int val_){ + column=col_; + row=row_; + site=val_; + } + + public boolean equals(Object other){ + return site==((Quad)other).site; + } + + @Override + public int hashCode(){return site;} + + @Override + public int compareTo(Quad other) 
{ + int x=site-other.site; + return(x==0 ? column-other.column : x); + } + + public String toString(){ + return("("+column+","+row+","+site+")"); + } + + public final int column; + public int row; + public int site; + public int list[]; + +} diff --git a/current/align2/Quad64.java b/current/align2/Quad64.java new file mode 100755 index 0000000..838688e --- /dev/null +++ b/current/align2/Quad64.java @@ -0,0 +1,35 @@ +package align2; + +public class Quad64 implements Comparable{ + + public Quad64(int col_, int row_, int val_){ + column=col_; + row=row_; + site=val_; + } + + public boolean equals(Object other){ + assert(false); + return site==((Quad64)other).site; + } + + @Override + public int hashCode(){return (int)site;} + + @Override + public int compareTo(Quad64 other) { + return site>other.site ? 1 : site0 ? 1 : x<0 ? -1 : column-other.column); + } + + public String toString(){ + return("("+column+","+row+","+site+")"); + } + + public final int column; + public int row; + public long site; + public int list[]; + +} diff --git a/current/align2/Quad64Heap.java b/current/align2/Quad64Heap.java new file mode 100755 index 0000000..61761d7 --- /dev/null +++ b/current/align2/Quad64Heap.java @@ -0,0 +1,219 @@ +package align2; + +public final class Quad64Heap { + + public Quad64Heap(int maxSize){ + + int len=maxSize+1; + if((len&1)==1){len++;} //Array size is always even. 
+ + CAPACITY=maxSize; + array=new Quad64[len]; +// queue=new PriorityQueue(maxSize); + } + + public boolean add(Quad64 t){ + //assert(testForDuplicates()); +// assert(queue.size()==size); +// queue.add(t); + assert(size==0 || array[size]!=null); + size++; + array[size]=t; + percDown(size); +// assert(queue.size()==size); +// assert(queue.peek()==peek()); + //assert(testForDuplicates()); + return true; + } + + public Quad64 peek(){ + //assert(testForDuplicates()); +// assert(queue.size()==size); + if(size==0){return null;} +// assert(array[1]==queue.peek()) : size+", "+queue.size()+"\n"+ +// array[1]+"\n"+ +// array[2]+" , "+array[3]+"\n"+ +// array[4]+" , "+array[5]+" , "+array[6]+" , "+array[7]+"\n"+ +// queue.peek()+"\n"; + //assert(testForDuplicates()); + return array[1]; + } + + public Quad64 poll(){ + //assert(testForDuplicates()); +// assert(queue.size()==size); + if(size==0){return null;} + Quad64 t=array[1]; +// assert(t==queue.poll()); + array[1]=array[size]; + array[size]=null; + size--; + if(size>0){percUp(1);} +// assert(queue.size()==size); +// assert(queue.peek()==peek()); + //assert(testForDuplicates()); + return t; + } + +// private void percDownRecursive(int loc){ +// //assert(testForDuplicates()); +// assert(loc>0); +// if(loc==1){return;} +// int next=loc/2; +// Quad64 a=array[loc]; +// Quad64 b=array[next]; +// assert(a!=b); +// if(a.compareTo(b)<0){ +// array[next]=a; +// array[loc]=b; +// percDown(next); +// } +// } +// +// private void percDown_old(int loc){ +// //assert(testForDuplicates()); +// assert(loc>0); +// +// final Quad64 a=array[loc]; +// +// while(loc>1){ +// int next=loc/2; +// Quad64 b=array[next]; +// assert(a!=b); +// if(a.compareTo(b)<0){ +// array[next]=a; +// array[loc]=b; +// loc=next; +// }else{return;} +// } +// } + + private void percDown(int loc){ + //assert(testForDuplicates()); + assert(loc>0); + if(loc==1){return;} + + int next=loc/2; + final Quad64 a=array[loc]; + Quad64 b=array[next]; + +// while(loc>1 && (a.site1 
&& a.compareTo(b)<0){ + array[loc]=b; + loc=next; + next=next/2; + b=array[next]; + } + + array[loc]=a; + } + + private void percUp(int loc){ + //assert(testForDuplicates()); + assert(loc>0 && loc<=size) : loc+", "+size; + int next1=loc*2; + int next2=next1+1; + if(next1>size){return;} + Quad64 a=array[loc]; + Quad64 b=array[next1]; + Quad64 c=array[next2]; + assert(a!=b); + assert(b!=c); + assert(b!=null); + //assert(testForDuplicates()); + if(c==null || b.compareTo(c)<1){ + if(a.compareTo(b)>0){ +// if((a.site>b.site || (a.site==b.site && a.column>b.column))){ + array[next1]=a; + array[loc]=b; + //assert(testForDuplicates()); + percUp(next1); + } + }else{ + if(a.compareTo(c)>0){ +// if((a.site>c.site || (a.site==c.site && a.column>c.column))){ + array[next2]=a; + array[loc]=c; + //assert(testForDuplicates()); + percUp(next2); + } + } + } + + private void percUpIter(int loc){ + //assert(testForDuplicates()); + assert(loc>0 && loc<=size) : loc+", "+size; + final Quad64 a=array[loc]; + //assert(testForDuplicates()); + + int next1=loc*2; + int next2=next1+1; + + while(next1<=size){ + + Quad64 b=array[next1]; + Quad64 c=array[next2]; + assert(a!=b); + assert(b!=c); + assert(b!=null); + + if(c==null || b.compareTo(c)<1){ +// if(c==null || (b.site0){ +// if((a.site>b.site || (a.site==b.site && a.column>b.column))){ +// array[next1]=a; + array[loc]=b; + loc=next1; + }else{ + break; + } + }else{ + if(a.compareTo(c)>0){ +// if((a.site>c.site || (a.site==c.site && a.column>c.column))){ +// array[next2]=a; + array[loc]=c; + loc=next2; + }else{ + break; + } + } + next1=loc*2; + next2=next1+1; + } + array[loc]=a; + } + + public boolean isEmpty(){ +// assert((size==0) == queue.isEmpty()); + return size==0; + } + + public void clear(){ +// queue.clear(); +// for(int i=1; i<=size; i++){array[i]=null;} + size=0; + } + + public int size(){ + return size; + } + + public static int tier(int x){ + int leading=Integer.numberOfLeadingZeros(x); + return 31-leading; + } + + public boolean 
testForDuplicates(){ + for(int i=0; i(maxSize); + } + + public boolean add(Quad t){ + //assert(testForDuplicates()); +// assert(queue.size()==size); +// queue.add(t); + assert(size==0 || array[size]!=null); + size++; + array[size]=t; + percDown(size); +// assert(queue.size()==size); +// assert(queue.peek()==peek()); + //assert(testForDuplicates()); + return true; + } + + public Quad peek(){ + //assert(testForDuplicates()); +// assert(queue.size()==size); + if(size==0){return null;} +// assert(array[1]==queue.peek()) : size+", "+queue.size()+"\n"+ +// array[1]+"\n"+ +// array[2]+" , "+array[3]+"\n"+ +// array[4]+" , "+array[5]+" , "+array[6]+" , "+array[7]+"\n"+ +// queue.peek()+"\n"; + //assert(testForDuplicates()); + return array[1]; + } + + public Quad poll(){ + //assert(testForDuplicates()); +// assert(queue.size()==size); + if(size==0){return null;} + Quad t=array[1]; +// assert(t==queue.poll()); + array[1]=array[size]; + array[size]=null; + size--; + if(size>0){percUp(1);} +// assert(queue.size()==size); +// assert(queue.peek()==peek()); + //assert(testForDuplicates()); + return t; + } + +// private void percDownRecursive(int loc){ +// //assert(testForDuplicates()); +// assert(loc>0); +// if(loc==1){return;} +// int next=loc/2; +// Quad a=array[loc]; +// Quad b=array[next]; +// assert(a!=b); +// if(a.compareTo(b)<0){ +// array[next]=a; +// array[loc]=b; +// percDown(next); +// } +// } +// +// private void percDown_old(int loc){ +// //assert(testForDuplicates()); +// assert(loc>0); +// +// final Quad a=array[loc]; +// +// while(loc>1){ +// int next=loc/2; +// Quad b=array[next]; +// assert(a!=b); +// if(a.compareTo(b)<0){ +// array[next]=a; +// array[loc]=b; +// loc=next; +// }else{return;} +// } +// } + + private void percDown(int loc){ + //assert(testForDuplicates()); + assert(loc>0); + if(loc==1){return;} + + int next=loc/2; + final Quad a=array[loc]; + Quad b=array[next]; + +// while(loc>1 && (a.site1 && a.compareTo(b)<0){ + array[loc]=b; + loc=next; + 
next=next/2; + b=array[next]; + } + + array[loc]=a; + } + + private void percUp(int loc){ + //assert(testForDuplicates()); + assert(loc>0 && loc<=size) : loc+", "+size; + int next1=loc*2; + int next2=next1+1; + if(next1>size){return;} + Quad a=array[loc]; + Quad b=array[next1]; + Quad c=array[next2]; + assert(a!=b); + assert(b!=c); + assert(b!=null); + //assert(testForDuplicates()); + if(c==null || b.compareTo(c)<1){ + if(a.compareTo(b)>0){ +// if((a.site>b.site || (a.site==b.site && a.column>b.column))){ + array[next1]=a; + array[loc]=b; + //assert(testForDuplicates()); + percUp(next1); + } + }else{ + if(a.compareTo(c)>0){ +// if((a.site>c.site || (a.site==c.site && a.column>c.column))){ + array[next2]=a; + array[loc]=c; + //assert(testForDuplicates()); + percUp(next2); + } + } + } + + private void percUpIter(int loc){ + //assert(testForDuplicates()); + assert(loc>0 && loc<=size) : loc+", "+size; + final Quad a=array[loc]; + //assert(testForDuplicates()); + + int next1=loc*2; + int next2=next1+1; + + while(next1<=size){ + + Quad b=array[next1]; + Quad c=array[next2]; + assert(a!=b); + assert(b!=c); + assert(b!=null); + + if(c==null || b.compareTo(c)<1){ +// if(c==null || (b.site0){ +// if((a.site>b.site || (a.site==b.site && a.column>b.column))){ +// array[next1]=a; + array[loc]=b; + loc=next1; + }else{ + break; + } + }else{ + if(a.compareTo(c)>0){ +// if((a.site>c.site || (a.site==c.site && a.column>c.column))){ +// array[next2]=a; + array[loc]=c; + loc=next2; + }else{ + break; + } + } + next1=loc*2; + next2=next1+1; + } + array[loc]=a; + } + + public boolean isEmpty(){ +// assert((size==0) == queue.isEmpty()); + return size==0; + } + + public void clear(){ +// queue.clear(); +// for(int i=1; i<=size; i++){array[i]=null;} + size=0; + } + + public int size(){ + return size; + } + + public static int tier(int x){ + int leading=Integer.numberOfLeadingZeros(x); + return 31-leading; + } + + public boolean testForDuplicates(){ + for(int i=0; i1 || r[r.length-1]<0){ + 
System.err.println("Ooops! "+Arrays.toString(r)); + } + } + + time=System.nanoTime()-time; + float seconds=(float)(time/1000000000d); + System.out.println("Bench Time: "+String.format("%.3f",seconds)+" s"); + } + + public static void bench2(int length, int rounds){ + + long time=System.nanoTime(); + + byte[] qual=new byte[length]; + for(int i=0; i1 || r[r.length-1]<0){ + System.err.println("Ooops! "+Arrays.toString(r)); + } + } + + time=System.nanoTime()-time; + float seconds=(float)(time/1000000000d); + System.out.println("Bench2 Time: "+String.format("%.3f",seconds)+" s"); + } + + public static int[] makeKeyScores(byte[] qual, int keylen, int range, int baseScore, int[] out){ + float[] probs=makeKeyProbs(qual, keylen); + return makeKeyScores(probs, (qual.length-keylen+1), range, baseScore, out); + } + + public static int[] makeKeyScores(float[] probs, int numProbs, int range, int baseScore, int[] out){ + if(out==null){out=new int[numProbs];} +// assert(out.length==probs.length); + assert(out.length>=numProbs); + for(int i=0; i=Byte.MIN_VALUE && x<=Byte.MAX_VALUE); + if(negative){ + x=x-maxScore; + assert(x<=0); + }else{ + assert(x>=0 && x<=maxScore); + } + out[i]=(byte)x; + } + return out; + } + + public static byte[] makeByteScoreArray(int maxScore, byte[] out, boolean negative){ + assert(out!=null); +// for(int i=0; i=Byte.MIN_VALUE && x<=Byte.MAX_VALUE); +// if(negative){ +// x=x-maxScore; +// assert(x<=0); +// }else{ +// assert(x>=0 && x<=maxScore); +// } +// out[i]=(byte)x; +// } + Arrays.fill(out, (byte)0); + return out; + } + + /** Returns prob of error for each key */ + public static float[] makeKeyProbs(byte[] quality, int keylen){ + return makeKeyProbs(quality, keylen, null); + } + + /** Returns prob of error for each key */ + public static float[] makeKeyProbs(byte[] quality, int keylen, float[] out){ + if(quality==null){return makeKeyProbs(keylen, out);} + if(out==null){out=new float[quality.length-keylen+1];} + 
assert(out.length>=quality.length-keylen+1) : quality.length+", "+keylen+", "+out.length; +// assert(out.length==quality.length-keylen+1); + float key1=1; + + int timeSinceZero=0; + for(int i=0; i0){timeSinceZero++;}else{timeSinceZero=0;} //Tracks location of N's + assert(q0){timeSinceZero++;}else{timeSinceZero=0;} + float ipa=PROB_CORRECT_INVERSE[qa]; + float pb=PROB_CORRECT[qb]; + key1=key1*ipa*pb; + out[a+1]=1-key1; + if(timeSinceZero0){ +// int range=Tools.max(1, maxQual-q+1); +// int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range)); +// q=(byte)(q+delta); +// }else{ +// int range=Tools.max(1, q-minQual+1); +// int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range), randyQual.nextInt(range)); +// q=(byte)(q-delta); +// } + + if((hilo&15)>0){ + int range=Tools.max(1, maxQual-q+1); + int delta=(randyQual.nextInt(range)+randyQual.nextInt(range+1))/2; + q=(byte)(q+delta); + }else{ + int range=Tools.max(1, q-minQual+1); + int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range)); + q=(byte)(q-delta); + } + q=(byte)Tools.min(Tools.max(q, minQual), maxQual); + out[i]=q; + } + + if(length>50){ + final int x=length/10; + for(int i=0; imax){ + max=f; + index=i; + } + } + + if(index==0 || index==offsets.length-1){return offsets;} + if(max<.98f){return offsets;} + + final int removed=offsets[index]; + { + int[] offsets2=new int[offsets.length-1]; + for(int i=0; iremoved && removed>=0); +// while(i>removed && keyProbs[i-1]>=keyProbs[i]){i--;} +// offsets[0]=i; + }else if(index==offsets.length){ + assert(false); +// int i=offsets[offsets.length-1]; +// assert(i=keyProbs[i]){i++;} +// offsets[offsets.length-1]=i; + }else if(offsets.length>2){ + if(index==offsets.length-1){ + assert(index>1); + int i=offsets[index-1]; //5, 7, 9, 5, 6 + assert(i=keyProbs[i]){i++;} + offsets[index-1]=i; + }else{ + assert(index0); + int i=offsets[index]; + assert(i>removed && removed>=0); + while(i>removed+1 && keyProbs[i-1]>=keyProbs[i]){i--;} + 
offsets[index]=i; + } + } + + return offsets; + } + + + /*-------------------- Fields --------------------*/ + + /*-------------------- Final Fields --------------------*/ + + /*-------------------- Static Fields --------------------*/ + + /** Probability that this base is an error */ + public static final float[] PROB_ERROR=makeQualityToFloat(96); + /** 1/PROB */ + public static final float[] PROB_ERROR_INVERSE=makeInverse(PROB_ERROR); + + public static final float[] PROB_CORRECT=oneMinus(PROB_ERROR); + public static final float[] PROB_CORRECT_INVERSE=makeInverse(PROB_CORRECT); + + /*-------------------- Constants --------------------*/ + + /*-------------------- Initializers --------------------*/ + + public static final double phredToProbError(int phred){ + if(phred<1){return 1;} + return Math.pow(10, 0-.1*phred); + } + + private static final float[] makeQualityToFloat(int n){ + float[] r=new float[n]; + for(int i=0; i=0); + if(minChrom<1){minChrom=1;} + if(maxChrom<1){maxChrom=Data.numChroms;} + + RandomReads rr=new RandomReads(paired); + mateLen=readlen; + + System.err.println("snpRate = \t"+snpRate); + System.err.println("insRate = \t"+insRate); + System.err.println("delRate = \t"+delRate); + System.err.println("subRate = \t"+subRate); + System.err.println("Reads = \t"+number); + System.err.println("Readlen = \t"+readlen); + System.err.println("Paired = \t"+paired); + System.err.println("Genome = \t"+Data.GENOME_BUILD); + System.err.println("PERFECT_READ_RATIO="+PERFECT_READ_RATIO); + + String fname="reads_B"+Data.GENOME_BUILD+"_"+number+"x"+readlen+"bp_" + +maxSnps+"S_"+maxInss+"I_"+maxDels+"D_"+maxSubs+"U_chr"+minChrom+"-"+maxChrom+(paired ? 
"_#" : "")+".fq"; + + Read[] reads=rr.makeRandomReadsX(number, readlen, + maxSnps, maxInss, maxDels, maxSubs, + snpRate, insRate, delRate, subRate, + maxInsertionLen, maxDeletionLen, maxSubLen, + minChrom, maxChrom, false, minQuality, midQuality, maxQuality); + + FASTQ.writeFASTQ(reads, fname.replace("#", "1")); + if(paired){ + for(int i=0; i=0 && num<=3 && num!=oldNum); + byte[] bytes=s.getBytes(); + bytes[index]=AminoAcid.numberToBase[num]; + return new String(bytes); + } + + + public String addSUB(String s, int minlen, int maxlen, int readlen, Random rand){ + assert(readlen<=s.length()) : readlen+", "+s.length(); + assert(minlen>1); + assert(maxlen>=minlen); + +// int len=minlen+rand.nextInt(maxlen-minlen+1); + int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); + + assert(len>=minlen); + assert(len<=maxlen); + + assert(readlen<=s.length()); + int index=rand.nextInt(readlen-len+1); + byte[] bytes=s.getBytes(); + + int lim=index+len-1; + + {//Change first and last to anything except old + int i=index; + + byte old=bytes[i]; + if(AminoAcid.isFullyDefined(old)){ + byte oldNum=AminoAcid.baseToNumber[old]; + int num=(oldNum+rand.nextInt(4))%4; + assert(num>=0 && num<=3); + byte base=AminoAcid.numberToBase[num]; + bytes[i]=base; + } + + i=lim; + old=bytes[i]; + if(AminoAcid.isFullyDefined(old)){ + byte oldNum=AminoAcid.baseToNumber[old]; + int num=(oldNum+rand.nextInt(4))%4; + assert(num>=0 && num<=3); + byte base=AminoAcid.numberToBase[num]; + bytes[i]=base; + } + } + + for(int i=index+1; i=0 && num<=3 && num!=oldNum); + byte base=AminoAcid.numberToBase[num]; + bytes[i]=base; + } + } + + return new String(bytes); + } + + + public String addInsertion(String s, int minlen, int maxlen, int readlen, int[] dif, Random rand){ + + +// assert(false) : minlen+","+maxlen; + assert(readlen<=s.length()) : readlen+", "+s.length(); + assert(minlen>0); + assert(maxlen>=minlen); + +// int len=minlen+rand.nextInt(maxlen-minlen+1); + int 
len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); + + len=Tools.min(len, readlen-dif[1]-2); +// assert(false) : len+", "+readlen+", "+dif[1]; + if(len<1){return s;} + + if(verbose){System.err.println("\nAdding insertion of len="+len+", dif="+dif[0]);} + + dif[0]-=len; + dif[1]+=len; + + int index=rand.nextInt(readlen-len+1); //Assures that all inserted bases will be within the read + + String mid=""; + for(int i=0; i\n"+s2);} + + return s2; + } + + + public String addDeletion(String s, int minlen, int maxlen, int readlen, int[] dif, Random rand){ + assert(s.length()>=readlen+maxlen); + assert(minlen>0); + assert(maxlen>=minlen); + +// int len=maxlen; +// int len=minlen+rand.nextInt(maxlen-minlen+1); + int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); +// System.err.println("Made del len "+len); + dif[0]+=len; + +// int index=rand.nextInt(s.length()-len); + int index=1+rand.nextInt(readlen-1); //Assures there will never be a deletion of the first base, which would not technically be a deletion. + +// System.err.println("Added deletion "+len+" at "+index); + + String s2=s.substring(0, index)+s.substring(index+len); + return s2; + } + + + public Read[] makeRandomReads(int readlen, int number, int minChrom, int maxChrom){ + Read[] out=new Read[number]; + for(int i=0; imaxChrom){ + x=randy.nextInt(); + chrom=randomChrom[(x&0x7FFFFFFF)%randomChrom.length]; +// if(chrom>25 && Data.GENOME_BUILD==36){chrom=-1;} + } + byte strand=(byte) (x>=0 ? 
0 : 1); +// strand=0; //TODO +// System.err.println("Chose chrom "+chrom+", strand "+strand); + return makeRandomRead2(readlen, chrom, strand); + } + + + public Read makeRandomRead2(int readlen, int chrom, byte strand){ + byte[] s=null; + ChromosomeArray cha=Data.getChromosome(chrom); + + int loc=-1; + while(s==null){ +// loc=randy.nextInt(cha.maxIndex-40000); + loc=randy.nextInt(cha.maxIndex-readlen); +// loc=10180206; + s=cha.getBytes(loc, loc+readlen-1); + assert(s.length==readlen); + +// System.out.println(new String(s)); + + if(AminoAcid.countUndefined(s)>5){ + s=null; +// System.out.println("Tossed out string."); + } + } +// System.err.println("Chose loc="+loc); + assert(strand==Gene.MINUS || strand==Gene.PLUS); + + if(strand==Gene.MINUS){ + s=AminoAcid.reverseComplementBases(s); + } + long id=nextReadID; + nextReadID++; + Read r=new Read(s, chrom, strand, loc, loc+s.length-1, id, null, false); + r.setSynthetic(true); +// System.err.println("Made read "+r.start+", "+r.stop); +// assert(readlen==20); + assert(r.bases.length==readlen); +// assert(false) : r.start+", "+r.stop; + return r; + } + + + + public Read[] makeRandomReadsX(int numReads, int readlen, + int maxSnps, int maxInss, int maxDels, int maxSubs, + float snpRate, float insRate, float delRate, float subRate, + int maxInsertionLen, int maxDeletionLen, int maxSubLen, + int minChrom, int maxChrom, boolean colorspace, + int minQual, int midQual, int maxQual){ + assert(minQual<=midQual); + assert(midQual<=maxQual); + assert(minQual>=0 && maxQual<48); +// System.err.println("Called makeRandomReadsX("+numReads+", "+readlen+", "+maxSnps+", "+maxDels+", "+maxInss+", "+ +// snpRate+", "+delRate+", "+insRate+", "+maxInsertionLen+", "+maxDeletionLen+", "+minChrom+", "+maxChrom+")"); + +// if(colorspace){readlen++;} + + Read[] reads=new Read[numReads]; + +// assert(Index2.maxIndel==maxIndel); //Temporary + + + final int maxQualP=Tools.max(35, maxQual); + final int midQualP=30; + final int minQualP=Tools.min(25, 
maxQual); + + for(int i=0; ireadlen){s=s.substring(0, readlen);} + assert(s.length()==readlen); + + if(verbose){System.err.println("After length adjust 1 to "+readlen+": dif="+dif[0]+"\n"+s+"\n");} + +// int preInsertLength=s.length(); + for(int j=0; jreadlen); + s=s.substring(0, readlen); + } + if(verbose){System.err.println("After length adjust 2 to "+readlen+": dif="+dif[0]+"\n"+s+"\n");} + +// if(s.length()!=readlen){ +// assert(s.length()>readlen); +// boolean start=randyCutPos.nextBoolean(); +// if(start){//Take first part of string +// s=s.substring(0, readlen); +// }else{//take last part of string +// s=s.substring(s.length()-readlen); +// } +// } + + for(int j=0; j0){ + int range=(perfect ? maxQualP-midQualP+1 : maxQual-midQual+1); + int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range)); + baseQuality=(byte)((perfect ? midQualP : midQual)+delta); + }else{ + int range=perfect ? midQualP-minQualP+1 : midQual-minQual+1; + int delta=Tools.min(randyQual.nextInt(range), randyQual.nextInt(range)); + baseQuality=(byte)((perfect ? 
midQualP : midQual)-delta); + } + } + + if(USE_FIXED_QUALITY){ + r.quality=getFixedQualityRead(r.bases.length); + }else{ + if(perfect){ + r.quality=QualityTools.makeQualityArray( + r.bases.length, randyQual, minQualP, maxQualP, baseQuality, slant); + }else{ + r.quality=QualityTools.makeQualityArray( + r.bases.length, randyQual, minQual, maxQual, baseQuality, slant); + } + } + for(int j=0; jr.start) : "\n"+Read.header()+"\n"+r+"\n"+SNPs+", "+SUBs+", "+INSs+", "+DELs+"\n"+s+"\n"; + + if(colorspace){ + r=reads[i]=r.translateToColorspace(true); + r.obj=new String(r.bases); //TODO - for testing + } + r.mapLength=r.bases.length; + + + if(paired){ + + Read r2=makeMate(r, mateLen, + maxSnps, maxInss, maxDels, maxSubs, + snpRate, insRate, delRate, subRate, + maxInsertionLen, maxDeletionLen, maxSubLen, + mateMiddleMin, mateMiddleMax, mateSameStrand, + minQual, maxQual, baseQuality, slant, perfect); + + while(r2==null){ + r2=makeMate(r, mateLen, + maxSnps, maxInss, maxDels, maxSubs, + snpRate, insRate, delRate, subRate, + maxInsertionLen, maxDeletionLen, maxSubLen, + mateMiddleMin, mateMiddleMax, mateSameStrand, + minQual, maxQual, baseQuality, slant, perfect); + } + + r.mate=r2; + r2.mate=r; + } +// System.err.println("Made "+r.start+" ~ "+r.stop+" = "+(r.stop-r.start)); + } + +// if(colorspace){ +// for(int i=0; i=minMiddle); +// assert(minMiddle>=0); + int midRange=maxMiddle-minMiddle+1; + int middle=(randyMate.nextInt(midRange)+randyMate.nextInt(midRange))/2+minMiddle; + byte strand=(byte) (sameStrand ? 
other.strand() : other.strand()^1); + +// System.out.println(sameStrand+": "+other.strand+" -> "+strand); + + if(other.strand()==Gene.PLUS){ + x=other.stop+middle; + }else{ + x=other.start-middle-readlen; + } + y=x+readlen+(maxDeletionLen*maxDels); + if(x<0){x=0; y=readlen-1; maxDels=0;} + if(y>Data.getChromosome(chrom).maxIndex){y=Data.getChromosome(chrom).maxIndex; x=y-readlen+1; maxDels=0;} + + String s=Data.getChromosome(chrom).getString(x, y); + +// System.out.println("Making string length "+s.length()+" from "+x+"-"+y+" of "+Data.getChromosome(chrom).maxIndex); + + //I already do this later. +// if(strand==Gene.MINUS){ +// s=AminoAcid.reverseComplementBases(s); +// } + + long id=other.numericID; + + int SNPs=0; + int INSs=0; + int DELs=0; + int SUBs=0; + +// assert(maxSnps==0 || (snpRate>.0001 && snpRate<=1)) : maxSnps+", "+snpRate; +// assert(maxInss==0 || (snpRate>.0001 && insRate<=1)) : maxInss+", "+insRate; +// assert(maxDels==0 || (snpRate>.0001 && delRate<=1)) : maxDels+", "+delRate; +// assert(maxSubs==0 || (snpRate>.0001 && subRate<=1)) : maxSubs+", "+subRate; + + while(SNPsreadlen){s=s.substring(0, readlen);} + assert(s.length()==readlen); + + // int preInsertLength=s.length(); + + int insLen=0; + for(int j=0; jreadlen); + s=s.substring(0, readlen); + } + + // if(s.length()!=readlen){ + // assert(s.length()>readlen); + // boolean start=randyCutPos.nextBoolean(); + // if(start){//Take first part of string + // s=s.substring(0, readlen); + // }else{//take last part of string + // s=s.substring(s.length()-readlen); + // } + // } + + for(int j=0; jr.start) : "DELs="+DELs+", INSs="+INSs+", SUBs="+SUBs+", SNPs="+SNPs+ + ", r.start="+r.start+", r.stop="+r.stop+", "+Data.getChromosome(chrom).maxIndex+"\n"+ + (other==null ? 
"" : "\n\n"+other.toText(false)+"\n\n"); + + + if(other.colorspace()){ + r=r.translateToColorspace(true); + r.obj=new String(r.bases); //TODO - for testing + } + + assert(sameStrand == (r.strand()==other.strand())) : "\n"+r.toText(false)+"\n"+other.toText(false)+"\n\n"+ + sameStrand+", "+r.strand()+", "+other.strand()+"\n"+r.pairnum()+", "+other.pairnum()+"\n"; + + return r; + + } + + + public void addColorspaceErrors(Read r, int errors){ + assert(r.colorspace()); + for(int i=0; i=4); + r.quality[loc]=(byte) (4+randyCSError.nextInt(r.quality[loc])); + r.bases[loc]=(byte)((r.bases[loc]+randyCSError.nextInt(3)+1)&3); + } + } + } + + +// public static int[] approxChromLengths=new int[] { +// 0, +// 600 +// }; + +// public static final int[] randomChrom=fillRandomChrom(approxChromLengths); + public static int[] randomChrom; + + private static int[] fillRandomChrom(){ + assert(Data.chromLengths!=null); + int[] in=Arrays.copyOf(Data.chromLengths, Data.chromLengths.length); + long total=Tools.sum(in); + int div=(int)(total/1000); + for(int i=0; i0 ? 1 : 0); + maxInsLen=x; + insRate=1; + }else if(a.equals("d") || a.startsWith("del")){ + maxDels=(x>0 ? 1 : 0); + maxDelLen=x; + delRate=1; + }else if(a.equals("u") || a.startsWith("sub")){ + maxSubs=(x>0 ? 
1 : 0); + maxSubLen=x; + subRate=1; + }else if(a.equals("n")){ + maxNs=x; + nRate=1; + minNLen=maxNLen=1; + }else if(a.startsWith("minchrom")){ + minChrom=x; + }else if(a.equals("int") || a.equals("interleaved") || a.equals("interleave")){ + OUTPUT_INTERLEAVED=Tools.parseBoolean(b); + if(OUTPUT_INTERLEAVED){paired=true;} + }else if(a.equals("biasedsnps")){ + BIASED_SNPS=Tools.parseBoolean(b); + }else if(a.startsWith("maxchrom")){ + maxChrom=x; + }else if(a.startsWith("build") || a.startsWith("genome")){ + build=x; +// assert(false) : "Set genome to "+x; + }else if(a.startsWith("minq")){ + minQuality=x; + midQuality=Tools.max(midQuality, minQuality); + maxQuality=Tools.max(maxQuality, minQuality); + }else if(a.startsWith("midq")){ + midQuality=x; + }else if(a.startsWith("maxq")){ + maxQuality=x; + midQuality=Tools.min(midQuality, maxQuality); + minQuality=Tools.min(minQuality, maxQuality); + }else if(a.startsWith("qual") || a.equals("q")){ + minQuality=midQuality=maxQuality=x; + }else if(a.equals("mininsert")){ + minInsert=x; + }else if(a.equals("maxinsert")){ + maxInsert=x; + }else if(a.startsWith("minmid")){ + mateMiddleMin=x; + }else if(a.startsWith("maxmid")){ + mateMiddleMax=x; + }else if(a.startsWith("paired")){ + paired=Tools.parseBoolean(b); + }else if(a.startsWith("superflat")){ + SUPERFLAT_DIST=Tools.parseBoolean(b); + }else if(a.startsWith("flat")){ + FLAT_DIST=Tools.parseBoolean(b); + }else if(a.startsWith("bell") || a.startsWith("gauss") || a.startsWith("round")){ + BELL_DIST=Tools.parseBoolean(b); + }else if(a.startsWith("unique")){ + USE_UNIQUE_SNPS=Tools.parseBoolean(b); + }else if(a.startsWith("adderrors") || a.startsWith("usequality")){ + ADD_ERRORS_FROM_QUALITY=Tools.parseBoolean(b); + }else if(a.startsWith("replacenoref")){ + REPLACE_NOREF=Tools.parseBoolean(b); + }else if(a.equals("out")){ + out=b; + }else if(a.equals("colorspace")){ + colorspace=Tools.parseBoolean(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else 
if(a.equals("ziplevel") || a.equals("zl")){ + if(x>=0){ + ReadWrite.ZIPLEVEL=Tools.min(x, 9); + } + }else if(a.equals("ext") || a.equals("extension")){ + fileExt=b; + if(fileExt==null){fileExt=".fq.gz";} + if(!fileExt.startsWith(".")){fileExt="."+fileExt;} + }else if(a.equals("perfect")){ + PERFECT_READ_RATIO=Float.parseFloat(b); + }else if(a.equals("singlescaffold")){ + FORCE_SINGLE_SCAFFOLD=Tools.parseBoolean(b); + }else if(a.equals("minoverlap") || a.equals("overlap")){ + MIN_SCAFFOLD_OVERLAP=Integer.parseInt(b); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else{throw new RuntimeException("Unknown parameter "+args[i]);} + + } +// assert(false) : OUTPUT_INTERLEAVED; + assert(build>=0) : "Please specify a genome."; + + if(minInsert>-1){mateMiddleMin=minInsert-2*maxlen;} + if(maxInsert>-1){mateMiddleMax=maxInsert-2*minlen;} + + ArrayList chromlist=null; + if(ref!=null){ + chromlist=writeRef(ref, build); + } + + Data.setGenome(build); + if(minChrom<1){minChrom=1;} + if(maxChrom<1){maxChrom=Data.numChroms;} + + if(chromlist==null){ + Data.loadChromosomes(minChrom, maxChrom); + }else{ + assert(chromlist.size()==maxChrom-minChrom+1) : chromlist.size()+", "+minChrom+", "+maxChrom; + for(ChromosomeArray cha : chromlist){ + Data.chromosomePlusMatrix[cha.chromosome]=cha; + } + } + if(Shared.TRIM_READ_COMMENTS){Data.trimScaffoldNames();} + + if(number<1){ + Data.sysout.println("No 
reads to generate; quitting."); + return; + } + + RandomReads3 rr=(seed2==Long.MIN_VALUE ? new RandomReads3(paired) : + new RandomReads3((seed2==-1 ? System.nanoTime() : seed2), paired)); + if(adapter!=null){ + rr.adapter1=adapter.getBytes(); + rr.adapter2=AminoAcid.reverseComplementBases(rr.adapter1); + rr.adapter1=rr.adapter2; //For PacBio, since adapters never appear in plus configuration + } + + if(REPLACE_NOREF){ + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + ChromosomeArray cha=Data.getChromosome(chrom); + final byte[] array=cha.array; + final byte n='N'; + for(int i=0; i=1){ + snpRate=insRate=delRate=subRate=0; + maxSnps=maxInss=maxDels=maxSubs=maxNs=0; + } + + if(delRate<=0 || maxDelLen<=0 || maxDels<=0){ + delRate=0; + maxDelLen=minDelLen=maxDels=0; + } + if(insRate<=0 || maxInsLen<=0 || maxInss<=0){ + insRate=0; + maxInsLen=minInsLen=maxInss=0; + } + if(subRate<=0 || maxSubLen<=0 || maxSubs<=0){ + subRate=0; + maxSubLen=minSubLen=maxSubs=0; + } + if(snpRate<=0 || maxSnps<=0){ + snpRate=0; + maxSnps=0; + } + if(nRate<=0 || maxNLen<=0 || maxNs<=0){ + nRate=0; + maxNLen=minNLen=maxNs=0; + } + + System.err.println("snpRate="+snpRate+", max="+maxSnps+", unique="+USE_UNIQUE_SNPS); + System.err.println("insRate="+insRate+", max="+maxInss+", len=("+minInsLen+"-"+maxInsLen+")"); + System.err.println("delRate="+delRate+", max="+maxDels+", len=("+minDelLen+"-"+maxDelLen+")"); + System.err.println("subRate="+subRate+", max="+maxSubs+", len=("+minSubLen+"-"+maxSubLen+")"); + System.err.println("nRate ="+nRate+", max="+maxNs+", len=("+minNLen+"-"+maxNLen+")"); + System.err.println("genome="+Data.GENOME_BUILD); + System.err.println("PERFECT_READ_RATIO="+PERFECT_READ_RATIO); + System.err.println("ADD_ERRORS_FROM_QUALITY="+ADD_ERRORS_FROM_QUALITY); + System.err.println("REPLACE_NOREF="+REPLACE_NOREF); + System.err.println("paired="+paired); + System.err.println("read length="+(minlen==maxlen ? 
""+minlen : minlen+"-"+maxlen)); + if(paired){ + System.err.println("insert size="+(mateMiddleMin+2*minlen)+"-"+(mateMiddleMax+2*maxlen)); + } + +// assert(false) : OUTPUT_INTERLEAVED; + String fname1="reads_B"+Data.GENOME_BUILD+"_"+number+"x"+maxlen+"bp_" + +(maxSnps==0 || snpRate==0 ? 0 : maxSnps)+"S_"+(maxInss==0 || insRate==0 ? 0 : +maxInsLen)+"I_"+(maxDels==0 || delRate==0 ? 0 : maxDelLen)+"D_"+ + (maxSubs==0 || subRate==0 ? 0 : maxSubLen)+"U_"+ + (maxNs==0 || nRate==0 ? 0 : maxNs)+"N"/*+"_chr"+minChrom+"-"+maxChrom*/+(paired ? (OUTPUT_INTERLEAVED ? "_interleaved" : "_1") : "")+fileExt; + + String fname2=(!paired || OUTPUT_INTERLEAVED) ? null : "reads_B"+Data.GENOME_BUILD+"_"+number+"x"+maxlen+"bp_" + +(maxSnps==0 || snpRate==0 ? 0 : maxSnps)+"S_"+(maxInss==0 || insRate==0 ? 0 : +maxInsLen)+"I_"+(maxDels==0 || delRate==0 ? 0 : maxDelLen)+"D_"+ + (maxSubs==0 || subRate==0 ? 0 : maxSubLen)+"U_"+ + (maxNs==0 || nRate==0 ? 0 : maxNs)+"N"/*+"_chr"+minChrom+"-"+maxChrom*/+"_2"+fileExt; + + if(out!=null){ + fname1=out.replaceFirst("#", "1"); + fname2=(!out.contains("#") || !paired || OUTPUT_INTERLEAVED) ? 
null : out.replaceFirst("#", "2"); + } + + rr.writeRandomReadsX(number, minlen, maxlen, + maxSnps, maxInss, maxDels, maxSubs, maxNs, + snpRate, insRate, delRate, subRate, nRate, + minInsLen, minDelLen, minSubLen, minNLen, + maxInsLen, maxDelLen, maxSubLen, maxNLen, + minChrom, maxChrom, colorspace, minQuality, midQuality, maxQuality, fname1, fname2); + + t.stop(); + Data.sysout.println("Wrote "+fname1); + if(fname2!=null){Data.sysout.println("Wrote "+fname2);} + Data.sysout.println("Time: \t"+t); + + } + + + private static ArrayList writeRef(String reference, int build){ + ArrayList chromlist=null; + if(reference!=null){ + { + File f=new File(reference); + if(!f.exists() || !f.isFile() || !f.canRead()){throw new RuntimeException("Cannot read file "+f.getAbsolutePath());} + } + { + String s=align2.IndexMaker4.fname(1, 1, 13, 1, false); + String dir=new File(s).getParent(); + dir=dir.replace('\\', '/'); + dir=dir.replace("ref/index/", "ref/genome/"); + String sf=dir+"/summary.txt"; + if(!NODISK && new File(sf).exists() && SummaryFile.compare(sf, reference)){ + //do nothing + System.err.println("NOTE:\tIgnoring reference file because it already appears to have been processed."); + System.err.println("NOTE:\tIf you wish to regenerate the index, please manually delete "+dir+"/summary.txt"); + return null; + } + File f=new File(dir); + if(f.exists()){ + File[] f2=f.listFiles(); + if(f2!=null && f2.length>0){ + if(OVERWRITE){ + Data.sysout.println("NOTE:\tDeleting contents of "+dir+" because reference is specified and overwrite="+OVERWRITE); + for(File f3 : f2){ + if(f3.isFile()){ + String f3n=f3.getName(); + if((f3n.contains(".chrom") || f3n.endsWith(".txt") || f3n.endsWith(".txt.gz")) && !f3n.endsWith("list.txt")){ + f3.delete(); + } + } + } + }else{ + Data.sysout.println(Arrays.toString(f2)); + throw new RuntimeException("\nThere is already a reference at location '"+f.getAbsolutePath()+"'. 
" + + "Please delete it (and the associated index), or use a different build ID, " + + "or remove the 'reference=' parameter from the command line, or set overwrite=true."); + } + } + } + dir=dir.replace("ref/genome/", "ref/index/"); + f=new File(dir); + if(f.exists()){ + File[] f2=f.listFiles(); + if(f2!=null && f2.length>0){ + if(OVERWRITE){ + Data.sysout.println("NOTE:\tDeleting contents of "+dir+" because reference is specified and overwrite="+OVERWRITE); + for(File f3 : f2){ + if(f3.isFile()){f3.delete();} + } + }else{ + throw new RuntimeException("\nThere is already an index at location '"+f.getAbsolutePath()+"'. " + + "Please delete it, or use a different build ID, or remove the 'reference=' parameter from the command line."); + } + } + } + } + + Data.sysout.println("Writing reference."); + + int oldzl=ReadWrite.ZIPLEVEL; + ReadWrite.ZIPLEVEL=Tools.max(4, ReadWrite.ZIPLEVEL); + + int minScaf=-1; + int midPad=500; + int maxChromLen=-1; + boolean genScaffoldInfo=true; + + maxChromLen=maxChromLen>0 ? maxChromLen : FastaToChromArrays.MAX_LENGTH; + minScaf=minScaf>-1 ? minScaf : FastaToChromArrays.MIN_SCAFFOLD; + midPad=midPad>-1 ? 
midPad : FastaToChromArrays.MID_PADDING; + String[] ftcaArgs=new String[] {reference, ""+build, "writeinthread=false", "genscaffoldinfo="+genScaffoldInfo, "retain", "waitforwriting=false", + "gzip="+(Data.CHROMGZ), "chromc="+Data.CHROMC, "maxlen="+maxChromLen, + "writechroms="+(!NODISK), "minscaf="+minScaf, "midpad="+midPad, "nodisk="+NODISK}; + + chromlist=FastaToChromArrays.main2(ftcaArgs); + + ReadWrite.ZIPLEVEL=oldzl; + } + return chromlist; + } + + + public RandomReads3(boolean paired_){ + this(getSeed(), paired_); + } + + public RandomReads3(long seed, boolean paired_){ + if(randomChrom==null){ + synchronized(getClass()){ + if(randomChrom==null){ + randomChrom=fillRandomChrom(); + } + } + } + randy=new Random(seed+1); + randy2=new Random(seed+2); + randyMutationType=new Random(seed+3); + randyCSError=new Random(seed+4); + randyQual=new Random(seed+5); + randyAdapter=new Random(seed+25); + paired=paired_; + + randyPerfectRead=new Random(seed+20); + randyLength=new Random(seed+21); + randyAmp=new Random(seed+22); + + if(paired){ + randyMate=new Random(seed+6); + randy2Mate=new Random(seed+7); + randyMutationTypeMate=new Random(seed+8); + randyCSErrorMate=new Random(seed+9); + randyQualMate=new Random(seed+10); + randyAdapterMate=new Random(seed+30); + }else{ + randyMate=null; + randy2Mate=null; + randyMutationTypeMate=null; + randyCSErrorMate=null; + randyQualMate=null; + randyAdapterMate=null; + } + + if(REPLACE_NOREF){ + randyNoref=new Random(seed+31); + }else{ + randyNoref=null; + } + } + + private final void addErrorsFromQuality(Read r, Random randy){ + for(int i=0; i=0 && j0){ + num=(oldNum^3); + }else{ + num=(oldNum+rand.nextInt(3)+1)%4; + } + assert(num>=0 && num<=3 && num!=oldNum); + bases[index]=AminoAcid.numberToBase[num]; + return bases; + } + + public byte[] addSNP(byte[] bases, int[] locs, int readlen, Random rand, BitSet bits){ + assert(readlen<=bases.length); + int index=rand.nextInt(readlen); + + while(bits.get(index)){ + 
index=rand.nextInt(readlen); + } + bits.set(index); + + byte old=bases[index]; + byte oldNum=AminoAcid.baseToNumber[old]; + if(oldNum<0){oldNum=0;} + int num; + if(BIASED_SNPS && rand.nextInt(3)>0){ + num=(oldNum^3); + }else{ + num=(oldNum+rand.nextInt(3)+1)%4; + } + assert(num>=0 && num<=3 && num!=oldNum) : num+", "+oldNum; + bases[index]=AminoAcid.numberToBase[num]; + return bases; + } + + + public byte[] addSUB(byte[] bases, int[] locs, int minlen, int maxlen, int readlen, Random rand){ + assert(readlen<=bases.length) : readlen+", "+bases.length; + assert(minlen>=1); + assert(maxlen>=minlen); + +// int len=minlen+randy2.nextInt(maxlen-minlen+1); + int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); +// int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); + + assert(len>=minlen); + assert(len<=maxlen); + +// System.err.println(minlen+", "+maxlen+", "+readlen+", "+s.length()); + + int index=rand.nextInt(readlen-len+1); + + int lim=index+len-1; + + {//Change first and last to anything except old + int i=index; + + byte old=bases[i]; + if(AminoAcid.isFullyDefined(old)){ + byte oldNum=AminoAcid.baseToNumber[old]; + int num=(oldNum+rand.nextInt(4))%4; + assert(num>=0 && num<=3); + byte base=AminoAcid.numberToBase[num]; + bases[i]=base; + } + + i=lim; + old=bases[i]; + if(AminoAcid.isFullyDefined(old)){ + byte oldNum=AminoAcid.baseToNumber[old]; + int num=(oldNum+rand.nextInt(4))%4; + assert(num>=0 && num<=3); + byte base=AminoAcid.numberToBase[num]; + bases[i]=base; + } + } + + for(int i=index+1; i=0 && num<=3 && num!=oldNum); + byte base=AminoAcid.numberToBase[num]; + bases[i]=base; + } + } + return bases; + } + + + public byte[] addN(byte[] bases, int[] locs, int minlen, int maxlen, int readlen, Random rand, BitSet bits){ + assert(readlen<=bases.length) : readlen+", "+bases.length; + assert(minlen>=1); + assert(maxlen>=minlen); + +// int 
len=minlen+randy2.nextInt(maxlen-minlen+1); + int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); + + assert(len>=minlen); + assert(len<=maxlen); + +// System.err.println(minlen+", "+maxlen+", "+readlen+", "+s.length()); + + int index=rand.nextInt(readlen-len+1); + if(bits!=null){ + int trials=40; + while(bits.get(index) && (trials--)>0){ + index=rand.nextInt(readlen-len+1); + } + bits.set(index); + } + + int lim=index+len-1; + + for(int i=index; i<=lim; i++){bases[i]='N';} + + return bases; + } + + public byte[] addInsertion(byte[] bases, int[] locs, int minlen, int maxlen, int readlen, int[] dif, Random rand){ + assert(readlen<=bases.length) : readlen+", "+bases.length; + assert(minlen>0); + assert(maxlen>=minlen); + +// int len=minlen+randy2.nextInt(maxlen-minlen+1); + int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); +// int len=minlen+Tools.min(rand.nextInt(maxlen-minlen+1), rand.nextInt(maxlen-minlen+1)); + + len=Tools.min(len, readlen-dif[1]-2); + if(len<1){return bases;} + + dif[0]-=len; + dif[1]+=len; + + int index=rand.nextInt(readlen-len+1); //Assures that all inserted bases will be within the read + +// System.err.println("Added insertion "+len+" at "+index); + + byte[] bases2=new byte[bases.length+len]; + for(int i=0; i=index; i--, j--){ +// if(verbose){ +// System.err.println("i="+i+", bases.length="+bases.length+", j="+j+", bases2.length="+bases2.length+", locs.length="+locs.length+"\n"+Arrays.toString(locs)); +// } + if(j=index; i--){ +// bases2[i+len]=bases[i]; +//// locs[i+len]=locs[i]; +// } +// final int locfill=locs[(index==0 ? 
0 : index-1)]; + for(int i=index; i0); + assert(maxlen>=minlen); + int[] delsa=new int[dels]; + for(int i=0; i=readlen+len) : "bases.len="+bases.length+", readlen="+readlen+", len="+len+", dif="+Arrays.toString(dif); + assert(len>0); + + dif[0]+=len; + +// int index=randy2.nextInt(s.length()-len); + int index=1+rand.nextInt(readlen-1); //Assures there will never be a deletion of the first base, which would not technically be a deletion. + +// System.err.println("Added deletion "+len+" at "+index); + + byte[] bases2=new byte[bases.length-len]; + for(int i=0; imaxChrom){ + x=randy.nextInt(); + chrom=randomChrom[(x&0x7FFFFFFF)%randomChrom.length]; + } + return chrom; + } + + public int randomStrand(Read r0, int minChrom, int maxChrom, boolean sameStrandMate){ + if(r0!=null){ + return sameStrandMate ? r0.strand() : r0.strand()^1; + } + return randy.nextInt()&1; + } + + public int randomLoc(Read r0, int chrom, int readlen, int minMiddle, int maxMiddle, int strand){ + + if(r0!=null){ + return randomLocPaired(r0, chrom, readlen, minMiddle, maxMiddle, strand); + } + return randomLocSingle(chrom, readlen); + } + + public int randomLocPaired(Read r0, int chrom, int readlen, int minMiddle, int maxMiddle, int strand){ + assert(r0!=null); + + final int midRange=maxMiddle-minMiddle+1; + int middle=(maxMiddle+minMiddle)/2; + if(SUPERFLAT_DIST){ + // Data.sysout.print(other.numericID); + middle=((int)(r0.numericID%midRange))+minMiddle; + // Data.sysout.println("\t"+middle); + }else if(FLAT_DIST){ + middle=randyMate.nextInt(midRange)+minMiddle; + }else if(BELL_DIST){ + assert(false) : "TODO"; + }else{ + middle=(randyMate.nextInt(midRange)+randyMate.nextInt(midRange))/2+minMiddle; + } + + // Data.sysout.println(sameStrand+": "+other.strand+" -> "+strand); + int x; + if(r0.strand()==Gene.PLUS){ + x=r0.stop+middle+1; + }else{ + x=r0.start-middle-readlen; + } + return x; + } + + public int randomLocSingle(int chrom, int readlen){ + ChromosomeArray cha=Data.getChromosome(chrom); + 
byte[] array=cha.array; + if(readlen>=(cha.maxIndex-cha.minIndex)){return -1;} + + int loc=-1; + for(int i=0; loc<0 && i<24; i++){ + loc=randy.nextInt(cha.maxIndex-readlen); + for(int j=0; j=0 && maxQual<60); + + final int maxQualP=maxQual;//Tools.max(35, maxQual); + final int midQualP=midQual;//30; + final int minQualP=minQual;//Tools.min(25, maxQual); + + final BitSet bits=new BitSet(maxlen+1); + final int[] locs=new int[(int)Tools.min(300000000, maxlen+(maxDelLen*(long)maxDels))]; + + Read lastRead=null; + int ampLevel=0; + int ampLength=2000; + + for(int i=0; i1 && lastRead!=null){ + if(ampLevel>0){ + forceChrom=lastRead.chrom; +// forceLoc=lastRead.start+4-randyAmp.nextInt(9); +// forceLoc=lastRead.start+10-randyAmp.nextInt(21); + + int a=ampLength; + int b=a*2+1; + if(randyAmp.nextBoolean()){ + forceLoc=lastRead.start+a-randyAmp.nextInt(b); + }else{ +// if(randyAmp.nextBoolean()){ +// forceLoc=lastRead.start+a-(randyAmp.nextInt(b)+randyAmp.nextInt(b))/2; +// }else{ + forceLoc=lastRead.start+a-(randyAmp.nextInt(b)+randyAmp.nextInt(b)+randyAmp.nextInt(b))/3; +// } + } + }else{ + + int a1=AMP; + if(randyAmp.nextInt(30)==0){a1*=7;} + + if(randyAmp.nextInt(3)>0){ + ampLevel=Tools.min(randyAmp.nextInt(a1), randyAmp.nextInt(a1)); + }else{ + double log=Math.log10(a1*7); + ampLevel=(int)Math.round(Math.pow(10, randyAmp.nextDouble()*log)); + } + + ampLength=500+randyAmp.nextInt(3001); +// ampLevel=randyAmp.nextInt(AMP); + } + } + + Read r1=makeRead(null, minlen, maxlen, minChrom, maxChrom, + maxSnps, maxInss, maxDels, maxSubs, maxNs, + snpRate, insRate, delRate, subRate, nRate, + minInsLen, minDelLen, minSubLen, minNLen, + maxInsLen, maxDelLen, maxSubLen, maxNLen, + mateMiddleMin, mateMiddleMax, mateSameStrand, + minQual, midQual, maxQual, baseQuality, slant, + perfect, colorspace, nextReadID, locs, bits, forceChrom, forceLoc); + +// assert(false) : r1; + if(paired && r1!=null){ + + Read r2=null; + for(int tries=0; r2==null && tries<100; tries++){ + r2=makeRead(r1, 
minlen, maxlen, minChrom, maxChrom, + maxSnps, maxInss, maxDels, maxSubs, maxNs, + snpRate, insRate, delRate, subRate, nRate, + minInsLen, minDelLen, minSubLen, minNLen, + maxInsLen, maxDelLen, maxSubLen, maxNLen, + mateMiddleMin, mateMiddleMax, mateSameStrand, + minQual, midQual, maxQual, baseQuality, slant, + perfect, colorspace, nextReadID, locs, bits, -1, -1); + } + + if(r2!=null){ + r1.mate=r2; + r2.mate=r1; + }else{ + r1=null; + } + +// Data.sysout.println(r.strand()+"\t"+r.insertSize()); + } + if(r1!=null){ +// assert(false) : r1; + tsw1.println(r1); + if(r1.mate!=null){ + r1.mate.setPairnum(1); + if(tsw2!=null){tsw2.println(r1.mate);} + else{tsw1.println(r1.mate);} + + } + nextReadID++; + }else{ + i--; + } + ampLevel=Tools.max(0, ampLevel-1); + if(ampLevel==0){lastRead=null;} + + if(lastRead==null){lastRead=r1;} +// System.err.println("Made "+r.start+" ~ "+r.stop+" = "+(r.stop-r.start)); + } + tsw1.poison(); + if(tsw2!=null){tsw2.poison();} + } + + public Read makeRead(Read r0, int minlen, int maxlen, int minChrom, int maxChrom, + int maxSnps, int maxInss, int maxDels, int maxSubs, int maxNs, + float snpRate, float insRate, float delRate, float subRate, float nRate, + int minInsLen, int minDelLen, int minSubLen, int minNLen, + int maxInsLen, int maxDelLen, int maxSubLen, int maxNLen, + int minMiddle, int maxMiddle, boolean sameStrand, + int minQual, int midQual, int maxQual, byte baseQuality, byte slant, + boolean perfect, boolean colorspace, long rid, int[] locs, BitSet bits, + int FORCE_CHROM, int FORCE_LOC){ + +// verbose=(rid==3860); + + int SNPs=0; + int INSs=0; + int DELs=0; + int SUBs=0; + int Ns=0; + int adapters=0; + + while(SNPs0) : cha.get(y); + } + + if(loc<0){ + if(verbose){ + System.err.println("Bad values; returning null."); + } + return null; + } + + final ChromosomeArray cha=Data.getChromosome(chrom); + if(readlen>=(cha.maxIndex-cha.minIndex)){ + if(verbose){ + System.err.println("Too long; returning null."); + } + return null; + } + 
if(loc>=cha.maxIndex || loc<0){return null;} + byte[] bases=cha.getBytes(loc, loc+inititallen0-1); + assert(bases[0]>0 && bases[bases.length-1]>0) : Arrays.toString(bases); + assert(strand==Gene.MINUS || strand==Gene.PLUS); + + for(int i=0; ireadlen){bases=Arrays.copyOf(bases, readlen);} + assert(bases.length==readlen); + + for(int j=0; j=0; i--){ + if(locs[i]<0){locs[i]=locs[i+1];} + } + final int x=locs[0], y=locs[bases.length-1]; + if(verbose){ + System.err.println("After adding SNPs, SUBs, Ns, and fixing locs: "); + System.err.println("'"+new String(bases)+"'"); + System.err.println(Arrays.toString(Arrays.copyOf(locs, Tools.min(locs.length, bases.length)))); + } + +// if(FORCE_LOC>=0 || FORCE_CHROM>=0){ +// if(y<0 || y+readlen>) +// } + assert(FORCE_LOC>=0 || FORCE_CHROM>=0 || y<=cha.maxIndex) : y+", "+r0; + assert(FORCE_LOC>=0 || FORCE_CHROM>=0 || cha.get(y)>0) : cha.get(y); + + if(strand==Gene.MINUS){ + AminoAcid.reverseComplementBasesInPlace(bases); + //Reverse loc array; not really necessary + for(int i=0, lim=bases.length/2; ir.start) : r; + + if(colorspace){ + r=r.translateToColorspace(true); + r.obj=new String(r.bases); //TODO - for testing + } + r.mapLength=r.bases.length; + if(adapters>0){r.setHasAdapter(true);} + if(FORCE_SINGLE_SCAFFOLD && !Data.isSingleScaffold(r.chrom, r.start, r.stop)){return null;} + if(MIN_SCAFFOLD_OVERLAP>0 && Data.scaffoldOverlapLength(r.chrom, r.start, r.stop)=4); + r.quality[loc]=(byte) (4+randyCSError.nextInt(r.quality[loc])); + r.bases[loc]=(byte)((r.bases[loc]+randyCSError.nextInt(3)+1)&3); + } + } + } + + private static int[] randomChrom; + + private static int[] fillRandomChrom(){ + + int[] in=Arrays.copyOf(Data.chromLengths, Data.chromLengths.length); + long total=Tools.sum(in); + int div=(int)(total/8192); + for(int i=0; i{ + + @Override + public int compare(Read r1, Read r2) { + + int a=(r1.errors+(r1.mate==null ? 0 : r1.mate.errors)); + int b=(r2.errors+(r2.mate==null ? 
0 : r2.mate.errors)); + if(a!=b){return a-b;} + + a=(r1.bases.length+(r1.mate==null ? 0 : r1.mate.bases.length)); + b=(r2.bases.length+(r2.mate==null ? 0 : r2.mate.bases.length)); + if(a!=b){return b-a;} + + float a2=(r1.expectedErrors()+(r1.mate==null ? 0 : r1.mate.expectedErrors())); + float b2=(r2.expectedErrors()+(r2.mate==null ? 0 : r2.mate.expectedErrors())); + if(a2!=b2){return a2>b2 ? 1 : -1;} + + if(r1.numericIDr2.numericID){return 1;} + + if(!r1.id.equals(r2.id)){return r1.id.compareTo(r2.id);} + return 0; + } + + public static final ReadErrorComparator comparator=new ReadErrorComparator(); + +} diff --git a/current/align2/ReadLengthComparator.java b/current/align2/ReadLengthComparator.java new file mode 100755 index 0000000..99003bd --- /dev/null +++ b/current/align2/ReadLengthComparator.java @@ -0,0 +1,48 @@ +package align2; + +import java.util.Comparator; + +import stream.Read; + +/** + * Sorts longest reads first + * @author Brian Bushnell + * @date Jul 19, 2013 + * + */ +public final class ReadLengthComparator implements Comparator { + + private ReadLengthComparator(){} + + @Override + public int compare(Read a, Read b) { + int x=compare2(a, b); + if(x==0){x=compare2(a.mate, b.mate);} + if(x==0){x=a.id.compareTo(b.id);} + if(x==0){x=a.numericID>b.numericID ? 1 : a.numericID0){x.insertHist.increment(i, y);} + } + } + + } + return x; + } + + public void addToQualityHistogram(final Read r){ + if(r==null){return;} + addToQualityHistogram(r, r.obj, 0); + if(r.mate!=null){addToQualityHistogram(r.mate, r.obj, 1);} + } + + private void addToQualityHistogram(final Read r, Object obj, final int pairnum){ + if(r==null || r.quality==null || r.quality.length<1){return;} + final byte[] qual; + if(obj!=null && obj.getClass()==TrimRead.class){ + qual=(pairnum==0 ? 
((TrimRead)obj).qual1 : ((TrimRead)obj).qual2); + }else{ + qual=r.quality; + } + final int limit=Tools.min(qual.length, MAXLEN); + final long[] ql=qualLength[pairnum], qs=qualSum[pairnum]; + ql[limit-1]++; + for(int i=0; i0){insertHist.increment(x, 1);} +// assert(x!=1) : "\n"+r+"\n\n"+r.mate+"\n"; +// System.out.println("Incrementing "+x); + } + + public void writeQualityToFile(String fname, boolean writePaired){ + TextStreamWriter tsw=new TextStreamWriter(fname, OVERWRITE, false, false); + tsw.start(); + tsw.print("#BaseNum\tRead1"+(writePaired ? "\tRead2" : "")+"\n"); + + final long[] qs1=qualSum[0], qs2=qualSum[1], ql1=qualLength[0], ql2=qualLength[1]; + + for(int i=MAXLEN-2; i>=0; i--){ + ql1[i]+=ql1[i+1]; + ql2[i]+=ql2[i+1]; + } + + if(writePaired){ + for(int i=0; i0 || ql2[i]>0); i++){ + int a=i+1; + double b=qs1[i]/(double)Tools.max(1, ql1[i]); + double c=qs2[i]/(double)Tools.max(1, ql2[i]); + tsw.print(String.format("%d\t%.3f\t%.3f\n", a, b, c)); + } + }else{ + for(int i=0; i0; i++){ + int a=i+1; + double b=qs1[i]/(double)Tools.max(1, ql1[i]); + tsw.print(String.format("%d\t%.3f\n", a, b)); + } + } + tsw.poison(); + tsw.waitForFinish(); + } + + public void writeMatchToFile(String fname, boolean writePaired){ + if(!writePaired){ + writeMatchToFileUnpaired(fname); + return; + } + TextStreamWriter tsw=new TextStreamWriter(fname, OVERWRITE, false, false); + tsw.start(); + tsw.print("#BaseNum\tMatch1\tSub1\tDel1\tIns1\tN1\tOther1\tMatch2\tSub2\tDel2\tIns2\tN2\tOther2\n"); + + final long[] ms1=matchSum[0], ds1=delSum[0], is1=insSum[0], + ss1=subSum[0], ns1=nSum[0], cs1=clipSum[0], os1=otherSum[0]; + final long[] ms2=matchSum[1], ds2=delSum[1], is2=insSum[1], + ss2=subSum[1], ns2=nSum[1], cs2=clipSum[1], os2=otherSum[1]; + + for(int i=0; i0 || !skipZeroInsertCount){ + tsw.print(i+"\t"+x+"\t"+"\n"); + } + } + tsw.poison(); + tsw.waitForFinish(); + } + + public final long[][] qualLength; + public final long[][] qualSum; + + public final long[][] matchSum; + public 
final long[][] delSum; + public final long[][] insSum; + public final long[][] subSum; + public final long[][] nSum; + public final long[][] clipSum; + public final long[][] otherSum; + + public final LongList insertHist; + + public static final int MAXLEN=2000; + public static final int MAXINSERTLEN=24000; + + public static ArrayList objectList=new ArrayList(); + public static boolean COLLECT_QUALITY_STATS=false; + public static boolean COLLECT_MATCH_STATS=false; + public static boolean COLLECT_INSERT_STATS=false; + public static String QUAL_HIST_FILE=null; + public static String MATCH_HIST_FILE=null; + public static String INSERT_HIST_FILE=null; + public static boolean OVERWRITE=false; + public static final boolean verbose=false; + + public static boolean skipZeroInsertCount=true; + +} diff --git a/current/align2/ReadToSam.java b/current/align2/ReadToSam.java new file mode 100755 index 0000000..58eac5e --- /dev/null +++ b/current/align2/ReadToSam.java @@ -0,0 +1,176 @@ +package align2; + +import java.io.File; +import java.util.ArrayList; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.ReadStreamStringWriter; +import stream.ReadStreamWriter; + +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; + +public class ReadToSam { + + + public static void main(String[] args){ + + for(String arg : args){ + if(arg.startsWith("b=") || arg.startsWith("build=")){ + String[] split=arg.split("="); + Data.setGenome(Integer.parseInt(split[1])); + } + } + + System.err.println("Using header for build "+Data.GENOME_BUILD); + String reads1=args[0]; + String reads2=args[1].equalsIgnoreCase("null") ? null : args[1]; + String outname=args[2].equalsIgnoreCase("null") ? 
"" : args[2]; + + ReadToSam smr=new ReadToSam(reads1, reads2, outname); + smr.process(); + + } + + public ReadToSam(String fname1, String fname2, String outname_){ + this(new RTextInputStream(fname1, fname2, -1), outname_); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + } + + public ReadToSam(RTextInputStream stream_, String outname_){ + stream=stream_; + outname=outname_; + paired=stream.paired(); +// assert(outname.contains("#")) : "Output file name must contain the character '#' to be used for chromosome number."; + + cris=(USE_CRIS ? new ConcurrentReadInputStream(stream, -1) : null); + } + + public void process(){ + + Timer t=new Timer(); + t.start(); + + final String fname1=outname.replaceFirst("#", "1"); + if(fname1!=null && new File(fname1).exists()){throw new RuntimeException("Destination file "+fname1+" already exists.");} + + ReadStreamWriter wt1=new ReadStreamStringWriter(fname1, true, 4, true, fname1.endsWith(".bam"), false, false, false, false, false, false, true); + + Thread wtt1=new Thread(wt1); + +// while(t.hashCode()!=0){} + + if(wtt1!=null){wtt1.start();} + + assert(USE_CRIS); + new Thread(cris).start(); + System.err.println("Started cris"); + +// System.err.println + + long count=0; + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + while(reads!=null && reads.size()>0){ + + ArrayList reads2=new ArrayList(reads.size()); + + if(paired){ + for(Read r : reads){ +// r.setPaired(false); +// if(r.mapped() && !r.discarded() && r.valid()){ +// reads2.add(r); +// } + assert(r!=null); + assert(r.mate!=null); + reads2.add(r); + reads2.add(r.mate); + } + }else{ + for(Read r : reads){ + r.setPaired(false); + if(r.mapped() && !r.discarded() && r.valid()){ + reads2.add(r); + } + } + } + +// ArrayList reads2=(ArrayList) reads.clone(); + + if(wt1!=null){wt1.addList(reads2);} + + // System.err.println("Added list of length "+reads.size()); + + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + } + + //Add poison + // if(wt1!=null){wt1.addList(null);} + // if(wt2!=null){wt2.addList(null);} + wt1.poison(); + + while(wtt1.isAlive()){ + try { + wtt1.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + t.stop(); + Data.sysout.println("Time:\t"+t); + +// if(cris!=null){ +// new Thread(cris).start(); +// ListNum ln=cris.nextList(); +// ArrayList reads=(ln!=null ? ln.list : null); +// +// while(reads!=null && reads.size()>0){ +// processReads(reads); +// cris.returnList(ln, ln.list.isEmpty()); +// ln=cris.nextList(); +// reads=(ln!=null ? 
ln.list : null); +// } +// cris.returnList(ln, ln.list.isEmpty()); +// }else{ +// ArrayList reads=stream.nextList(); +// while(reads!=null && reads.size()>0){ +// processReads(reads); +// reads=stream.nextList(); +// } +// } +// +// synchronized(this){this.notifyAll();} +// +// finish(); + } + + + public final String outname; + private final RTextInputStream stream; + private final ConcurrentReadInputStream cris; + + public final boolean paired; + + public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads. + + public static final int WRITE_BUFFER=400; //Bigger number uses more memory, for less frequent writes. + + +} diff --git a/current/align2/RefToIndex.java b/current/align2/RefToIndex.java new file mode 100755 index 0000000..8e29dfe --- /dev/null +++ b/current/align2/RefToIndex.java @@ -0,0 +1,146 @@ +package align2; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; + +import dna.ChromosomeArray; +import dna.Data; +import dna.FastaToChromArrays; +import fileIO.ReadWrite; +import fileIO.SummaryFile; + +/** + * @author Brian Bushnell + * @date Sep 25, 2013 + * + */ +public class RefToIndex { + + public static void makeIndex(String reference, int build, PrintStream sysout, int keylen){ + assert(reference!=null); + { + File f=new File(reference); + if(!f.exists() || !f.isFile() || !f.canRead()){ + if(!reference.startsWith("stdin")){ + throw new RuntimeException("Cannot read file "+f.getAbsolutePath()); + } + } + } + + String s=IndexMaker4.fname(1, 1, keylen, 1, false); + String dir=new File(s).getParent(); + dir=dir.replace('\\', '/'); + final String base=dir.substring(0, dir.length()-7); + final String args=(Shared.COMMAND_LINE==null ? "null" : Arrays.toString(Shared.COMMAND_LINE)); + final String indexlog=base+"build"+build+"_"+ + (System.nanoTime()&Long.MAX_VALUE)+"."+((args==null ? (reference==null ? 
"null" : reference) : args).hashCode()&Integer.MAX_VALUE)+".log"; + dir=dir.replace("ref/index/", "ref/genome/"); + String sf=dir+"/summary.txt"; + if(!NODISK && new File(sf).exists() && SummaryFile.compare(sf, reference)){ + //do nothing + if(LOG && !NODISK){ + if(!new File(base).exists()){new File(base).mkdirs();} + ReadWrite.writeString(new Date()+"\nFound an already-written genome for build "+build+".\n"+args+"\n", indexlog, true); + } + sysout.println("NOTE:\tIgnoring reference file because it already appears to have been processed."); + sysout.println("NOTE:\tIf you wish to regenerate the index, please manually delete "+dir+"/summary.txt"); + }else{ + if(NODISK){} + else{//Delete old data if present + File f=new File(dir); + if(f.exists()){ + File[] f2=f.listFiles(); + if(f2!=null && f2.length>0){ + if(OVERWRITE || f2[0].getAbsolutePath().equals(new File(reference).getAbsolutePath())){ + sysout.println("NOTE:\tDeleting contents of "+dir+" because reference is specified and overwrite="+OVERWRITE); + if(LOG && !NODISK){ReadWrite.writeString(new Date()+"\nDeleting genome for build "+build+".\n"+args+"\n", indexlog, true);} + for(File f3 : f2){ + if(f3.isFile()){ + String f3n=f3.getName(); + if((f3n.contains(".chrom") || f3n.endsWith(".txt") || f3n.endsWith(".txt.gz")) && !f3n.endsWith("list.txt")){ + f3.delete(); + } + } + } + }else{ + sysout.println(Arrays.toString(f2)); + if(LOG && !NODISK){ReadWrite.writeString(new Date()+"\nFailed to overwrite genome for build "+build+".\n"+args+"\n", indexlog, true);} + throw new RuntimeException("\nThere is already a reference at location '"+f.getAbsolutePath()+"'. 
" + + "Please delete it (and the associated index), or use a different build ID, " + + "or remove the 'reference=' parameter from the command line, or set overwrite=true."); + } + } + } + + dir=dir.replace("ref/genome/", "ref/index/"); + f=new File(dir); + if(f.exists()){ + File[] f2=f.listFiles(); + if(f2!=null && f2.length>0){ + if(OVERWRITE){ + sysout.println("NOTE:\tDeleting contents of "+dir+" because reference is specified and overwrite="+OVERWRITE); + if(LOG && !NODISK){ReadWrite.writeString(new Date()+"\nDeleting index for build "+build+".\n"+args+"\n", indexlog, true);} + for(File f3 : f2){ + if(f3.isFile()){f3.delete();} + } + }else{ + if(LOG && !NODISK){ReadWrite.writeString(new Date()+"\nFailed to overwrite index for build "+build+".\n"+args+"\n", indexlog, true);} + throw new RuntimeException("\nThere is already an index at location '"+f.getAbsolutePath()+"'. " + + "Please delete it, or use a different build ID, or remove the 'reference=' parameter from the command line."); + } + } + } + } + + if(!NODISK){ + sysout.println("Writing reference."); + if(LOG && !NODISK){ + if(!new File(base).exists()){new File(base).mkdirs();} + ReadWrite.writeString(new Date()+"\nWriting genome for build "+build+".\n"+args+"\n", indexlog, true); + } + } + + int oldzl=ReadWrite.ZIPLEVEL; + ReadWrite.ZIPLEVEL=Tools.max(4, ReadWrite.ZIPLEVEL); + + //assert(false) : "minScaf="+minScaf+", midPad="+midPad+", maxChromLen="+maxChromLen+ + // ", startPad="+startPad+", stopPad="+stopPad+", FastaToChromArrays.END_PADDING="+FastaToChromArrays.END_PADDING; + + maxChromLen=maxChromLen>0 ? maxChromLen : AUTO_CHROMBITS ? FastaToChromArrays.MAX_LENGTH : ((1L<<(31-(chrombits<0 ? 2 : chrombits)))-200000); + minScaf=minScaf>-1 ? minScaf : FastaToChromArrays.MIN_SCAFFOLD; + midPad=midPad>-1 ? midPad : FastaToChromArrays.MID_PADDING; + startPad=startPad>-1 ? startPad : FastaToChromArrays.START_PADDING; + stopPad=stopPad>-1 ? 
stopPad : FastaToChromArrays.END_PADDING; + + String[] ftcaArgs=new String[] {reference, ""+build, "writeinthread=false", "genscaffoldinfo="+genScaffoldInfo, "retain", "waitforwriting=false", + "gzip="+(Data.CHROMGZ), "chromc="+Data.CHROMC, "maxlen="+maxChromLen, + "writechroms="+(!NODISK), "minscaf="+minScaf, "midpad="+midPad, "startpad="+startPad, "stoppad="+stopPad, "nodisk="+NODISK}; + + chromlist=FastaToChromArrays.main2(ftcaArgs); + + ReadWrite.ZIPLEVEL=oldzl; + } + + } + + public static boolean AUTO_CHROMBITS=true; + public static boolean LOG=false; + public static boolean NODISK=false; + public static boolean OVERWRITE=true; + public static boolean genScaffoldInfo=true; + + public static long maxChromLen=-1; + + public static int minScaf=-1, midPad=-1, stopPad=-1, startPad=-1; + public static int chrombits=-1; +// public static int minScaf=FastaToChromArrays.MIN_SCAFFOLD; +// public static int midPad=FastaToChromArrays.MID_PADDING; +// public static int startPad=FastaToChromArrays.START_PADDING; +// public static int stopPad=FastaToChromArrays.END_PADDING; + + public static ArrayList chromlist=null; + +} diff --git a/current/align2/ReformatBatchOutput.java b/current/align2/ReformatBatchOutput.java new file mode 100755 index 0000000..f8f2f83 --- /dev/null +++ b/current/align2/ReformatBatchOutput.java @@ -0,0 +1,217 @@ +package align2; + +import java.util.ArrayList; +import java.util.Arrays; + +import fileIO.TextFile; + +public class ReformatBatchOutput { + +// Elapsed: 31.7 +// +// Mapping Statistics for 0s_default.sam: +// mapped: 100.00% +// retained: 96.06% +// discarded: 0.00% +// ambiguous: 3.94% +// +// Strict correctness (both ends exactly correct): +// true positive: 96.06% +// false positive: 0.00% +// +// Loose correctness (one end approximately correct): +// true positive: 96.06% +// false positive: 0.00% +// +// false negative: 0.00% +// Elapsed: 2.34 +// Elapsed: 20.51 + + +// Elapsed: 0.33 +// +// Mapping Statistics for 
bwa_0S_0I_0D_0U_0N_r100.sam: +// primary alignments: 100 found of 100 expected +// secondary alignments: 0 found +// mapped: 100.000% +// retained: 97.000% +// discarded: 0.000% +// ambiguous: 3.000% +// +// Strict correctness (both ends exactly correct): +// true positive: 97.000% +// false positive: 0.000% +// +// Loose correctness (one end approximately correct): +// true positive: 97.000% +// false positive: 0.000% +// +// false negative: 0.000% + + + public static void main(String[] args){ + TextFile tf=new TextFile(args[0], false, false); + String[] lines=tf.toStringLines(); + ArrayList list=new ArrayList(); + + int mode=0; + + System.out.println(header()); + + for(String s : lines){ + if(s.startsWith("Elapsed:")){ + if(!list.isEmpty()){ + process(list); //failure + list.clear(); + mode=0; + } + mode++; + } + + if(mode>0){ + list.add(s); + if(s.startsWith("false negative:")){ + process(list); + list.clear(); + mode=0; + } + } + } + } + + + public static String header() { + return("program\tfile\tvartype\tcount\treads\tprimary\tsecondary\ttime\tmapped\tretained\tdiscarded\tambiguous\ttruePositive\t" + + "falsePositive\ttruePositiveL\tfalsePositiveL\tfalseNegative"); + } + + //bwa_1S_0I_0D_0U_0N_r400000x100.sam + public static int getReads(String name){ +// String[] split=name.substring(0, name.length()-4).split("_"); + String[] split=name.split("_"); + String r=(split[split.length-1]); + if(r.charAt(0)=='r' && Character.isDigit(r.charAt(r.length()-1))){ + assert(r.charAt(0)=='r') : Arrays.toString(split)+", "+name; + r=r.substring(1); + if(r.contains("x")){ + r=r.substring(0, r.indexOf('x')); + } + return Integer.parseInt(r); + }else{ + for(String s : split){ + if(s.endsWith("bp") && s.contains("x") && Character.isDigit(s.charAt(0))){ + r=s.substring(0, s.indexOf('x')-1); + return Integer.parseInt(r); + } + } + } + return 0; + } + + public static char getVarType(String name){ +// String[] split=name.substring(0, name.length()-4).split("_"); + String[] 
split=name.split("_"); + for(String s : split){ + char c=s.charAt(0); + if(Character.isDigit(c) && c!='0' && !s.endsWith("bp")){ + return s.charAt(s.length()-1); + } + } + return '?'; + } + + public static int getCount(String name){ +// String[] split=name.substring(0, name.length()-4).split("_"); + String[] split=name.split("_"); + for(String s : split){ + char c=s.charAt(0); + if(Character.isDigit(c) && c!='0' && !s.endsWith("bp")){ + String r=s.substring(0, s.length()-1); + return Integer.parseInt(r); + } + } + return 0; + } + + public static String getProgram(String name){ + return name.substring(0, name.indexOf('_')); + } + + + + public static void process(ArrayList list){ + + String name=null; +// String count=null; + String time=null; + StringBuilder sb=new StringBuilder(); + + int primary=0; + int secondary=0; + int expected=0; + + for(String s : list){ + String[] split=s.split("\t"); + String a=split[0]; + String b=(split.length>1 ? split[1] : null); + if(a.equals("Elapsed:")){ + time=b; + }else if(a.startsWith("lines:")){ + //do nothing + }else if(a.startsWith("Mapping Statistics for ")){ + name=a.replace("Mapping Statistics for ", "").replace(".sam:", ""); + }else if(a.startsWith("primary alignments:")){ + b=b.replace(" found of ", "_"); + b=b.replace(" expected", ""); + String[] split2=b.split("_"); + primary=Integer.parseInt(split2[0]); + expected=Integer.parseInt(split2[1]); + }else if(a.startsWith("secondary alignments:")){ + b=b.replace(" found", ""); + secondary=Integer.parseInt(b); + }else if(b!=null){ + assert(!b.contains("found")) : "\na='"+a+"'\nb='"+b+"'\n"+a.equals("primary alignments:"); + sb.append('\t').append(b.replace("%", "")); + } + + } + +// if(name!=null){ +// count=""; +// +// String[] split=name.split("_"); +// +// for(String s : split){ +// if(s!=null && s.length()>0 && s.charAt(0)!='0'){ +// for(int i=0; i0){break;} +// } +// } + + String prg=null; + char type='S'; + int reads=1; + int vars=0; + + if(name!=null){ + try { + 
prg=getProgram(name); + type=getVarType(name); + reads=getReads(name); + vars=getCount(name); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + System.out.println(prg+"\t"+name+"\t"+type+"\t"+vars+"\t"+reads+"\t"+primary+"\t"+secondary+"\t"+time+sb); + + } + +} diff --git a/current/align2/ReformatBatchOutput2.java b/current/align2/ReformatBatchOutput2.java new file mode 100755 index 0000000..27f55d2 --- /dev/null +++ b/current/align2/ReformatBatchOutput2.java @@ -0,0 +1,61 @@ +package align2; + +import java.util.ArrayList; + +import fileIO.TextFile; + +public class ReformatBatchOutput2 { + +// Elapsed: 31.7 +// +// Mapping Statistics for 0s_default.sam: +// mapped: 100.00% +// retained: 96.06% +// discarded: 0.00% +// ambiguous: 3.94% +// +// Strict correctness (both ends exactly correct): +// true positive: 96.06% +// false positive: 0.00% +// +// Loose correctness (one end approximately correct): +// true positive: 96.06% +// false positive: 0.00% +// +// false negative: 0.00% +// Elapsed: 2.34 +// Elapsed: 20.51 + + + public static void main(String[] args){ + TextFile tf=new TextFile(args[0], false, false); + String[] lines=tf.toStringLines(); + ArrayList list=new ArrayList(); + + int mode=0; + + System.out.println(header()); + + for(String s : lines){ + if(s.startsWith("Elapsed:")){mode++;} + if(mode>1){ + mode=0; + }else{ +// list.add(s); + if(s.startsWith("Mapping Statistics for ")){ + System.out.println(s.replace("Mapping Statistics for ", "").replace(".sam:", "")+"\t"); + }else if(s.startsWith("Mapping:")){ + s=s.replace("Mapping:", "").replace("seconds.", "").trim(); + System.out.print(s+"\t"); + } + } + } + } + + + public static String header() { + return("name\tcount\ttime\tmapTime\tmapped\tretained\tdiscarded\tambiguous\ttruePositive\t" + + "falsePositive\ttruePositiveL\tfalsePositiveL\tfalseNegative"); + } + +} diff --git a/current/align2/Shared.java b/current/align2/Shared.java new file mode 100755 index 
0000000..5df19e3 --- /dev/null +++ b/current/align2/Shared.java @@ -0,0 +1,65 @@ +package align2; + +import java.lang.management.ManagementFactory; +import java.util.List; + +import dna.Data; + +public class Shared { + + public static int THREADS=SET_THREADS(-1); + + public static int READ_BUFFER_LENGTH=200; + public static int READ_BUFFER_NUM_BUFFERS=Tools.max(4, (THREADS*3)/2); + public static final long READ_BUFFER_MAX_DATA=500000; + + //TODO: Actually... for some reason... it seems as though GAPBUFFER must equal exactly 1/2 of GAPLEN. Not good; 1/4 would be far better. + + public static final int GAPBUFFER=64; //TODO: Seems to break less than 64, for some reason + public static final int GAPBUFFER2=2*GAPBUFFER; + public static final int GAPLEN=128; //TODO: May break when over 128 + public static final int MINGAP=GAPBUFFER2+GAPLEN; + public static final int GAPCOST=Tools.max(1, GAPLEN/64); + public static final byte GAPC='-'; + + public static int BBMAP_VERSION=31; + public static int BBMAP_VERSION_MINOR=27; + public static String BBMAP_VERSION_STRING=BBMAP_VERSION+"."+BBMAP_VERSION_MINOR; + + public static boolean TRIM_READ_COMMENTS=false; + + public static String BBMAP_CLASS=null; + public static String[] COMMAND_LINE=null; + public static List JVM_ARGS(){ + return ManagementFactory.getRuntimeMXBean().getInputArguments(); + } + + /** Directory in which to write temp files */ + public static String TMPDIR=(System.getenv("TMPDIR")==null ? null : (System.getenv("TMPDIR")+"/").replaceAll("//", "/")); +// static{assert(false) : "TMPDIR="+TMPDIR;} + + /** Anomaly probably resolved as of v.20.1 + * This variable should be TRUE for normal users and FALSE for me. 
*/ + public static boolean anomaly=!System.getProperty("user.dir").contains("/bushnell/") && !Data.WINDOWS; + + public static final char[] getTLCB(int len){ + char[] buffer=TLCB.get(); + if(buffer==null || buffer.length TLCB=new ThreadLocal(); + + public static int SET_THREADS(int x){ + if(x>0){ + THREADS=x; + }else{ + THREADS=(Data.HOSTNAME()==null || !Data.HOSTNAME().startsWith("gpint") ? Data.LOGICAL_PROCESSORS : Tools.min(4, Data.LOGICAL_PROCESSORS)); + } +// assert(false) : Data.HOSTNAME()+", "+THREADS; + return THREADS; + } + +} diff --git a/current/align2/Solver.java b/current/align2/Solver.java new file mode 100755 index 0000000..1fdbc50 --- /dev/null +++ b/current/align2/Solver.java @@ -0,0 +1,243 @@ +package align2; + +import java.util.Arrays; + +public class Solver { + + + public static final long bruteForce(int[] offsets, int[] lengths, int chunk, int minLists, int maxTotalLength){ + + int bits=offsets.length; + int max=(1<Integer.MAX_VALUE ? Integer.MAX_VALUE : (int)value); + return; + } + min=value; + worstIndex=i; + } + } +// if(min>0){worstIndex=-1;} + r[0]=worstIndex; + r[1]=(minInteger.MAX_VALUE ? Integer.MAX_VALUE : (int)min); + } + + + public static final void findWorstGreedy(final int[] offsets, final int[] lengths, + final float[] weights, final int chunk, final int[] lists, int[] r){ + assert(r!=null && r.length==2); + + long min=Long.MAX_VALUE; + int worstIndex=-1; + for(int i=0; i=0; i--){ + long value=valueOfElement(offsets, lengths, weights[i], chunk, lists, i); + if(valueInteger.MAX_VALUE ? Integer.MAX_VALUE : (int)value); +// System.out.print("."); + return; + } + min=value; + worstIndex=i; + } + } +// if(min>0){worstIndex=-1;} + r[0]=worstIndex; + r[1]=(minInteger.MAX_VALUE ? 
Integer.MAX_VALUE : (int)min); + } + + + public static long valueOfElement(final int[] offsets, final int[] lengths, float keyWeight, + final int chunk, final int[] lists, int index){ + + final int numlists=lists.length; + if(numlists<1){return 0;} + + final int prospect=lists[index]; + if(lengths[prospect]==0){return -999999;} + + long valuep=POINTS_PER_LIST+(POINTS_PER_LIST*2/lists.length)+((POINTS_PER_LIST*10)/lengths[prospect]); + long valuem=POINTS_PER_SITE*lengths[prospect]; + + if(prospect==0 || (prospect==offsets.length-1)){ + valuep+=BONUS_POINTS_FOR_END_LIST; + } + + if(numlists==1){ + valuep+=(POINTS_FOR_TOTAL_LIST_WIDTH+POINTS_PER_BASE1)*chunk; + return ((long)(valuep*keyWeight))+valuem; + } + + + final int first=lists[0]; + final int last=lists[numlists-1]; + + //Offsets of elements to the left and right of the prospect +// final int offL=(prospect==first ? - : offsets[lists[index-1]]); +// final int offP=offsets[prospect]; +// final int offR=(prospect==last ? offsets[offsets.length-1] : offsets[lists[index+1]]); +// assert(offL<=offP); +// assert(offP<=offR); +// assert(offL0) : "\n"+spaceScore+", "+oldLeftSpace+", "+oldRightSpace+", "+newSpace+"\n"+ + Arrays.toString(offsets)+"\nprospect="+prospect+"\n"; + valuep+=spaceScore; + + int uniquelyCovered; + if(prospect==first){ + uniquelyCovered=offR-offP; //Technically, -1 should be added + }else if(prospect==last){ + uniquelyCovered=offP-offL; //Technically, -1 should be added + }else{ + int a=offL+chunk; + int b=offR-a; + uniquelyCovered=(b>0 ? 
b : 0); + } + + if(prospect==first || prospect==last){ + valuep+=(POINTS_PER_BASE1+POINTS_FOR_TOTAL_LIST_WIDTH)*uniquelyCovered; + }else{ + valuep+=POINTS_PER_BASE1*uniquelyCovered; + } + + return ((long)(valuep*keyWeight))+valuem; + } + + public static int[] toBitList(final int key){ + final int numlists=Integer.bitCount(key); + final int[] lists=new int[numlists]; + for(int i=0, ptr=0; ptr0); + final int[] lists=new int[numlists]; + for(int i=0, ptr=0; ptrchunk ? space-chunk : 0; + + score+=MULT_FOR_SPACING_PENALTY*(space*space); + score-=POINTS_PER_BASE1*uncovered; + } + + if(first>0){ + long x=offsets[first]; + score+=MULT_FOR_SPACING_PENALTY*(x*x); + score-=POINTS_PER_BASE1*x; + } + + if(last<(offsets.length-1)){ + long x=offsets[offsets.length-1]-offsets[last]; + score+=MULT_FOR_SPACING_PENALTY*(x*x); + score-=POINTS_PER_BASE1*x; + } + + return score; + } + + public static final int BASE_POINTS_PER_SITE=-50; //Used to set POINTS_PER_SITE + public static long POINTS_PER_SITE=-50; //TODO: Make private with a get() and set() function + + public static final long MULT_FOR_SPACING_PENALTY=-30; + + public static long EARLY_TERMINATION_SCORE=(POINTS_PER_SITE*2000); //TODO: Should be set dynamically + + public static final long POINTS_PER_LIST=30000; + public static final long POINTS_PER_BASE1=6000; //Points for a base covered once + public static final long POINTS_PER_BASE2=1000;//POINTS_PER_BASE1/4; //Points for a base covered twice + public static final long BONUS_POINTS_FOR_END_LIST=40000; //Extra points for the first and last list + public static final long POINTS_FOR_TOTAL_LIST_WIDTH=5500; //multiplier for distance between first and last list + + public static final long[] masks=new long[64]; + public static final int[] masks32=new int[32]; + static{ + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} +// System.err.println("Processing "+args[i]); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("i") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + if(b.indexOf('#')>=0){ + in1=b.replaceFirst("#", "1"); + in2=b.replaceFirst("#", "2"); + } + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("o") || a.equals("out") || a.equals("output")){ + out=b; + }else if(a.endsWith("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + Data.sysout.println("Set FASTQ.PARSE_CUSTOM to "+FASTQ.PARSE_CUSTOM); + }else if(a.endsWith("renumber")){ + RENUMBER=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=Tools.parseBoolean(b); + Data.sysout.println("Set OVERWRITE to "+OVERWRITE); + }else if(a.endsWith("blocksize")){ + BLOCKSIZE=Integer.parseInt(b); + }else{ + throw new RuntimeException("Unknown parameter: "+args[i]); + } + } + + if(in1==null){throw new RuntimeException("Please specify input file.");} + if(out==null){throw new RuntimeException("Please specify output file.");} + 
if(in1.equalsIgnoreCase(in2) || in1.equalsIgnoreCase(out) || (in2!=null && in2.equalsIgnoreCase(out))){ + throw new RuntimeException("Duplicate filenames."); + } + + if(out!=null && !out.contains("#")){ + throw new RuntimeException("Output filename must contain '#' symbol."); + } + + SortReadsByID srid=new SortReadsByID(in1, in2, out); + srid.process(); + } + + + public void process(){ + + Timer tRead=new Timer(); + Timer tSort=new Timer(); + Timer tAll=new Timer(); + + tRead.start(); + tAll.start(); + + final long maxReads=-1; + ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + Thread th=new Thread(cris); + th.start(); + } + + HashMap map=new HashMap(); + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + int bin=(int)(r.numericID/BLOCKSIZE); + Block b=map.get(bin); + if(b==null){ + String o1=out.replaceFirst("#", "_bin"+bin+"_1"); + String o2=(cris.paired() && !OUT_INTERLEAVED) ? out.replaceFirst("#", "_bin"+bin+"_2") : null; + b=new Block(o1, o2); + map.put(bin, b); + } + b.add(r); + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + + cris.returnList(ln, ln.list.isEmpty()); + ReadWrite.closeStream(cris); + } + + for(Block b : map.values()){b.close();} + + tRead.stop(); + Data.sysout.println("Read time: \t"+tRead); + tSort.start(); + + String o1=out.replaceFirst("#", "1"); + String o2=(cris.paired() && !OUT_INTERLEAVED) ? 
out.replaceFirst("#", "2") : null; + Block sorted=new Block(o1, o2); + + long count=0; + + ArrayList keys=new ArrayList(); + keys.addAll(map.keySet()); + Collections.sort(keys); + for(Integer key : keys){ + Block b=map.get(key); + b.join(); + map.remove(key); + { + FileFormat ff1=FileFormat.testInput(b.out1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(b.out2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + Thread th=new Thread(cris); + th.start(); + } + ArrayList reads2=new ArrayList((int)b.count); + count+=b.count; + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + reads2.addAll(reads); + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + + cris.returnList(ln, ln.list.isEmpty()); + ReadWrite.closeStream(cris); + } + + Collections.sort(reads2, idComparator); + for(Read r : reads2){sorted.add(r);} + new File(b.out1).delete(); + if(b.out2!=null){new File(b.out2).delete();} + } + + sorted.close(); + sorted.join(); + + tSort.stop(); + tAll.stop(); + + Data.sysout.println("Total reads: \t"+count); + Data.sysout.println("Sort time: \t"+tSort); + Data.sysout.println("Total time: \t"+tAll); + + } + + /** + * @param in1 + * @param in2 + * @param out + */ + public SortReadsByID(String in1_, String in2_, String out_) { + in1=in1_; + in2=in2_; + out=out_; + + FileFormat ff=FileFormat.testOutput(out, FileFormat.BREAD, null, true, false, false); + outFastq=ff.fastq(); + outFasta=ff.fasta(); + outText=ff.bread(); + } + + public String in1; + public String in2; + public String out; + + private final boolean outText; + private final boolean outFasta; + private final boolean outFastq; + + public static int BLOCKSIZE=8000000; + public static boolean 
OVERWRITE=true; + public static boolean RENUMBER=false; + public static boolean OUT_INTERLEAVED=false; + + private class Block{ + + public Block(String out1_, String out2_){ + out1=out1_; + out2=out2_; + + tsw1=new TextStreamWriter(out1, OVERWRITE, false, false); + tsw2=(out2==null ? null : new TextStreamWriter(out2, OVERWRITE, false, false)); + + tsw1.start(); + if(tsw2!=null){tsw2.start();} + } + + public void add(Read r){ + count++; + Read r2=r.mate; + + StringBuilder sb1=outText ? r.toText(true) : outFastq ? r.toFastq() : outFasta ? r.toFasta() : null; + StringBuilder sb2=r2==null ? null : outText ? r2.toText(true) : outFastq ? r2.toFastq() : outFasta ? r2.toFasta() : null; + + tsw1.print(sb1.append('\n')); + if(sb2!=null){ + if(tsw2!=null){ + tsw2.print(sb2.append('\n')); + }else{ + tsw1.print(sb2.append('\n')); //Interleaved + } + } + + } + + public void close(){ + tsw1.poison(); + if(tsw2!=null){tsw2.poison();} + } + + public void join(){ + tsw1.waitForFinish(); + if(tsw2!=null){tsw2.waitForFinish();} + } + + String out1; + String out2; + + TextStreamWriter tsw1; + TextStreamWriter tsw2; + + long count=0; + + } + + public static final class ReadComparatorID implements Comparator{ + + @Override + public int compare(Read r1, Read r2) { + if(r1.numericIDr2.numericID){return 1;} + + if(!r1.id.equals(r2.id)){return r1.id.compareTo(r2.id);} + return 0; + } + + } + public static final ReadComparatorID idComparator=new ReadComparatorID(); + + +} diff --git a/current/align2/SortReadsByMapping.java b/current/align2/SortReadsByMapping.java new file mode 100755 index 0000000..4a1867c --- /dev/null +++ b/current/align2/SortReadsByMapping.java @@ -0,0 +1,2205 @@ +package align2; + +import java.io.File; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.concurrent.ArrayBlockingQueue; + +import 
stream.ConcurrentReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.ConcurrentSolidInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.ReadStreamStringWriter; +import stream.ReadStreamWriter; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; +import dna.Timer; +import fileIO.ReadWrite; + +public class SortReadsByMapping { + + + public static void main(String[] args){ + + Data.GENOME_BUILD=-1; + + for(String s_ : args){ + String s=s_.toLowerCase(); + String split[]=(s.contains("=") ? s.split("=") : null); + if(s.equalsIgnoreCase("merge")){MERGE_DUPLICATES=true;} + else if(s.equalsIgnoreCase("regen")){REGENERATE_MATCH_STRING=true;} + else if(s.equalsIgnoreCase("trim")){TRIM_LOW_QUALITY_TAILS=true;} + else if(s.equalsIgnoreCase("fixshort")){FIX_SHORT_PAIRED_READS=true;} + else if(s.equalsIgnoreCase("removesingletonduplicates")){REMOVE_SINGLETON_DUPLICATES_OF_PAIRS=true;} + else if(s.equalsIgnoreCase("swaptoplus") || s.equalsIgnoreCase("swap")){SWAP_READ1_TO_PLUS=true;} + else if(s.equalsIgnoreCase("mergeoppositestrand")){MERGE_OPPOSITE_STRAND_DUPLICATES=true;} + else if(s.startsWith("merge=")){ + MERGE_DUPLICATES=(split[1].startsWith("t") || split[1].equals("1") ? true : false); + }else if(s.startsWith("regen=")){ + REGENERATE_MATCH_STRING=(split[1].startsWith("t") || split[1].equals("1") ? true : false); + }else if(s.startsWith("trim=")){ + TRIM_LOW_QUALITY_TAILS=(split[1].startsWith("t") || split[1].equals("1") ? true : false); + }else if(s.startsWith("fixshort=")){ + FIX_SHORT_PAIRED_READS=(split[1].startsWith("t") || split[1].equals("1") ? true : false); + }else if(s.startsWith("removesingletonduplicates=") || s.startsWith("removesingletonduplicatesofpairs=")){ + REMOVE_SINGLETON_DUPLICATES_OF_PAIRS=(split[1].startsWith("t") || split[1].equals("1") ? 
true : false); + }else if(s.startsWith("minq=") || s.startsWith("minquality=") || s.startsWith("trimquality=")){ + TRIM_QUALITY=Byte.parseByte(split[1]); + }else if(s.startsWith("window=") || s.startsWith("trimwindow=")){ + TRIM_WINDOW=Byte.parseByte(split[1]); + }else if(s.startsWith("swaptoplus=") || s.startsWith("swap=")){ + SWAP_READ1_TO_PLUS=(split[1].startsWith("t") || split[1].equals("1") ? true : false); + }else if(s.startsWith("mergeoppositestrand=")){ + MERGE_OPPOSITE_STRAND_DUPLICATES=(split[1].startsWith("t") || split[1].equals("1") ? true : false);; + }else if(s.startsWith("readlimit=")){ + READ_LIMIT=Long.parseLong(split[1]); + Data.sysout.println("Set READ_LIMIT to "+READ_LIMIT); + }else if(s.startsWith("build=") || s.startsWith("genome=")){ + Data.setGenome(Integer.parseInt(split[1])); + Data.sysout.println("Set GENOME_BUILD to "+Data.GENOME_BUILD); + }else if(s.startsWith("threads=")){ + REGEN_THREADS=Integer.parseInt(split[1]); + }else if(s.startsWith("overwrite=")){ + OVERWRITE=Tools.parseBoolean(split[1]); + } + } + + Read.DECOMPRESS_MATCH_ON_LOAD=true; + + SortReadsByMapping srt; + if(args[0].contains(".csfasta") && args[1].contains(".qual")){ + String reads1=args[0]; + String q1=args[1]; + String reads2=args[2].equalsIgnoreCase("null") ? null : args[2]; + String q2=args[3].equalsIgnoreCase("null") ? null : args[3]; + String outname=args[4].equalsIgnoreCase("null") ? ReadWrite.parseRoot(reads1)+"mapped_sorted#.txt.gz" : args[4]; + assert(outname.contains("#")); + int blocksize=Integer.parseInt(args[5]); + + srt=new SortReadsByMapping(reads1, q1, reads2, q2, outname, blocksize); + }else{ + String reads1=args[0]; + String reads2=args[1].equalsIgnoreCase("null") ? null : args[1]; + String outname=args[2].equalsIgnoreCase("null") ? 
ReadWrite.parseRoot(reads1)+"mapped_sorted#.txt.gz" : args[2]; + assert(outname.contains("#")); + int blocksize=Integer.parseInt(args[3]); + + srt=new SortReadsByMapping(reads1, reads2, outname, blocksize); + } + + srt.process(); + + double rmult=100d/(srt.processed); + double bmult=100d/srt.basesInitiallyMapped; + + float pmult=(srt.paired ? 2 : 1); + + long remaining=srt.processed-srt.merged-srt.merged2-srt.removedSingletonDupe-srt.removedLQ-srt.removedShort; + Data.sysout.println("Processed "+srt.processed+" reads; "+remaining+" remaining"+String.format(" (%.2f%%)", remaining*rmult)); + if(MERGE_DUPLICATES){ + Data.sysout.println("Merged "+srt.merged2+" strict duplicates"+String.format(" (%.2f%%)", srt.merged2*rmult)); + Data.sysout.println("Merged "+srt.merged+" duplicates"+String.format(" (%.2f%%)", srt.merged*rmult)); + if(srt.paired && REMOVE_SINGLETON_DUPLICATES_OF_PAIRS){ + Data.sysout.println("Removed "+srt.removedSingletonDupe+" singleton duplicates of pairs"+ + String.format(" (%.2f%%)", srt.removedSingletonDupe*rmult)); + } + } + if(FIX_SHORT_PAIRED_READS){ + Data.sysout.println("Removed "+srt.removedShort+" short reads"+String.format(" (%.2f%%)", srt.removedShort*rmult)); + Data.sysout.println("Trimmed "+srt.basesOverlapping+" overlapping bases of "+srt.basesInitiallyMapped+" initially mapped"+ + String.format(" (%.2f%%)", srt.basesOverlapping*bmult)); + } + if(TRIM_LOW_QUALITY_TAILS){ + Data.sysout.println("Removed "+srt.removedLQ+" low-quality reads"+String.format(" (%.2f%%)", srt.removedLQ*rmult)); + Data.sysout.println("Trimmed "+srt.basesRemoved+" low-quality bases of "+srt.basesMapped+" mapped"+ + String.format(" (%.2f%%)", srt.basesRemoved*bmult)); + } + + Data.sysout.println("Total valid, mapped tags written: "+ + srt.validReadsWritten+String.format(" (%.2f%%)", srt.validReadsWritten*rmult/pmult)); + Data.sysout.println("Total valid, mapped bases written: "+ + srt.validBasesWritten+String.format(" (%.2f%%)", srt.validBasesWritten*bmult)); + } 
+ + public SortReadsByMapping(String fname1, String fname2, String outname_, int blocksize_){ + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + + long limit=READ_LIMIT; + RTextInputStream rtis=new RTextInputStream(fname1, fname2, limit); + outname=outname_; + paired=rtis.paired(); + cris=new ConcurrentReadInputStream(rtis, limit); + blocksize=blocksize_; + assert(blocksize>200000); + + blockwriter1=(fname1==null ? null : new ReadStreamStringWriter(null, true, 4, false)); + blockwriter2=(fname2==null ? null : new ReadStreamStringWriter(null, false, 4, false)); + } + + public SortReadsByMapping(String fname1, String q1, String fname2, String q2, String outname_, int blocksize_){ + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + outname=outname_; + cris=new ConcurrentSolidInputStream(fname1, q1, fname2, q2, 0); + paired=cris.paired(); + blocksize=blocksize_; + assert(blocksize>200000); + + blockwriter1=(fname1==null ? null : new ReadStreamStringWriter(null, true, 4, false)); + blockwriter2=(fname2==null ? null : new ReadStreamStringWriter(null, false, 4, false)); + } + + public void process(){ + + final String fname1=outname.replaceFirst("#", "1"); + final String fname2=(!paired ? 
null : outname.replaceFirst("#", "2")); + if(!OVERWRITE){ + if(fname1!=null && new File(fname1).exists()){throw new RuntimeException("Destination file "+fname1+" already exists.");} + if(fname2!=null && new File(fname2).exists()){throw new RuntimeException("Destination file "+fname2+" already exists.");} + } + + Timer t=new Timer(); + Timer total=new Timer(); + t.start(); + total.start(); + + + Thread tcris=new Thread(cris); + tcris.start(); + System.err.println("Started cris"); + + Thread bwt1=null, bwt2=null; + if(fname1!=null){ + bwt1=new Thread(blockwriter1); + bwt1.start(); + } + if(fname2!=null){ + bwt2=new Thread(blockwriter2); + bwt2.start(); + } + System.err.println("Started blockwriters"); + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + if(paired){ + asymmetricReads=(r.bases.length!=r.mate.bases.length); + } + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + + + if(KILL_BAD_PAIRS && paired){ + for(Read r : reads){ + + if(r.isBadPair(REQUIRE_CORRECT_STRANDS_PAIRS, SAME_STRAND_PAIRS, 20000)){ + int x=r.mapScore/r.mapLength; + int y=r.mate.mapScore/r.mate.mapLength; + if(x>=y){ + r.mate.clearAnswers(false); + }else{ + r.clearAnswers(false); + } + } + + addRead(r); + } + }else{ + for(Read r : reads){addRead(r);} + } + + + + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + } + + + synchronized(this){this.notifyAll();} + System.err.println("Notified all"); + + finishWritingBlocks(); + System.err.println("Wrote blocks"); + + + if(bwt1!=null){blockwriter1.poison();} + if(bwt2!=null){blockwriter2.poison();} + + if(bwt1!=null){ + while(bwt1.isAlive()){ + try { + bwt1.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + if(bwt2!=null){ + while(bwt2.isAlive()){ + try { + bwt2.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + t.stop(); + Data.sysout.println("Temp Write Time: "+t); + t.start(); + + if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;} + ReadStreamWriter wt1=(fname1==null ? null : new ReadStreamStringWriter(fname1, true, 4, false)); + ReadStreamWriter wt2=(fname2==null ? null : new ReadStreamStringWriter(fname2, false, 4, false)); + + Thread wtt1=(wt1==null ? null : new Thread(wt1)); + Thread wtt2=(wt2==null ? 
null : new Thread(wt2)); + + if(wtt1!=null){wtt1.start();} + if(wtt2!=null){wtt2.start();} + +// OutputStream outStream1, outStream2; +// PrintWriter writer1, writer2; +// +// if(fname1==null){ +// assert(false); +// outStream1=null; +// writer1=null; +// }else{ +// outStream1=ReadWrite.getOutputStream(fname1, false); +// writer1=new PrintWriter(outStream1); +// writer1.println("#"+Read.header()); +// } +// +// if(fname2==null){ +// outStream2=null; +// writer2=null; +// }else{ +// outStream2=ReadWrite.getOutputStream(fname2, false); +// writer2=new PrintWriter(outStream2); +// writer2.println("#"+Read.header()); +// } + + ArrayList keys=new ArrayList(table.size()); + keys.addAll(table.keySet()); + Collections.sort(keys); + + final ReadComparatorMapping mcomp=new ReadComparatorMapping(); + + int lastChrom=-1; + for(String key : keys){ + Block b=table.get(key); + table.remove(key); + processed+=b.added; + + if(UNLOAD_CHROMS_WHEN_DONE && lastChrom>-1 && b.chrom!=lastChrom){ + Data.unload(lastChrom, false); //Saves memory when regenerating match strings + } + lastChrom=b.chrom; + + if(b.added>MAX_BLOCKSIZE_TO_SORT){ + if(true){throw new RuntimeException("Skipping sorting for key "+key+" of size "+b.added);} + RTextInputStream temp=new RTextInputStream(b.fname1, b.fname2, -1); + ArrayList reads=temp.nextList(); + while(reads!=null && reads.size()>0){ + if(reads!=null && reads.size()>0){ + if(wt1!=null){wt1.addList(reads);} + if(wt2!=null){wt2.addList(reads);} + } + b.numRead+=reads.size(); + reads=temp.nextList(); + } + temp.close(); + temp=null; + + Data.sysout.println(key+"\t"+b.added); + b.delete(); + }else{ + ArrayList list=b.readBlock(); + Data.sysout.println(key+"\t"+list.size()); + b.delete(); + + //Collections.sort(list, mcomp); + if(MERGE_DUPLICATES){ + if(!paired){ + Collections.sort(list, mcomp); + if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(list);} + findAndMergeDuplicates(list, false); + }else{ + + //Possibly, doing two passes (unswap, merge, reswap, 
merge) is unnecessary... + + if(SWAP_READ1_TO_PLUS && paired){ + //Unswap + for(int i=0; i0){ + regenMatchStrings(list); + } + }else{ + for(Read r : list){ + if(r!=null){ + if(r.mapped()){basesInitiallyMapped+=r.mapLength;} + if(r.mate!=null && r.mate.mapped()){basesInitiallyMapped+=r.mate.mapLength;} + } + } + } + + if(TRIM_LOW_QUALITY_TAILS){ + int[] rvector=new int[4]; + int removedTemp=trimTails(list, TRIM_WINDOW, TRIM_QUALITY, rvector); + removedLQ+=removedTemp; + basesRemoved+=rvector[1]; + basesMapped+=rvector[2]; + int needRegen=rvector[3]; + + if(REGENERATE_MATCH_STRING && needRegen>0){ + regenMatchStrings(list); + } + }else{ + for(Read r : list){ + if(r!=null){ + if(r.mapped() && !r.invalid()){basesMapped+=r.bases.length;} + if(r.mate!=null && !r.mate.invalid() && r.mate.mapped()){basesMapped+=r.mate.bases.length;} + } + } + } + + //Reswap + if(SWAP_READ1_TO_PLUS && paired){ + for(int i=0; i0){ + if(wt1!=null){wt1.addList(list);} + if(wt2!=null){wt2.addList(list);} + } + } + } + + //Add poison +// if(wt1!=null){wt1.addList(null);} +// if(wt2!=null){wt2.addList(null);} + if(wt1!=null){wt1.poison();} + if(wt2!=null){wt2.poison();} + + readsWritten=0; + basesWritten=0; + validReadsWritten=0; + validBasesWritten=0; + + if(wtt1!=null){ + while(wtt1.isAlive()){ + try { + wtt1.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + readsWritten+=wt1.readsWritten(); + basesWritten+=wt1.basesWritten(); + validReadsWritten+=wt1.validReadsWritten(); + validBasesWritten+=wt1.validBasesWritten(); + } + + if(wtt2!=null){ + while(wtt2.isAlive()){ + try { + wtt2.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + readsWritten+=wt2.readsWritten(); + basesWritten+=wt2.basesWritten(); + validReadsWritten+=wt2.validReadsWritten(); + validBasesWritten+=wt2.validBasesWritten(); + } + + t.stop(); + total.stop(); + Data.sysout.println("Final Sort + Write 
Time: "+t); + Data.sysout.println("Total Time: "+total); + + } + + + private void doPairedSplitAndMergeSeries(ArrayList list, final ReadComparatorMapping mcomp, boolean mergeDifferentLength){ + + //This special section is probably not necessary. + //Theoretically, keeping everything in a single list should work fine. + + int p=0, e1=0, e2=0, e12=0; + for(Read r : list){ + if(r!=null){ + if(r.paired()){ + p++; + }else if(r.mapped() && r.mate.mapped()){ + e12++; + }else if(r.mapped()){ + e1++; + }else if(r.mate.mapped()){ + e2++; + } + } + } + + ArrayList listP=new ArrayList(p); + ArrayList list1=new ArrayList(e1); + ArrayList list2=new ArrayList(e2); + ArrayList list12=new ArrayList(e12); + + for(Read r : list){ + if(r!=null){ + if(r.paired()){ + listP.add(r); + }else if(r.mapped() && r.mate.mapped()){ + list12.add(r); + }else if(r.mapped()){ + list1.add(r); + }else if(r.mate.mapped()){ + list2.add(r); + } + } + } + list.clear(); + + Collections.sort(listP, mcomp); + if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(listP);} + findAndMergeDuplicates(listP, mergeDifferentLength); + list.addAll(listP); + listP=null; + + Collections.sort(list1, mcomp); + findAndMergeDuplicates(list1, mergeDifferentLength); + list.addAll(list1); + list1=null; + + Collections.sort(list2, mcomp); + findAndMergeDuplicates(list2, mergeDifferentLength); + list.addAll(list2); + list2=null; + + Collections.sort(list12, mcomp); + if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(list12);} + findAndMergeDuplicates(list12, mergeDifferentLength); + list.addAll(list12); + list12=null; + + Tools.condense(list); + Collections.sort(list, mcomp); + if(REMOVE_SINGLETON_DUPLICATES_OF_PAIRS){ + findAndRemoveSingletonDuplicatesOfPairs(list); + Tools.condense(list); + Collections.sort(list, mcomp); + } + } + + + private void doPairedSplitAndMergeSeries_old(ArrayList list, final ReadComparatorMapping mcomp){ + + //This special section is probably not necessary. 
+ //Theoretically, keeping everything in a single list should work fine. + + int p=0, e1=0, e2=0, e12=0; + for(Read r : list){ + if(r!=null){ + if(r.paired()){ + p++; + }else if(r.mapped() && r.mate.mapped()){ + e12++; + }else if(r.mapped()){ + e1++; + }else if(r.mate.mapped()){ + e2++; + } + } + } + + ArrayList listP=new ArrayList(p); + ArrayList list1=new ArrayList(e1); + ArrayList list2=new ArrayList(e2); + ArrayList list12=new ArrayList(e12); + + for(Read r : list){ + if(r!=null){ + if(r.paired()){ + listP.add(r); + }else if(r.mapped() && r.mate.mapped()){ + list12.add(r); + }else if(r.mapped()){ + list1.add(r); + }else if(r.mate.mapped()){ + list2.add(r); + } + } + } + list.clear(); + + Collections.sort(listP, mcomp); + if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(listP);} + findAndMergeDuplicates(listP, false); + if(asymmetricReads && SWAP_READ1_TO_PLUS){ + Tools.condense(listP); + Collections.sort(listP, mcomp); + findAndMergeDuplicates(listP, true); + } + list.addAll(listP); + listP=null; + + Collections.sort(list1, mcomp); + findAndMergeDuplicates(list1, false); + if(asymmetricReads && SWAP_READ1_TO_PLUS){ + Tools.condense(list1); + Collections.sort(list1, mcomp); + findAndMergeDuplicates(list1, true); + } + list.addAll(list1); + list1=null; + + Collections.sort(list2, mcomp); + findAndMergeDuplicates(list2, false); + if(asymmetricReads && SWAP_READ1_TO_PLUS){ + Tools.condense(list2); + Collections.sort(list2, mcomp); + findAndMergeDuplicates(list2, true); + } + list.addAll(list2); + list2=null; + + Collections.sort(list12, mcomp); + if(USE_STRICT_MERGE){findAndMergeDuplicatesStrict(list12);} + findAndMergeDuplicates(list12, false); + if(asymmetricReads && SWAP_READ1_TO_PLUS){ + Tools.condense(list12); + Collections.sort(list12, mcomp); + findAndMergeDuplicates(list12, true); + } + list.addAll(list12); + list12=null; + + Tools.condense(list); + Collections.sort(list, mcomp); + if(REMOVE_SINGLETON_DUPLICATES_OF_PAIRS){ + 
findAndRemoveSingletonDuplicatesOfPairs(list); + Tools.condense(list); + Collections.sort(list, mcomp); + } + } + + + private static int trimTails(ArrayList list, int thresh, byte minq, int[] rvector){ + + int removed=0; + int basesRemoved=0; + int basesMapped=0; + int needRegen=0; + for(int i=0; i list, int[] rvector){ + + int removed=0; + int basesRemoved=0; + int basesMapped=0; + int needRegen=0; + for(int i=0; i0){ + basesRemoved+=trimTailByXBases(r, rem1); + if(r.match==null && r.valid()){needRegen++;} + else{ + assert(r.invalid() || TranslateColorspaceRead.verifyMatchString2(r, true)); + } + } + if(rem2>0){ + basesRemoved+=trimTailByXBases(r2, rem2); + if(r2.match==null && r2.valid()){needRegen++;} + else{ + assert(r2.invalid() || TranslateColorspaceRead.verifyMatchString2(r2, true)); + } + } + } + }else if(refLengthInner<=0 || refLengthOuter0){ + int toRemain=r.bases.length+r2.bases.length-overlap; + + if(toRemain0){ + basesRemoved+=trimTailByXBases(r, rem1); + if(r.match==null && r.valid()){needRegen++;} + else{ + assert(r.invalid() || TranslateColorspaceRead.verifyMatchString2(r, true)); + } + } + if(rem2>0){ + basesRemoved+=trimTailByXBases(r2, rem2); + if(r2.match==null && r2.valid()){needRegen++;} + else{ + assert(r2.invalid() || TranslateColorspaceRead.verifyMatchString2(r2, true)); + } + } + } + } + } + + if((r.invalid() || !r.mapped()) && (r2==null || r2.invalid() || !r2.mapped())){ + removed++; + list.set(i, null); + } + } + } + + if(rvector!=null){ + rvector[0]=removed; + rvector[1]=basesRemoved; + rvector[2]=basesMapped; + rvector[3]=needRegen; + } + + return removed; + } + + + //TODO: Add support for deletions + /** thresh: Must see this many consecutive 'm' to stop. 
*/ + private static int trimTail(Read r, int thresh, byte minq){ + byte[] bases=r.bases; + byte[] match=r.match; + byte[] quality=r.quality; + + assert(match!=null); + if(r.strand()==Gene.MINUS){ //Remember to un-reverse later + Tools.reverseInPlace(match); + } + + + int lastBadLoc=quality.length; + int lastBadMLoc=match.length; + int qloc=quality.length-1; + int mloc=match.length-1; + + for(; mloc>=0 && qloc>=0; mloc--){ + + assert(qlocthresh){break;} + + byte m=match[mloc]; + byte q=quality[qloc]; + + if(m=='D'){ + //do nothing + lastBadLoc=qloc+1; + lastBadMLoc=mloc; + }else{ + if(q0); + assert(r.quality.length==r.bases.length); + assert(r.match==null || r.match.length>=r.quality.length); + +// System.err.println("After:\n"+r.toText(false)); + + return trimmed; + } + + + private static int trimTailByXBases(Read r, final int x){ + byte[] bases=r.bases; + byte[] match=r.match; + byte[] quality=r.quality; + + final int newLen=bases.length-x; + + if(newLen<6){ + r.setInvalid(true); + return quality.length; + } + + assert(match!=null); + if(r.strand()==Gene.MINUS){ //Remember to un-reverse later + Tools.reverseInPlace(match); + } + + int qloc=quality.length-1; + int mloc=match.length-1; + + for(; mloc>=0 && qloc>=newLen; mloc--){ + + byte m=match[mloc]; +// byte q=quality[qloc]; + + if(m=='D'){ + //do nothing + }else{ + qloc--; + } + } + + while(mloc>=0 && match[mloc]=='D'){mloc--;} + assert(qloc==newLen-1); + + bases=Arrays.copyOf(bases, newLen); + quality=Arrays.copyOf(quality, newLen); + match=Arrays.copyOf(match, mloc+1); + + if(r.strand()==Gene.MINUS){Tools.reverseInPlace(match);} + + boolean realign=false; + int lengthOfMatchString=0; + for(byte m : match){ + if(m=='m' || m=='N' || m=='s' || m=='S' || m=='D'){ + lengthOfMatchString++; + }else if(m=='X' || m=='Y'){ + realign=true; + } + } + +// assert(!realign) : r.toText(false); + + if(realign){ + System.err.println("Killed match string while trimming this read:\n"+r.toText(false)); + r.match=null; + 
match=null; + }else{ + if(r.strand()==Gene.PLUS){ + r.stop=r.start+lengthOfMatchString-1; + }else{ + r.start=r.stop-lengthOfMatchString+1; + } + } + + int trimmed=r.quality.length-quality.length; + r.quality=quality; + r.match=match; + r.bases=bases; + + assert(trimmed>0); + assert(r.quality.length==r.bases.length); + assert(r.match==null || r.match.length>=r.quality.length); + +// System.err.println("After:\n"+r.toText(false)); + + return trimmed; + } + + + private static int countCalledBasesOnOrAfterRefLoc(Read r, final int rlimit){ + final int clen=r.bases.length; + byte[] match=r.match; + + if(r.strand()==Gene.PLUS){ + +// final int rlimit=rlimit_0-1; + + int cloc=0; + int mloc=0; + int rloc=r.start; + for(; mloc=rlimit){ + + if(rloc>rlimit){ + return clen+(rloc-rlimit); + } + + int ret=clen-cloc; + assert(rloc==rlimit) : "ret="+ret+", clen="+clen+", cloc="+cloc+",\n"+ + "rloc="+rloc+", rlimit="+rlimit+", mloc="+mloc+", mlen="+match.length+",\n"+ + "r.start="+r.start+", r.stop="+r.stop+", r2.start="+r.mate.start+", r2.stop="+r.mate.stop+"\n\n"+r.toText(false)+"\n\n"; + assert(ret>=0 && ret<=clen) : "ret="+ret+", clen="+clen+", cloc="+cloc+",\n"+ + "rloc="+rloc+", rlimit="+rlimit+", mloc="+mloc+", mlen="+match.length+",\n"+ + "r.start="+r.start+", r.stop="+r.stop+", r2.start="+r.mate.start+", r2.stop="+r.mate.stop; + return ret; + }else{ + assert(cloc==clen) : clen+", "+cloc+"\n"+r.toText(false)+"\n"; //Maybe cloc==clen + return 0; + } + }else{ + +// final int rlimit=rlimit_0+1; + + int cloc=clen-1; + int mloc=match.length-1; + int rloc=r.stop; + for(; mloc>=0 && rloc>rlimit; mloc--){ + byte m=match[mloc]; + + if(m=='D'){ + rloc--; + }else if(m=='X' || m=='Y' || m=='I'){ + cloc--; + }else{ + cloc--; + rloc--; + } + } + + if(rloc<=rlimit){ + if(rloc=0 && ret<=clen) : "ret="+ret+", clen="+clen+", cloc="+cloc+",\n"+ + "rloc="+rloc+", rlimit="+rlimit+", mloc="+mloc+", mlen="+match.length+",\n"+ + "r.start="+r.start+", r.stop="+r.stop+", r2.start="+r.mate.start+", 
r2.stop="+r.mate.stop; + return ret; + }else{ + assert(cloc==-1) : clen+", "+cloc; //Maybe cloc==-1 + return 0; + } + } + } + + + private void findAndMergeDuplicates(ArrayList list, boolean mergeDifferentLengthReads){ + if(list==null || list.size()<2){return;} + Read current=list.get(0); + + for(int i=1; i list){ + if(list==null || list.size()<2){return;} + + int addIndex=0; + + ArrayList toMerge=new ArrayList(); + ArrayList toMerge2=new ArrayList(); + + for(int i=0; i1){ + merged2+=toMerge.size()-1; + x=mergeReads(toMerge, true); + } + assert(list.get(addIndex)==null); + list.set(addIndex, x); + addIndex++; + toMerge.clear(); + } + }else{ + assert(toMerge.size()==toMerge2.size()); + final boolean mdupeStrict=current.mate.isDuplicateByMapping(r.mate, true, true); + if(!dupeStrict || !mdupeStrict){ + Read x=toMerge.get(0); + if(toMerge.size()>1){ + merged2+=toMerge.size()-1; + x=mergeReads(toMerge, true); + Read y=mergeReads(toMerge2, true); + assert(x.mate==y); + assert(y.mate==x); + assert(x!=y); + } + assert(list.get(addIndex)==null); + list.set(addIndex, x); + addIndex++; + toMerge.clear(); + toMerge2.clear(); + } + } + } + + toMerge.add(r); + if(paired){toMerge2.add(r.mate);} + } + + if(!toMerge.isEmpty()){ + Read x=toMerge.get(0); + if(toMerge.size()>1){ + merged2+=toMerge.size()-1; + x=mergeReads(toMerge, true); + if(paired){ + Read y=mergeReads(toMerge2, true); + assert(x.mate==y); + assert(y.mate==x); + assert(x!=y); + } + } + assert(list.get(addIndex)==null); + list.set(addIndex, x); + addIndex++; + } + + for(int i=list.size()-1; i>=0 && list.get(i)==null; i--){list.remove(i);} + + if(REGENERATE_MATCH_STRING){regenMatchStrings(list);} + } + + + private void findAndRemoveSingletonDuplicatesOfPairs(ArrayList list){ + if(list==null || list.size()<2){return;} + assert(paired); + + Read current=null; + for(int i=0; i list, boolean retainPerfect){ + if(list==null || list.isEmpty()){return null;} + if(list.size()==1){return list.get(0);} + + //This block prevents 
the destruction of perfect reads. + { + Read a=list.get(0); + for(int i=1; i=0 && b<=3){ + count[b][i]+=r.copies; + qual[b][i]+=q; + maxQual[b][i]=Tools.max(q, maxQual[b][i]); + } + } + } + + int[] carray=new int[4]; + int[] qarray=new int[4]; + byte[] marray=new byte[4]; + + + byte[] bases=new byte[len]; + byte[] quality=new byte[len]; + + for(int i=0; i=0); + assert(q2>=0); + + if(b1==b2){ + r.quality[i]=q2; + if(b1=='N'){r.quality[i]=0;} + }else{ + if(b2=='N'){ + r.quality[i]=Tools.min(r.quality[i], (byte)2); + }else if(b1=='N'){ + r.bases[i]=b2; + r.quality[i]=q2; + killMatch=true; + }else{ + if(retain){ + r.quality[i]=Tools.max((byte)2, (byte)(q1-q2)); + }else if(q2-q1>10){ + r.bases[i]=b2; + r.quality[i]=q2; + killMatch=true; + }else if(q1<15 && q2>20){ + r.bases[i]=b2; + r.quality[i]=q2; + killMatch=true; + }else{ + r.quality[i]=Tools.max((byte)2, (byte)(q1-q2)); + } + } + } + } + assert(checkColorspace(r.bases, cs)): r.toText(false)+"\n"+Arrays.toString(bases)+"\n"+new String(bases); + + if(killMatch){r.match=null;} + + return r; + } + + private static boolean checkColorspace(byte[] bases, boolean cs){ + if(cs){ + for(int i=0; i0){ + if(score>bestScore){ + best=i; + bestScore=score; + }else if(score==bestScore){ + if(qual[i]>qual[best]){ + best=i; + bestScore=score; + }else if(qual[i]==qual[best] && count[i]>count[best]){ + best=i; + bestScore=score; + } + } + } + } + return best; + } + + + private void regenMatchStrings(ArrayList list){ + if(list==null || list.isEmpty()){return;} + + int needed=0; + for(Read r : list){ + if(r!=null){ + if(r.mapped() && r.match==null){ + needed++; + }else if(r.mate!=null && r.mate.mapped() && r.mate.match==null){ + needed++; + } + } + } + if(needed<1){return;} + + + final int lim=100; + +// System.err.println("Starting RMTs"); + RegenMatchThread[] rmt=new RegenMatchThread[Tools.max(1, Tools.min(REGEN_THREADS, needed/lim))]; + for(int i=0; i list2=new ArrayList(lim); + for(Read r : list){ + if(r!=null){ + boolean flag=false; 
+ if(r.mapped() && r.match==null){ + flag=true; + }else if(r.mate!=null && r.mate.mapped() && r.mate.match==null){ + flag=true; + } + if(flag){ + list2.add(r); + if(list2.size()>=lim){ + while(list2!=null){ + try { + REGEN_PIPE.put(list2); + list2=null; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + list2=new ArrayList(lim); + } + } + } + } + + if(list2!=null && list2.size()>0){ + while(list2!=null){ + try { + REGEN_PIPE.put(list2); + list2=null; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + +// System.err.println("Poisoning RMTs"); + //Poison + for(int i=0; i(0); + while(list2!=null){ + try { + REGEN_PIPE.put(list2); + list2=null; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + +// System.err.println("Joining RMTs"); + for(int i=0; i0; i--){ + sb.append('0'); + } + sb.append(num); + return sb; + } + + + private void addRead(Read r){ + Read r2=r.mate; + assert(r2==null || r.numericID==r2.numericID); + boolean swap=false; + if(SWAP_READ1_TO_PLUS && r2!=null){ + + if(r.paired() && r.mapped() && r.valid() && r2.mapped() && r2.valid()){ //Ideal pair + if(r.strand()==Gene.MINUS && r2.strand()==Gene.PLUS){swap=true;} + }else if(r.mapped() && r.valid() && r2.mapped() && r2.valid()){ + if(r.strand()==Gene.MINUS && r2.strand()==Gene.PLUS){swap=true;} + }else if(r.mapped() && r.valid()){ + if(r.strand()==Gene.MINUS){swap=true;} + }else if(r2.mapped() && r2.valid()){ + if(r2.strand()==Gene.PLUS){swap=true;} + } + } + + if(swap){ + r.setSwapped(true); + r2.setSwapped(true); + Read temp=r; + r=r2; + r2=temp; + } + assert(r2==null || (r.numericID==r2.numericID && r!=r2)); + + String key=makeKey(r); + +// String key=sb.toString(); + Block b=table.get(key); + if(b==null){ + //System.err.println("Created block "+key); + b=new Block(key, outname, r.chrom); + table.put(key, b); + } + b.add(r); + } + 
+ + public void finishWritingBlocks(){ + System.err.println("Called finishWritingBlocks()"); + int numWritten=0; + for(String key : table.keySet()){ + Block b=table.get(key); + b.finishWritingBuffer(); + numWritten++; + } + assert(numWritten==table.size()) : "Only wrote "+numWritten+" of "+table.size(); + } + + + private class Block{ + + public Block(String name_, String fname_, int chrom_){ + + if(DONT_COMPRESS_TEMP_FILES){ + if(fname_.endsWith(".gz") || fname_.endsWith(".zip") || fname_.endsWith(".bz2")){ + fname_=fname_.substring(0, fname_.lastIndexOf('.')); + } + } + + name=name_; + fname1=fname_.replaceFirst("#", "_msort_tempBlock_"+name+"_1"); + fname2=(!paired ? null : fname_.replaceFirst("#", "_msort_tempBlock_"+name+"_2")); + chrom=chrom_; +// Data.sysout.println(fname1); + if(fname1==null){ + assert(false); + outStream1=null; + writer1=null; + }else{ + outStream1=ReadWrite.getOutputStream(fname1, false, true, false); + writer1=new PrintWriter(outStream1); + } + + if(fname2==null){ + outStream2=null; + writer2=null; + }else{ + outStream2=ReadWrite.getOutputStream(fname2, false, true, false); + writer2=new PrintWriter(outStream2); + } + } + + public void add(Read r){ + buffer.add(r); + added++; + if(buffer.size()>=WRITE_BUFFER){ + writeBuffer(false); + } + } + + public void writeBuffer(boolean close){ + + written+=buffer.size(); + ArrayList temp=buffer; + buffer=(close ? 
null : new ArrayList(WRITE_BUFFER)); + + if(close){ +// System.err.println("Closing "+name+": "+ fname1+", "+fname2); + if(blockwriter1!=null){blockwriter1.addList(temp, writer1, outStream1, close);} + if(blockwriter2!=null){blockwriter2.addList(temp, writer2, outStream2, close);} + }else{ + if(blockwriter1!=null && temp!=null && !temp.isEmpty()){blockwriter1.addList(temp, writer1, outStream1, close);} + if(blockwriter2!=null && temp!=null && !temp.isEmpty()){blockwriter2.addList(temp, writer2, outStream2, close);} + } + + assert(added==written); +// buffer.clear(); + } + + public void finishWritingBuffer(){ + //System.err.println("Writing block "+name); + writeBuffer(true); + +// finishWriting(writer1, outStream1); +// if(fname2!=null){ +// finishWriting(writer2, outStream2); +// } + + } + + public synchronized ArrayList readBlock(){ + RTextInputStream temp=new RTextInputStream(fname1, fname2, -1); + ArrayList out=new ArrayList((int)written); + ArrayList reads=temp.nextList(); + while(reads!=null && reads.size()>0){ + out.addAll(reads); + numRead+=reads.size(); + reads=temp.nextList(); + } + temp.close(); + temp=null; + assert(numRead==written); + + return out; + } + + public synchronized void delete() { + if(fname1!=null){new File(fname1).delete();} + if(fname2!=null){new File(fname2).delete();} + } + + public final String name; + public final String fname1, fname2; + public final int chrom; //Necessary for unloading data + + public final OutputStream outStream1, outStream2; + public final PrintWriter writer1, writer2; + private ArrayList buffer=new ArrayList(WRITE_BUFFER); + + public long added=0, written=0, numRead=0; + } + + + + public static class ReadComparatorMapping implements Comparator { + + @Override + public int compare(Read a, Read b) { + + if(a.mate==null){ + int x=compare2(a, b); + if(x!=0){return x;} + return compare3(a, b); + }else{ + + if(a.mapped() && b.mapped()){ + int x=compare2(a, b); + if(x!=0){return x;} + + if(a.paired() && b.paired()){ + 
x=compare2(a.mate, b.mate); + if(x!=0){return x;} + x=compare3(a, b); + if(x!=0){return x;} + x=compare3(a.mate, b.mate); + return x; + }else{ + assert(!a.paired() && !b.paired()); + return compare3(a, b); + } + } + + if(!a.mapped() && !b.mapped()){ + int x=compare2(a.mate, b.mate); + if(x!=0){return x;} + return compare3(a.mate, b.mate); + }else if(a.mapped()){ + if(a.paired()){ + int x=compare2(a.mate, b.mate); + if(x!=0){return x;} + return -1; + }else{ + int x=compareCross(a, b.mate); + if(x!=0){return x;} + return -1; + } + }else if(b.mapped()){ + if(b.paired()){ + int x=compare2(a.mate, b.mate); + if(x!=0){return x;} + return 1; + }else{ + int x=compareCross(b, a.mate); + if(x!=0){return 0-x;} + return 1; + } + }else{ + assert(false) : a.mapped()+", "+a.paired()+", "+b.mapped()+", "+b.paired()+", "+a.mate.mapped()+", "+b.mate.mapped(); + } + + //I think this is unreachable... + return compare3(a, b); + } + } + + public int compare2(Read a, Read b) { + if(a.mapped() && !b.mapped()){return -1;} + if(b.mapped() && !a.mapped()){return 1;} + if(a.chrom!=b.chrom){return a.chrom-b.chrom;} + if(a.strand()!=b.strand()){return a.strand()-b.strand();} + + assert(!SAME_STRAND_PAIRS) : "TODO"; + if(a.strand()==Gene.PLUS){ + if(a.start!=b.start){return a.start-b.start;} + }else{ + if(a.stop!=b.stop){return a.stop-b.stop;} + } + + if(a.paired()!=b.paired()){return a.paired() ? -1 : 1;} + return 0; + } + + public int compareCross(Read a, Read b) { + if(a.mapped() && !b.mapped()){return -1;} + if(b.mapped() && !a.mapped()){return 1;} + if(a.chrom!=b.chrom){return a.chrom-b.chrom;} + if(SAME_STRAND_PAIRS){ + if(a.strand()!=b.strand()){ + return a.strand()-b.strand(); + } + }else{ + if(a.strand()==b.strand()){ + return a.strand()==0 ? -1 : 1; + } + } + if(a.start!=b.start){return a.start-b.start;} + if(a.paired()!=b.paired()){return a.paired() ? 
-1 : 1;} + return 0; + } + + public int compare3(Read a, Read b){ + if(a.bases.length!=b.bases.length){ + return b.bases.length-a.bases.length; //Preferentially puts longer reads first + } + if(a.perfect() != b.perfect()){return a.perfect() ? -1 : 1;} + int x; + + if(a.match!=null && b.match!=null){ + x=compareMatchStrings(a.match, b.match); + if(x!=0){return x;} + } + + assert(!SAME_STRAND_PAIRS) : "TODO"; + if(a.strand()==Gene.PLUS){ + if(a.start!=b.start){return a.start-b.start;} //This line should be dead code + if(a.stop!=b.stop){return a.stop-b.stop;} + }else{ + if(a.stop!=b.stop){return a.stop-b.stop;} //This line should be dead code + if(a.start!=b.start){return a.start-b.start;} + } + + x=compareVectors(a.quality, b.quality); + if(x!=0){return 0-x;} +// if(a.stop!=b.stop){return a.stop-b.stop;} + if(a.numericID!=b.numericID){return a.numericID>b.numericID ? 1 : -1;} + return a.id.compareTo(b.id); + } + + public int compareVectors(final byte[] a, final byte[] b){ + if(a==null || b==null){ + if(a==null && b!=null){return 1;} + if(a!=null && b==null){return -1;} + return 0; + } + final int lim=Tools.min(a.length, b.length); + for(int i=0; ib[i]){return 1;} + } + return 0; + } + + public int compareMatchStrings(final byte[] a, final byte[] b){ + if(a==null || b==null){ + if(a==null && b!=null){return 1;} + if(a!=null && b==null){return -1;} + return 0; + } + final int lim=Tools.min(a.length, b.length); + for(int i=0; i list=take(); !list.isEmpty(); list=take()){ + for(Read r : list){ + if(r!=null){ + final Read r2=r.mate; + if(r.mapped() && r.match==null && r.valid()){regenMatchString(r);} + if(r2!=null && r2.mapped() && r2.match==null && r.valid()){regenMatchString(r2);} + } + } + } + } + + private void regenMatchString(Read r){ + assert(r.match==null); +// (final Read r, final int padding, final boolean recur, final int minValidScore){ + tcr.realign_new(r, 4, true, 0, false); + r.setPerfectFlag(Integer.MAX_VALUE); + assert(!r.perfect() || 
r.stop-r.start==(r.bases.length-1)) : + "\n"+r.toText(false)+"\n"+new String(r.bases)+"\n"+new String(AminoAcid.reverseComplementBases(r.bases))+ + "\n"+Data.getChromosome(r.chrom).getString(r.topSite().start, r.topSite().stop)+"\n"; + + if(r.match!=null){ +// boolean xy=TranslateColorspaceRead.containsXY(r.match); + assert(TranslateColorspaceRead.verifyMatchString2(r, true)) : r.toText(false); + } + } + + private ArrayList take(){ + ArrayList list=null; + while(list==null){ + try { + list=REGEN_PIPE.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + return list; + } + + private final TranslateColorspaceRead tcr=null; /*new TranslateColorspaceRead(2000, 3000);*/ //Specific type needs to be specified. + } + + public final String outname; + private final ConcurrentReadStreamInterface cris; + private final ArrayBlockingQueue> REGEN_PIPE=new ArrayBlockingQueue>(40); + public long merged=0; + public long merged2=0; + public long removedSingletonDupe=0; + public long removedLQ=0; + public long removedShort=0; + public long processed=0; + public long basesInitiallyMapped=0; + public long basesOverlapping=0; + public long basesMapped=0; + public long basesRemoved=0; +// public long numSwapped=0; + private long readsWritten; + private long basesWritten; + private long validReadsWritten; + private long validBasesWritten; + + private boolean asymmetricReads=false; + + private final HashMap table=new HashMap(4096); + + public final boolean paired; + public final int blocksize; + + public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads. + public static boolean MOVE_SINGLETONS_TO_END=false; + + public static long READ_LIMIT=-1; //Max number of reads to process + public static final int WRITE_BUFFER=8000; //Bigger number uses more memory, for less frequent writes. 
+ public static final int MAX_BLOCKSIZE_TO_SORT=8000000; + public static boolean OVERWRITE=false; + + public static final boolean DONT_COMPRESS_TEMP_FILES=false; + public static boolean MERGE_DUPLICATES=false; + public static final boolean KILL_BAD_PAIRS=true; + public static boolean SAME_STRAND_PAIRS=false; + public static boolean REQUIRE_CORRECT_STRANDS_PAIRS=true; + public static boolean REMOVE_SINGLETON_DUPLICATES_OF_PAIRS=true; + public static boolean USE_STRICT_MERGE=false; + + public static boolean SWAP_READ1_TO_PLUS=false; + public static boolean MERGE_OPPOSITE_STRAND_DUPLICATES=false; //Requires SWAP_READ1_TO_PLUS=true + + public static final boolean UNLOAD_CHROMS_WHEN_DONE=true; + + public static boolean FIX_SHORT_PAIRED_READS=false; + + public static boolean TRIM_LOW_QUALITY_TAILS=false; + public static byte TRIM_QUALITY=7; + public static byte TRIM_WINDOW=3; + + public static boolean REGENERATE_MATCH_STRING=false; + public static int REGEN_THREADS=Shared.THREADS; + + private final ReadStreamWriter blockwriter1; + private final ReadStreamWriter blockwriter2; + +// private final TranslateColorspaceRead tcr2=new TranslateColorspaceRead(200, 2400); +} diff --git a/current/align2/SortReadsTopologically.java b/current/align2/SortReadsTopologically.java new file mode 100755 index 0000000..31b6626 --- /dev/null +++ b/current/align2/SortReadsTopologically.java @@ -0,0 +1,691 @@ +package align2; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Comparator; +import java.util.HashMap; +import java.util.zip.ZipOutputStream; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.ConcurrentSolidInputStream; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.FastqReadInputStream; +import stream.RTextInputStream; 
+import stream.Read; +import stream.ReadStreamStringWriter; +import stream.ReadStreamWriter; + +import dna.AminoAcid; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.FileFormat; + +public class SortReadsTopologically { + + + public static void main(String[] args){ + + String in1=null; + String in2=null; + String out="raw_tsorted#.txt.gz"; + int prefix=4; + + for(int i=0; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} +// System.err.println("Processing "+args[i]); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("i") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + if(b.indexOf('#')>=0){ + in1=b.replaceFirst("#", "1"); + in2=b.replaceFirst("#", "2"); + } + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("o") || a.equals("out") || a.equals("output")){ + out=b; + }else if(a.endsWith("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + Data.sysout.println("Set FASTQ.PARSE_CUSTOM to "+FASTQ.PARSE_CUSTOM); + }else if(a.endsWith("merge")){ + MERGE_DUPLICATES=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("overwrite") || a.equals("ow")){ + 
OVERWRITE=Tools.parseBoolean(b); + Data.sysout.println("Set OVERWRITE to "+OVERWRITE); + }else if(a.equals("prefix")){ + prefix=Integer.parseInt(b); + }else if(a.endsWith("blocksize")){ + MAX_BLOCKSIZE_TO_SORT=Integer.parseInt(b); + }else{ + throw new RuntimeException("Unknown parameter: "+args[i]); + } + } + + if(in1==null){throw new RuntimeException("Please specify input file.");} + if(out==null){throw new RuntimeException("Please specify output file.");} + if(in1.equalsIgnoreCase(in2) || in1.equalsIgnoreCase(out) || (in2!=null && in2.equalsIgnoreCase(out))){ + throw new RuntimeException("Duplicate filenames."); + } + + FileFormat ff=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, false); + boolean fastq=ff.fastq(); + boolean fasta=ff.fasta(); + boolean bread=ff.bread(); + + if(out!=null && !out.contains("#")){ + throw new RuntimeException("Output filename must contain '#' symbol."); + } + + SortReadsTopologically srt; + if(fasta){ + FastaReadInputStream fris1=new FastaReadInputStream(in1, false, (FASTQ.FORCE_INTERLEAVED && in2==null), true, in2==null ? Shared.READ_BUFFER_MAX_DATA : -1); + FastaReadInputStream fris2=(in2==null ? null : new FastaReadInputStream(in2, false, false, true, -1)); + ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, -1); + srt=new SortReadsTopologically(cris, out, prefix); + }else if(fastq){ + FastqReadInputStream fris1=new FastqReadInputStream(in1, false, true); + FastqReadInputStream fris2=(in2==null ? 
null : new FastqReadInputStream(in2, false, true)); + ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, -1); + srt=new SortReadsTopologically(cris, out, prefix); + }else{ + srt=new SortReadsTopologically(in1, in2, out, prefix); + } + + srt.processMT(); + if(MERGE_DUPLICATES){ + Data.sysout.println("Merged "+srt.merged+" duplicates of "+srt.processed+" total."); + if(srt.correctMerged>0 || srt.incorrectMerged>0){ + Data.sysout.println("Merged "+srt.correctMerged+" reads from same origin (correct)."); + Data.sysout.println("Merged "+srt.incorrectMerged+" reads from different origin (incorrect)."); + } + } + } + + public SortReadsTopologically(String fname1, String fname2, String outname_, int prefix_){ + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + + RTextInputStream rtis=new RTextInputStream(fname1, fname2, -1); + outname=outname_; + paired=rtis.paired(); + cris=new ConcurrentReadInputStream(rtis, -1); + prefix=prefix_; + assert(prefix<=5); + + blockwriter1=(fname1==null ? null : new ReadStreamStringWriter(null, true, 4, false)); + blockwriter2=(fname2==null ? null : new ReadStreamStringWriter(null, false, 4, false)); + } + + public SortReadsTopologically(String fname1, String q1, String fname2, String q2, String outname_, int prefix_){ + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + outname=outname_; + cris=new ConcurrentSolidInputStream(fname1, q1, fname2, q2, -1); + paired=cris.paired(); + prefix=prefix_; + assert(prefix<=5); + + blockwriter1=(fname1==null ? null : new ReadStreamStringWriter(null, true, 4, false)); + blockwriter2=(fname2==null ? 
null : new ReadStreamStringWriter(null, false, 4, false)); + } + + public SortReadsTopologically(ConcurrentReadStreamInterface cris_, String outname_, int prefix_){ + cris=cris_; + outname=outname_; + paired=cris.paired(); + prefix=prefix_; + assert(prefix<=5); + + blockwriter1=(new ReadStreamStringWriter(null, true, 4, false)); + blockwriter2=(!paired ? null : new ReadStreamStringWriter(null, false, 4, false)); + } + + public void processMT(){ + + final String fname1=outname.replaceFirst("#", "1"); + final String fname2=(!paired ? null : outname.replaceFirst("#", "2")); + if(fname1!=null && new File(fname1).exists()){ + if(OVERWRITE){new File(fname1).delete();} + else{throw new RuntimeException("Destination file "+fname1+" already exists.");} + } + if(fname2!=null && new File(fname2).exists()){ + if(OVERWRITE){new File(fname1).delete();} + else{throw new RuntimeException("Destination file "+fname2+" already exists.");} + } + + Timer t=new Timer(); + Timer total=new Timer(); + t.start(); + total.start(); + +// assert(false) : fname1+", "+fname2+", "+outname+", "+prefix; + + new Thread(cris).start(); + System.err.println("Started cris"); + + Thread bwt1=null, bwt2=null; + if(fname1!=null){ + bwt1=new Thread(blockwriter1); + bwt1.start(); + } + if(fname2!=null){ + bwt2=new Thread(blockwriter2); + bwt2.start(); + } + System.err.println("Started blockwriters"); + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + if(r.colorspace()){ + assert(prefix<=6); + }else{ + assert(prefix<=5); + } + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){addRead(r);} + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + } + + + synchronized(this){this.notifyAll();} + System.err.println("Notified all"); + + finishWritingBlocks(); + System.err.println("Wrote blocks"); + + if(bwt1!=null){blockwriter1.poison();} + if(bwt2!=null){blockwriter2.poison();} +// if(bwt1!=null){blockwriter1.addList(null);} +// if(bwt2!=null){blockwriter2.addList(null);} + + if(bwt1!=null){ + while(bwt1.isAlive()){ + try { + bwt1.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + if(bwt2!=null){ + while(bwt2.isAlive()){ + try { + bwt2.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + t.stop(); + Data.sysout.println("Temp Write Time: "+t); + t.start(); + + if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;} + ReadStreamWriter wt1=(fname1==null ? null : new ReadStreamStringWriter(fname1, true, 4, false)); + ReadStreamWriter wt2=(fname2==null ? null : new ReadStreamStringWriter(fname2, false, 4, false)); + + Thread wtt1=(wt1==null ? null : new Thread(wt1)); + Thread wtt2=(wt2==null ? 
null : new Thread(wt2)); + + if(wtt1!=null){wtt1.start();} + if(wtt2!=null){wtt2.start();} + + ArrayList keys=new ArrayList(table.size()); + keys.addAll(table.keySet()); + Collections.sort(keys); + + ReadComparatorTopological tcomp=new ReadComparatorTopological(); + + for(String key : keys){ + Block b=table.get(key); + table.remove(key); + processed+=b.added; + + if(b.added>MAX_BLOCKSIZE_TO_SORT){ + System.err.println("Skipping sorting for key "+key+" of size "+b.added); + RTextInputStream temp=new RTextInputStream(b.fname1, b.fname2, -1); + ArrayList reads=temp.nextList(); + while(reads!=null && reads.size()>0){ + if(reads!=null && reads.size()>0){ + if(wt1!=null){wt1.addList(reads);} + if(wt2!=null){wt2.addList(reads);} + } + b.numRead+=reads.size(); + reads=temp.nextList(); + } + temp.close(); + temp=null; + +// Data.sysout.println(key+"\t"+b.added); + b.delete(); + }else{ + ArrayList list=b.readBlock(); + if(PRINT_BLOCKS){Data.sysout.println(key+"\t"+list.size());} + b.delete(); + + Collections.sort(list, tcomp); + if(MERGE_DUPLICATES){ + int count; + count=mergeDuplicates(list, 0, 0, (byte)-99); + if(count>0){ + Tools.condense(list); + Collections.sort(list, tcomp); + } + count=mergeDuplicates(list, 1, 0, (byte)-99); +// if(count>0){ +// Tools.condense(list); +// Collections.sort(list, tcomp); +// } +// count=mergeDuplicates(list, 0, 1, (byte)2); + + Tools.condense(list); + Collections.sort(list, tcomp); + } + if(list!=null && list.size()>0){ + if(wt1!=null){wt1.addList(list);} + if(wt2!=null){wt2.addList(list);} + } + } + } + + //Add poison +// if(wt1!=null){wt1.addList(null);} +// if(wt2!=null){wt2.addList(null);} + if(wt1!=null){wt1.poison();} + if(wt2!=null){wt2.poison();} + + if(wtt1!=null){ + while(wtt1.isAlive()){ + try { + wtt1.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + if(wtt2!=null){ + while(wtt2.isAlive()){ + try { + wtt2.join(); + } catch (InterruptedException e) { + // 
TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + t.stop(); + total.stop(); + Data.sysout.println("Final Sort + Write Time: "+t); + Data.sysout.println("Total Time: "+total); + + } + + + private int mergeDuplicates(ArrayList list, int nmax, int mmax, byte qmax){ + if(list==null || list.size()<2){return 0;} + Read current=list.get(0); + + int correct=0; + int incorrect=0; + + int count=0; + for(int i=1; i0){ + if(r.chrom==current.chrom && r.start==current.start && r.stop==current.stop && r.strand()==current.strand()){ + correct++; + }else{ + incorrect++; + } + } + if(r2!=null && c2!=null && r2.originalSite!=null && c2.originalSite!=null){ + if(r2.originalSite.equals(c2.originalSite)){ + correct++; + }else{ + incorrect++; + } + }else if(r2!=null && c2!=null && r2.chrom>0){ + if(r2.chrom==c2.chrom && r2.start==c2.start && r2.stop==c2.stop && r2.strand()==c2.strand()){ + correct++; + }else{ + incorrect++; + } + } + } + current.merge(r, true, true); + list.set(i, null); + count++; + keep=true; + } + } + if(!keep){current=r;} + } + merged+=count; + correctMerged+=correct; + incorrectMerged+=incorrect; + return count; + } + + + private void addRead(Read r){ + StringBuilder sb=new StringBuilder(prefix); + boolean bad=false; + for(int i=0; i=0 && b<=3){ + sb.append((int)b); + }else{ + + if(AminoAcid.isFullyDefined(b)){ + sb.append((char)b); + }else{ + bad=true; + sb.append('N'); + } + } + + } + + String key=bad ? 
"ZN" : sb.toString(); +// String key=sb.toString(); + Block b=table.get(key); + if(b==null){ + //System.err.println("Created block "+key); + b=new Block(key, outname); + table.put(key, b); + } + b.add(r); + } + + + public void finishWritingBlocks(){ + System.err.println("Called finishWritingBlocks()"); + for(String key : table.keySet()){ + Block b=table.get(key); + b.finishWritingBuffer(); + } + } + + + + private static final void finishWriting(PrintWriter writer, OutputStream outStream){ + writer.flush(); + if(outStream.getClass()==ZipOutputStream.class){ + ZipOutputStream zos=(ZipOutputStream)outStream; + try { + zos.closeEntry(); + zos.finish(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + writer.close(); + try { + outStream.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + private class Block{ + + public Block(String name_, String fname_){ + + if(DONT_COMPRESS_TEMP_FILES){ + while(fname_.endsWith(".gz") || fname_.endsWith(".zip") || fname_.endsWith(".bz2")){ + fname_=fname_.substring(0, fname_.lastIndexOf('.')); + } + } + + name=name_; + fname1=fname_.replaceFirst("#", "_tsort_tempBlock_"+name+"_1"); + fname2=(!paired ? null : fname_.replaceFirst("#", "_tsort_tempBlock_"+name+"_2")); + + if(fname1==null){ + assert(false); + outStream1=null; + writer1=null; + }else{ + outStream1=ReadWrite.getOutputStream(fname1, false, true, false); + writer1=new PrintWriter(outStream1); + } + + if(fname2==null){ + outStream2=null; + writer2=null; + }else{ + outStream2=ReadWrite.getOutputStream(fname2, false, true, false); + writer2=new PrintWriter(outStream2); + } + } + + public void add(Read r){ + buffer.add(r); + added++; + if(buffer.size()>=WRITE_BUFFER){ + writeBuffer(false); + } + } + + public void writeBuffer(boolean close){ + + written+=buffer.size(); + ArrayList temp=buffer; + buffer=(close ? 
null : new ArrayList(WRITE_BUFFER)); + + if(close){ +// System.err.println("Closing "+name+": "+ fname1+", "+fname2); + if(blockwriter1!=null){blockwriter1.addList(temp, writer1, outStream1, close);} + if(blockwriter2!=null){blockwriter2.addList(temp, writer2, outStream2, close);} + }else{ + if(blockwriter1!=null && temp!=null && !temp.isEmpty()){blockwriter1.addList(temp, writer1, outStream1, close);} + if(blockwriter2!=null && temp!=null && !temp.isEmpty()){blockwriter2.addList(temp, writer2, outStream2, close);} + } + + assert(added==written); +// buffer.clear(); + } + + public void finishWritingBuffer(){ + //System.err.println("Writing block "+name); + writeBuffer(true); + +// finishWriting(writer1, outStream1); +// if(fname2!=null){ +// finishWriting(writer2, outStream2); +// } + + } + + public synchronized ArrayList readBlock(){ + RTextInputStream temp=new RTextInputStream(fname1, fname2, -1); + ArrayList out=new ArrayList((int)written); + ArrayList reads=temp.nextList(); + while(reads!=null && reads.size()>0){ + out.addAll(reads); + numRead+=reads.size(); + reads=temp.nextList(); + } + temp.close(); + temp=null; + assert(numRead==written); + + return out; + } + + public synchronized void delete() { + if(fname1!=null){new File(fname1).delete();} + if(fname2!=null){new File(fname2).delete();} + } + + public final String name; + public final String fname1, fname2; + + public final OutputStream outStream1, outStream2; + public final PrintWriter writer1, writer2; + private ArrayList buffer=new ArrayList(WRITE_BUFFER); + + public long added=0, written=0, numRead=0; + } + + public static class ReadComparatorTopological implements Comparator{ + + @Override + public int compare(Read r1, Read r2) { + return compare(r1, r2, true); + } + + public int compare(Read r1, Read r2, boolean compareMates) { + + int x=compareVectors(r1.bases, r2.bases); + if(x!=0){return x;} + + if(r1.mate!=null && r2.mate!=null){ + x=compareVectors(r1.mate.bases, r2.mate.bases); + } + 
if(x!=0){return x;} + + if(r1.bases!=null && r2.bases!=null && r1.bases.length!=r2.bases.length){return r1.bases.length-r2.bases.length;} + if(r1.mate!=null && r2.mate!=null && r1.mate.bases!=null && r2.mate.bases!=null + && r1.mate.bases.length!=r2.mate.bases.length){return r1.mate.bases.length-r2.mate.bases.length;} + + x=compareVectors(r1.quality, r2.quality); + if(x!=0){return 0-x;} + + if(r1.mate!=null && r2.mate!=null){ + x=compareVectors(r1.mate.quality, r2.mate.quality); + } + if(x!=0){return 0-x;} + + if(r1.numericID!=r2.numericID){return r1.numericID>r2.numericID ? 1 : -1;} + + return r1.id.compareTo(r2.id); + } + + public int compareVectors(final byte[] a, final byte[] b){ + if(a==null || b==null){ + if(a==null && b!=null){return 1;} + if(a!=null && b==null){return -1;} + return 0; + } + final int lim=Tools.min(a.length, b.length); + for(int i=0; ib[i]){return 1;} + } + return 0; + } + + public int compareVectorsN(final byte[] a, final byte[] b){ + if(a==null || b==null){ + if(a==null && b!=null){return 1;} + if(a!=null && b==null){return -1;} + return 0; + } + final int lim=Tools.min(a.length, b.length); + for(int i=0; ib[i]){return 1;} + } + return 0; + } + } + + public final String outname; + private final ConcurrentReadStreamInterface cris; + public long merged=0; + public long processed=0; + + public long correctMerged=0; + public long incorrectMerged=0; + + private final HashMap table=new HashMap(4096); + + public final boolean paired; + public final int prefix; + + public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads. + + public static final int WRITE_BUFFER=1000; //Bigger number uses more memory, for less frequent writes. 
+ public static int MAX_BLOCKSIZE_TO_SORT=16000000; + + public static final boolean DONT_COMPRESS_TEMP_FILES=false; + public static boolean MERGE_DUPLICATES=false; + public static boolean OVERWRITE=false; + public static boolean PRINT_BLOCKS=false; + + + private final ReadStreamWriter blockwriter1; + private final ReadStreamWriter blockwriter2; + + +} diff --git a/current/align2/SplitMappedReads.java b/current/align2/SplitMappedReads.java new file mode 100755 index 0000000..864185a --- /dev/null +++ b/current/align2/SplitMappedReads.java @@ -0,0 +1,321 @@ +package align2; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.zip.ZipOutputStream; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.SiteScore; + +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; + +public class SplitMappedReads { + + + public static void main(String[] args){ + + String reads1=args[0]; + String reads2=args[1].equalsIgnoreCase("null") ? null : args[1]; + String outname=args[2].equalsIgnoreCase("null") ? 
"" : args[2]; + + int minChrom=1; + int maxChrom=25; + if(args.length>3){ + minChrom=maxChrom=Byte.parseByte(args[3]); + if(args.length>4){ + maxChrom=Byte.parseByte(args[4]); + } + } + assert(minChrom<=maxChrom && minChrom>=0); + + SplitMappedReads smr=new SplitMappedReads(reads1, reads2, outname, minChrom, maxChrom); + smr.process(); + + } + + public SplitMappedReads(String fname1, String fname2, String outname_, int minChrom, int maxChrom){ + this(new RTextInputStream(fname1, fname2, -1), outname_, minChrom, maxChrom); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + } + + public SplitMappedReads(RTextInputStream stream_, String outname_, int minChrom, int maxChrom){ + stream=stream_; + outname=outname_; + paired=stream.paired(); +// assert(outname.contains("#")) : "Output file name must contain the character '#' to be used for chromosome number."; + + MIN_CHROM=minChrom; + MAX_CHROM=maxChrom; + assert(MIN_CHROM>=0); + assert(MAX_CHROM>=MIN_CHROM); + + outArraySingle1=new OutputStream[maxChrom+1]; + printArraySingle1=new PrintWriter[maxChrom+1]; + bufferArraySingle1=new ArrayList[maxChrom+1]; + for(int i=minChrom; i(WRITE_BUFFER); + outArraySingle1[i]=ReadWrite.getOutputStream(outname.replace("#", "single_1_chr"+i), false, true, false); + printArraySingle1[i]=new PrintWriter(outArraySingle1[i]); + printArraySingle1[i].println("#Chromosome "+i+" Read 1 Singletons"); + printArraySingle1[i].println("#"+Read.header()); + } + + if(!paired){ + outArraySingle2=null; + printArraySingle2=null; + bufferArraySingle2=null; + outArrayPaired1=null; + printArrayPaired1=null; + bufferArrayPaired1=null; + outArrayPaired2=null; + printArrayPaired2=null; + bufferArrayPaired2=null; + }else{ + + outArraySingle2=new OutputStream[maxChrom+1]; + printArraySingle2=new PrintWriter[maxChrom+1]; + bufferArraySingle2=new ArrayList[maxChrom+1]; + for(int i=minChrom; i(WRITE_BUFFER); + outArraySingle2[i]=ReadWrite.getOutputStream(outname.replace("#", 
"single_2_chr"+i), false, true, false); + printArraySingle2[i]=new PrintWriter(outArraySingle2[i]); + printArraySingle2[i].println("#Chromosome "+i+" Read 2 Singletons"); + printArraySingle2[i].println("#"+Read.header()); + } + + outArrayPaired1=new OutputStream[maxChrom+1]; + printArrayPaired1=new PrintWriter[maxChrom+1]; + bufferArrayPaired1=new ArrayList[maxChrom+1]; + for(int i=minChrom; i(WRITE_BUFFER); + outArrayPaired1[i]=ReadWrite.getOutputStream(outname.replace("#", "paired_1_chr"+i), false, true, false); + printArrayPaired1[i]=new PrintWriter(outArrayPaired1[i]); + printArrayPaired1[i].println("#Chromosome "+i+" Read 1 Paired"); + printArrayPaired1[i].println("#"+Read.header()); + } + + outArrayPaired2=new OutputStream[maxChrom+1]; + printArrayPaired2=new PrintWriter[maxChrom+1]; + bufferArrayPaired2=new ArrayList[maxChrom+1]; + for(int i=minChrom; i(WRITE_BUFFER); + outArrayPaired2[i]=ReadWrite.getOutputStream(outname.replace("#", "paired_2_chr"+i), false, true, false); + printArrayPaired2[i]=new PrintWriter(outArrayPaired2[i]); + printArrayPaired2[i].println("#Chromosome "+i+" Read 2 Paired"); + printArrayPaired2[i].println("#"+Read.header()); + } + + } + + cris=(USE_CRIS ? new ConcurrentReadInputStream(stream, -1) : null); + } + + public void process(){ + + Timer t=new Timer(); + t.start(); + + if(cris!=null){ + new Thread(cris).start(); + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + processReads(reads); + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + }else{ + ArrayList reads=stream.nextList(); + while(reads!=null && reads.size()>0){ + processReads(reads); + reads=stream.nextList(); + } + } + + synchronized(this){this.notifyAll();} + + finish(); + + t.stop(); + Data.sysout.println("Time:\t"+t); + } + + + + private void processReads(ArrayList reads){ + for(Read r : reads){ + addRead(r, 1); + if(r.mate!=null){ + addRead(r.mate, 2); + } + } + } + + + private void addRead(Read r, int side){ + + if(r.chrom<1 && r.numSites()>0){ + SiteScore ss=r.topSite(); //Should not be necessary + r.start=ss.start; + r.stop=ss.stop; + r.chrom=ss.chrom; + r.setStrand(ss.strand); + } + + //Ensure no superfluous data is written + r.sites=null; + r.originalSite=null; + r.obj=null; + +// System.err.println("Adding to chrom "+r.chrom+", side "+side+", paired="+r.paired+", "+(r.list==null ? "null" : r.list.size())); + if(r.chromMAX_CHROM){return;} + + final PrintWriter writer; + final ArrayList list; + + if(side==1){ + if(r.paired()){ + writer=printArrayPaired1[r.chrom]; + list=bufferArrayPaired1[r.chrom]; + }else{ + writer=printArraySingle1[r.chrom]; + list=bufferArraySingle1[r.chrom]; + } + }else{ + assert(side==2); + if(r.paired()){ + writer=printArrayPaired2[r.chrom]; + list=bufferArrayPaired2[r.chrom]; + }else{ + writer=printArraySingle2[r.chrom]; + list=bufferArraySingle2[r.chrom]; + } + } + + assert(list.size()=WRITE_BUFFER){ + writeList((ArrayList)list.clone(), writer); + list.clear(); + } + } + + + private void writeList(ArrayList list, PrintWriter writer){ + + synchronized(writer){ + for(Read r : list){ + writer.println(r.toText(true)); + } + } + } + + + public void finish(){ + + final PrintWriter[][] writers=new PrintWriter[][] {printArraySingle1, printArraySingle2, printArrayPaired1, printArrayPaired2}; + final OutputStream[][] streams=new OutputStream[][] {outArraySingle1, outArraySingle2, outArrayPaired1, outArrayPaired2}; + final ArrayList[][] buffers=new 
ArrayList[][] {bufferArraySingle1, bufferArraySingle2, bufferArrayPaired1, bufferArrayPaired2}; + + + for(int x=0; x[] bufferArray=buffers[x]; + + for(int i=0; printArray!=null && i list=bufferArray[i]; + + if(list!=null && !list.isEmpty()){ + writeList(list, writer); + list=null; + } + } + } + + //TODO: Wait for writing to finish, if it is done in threads. + + + for(int x=0; x[] bufferArraySingle1; + + private final OutputStream[] outArraySingle2; + private final PrintWriter[] printArraySingle2; + private final ArrayList[] bufferArraySingle2; + + private final OutputStream[] outArrayPaired1; + private final PrintWriter[] printArrayPaired1; + private final ArrayList[] bufferArrayPaired1; + + private final OutputStream[] outArrayPaired2; + private final PrintWriter[] printArrayPaired2; + private final ArrayList[] bufferArrayPaired2; + + private final int MIN_CHROM; + private final int MAX_CHROM; + + public final boolean paired; + + public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads. + + public static final int WRITE_BUFFER=400; //Bigger number uses more memory, for less frequent writes. 
+ + +} diff --git a/current/align2/SplitSamFile.java b/current/align2/SplitSamFile.java new file mode 100755 index 0000000..9b80a0e --- /dev/null +++ b/current/align2/SplitSamFile.java @@ -0,0 +1,65 @@ +package align2; + +import stream.SamLine; +import dna.Timer; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +public class SplitSamFile { + + + public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + TextFile tf=new TextFile(args[0], true, false); + String out1=args[1]; + String out2=args[2]; + assert(!out1.equalsIgnoreCase(out2)) : "Output files are the same."; + TextStreamWriter tsw1=new TextStreamWriter(out1, true, false, true); + TextStreamWriter tsw2=new TextStreamWriter(out2, true, false, true); + + tsw1.start(); + tsw2.start(); + + long plus=0; + long minus=0; + long other=0; + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + char c=s.charAt(0); + if(c!='@'/* && c!=' ' && c!='\t'*/){ + SamLine sl=new SamLine(s); + if(sl.mapped()){ + if(sl.strand()==0){ + tsw1.println(s); + plus++; + }else{ + tsw2.println(s); + minus++; + } + }else{ + other++; + } + } + } + tf.close(); + tsw1.poison(); + tsw2.poison(); + + System.err.println("Total reads: \t"+(plus+minus+other)); + System.err.println("Plus reads: \t"+(plus)); + System.err.println("Minus reads: \t"+(minus)); + System.err.println("Unmapped reads:\t"+(other)); + + tsw1.waitForFinish(); + tsw2.waitForFinish(); + t.stop(); + + System.err.println("Time: \t"+t); + + } + + +} diff --git a/current/align2/Tools.java b/current/align2/Tools.java new file mode 100755 index 0000000..d499217 --- /dev/null +++ b/current/align2/Tools.java @@ -0,0 +1,1548 @@ +package align2; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.concurrent.atomic.AtomicIntegerArray; +import java.util.concurrent.atomic.AtomicLongArray; + +import stream.SiteScore; + +import dna.Data; + +public 
final class Tools { + + /** Return true if the user seems confused */ + public static boolean parseHelp(String[] args){ + if(args==null || args.length==0 || (args.length==1 && args[0]==null)){return true;} + if(args.length>1){return false;} + final String s=args[0].toLowerCase(); + return s.equals("-h") || s.equals("-help") || s.equals("--help") + || s.equals("-version") || s.equals("--version") || s.equals("?") || s.equals("-?") || (s.equals("help") && !new File(s).exists()); + } + + /** Checks for permission to overwrite files, and output name collisions. */ + public static boolean testOutputFiles(boolean overwrite, boolean allowDuplicates, String...args){ + if(args==null || args.length==0){return true;} + HashSet set=new HashSet(args.length*2); + int terms=0; + for(String s : args){ + if(s!=null){ + if(isOutputFileName(s)){ + terms++; + + if(!overwrite && new File(s).exists()){ + assert(overwrite) : "File "+s+" exists and overwrite=false"; + return false; + } + + if(!allowDuplicates && set.contains(s)){ + assert(false) : "Duplicate file "+s+" was specified for multiple output streams."; + return false; + } + + set.add(s); + } + } + } + return true; + } + + public static final boolean canWrite(String s, boolean overwrite){ + if(isNullFileName(s) || isSpecialOutputName(s)){return true;} + File f=new File(s); + if(f.exists()){return overwrite && f.canWrite();} + return true; + } + +// public static final boolean outputDestinationExists(String s){ +// if(isNullFileName(s)){return false;} +// if(isSpecialOutputName(s)){return false;} +// File f=new File(s); +// return f.exists(); +// } + + public static final boolean isOutputFileName(String s){ + return !(isNullFileName(s) || isSpecialOutputName(s)); + } + + public static final boolean isNullFileName(String s){ + if(s==null || s.equalsIgnoreCase("null") || s.equalsIgnoreCase("none")){return true;} + for(int i=0; i int condense(ArrayList list){ + if(list==null || list.size()==0){return 0;} + int removed=0; + + for(int 
i=list.size()-1; i>0; i--){ + if(list.get(i)==null){ + removed++; + X last=list.get(list.size()-1); + list.set(i, last); + list.remove(list.size()-1); + } + } + return removed; + } + + /** Removes null elements by shrinking the list. Will not change list order. */ + public static final int condenseStrict(ArrayList list){ + if(list==null || list.size()==0){return 0;} + int removed=0; + + int insertPos=0; + for(int i=0; i ArrayList condenseNew(ArrayList list){ + ArrayList temp=new ArrayList(list.size()); + for(X x : list){ + if(x!=null){temp.add(x);} + } + return temp; + } + + //This should also be correct. I'm not sure which is faster. +// /** Removes null elements by shrinking the list. Will not change list order. */ +// public static final int condenseStrict(ArrayList list){ +// if(list==null || list.size()==0){return 0;} +// int removed=0; +// int last=0; +// +// for(int i=0; i ssl, float fractionOfMax, boolean retainPaired){ +//// assert(false); +// if(ssl==null || ssl.size()==0){return -999999;} +// if(ssl.size()==1){return ssl.get(0).score;} +// int maxScore=-999999; +// for(SiteScore ss : ssl){ +// maxScore=Tools.max(maxScore, ss.score); +// } +// +// int cutoff=(int) (maxScore*fractionOfMax); +// trimSitesBelowCutoff(ssl, cutoff, retainPaired); +//// trimSitesBelowCutoffInplace(ssl, cutoff); +// return maxScore; +// } + + /** minSitesToRetain should be set to 1 if the list is not sorted by score (for efficiency of removal). Otherwise, it can be higher. 
*/ + public static final int trimSiteList(ArrayList ssl, float fractionOfMax, boolean retainPaired, boolean retainSemiperfect, + int minSitesToRetain, int maxSitesToRetain){ +// assert(false); + if(ssl==null || ssl.size()==0){return -999999;} + if(ssl.size()==1){return ssl.get(0).score;} + int maxScore=-999999; + + if(minSitesToRetain>1 && minSitesToRetain ssl, int cutoff, boolean retainPaired, boolean retainSemiperfect, + int minSitesToRetain, int maxSitesToRetain){ +// assert(false); + if(ssl==null || ssl.size()==0){return;} + if(ssl.size()==1){return;} + + trimSitesBelowCutoff(ssl, cutoff, retainPaired, retainSemiperfect, minSitesToRetain, maxSitesToRetain); + } + + public static final > boolean inOrder(ArrayList list){ + if(list==null || list.size()<2){return true;} + for(int i=1; i0){return false;} + } + return true; + } + + + + public static final int mergeDuplicateSites(ArrayList list, boolean doAssertions, boolean mergeDifferentGaps){ + if(list==null || list.size()<2){return 0;} + Collections.sort(list, SiteScore.PCOMP); + + int removed=0; + + SiteScore a=list.get(0); + for(int i=1; ib.score || a.slowScore>b.slowScore)))){ + throw new RuntimeException("\n"+SiteScore.header()+"\n"+a.toText()+"\n"+b.toText()+"\n"); + } + + assert(a.perfect==b.perfect || + (a.perfect && (a.score>b.score || a.slowScore>b.slowScore))) : + "\n"+SiteScore.header()+"\n"+a.toText()+"\n"+b.toText()+"\n"; + } + + a.score=max(a.score, b.score); + a.slowScore=max(a.slowScore, b.slowScore); + a.pairedScore=max(a.pairedScore, b.pairedScore); + a.perfect=(a.perfect || b.perfect); + if(a.pairedScore>0 && a.pairedScore<=a.score){a.pairedScore=a.score+1;} + + removed++; + list.set(i, null); + }else if(mergeDifferentGaps && a.positionalMatch(b, false)){ //Same outermost boundaries, different gaps + + SiteScore better=null; + if(a.score!=b.score){ + better=(a.score>b.score ? a : b); + }else if(a.slowScore!=b.slowScore){ + better=(a.slowScore>b.slowScore ? 
a : b); + }else if(a.pairedScore!=b.pairedScore){ + better=(a.pairedScore>b.pairedScore ? a : b); + }else{ + better=a; + } + + a.score=max(a.score, b.score); + a.slowScore=max(a.slowScore, b.slowScore); + a.pairedScore=max(a.pairedScore, b.pairedScore); + a.perfect=(a.perfect || b.perfect); + if(a.pairedScore>0 && a.pairedScore<=a.score){a.pairedScore=a.score+1;} + a.gaps=better.gaps; + + removed++; + list.set(i, null); + } + else{ + a=b; + } + } + +// if(removed>0){condense(list);} + if(removed>0){condenseStrict(list);} + return removed; + } + + + + public static final int subsumeOverlappingSites(ArrayList list, boolean subsumeIfOnlyStartMatches, boolean subsumeInexact){ + if(list==null || list.size()<2){return 0;} + Collections.sort(list, SiteScore.PCOMP); + + int removed=0; + + + for(int i=0; ia.start); + if(overlappingA && a.strand==b.strand){ + + SiteScore better=null; + if(a.perfect!=b.perfect){ + better=a.perfect ? a : b; + }if(a.semiperfect!=b.semiperfect){ + better=a.semiperfect ? a : b; + }else if(a.score!=b.score){ + better=(a.score>b.score ? a : b); + }else if(a.slowScore!=b.slowScore){ + better=(a.slowScore>b.slowScore ? a : b); + }else if(a.pairedScore!=b.pairedScore){ + better=(a.pairedScore>b.pairedScore ? a : b); + }else if(a.pairedScore!=b.pairedScore){ + better=(a.quickScore>b.quickScore ? 
a : b); + }else{ + better=a; + } + +// if((a.perfect && b.perfect) || (a.semiperfect && b.semiperfect)){ + if(a.semiperfect && b.semiperfect){ + if(a.start==b.start || a.stop==b.stop){ + list.set(i, better); + list.set(j, null); + removed++; + a=better; + }else{ + //retain both of them + } + }else if(a.perfect || b.perfect){ + list.set(i, better); + list.set(j, null); + removed++; + a=better; + }else if(a.semiperfect || b.semiperfect){ + if(a.start==b.start && a.stop==b.stop){ + list.set(i, better); + list.set(j, null); + removed++; + a=better; + }else{ + //retain both of them + } + }else if(subsumeInexact || (a.start==b.start && (subsumeIfOnlyStartMatches || a.stop==b.stop))){ + assert(!a.semiperfect && !a.perfect && !b.semiperfect && !b.perfect); + a.start=min(a.start, b.start); + a.stop=max(a.stop, b.stop); + a.score=max(a.score, b.score); + a.slowScore=max(a.slowScore, b.slowScore); + a.pairedScore=max(a.pairedScore, b.pairedScore); + a.quickScore=max(a.quickScore, b.quickScore); + if(a.pairedScore>0 && a.pairedScore<=a.score){a.pairedScore=a.score+1;} + a.gaps=better.gaps;//Warning! Merging gaps would be better; this could cause out-of-bounds. + //TODO: Test for a subsumption length limit. + list.set(j, null); + removed++; + } + } + } + } + } + } + +// if(removed>0){condense(list);} + if(removed>0){condenseStrict(list);} + return removed; + } + + + + public static final int removeOverlappingSites(ArrayList list, boolean requireAMatchingEnd){ + if(list==null || list.size()<2){return 0;} + Collections.sort(list, SiteScore.PCOMP); + + int removed=0; + + + for(int i=0; ia.start); + if(overlappingA && a.strand==b.strand){ + + SiteScore better=null; + if(a.perfect!=b.perfect){ + better=a.perfect ? a : b; + }else if(a.score!=b.score){ + better=(a.score>b.score ? a : b); + }else if(a.slowScore!=b.slowScore){ + better=(a.slowScore>b.slowScore ? a : b); + }else if(a.pairedScore!=b.pairedScore){ + better=(a.pairedScore>b.pairedScore ? 
a : b); + }else if(a.pairedScore!=b.pairedScore){ + better=(a.quickScore>b.quickScore ? a : b); + }else{ + better=a; + } + + if(a.start==b.start && a.stop==b.stop){ + list.set(i, better); + list.set(j, null); + a=better; + removed++; + }else if(a.start==b.start || a.stop==b.stop){ //In this case they cannot both be perfect + list.set(i, better); + list.set(j, null); + a=better; + removed++; + }else if(!requireAMatchingEnd && a.score!=b.score){ + list.set(i, better); + list.set(j, null); + a=better; + removed++; + } + } + } + } + } + } + +// if(removed>0){condense(list);} + if(removed>0){condenseStrict(list);} + return removed; + } + + + + /** Returns the number of sitescores in the list within "thresh" of the top score. Assumes list is sorted descending. + * This is used to determine whether a mapping is ambiguous. */ + public static final int countTopScores(ArrayList list, int thresh){ + assert(thresh>=0) : thresh; + if(list==null || list.isEmpty()){return 0;} + int count=1; + final SiteScore ss=list.get(0); + final int limit=ss.score-thresh; + + for(int i=1; i list, int maxSwScore, float multSingle, float multPaired){ + if(list==null || list.size()==0){return 0;} + + assert(multSingle>=multPaired); + + int initialSize=list.size(); + final int swScoreThresh=(int)(maxSwScore*multSingle); //Change low-quality alignments to no-hits. + final int swScoreThreshPaired=(int)(maxSwScore*multPaired); + if(list.get(0).score=0; i--){ + SiteScore ss=list.get(i); + assert(ss.score==ss.slowScore); + assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!"; + if(ss.pairedScore>0){ + assert(ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : ss; + if(ss.slowScore list, int maxSwScore, float multSingle){ +// if(list==null || list.size()==0){return 0;} +// +// int initialSize=list.size(); +// final int swScoreThresh=(int)(maxSwScore*multSingle); //Change low-quality alignments to no-hits. 
+// if(list.get(0).score=0; i--){ +// for(int i=list.size()-1; i>1; i--){ +// SiteScore ss=list.get(i); +// assert(ss.score==ss.slowScore); +// assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!"; +// assert(ss.pairedScore==0) : ss.toText(); +// if(ss.slowScore list, int thresh){ + if(list==null || list.size()==0){return 0;} + + int initialSize=list.size(); + if(list.get(0).score=0; i--){ + for(int i=list.size()-1; i>1; i--){ + SiteScore ss=list.get(i); + assert(ss.score==ss.slowScore); + assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!"; + assert(ss.pairedScore==0) : ss.toText(); + if(ss.slowScore list, int maxSwScore, float multSingle, float multPaired, int expectedSites){ + if(list==null || list.size()==0){return 0;} + + assert(multSingle>=multPaired); + + int initialSize=list.size(); + final int swScoreThresh=(int)(maxSwScore*multSingle); //Change low-quality alignments to no-hits. + final int swScoreThreshPaired=(int)(maxSwScore*multPaired); + final int swScoreThresh2=(int)(maxSwScore*multSingle*1.2f); + final int swScoreThreshPaired2=(int)(maxSwScore*multPaired*1.1f); + if(list.get(0).scoremin; i--){ + if(list.get(i).slowScore>=nthBest){break;} + list.remove(i); + } + + for(int i=list.size()-1; i>=0; i--){ + SiteScore ss=list.get(i); + assert(ss.score==ss.slowScore); + assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!"; + if(ss.pairedScore>0){ + int thresh=(i>=expectedSites ? swScoreThreshPaired2 : swScoreThreshPaired); + assert(ss.pairedScore>ss.quickScore || ss.pairedScore>ss.slowScore) : ss; + if(ss.slowScore=expectedSites ? 
swScoreThresh2 : swScoreThresh); + assert(ss.pairedScore==0) : ss.toText(); + if(ss.slowScore list, int maxSwScore, float multSingle, int expectedSites){ + if(list==null || list.size()==0){return 0;} + + for(int i=expectedSites/2; imin; i--){ + if(list.get(i).slowScore>=nthBest){break;} + list.remove(i); + } + +// for(int i=list.size()-1; i>=0; i--){ + for(int i=list.size()-1; i>=1; i--){ + SiteScore ss=list.get(i); + assert(ss.score==ss.slowScore); + assert(i==0 || ss.slowScore<=list.get(i-1).slowScore) : "List is not sorted by singleton score!"; + assert(ss.pairedScore==0) : ss.toText(); + int thresh=(i>=expectedSites ? swScoreThresh2 : swScoreThresh); + if(ss.slowScore ssl, int cutoff, boolean retainPaired){ +// trimSitesBelowCutoff(ssl, cutoff, retainPaired, 1); +// } + + +// public static final void trimSitesBelowCutoff(ArrayList ssl, int cutoff, boolean retainPaired, int minSitesToRetain){ +//// assert(false); +// assert(minSitesToRetain>=1); +// if(ssl==null || ssl.size() ssl2=new ArrayList(ssl.size()); +// for(SiteScore ss : ssl){ +// if(ss.score>=cutoff || (retainPaired && ss.pairedScore>0)){ +// ssl2.add(ss); +// } +// } +// +//// Collections.sort(ssl2); +//// System.err.println("Cutoff: "+cutoff); +//// for(SiteScore ss : ssl2){ +//// System.err.print("("+ss.chrom+", "+ss.score+"), "); +//// } +//// System.err.println(); +// +// if(ssl2.size()==ssl.size()){return;} +//// System.err.println("cutoff: "+cutoff+",\tsize: "+ssl.size()+" -> "+ssl2.size()); +// ssl.clear(); +// ssl.addAll(ssl2); +// } + + + public static final void trimSitesBelowCutoff(ArrayList ssl, int cutoff, boolean retainPaired, boolean retainSemiperfect, + int minSitesToRetain, int maxSitesToRetain){ +// assert(false); + assert(minSitesToRetain>=1); + assert(maxSitesToRetain>minSitesToRetain); + if(ssl==null || ssl.size()<=minSitesToRetain){return;} + while(ssl.size()>maxSitesToRetain){ssl.remove(ssl.size()-1);} + + int removed=0; + final int maxToRemove=ssl.size()-minSitesToRetain; + + 
assert(minSitesToRetain==1 || inOrder(ssl)); + + if(retainPaired){ + for(int i=ssl.size()-1; i>=0; i--){ + SiteScore ss=ssl.get(i); + if(!retainSemiperfect || !ss.semiperfect){ + if(ss.score=maxToRemove){ + assert(removed==maxToRemove); + break; + } + } + } + } + }else{ + for(int i=ssl.size()-1; i>=0; i--){ + SiteScore ss=ssl.get(i); + if(!retainSemiperfect || !ss.semiperfect){ + if(ss.score=maxToRemove){ + assert(removed==maxToRemove); + break; + } + } + } + } + } + + if(removed>0){ + condenseStrict(ssl); + } + assert(ssl.size()>=minSitesToRetain); + } + + //Messes up order +// public static final void trimSitesBelowCutoffInplace(ArrayList ssl, int cutoff, boolean retainPaired){ +//// assert(false); +// if(ssl==null || ssl.size()<2){return;} +// +// for(int i=0; i=count[i-1]) : "\n\ncount["+i+"]="+count[i]+"\ncount["+(i-1)+"]="+count[i-1]+"\n"; + } + + int pos=count.length-1; + for(int sum=0; pos>1 && summaxLengthToKeep2){data[i]=null;} + } + } + + public static int findLimitForHighFreqEntries(int[][] data, float fractionToExclude){ + if(fractionToExclude<=0){return Integer.MAX_VALUE;} + int[] count=new int[data.length]; + + long numBases=0; + + for(int i=0; i=count[i-1]) : "\n\ncount["+i+"]="+count[i]+"\ncount["+(i-1)+"]="+count[i-1]+"\n"; + } + + int pos=count.length-1; + for(int sum=0; pos>1 && sum=minLength){ + if(isClumpy(array, maxDist, fraction)){ + removedSites+=array.length; + removedKeys++; + data[i]=null; + } + } + } + +// System.err.println("Removed\t"+removedSites+"\t/ "+total+"\tsites," + +// " or "+String.format("%.4f", (removedSites*100f/total))+"%"); +// System.err.println("Removed\t"+removedKeys+"\t/ "+data.length+"\tkeys," + +// " or "+String.format("%.4f", (removedKeys*100f/data.length))+"%"); + + } + + public static HashSet banClumpyEntries(final int[][] data, final int maxDist, final int minLength, final float fraction){ + + HashSet set=new HashSet(128); + + long total=0; + long removedSites=0; + long removedKeys=0; + + if(maxDist<=0){return 
set;} + + for(int i=0; i=minLength){ + if(isClumpy(array, maxDist, fraction)){ + removedSites+=array.length; + removedKeys++; + set.add(i); + } + } + } + +// System.err.println("Banned\t"+removedSites+"\t/ "+total+"\tsites," + +// " or "+String.format("%.4f", (removedSites*100f/total))+"%"); +// System.err.println("Banned\t"+removedKeys+"\t/ "+data.length+"\tkeys," + +// " or "+String.format("%.4f", (removedKeys*100f/data.length))+"%"); + + return set; + + } + + public static final boolean isClumpy(final int[] array, final int maxDist, final float fraction){ + if(array==null){return false;} + int count=0; + for(int i=1; i=(array.length*fraction); + } + + public static int[] makeLengthHistogram(int[][] x, int buckets) { + int[] lengths=new int[x.length]; + long total=0; + for(int i=0; i10000000000000L){ + div=1000000000000L; + ext="T"; + }else if(x>10000000000L){ + div=1000000000L; + ext="B"; + }else if(x>10000000){ + div=1000000; + ext="M"; + }else if(x>100000){ + div=1000; + ext="K"; + } + return String.format("%.2f", x/div)+ext; + } + + public static long parseKMG(String b){ + + char c=Character.toLowerCase(b.charAt(b.length()-1)); + if(!Character.isLetter(c) && !b.contains(".")){ + return Long.parseLong(b); + } + + long mult=1; + if(Character.isLetter(c)){ + if(c=='k'){mult=1000;} + else if(c=='m'){mult=1000000;} + else if(c=='g' || c=='b'){mult=1000000000;} + else if(c=='t'){mult=1000000000000L;} + else{throw new RuntimeException(b);} + b=b.substring(0, b.length()-1); + } + + return ((long)Double.parseDouble(b))*mult; + + } + + public static boolean parseBoolean(String s){ + if(s==null || s.length()<1){return true;} + if(s.length()==1){ + char c=Character.toLowerCase(s.charAt(0)); + return c=='t' || c=='1'; + } + if(s.equalsIgnoreCase("null") || s.equalsIgnoreCase("none")){return false;} + return Boolean.parseBoolean(s); + } + + public static int parseInt(byte[] array, int a, int b){ + assert(b>a); + int r=0; + final byte z='0'; + boolean negative=false; + 
if(array[a]=='-'){negative=true; a++;} + for(; a=0) : x+" = "+(char)array[a]+"\narray="+new String(array)+", start="+a+", stop="+b; + r=(r*10)+x; + } + if(negative){r*=-1;} + return r; + } + + /** TODO: This (temporarily) uses a lot of memory. Could be reduced by making an array of length max(x) and counting occurrences. */ + public static int[] makeLengthHistogram2(int[] x, int buckets, boolean verbose) { + int[] lengths=Arrays.copyOf(x, x.length); + long total=sum(x); + Arrays.sort(lengths); + + if(verbose){ + System.out.println("Length array size:\t"+x.length); + System.out.println("Min value: \t"+lengths[0]); + System.out.println("Med value: \t"+lengths[lengths.length/2]); + System.out.println("Max value: \t"+lengths[lengths.length-1]); + System.out.println("Total: \t"+total); + } + + int[] hist=new int[buckets+1]; + + long sum=0; + int ptr=0; + for(int i=0; ix.length){ + Data.sysout.println("Reverted to old histogram mode."); + return makeLengthHistogram2(x, buckets, verbose); + } + + int[] counts=new int[max+1]; + long total=0; + for(int i=0; i=0){ + counts[a]++; + total+=a; + } + } + + return makeLengthHistogram4(counts, buckets, total, verbose); + } + + /** Uses counts of occurrences of lengths rather than raw lengths */ + public static int[] makeLengthHistogram4(int[] counts, int buckets, long total, boolean verbose) { + if(total<=0){ + total=0; + for(int i=1; i=target){ + return i; + } + } + return array.length-1; + } + + public static int absdif(int a, int b) { + return a>b ? a-b : b-a; + } + + public static float absdif(float a, float b) { + return a>b ? a-b : b-a; + } + + public static double absdif(double a, double b) { + return a>b ? a-b : b-a; + } + + /** Uses unsigned math */ + public static final int absdifUnsigned(int a, int b){ + return (a<0 == b<0) ? a>b ? 
a-b : b-a : Integer.MAX_VALUE; + } + + public static final boolean overlap(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=b1 && b2>=a1; + } + + public static final int overlapLength(int a1, int b1, int a2, int b2){ + if(!overlap(a1,b1,a2,b2)){return 0;} + if(a1<=a2){ + return b1>=b2 ? b2-a2+1 : b1-a2+1; + }else{ + return b2>=b1 ? b1-a1+1 : b2-a1+1; + } + } + + /** Is (a1, b1) within (a2, b2) ? */ + public static final boolean isWithin(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a1>=a2 && b1<=b2; + } + + public static final int constrict(int point, int a, int b){ + assert(a<=b); + return(pointb ? b : point); + } + + public static final int indexOf(byte[] array, byte b){ + int i=0; + while(i=0 && array[i]!=b){i--;} + return i; + } + + public static final int stringLength(long x){ + if(x<0){ + if(x==Integer.MIN_VALUE){return 11;} + return lengthOf(-x)+1; + } + return lengthOf(x); + } + + public static final int stringLength(int x){ + if(x<0){ + if(x==Long.MIN_VALUE){return 20;} + return lengthOf(-x)+1; + } + return lengthOf(x); + } + + public static final int lengthOf(int x){ + assert(x>=0); + int i=1; + while(x>ilens[i]){i++;} + return i; + } + + public static final int lengthOf(long x){ + assert(x>=0); + int i=1; + while(x>llens[i]){i++;} + return i; + } + + public static final int max(int[] array){return array[maxIndex(array)];} + + public static final int maxIndex(int[] array){ + int max=array[0], maxIndex=0; + for(int i=1; imax){max=array[i];maxIndex=i;} + } + return maxIndex; + } + + public static final double standardDeviation(long[] numbers){ + if(numbers==null || numbers.length<1){return 0;} + long sum=sum(numbers); + double avg=sum/(double)numbers.length; + double sumdev2=0; + for(int i=0; i=0); + long[] r=new long[bins]; + if(bins==0){return r;} + double mult=bins/(double)array.length; + for(int i=0; iy ? 
x : y;} + public static final int min(int x, int y, int z){return xy ? (x>z ? x : z) : (y>z ? y : z);} + public static final int min(int x, int y, int z, int z2){return min(min(x,y), min(z,z2));} + public static final int max(int x, int y, int z, int z2){return max(max(x,y), max(z,z2));} + + //Median of 3 + public static final int mid(int x, int y, int z){return xy ? x : y;} + + public static final char min(char x, char y){return xy ? x : y;} + + public static final byte min(byte x, byte y, byte z){return xy ? max(x, z) : max(y, z);} + + public static final byte min(byte x, byte y, byte z, byte a){return min(min(x, y), min(z, a));} + public static final byte max(byte x, byte y, byte z, byte a){return max(max(x, y), max(z, a));} + + public static final long min(long x, long y){return xy ? x : y;} + + public static final long min(long x, long y, long z){return xy ? (x>z ? x : z) : (y>z ? y : z);} + + public static final double min(double x, double y){return xy ? x : y;} + + public static final float min(float x, float y){return xy ? 
x : y;} + + public static final int min(int[] array, int fromIndex, int toIndex){ + int min=array[fromIndex]; + for(int i=fromIndex+1; i<=toIndex; i++){ + min=min(min, array[i]); + } + return min; + } + + public static final int max(int[] array, int fromIndex, int toIndex){ + int max=array[fromIndex]; + for(int i=fromIndex+1; i<=toIndex; i++){ + max=max(max, array[i]); + } + return max; + } + + public static int minIndex(int[] array) { + if(array==null || array.length<1){return -1;} + float min=array[0]; + int index=0; + for(int i=1; i list=tcr.toVars(r2, true, true, false); + if(verbose && list!=null){ + for(Varlet v : list){ + System.err.println(v); + } + } + } + + } + t.stop(); + float kbps=(length*(long)rounds)*1000000f/t.elapsed; + System.err.println("Time: "+t+"\t("+String.format("%.2f", kbps)+" kbps)"); + return correct; + } + + private static CharSequence toString(byte[][] crbmq) { + StringBuilder sb=new StringBuilder(); + for(int i=0; i<2; i++){ + if(crbmq[i]==null){sb.append("null");} + else{ + for(byte b : crbmq[i]){ + if(b=='N'){sb.append('N');} + else{sb.append((char)(b+'0'));} + } + sb.append('\n'); + } + } + sb.append(new String(crbmq[2])); + sb.append('\n'); + sb.append(crbmq[3]==null ? "null" : new String(crbmq[3])); + sb.append('\n'); + return sb; + } + + private static String toStringCS(byte[] colors){ + StringBuilder sb=new StringBuilder(colors.length); + for(byte b : colors){ + if(b>3){sb.append((char)b);} + else{sb.append((char)(b+'0'));} + } + sb.append('\n'); + return sb.toString(); + } + + public void realign_new(final Read r, final int padding, final boolean recur, final int minValidScore, boolean forbidIndels){ + realign_new(r, r.colorspace() ? msaCS : msaBS, padding, recur, minValidScore, forbidIndels); + } + + public void realignByReversingRef(final Read r, final int padding, final boolean recur){ + realignByReversingRef(r, r.colorspace() ? 
msaCS : msaBS, padding, recur); + } + + /** This aligns a read with the reference, and generates the match string. */ + public static void realignByReversingRef(final Read r, final MSA msa, int padding, final boolean recur){ + if(r.shortmatch()){ + r.match=null; + r.setShortMatch(false); + } +// assert(r.colorspace()); +// assert(msa.colorspace); + padding=Tools.min(padding, (msa.maxColumns-r.bases.length)/2-20); + padding=Tools.max(padding, 0); + assert(r.colorspace()==msa.colorspace); + final ChromosomeArray chacs=Data.getChromosome(r.chrom); + if(verbose){ + System.err.println("Realigning."); + System.err.println("Original: "+r.start+", "+r.stop+", "+Gene.strandCodes[r.strand()]); + } + + { + assert(r.stop>=r.start); //Otherwise this is pointless... + int a=r.bases.length; + int b=r.stop-r.start+1; + if(bmaxI); + + byte[][] matchR=new byte[1][]; + if(r.match!=null && r.match.length==r.bases.length){ + matchR[0]=r.match; + }else{ + // System.err.println(new String(r.match)); + matchR[0]=r.match=new byte[r.bases.length]; + } + int scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(r.bases, chacs.array, r.start, matchR); + r.match=matchR[0]; + + if(scoreNoIndel>=maxI){ + if(verbose){System.err.println("Quick match.");} +// assert(r.match[0]!='X') : r.toText(false); +// assert(r.match[r.match.length-1]!='X') : r.toText(false); + // assert(r.stop==r.start+r.bases.length-1); + r.stop=r.start+r.bases.length-1; + r.mapScore=scoreNoIndel; + }else{ + if(verbose){System.err.println("Slow match.");} + +// int minLoc=Tools.max(r.start-padding, chacs.minIndex); + int minLoc=Tools.max(r.start-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N' + int maxLoc=Tools.min(r.stop+padding, chacs.maxIndex); + + //These assertions are not too important... they indicate the read mapped off the end of the chromosome. 
+ assert(minLoc<=r.start) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+ + ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false); + assert(maxLoc>=r.stop) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+ + ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false); + + // System.err.println("Aligning:\n"+new String(r.bases)+"\n"+chacs.getString(minLoc, maxLoc)); + int[] max=msa.fillLimited(r.bases, chacs.array, minLoc, maxLoc, scoreNoIndel, r.gaps); + // System.err.println(Arrays.toString(max)); + r.match=msa.traceback(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null); +// System.err.println(new String(r.match)); + int[] score=msa.score(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null); +// System.err.println(Arrays.toString(score)); + r.start=score[1]; + r.stop=score[2]; + r.mapScore=score[0]; + // System.err.println(Arrays.toString(score)); + // assert(false); + } + }else{ + assert(maxQ>maxI); + + byte[][] matchR=new byte[1][]; + if(r.match!=null && r.match.length==r.bases.length){ + matchR[0]=r.match; + }else{ + // System.err.println(new String(r.match)); + matchR[0]=r.match=new byte[r.bases.length]; + } + + int scoreNoIndel=-9999; + if(r.bases.length==(r.stop-r.start+1)){ + + byte[] ref=chacs.getBytes(r.start, r.stop); + if(r.colorspace()){ + Tools.reverseInPlace(ref); + }else{ + AminoAcid.reverseComplementBasesInPlace(ref); + } + scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(r.bases, ref, 0, matchR); + r.match=matchR[0]; + } + + if(scoreNoIndel>=maxI){ + if(verbose){System.err.println("Quick match.");} + assert(r.match[0]!='X') : r.toText(false); + assert(r.match[r.match.length-1]!='X') : r.toText(false); + r.stop=r.start+r.bases.length-1; + r.mapScore=scoreNoIndel; + }else{ + if(verbose){System.err.println("Slow match.");} + +// int minLoc=Tools.max(r.start-padding, chacs.minIndex); + int minLoc=Tools.max(r.start-padding, 0); //It's OK to be off the 
beginning as long as bases prior to the true start are 'N' + int maxLoc=Tools.min(r.stop+padding, chacs.maxIndex); + + //These assertions are not too important... they indicate the read mapped off the end of the chromosome. + assert(minLoc<=r.start) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+ + ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false); + assert(maxLoc>=r.stop) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+ + ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false); + + byte[] ref=chacs.getBytes(minLoc, maxLoc); + // System.err.println("Aligning:\n"+new String(r.bases)+"\n"+new String(ref)); + if(r.colorspace()){ + Tools.reverseInPlace(ref); + }else{ + AminoAcid.reverseComplementBasesInPlace(ref); + } + + // System.err.println("Aligning:\n"+new String(r.bases)+"\n"+new String(ref)); + int[] max=msa.fillLimited(r.bases, ref, 0, ref.length-1, scoreNoIndel, r.gaps); + // System.err.println(Arrays.toString(max)); + r.match=msa.traceback(r.bases, ref, 0, ref.length-1, max[0], max[1], max[2], r.gaps!=null); +// System.err.println(new String(r.match)); + int[] score=msa.score(r.bases, ref, 0, ref.length-1, max[0], max[1], max[2], r.gaps!=null); +// System.err.println(Arrays.toString(score)); + // System.err.println(Arrays.toString(score)); + // assert(false); + + int start2=minLoc+(ref.length-score[2]-1); + int stop2=maxLoc-(score[1]); + + r.start=start2; + r.stop=stop2; + r.mapScore=score[0]; + } + } + if(verbose){System.err.println("Final: "+r.start+", "+r.stop+", "+Gene.strandCodes[r.strand()]);} + + if(recur && r.stop0 && (r.match[0]=='X' || r.match[0]=='I' || + r.match[r.match.length-1]=='Y' || r.match[r.match.length-1]=='X' || r.match[r.match.length-1]=='I')){ + int xy=0; + for(int i=0; imsa.maxColumns-20){ + //TODO: Alternately, I could kill the site. 
+ r.stop=r.start+Tools.min(r.bases.length+40, msa.maxColumns-20); + if(r.gaps!=null){r.gaps=GapTools.fixGaps(r.start, r.stop, r.gaps, Shared.MINGAP);} + } + } + + if(r.start<0){r.start=0;} //Prevents assertion errors. This change should be reset by the realignment so it shouldn't matter. + if(r.stop>chacs.maxIndex){r.stop=chacs.maxIndex;} //Also to prevent a potential assertion error in unpadded references + assert(0<=r.start) : "\nchr"+r.chrom+": r.start="+r.start+", r.stop="+r.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+r.toText(false); + assert(chacs.maxIndex>=r.stop) : "\nchr"+r.chrom+": r.start="+r.start+", r.stop="+r.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+r.toText(false); + + { + assert(r.stop>=r.start); //Otherwise this is pointless... + int a=r.bases.length; + int b=r.stop-r.start+1; + if(bmaxI); + + byte[][] matchR=new byte[1][]; + if(r.match!=null && r.match.length==r.bases.length){ + matchR[0]=r.match; + }else{ + // System.err.println(new String(r.match)); + matchR[0]=r.match=new byte[r.bases.length]; + } + int scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(r.bases, chacs.array, r.start, matchR); + r.match=matchR[0]; + + assert(0<=r.start) : "\nchr"+r.chrom+": r.start="+r.start+", r.stop="+r.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+r.toText(false); + assert(chacs.maxIndex>=r.stop) : "\nchr"+r.chrom+": r.start="+r.start+", r.stop="+r.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+r.toText(false); + + if(verbose){System.err.println("Estimated greflen: "+GapTools.calcGrefLen(r.start, r.stop, r.gaps));} + + if(scoreNoIndel>=maxI || forbidIndels){ + if(verbose){System.err.println("Quick match.");} +// assert(r.match[0]!='X') : r.toText(false); +// assert(r.match[r.match.length-1]!='X') : 
r.toText(false); + // assert(r.stop==r.start+r.bases.length-1); + r.stop=r.start+r.bases.length-1; + r.mapScore=scoreNoIndel; + }else{ + if(verbose){System.err.println("Slow match.");} + +// int minLoc=Tools.max(r.start-padding, chacs.minIndex); + int minLoc=Tools.max(r.start-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N' + int maxLoc=Tools.min(r.stop+padding, chacs.maxIndex); + + if(verbose){System.err.println("minLoc = "+minLoc+", maxLoc = "+maxLoc);} + if(verbose){System.err.println("A. Estimated greflen: "+GapTools.calcGrefLen(r.start, r.stop, r.gaps));} + if(verbose){System.err.println("A. Estimated greflen2: "+GapTools.calcGrefLen(minLoc, maxLoc, r.gaps));} + + //These assertions are not too important... they indicate the read mapped off the end of the chromosome. + assert(minLoc<=r.start) : "\nchr"+r.chrom+": minloc="+minLoc+", maxLoc="+maxLoc+", r.start="+r.start+", r.stop="+r.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+r.toText(false); + assert(maxLoc>=r.stop) : "\nchr"+r.chrom+": minloc="+minLoc+", maxLoc="+maxLoc+", r.start="+r.start+", r.stop="+r.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+r.toText(false); + + // System.err.println("Aligning:\n"+new String(r.bases)+"\n"+chacs.getString(minLoc, maxLoc)); + + int[] max=null; + int[] score=null; + try { + if(verbose){ + System.err.println("Calling fillLimited(bases, chacs, "+minLoc+", "+maxLoc+", "+ + Tools.max(scoreNoIndel, minValidScore)+", "+(r.gaps==null ? "null" : Arrays.toString(r.gaps))+")"); + } + max=msa.fillLimited(r.bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), r.gaps); + score=(max==null ? 
null : msa.score(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null)); + if(verbose){System.err.println("Estimated greflen: "+GapTools.calcGrefLen(r.start, r.stop, r.gaps));} + + if(score!=null && score.length>6){ + int[] oldArray=score.clone(); + assert(score.length==8); + int extraPadLeft=score[6]; + int extraPadRight=score[7]; + + if(r.gaps==null){ + assert(maxLoc-minLoc+1<=msa.maxColumns); + int newlen=(maxLoc-minLoc+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + }else{ + //TODO: In this case the alignment will probably be wrong. + int greflen=Tools.max(r.bases.length, GapTools.calcGrefLen(minLoc, maxLoc, r.gaps)); + int newlen=(greflen+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + } + + assert(extraPadLeft>=0 && extraPadRight>=0) : extraPadLeft+", "+extraPadRight+"\n"+r; + minLoc=Tools.max(0, minLoc-extraPadLeft); + maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight); + + if(verbose){System.err.println("B. Estimated greflen: "+GapTools.calcGrefLen(r.start, r.stop, r.gaps));} + if(verbose){System.err.println("B. Estimated greflen2: "+GapTools.calcGrefLen(minLoc, maxLoc, r.gaps));} + max=msa.fillLimited(r.bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), r.gaps); + score=(max==null ? 
null : msa.score(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null)); + + if(score==null || score[0]=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + }else{ + //TODO: In this case the alignment will probably be wrong. + int greflen=Tools.max(r.bases.length, GapTools.calcGrefLen(minLoc, maxLoc, r.gaps)); + int newlen=(greflen+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + } + + assert(extraPadLeft>=0 && extraPadRight>=0) : extraPadLeft+", "+extraPadRight+"\n"+r; + minLoc=Tools.max(0, minLoc-extraPadLeft); + maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight); + + max=msa.fillLimited(r.bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), r.gaps); + score=(max==null ? 
null : msa.score(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null)); + + if(minLoc>0 && maxLocmaxI); + + byte[][] matchR=new byte[1][]; + if(r.match!=null && r.match.length==r.bases.length){ + matchR[0]=r.match; + }else{ + matchR[0]=r.match=new byte[r.bases.length]; + } + + if(verbose){ + System.err.println("Before reversed:"); + System.err.println(toStringCS(r.bases)); + } + + if(r.colorspace()){ + Tools.reverseInPlace(r.bases); + }else{ + AminoAcid.reverseComplementBasesInPlace(r.bases); + } + + if(verbose){ + System.err.println("Reversed."); + System.err.println(toStringCS(r.bases)); + } + + int scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(r.bases, chacs.array, r.start, matchR); + r.match=matchR[0]; + + if(scoreNoIndel>=maxI || forbidIndels){ + if(verbose){System.err.println("Quick match.");} + assert(r.match[0]!='X') : r.toText(false); + assert(r.match[r.match.length-1]!='X') : r.toText(false); + r.stop=r.start+r.bases.length-1; + r.mapScore=scoreNoIndel; +// Tools.reverseInPlace(r.match); + }else{ + if(verbose){System.err.println("Slow match.");} + + int minLoc=Tools.max(r.start-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N' + int maxLoc=Tools.min(r.stop+padding, chacs.maxIndex); + if(verbose){System.err.println("Slow match "+minLoc+" ~ "+maxLoc);} + + //These assertions are not too important... they indicate the read mapped off the end of the chromosome. 
+ assert(minLoc<=r.start) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+ + ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false); + assert(maxLoc>=r.stop) : "\nchr"+r.chrom+": "+minLoc+", "+maxLoc+", "+r.start+", "+r.stop+ + ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+r.toText(false); + + if(verbose){System.err.println("Aligning:\n"+new String(r.bases)+"\n"+chacs.getString(minLoc, maxLoc));} + int[] max=msa.fillLimited(r.bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), r.gaps); + if(verbose){System.err.println("Aligned: {rows, maxC, maxS, max} = "+Arrays.toString(max));} + int[] score=null; + score=(max==null ? null : msa.score(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null)); + + if(score!=null && score.length>6){ + if(verbose){System.err.println("Entering condition because score="+Arrays.toString(score));} + int[] oldArray=score.clone(); + assert(score.length==8); + int extraPadLeft=score[6]; + int extraPadRight=score[7]; + + if(r.gaps==null){ + assert(maxLoc-minLoc+1<=msa.maxColumns); + int newlen=(maxLoc-minLoc+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + } + }else{ + //TODO: In this case the alignment will probably be wrong. 
+ int greflen=Tools.max(r.bases.length, GapTools.calcGrefLen(minLoc, maxLoc, r.gaps)); + int newlen=(greflen+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + } + + minLoc=Tools.max(0, minLoc-extraPadLeft); + maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight); + if(verbose){System.err.println("Set extraPadLeft="+extraPadLeft+", extraPadRight="+extraPadRight);} + if(verbose){System.err.println("Set minLoc="+minLoc+", maxLoc="+maxLoc);} + + max=msa.fillLimited(r.bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), r.gaps); + score=(max==null ? null : msa.score(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null)); + + if(score==null || score[0]=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + }else{ + //TODO: In this case the alignment will probably be wrong. 
+ int greflen=Tools.max(r.bases.length, GapTools.calcGrefLen(minLoc, maxLoc, r.gaps)); + int newlen=(greflen+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + } + + minLoc=Tools.max(0, minLoc-extraPadLeft); + maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight); + max=msa.fillLimited(r.bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), r.gaps); + score=(max==null ? null : msa.score(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null)); + } + } + + + if(verbose){System.err.println(Arrays.toString(max));} + + if(max!=null){ + r.match=msa.traceback(r.bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], r.gaps!=null); + r.start=score[1]; + r.stop=score[2]; + r.mapScore=score[0]; + if(verbose){System.err.println("Aligned:\n"+new String(r.bases)+"\n"+chacs.getString(r.start, r.stop)+"\n"+new String(r.match));} + }else{ + assert(r.match[0]!='X') : r.toText(false); + assert(r.match[r.match.length-1]!='X') : r.toText(false); + r.stop=r.start+r.bases.length-1; + r.mapScore=scoreNoIndel; + } + } + + if(r.colorspace()){ + Tools.reverseInPlace(r.bases); + }else{ + AminoAcid.reverseComplementBasesInPlace(r.bases); + } + } + if(verbose){System.err.println("Final: "+r.start+", "+r.stop+", "+Gene.strandCodes[r.strand()]);} + + if(recur && r.stop0 && (r.match[0]=='X' || r.match[0]=='I' || + r.match[r.match.length-1]=='Y' || r.match[r.match.length-1]=='X' || r.match[r.match.length-1]=='I')){ + int xy=0; + for(int i=0; imsa.maxColumns-20){ + //TODO: Alternately, I could kill the site. 
+ ss.stop=ss.start+Tools.min(bases.length+40, msa.maxColumns-20); + if(ss.gaps!=null){ss.gaps=GapTools.fixGaps(ss.start, ss.stop, ss.gaps, Shared.MINGAP);} + } + } + + if(ss.start<0){ss.start=0;} //Prevents assertion errors. This change should be reset by the realignment so it shouldn't mattess. + if(ss.stop>chacs.maxIndex){ss.stop=chacs.maxIndex;} //Also to prevent a potential assertion error in unpadded references + assert(0<=ss.start) : "\nchr"+ss.chrom+": ss.start="+ss.start+", ss.stop="+ss.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText(); + assert(chacs.maxIndex>=ss.stop) : "\nchr"+ss.chrom+": ss.start="+ss.start+", ss.stop="+ss.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText(); + + { + assert(ss.stop>=ss.start); //Otherwise this is pointless... + int a=bases.length; + int b=ss.stop-ss.start+1; + if(bmaxI); + + byte[][] matchR=new byte[1][]; + if(ss.match!=null && ss.match.length==bases.length){ + matchR[0]=ss.match; + }else{ + // System.err.println(new String(ss.match)); + matchR[0]=ss.match=new byte[bases.length]; + } + int scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(bases, chacs.array, ss.start, matchR); + ss.match=matchR[0]; + + assert(0<=ss.start) : "\nchr"+ss.chrom+": ss.start="+ss.start+", ss.stop="+ss.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText(); + assert(chacs.maxIndex>=ss.stop) : "\nchr"+ss.chrom+": ss.start="+ss.start+", ss.stop="+ss.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText(); + + if(verbose){System.err.println("Estimated greflen: "+GapTools.calcGrefLen(ss.start, ss.stop, ss.gaps));} + + if(scoreNoIndel>=maxI || forbidIndels){ + if(verbose){System.err.println("Quick match.");} +// assert(ss.match[0]!='X') : ss.toText(); +// 
assert(ss.match[ss.match.length-1]!='X') : ss.toText(); + // assert(ss.stop==ss.start+bases.length-1); + ss.stop=ss.start+bases.length-1; + ss.slowScore=scoreNoIndel; + }else{ + if(verbose){System.err.println("Slow match.");} + +// int minLoc=Tools.max(ss.start-padding, chacs.minIndex); + int minLoc=Tools.max(ss.start-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N' + int maxLoc=Tools.min(ss.stop+padding, chacs.maxIndex); + + if(verbose){System.err.println("minLoc = "+minLoc+", maxLoc = "+maxLoc);} + if(verbose){System.err.println("A. Estimated greflen: "+GapTools.calcGrefLen(ss.start, ss.stop, ss.gaps));} + if(verbose){System.err.println("A. Estimated greflen2: "+GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));} + + //These assertions are not too important... they indicate the read mapped off the end of the chromosome. + assert(minLoc<=ss.start) : "\nchr"+ss.chrom+": minloc="+minLoc+", maxLoc="+maxLoc+", ss.start="+ss.start+", ss.stop="+ss.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText(); + assert(maxLoc>=ss.stop) : "\nchr"+ss.chrom+": minloc="+minLoc+", maxLoc="+maxLoc+", ss.start="+ss.start+", ss.stop="+ss.stop+", padding="+padding+ + ", chacs.minIndex="+chacs.minIndex+", chacs.maxIndex="+chacs.maxIndex+"\nread:\n"+ss.toText(); + + // System.err.println("Aligning:\n"+new String(bases)+"\n"+chacs.getString(minLoc, maxLoc)); + + int[] max=null; + int[] score=null; + try { + if(verbose){ + System.err.println("Calling fillLimited(bases, chacs, "+minLoc+", "+maxLoc+", "+ + Tools.max(scoreNoIndel, minValidScore)+", "+(ss.gaps==null ? "null" : Arrays.toString(ss.gaps))+")"); + } + max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps); + score=(max==null ? 
null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null)); + if(verbose){System.err.println("Estimated greflen: "+GapTools.calcGrefLen(ss.start, ss.stop, ss.gaps));} + + if(score!=null && score.length>6){ + int[] oldArray=score.clone(); + assert(score.length==8); + int extraPadLeft=score[6]; + int extraPadRight=score[7]; + + if(ss.gaps==null){ + assert(maxLoc-minLoc+1<=msa.maxColumns); + int newlen=(maxLoc-minLoc+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + }else{ + //TODO: In this case the alignment will probably be wrong. + int greflen=Tools.max(bases.length, GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps)); + int newlen=(greflen+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + } + + assert(extraPadLeft>=0 && extraPadRight>=0) : extraPadLeft+", "+extraPadRight+"\n"+id+", "+ss; + minLoc=Tools.max(0, minLoc-extraPadLeft); + maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight); + + if(verbose){System.err.println("B. Estimated greflen: "+GapTools.calcGrefLen(ss.start, ss.stop, ss.gaps));} + if(verbose){System.err.println("B. 
Estimated greflen2: "+GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps));} + max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps); + score=(max==null ? null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null)); + + if(score==null || score[0]=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + }else{ + //TODO: In this case the alignment will probably be wrong. + int greflen=Tools.max(bases.length, GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps)); + int newlen=(greflen+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + } + + assert(extraPadLeft>=0 && extraPadRight>=0) : extraPadLeft+", "+extraPadRight+"\n"+id+", "+ss; + minLoc=Tools.max(0, minLoc-extraPadLeft); + maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight); + + max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps); + score=(max==null ? 
null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null)); + + if(minLoc>0 && maxLocmaxI); + + byte[][] matchR=new byte[1][]; + if(ss.match!=null && ss.match.length==bases.length){ + matchR[0]=ss.match; + }else{ + matchR[0]=ss.match=new byte[bases.length]; + } + + int scoreNoIndel=msa.scoreNoIndelsAndMakeMatchString(bases, chacs.array, ss.start, matchR); + ss.match=matchR[0]; + + if(scoreNoIndel>=maxI || forbidIndels){ + if(verbose){System.err.println("Quick match.");} + assert(ss.match[0]!='X') : ss.toText(); + assert(ss.match[ss.match.length-1]!='X') : ss.toText(); + ss.stop=ss.start+bases.length-1; + ss.slowScore=scoreNoIndel; +// Tools.reverseInPlace(ss.match); + }else{ + if(verbose){System.err.println("Slow match.");} + + int minLoc=Tools.max(ss.start-padding, 0); //It's OK to be off the beginning as long as bases prior to the true start are 'N' + int maxLoc=Tools.min(ss.stop+padding, chacs.maxIndex); + if(verbose){System.err.println("Slow match "+minLoc+" ~ "+maxLoc);} + + //These assertions are not too important... they indicate the read mapped off the end of the chromosome. + assert(minLoc<=ss.start) : "\nchr"+ss.chrom+": "+minLoc+", "+maxLoc+", "+ss.start+", "+ss.stop+ + ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+ss.toText(); + assert(maxLoc>=ss.stop) : "\nchr"+ss.chrom+": "+minLoc+", "+maxLoc+", "+ss.start+", "+ss.stop+ + ", "+chacs.minIndex+", "+chacs.maxIndex+"\n"+ss.toText(); + + if(verbose){System.err.println("Aligning:\n"+new String(bases)+"\n"+chacs.getString(minLoc, maxLoc));} + int[] max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps); + if(verbose){System.err.println("Aligned: {rows, maxC, maxS, max} = "+Arrays.toString(max));} + int[] score=null; + score=(max==null ? 
null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null)); + + if(score!=null && score.length>6){ + if(verbose){System.err.println("Entering condition because score="+Arrays.toString(score));} + int[] oldArray=score.clone(); + assert(score.length==8); + int extraPadLeft=score[6]; + int extraPadRight=score[7]; + + if(ss.gaps==null){ + assert(maxLoc-minLoc+1<=msa.maxColumns); + int newlen=(maxLoc-minLoc+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + } + }else{ + //TODO: In this case the alignment will probably be wrong. + int greflen=Tools.max(bases.length, GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps)); + int newlen=(greflen+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + } + + minLoc=Tools.max(0, minLoc-extraPadLeft); + maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight); + if(verbose){System.err.println("Set extraPadLeft="+extraPadLeft+", extraPadRight="+extraPadRight);} + if(verbose){System.err.println("Set minLoc="+minLoc+", maxLoc="+maxLoc);} + + max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps); + score=(max==null ? 
null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null)); + + if(score==null || score[0]=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + }else{ + //TODO: In this case the alignment will probably be wrong. + int greflen=Tools.max(bases.length, GapTools.calcGrefLen(minLoc, maxLoc, ss.gaps)); + int newlen=(greflen+1+extraPadLeft+extraPadRight); + if(newlen>=msa.maxColumns){ + while(newlen>=msa.maxColumns && extraPadLeft>extraPadRight){newlen--;extraPadLeft--;} + while(newlen>=msa.maxColumns && extraPadLeft=msa.maxColumns){newlen-=2;extraPadLeft--;extraPadRight--;} + }else{ + int x=Tools.min(20, ((msa.maxColumns-newlen)/2)-1); + extraPadLeft=Tools.max(x, extraPadLeft); + extraPadRight=Tools.max(x, extraPadRight); + } + } + + minLoc=Tools.max(0, minLoc-extraPadLeft); + maxLoc=Tools.min(chacs.maxIndex, maxLoc+extraPadRight); + max=msa.fillLimited(bases, chacs.array, minLoc, maxLoc, Tools.max(scoreNoIndel, minValidScore), ss.gaps); + score=(max==null ? 
null : msa.score(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null)); + } + } + + + if(verbose){System.err.println(Arrays.toString(max));} + + if(max!=null){ + ss.match=msa.traceback(bases, chacs.array, minLoc, maxLoc, max[0], max[1], max[2], ss.gaps!=null); + ss.start=score[1]; + ss.stop=score[2]; + ss.slowScore=score[0]; + if(verbose){System.err.println("Aligned:\n"+new String(bases)+"\n"+chacs.getString(ss.start, ss.stop)+"\n"+new String(ss.match));} + }else{ + assert(ss.match[0]!='X') : ss.toText(); + assert(ss.match[ss.match.length-1]!='X') : ss.toText(); + ss.stop=ss.start+bases.length-1; + ss.slowScore=scoreNoIndel; + } + } + } + if(verbose){System.err.println("Final: "+ss.start+", "+ss.stop+", "+Gene.strandCodes[ss.strand()]);} + + if(recur && ss.stop0 && (ss.match[0]=='X' || ss.match[0]=='I' || + ss.match[ss.match.length-1]=='Y' || ss.match[ss.match.length-1]=='X' || ss.match[ss.match.length-1]=='I')){ + int xy=0; + for(int i=0; i=0); + //assert(checkArray(crbmq[2])); + if(verbose && x!=0){ + System.err.println("Trimmed end:"); + System.err.println(toString(crbmq)); + } + if(crbmq[3].length<2){return null;} + +// byte[] original=Arrays.copyOf(crbmq[2], crbmq[2].length); + + if(crbmq[3][crbmq[3].length-1]!='m'){ + System.err.println("Failed to trim read "+r.numericID+", x="+x); + System.err.println(r.toText(false)); + System.err.println(toString(crbmq)); + return null; + } + + final boolean containsIndels=containsIndels(crbmq[3]); + final boolean containsNocalls=containsNocalls(crbmq[3]) || + containsNocalls(crbmq[0]) || containsNocalls(crbmq[1]) || containsNocalls(crbmq[2]); + + if(containsNocalls){ + +// if(containsIndels){ +// System.err.println("*** Before fixing nocalls:"); +// System.err.println(toString(crbmq)); +// } + + x=fixNocallsInline(crbmq, r); + //assert(checkArray(crbmq[2])); + if(x<0){ + if(verbose){System.err.println("------------------------ broke decoder ------------------------");} + return null; + } + 
if(verbose){ + if(x>0){ + System.err.println("Fixed Nocalls:"); + System.err.println(toString(crbmq)); + } + } + +// if(containsIndels){ +// System.err.println("*** After fixing nocalls:"); +// System.err.println(toString(crbmq)+"\n"); +// } +// +// x=fixNocalls(crbmq); +// assert(checkArray(crbmq[2])); +// if(x<0){ +// if(verbose){System.err.println("------------------------ broke decoder ------------------------");} +// return null; +// } +// if(verbose){ +// if(x>0){ +// System.err.println("Fixed Nocalls:"); +// System.err.println(toString(crbmq)); +// } +// } +// +// +// if(containsIndels){ +// x=fixNocallsBackward(crbmq); +// assert(checkArray(crbmq[2])); +// if(x<0){ +// if(verbose){System.err.println("------------------------ broke decoder ------------------------");} +// return null; +// } +// if(verbose){ +// if(x>0){ +// System.err.println("Fixed Nocalls backwards:"); +// System.err.println(toString(crbmq)); +// } +// } +// } + } + + + + if(verbose){System.err.println(toString(crbmq));} + x=fixIndels(crbmq, r); + if(crbmq[3].length<2){return null;} + //assert(checkArray(crbmq[2])) : new String(crbmq[2])+"\n"+new String(original)+"\n"+new String(r.match); + final int indels=x; + if(x<0){ + if(verbose){System.err.println("------------------------ broke decoder ------------------------");} + return null; + } + if(verbose){ + if(x>0){ + System.err.println("Fixed indels:"); + System.err.println(toString(crbmq)); + } + } + assert(crbmq[0].length==crbmq[1].length) : "\n"+toString(crbmq)+"\n"+r.toText(false)+"\n"+chab.maxIndex+"\n"; + assert(crbmq[0].length==crbmq[3].length) : "\n"+toString(crbmq)+"\n"+r.toText(false)+"\n"+chab.maxIndex+"\n"; + assert(crbmq[0].length==(crbmq[2].length-1)) : "\n"+toString(crbmq)+"\n"+r.toText(false)+"\n"+chab.maxIndex+"\n"; + +// if(containsNocalls && containsIndels){//Indels are gone now, but some nocalls may remain +// assert(!containsIndels(crbmq[3])); +// x=fixNocallsInline(crbmq, r); +// assert(checkArray(crbmq[2])); +// 
if(x<0){ +// if(verbose){System.err.println("------------------------ broke decoder ------------------------");} +// return null; +// } +// if(verbose){ +// if(x>0){ +// System.err.println("Fixed Nocalls 2:"); +// System.err.println(toString(crbmq)); +// } +// } +//// x=fixNocalls(crbmq); +//// assert(checkArray(crbmq[2])); +//// if(x<0){ +//// if(verbose){System.err.println("------------------------ broke decoder ------------------------");} +//// return null; +//// } +//// if(verbose){ +//// if(x>0){ +//// System.err.println("Fixed Nocalls 2:"); +//// System.err.println(toString(crbmq)); +//// } +//// } +// }else{assert(!containsNocalls(crbmq[3])) : "\n"+toString(crbmq)+"\n";} + + x=fixColorspaceErrors(crbmq, 3, 10); + r.errors=(x>=0 ? x : 0); + //assert(checkArray(crbmq[2])); + if(x<0){ + if(verbose){System.err.println("------------------------ broke decoder ------------------------");} + return null; + } + if(verbose){ + if(x>0){ + System.err.println("Fixed Errors:"); + System.err.println(toString(crbmq)); + } + } + x=fixSubs(crbmq); + //assert(checkArray(crbmq[2])); + if(x<0){ + if(verbose){System.err.println("------------------------ broke decoder ------------------------");} + return null; + } + + if(verbose){ + if(x>0){ + System.err.println("Fixed Substitutions:"); + System.err.println(toString(crbmq)); + }else{ + System.err.println("Final:"); + System.err.println(toString(crbmq)); + } + } + + byte[] qualityBS=translateQuality(crbmq[4]); + + int flags=(r.flags&(~Read.COLORMASK)); + + assert(false) : "TODO: Make sure the next line (read instantiation) is correct."; + Read r2=new Read(crbmq[2], r.chrom, r.start, r.stop+1, r.id, qualityBS, r.numericID, flags); + + r2.originalSite=r.originalSite; + r2.errors=r.errors; + r2.mapScore=r.mapScore; + r2.obj=r.obj; + r2.sites=r.sites; + r2.copies=r.copies; + r2.mapLength=r.mapLength; + + if(verbose){ + System.err.println("r2:"); + System.err.println(chab.getString(r2.start, r2.stop)+" (ref) "); + 
System.err.println(new String(r2.bases)+" (call) "); + } + + if(indels>0 || (r2.stop-r2.start+1!=r2.bases.length)){ + if(verbose){ + System.err.println("Making slow BS match:"); + } + realign_new(r2, 4, true, 0, false); + +// int padding=4; +// { +// int a=r2.bases.length; +// int b=r2.stop-r2.start+1; +// if(b=0); +// final int minLoc=Tools.max(0, r2.start-padding); +// final int maxLoc=Tools.min(chab.maxIndex, r2.stop+padding); +// if(r2.strand()==Gene.PLUS){ +// int[] max=msaBS.fill(r2.bases, chab.array, minLoc, maxLoc); +// r2.match=msaBS.traceback(r2.bases, chab.array, minLoc, maxLoc, max[0], max[1], max[2]); +// int[] score=msaBS.score(r2.bases, chab.array, minLoc, maxLoc, max[0], max[1], max[2]); +// r2.start=score[1]; +// r2.stop=score[2]; +// }else{ +// byte[] bases=AminoAcid.reverseComplementBases(r2.bases); +// if(verbose){ +// System.err.println("reversed:"); +// System.err.println(chab.getString(minLoc, maxLoc)+" (ref extended) "); +// System.err.println(chab.getString(r2.start, r2.stop)+" (ref) "); +// System.err.println(new String(bases)+" (call) "); +// } +// int[] max=msaBS.fill(bases, chab.array, minLoc, maxLoc); +// r2.match=msaBS.traceback(bases, chab.array, minLoc, maxLoc, max[0], max[1], max[2]); +// int[] score=msaBS.score(bases, chab.array, minLoc, maxLoc, max[0], max[1], max[2]); +// r2.start=score[1]; +// r2.stop=score[2]; +//// Tools.reverseInPlace(r2.match); +// } + + }else{ + if(verbose){ + System.err.println("Making quick BS match:"); + } + final int maxI=msaBS.maxImperfectScore(r2.bases.length); + if(r2.strand()==Gene.PLUS){ + byte[][] matchR=new byte[1][]; + if(r2.match!=null && r2.match.length==r2.bases.length){ + matchR[0]=r2.match; + }else{ +// System.err.println(new String(r2.match)); + matchR[0]=r2.match=new byte[r2.bases.length]; + } + int scoreNoIndel=msaBS.scoreNoIndelsAndMakeMatchString(r2.bases, chab.array, r2.start, matchR); + + //TODO: If scoreNoIndelcolors.length){ + colorRef=Arrays.copyOf(colorRef, colors.length); + 
baseRef=Arrays.copyOf(baseRef, colorRef.length+1); + crbmq[1]=colorRef; + crbmq[2]=baseRef; + } + + return fixed; + } + + private static boolean fixDeletion(final byte[][] crbmq, int loc, Read r){ + + byte[] colors=crbmq[0]; + byte[] colorRef=crbmq[1]; + byte[] baseRef=crbmq[2]; + byte[] match=crbmq[3]; + byte[] quality=crbmq[4]; + + assert(match[loc]=='D') : loc; + + int len=1; + for(int i=loc+1; imatch.length-2){return false;} //Indels on very ends need to be processed differently + + //Deletion is from a to b, inclusive. Note that basespace coords are +1 from colorspace coords. + + byte[] colorRef2=new byte[colorRef.length-len]; + byte[] baseRef2=new byte[baseRef.length-len]; + byte[] match2=new byte[match.length-len]; + + assert(locmatch.length-2){return false;} //Indels on very ends need to be processed differently + + + //Deletion is from a to b, inclusive. Note that basespace coords are +1 from colorspace coords + + assert(loc0){ +// for(int i=1; i=0; i--){ +// if(baseRef[i]=='N'){ +// baseRef[i]=AminoAcid.colorToBase(baseRef[i+1], colorRef[i+1]); +// } +// } +// } +// return fixedRef+fixedCall; +// } + + + private static int fixNocallsInline(final byte[][] crbmq, Read read){ + byte[] colors=crbmq[0]; + byte[] colorRef=crbmq[1]; + byte[] baseRef=crbmq[2]; + byte[] match=crbmq[3]; + + int fixedRef=0; + int fixedCall=0; + +// boolean indels=false; +// +// int indexOfIndel=colors.length; +// for(int i=0; i=colorRef.length){ + System.err.println("Failed fixNocallsInline for read "+read.numericID); + System.err.println(read.toText(false)); + System.err.println(toString(crbmq)); + return -1; + } + + assert(ri0){ + {//forward + + for(int mi=0, ri=0; mi=0; mi--){ + + assert(ri>=0) : "\n"+read.toText(false)+"\n"+toString(crbmq); + + byte m=match[mi]; + byte r=colorRef[ri]; + + if(m=='m' || m=='S' || m=='N'){ + + if(baseRef[ri]=='N'){ + baseRef[ri]=AminoAcid.colorToBase(baseRef[ri+1], r); + } + if(baseRef[ri+1]=='N'){ + baseRef[ri+1]=AminoAcid.colorToBase(baseRef[ri], 
r); + } + ri--; + }else if(m=='D'){ + ri--; + }else if(m=='I'){ + }else{ + assert(false) : "m="+(char)m+"\n"+read.toText(false)+"\n"+toString(crbmq); + } + } + } + } + return fixedRef+fixedCall; + } + + + private static int fixNocalls(final byte[][] crbmq){ + byte[] colors=crbmq[0]; + byte[] colorRef=crbmq[1]; + byte[] baseRef=crbmq[2]; + byte[] match=crbmq[3]; + + int fixedRef=0; + int fixedCall=0; + + boolean indels=false; + + int indexOfIndel=colors.length; + for(int i=0; i0){ + + for(int i=1; i=0; i--){ + if(baseRef[i]=='N'){ + baseRef[i]=AminoAcid.colorToBase(baseRef[i+1], colorRef[i+1]); + } + } + } + } + return fixedRef+fixedCall; + } + + + private static int fixNocallsBackward(final byte[][] crbmq){ + byte[] colors=crbmq[0]; + byte[] colorRef=crbmq[1]; + byte[] baseRef=crbmq[2]; + byte[] match=crbmq[3]; + + int fixedRef=0; + int fixedCall=0; + + boolean indels=false; + + int indexOfIndelCall=0; + int indexOfIndelRef=0; + int indexOfIndelMatch=0; + for(int i=match.length-1; i>=0; i--){ + if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){ + indels=true; + int safe=(match.length-1)-i; + indexOfIndelMatch=i; + indexOfIndelCall=colors.length-safe; + indexOfIndelRef=colorRef.length-safe; +// System.err.println("indexOfIndelMatch="+indexOfIndelMatch+ +// "\nindexOfIndelCall="+indexOfIndelCall+ +// "\nindexOfIndelRef="+indexOfIndelRef+ +// "\nsafe="+safe); + break; + } + } + +// assert(colors.length==colorRef.length) : "\n"+Arrays.toString(colors)+"\n"+Arrays.toString(colorRef)+ +// "\n"+new String(baseRef)+"\n"+new String(crbmq[3])+"\n"; + for(int i=colors.length-1, j=colorRef.length-1, k=match.length-1; i>=indexOfIndelCall; i--, j--, k--){ +// if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){ +// indels=true; +// break; +// } + if(colors[i]=='N' || colors[i]=='.'){ + if(colorRef[j]!='N' && colorRef[j]!='.'){ + colors[i]=colorRef[j]; + fixedCall++; + assert(match[k]!='I' && match[k]!='D') : "i="+i+", j="+j+", 
k="+k+"\n"+toString(crbmq); + match[k]='m'; + } + } + if(colorRef[j]=='N' || colorRef[j]=='.'){ + if(colors[i]!='N' && colors[i]!='.'){ + colorRef[j]=colors[i]; + fixedRef++; + assert(match[k]!='I' && match[k]!='D') : "i="+i+", j="+j+", k="+k+"\n"+toString(crbmq); + match[k]='m'; + } + } + } + + assert(indels || colors.length==colorRef.length) : "\n"+toString(crbmq)+"\n"; + + if(fixedRef>0){ + if(!indels){ + for(int i=1; i=indexOfIndelRef; i--){ + if(match[i]=='I' || match[i]=='X' || match[i]=='Y' || match[i]=='D'){ + assert(false); + break; + } + if(baseRef[i]=='N'){ + baseRef[i]=AminoAcid.colorToBase(baseRef[i+1], colorRef[i+1]); + } + } + } + return fixedRef+fixedCall; + } + + public static boolean perfectMatch(final byte[] match){ + if(match==null){return false;} + for(int i=0; i0){ + System.err.println("Fixed qual-4 Errors:"); + System.err.println(toString(crbmq)); + } + + while(corrected>0){ + corrected=0; + for(int i=0; ithresh || quality[i]=22 && quality[quality.length-2]>=22){return 0;} +// System.err.println("Z"); + } + + int last=match.length-1; + int minBadIndex=last; + int mcount=0; + + int insertions=0; + + while(last>1 && mcount1 && mcount=0 && ri400){ + System.err.println(toStringCS(Arrays.copyOfRange(ref, rstart, rstart+call.length))+" (ref)"); + }else{ + System.err.println(toStringCS(ref)+" (ref)"); + } + System.err.println(toStringCS(call)+" (call)"); + System.err.println(new String(match)); + }else{ + if(ref.length>400){ + System.err.println(new String(Arrays.copyOfRange(ref, rstart, rstart+call.length))+" (ref)"); + }else{ + System.err.println(new String(ref)+" (ref)"); + } + System.err.println(new String(call)+" (call)"); + System.err.println(new String(match)); + } + } + + if(!ok){ + + ok=true; + + if(loud){System.err.println("Attempting to fix and skip error.");} + for(int ci=0, mi=0, ri=rstart; mi=0 && ri400){ + System.err.println(toStringCS(Arrays.copyOfRange(ref, rstart, rstart+call.length))+" (ref)"); + }else{ + 
System.err.println(toStringCS(ref)+" (ref)"); + } + System.err.println(toStringCS(call)+" (call)"); + System.err.println(new String(match)); + }else{ + if(ref.length>400){ + System.err.println(new String(Arrays.copyOfRange(ref, rstart, rstart+call.length))+" (ref)"); + }else{ + System.err.println(new String(ref)+" (ref)"); + } + System.err.println(new String(call)+" (call)"); + System.err.println(new String(match)); + } + + + + if(THROW_EXCEPTION_ON_VERIFY_FAILURE){ + System.err.println("Fixed successfully?\t"+ok); + throw new RuntimeException("Failed VerifyMatchString()"); + } + + } + + return ok; + } + + //TODO: No-calls and no-ref are currently considered the same. + /** When this is called, the match string should be plus-oriented */ + public ArrayList toVars(final Read read, final boolean CONDENSE, final boolean CONDENSE_SNPS, final boolean SPLIT_SUBS){ + assert(!read.colorspace()); + assert(read.match!=null); + byte[] match=read.match; + byte[] quality=read.quality; + byte[] call=read.bases; + + if(quality==null){quality=Read.getFakeQuality(call.length);} + + assert(!read.colorspace()); //or else reverse-complement will mess things up + + assert(checkArray(call)); + + int maxVars=0; + + byte last='m'; + for(int i=0; i vars=new ArrayList(maxVars); + ChromosomeArray cha=Data.getChromosome(read.chrom); + + +// assert(verifyMatchString(call, cha.array, match, read.start, true, read.colorspace())) : read.toText(false); + boolean vms=false; + try { + vms=verifyMatchString(call, cha.array, match, read.start, true, read.colorspace()); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + vms=false; + System.err.println("in TranslateColorspace.toVars(), a read failed verification:\n"+read.toText(false)+"\n"); + } + + if(verbose){ + System.err.println("Making vars:"); + System.err.println(new String(call)); + System.err.println(cha.getString(read.start, read.stop)); + System.err.println(new String(match)); + + } + + int readQuality; + 
{ + int totalQual=0; + int minQual=quality[0]; + for(int i=0; i0){varType=Variation.NOREF;} + else if(cs.charAt(0)=='N'){ + varType=Variation.NOCALL; + if(verbose){System.err.println("Setting type NOCALL: "+Variation.varTypeMap[varType]);} + }else if(mlen==1){varType=Variation.SNP;} + else{varType=Variation.DELINS;} + + + final int headDist, tailDist, endDist; + { + int cstart2=cstart, cstop2=cstop; + if(varType==Variation.DEL){ + cstart2--; + cstop2++; + } + + assert(cstop2>=cstart2) : Variation.varTypeMap[varType]+", "+cstop2+", "+cstart2+", "+clen+ + "\n'"+cs+"', '"+rs+"'\n"+new String(match); + assert(cstop2=0); + assert(tailDist>=0); + } + + + int varQuality; + if(varType==Variation.DEL){ + varQuality=((qualSum/mlen)+(qualMin))/2; + }else{ + if(callPos1){ + qualMin=Tools.min(quality[callPos-2], quality[callPos-2], quality[callPos-2], quality[callPos-2]); + varQuality=(quality[callPos-2]+quality[callPos-1]+quality[callPos]+quality[callPos+1]+(qualMin))/5; + }else if(callPos0){ + qualMin=Tools.min(quality[callPos-1], quality[callPos]); + varQuality=qualMin; + }else{ + varQuality=((qualSum/mlen)+(qualMin))/2; + } + } + + if(verbose){ + System.err.println("mlen="+mlen+", rlen="+rlen+", clen="+clen+", varType="+Variation.varTypeMap[varType]+"\n"+ + ", cs="+cs+", nCount="+nCount+", rCount="+rCount+", iCount="+iCount+", dCount="+dCount+", sCount="+sCount); + } + +// assert(read.mapScore>0) : read.toText(false); + v=new Varlet(read.chrom, read.strand(), rstart, rstop, mstart, mstop, varType, rs.toString(), cs.toString(), + varQuality, readQuality, read.mapScore, read.errors, expectedErrors, (read.paired() ? 
1 : 0), read.numericID, + read.bases.length, read.mapLength, read.start, read.stop, read.copies, headDist, tailDist, endDist, + read.pairnum()); + +// if(v.varType==Variation.NOREF){System.err.print("R");} + + if(v.varType==Variation.SNP){ + if(v.call.equals(v.ref)){ + System.err.println("\n"+read.toText(false)); + System.err.println("\n"+v.toText()); + System.err.println("\n"+read.strand()); + System.err.println("\n"); + System.err.println(cha.getString(read.start, read.stop)); + System.err.println(new String(call)); + System.err.println(new String(match)); + System.err.println("\n"); + assert(false); + } + + } + + vars.add(v); + } + } + //assert(checkArray(call)); + +// assert(read.numericID!=3448228) : CONDENSE+"\n"+vars; + +// boolean fail=false; +// { +// int nr=0; +// for(Variation v : vars){ +// if(v.varType==Variation.NOREF){ +// nr++; +// fail=nr>0; +// } +// } +// System.err.print(" "+nr); +// } +// if(fail){verbose=true;} + +// if(read.numericID==3448228){verbose=true;} + + //Optionally, merge nearby variations + if(CONDENSE && vars.size()>1){ + boolean condense=false; + + int mergeDistance=1; // 1 for adjacent, 2 for non-adjacent. 
+ + for(int i=1; i=v2.beginLoc){condense=true;} //To prevent overlapping variations + else if(CONDENSE_SNPS || (v1.varType!=Variation.SNP && v2.varType!=Variation.SNP)){ + condense|=(v1.matchStop>=v2.matchStart-mergeDistance); + } + } + + if(verbose){ + System.err.println("Compared\n"+v1+"\nand\n"+v2+"\ncondense="+condense+"\n"+v1.matchStart+", "+v2.matchStart+", "+mergeDistance); + } + } + +// condense=false; + if(condense){ + if(verbose){ + System.err.println("Condensing:"); + for(Varlet v : vars){ + System.err.println(v); + } + } + ArrayList list2=new ArrayList(vars.size()-1); + for(int i=vars.size()-2; i>=0; i--){ + Varlet prev=vars.get(i); +// Varlet v=vars.get(i+1); + Varlet v=vars.remove(i+1); + + + boolean merge=(!v.isNR_or_NC() && !prev.isNR_or_NC() && (prev.matchStop>=v.matchStart-mergeDistance || prev.endLoc>=v.beginLoc)); + if(merge && !CONDENSE_SNPS && prev.endLocmidstop ? "" : cha.getString(midstart, midstop)); + + String cs=(prev.call==null ? "" : prev.call)+middle+(v.call==null ? "" : v.call); + String rs=(prev.ref==null ? "" : prev.ref)+middle+(v.ref==null ? "" : v.ref); + + final int headDist=Tools.min(v.headDist, prev.headDist); + final int tailDist=Tools.min(v.tailDist, prev.tailDist); + final int endDist=Tools.min(v.endDist, prev.endDist); + + + Varlet v2=new Varlet(read.chrom, read.strand(), prev.beginLoc, v.endLoc, prev.matchStart, v.matchStop, varType, + rs, cs, (prev.avgVarQuality()+v.avgVarQuality())/2, readQuality, read.mapScore, read.errors, expectedErrors, + (read.paired() ? 
1 : 0), read.numericID, read.bases.length, read.mapLength, + read.start, read.stop, read.copies, headDist, tailDist, endDist, read.pairnum()); + + vars.remove(i); //prev + vars.add(v2); + }else{ + list2.add(v); + } + } + assert(vars.size()==1); + list2.add(vars.get(0)); + Collections.reverse(list2); + vars=list2; + + if(verbose){ + System.err.println("Condensed:"); + for(Varlet v : vars){ + System.err.println(v); + } + System.err.println(); + } + } + } + +// { +// int nr=0; +// for(Variation v : vars){ +// if(v.varType==Variation.NOREF){ +// nr++; +// } +// } +// System.err.println(" "+nr); +// } +// +// assert(!fail); + +// assert(read.numericID!=3448228) : CONDENSE+"\n"+vars; + + //assert(checkArray(call)); + //Don't exit early and forget to undo this! + if(read.strand()==Gene.MINUS){ + AminoAcid.reverseComplementBasesInPlace(call); + Tools.reverseInPlace(quality); + } + //assert(checkArray(call)); + return vars; + } + + + protected MSA msaBS; + protected MSA msaCS; + + public static boolean verbose=false; + + public static boolean DISCARD_NOCALLED_INSERTIONS=false; + public static boolean THROW_EXCEPTION_ON_VERIFY_FAILURE=true; //Throws an exception when "verify match string" fails + +} diff --git a/current/align2/TrimRead.java b/current/align2/TrimRead.java new file mode 100755 index 0000000..8432d01 --- /dev/null +++ b/current/align2/TrimRead.java @@ -0,0 +1,478 @@ +package align2; + +import java.util.Arrays; + +import jgi.CalcTrueQuality; + +import stream.Read; +import stream.SiteScore; + +import dna.Gene; + +/** + * Helper class for processes that do inline quality trimming. + * @author Brian Bushnell + * @date Mar 15, 2013 + * + */ +public final class TrimRead { + + public static void main(String[] args){ + byte[] bases=args[0].getBytes(); + byte[] quals=(args.length<2 ? null : args[1].getBytes()); + if(quals!=null){ + for(int i=0; i>32)&0xFFFFFFFFL) : 0; + b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; + }else{ + a=(trimLeft ? 
testLeft(r.bases, r.quality, (byte)trimq) : 0); + b=(trimRight ? testRight(r.bases, r.quality, (byte)trimq) : 0); + } + return (a+b==0 ? null : new TrimRead(r, a, b, trimq, minlen)); + } + + /** + * Trim until at least 'minlen' consecutive bases exceed 'minq' + * @param r Read to trim + * @param trimLeft Trim left side + * @param trimRight Trim right side + * @param trimq Maximum quality to trim + * @param minlen Minimum consecutive bases over minq before trimming stops + * @return Number of bases trimmed + */ + public static int trimFast(Read r, boolean trimLeft, boolean trimRight, int trimq, int minlen){ + final byte[] bases=r.bases, qual=r.quality; + if(bases==null || bases.length<1){return 0;} + final int a, b; + if(optimalMode){ + long packed=testOptimal(bases, qual, QualityTools.PROB_ERROR[trimq]); + a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0; + b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; + }else{ + a=(trimLeft ? testLeft(bases, qual, (byte)trimq) : 0); + b=(trimRight ? testRight(bases, qual, (byte)trimq) : 0); + } + return trimByAmount(r, a, b, minlen); + } + + public static boolean untrim(Read r){ + if(r==null || r.obj==null){return false;} + if(r.obj.getClass()!=TrimRead.class){return false;} + TrimRead tr=(TrimRead)r.obj; + return tr.untrim(); + } + +// public TrimRead(Read r_, boolean trimLeft, boolean trimRight, int trimq_, int minlen_){ +// this(r_, (trimLeft ? testLeft(r_.bases, r_.quality, (byte)trimq_) : 0), (trimRight ? 
testRight(r_.bases, r_.quality, (byte)trimq_) : 0), trimq_, minlen_); +// } + + public TrimRead(Read r_, int trimLeft, int trimRight, int trimq_, int minlen_){ + minlen_=Tools.max(minlen_, 0); + r=r_; + bases1=r.bases; + qual1=r.quality; + trimq=(byte)trimq_; + assert(bases1!=null || qual1==null) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n"; + assert(bases1==null || qual1==null || bases1.length==qual1.length) : "\n"+new String(bases1)+"\n"+new String(qual1)+"\n"; + int trimmed=trim(trimLeft, trimRight, minlen_); + if(trimmed>0){ + r.bases=bases2; + r.quality=qual2; + r.mapLength=(bases2==null ? 0 : bases2.length); + r.obj=this; + trimMatch(r); + } + } + + /** Trim the left end of the read, from left to right */ + private int trim(final boolean trimLeft, final boolean trimRight, final int minlen){ + final int a, b; + if(optimalMode){ + long packed=testOptimal(bases1, qual1, QualityTools.PROB_ERROR[trimq]); + a=trimLeft ? (int)((packed>>32)&0xFFFFFFFFL) : 0; + b=trimRight ? (int)((packed)&0xFFFFFFFFL) : 0; + }else{ + a=(trimLeft ? testLeft(bases1, qual1, (byte)trimq) : 0); + b=(trimRight ? testRight(bases1, qual1, (byte)trimq) : 0); + } + return trim(a, b, minlen); + } + + /** Trim the left end of the read, from left to right */ + private int trim(int trimLeft, int trimRight, final int minlen){ + if(trimLeft+trimRight>bases1.length){ + trimRight=Tools.max(0, bases1.length-minlen); + trimLeft=0; + } + + leftTrimmed=trimLeft; + rightTrimmed=trimRight; + final int sum=leftTrimmed+rightTrimmed; + + if(verbose){ + System.err.println("leftTrimmed="+leftTrimmed+", rightTrimmed="+rightTrimmed+", sum="+sum); + } + + if(sum==0){ + bases2=bases1; + qual2=qual1; + }else{ + bases2=Arrays.copyOfRange(bases1, trimLeft, bases1.length-trimRight); + qual2=((qual1==null || (trimLeft+trimRight>=qual1.length)) ? 
null : Arrays.copyOfRange(qual1, trimLeft, qual1.length-trimRight)); + } + return sum; + } + + /** Trim bases outside of leftLoc and rightLoc, excluding leftLoc and rightLoc */ + public static int trimToPosition(Read r, int leftLoc, int rightLoc, int minResultingLength){ + final int len=r.bases==null ? 0 : r.bases.length; + return trimByAmount(r, leftLoc, len-rightLoc-1, minResultingLength); + } + + /** Trim this many bases from each end */ + public static int trimByAmount(Read r, int leftTrimAmount, int rightTrimAmount, int minResultingLength){ + + leftTrimAmount=Tools.max(leftTrimAmount, 0); + rightTrimAmount=Tools.max(rightTrimAmount, 0); + + //These assertions are unnecessary if the mapping information will never be used or output. + assert(r.match==null) : "TODO: Handle trimming of reads with match strings."; + assert(r.sites==null) : "TODO: Handle trimming of reads with SiteScores."; + + final byte[] bases=r.bases, qual=r.quality; + final int len=(bases==null ? 0 : bases.length), qlen=(qual==null ? 0 : qual.length); + if(len<1){return 0;} + minResultingLength=Tools.min(len, Tools.max(minResultingLength, 0)); + if(leftTrimAmount+rightTrimAmount+minResultingLength>len){ + rightTrimAmount=Tools.max(1, len-minResultingLength); + leftTrimAmount=0; + } + + final int total=leftTrimAmount+rightTrimAmount; + if(total>0){ + r.bases=Arrays.copyOfRange(bases, leftTrimAmount, len-rightTrimAmount); + r.quality=(leftTrimAmount+rightTrimAmount>=qlen ? null : Arrays.copyOfRange(qual, leftTrimAmount, qlen-rightTrimAmount)); + trimMatch(r); + if(r.stop>r.start){ //TODO: Fixing mapped coordinates needs more work. + r.start+=leftTrimAmount; + r.stop-=rightTrimAmount; + } + } + + if(verbose){ + System.err.println("leftTrimmed="+leftTrimAmount+", rightTrimmed="+rightTrimAmount+ + ", sum="+total+", final length="+(r.bases==null ? 
0 : r.bases.length)); + } + + return total; + } + + /** Count number of bases that need trimming on each side, and pack into a long */ + private static long testOptimal(byte[] bases, byte[] qual, float avgErrorRate){ + if(optimalBias>=0){avgErrorRate=optimalBias;}//Override + assert(avgErrorRate>0 && avgErrorRate<=1) : "Average error rate ("+avgErrorRate+") must be between 0 (exclusive) and 1 (inclusive)"; + if(bases==null || bases.length==0){return 0;} + if(qual==null){return avgErrorRate<1 ? 0 : ((((long)testLeftN(bases))<<32) | (((long)testRightN(bases))&0xFFFFFFFFL));} + + assert(!ADJUST_QUALITY) : "Needs to be recompiled to support adjusting quality."; + + float maxScore=0; + float score=0; + int maxLoc=-1; + int maxCount=-1; + int count=0; + + final float nprob=Tools.max(Tools.min(avgErrorRate*1.1f, 1), NPROB); + + for(int i=0; i0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; + + float delta=avgErrorRate-probError; + score=score+delta; + if(score>0){ + count++; + if(score>maxScore || (score==maxScore && count>maxCount)){ + maxScore=score; + maxCount=count; + maxLoc=i; + } + }else{ + score=0; + count=0; + } + } + + final int left, right; + if(maxScore>0){ + assert(maxLoc>=0); + assert(maxCount>0); + left=maxLoc-maxCount+1; + assert(left>=0 && left<=bases.length); + right=bases.length-maxLoc-1; + }else{ + left=0; + right=bases.length; + } + final long packed=((((long)left)<<32) | (((long)right)&0xFFFFFFFFL)); + + if(verbose){ + System.err.println(Arrays.toString(qual)); + System.err.println("After testLocal: maxScore="+maxScore+", maxLoc="+maxLoc+", maxCount="+maxCount+ + ", left="+left+", right="+right+", returning "+Long.toHexString(packed)); + } + return packed; + } + + /** Count number of bases that need trimming on left side */ + private static int testLeft(byte[] bases, byte[] qual, final byte trimq){ + if(bases==null || bases.length==0){return 0;} + if(qual==null){return trimq>0 ? 
0 : testLeftN(bases);} + int good=0; + int lastBad=-1; + int i=0; + for(; i0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; + if(q>trimq){good++;} + else{good=0; lastBad=i;} + } + if(verbose){ +// System.err.println(Arrays.toString(qual)); + System.err.println("After testLeft: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(lastBad+1)); +// assert(false); + } + return lastBad+1; + } + + /** Count number of bases that need trimming on right side */ + private static int testRight(byte[] bases, byte[] qual, final byte trimq){ + if(bases==null || bases.length==0){return 0;} + if(qual==null){return trimq>0 ? 0 : testRightN(bases);} + int good=0; + int lastBad=bases.length; + int i=bases.length-1; + for(; i>=0 && good0 || b=='N') : "index "+i+": q="+q+", b="+(char)b+"\n"+new String(bases)+"\n"+Arrays.toString(qual)+"\n"; + if(q>trimq){good++;} + else{good=0; lastBad=i;} + } + if(verbose){ + System.err.println("After trimLeft: good="+good+", lastBad="+lastBad+", i="+i+", returning "+(bases.length-lastBad)); + } + return bases.length-lastBad; + } + + /** Count number of bases that need trimming on left side, considering only N as bad */ + public static int testLeftN(byte[] bases){ + if(bases==null || bases.length==0){return 0;} + int good=0; + int lastBad=-1; + for(int i=0; i=0 && good ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + while(reads!=null && reads.size()>0){ + for(Read r : reads){ + if(r.paired()){ + Read r2=r.mate; + + if(r.chrom<1 && r.numSites()>0){ + SiteScore ss=r.topSite(); //Should not be necessary + r.start=ss.start; + r.stop=ss.stop; + r.chrom=ss.chrom; + r.setStrand(ss.strand); + } + if(r2.chrom<1 && r2.numSites()>0){ + SiteScore ss=r2.topSite(); //Should not be necessary + r2.start=ss.start; + r2.stop=ss.stop; + r2.chrom=ss.chrom; + r2.setStrand(ss.strand); + } + + assert(r.paired()); + assert(r2.paired()); + assert(r.numericID==r2.numericID); + assert(r.chrom==r2.chrom) : "\n\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n\n"; + assert(Tools.absdif(r.start, r2.start)<100000) : "\n\n"+r.toText(false)+"\n\n"+r2.toText(false)+"\n\n"; + } + } + + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + ReadWrite.closeStream(cris); + }else{ + ArrayList reads=rtis.nextList(); + while(reads!=null && reads.size()>0){ + for(Read r : reads){ + if(r.paired()){ + Read r2=r.mate; + + if(r.chrom<1 && r.numSites()>0){ + SiteScore ss=r.topSite(); //Should not be necessary + r.start=ss.start; + r.stop=ss.stop; + r.chrom=ss.chrom; + r.setStrand(ss.strand); + } + if(r2.chrom<1 && r2.numSites()>0){ + SiteScore ss=r2.topSite(); //Should not be necessary + r2.start=ss.start; + r2.stop=ss.stop; + r2.chrom=ss.chrom; + r2.setStrand(ss.strand); + } + + assert(r.paired()); + assert(r2.paired()); + assert(r.numericID==r2.numericID); + assert(r.chrom==r2.chrom); + assert(Tools.absdif(r.start, r2.start)<100000); + } + } + reads=rtis.nextList(); + } + rtis.close(); + } + } + + + public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads. 
+ + +} diff --git a/current/dna/AminoAcid.java b/current/dna/AminoAcid.java new file mode 100755 index 0000000..4a29f06 --- /dev/null +++ b/current/dna/AminoAcid.java @@ -0,0 +1,606 @@ +package dna; +import java.util.Arrays; +import java.util.HashMap; + + +/** + * @author Brian Bushnell + * @date July 1, 2010 + * + */ +public final class AminoAcid { + + + public static void main(String[] args){ +// for(String s : stringToAA.keySet()){ +// System.out.println(s+"\t->\t"+stringToAA.get(s)); +// } + + String bases="atctgatTGGcgcgatatatcg"; + String acids=stringToAAs(bases); + + System.out.println(bases+" -> "+acids); + + } + + + private AminoAcid(){ + this(null); + assert(false); + System.exit(0); + } + + private AminoAcid(String line){ + String[] s2=line.split(", "); + String[] s3=new String[s2.length-3]; + for(int i=3; i>=2; + } + return sb.reverse().toString(); + } + + + public final String name; + public final String symbol; + public final char letter; + public final String[] codeStrings; + + + //a=1 + //c=2 + //g=4 + //t=8 + +// R G A (puRine) +// Y T C (pYrimidine) +// K G T (Ketone) +// M A C (aMino group) +// S G C (Strong interaction) +// W A T (Weak interaction) +// B G T C (not A) (B comes after A) +// D G A T (not C) (D comes after C) +// H A C T (not G) (H comes after G) +// V G C A (not T, not U) (V comes after U) +// N A G C T (aNy) +// X masked +// - gap of indeterminate length + + public static final byte[] numberToBase={ + 'A','C','G','T','N' + }; + + public static final byte[] numberToComplementaryBase={ + 'T','G','C','A','N' + }; + + public static final byte[] numberToComplement={ + 3,2,1,0,4 + }; + + public static final byte[] numberToBaseExtended={ + ' ','A','C','M','G','R','S','V', //0-7 + 'T','W','Y','H','K','D','B','N', //8-15 + 'X',' ',' ',' ',' ',' ',' ',' ', //16-23 + }; + + /** Has 'N' in position 0. Mainly for translating compressed arrays containing zeroes to bases. 
*/ + public static final byte[] numberToBaseExtended2={ + 'N','A','C','M','G','R','S','V', //0-7 + 'T','W','Y','H','K','D','B','N', //8-15 + 'X',' ',' ',' ',' ',' ',' ',' ', //16-23 + }; + + public static final byte[] numberToComplementaryBaseExtended={ + ' ','T','G','K','C','Y','W','B', //0-7 + 'A','S','R','D','M','H','V','N', //8-15 + 'X',' ',' ',' ',' ',' ',' ',' ', //16-23 + }; + + /** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', -1 otherwise */ + public static final byte[] baseToNumber=new byte[128]; + + /** Element i is: 3 for 'A', 2 for 'C', 1 for 'G', 0 for 'T', -1 otherwise */ + public static final byte[] baseToComplementNumber=new byte[128]; + + /** Element i is: 0 for 'A', 1 for 'C', 2 for 'G', 3 for 'T', 4 for 'N', -1 otherwise */ + public static final byte[] baseToNumberACGTN=new byte[128]; + + public static final byte[] baseToComplementExtended=new byte[128]; + + /** Element i is the bitwise OR of constituent IUPAC base numbers in baseToNumber.
+ * For example, baseToNumberExtended['M'] = (baseToNumber['A'] | baseToNumber['C']) = (1 | 2) = 3 */ + public static final byte[] baseToNumberExtended=new byte[128]; + public static final AminoAcid[] AlphabeticalAAs=new AminoAcid[21]; + public static final AminoAcid[] codeToAA=new AminoAcid[64]; + public static final char[] codeToChar=new char[64]; + public static final HashMap stringToAA=new HashMap(512); + + public static final AminoAcid Alanine=new AminoAcid("Alanine, Ala, A, GCU, GCC, GCA, GCG"); + public static final AminoAcid Arginine=new AminoAcid("Arginine, Arg, R, CGU, CGC, CGA, CGG, AGA, AGG"); + public static final AminoAcid Asparagine=new AminoAcid("Asparagine, Asn, N, AAU, AAC"); + public static final AminoAcid AsparticAcid=new AminoAcid("AsparticAcid, Asp, D, GAU, GAC"); + public static final AminoAcid Cysteine=new AminoAcid("Cysteine, Cys, C, UGU, UGC"); + public static final AminoAcid GlutamicAcid=new AminoAcid("GlutamicAcid, Glu, E, GAA, GAG"); + public static final AminoAcid Glutamine=new AminoAcid("Glutamine, Gln, Q, CAA, CAG"); + public static final AminoAcid Glycine=new AminoAcid("Glycine, Gly, G, GGU, GGC, GGA, GGG"); + public static final AminoAcid Histidine=new AminoAcid("Histidine, His, H, CAU, CAC"); + public static final AminoAcid Isoleucine=new AminoAcid("Isoleucine, Ile, I, AUU, AUC, AUA"); + public static final AminoAcid Leucine=new AminoAcid("Leucine, Leu, L, UUA, UUG, CUU, CUC, CUA, CUG"); + public static final AminoAcid Lysine=new AminoAcid("Lysine, Lys, K, AAA, AAG"); + public static final AminoAcid Methionine=new AminoAcid("Methionine, Met, M, AUG"); + public static final AminoAcid Phenylalanine=new AminoAcid("Phenylalanine, Phe, F, UUU, UUC"); + public static final AminoAcid Proline=new AminoAcid("Proline, Pro, P, CCU, CCC, CCA, CCG"); + public static final AminoAcid Serine=new AminoAcid("Serine, Ser, S, UCU, UCC, UCA, UCG, AGU, AGC"); + public static final AminoAcid Threonine=new AminoAcid("Threonine, Thr, T, ACU, ACC, ACA, 
ACG"); + public static final AminoAcid Tryptophan=new AminoAcid("Tryptophan, Trp, W, UGG"); + public static final AminoAcid Tyrosine=new AminoAcid("Tyrosine, Tyr, Y, UAU, UAC"); + public static final AminoAcid Valine=new AminoAcid("Valine, Val, V, GUU, GUC, GUA, GUG"); + public static final AminoAcid END=new AminoAcid("End, End, *, UAA, UGA, UAG"); +// public static final AminoAcid ANY=new AminoAcid("Any, Any, X, XXX"); + + + + + public static final byte[][] COLORS=new byte[][] { + {0, 1, 2, 3}, + {1, 0, 3, 2}, + {2, 3, 0, 1}, + {3, 2, 1, 0} + }; + + /** Returns a new reverse-complemented array in ASCII coding*/ + public static final byte[] reverseComplementBases(final byte[] in){ + byte[] out=new byte[in.length]; + final int last=in.length-1; + for(int i=0; i0 && b>=2; + } + return out; + } + + public static final long reverseComplementBinary(long kmer, int k){ + long out=0; + kmer=~kmer; + for(int i=0; i>=2; + } + return out; + } + + public static final int reverseComplementBinaryFast(int kmer, int k){ + int out=0; + int extra=k&3; + for(int i=0; i>=2; + } + k-=extra; + for(int i=0; i>=8; + } + return out; + } + + public static final long reverseComplementBinaryFast(long kmer, int k){ + long out=0; + int extra=k&3; + for(int i=0; i>=2; + } + k-=extra; + for(int i=0; i>=8; + } + return out; + } + + public static final byte[] toColorspaceSimulated(byte[] array){ + byte[] out=new byte[array.length+1]; + out[0]='T'; + out[1]=baseToColor((byte)'T', array[0]); + for(int i=2; i3){ +// System.err.println("colorToBase("+(char)base1+","+color+") = N"); + return (byte)'N'; + } + byte a=baseToNumber[base1]; + +// System.err.println("colorToBase("+(char)base1+","+color+") = "+(char)numberToBase[COLORS[a][color]]); + + return numberToBase[COLORS[a][color]]; + } + + public static final byte toNumber(String code){ + return toNumber(code.charAt(0), code.charAt(1), code.charAt(2)); + } + + public static final AminoAcid toAA(String code){ + return toAA(code.charAt(0), 
code.charAt(1), code.charAt(2)); + } + + public static final char toChar(String code){ + return toChar(code.charAt(0), code.charAt(1), code.charAt(2)); + } + + public static final char[] splitBase(char c){ + byte b=baseToNumberExtended[c]; + int len=Integer.bitCount(b); + char[] out=new char[len]; + + int index=0; + for(int i=0; i<4; i++){ + if(((1<=0; i--){ + int temp=code&3; + code>>=2; + bytes[i]=numberToBase[temp]; + } + + return bytes; + } + + public static final int baseTupleToNumber(byte[] tuple){ + + int r=0; + for(int i=0; i3){return -1;} + r=((r<<2)|temp); + } + + return r; + } + + public static boolean isFullyDefined(char base){ + return baseToNumber[base]>=0; + } + + public static boolean isFullyDefined(byte base){ + return base>=0 && baseToNumber[base]>=0; + } + + public static boolean isACGTN(char base){ + return baseToNumberACGTN[base]>=0; + } + + public static boolean isACGTN(byte base){ + return base>=0 && baseToNumberACGTN[base]>=0; + } + + public static boolean containsOnlyACGTN(String s){ + if(s==null || s.length()==0){return true;} + for(int i=0; i=0; + } + + public static boolean isFullyDefined(byte c, boolean colorspace){ + return colorspace ? 
(c>=0 && c<=3) : (c>=0 && baseToNumber[c]>=0); + } + + public static boolean isFullyDefined(String s){ + for(int i=0; i=0 && baseToNumberACGTN[c2]>=0 && baseToNumberACGTN[c3]>=0); + int x=(baseToNumberACGTN[c1]<<4)|(baseToNumberACGTN[c2]<<2)|(baseToNumberACGTN[c3]); + return (byte)x; + } + + public static final AminoAcid toAA(char c1, char c2, char c3){ + assert(baseToNumberACGTN[c1]>=0 && baseToNumberACGTN[c2]>=0 && baseToNumberACGTN[c3]>=0); + int x=(baseToNumberACGTN[c1]<<4)|(baseToNumberACGTN[c2]<<2)|(baseToNumberACGTN[c3]); + return codeToAA[x]; + } + + public static final char toChar(char c1, char c2, char c3){ + assert(baseToNumberACGTN[c1]>=0 && baseToNumberACGTN[c2]>=0 && baseToNumberACGTN[c3]>=0); + int x=(baseToNumberACGTN[c1]<<4)|(baseToNumberACGTN[c2]<<2)|(baseToNumberACGTN[c3]); + return codeToChar[x]; + } + + public static final char toChar(byte c1, byte c2, byte c3){ + assert(baseToNumberACGTN[c1]>=0 && baseToNumberACGTN[c2]>=0 && baseToNumberACGTN[c3]>=0); + byte n1=baseToNumberACGTN[c1], n2=baseToNumberACGTN[c2], n3=baseToNumberACGTN[c3]; + if(n1>3 || n2>3 || n3>3){return '?';} + int x=(n1<<4)|(n2<<2)|(n3); +// return (x list=new ArrayList(4); + for(File f2 : f.listFiles()){ + if(!f2.isDirectory() && f2.isFile()){ + String s=f2.getAbsolutePath(); + if(s.endsWith(".chrom") || s.endsWith(".chromC") || s.contains(".chrom.") || s.contains(".chromC.")){ + list.add(s); + } + } + } + chromfiles=list.toArray(new String[list.size()]); + } + } + + String outfile=args[1]; + int blocklen=Integer.parseInt(args[2]); + int trigger=(args.length>3 ? 
Integer.parseInt(args[3]) : 0); + + TextStreamWriter tsw=new TextStreamWriter(outfile, true, false, false); + tsw.start(); + + if(trigger<=0){ //Write normally + for(int i=0; i0){ + sb.append('N'); + if(ns==trigger){ + sb.setLength(sb.length()-ns); + tsw.print(">"+contig+"\n"); + contig++; + writeContig(sb, tsw, fastaBlocklen); + sb.setLength(0); + } + } + }else{ + sb.append((char)b); + ns=0; + } + } + + + if(sb.length()>0){ + sb.setLength(sb.length()-ns); + tsw.print(">"+contig+"\n"); + contig++; + writeContig(sb, tsw, fastaBlocklen); + sb.setLength(0); + } + + return contig; + } + + public static void writeContig(StringBuilder sb, TextStreamWriter tsw, int blocklen){ + for(int i=0; i"+cha.chromosome+"\n"); + writeChrom(cha, tsw, blocklen); + tsw.poison(); + } + + public static void writeChrom(ChromosomeArray cha, TextStreamWriter tsw, int blocklen){ + tsw.println(">"+cha.chromosome); + for(int i=0; i<=cha.maxIndex; i+=blocklen){ + int max=Tools.min(i+blocklen-1, cha.maxIndex); + tsw.println(cha.getString(i, max)); + } + } + +} diff --git a/current/dna/ChromosomeArray.java b/current/dna/ChromosomeArray.java new file mode 100755 index 0000000..26a8b59 --- /dev/null +++ b/current/dna/ChromosomeArray.java @@ -0,0 +1,401 @@ +package dna; +import java.io.Serializable; +import java.util.ArrayList; +import java.util.Arrays; + +import align2.Tools; + +import fileIO.ReadWrite; + + +public class ChromosomeArray implements Serializable { + + + /** + * + */ + private static final long serialVersionUID = 3199182397853127842L; + + public static void main(String[] args){ + translateFile(args[1], Byte.parseByte(args[0])); + } + + + private static void translateFile(String fname, int chrom){ + + long time1=System.nanoTime(); + + ChromosomeArray cha=read(fname, chrom); + cha.chromosome=chrom; + long time2=System.nanoTime(); + + int dot=fname.lastIndexOf(".fa"); + String outfile=fname.substring(0,dot).replace("hs_ref_", "")+".chrom"; + + System.out.println("Writing to "+outfile); + 
+ System.out.println("minIndex="+cha.minIndex+", maxIndex="+cha.maxIndex+", length="+cha.array.length+ + "; time="+String.format("%.3f seconds", (time2-time1)/1000000000d)); + + long time3=System.nanoTime(); + ReadWrite.write(cha, outfile, false); + cha=null; + System.gc(); + cha=read(outfile); + long time4=System.nanoTime(); + + System.out.println("minIndex="+cha.minIndex+", maxIndex="+cha.maxIndex+", length="+cha.array.length+ + "; time="+String.format("%.3f seconds", (time4-time3)/1000000000d)); + } + + public static ChromosomeArray read(String fname, int chrom){ + ChromosomeArray cha=read(fname); + assert(cha.chromosome<1); + cha.chromosome=chrom; + return cha; + } + + public static ChromosomeArray read(String fname){ + + if(fname.endsWith(".chrom") || fname.endsWith(".chrom.gz")){ + ChromosomeArray ca=ReadWrite.read(ChromosomeArray.class, fname); + return ca; + }else{ + assert(fname.endsWith(".chromC") || fname.endsWith(".chromC.gz")); + + ChromosomeArrayCompressed cac=ReadWrite.read(ChromosomeArrayCompressed.class, fname); + return cac.toChromosomeArray(); + } + } + + public ChromosomeArray(){ + this((byte)-1, Gene.PLUS); + } + + public ChromosomeArray toColorspace(){ + assert(!colorspace); + ChromosomeArray ca=new ChromosomeArray(chromosome, strand, 0, maxIndex, true); + + for(int i=0; i=array.length){//Increase size + int newlen=(int)(1+(3L*max(array.length, loc))/2); + assert(newlen>loc) : newlen+", "+loc+", "+array.length; + resize(newlen); + assert(array.length==newlen); +// System.err.println("Resized array to "+newlen); + } + char c=Character.toUpperCase((char)val); + if(AminoAcid.baseToNumberExtended[c]<0){c='N';} + array[loc]=(val>Byte.MAX_VALUE ? 
Byte.MAX_VALUE : (byte)val); + minIndex=min(loc, minIndex); + maxIndex=max(loc, maxIndex); + } + + + public void set(int loc, CharSequence s){ + int loc2=loc+s.length(); + if(loc2>array.length){//Increase size + int newlen=(int)(1+(3L*max(array.length, loc2))/2); + assert(newlen>loc2) : newlen+", "+loc2+", "+array.length; + resize(newlen); + assert(array.length==newlen); +// System.err.println("Resized array to "+newlen); + } + + for(int i=0; imaxIndex){ + for(int i=0; i3 ? -1 : b; + }else{ + return AminoAcid.baseToNumber[b]; + } + } + + public boolean isFullyDefined(int a, int b){ + for(int i=a; i<=b; i++){ + int x=AminoAcid.baseToNumber[array[i]]; + if(x<0){return false;} + } + return true; + } + + public boolean isFullyUndefined(int a, int b){ + for(int i=a; i<=b; i++){ + int x=AminoAcid.baseToNumber[array[i]]; + if(x>=0){return false;} + } + return true; + } + + public int countDefinedBases(){ + return countDefinedBases(minIndex, maxIndex); + } + + public int countDefinedBases(int a, int b){ + int sum=0; + for(int i=a; i<=b; i++){ + int x=AminoAcid.baseToNumber[array[i]]; + if(x>=0){sum++;} + } + return sum; + } + + public int getNumber(int a, int b){ + return toNumber(a, b, array); + } + + public int getNumber(int a, int b, boolean colorspace){ + return colorspace ? 
toNumberColorspace(a, b, array) : toNumber(a, b, array); + } + + public static int toNumberColorspace(int a, int b, byte[] bases){ + assert(b>=a); + assert(b-a<17); //<17 for unsigned, <16 for signed + int out=0; + for(int i=a; i<=b; i++){ + int x=bases[i]; + if(x<0 || x>3){return -1;} + out=((out<<2)|x); + } + return out; + } + + public static int toNumber(int a, int b, byte[] bases){ + assert(b>=a); + assert(b-a<17); //<17 for unsigned, <16 for signed + int out=0; + for(int i=a; i<=b; i++){ + int x=AminoAcid.baseToNumber[bases[i]]; + if(x<0){return -1;} + out=((out<<2)|x); + } + return out; + } + + public static int toNumber(int a, int b, String bases){ + int out=0; + for(int i=a; i<=b; i++){ + int x=AminoAcid.baseToNumber[bases.charAt(i)]; + if(x<0){return -1;} + out=((out<<2)|x); + } + return out; + } + + public void resize(int newlen){ + byte[] temp=new byte[newlen]; + int lim=min(array.length, newlen); + assert(lim>=maxIndex) : lim+","+maxIndex; + for(int i=0; i=0; i--){ + byte b=array[i]; + if(b=='A' || b=='C' || b=='G' || b=='T'){ + dist=0; + }else{ + dist=(dist==max ? 
max : (char)(dist+1)); + } + r[i]=Tools.min(dist, r[i]); + } + return r; + } + + public ArrayList toContigRanges(final int nBlockSize){ + assert(nBlockSize>0); + ArrayList list=new ArrayList(); + + int start=-1; + int stop=-1; + int ns=nBlockSize+1; + + boolean contig=false; + + for(int i=minIndex; i<=maxIndex; i++){ + byte b=array[i]; + if(b=='N' || b=='X'){ + ns++; + if(contig && (b=='X' || ns>=nBlockSize)){ + Range r=new Range(start, stop); + list.add(r); + contig=false; + } + }else{ + ns=0; + if(!contig){start=i;} + contig=true; + stop=i; + } + } + if(contig){ + Range r=new Range(start, stop); + list.add(r); + } + return list; + } + + + public boolean equalsIgnoreCase(ChromosomeArray other){ + if(minIndex!=other.minIndex){System.err.println("a");return false;} + if(maxIndex!=other.maxIndex){System.err.println("b");return false;} + if(chromosome!=other.chromosome){System.err.println("c");return false;} + if(array.length!=other.array.length){System.err.println("d");return false;} + for(int i=minIndex; i<=maxIndex; i++){ + if(Character.toLowerCase(array[i])!=Character.toLowerCase(other.array[i])){ + System.err.println("e"); + return false; + } + } + return true; + } + + private static final long min(long x, long y){return xy ? x : y;} + private static final int min(int x, int y){return xy ? 
x : y;} + + public final byte strand; + public int chromosome; + public byte[] array; + public int maxIndex=-1; + public int minIndex=Integer.MAX_VALUE; + + public final boolean colorspace; + + +} diff --git a/current/dna/ChromosomeArrayCompressed.java b/current/dna/ChromosomeArrayCompressed.java new file mode 100755 index 0000000..01f1271 --- /dev/null +++ b/current/dna/ChromosomeArrayCompressed.java @@ -0,0 +1,324 @@ +package dna; +import java.io.Serializable; + +import align2.Tools; + +import fileIO.ReadWrite; + + +public class ChromosomeArrayCompressed implements Serializable { + + + public static void main(String[] args){ + +// ChromosomeArray ca; +// +// +// ca=Data.getChromosome(21); +// ca=new ChromosomeArray((byte)1, "acagtgca"); +// +// +// ChromosomeArrayCompressed cac=new ChromosomeArrayCompressed(ca); +// ChromosomeArray ca2=cac.toChromosomeArray(); +// +// assert(ca.minIndex==ca2.minIndex); +// assert(ca.maxIndex==ca2.maxIndex); +// assert(ca.chromosome==ca2.chromosome); +// assert(ca.array.length==ca2.array.length); +// +// System.out.println("Old: "+ca.toBaseString()); +// System.out.println("New: "+ca2.toBaseString()); +// +// for(int i=0; i<=ca.maxIndex; i++){ +// if(Character.toLowerCase(ca.array[i])!=Character.toLowerCase(ca2.array[i])){ +// System.out.println("Error at "+i); +// System.exit(1); +// } +// } + + if(args.length>2){ + Data.setGenome(Integer.parseInt(args[2])); + } + + byte minChrom=1; + byte maxChrom=26; + + String root=args[0].replace('\\', '/'); + if(!root.endsWith("/")){root+="/";} + + String fname=args[1]; + + if(fname.contains("#")){ + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + + try { + translateFile(root, chrom, fname.replace("#", Gene.chromCodes[chrom])); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + }else{ +// translateFile(root, fname); //chrom is unknown + assert(false); + } + + if(args.length>2){ + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + 
System.out.println("Loading chr"+Gene.chromCodes[chrom]+" colorspace."); + Data.getChromosome(chrom); + Data.unload(chrom, true); + } + } + + } + + + private static void translateFile(String root, int chrom, String fname){ + System.out.print(chrom+":\t"); + + ChromosomeArray ca=ChromosomeArray.read(root+fname, (byte) chrom); + System.out.print("Loaded\t"); +// System.out.println("\n"+ca.minIndex+", "+ca.maxIndex+"\n"+ca.getString(19999992,20000081)+"\n"); +// System.out.print(root+fname); + + assert(chrom==ca.chromosome); + + ChromosomeArrayCompressed cac=new ChromosomeArrayCompressed(ca); + System.out.print("Translated\t"); + assert(chrom==cac.chromosome); + ReadWrite.write(cac, root+"chr"+Gene.chromCodes[chrom]+".chromC", false); + System.out.print("Wrote\t"); + + cac=null; + cac=ReadWrite.read(ChromosomeArrayCompressed.class, root+"chr"+Gene.chromCodes[chrom]+".chromC"); + System.out.print("Reloaded\t"); + assert(chrom==cac.chromosome); + + ChromosomeArray ca2=cac.toChromosomeArray(); + +// System.out.println("\n"+ca2.minIndex+", "+ca2.maxIndex+"\n"+ca2.getString(19999992,20000081)+"\n"); + + boolean success=ca2.equalsIgnoreCase(ca); + + System.out.println(success ? "Success" : "Fail"); + + assert(success) : "\n"+ca2.getString(0, 100)+"\n"+ca.getString(0, 100); + +// Data.unload(chrom, true); //Why is this here? 
+ } + + + public ChromosomeArrayCompressed(ChromosomeArray ca){ + this(ca.chromosome, ca.minIndex, ca.maxIndex, ca.array); + } + + + public ChromosomeArrayCompressed(int chrom, int min, int max, byte[] letters){ + + + array=new byte[Tools.max(max, letters.length-1)/2+1]; + minIndex=min; + maxIndex=max; + chromosome=chrom; + + for(int i=min; i<=max; i++){ + byte letter=letters[i]; + write(i, letter); + } + + } + + + public void translate(byte[] dest){ + +// int min=minIndex/2; +// int max= + + byte[] map=AminoAcid.numberToBaseExtended2; + + int max=dest.length/2; + + +// int min=minIndex/2; +// int min2=min*2; + + +// for(int i=0, j=0, k=1; i>4); +// dest[j]=map[n0]; +// +// byte n1=(byte)(b&0xF); +// dest[k]=map[n1]; +// } + + + for(int i=0, j=0; i>4); + dest[j]=map[n0]; + + byte n1=(byte)(b&0xF); + dest[j+1]=map[n1]; + } + + if((dest.length&1) == 1){ + dest[dest.length-1]=readLetter(dest.length-1); + } + + } + + + public ChromosomeArray toChromosomeArray(){ + + //TODO Store strand data + ChromosomeArray ca=new ChromosomeArray(chromosome, Gene.PLUS, minIndex, maxIndex); + +// byte[] caarray=ca.array; +// +// for(int i=ca.minIndex; i<=ca.maxIndex; i++){ +// caarray[i]=readLetter(i); +// } + + translate(ca.array); + + return ca; + } + + public byte readNumber(int pos){ + int remap=pos/2; + byte bit1=(byte) (pos&1); + int old=array[remap]&0xFF; + + if(bit1==0){ + old=(old>>>4); + }else{ + old=(old&0x0F); + } + return (byte)old; + } + + public byte readLetter(int pos){ + int remap=pos/2; + byte bit1=(byte) (pos&1); + int old=array[remap]&0xFF; + + if(bit1==0){ + old=(old>>>4); + }else{ + old=(old&0x0F); + } + +// System.out.println("Reading "+Integer.toHexString(array[remap])+" at position "+pos+" -> "+(char)AminoAcid.numberToBaseExtended[old]); + + return AminoAcid.numberToBaseExtended[old]; + } + + private void write(int pos, byte letter){ + int remap=pos/2; + byte bit1=(byte) (pos&1); + int old=array[remap]; + int number=AminoAcid.baseToNumberExtended[letter]; + 
assert(number<16); + + if(bit1==0){ + +// System.out.println("01: "+padZeroes(Integer.toBinaryString(old), 8)); + old=(old&0x0F); +// System.out.println("02: "+padZeroes(Integer.toBinaryString(old), 8)); + old=(old|(number<<4)); +// System.out.println("03: "+padZeroes(Integer.toBinaryString(old), 8)); + }else{ +// System.out.println("11: "+padZeroes(Integer.toBinaryString(old), 8)); + old=(old&0xF0); +// System.out.println("12: "+padZeroes(Integer.toBinaryString(old), 8)); + old=(old|number); +// System.out.println("13: "+padZeroes(Integer.toBinaryString(old), 8)); + } + +// System.out.println("Writing "+(char)letter+" at position "+pos +// +": "+padZeroes(Integer.toBinaryString(array[remap]), 8)+" -> "+padZeroes(Integer.toBinaryString(old), 8)); + + array[remap]=(byte)old; + } + + + private static String padZeroes(String s, int len){ + while(s.length()len){ + int chop=s.length()-len; + s=s.substring(chop); + } + return s; + } + + public ChromosomeArrayCompressed(){ + this((byte)-1); + } + + public ChromosomeArrayCompressed(int chrom){ + chromosome=chrom; + } + + + public void set(int loc, int val){ + + if(loc>=array.length){//Increase size + int newlen=1+(3*max(array.length, loc))/2; + assert(newlen>loc); + resize(newlen); + assert(array.length==newlen); + } + + array[loc]=(val>Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)val); + minIndex=min(loc, minIndex); + maxIndex=max(loc, maxIndex); + } + + public byte get(int loc){ + return array[loc]; + } + + public String get(int a, int b){ + StringBuilder sb=new StringBuilder(b-a+1); + for(int i=a; i<=b; i++){ + sb.append((char)get(i)); + } + return sb.toString(); + } + + public void resize(int newlen){ + byte[] temp=new byte[newlen]; + int lim=min(array.length, newlen); + assert(lim>maxIndex) : lim+","+maxIndex; + for(int i=0; iy ? x : y;} + private static final int min(int x, int y){return xy ? 
x : y;} + + public boolean extended=false; + + public final int chromosome; + public byte[] array; + public int maxIndex=-1; + public int minIndex=Integer.MAX_VALUE; + + + /** + * + */ + private static final long serialVersionUID = -8836873912811373713L; + +} diff --git a/current/dna/Coverage.java b/current/dna/Coverage.java new file mode 100755 index 0000000..3c0d837 --- /dev/null +++ b/current/dna/Coverage.java @@ -0,0 +1,26 @@ +package dna; +import java.util.HashSet; + +import var.VarLine; + + +public class Coverage{ + + public Coverage(Gene gg){ + g=gg; + } + + public final Gene g; + public HashSet varSet; //TODO: Could change these to arrays and sort them. + public int min=Integer.MAX_VALUE; + public int max=0; + public int covered=0; + public int uncovered=0; + public long sum=0; + public float avg; + public float covRatio; + + public int[] missingChromRelative; + public int[] missingGeneRelative; + +} \ No newline at end of file diff --git a/current/dna/CoverageArray.java b/current/dna/CoverageArray.java new file mode 100755 index 0000000..2358d48 --- /dev/null +++ b/current/dna/CoverageArray.java @@ -0,0 +1,128 @@ +package dna; +import java.io.Serializable; +import java.util.ArrayList; + +import fileIO.ReadWrite; + + +public abstract class CoverageArray implements Serializable { + + /** + * + */ + private static final long serialVersionUID = -7175422489330746676L; + + + public static final CoverageArray read(String fname){ + CoverageArray ca=null; + + if(!fname.contains(".ca")){ + throw new RuntimeException(); +// ca=new CoverageArray2(); +// ca.load(new TsvCoverageFile(fname)); +// return ca; + } + + fname=ReadWrite.findFileExtension(fname); +// System.err.println("Found "+fname); + + return ReadWrite.read(CoverageArray.class, fname); + +// if(fname.endsWith(".ca2") || fname.contains(".ca2.")){return ReadWrite.read(CoverageArray2.class, fname);} +// else if(fname.endsWith(".ca") || fname.contains(".ca.")){return ReadWrite.read(CoverageArray1.class, 
fname);} +// else{return ReadWrite.read(CoverageArray.class, fname);} + } + + public CoverageArray(int chrom){chromosome=chrom;} + + /** + * @param loc + * @param amt + */ + public abstract void increment(int loc, int amt); + + /** + * @param loc + */ + public abstract void increment(int loc); + + public abstract void incrementRange(int min, int max, int amt); + + + public abstract void set(int loc, int val); + + public abstract int get(int loc); + + public abstract void resize(int newlen); + + + public final double[][] toGraph(int blocksize, int min, int max){ + + min=max(min, minIndex); + max=min(max, maxIndex); + int length=max-min; + + ArrayList list=new ArrayList(); + + int block; + + if(blocksize<=0){ +// block=((array.length+62999)/63000);//For Excel +// block=((length+62999)/63000);//For Excel + block=((length+31499)/31500);//For Excel + }else{ + block=blocksize; + } + block=max(block, 1); + + int current=0; + double[] sum=new double[2]; + for(int loc=min; loc<=max; loc++){ + if(current==block){ + for(int i=0; iy ? x : y;} + private static final int min(int x, int y){return xy ? 
x : y;} + + public int chromosome; + + public int maxIndex=-1; + public int minIndex=Integer.MAX_VALUE; + public int length(){return maxIndex-minIndex+1;} + public abstract int arrayLength(); + + private static boolean OVERFLOWED=false; + +} diff --git a/current/dna/CoverageArray1.java b/current/dna/CoverageArray1.java new file mode 100755 index 0000000..13f517e --- /dev/null +++ b/current/dna/CoverageArray1.java @@ -0,0 +1,216 @@ +package dna; +import java.io.Serializable; + +import driver.Translator2; + +import fileIO.ReadWrite; + + +public class CoverageArray1 extends CoverageArray implements Serializable { + + + /** + * + */ + private static final long serialVersionUID = 6711045833114428632L; + + + public static void main(String[] args){ + runSpeedTest(args); + +// translateGenomeBuild(args); + } + + public static void runSpeedTest(String[] args){ + + long time1=System.nanoTime(); + + CoverageArray1 ca=(CoverageArray1)read(args[1]); + ca.chromosome=Byte.parseByte(args[0]); + long time2=System.nanoTime(); + +// int dot=args[1].lastIndexOf("."); +// String outfile=args[1].substring(0,dot)+".ca"; + + args[1]=args[1].replace('\\', '/'); + int slash=args[1].lastIndexOf('/'); + String outfile; + if(slash<1){ + outfile="coverage-chr"+Gene.chromCodes[ca.chromosome]+"-build"+Data.GENOME_BUILD+".ca"; + }else{ + outfile=args[1].substring(0,slash+1)+"coverage-chr"+Gene.chromCodes[ca.chromosome]+"-build"+Data.GENOME_BUILD+".ca"; + } + + System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+ + "; time="+String.format("%.3f seconds", (time2-time1)/1000000000d)); + + long time3=System.nanoTime(); + ReadWrite.write(ca, outfile, false); + ca=null; + System.gc(); + ca=(CoverageArray1)read(outfile); + long time4=System.nanoTime(); + + System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+ + "; time="+String.format("%.3f seconds", (time4-time3)/1000000000d)); + + + } + + public static void 
translateGenomeBuild(String[] args){ + + Timer t=new Timer(); + t.start(); + + int inBuild=Integer.parseInt(args[0]); + int outBuild=Integer.parseInt(args[1]); + String root=args[2]; + + translateGenomeBuild(inBuild, outBuild, root); + + t.stop(); + System.out.println("Time:\t"+t); + + } + + public static void translateGenomeBuild(int inBuild, int outBuild, String root){ + root=root.replace('\\', '/'); + if(!root.endsWith("/")){root+="/";} + + CoverageArray1[] out=new CoverageArray1[27]; + + for(int chrom=1; chrom=array.length){//Increase size + int newlen=1+(7*max(array.length, max))/4; + assert(newlen>max); + resize(newlen); + assert(array.length==newlen); + }else if(max<0){max=-1;} + for(int i=min; i<=max; i++){ + int val=array[i]+amt; + if(val>Short.MAX_VALUE){ + val=Short.MAX_VALUE; + if(!OVERFLOWED){ + System.err.println("Note: Coverage capped at "+Short.MAX_VALUE); + OVERFLOWED=true; + } + } + array[i]=(short)val; + } + } + + public void set(int loc, int val){ + + if(loc>=array.length){//Increase size + int newlen=1+(7*max(array.length, loc))/4; + assert(newlen>loc); + resize(newlen); + assert(array.length==newlen); + }else if(loc<0){ +// minIndex=min(0, minIndex); +// maxIndex=max(0, maxIndex); + return; + } + + if(val>Short.MAX_VALUE && !OVERFLOWED){ + System.err.println("Note: Coverage capped at "+Short.MAX_VALUE); + OVERFLOWED=true; + } + array[loc]=(val>Short.MAX_VALUE ? Short.MAX_VALUE : (short)val); + minIndex=min(loc, minIndex); + maxIndex=max(loc, maxIndex); + } + + public int get(int loc){ + return loc>=array.length || loc<0 ? 0 : array[loc]; + } + + public void resize(int newlen){ + System.out.println("Resized chrom "+chromosome+" to "+newlen); + short[] temp=new short[newlen]; + int lim=min(array.length, newlen); + assert(lim>maxIndex) : lim+","+maxIndex; + for(int i=0; iy ? x : y;} + private static final int min(int x, int y){return xy ? 
x : y;} + + public short[] array; + public int length(){return maxIndex-minIndex+1;} + public int arrayLength(){return array.length;} + + private static boolean OVERFLOWED=false; + /** + * + */ +// private static final long serialVersionUID = -7493066925636540386L; + +} diff --git a/current/dna/CoverageArray2.java b/current/dna/CoverageArray2.java new file mode 100755 index 0000000..4409234 --- /dev/null +++ b/current/dna/CoverageArray2.java @@ -0,0 +1,218 @@ +package dna; +import java.io.Serializable; + +import driver.Translator2; + +import fileIO.ReadWrite; + + +public class CoverageArray2 extends CoverageArray implements Serializable { + + + /** + * + */ + private static final long serialVersionUID = 23847758821160827L; + + + public static void main(String[] args){ + runSpeedTest(args); + +// translateGenomeBuild(args); + } + + public static void runSpeedTest(String[] args){ + + long time1=System.nanoTime(); + + CoverageArray2 ca=(CoverageArray2)read(args[1]); + ca.chromosome=Byte.parseByte(args[0]); + long time2=System.nanoTime(); + +// int dot=args[1].lastIndexOf("."); +// String outfile=args[1].substring(0,dot)+".ca"; + + args[1]=args[1].replace('\\', '/'); + int slash=args[1].lastIndexOf('/'); + String outfile; + if(slash<1){ + outfile="coverage-chr"+Gene.chromCodes[ca.chromosome]+"-build"+Data.GENOME_BUILD+".ca"; + }else{ + outfile=args[1].substring(0,slash+1)+"coverage-chr"+Gene.chromCodes[ca.chromosome]+"-build"+Data.GENOME_BUILD+".ca"; + } + + System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+ + "; time="+String.format("%.3f seconds", (time2-time1)/1000000000d)); + + long time3=System.nanoTime(); + ReadWrite.write(ca, outfile, false); + ca=null; + System.gc(); + ca=(CoverageArray2)read(outfile); + long time4=System.nanoTime(); + + System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+ + "; time="+String.format("%.3f seconds", (time4-time3)/1000000000d)); + + + } + + 
public static void translateGenomeBuild(String[] args){ + + Timer t=new Timer(); + t.start(); + + int inBuild=Integer.parseInt(args[0]); + int outBuild=Integer.parseInt(args[1]); + String root=args[2]; + + translateGenomeBuild(inBuild, outBuild, root); + + t.stop(); + System.out.println("Time:\t"+t); + + } + + public static void translateGenomeBuild(int inBuild, int outBuild, String root){ + root=root.replace('\\', '/'); + if(!root.endsWith("/")){root+="/";} + + CoverageArray2[] out=new CoverageArray2[27]; + + for(int chrom=1; chrom=array.length){//Increase size + int newlen=1+(7*max(array.length, max))/4; + assert(newlen>max); + resize(newlen); + assert(array.length==newlen); + }else if(max<0){max=-1;} + for(int i=min; i<=max; i++){ + int val=array[i]+amt; + if(val>Character.MAX_VALUE){ + val=Character.MAX_VALUE; + if(!OVERFLOWED){ + System.err.println("Note: Coverage capped at "+Character.MAX_VALUE); + OVERFLOWED=true; + } + } + array[i]=(char)val; + } + } + + + public void set(int loc, int val){ + + if(loc>=array.length){//Increase size + int newlen=1+(7*max(array.length, loc))/4; + assert(newlen>loc); + resize(newlen); + assert(array.length==newlen); + }else if(loc<0){ +// minIndex=min(0, minIndex); +// maxIndex=max(0, maxIndex); + return; + } + + if(val>Character.MAX_VALUE && !OVERFLOWED){ + System.err.println("Note: Coverage capped at "+Character.MAX_VALUE); + OVERFLOWED=true; + } + array[loc]=(val>Character.MAX_VALUE ? Character.MAX_VALUE : (char)val); + minIndex=min(loc, minIndex); + maxIndex=max(loc, maxIndex); + } + + public int get(int loc){ + return loc>=array.length || loc<0 ? 0 : array[loc]; + } + + public void resize(int newlen){ + System.err.println("Resized CoverageArray "+chromosome+" to "+newlen); + char[] temp=new char[newlen]; + int lim=min(array.length, newlen); + assert(lim>maxIndex) : lim+","+maxIndex; + for(int i=0; iy ? x : y;} + private static final int min(int x, int y){return xy ? 
x : y;} + + + public char[] array; + public int length(){return maxIndex-minIndex+1;} + public int arrayLength(){return array.length;} + + private static boolean OVERFLOWED=false; + /** + * + */ +// private static final long serialVersionUID = -7493066925636540386L; + +} diff --git a/current/dna/CoverageArray3.java b/current/dna/CoverageArray3.java new file mode 100755 index 0000000..218b647 --- /dev/null +++ b/current/dna/CoverageArray3.java @@ -0,0 +1,225 @@ +package dna; +import java.io.Serializable; + +import driver.Translator2; + +import fileIO.ReadWrite; + + +public class CoverageArray3 extends CoverageArray implements Serializable { + + /** + * + */ + private static final long serialVersionUID = -8138626768937480215L; + + public static void main(String[] args){ + runSpeedTest(args); + +// translateGenomeBuild(args); + } + + public static void runSpeedTest(String[] args){ + + long time1=System.nanoTime(); + + CoverageArray3 ca=(CoverageArray3)read(args[1]); + ca.chromosome=Byte.parseByte(args[0]); + long time2=System.nanoTime(); + +// int dot=args[1].lastIndexOf("."); +// String outfile=args[1].substring(0,dot)+".ca"; + + args[1]=args[1].replace('\\', '/'); + int slash=args[1].lastIndexOf('/'); + String outfile; + if(slash<1){ + outfile="coverage-chr"+Gene.chromCodes[ca.chromosome]+"-build"+Data.GENOME_BUILD+".ca"; + }else{ + outfile=args[1].substring(0,slash+1)+"coverage-chr"+Gene.chromCodes[ca.chromosome]+"-build"+Data.GENOME_BUILD+".ca"; + } + + System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+ + "; time="+String.format("%.3f seconds", (time2-time1)/1000000000d)); + + long time3=System.nanoTime(); + ReadWrite.write(ca, outfile, false); + ca=null; + System.gc(); + ca=(CoverageArray3)read(outfile); + long time4=System.nanoTime(); + + System.out.println("minIndex="+ca.minIndex+", maxIndex="+ca.maxIndex+", length="+ca.array.length+ + "; time="+String.format("%.3f seconds", (time4-time3)/1000000000d)); + + + } + + 
public static void translateGenomeBuild(String[] args){ + + Timer t=new Timer(); + t.start(); + + int inBuild=Integer.parseInt(args[0]); + int outBuild=Integer.parseInt(args[1]); + String root=args[2]; + + translateGenomeBuild(inBuild, outBuild, root); + + t.stop(); + System.out.println("Time:\t"+t); + + } + + public static void translateGenomeBuild(int inBuild, int outBuild, String root){ + root=root.replace('\\', '/'); + if(!root.endsWith("/")){root+="/";} + + CoverageArray3[] out=new CoverageArray3[27]; + + for(int chrom=1; chrom=array.length){//Increase size + int newlen=1+(7*max(array.length, max))/4; + assert(newlen>max); + resize(newlen); + assert(array.length==newlen); + }else if(max<0){max=-1;} + for(int i=min; i<=max; i++){ + long val=array[i]+amt; + if(val>Integer.MAX_VALUE){ + val=Integer.MAX_VALUE; + if(!OVERFLOWED){ + System.err.println("Note: Coverage capped at "+Integer.MAX_VALUE); + OVERFLOWED=true; + } + } + array[i]=(int)val; + } + } + + public void set(int loc, int val){ + set(loc, (long)val); + } + + public void set(int loc, long val){ + + if(loc>=array.length){//Increase size + int newlen=1+(7*max(array.length, loc))/4; + assert(newlen>loc); + resize(newlen); + assert(array.length==newlen); + }else if(loc<0){ +// minIndex=min(0, minIndex); +// maxIndex=max(0, maxIndex); + return; + } + + if(val>Integer.MAX_VALUE && !OVERFLOWED){ + System.err.println("Note: Coverage capped at "+Integer.MAX_VALUE); + OVERFLOWED=true; + } + array[loc]=(val>Integer.MAX_VALUE ? Integer.MAX_VALUE : (int)val); + minIndex=min(loc, minIndex); + maxIndex=max(loc, maxIndex); + } + + public int get(int loc){ + return loc>=array.length || loc<0 ? 0 : array[loc]; + } + + public void resize(int newlen){ + System.err.println("Resized CoverageArray "+chromosome+" to "+newlen); + int[] temp=new int[newlen]; + int lim=min(array.length, newlen); + assert(lim>maxIndex) : lim+","+maxIndex; + for(int i=0; iy ? x : y;} + private static final int min(int x, int y){return xy ? 
x : y;} + + + public int[] array; + public int length(){return maxIndex-minIndex+1;} + public int arrayLength(){return array.length;} + + private static boolean OVERFLOWED=false; + +} diff --git a/current/dna/Data.java b/current/dna/Data.java new file mode 100755 index 0000000..ad10dca --- /dev/null +++ b/current/dna/Data.java @@ -0,0 +1,1619 @@ +package dna; +import java.io.File; +import java.io.IOException; +import java.io.PrintStream; +import java.net.URL; +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.HashMap; + +import kmer.Primes; + +import var.Variation; + +import align2.AbstractIndex; +import align2.ChromLoadThread; +import align2.Tools; + + +import driver.Search; +import fileIO.ChainBlock; +import fileIO.ChainLine; +import fileIO.ReadWrite; +import fileIO.TextFile; + +public class Data { + + + public static void main(String[] args){} + + + //TODO IMPORTANT! Ensure that this unloads everything big, AND that reloading subsequently works OK. + public static void unloadAll(){ + chromosomePlusMatrix=null; + chromosomeMinusMatrix=null; + AbstractIndex.clear(); + } + + + //TODO IMPORTANT! Ensure that this unloads everything big, AND that reloading subsequently works OK. 
+ public static void unload(int chrom, boolean unloadSoft){ + +// unloadGenes(chrom); + + chromosomePlusMatrix[chrom]=null; + } + + public static void unloadGenes(int chrom){ + geneMatrix[chrom]=null; + geneSetMatrix[chrom]=null; + geneTxRangeMatrix[chrom]=null; + geneSetRangeMatrix[chrom]=null; + geneCodeRangeMatrix[chrom]=null; + geneCodeAndExonRangeMatrix[chrom]=null; + geneNearbyRangeMatrix[chrom]=null; + exonRangeMatrix[chrom]=null; + } + + public static byte find(int x, byte[] array){ + for(byte i=0; i genes=new ArrayList(); + for(Gene g : getGenes(chrom)){ + if(g.strand==strand){ + genes.add(g); + } + } + return genes.toArray(new Gene[genes.size()]); + } + + + public static GeneSet[] getGeneSets(int chrom){ + if(geneSetMatrix[chrom]==null){ + loadGenes(chrom); + } + return geneSetMatrix[chrom]; + } + + + public static HashMap> getGeneIDTable(){ + if(geneIDTable==null){ + +// System.err.println("WAITING FOR CS"); + synchronized(GENEIDLOCK){ +// System.err.println("ENTER CS"); + if(geneIDTable==null){ +// System.err.println("ENTER CS2"); + HashMap> temp=new HashMap>(2048); + for(byte chrom=1; chrom<=25; chrom++){ + GeneSet[] set=getGeneSets(chrom); + for(GeneSet gs : set){ + int id=-1; + for(Gene g : gs.genes){ + if(id==-1){id=g.id;} + else{assert(id==g.id) : gs+"\n"+gs.genes+"\n";} + } + assert(id>-1); + Integer key=new Integer(id); + ArrayList value=temp.get(key); + // assert(old==null || chrom>22) : "\nCollision!\n\n"+gs+"\n\nis overwriting\n\n"+old; + if(value==null){ + value=new ArrayList(2); + temp.put(key, value); + } + value.add(gs); + } + } +// System.err.println("EXIT CS2"); + geneIDTable=temp; + } +// System.err.println("EXIT CS"); + } + + } +// System.err.println("GeneIDTable contains "+geneIDTable.size()+" entries."); + return geneIDTable; + } + +// public static ChromosomeArray getChromosome(int chrom, boolean colorspace){ +// assert(!colorspace) : "Colorspace no longer supported"; +// if(colorspace){throw new RuntimeException("Colorspace no 
longer supported");} +// if(true){throw new RuntimeException("Colorspace no longer supported");} +// return getChromosome(chrom); +// } + + public static ChromosomeArray getChromosome(int chrom, byte strand){ + if(strand==Gene.PLUS){return getChromosome(chrom);} + + if(chromosomeMinusMatrix[chrom]==null){ + synchronized(CHROMLOCKS[chrom%CHROMLOCKS.length]){ + if(chromosomeMinusMatrix[chrom]==null){ + ChromosomeArray p=getChromosome(chrom); + chromosomeMinusMatrix[chrom]=p.complement(); + } + } + } + return chromosomeMinusMatrix[chrom]; + } + + public static ChromosomeArray getChromosome(int chrom){ + assert(chromosomePlusMatrix!=null); + assert(chromosomePlusMatrix.length>chrom) : chromosomePlusMatrix.length+", "+chrom; + if(chromosomePlusMatrix[chrom]==null){ + synchronized(CHROMLOCKS[chrom%CHROMLOCKS.length]){ + if(chromosomePlusMatrix[chrom]==null){loadChromosome(chrom);} + } + } + assert(chromosomePlusMatrix[chrom].array[0]=='N') : (char)chromosomePlusMatrix[chrom].array[0]+ + "\nIf you see this message, please regenerate your index.";//startpad was too low or for some reason invalid. 
+ return chromosomePlusMatrix[chrom]; + } + + private static void loadGenes(int chrom){ + + if(geneMatrix[chrom]!=null){return;} //In case another thread already loaded the chromosome + synchronized(CHROMLOCKS[chrom%CHROMLOCKS.length]){ + if(geneMatrix[chrom]==null){ + + // Gene[] genes=FindExons.readGenes(ROOT_GENE+"ref/chr"+Gene.chromCodes[chrom]+".Ref.Table", Gene.FORMAT_NM); + // Gene[] genes=FindExons.readGenes(ROOT_GENE+"ref2/ccds-chr"+Gene.chromCodes[chrom]+"-genes.txt", Gene.FORMAT_CCDS); + // Gene[] genes=FindExons.readGenes(ROOT_GENE+"ref3/ccds-chr"+Gene.chromCodes[chrom]+"-genes.txt", Gene.FORMAT_CCDS); + + // Gene[] genes=ReadWrite.readArray(Gene.class, ROOT_GENE+"seqGene/chr"+Gene.chromCodes[chrom]+".ga"); + + String fname=ROOT_GENE+"Build"+GENOME_BUILD+"/"+GENE_MAP+"/chr"+Gene.chromCodes[chrom]+".ga"; + + Gene[] genes=ReadWrite.readArray(Gene.class, fname); + + Arrays.sort(genes); +// geneMatrix[chrom]=genes; + + geneTxRangeMatrix[chrom]=findGeneRanges(genes, TX_RANGE); + geneCodeRangeMatrix[chrom]=findGeneRanges(genes, CODE_RANGE); + geneCodeAndExonRangeMatrix[chrom]=findCodeAndExonRanges(genes, false, true); + exonRangeMatrix[chrom]=findCodeAndExonRanges(genes, false, false); + geneNearbyRangeMatrix[chrom]=findCodeAndExonRanges(genes, true, true); + + HashMap> temp=new HashMap>(); + HashMap gntable=new HashMap(); + HashMap tntable=new HashMap(); + + for(Gene g : genes){ + + String trkey=g.mrnaAcc; + if(trkey==null){trkey=g.chromosome+"_"+g.id;} + if(trkey!=null){ + Gene old=tntable.get(trkey); + if(old!=null){ + // stdout.println("For transcript '"+g.nameTranscript+"': Overwrote \n"+old+"\nwith\n"+g+"\n"); + } + tntable.put(trkey, g); + } + + String key=g.symbol; + if(key==null){key=g.mrnaAcc;} + ArrayList list=temp.get(key); + if(list==null){ + list=new ArrayList(8); + temp.put(key, list); + } + list.add(g); + } + + GeneSet[] gsm=new GeneSet[temp.size()]; + String[] keys=temp.keySet().toArray(new String[temp.size()]); + for(int i=0; i 
list=temp.get(key); + GeneSet gs=new GeneSet(key, list); + gsm[i]=gs; + gntable.put(key, gs); + } + + geneNameTable[chrom]=gntable; + transcriptNameTable[chrom]=tntable; + geneSetMatrix[chrom]=gsm; + Arrays.sort(geneSetMatrix[chrom]); + + geneSetRangeMatrix[chrom]=findGeneSetRanges(geneSetMatrix[chrom]); + + { + assert(geneMatrix[chrom]==null) : "Need to sync."; + geneMatrix[chrom]=genes; + } + } + } + } + + public static void loadChromosomes(int min, int max){ +// assert(false); + synchronized(CHROMLOCKS){ + String pattern=chromFname(GENOME_BUILD); + ChromLoadThread.loadAll(pattern, min, max, chromosomePlusMatrix); + } + } + + private static void loadChromosome(int chrom){ +// assert(false); + assert(chromosomePlusMatrix[chrom]==null); + assert(chrom>0) : chrom; + + String fname=chromFname(chrom, GENOME_BUILD); + sysout.println("Loading "+fname); + if(CHROMC){ + chromosomePlusMatrix[chrom]=ReadWrite.read(ChromosomeArrayCompressed.class, fname).toChromosomeArray(); + }else{ + chromosomePlusMatrix[chrom]=ReadWrite.read(ChromosomeArray.class, fname); + } + assert(chromosomePlusMatrix[chrom].chromosome==chrom); + } + + public static final String chromExtension(){ + return ".chrom"+(CHROMC ? "C" : "") + (CHROMGZ ? 
".gz" : ""); + } + + public static final String chromFname(int chrom, int genome){ + return ROOT_GENOME+genome+"/chr"+chrom+chromExtension(); + } + + public static final String chromFname(int genome){ + return ROOT_GENOME+genome+"/chr#"+chromExtension(); + } + + public static Range[] findGeneRanges(Gene[] genes, final int mode){ + + ArrayList list=new ArrayList(8192); + ArrayList glist=new ArrayList(64); + + Range current=null; + for(int i=0; i=a){ + r=new Range(a, b); + + if(current==null){ + current=r; + glist.add(g); + }else if(current.touches(r)){ + current=current.merge(r); + glist.add(g); + }else{ + current.obj1=glist.toArray(new Gene[glist.size()]); + glist.clear(); + glist.add(g); + list.add(current); + current=r; + } + } + } + if(current!=null){ //i.e., if there were any genes + current.obj1=glist.toArray(new Gene[glist.size()]); + list.add(current); + } + + return list.toArray(new Range[list.size()]); + } + + public static Range[] findGeneSetRanges(GeneSet[] genes){ + + ArrayList list=new ArrayList(8192); + ArrayList glist=new ArrayList(64); + + Range current=null; + for(int i=0; i=a){ + r=new Range(a, b); + + if(current==null){ + current=r; + glist.add(g); + }else if(current.touches(r)){ + current=current.merge(r); + glist.add(g); + }else{ + current.obj1=glist.toArray(new GeneSet[glist.size()]); + glist.clear(); + glist.add(g); + list.add(current); + current=r; + } + } + } + if(current!=null){ //i.e., if there were any genes + current.obj1=glist.toArray(new GeneSet[glist.size()]); + list.add(current); + } + + return list.toArray(new Range[list.size()]); + } + + + public static Range[] findCodeAndExonRanges(Gene[] genes, boolean nearby, boolean codingOnly){ + + + ArrayList list=new ArrayList(32768); + + for(int i=0; i list2=new ArrayList(list.size()); + Collections.sort(list); + + + HashSet gset=new HashSet(64); + Range current=null; + for(Range r : list){ + if(current==null){ + gset.add((Gene)r.obj1); + current=r; + }else if(current.touches(r)){ + 
gset.add((Gene)r.obj1); + current=current.merge(r); + }else{ + current.obj1=gset.toArray(new Gene[gset.size()]); + list2.add(current); + gset.clear(); + gset.add((Gene)r.obj1); + current=r; + } + } + + if(current!=null){ + current.obj1=gset.toArray(new Gene[gset.size()]); + list2.add(current); + Collections.sort(list2); + } + + return list2.toArray(new Range[list2.size()]); + } + + public static Range[] geneSetRangeMatrix(int chrom){ + if(geneSetRangeMatrix[chrom]==null){ + loadGenes(chrom); + } + assert(geneSetRangeMatrix[chrom]!=null); + return geneSetRangeMatrix[chrom]; + } + + public static Range[] exonRangeMatrix(int chrom){ + if(exonRangeMatrix[chrom]==null){ + loadGenes(chrom); + } + assert(exonRangeMatrix[chrom]!=null); + return exonRangeMatrix[chrom]; + } + + public static Range[] geneCodeAndExonRangeMatrix(int chrom){ + if(geneCodeAndExonRangeMatrix[chrom]==null){ + loadGenes(chrom); + } + assert(geneCodeAndExonRangeMatrix[chrom]!=null); + return geneCodeAndExonRangeMatrix[chrom]; + } + + public static Range[] geneNearbyRangeMatrix(int chrom){ + if(geneNearbyRangeMatrix[chrom]==null){ + loadGenes(chrom); + } + assert(geneNearbyRangeMatrix[chrom]!=null); + return geneNearbyRangeMatrix[chrom]; + } + + public static HashMap geneNameTable(int chrom){ + if(geneNameTable[chrom]==null){ + loadGenes(chrom); + } + assert(geneNameTable[chrom]!=null); + return geneNameTable[chrom]; + } + + public static HashMap transcriptNameTable(int chrom){ + if(transcriptNameTable[chrom]==null){ + loadGenes(chrom); + } + assert(transcriptNameTable[chrom]!=null); + return transcriptNameTable[chrom]; + } + + + public static GeneSet[] getNearestGeneSets(int chrom, int loc){ + Range[] r=geneSetRangeMatrix(chrom); + int index=driver.Search.findPointBinary(loc, r); + GeneSet[] sets=(GeneSet[]) r[index].obj1; + if(sets==null || sets.length==0){ + assert(false); + return null; + } + return sets; + } + + /** Returns genesets overlapping the range */ + public static GeneSet[] 
getNearestGeneSets(int chrom, int loc1, int loc2){ + assert(loc2>=loc1); + +// boolean flag=(chrom==21 && loc1<38540895 && loc2>38540895);//TODO UNDO +// +// if(flag){ +// stdout.println(loc1+", "+loc2+", "+((loc1+loc2)/2)); +// for(GeneSet gs : Data.geneNameTable[chrom].values()){ +// if(gs.intersects(loc1, loc2)){ +// stdout.println("%%% "+gs); +// } +// } +// } + + Range[] ranges=geneSetRangeMatrix(chrom); + if(ranges==null || ranges.length==0){return null;} + int index=driver.Search.findPointBinary(loc1, ranges); + + +// if(flag){ +// Range r0=ranges[index-1]; +// Range r1=ranges[index]; +// Range r2=ranges[index+1]; +// +// stdout.println("r0: "+r0+"\n"+Arrays.toString((GeneSet[])r0.obj1)+"\n"); +// stdout.println("r1: "+r1+"\n"+Arrays.toString((GeneSet[])r1.obj1)+"\n"); +// stdout.println("r2: "+r2+"\n"+Arrays.toString((GeneSet[])r2.obj1)+"\n"); +// +// } +// if(flag){stdout.println("c");} + + Range r1=ranges[index]; + Range r2=(index>=ranges.length-1 ? null : ranges[index+1]); + + if(ranges[index].b>=loc2 || r2==null || r2.a>loc2){ + return (GeneSet[])r1.obj1; + } + +//// if(flag){stdout.println("e");} +// if(ranges[index].b>=loc2 || (index==ranges.length-1) || ranges[index+1].a>loc2){ +//// if(flag){ +//// stdout.println("f"); +//// stdout.println(ranges[index].b<=loc2); +//// stdout.println((index==ranges.length-1)); +//// stdout.println(ranges[index+1].a>loc2); +//// stdout.println("......."); +//// } +// return sets1; +// } + + if(loc1>r1.b && loc2=dist2){ + return (GeneSet[])r1.obj1; + }else{ + return (GeneSet[])r2.obj1; + } + } + +// assert(false) : "Test: This should be very rare, since it is slow."; + + //Otherwise, return all overlapping ranges. 
+ ArrayList list=new ArrayList(4); + + while(index=ranges[index].a; index++){ +// if(flag){stdout.println("ADDED RANGE "+ranges[index]);} + GeneSet[] gsa=(GeneSet[]) ranges[index].obj1; + for(GeneSet gs : gsa){list.add(gs);} + } + return list.toArray(new GeneSet[list.size()]); + } + + + public static boolean isExonic(byte chrom, int point, int thresh, boolean isCoding){ + Range[] ranges=(isCoding ? Data.geneCodeAndExonRangeMatrix(chrom) : Data.exonRangeMatrix(chrom)); + return Search.containsPointBinary(point, ranges, thresh); + } + + + public static final String padFront(String num, int width, String symbol){ + String r=num; + while(r.length() String toStringRecursive(Iterable a){ + if(a==null){return "null";} + StringBuilder sb=new StringBuilder(256); + String prefix=""; + sb.append("["); + for(X x : a){ + sb.append(toStringRecursive(a)); + if(x!=null && x instanceof Iterable){ + sb.append("\n"); + }else{ + sb.append(", "); + } + } + sb.append("]"); + return sb.toString(); + } + + public static final HashMap geneNameToIdTable(){ + if(geneNameToIdTable==null){ + geneIdToNameTable(); + assert(geneIdToNameTable!=null); + assert(geneNameToIdTable!=null); + } + return geneNameToIdTable; + } + + public static final HashMap geneIdToNameTable(){ + if(geneIdToNameTable==null){ + + synchronized(GENEIDLOCK){ + if(geneIdToNameTable==null){ + + // TextFile tf=new TextFile(ROOT_GENE+"gene_names_36.3.txt"); + TextFile tf=new TextFile(ROOT_GENE+"gene_names_37.1.txt", false, false); + String[] lines=tf.toStringLines(); + tf.close(); + HashMap table=new HashMap((lines.length*3)/2); + for(String s : lines){ + if(!s.startsWith("#")){ + String[] line=s.split("\t", -1); + // assert(line.length==3) : "'"+s+"'"; + if(line.length>=3){ + + int key=-1; + try { + key=Integer.parseInt(line[1]); + } catch (NumberFormatException e) { + System.err.println("Bad line: "+s); + throw new RuntimeException(e); + } + + table.put(key, (line[2]==null || line[2].length()==0) ? 
line[1] : line[2]); + } + } + } + + geneIdToNameTable=table; + + HashMap table2=new HashMap((lines.length*3)/2); + for(Integer id : geneIdToNameTable.keySet()){ + table2.put(geneIdToNameTable.get(id), id); + } + geneNameToIdTable=table2; + } + } + } + return geneIdToNameTable; + } + + + public static ChainLine[][] getChainLines(int from, int to){ + if(from==36 && to==37){ + if(chains36to37==null){ + chains36to37=ChainBlock.loadChainLines(ROOT_CHAIN+"hg18ToHg19.over.chain"); + } + return chains36to37; + }else if(from==37 && to==36){ + if(chains37to36==null){ + chains37to36=ChainBlock.loadChainLines(ROOT_CHAIN+"hg19ToHg18.over.chain"); + } + return chains37to36; + } + throw new RuntimeException("Unknown chain file: "+from+" -> "+to); + } + + + public static final String toStringRecursive(Object a){ + return a==null ? "null" : a.toString(); + } + + public static boolean isBaited(Variation v){ + return isBaited(v, 0); + } + + public static boolean isBaited(Variation v, int thresh){ + int mid=(v.beginLoc+v.endLoc)/2; + int len=v.endLoc-v.beginLoc+1; + return isBaited(v.chromosome, mid, len/2+thresh); + } + + public static boolean isBaited(int chrom, int point, int thresh){ + if(BAITS==null){ + BAITS=(int[][][]) ReadWrite.readObject("UNDEFINED_ROOT"+"baits_"+BAIT_FILE+"_build"+GENOME_BUILD+".int3d"); + } + return isBaited(point, BAITS[chrom], thresh); + } + + /** Is this point within "thresh" of a bait */ + private static boolean isBaited(int point, int[][] baits, int thresh){ + + if(baits==null || baits[0].length==0){return false;} + + int[] starts=baits[0]; + int[] stops=baits[1]; + int index=Arrays.binarySearch(stops, point); + + if(index>=0){return true;} //Hit inside a bait + index=(-index)-1; + + if(index>=stops.length){return point<=(stops[stops.length-1]+thresh);} + +// if(index<0 || index>=stops.length){ +// System.err.println(point+" in "+starts[0]+", "+stops[stops.length-1]+" -> "+index+"/"+(stops.length-1)); +// } + + final int a=point-thresh; + final int 
b=point+thresh; + + if(overlap(a, b, starts[index], stops[index])){return true;} + for(int i=index+1; i=starts[i]; i++){ + if(overlap(a, b, starts[i], stops[i])){return true;} + } + for(int i=index-1; i>=0 && a<=stops[i]; i++){ + if(overlap(a, b, starts[i], stops[i])){return true;} + } + return false; +// +// return point>=(starts[index]-thresh) && point<=(stops[index]+thresh); + } + + private static boolean overlap(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=b1 && b2>=a1; + } + + + + /** Note - this does not handle cases where the same instance uses different bait files. */ + public static int[][][] getBaits(){ + if(BAITS==null){ + synchronized(BAITLOCK){ + if(BAITS==null){ + BAITS=(int[][][]) ReadWrite.readObject("UNDEFINED_ROOT"+"baits_"+BAIT_FILE+"_build"+GENOME_BUILD+".int3d"); + } + } + } + assert(BAITS!=null); + return BAITS; + } + + + public static final synchronized void setGenome(int g){ + assert(g>0); + if(genome_set_to==g){return;} + if(genome_set_to<0){ + setGenome2(g); + }else{ + throw new RuntimeException("Changing genomes is not currently supported."); + } + } + + private static final synchronized void setGenome2(int g){ + assert(genome_set_to!=g); + GENOME_BUILD=g; + genome_set_to=g; + numChroms=-1; + numBases=-1; + numDefinedBases=-1; + numContigs=-1; + numScaffolds=-1; + name=null; + genomeSource=null; + scaffoldPrefixes=false; + long fastabytes=-1; + long fastatime=-1; + final int currentVersion=FastaToChromArrays.currentVersion(); + int version=0; + + if(GENOME_BUILD==FastaToChromArrays.LISTBUILD && FastaToChromArrays.SUMMARY_LIST!=null){ + for(int i=0; i1 ? 
Integer.parseInt(split[1]) : 0); + } + }else{ + String[] split=s.split("\t"); + String a=split[0]; + String b=split[1]; + if(a.equalsIgnoreCase("chroms")){numChroms=(int)Long.parseLong(b);} + else if(a.equalsIgnoreCase("bases")){numBases=Long.parseLong(b);} + else if(a.equalsIgnoreCase("defined")){numDefinedBases=Long.parseLong(b);} + else if(a.equalsIgnoreCase("contigs")){numContigs=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("scaffolds")){numScaffolds=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("interpad")){interScaffoldPadding=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("undefined")){} + else if(a.equalsIgnoreCase("name")){name=b;} + else if(a.equalsIgnoreCase("source")){genomeSource=b;} + else if(a.equalsIgnoreCase("bytes")){fastabytes=Long.parseLong(b);} + else if(a.equalsIgnoreCase("last modified")){fastatime=Long.parseLong(b);} + else if(a.equalsIgnoreCase("scafprefixes")){scaffoldPrefixes=Tools.parseBoolean(b);} + else{assert(version1 ? Integer.parseInt(split[1]) : 0); + } + }else{ + String[] split=s.split("\t"); + String a=split[0]; + String b=split[1]; + if(a.equalsIgnoreCase("chroms")){numChroms=(int)Long.parseLong(b);} + else if(a.equalsIgnoreCase("bases")){numBases=Long.parseLong(b);} + else if(a.equalsIgnoreCase("defined")){numDefinedBases=Long.parseLong(b);} + else if(a.equalsIgnoreCase("contigs")){numContigs=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("scaffolds")){numScaffolds=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("interpad")){interScaffoldPadding=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("undefined")){} + else if(a.equalsIgnoreCase("name")){name=b;} + else if(a.equalsIgnoreCase("source")){genomeSource=b;} + else if(a.equalsIgnoreCase("bytes")){fastabytes=Long.parseLong(b);} + else if(a.equalsIgnoreCase("last modified")){fastatime=Long.parseLong(b);} + else if(a.equalsIgnoreCase("scafprefixes")){scaffoldPrefixes=Tools.parseBoolean(b);} + else{assert(version0) : "Genome "+g+": numChroms="+numChroms; 
+ assert(numBases>0) : "Genome "+g+": numBases="+numBases; + assert(numDefinedBases>0) : "Genome "+g+": numDefinedBases="+numDefinedBases; + assert(numBases>=numDefinedBases) : "Genome "+g+": numBases>numDefinedBases : "+numBases+">"+numDefinedBases; + + chromosomePlusMatrix=new ChromosomeArray[numChroms+1]; + chromLengths=new int[numChroms+1]; + chromDefinedBases=new int[numChroms+1]; + chromUndefinedBases=new int[numChroms+1]; + chromContigs=new int[numChroms+1]; + chromStartPad=new int[numChroms+1]; + chromScaffolds=new int[numChroms+1]; + + scaffoldNames=new byte[numChroms+1][][]; + scaffoldLocs=new int[numChroms+1][]; + scaffoldLengths=new int[numChroms+1][]; + + if(GENOME_BUILD==FastaToChromArrays.LISTBUILD && FastaToChromArrays.INFO_LIST!=null){ + for(int i=0; i1 ? Integer.parseInt(split[1]) : 0); + } + }else{ + assert(version==currentVersion); + String[] split=s.split("\t"); + int chrom=Integer.parseInt(split[0]); + chromScaffolds[chrom]=Integer.parseInt(split[1]); + chromContigs[chrom]=(split.length>2 ? Integer.parseInt(split[2]) : -1); + chromLengths[chrom]=Integer.parseInt(split[3]); + chromDefinedBases[chrom]=Integer.parseInt(split[4]); + chromUndefinedBases[chrom]=(split.length>5 ? Integer.parseInt(split[5]) : -1); + chromStartPad[chrom]=(split.length>6 ? Integer.parseInt(split[6]) : -1); + // chromStopPad[chrom]=(split.length>7 ? Integer.parseInt(split[7]) : -1); + + } + } + FastaToChromArrays.INFO_LIST=null; + }else{ + String s; + TextFile tf=new TextFile(ROOT_GENOME+GENOME_BUILD+"/info.txt", false, false); + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)=='#'){ + if(s.startsWith("#Version")){ + String[] split=s.split("\t"); + version=(split.length>1 ? Integer.parseInt(split[1]) : 0); + } + }else{ + + if(version>=currentVersion){ + String[] split=s.split("\t"); + int chrom=Integer.parseInt(split[0]); + chromScaffolds[chrom]=Integer.parseInt(split[1]); + chromContigs[chrom]=(split.length>2 ? 
Integer.parseInt(split[2]) : -1); + chromLengths[chrom]=Integer.parseInt(split[3]); + chromDefinedBases[chrom]=Integer.parseInt(split[4]); + chromUndefinedBases[chrom]=(split.length>5 ? Integer.parseInt(split[5]) : -1); + chromStartPad[chrom]=(split.length>6 ? Integer.parseInt(split[6]) : -1); +// chromStopPad[chrom]=(split.length>7 ? Integer.parseInt(split[7]) : -1); + }else{ + tf.close(); + if(new File(ROOT_GENOME+GENOME_BUILD+"/info.txt").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/info.txt").delete();} + if(new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt").delete();} + if(new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt.gz").exists()){new File(ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt.gz").delete();} + sysout.println("Regenerating genome info in new format."); + dna.FastaToChromArrays.writeInfo(GENOME_BUILD, numChroms, name, genomeSource, true, scaffoldPrefixes); + tf=new TextFile(ROOT_GENOME+GENOME_BUILD+"/info.txt", false, false); + } + } + + } + + tf.close(); + } + + String fname=ROOT_GENOME+GENOME_BUILD+"/scaffolds.txt.gz"; + boolean hasList=(GENOME_BUILD==FastaToChromArrays.LISTBUILD && FastaToChromArrays.SCAF_LIST!=null); + if(!LOAD_SCAFFOLDS || (!hasList && !new File(fname).exists())){ + for(int i=0; i and overwrite=true flags.\n"+i+", "+chromScaffolds[i]; + } + }else{ + for(int chrom=0; chrom=1) : chrom+", "+num+", "+Arrays.toString(chromScaffolds); + if(num>0){ + scaffoldNames[chrom]=new byte[num][]; + scaffoldLocs[chrom]=new int[num]; + scaffoldLengths[chrom]=new int[num]; + } + } + int[] count=new int[numChroms+1]; + + if(hasList){ + + if(verbose){System.err.println("Fetching scaffold names from list:\n\n"+FastaToChromArrays.SCAF_LIST+"\n\n");} + + for(int i=0; i1 ? 
Integer.parseInt(split[1]) : 0); +// assert(version==currentVersion) : "Wrong version: "+version+", "+currentVersion; + } + }else{ + String[] split=s.split("\t"); + if(verbose){System.err.println("Split into "+Arrays.toString(split));} + int chrom=Integer.parseInt(split[0]); + int x=count[chrom]; + count[chrom]++; + + int scaffoldID=Integer.parseInt(split[1]); + scaffoldLocs[chrom][x]=Integer.parseInt(split[2]); + scaffoldLengths[chrom][x]=Integer.parseInt(split[3]); + scaffoldNames[chrom][x]=split[4].getBytes(); + if(verbose){System.err.println("Set scaffoldNames["+chrom+"]["+x+" to "+(scaffoldNames[chrom][x]==null ? "null" : new String(scaffoldNames[chrom][x])));} + } + } + FastaToChromArrays.SCAF_LIST=null; + }else{ + + String s; + TextFile tf=new TextFile(fname, false, false); + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)=='#'){ + if(s.startsWith("#Version")){ + assert(version==currentVersion) : "Wrong index version; please delete /ref/genome/\n"+version+", "+currentVersion; +// String[] split=s.split("\t"); +// version=(split.length>1 ? 
Integer.parseInt(split[1]) : 0); +// assert(version==currentVersion) : "Wrong version: "+version+", "+currentVersion; + } + }else{ + String[] split=s.split("\t"); + int chrom=Integer.parseInt(split[0]); + int x=count[chrom]; + count[chrom]++; + + int scaffoldID=Integer.parseInt(split[1]); + scaffoldLocs[chrom][x]=Integer.parseInt(split[2]); + scaffoldLengths[chrom][x]=Integer.parseInt(split[3]); + scaffoldNames[chrom][x]=split[4].getBytes(); + } + + } + + tf.close(); + } + } + +// for(int i=1; i scafNameTable(){ + + if(GENOME_BUILD<0){ + assert(scaffoldNameTable==null); + return null; + } + if(scaffoldNameTable!=null){return scaffoldNameTable;} + synchronized(SCAFMAPLOCK){ + if(scaffoldNameTable!=null){return scaffoldNameTable;} + scaffoldNameTable=new HashMap((int)Tools.min(2L*numScaffolds+10, 1000000000)); + for(int chrom=0; chrom map=scafNameTable(); + if(map==null){return null;} + return map.get(new String(name)); + } + public static ScafLoc getScafLoc(String name){ + HashMap map=scafNameTable(); + if(map==null){return null;} + return map.get(name); + } + + public static byte[] scaffoldName(int chrom, int loc, int idx){return scaffoldNames[chrom][idx];} + public static int scaffoldRelativeLoc(int chrom, int loc, int idx){return loc-scaffoldLocs[chrom][idx];} + + public static int scaffoldIndex(int chrom, int loc){ + int[] array=scaffoldLocs[chrom]; + if(array==null || array.length<2){return 0;} + + assert(interScaffoldPadding>0); + loc=loc+interScaffoldPadding/2; //Puts it on closest scaffold if it is between scaffolds + + int idx=Arrays.binarySearch(array, loc); + if(idx>=0){return idx;} //Perfect hit + + //Otherwise, return closest scaffold. 
+ int insertPoint=-1-idx; + assert(insertPoint>=0 && insertPoint<=array.length); + int r=max(0, insertPoint-1); + assert(r>=0 && rarray[r]); + assert(r==array.length-1 || loc=loc1); + if(scaffoldLocs==null){return true;} + assert(chrom>=0 && chrom0); + + int idx=Arrays.binarySearch(array, loc1+interScaffoldPadding); + final int scaf; + if(idx>=0){scaf=idx;} //Perfect hit + else{ + int insertPoint=-1-idx; + assert(insertPoint>=0 && insertPoint<=array.length); + scaf=max(0, insertPoint-1); + assert(scaf>=0 && scafarray[scaf]); + assert(scaf==array.length-1 || loc1+interScaffoldPaddingupperBound){return false;} //This could happen if a random read was generated in the start or stop padding. + assert(scaf==0 || scaf==array.length-1 || (loc1>=lowerBound && loc10 && !res.endsWith("/")){res=res+"/";} + res=res+"resources/"+fname; + f=new File(res); + if(f.exists()){path=res;} + else{if(vb){System.err.println("Did not find "+fname+" at "+res);}} + } + if(!f.exists()){ + if(vb){System.err.println("Considering fixing "+path+"\n"+path.contains("/file:"));} + if(path.contains("/file:")){ + String fixed=path.substring(path.lastIndexOf("/file:")+1); + f=new File(fixed); + if(f.exists()){path=fixed;} + else{if(vb){System.err.println("Did not find "+fname+" at "+fixed);}} + } + } + if(!f.exists()){ + if(vb){System.err.println("Considering getResource");} + URL url=Primes.class.getResource("/"+fname); + if(url!=null){ + String temp=url.toString(); + if(vb){System.err.println("Found URL "+temp);} + f=new File(temp); + // if(f.exists()){fname=temp;} + // else{System.err.println("Did not find "+fname+" at "+temp);} + path=temp; + } + } + if(!f.exists() && !path.startsWith("jar:")){ + String hardlink="/global/projectb/sandbox/gaag/bbtools/resources/"+fname; + f=new File(hardlink); + if(f.exists()){path=hardlink;} + else{if(vb){System.err.println("Did not find "+fname+" at "+hardlink);}} + } + if(!f.exists() && !path.startsWith("jar:")){ + System.err.println("Warning! 
Cannot find "+fname+" "+path); + return null; + } + } + if(vb){System.err.println("Found "+fname+" at "+path);} + return path; + } + + public static final int min(int x, int y){return xy ? x : y;} + + public static final byte min(byte x, byte y){return xy ? x : y;} + + public static final long min(long x, long y){return xy ? x : y;} + + public static final double min(double x, double y){return xy ? x : y;} + + public static final float min(float x, float y){return xy ? x : y;} + + public static int numChroms; + public static long numBases; + public static long numDefinedBases; + public static int numContigs; + public static int numScaffolds; + public static int interScaffoldPadding; + public static int[] chromLengths; + public static int[] chromDefinedBases; + public static int[] chromUndefinedBases; + public static int[] chromContigs; + public static int[] chromScaffolds; + public static int[] chromStartPad; + + public static byte[][][] scaffoldNames; + public static int[][] scaffoldLocs; + /** Does NOT include interScaffoldPadding */ + public static int[][] scaffoldLengths; + /** Should be true if scaffold names have extra prefixes (for BBSplitter mode), false otherwise */ + public static boolean scaffoldPrefixes; + + /** Allows translation of sam coordinates back to native coordinates */ + public static HashMap scaffoldNameTable; + + public static String genomeSource; + public static String name; + + private static final GeneSet[][] geneSetMatrix=new GeneSet[63][]; + private static final Gene[][] geneMatrix=new Gene[63][]; + public static final Range[][] geneSetRangeMatrix=new Range[63][]; + public static final Range[][] geneTxRangeMatrix=new Range[63][]; + public static final Range[][] geneCodeRangeMatrix=new Range[63][]; + private static final Range[][] geneCodeAndExonRangeMatrix=new Range[63][]; + public static final Range[][] exonRangeMatrix=new Range[63][]; + public static HashMap> geneIDTable; + + /** Ranges within genes and exons or within NEAR their ends 
*/ + public static final Range[][] geneNearbyRangeMatrix=new Range[63][]; + + public static ChromosomeArray[] chromosomePlusMatrix; + public static ChromosomeArray[] chromosomeMinusMatrix; + + private static HashMap geneIdToNameTable; + private static HashMap geneNameToIdTable; + + private static final HashMap[] geneNameTable=new HashMap[63]; + private static final HashMap[] transcriptNameTable=new HashMap[63]; + + public static ChainLine[][] chains36to37; + public static ChainLine[][] chains37to36; + + public static int[][][] BAITS; + + private static final int TX_RANGE=0; + private static final int CODE_RANGE=1; + private static final int CODEEXON_RANGE=2; + private static final int EXON_RANGE=3; + private static final int NEAR_RANGE=4; + + + public static final int NEAR=200; + + public static boolean ENV=(System.getenv()!=null); + public static boolean WINDOWS=(System.getenv().containsKey("OS") && System.getenv().get("OS").equalsIgnoreCase("Windows_NT")); + public static int LOGICAL_PROCESSORS=Runtime.getRuntime().availableProcessors(); + private static String HOSTNAME; + public static String HOSTNAME(){ + if(HOSTNAME==null){ + try { + java.net.InetAddress localMachine = java.net.InetAddress.getLocalHost(); + HOSTNAME=localMachine.getHostName(); + } catch (UnknownHostException e) { + // TODO Auto-generated catch block +// e.printStackTrace(); + HOSTNAME="unknown"; + } catch (NullPointerException e) { + // TODO Auto-generated catch block +// e.printStackTrace(); + HOSTNAME="unknown"; + } catch (Throwable e) { + HOSTNAME="unknown"; + } + } + return HOSTNAME; + } + + + /** Should be the same as ROOT_BASE but is found dynamically */ + public static final String ROOT; + + public static String ROOT_BASE; + public static String ROOT_OUTPUT; + public static String ROOT_REF; + public static String ROOT_GENOME; + public static String ROOT_INDEX; + public static String ROOT_GENE; + public static String ROOT_CHAIN; + public static String ROOT_AFFY; + public static String 
ROOT_TEMPDIR; + + static{ + ROOT=(new File(Data.class.getClassLoader().getResource(Data.class.getName().replace('.', '/') + ".class") + .getFile()).getAbsolutePath().replace('\\', '/').replace("dna/Data.class", "")); + setPath(WINDOWS ? "windows" : "unix"); + if(!WINDOWS || true){setPath("local");} + } + + public static void setPath(String path){ +// System.err.println("***"+path); + if(path.indexOf('\\')>=0){path=path.replace('\\', '/');} + String mode=(path==null ? "null" : path.toLowerCase()); + boolean local=mode.equals("local") || mode.equals(".") || mode.equals("/.") || mode.equals("./"); + boolean win=mode.contains("windows"); + boolean unix=mode.contains("unix"); + + if(local){ + ROOT_BASE=""; + ROOT_OUTPUT=""; + ROOT_REF="ref/"; + ROOT_GENOME=ROOT_REF+"genome/"; + ROOT_INDEX=ROOT_REF+"index/"; +// ROOT_GENE="D:/Data/ref/genes/"; +// ROOT_CHAIN="D:/Data/ref/chain/"; +// ROOT_AFFY="D:/Data/ref/affy/"; +// ROOT_TEMPDIR=""; + }else if(win){ + ROOT_BASE="C:/workspace/prune/"; + ROOT_OUTPUT="C:/workspace/"; + ROOT_REF="D:/Data/ref/"; + ROOT_GENOME=ROOT_REF+"genome/"; + ROOT_INDEX=ROOT_REF+"index/"; + ROOT_GENE="D:/Data/ref/genes/"; + ROOT_CHAIN="D:/Data/ref/chain/"; + ROOT_AFFY="D:/Data/ref/affy/"; + ROOT_TEMPDIR="C:/workspace/tempdir/"; + }else if(unix){ + ROOT_BASE="/house/homedirs/b/bushnell/prune/"; + ROOT_OUTPUT="/house/homedirs/b/bushnell/prune/"; + ROOT_REF="/house/homedirs/b/bushnell/Data/ref/"; + ROOT_GENOME=ROOT_REF+"genome/"; + ROOT_INDEX=ROOT_REF+"index/"; + ROOT_GENE="/house/homedirs/b/bushnell/Data/ref/genes/"; + ROOT_CHAIN="/house/homedirs/b/bushnell/Data/ref/chain/"; + ROOT_AFFY="/house/homedirs/b/bushnell/Data/ref/affy/"; + String s=System.getenv().get("TEMPDIR"); + ROOT_TEMPDIR=(s==null ? 
"" : s+"/"); + }else if(!"null".equals(mode)){ + if(!path.endsWith("/")){path=path+"/";} + ROOT_BASE=path; + ROOT_OUTPUT=path; + ROOT_REF=path+"ref/"; + ROOT_GENOME=ROOT_REF+"genome/"; + ROOT_INDEX=ROOT_REF+"index/"; +// ROOT_GENE="D:/Data/ref/genes/"; +// ROOT_CHAIN="D:/Data/ref/chain/"; +// ROOT_AFFY="D:/Data/ref/affy/"; +// ROOT_TEMPDIR=""; + }else{ + ROOT_BASE=null; + ROOT_OUTPUT=null; + ROOT_REF=null; + ROOT_GENOME=null; + ROOT_GENE=null; + ROOT_CHAIN=null; + ROOT_AFFY=null; + ROOT_TEMPDIR=null; + } + } + + public static final String VAR_FOLDER="VAR/"; + public static final String GENE_FOLDER="GENE/"; + + public static int GENOME_BUILD=-1; + private static int genome_set_to=-1; + public static int DBSNP_BUILD=132; + + public static final byte CGI=0; + public static final byte CORE_SOLID=1; + public static final byte CORE_ILLUMINA=2; + public static final byte OTOGEN_ILLUMINA=3; + + public static final boolean verbose=false; + + /** seqGene, knownGene, refGene, unionGene, seqRefGene, ccs */ + public static String GENE_MAP="seqRefGene"; + + /** 30M, 50M, nimblegen, truseq_60M */ + public static String BAIT_FILE="truseq_60M"; + + private static final String BAITLOCK=new String("BAITLOCK"); + private static final String GENEIDLOCK=new String("GENEIDLOCK"); + + private static final String[] CHROMLOCKS=new String[256]; + + static{ + for(int i=0; i INTERNMAP=new HashMap(INTERN_MAP_SIZE); +// public static final void unloadInternMap(){ +// INTERNMAP=new HashMap(INTERN_MAP_SIZE); +// } + + private static String condense(String s){ + //TODO - finish this + StringBuilder sb=new StringBuilder(s.length()); + for(int i=0; i25){return new String(s);} +// calls++; +// +// if(s.length()>0 && s.charAt(0)!='?'){ +// s=condense(s); +// } + + if(s.length()<2){ +// return s.intern(); + return forceIntern(s); + } + boolean acgtn=AminoAcid.containsOnlyACGTNQ(s); + + if(acgtn){ + if(s.length()<4){ +// return s.intern(); + return forceIntern(s); + } + if(s.length()>6){ + return new 
String(s); + } + } + + //Otherwise it is non-base string of length 2 to 20, or a base string of length 4 to 6. + return forceIntern(s); + } + + public static String forceIntern(String s){ + calls++; + +// if(s.length()<2){return s.intern();} +// boolean acgtn=AminoAcid.containsOnlyACGTNQ(s); +// +// if(acgtn){ +// if(s.length()<4){return s.intern();} +// } + + String old=INTERNMAP.get(s); + if(old!=null){return old;} + + synchronized(INTERNMAP){ +// System.err.print(INTERNMAP.size()+"~"+calls+": "+s+", "); + if(INTERNMAP.size()>INTERN_MAP_LIMIT){ + System.err.println("INTERNMAP overflow caused by "+s); + INTERNMAP.clear(); + } + if(INTERNMAP.containsKey(s)){return INTERNMAP.get(s);} + s=new String(s); + INTERNMAP.put(s, s); + } + return s; + } + static int calls=0; + + public static PrintStream sysout=System.err;//System.out; + + public static boolean CHROMC=false; + public static boolean CHROMGZ=true; + public static boolean LOAD_SCAFFOLDS=true; + +// private static final boolean GUNZIP=testExecute("gunzip --help"); +// private static final boolean GZIP=testExecute("gzip --help"); +// private static final boolean SAMTOOLS=testExecute("samtools --help"); + + public static boolean GUNZIP(){return GUNZIP==0 ? GZIP() : GUNZIP>0;} + public static boolean UNPIGZ(){return UNPIGZ==0 ? PIGZ() : UNPIGZ>0;} + public static boolean GZIP(){ + if(GZIP==0 && !WINDOWS){ + synchronized(SUBPROCSYNC){ + if(GZIP==0){GZIP=testExecute("gzip --version");} + } + } + return GZIP>0; + } + public static boolean PIGZ(){ + if(PIGZ==0 && !WINDOWS){ + synchronized(SUBPROCSYNC){ + if(PIGZ==0){PIGZ=testExecute("pigz --version");} + } + } + return PIGZ>0; + } + public static boolean SAMTOOLS(){ + if(SAMTOOLS==0 && !WINDOWS){ + synchronized(SUBPROCSYNC){ + if(SAMTOOLS==0){SAMTOOLS=testExecute("samtools");} + } + System.err.println(SAMTOOLS>0 ? "Found samtools." 
: "Could not find samtools."); + } + return SAMTOOLS>0; + } + public static boolean SH(){ + if(SH==0 && !WINDOWS){ + synchronized(SUBPROCSYNC){ + if(SH==0){SH=testExecute("sh --version");} + } +// System.err.println(SH>0 ? "Found sh." : "Could not find sh."); + if(SH<0){System.err.println("Could not find sh; won't launch I/O subprocesses.");} + } + return SH>0; + } + private static final String SUBPROCSYNC=new String("SUBPROCSYNC"); + private static final String SCAFMAPLOCK=new String("SCAFMAPLOCK"); + + /* Set these to zero to enable or -1 to disable */ + private static int GUNZIP=-1; + private static int UNPIGZ=0; + private static int GZIP=0; + private static int PIGZ=0; + private static int SAMTOOLS=0; + private static int SH=0; + + private static int testExecute(String s){ +// System.err.println("Testing "+s); + try { + Process p; + p = Runtime.getRuntime().exec(s); +// System.err.println("Got process."); + while(p.getErrorStream().read()>-1){} +// return p.exitValue()==0; +// System.err.println("This system does has "+s+" installed."); + } catch (IOException e) { +// System.err.println("This system does not have "+s+" installed."); + // TODO Auto-generated catch block +// e.printStackTrace(); + return -1; + } + return 1; + } + +} diff --git a/current/dna/Exon.java b/current/dna/Exon.java new file mode 100755 index 0000000..7cf4a30 --- /dev/null +++ b/current/dna/Exon.java @@ -0,0 +1,175 @@ +package dna; +import java.io.Serializable; +import java.util.HashMap; + + +public class Exon implements Comparable, Serializable{ + + /** + * + */ + private static final long serialVersionUID = 1890833345682913235L; + + + public Exon(){ + a=-1; + b=-1; + utr=false; + cds=false; + chromosome=-1; + strand=-1; + } + +// public Exon(String startPoint, String endPoint, String chrom){ +// this(startPoint, endPoint, chrom, "?"); +// } +// +// public Exon(int startPoint, int endPoint, String chrom){ +// this(startPoint, endPoint, chrom, "?"); +// } +// +// public Exon(int 
startPoint, int endPoint, byte chrom){ +// this(startPoint, endPoint, chrom, (byte)2); +// } + + public Exon(String startPoint, String endPoint, String chrom, String strnd, boolean utr_, boolean cds_){ + this(Integer.parseInt(startPoint), Integer.parseInt(endPoint), toChromosome(chrom), toStrand(strnd), utr_, cds_); + } + + public Exon(int startPoint, int endPoint, String chrom, String strnd, boolean utr_, boolean cds_){ + this(startPoint, endPoint, toChromosome(chrom), toStrand(strnd), utr_, cds_); + } + + public Exon(int startPoint, int endPoint, byte chrom, byte strnd, boolean utr_, boolean cds_){ + a=startPoint; + b=endPoint; + chromosome=chrom; + strand=strnd; + utr=utr_; + cds=cds_; + } + + + + public static Exon merge(Exon exon1, Exon exon2){ + assert(canMerge(exon1, exon2)); + return new Exon(min(exon1.a, exon2.a), max(exon1.b, exon2.b), exon1.chromosome, exon1.strand, exon1.cds||exon2.cds, exon1.utr||exon2.utr); + } + + public static boolean canMerge(Exon exon1, Exon exon2){ + if(exon1.chromosome!=exon2.chromosome){return false;} + return overlap(exon1.a, exon1.b, exon2.a, exon2.b); + } + + + public boolean intersects(int point){return point>=a && point<=b;} + //Slow + public boolean intersects(int a2, int b2){ + assert(a2<=b2); + return overlap(a, b, a2, b2); + } + + public boolean crosses(int a2, int b2){return (a2
=a) || (a2<=b && b2>b);} + public boolean contains(int a2, int b2){return (a2>=a && b2<=b);} + + public boolean intersectsNearby(int a, int b){ + return intersects(a-Data.NEAR, b+Data.NEAR); + } + + private static boolean overlap(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=b1 && b2>=a1; + } + + public int distToSpliceSite(int x, int y){ + int distA=distToPoint(x, y, a); + int distB=distToPoint(x, y, b); + return min(distA, distB); + } + + public static int distToPoint(int x, int y, int point){ + assert(x<=y); + if(y<=point){return point-y;} + if(x>=point){return x-point;} + return 0; + } + + public static byte toStrand(String s){ + byte r=2; + if("-".equals(s)){ + r=1; + }else if("+".equals(s)){ + r=0; + }else{ + assert("?".equals(s)); + } + return r; + } + + public static byte toChromosome(String s){ + int i=0; +// System.out.println(s); + while(!Character.isDigit(s.charAt(i))){i++;} + return Byte.parseByte(s.substring(i)); + } + + public int length(){ + int r=(int)(b-a+1); + assert(r>0); + return r; + } + + public String toString(){ +// return "(chr"+chromosome+","+(strand==0 ? 
"+" : "-")+","+a+"~"+b+")"; + return "(chr"+chromosome+", "+a+" - "+b+", len "+length()+")"; + } + + public int compareTo(Exon other){ + if(chromosomeother.chromosome){return 1;} + + if(aother.a){return 1;} + + if(bother.a){return 1;} + + if(strandother.strand){return 1;} + + if(utr && !other.utr){return -1;} + if(!utr && other.utr){return 1;} + + if(cds && !other.cds){return -1;} + if(!cds && other.cds){return 1;} + + return 0; + } + + public boolean equals(Object other){ + return equals((Exon)other); + } + + public boolean equals(Exon other){ + return a==other.a && b==other.b && chromosome==other.chromosome && strand==other.strand && utr==other.utr && cds==other.cds; + } + + public int hashCode(){ + int xor=a^(Integer.rotateLeft(b, 16)); + xor^=Integer.rotateRight(chromosome, 6); + return xor; + } + + + private static final int min(int x, int y){return xy ? x : y;} + + public final int a; + public final int b; + public final boolean utr; + public final boolean cds; + public final byte chromosome; + public final byte strand; + + public static final HashMap table=new HashMap(65536); +} diff --git a/current/dna/FastaToChromArrays.java b/current/dna/FastaToChromArrays.java new file mode 100755 index 0000000..79a3a5f --- /dev/null +++ b/current/dna/FastaToChromArrays.java @@ -0,0 +1,579 @@ +package dna; + +import java.io.File; +import java.io.IOException; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Date; + +import align2.Tools; + +import fileIO.ReadWrite; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Jul 10, 2012 + * + */ +public class FastaToChromArrays { + +// Example: +// jgi.FastaToChromArrays ecoli_K12.fa 1 writeinthread=false genscaffoldinfo=true retain waitforwriting=false +// gzip=true chromc=false maxlen=536670912 writechroms=true minscaf=1 midpad=300 startpad=8000 stoppad=8000 nodisk=false + + public static void main(String[] args){ + main2(args); + } + + public static 
ArrayList main2(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + boolean oldWIT=WRITE_IN_THREAD; + WRITE_IN_THREAD=true; + +// assert(false) : ReadWrite.ZIPLEVEL; + + String name=null; + + int genome=-1; + int chroms=-1; + String infile=null; + boolean writeinfo=false; + boolean genScaffoldInfo=true; + boolean writeChroms=true; + boolean scafprefixes=Data.scaffoldPrefixes; + + for(int i=0; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + //do nothing + }if(a.equals("path") || a.equals("root") || a.equals("tempdir")){ + Data.setPath(b); + }else if(a.equals("name") || a.equals("organism")){ + name=b; + }else if(a.equals("in") || a.equals("input") || a.equals("ref") || a.equals("fasta")){ + if(split.length<1 || "null".equalsIgnoreCase(b)){b=null;} + infile=b; + }else if(a.equals("build") || a.equals("genome")){ + genome=Integer.parseInt(b); + }else if(a.equals("chroms")){ + chroms=Integer.parseInt(b); + }else if(a.equals("writeinthread")){ + WRITE_IN_THREAD=Tools.parseBoolean(b); + }else if(a.equals("nodisk")){ + NODISK=Tools.parseBoolean(b); + }else if(a.equals("writeinfo")){ + writeinfo=Tools.parseBoolean(b); + }else if(a.equals("padstart") || a.startsWith("startpad") || a.equals("padfront") || a.startsWith("frontpad")){ + START_PADDING=Integer.parseInt(b); + }else if(a.equals("padstop") || a.startsWith("stoppad") || a.equals("padend") || a.startsWith("endpad")){ + END_PADDING=Integer.parseInt(b); + }else if(a.equals("pad") || a.equals("padding")){ + START_PADDING=END_PADDING=Integer.parseInt(b); + }else if(a.equals("midpad") || a.startsWith("padmid")){ + MID_PADDING=Integer.parseInt(b); + }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){ + MIN_SCAFFOLD=Integer.parseInt(b); + }else 
if(a.equals("genscaffoldinfo")){ + genScaffoldInfo=Tools.parseBoolean(b); + System.err.println("Set genScaffoldInfo="+genScaffoldInfo); + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=Tools.parseBoolean(b); + }else if(a.equals("mergescaffolds") || a.equals("mergecontigs") || (a.equals("merge"))){ + MERGE_SCAFFOLDS=Tools.parseBoolean(b); + System.err.println("Set MERGE_SCAFFOLDS="+MERGE_SCAFFOLDS); + }else if(a.startsWith("maxlen") || a.startsWith("chromlen")){ + long len=Long.parseLong(b); + assert(len>0 && len<=Integer.MAX_VALUE); + MAX_LENGTH=(int)len; + }else if(a.equals("writechroms")){ + writeChroms=Tools.parseBoolean(b); + }else if(a.equals("gzip")){ + Data.CHROMGZ=Tools.parseBoolean(b); + }else if(a.equals("chromc")){ + Data.CHROMC=Tools.parseBoolean(b); + }else if(a.equals("retain")){ + RETAIN=Tools.parseBoolean(b); + }else if(a.equals("scafprefixes")){ + scafprefixes=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("waitforwriting")){ + WAIT_FOR_WRITING=Tools.parseBoolean(b); + }else{ + if(i>2){ + System.err.println("Unknown parameter "+args[i]); +// throw new RuntimeException("Unknown parameter "+args[i]); + } + } + } + } + + WAIT_FOR_WRITING=(WAIT_FOR_WRITING || ReadWrite.USE_GZIP || ReadWrite.USE_PIGZ); + + ArrayList r=RETAIN ? 
new ArrayList() : null; + +// assert(false) : Arrays.toString(args); +// assert(RETAIN); + + if(genome<0){genome=Integer.parseInt(args[1]);} //Legacy + if(genome<0){throw new RuntimeException("Please specify a genome build number.");} + + if(writeinfo){ + if(chroms<0){chroms=Integer.parseInt(args[2]);} //Legacy + if(chroms<0){throw new RuntimeException("Please the number of chroms.");} + writeInfo(genome, chroms, name, null, false, scafprefixes); + }else{ + if(infile==null){infile=args[0].replace('\\', '/');} //Legacy + if(infile==null){throw new RuntimeException("Please specify an input file.");} + { + File f=new File(infile); + if(!f.exists() || f.isDirectory()){ + if(!infile.startsWith("stdin")){ + throw new RuntimeException("Not a valid file: "+f); + } + } + } + String outRoot=Data.ROOT_GENOME+genome+"/"; + + FastaToChromArrays ftca=new FastaToChromArrays(); + ftca.makeChroms(infile, outRoot, name, genScaffoldInfo, writeChroms, r, scafprefixes); + } + + WRITE_IN_THREAD=oldWIT; + return r; + } + + private FastaToChromArrays(){} + + + private static int[] countInfo(ChromosomeArray ca){ + int contigs=0; + int startPad=0; + int stopPad=0; + int undefined=0; + int defined=0;//=ca.countDefinedBases(); + + int lastN=-1; + int lastDef=-1; + + for(int i=0; i<=ca.maxIndex; i++){ + byte b=ca.get(i); + if(AminoAcid.isFullyDefined(b)){ + if(defined==0){startPad=i; contigs++;} + else if(i-lastDef>contigTrigger){contigs++;} + lastDef=i; + defined++; + }else{ + lastN=i; + undefined++; + } + } + + if(contigs>0 && lastN==ca.maxIndex){ + stopPad=lastN-lastDef; + }else{ +// System.err.println(lastN+", "+lastDef+", "+ca.maxIndex); + } + + return new int[] {ca.chromosome, 1, contigs, (ca.maxIndex+1), defined, undefined, startPad, stopPad}; + } + + @Deprecated + public static void writeInfo(int genome, int chroms, String name, String source, boolean unload, boolean scafNamePrefix){ + Data.GENOME_BUILD=genome; + Data.chromosomePlusMatrix=new ChromosomeArray[chroms+1]; + + String 
outRoot=Data.ROOT_GENOME+genome+"/"; + TextStreamWriter info=new TextStreamWriter(outRoot+"info.txt", true, false, false); + info.start(); + info.print("#Chromosome sizes\n"); + try { + info.print("#Generated on\t"+new Date()+"\n"); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + info.print("#Version\t"+VERSION+"\n"); + info.print("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad\n"); + + + long bases=0; + long definedBases=0; + + long contigSum=0; + + for(int chrom=1; chrom<=chroms; chrom++){ + ChromosomeArray ca=Data.getChromosome(chrom); + int[] v=countInfo(ca); + info.print(v[0]+"\t"+v[1]+"\t"+v[2]+"\t"+v[3]+"\t"+v[4]+"\t"+v[5]+"\t"+v[6]+"\t"+v[7]+"\n"); + + bases+=v[3]; + definedBases+=v[4]; + contigSum+=v[2]; + if(unload){Data.unload(chrom, false);} + } + info.poison(); + StringBuilder sb=new StringBuilder(); + sb.append("#Summary\n"); + try { + sb.append("#Generated on\t"+new Date()+"\n"); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + sb.append("#Version\t"+VERSION+"\n"); + sb.append("chroms\t"+(chroms)+"\n"); + sb.append("bases\t"+bases+"\n"); + sb.append("defined\t"+definedBases+"\n"); + sb.append("undefined\t"+(bases-definedBases)+"\n"); + sb.append("contigs\t"+contigSum+"\n"); + sb.append("scaffolds\t"+chroms+"\n"); + sb.append("interpad\t"+MID_PADDING+"\n"); + if(name!=null){sb.append("name\t"+name+"\n");} + if(source!=null){sb.append("source\t"+source+"\n");} + if(scafNamePrefix){sb.append("scafprefixes\t"+scafNamePrefix+"\n");}//else{assert(false);} + ReadWrite.writeString(sb, outRoot+"summary.txt", false); + info.waitForFinish(); + } + + private int makeChroms(String fname, String outRoot, String genomeName, boolean genScaffolds, boolean writeChroms, ArrayList r, + boolean scafNamePrefix){ + + if(!NODISK){ + File f=new File(outRoot); + if(!f.exists()){ + if(!NODISK){f.mkdirs();} + }else if(OVERWRITE){ + for(File g : f.listFiles()){ + 
String s=g.getName(); + if(g.isFile() && s.contains(".chrom")){ + System.err.println("Deleting "+s); + g.delete(); + } + } + } + + f=new File(outRoot.replace("ref/genome/", "ref/index/")); + if(!f.exists()){ + if(!NODISK){f.mkdirs();} + }else if(OVERWRITE){ + for(File g : f.listFiles()){ + String s=g.getName(); + if(g.isFile() && (s.endsWith(".int2d") || s.endsWith(".block") || s.endsWith(".block2.gz") || s.endsWith(".blockB") || s.endsWith(".blockB2.gz"))){ + System.err.println("Deleting "+s); + g.delete(); + } + } + } + } + + TextFile tf=new TextFile(fname, false, false); + int chrom=1; + + TextStreamWriter infoWriter=null, scafWriter=null; + ArrayList infolist=null, scaflist=null; + + if(NODISK){ + infolist=new ArrayList(); + infolist.add("#Chromosome sizes"); + try { + infolist.add("#Generated on\t"+new Date()); + } catch (Exception e1) { + e1.printStackTrace(); + } + infolist.add("#Version\t"+VERSION); + infolist.add("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad"); + }else{ + infoWriter=new TextStreamWriter(outRoot+"info.txt", true, false, false); + infoWriter.start(); + infoWriter.print("#Chromosome sizes\n"); + try { + // System.err.println(new Date()); + infoWriter.print("#Generated on\t"+new Date()+"\n"); + } catch (Exception e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + infoWriter.print("#Version\t"+VERSION+"\n"); + infoWriter.print("#chrom\tscaffolds\tcontigs\tlength\tdefined\tundefined\tstartPad\tstopPad\n"); + } + + if(genScaffolds){ + if(NODISK){ + scaflist=new ArrayList(); + scaflist.add("#Scaffold names"); + try { + scaflist.add("#Generated on\t"+new Date()); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + scaflist.add("#Version\t"+VERSION); + scaflist.add("#chrom\tid\tstart\tlength\tname"); + }else{ + //System.err.println("*123 Making ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ); + scafWriter=new 
TextStreamWriter(outRoot+"scaffolds.txt.gz", true, false, false); + scafWriter.start(); + scafWriter.print("#Scaffold names\n"); + try { + scafWriter.print("#Generated on\t"+new Date()+"\n"); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + scafWriter.print("#Version\t"+VERSION+"\n"); + scafWriter.print("#chrom\tid\tstart\tlength\tname\n"); + } + } + + + for(ChromosomeArray ca=makeNextChrom(tf, chrom, infoWriter, scafWriter, infolist, scaflist); ca!=null; + ca=makeNextChrom(tf, chrom, infoWriter, scafWriter, infolist, scaflist)){ + if(ca.array.length>ca.maxIndex+1){ca.resize(ca.maxIndex+1);} + if(RETAIN){r.add(ca);} + + if(writeChroms){ + String x=outRoot+"chr"+chrom+Data.chromExtension(); + if(new File(x).exists() && !OVERWRITE){throw new RuntimeException("Tried to overwrite existing file "+x+", but OVERWRITE=false.");} + if(Data.CHROMC){ + ChromosomeArrayCompressed cac=new ChromosomeArrayCompressed(ca); + ReadWrite.writeObjectInThread(cac, x, false); + }else{ + ReadWrite.writeObjectInThread(ca, x, false); + } + System.err.println("Writing "+x); + } + chrom++; + } + lastHeader=nextHeader=null; + + tf.close(); + if(infoWriter!=null){infoWriter.poison();} + if(scafWriter!=null){ + //System.err.println("*123 Killing ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ); + scafWriter.poison(); + } + + StringBuilder sb=new StringBuilder(); + sb.append("#Summary\n"); + try { + sb.append("#Generated on\t"+new Date()+"\n"); + } catch (Exception e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + } + sb.append("#Version\t"+VERSION+"\n"); + sb.append("chroms\t"+(chrom-1)+"\n"); + sb.append("bases\t"+lengthSum+"\n"); + assert((definedSum+undefinedSum)==lengthSum) : definedSum+", "+undefinedSum+", "+lengthSum; + sb.append("defined\t"+definedSum+"\n"); + sb.append("undefined\t"+undefinedSum+"\n"); + sb.append("contigs\t"+contigSum+"\n"); + 
sb.append("scaffolds\t"+scaffoldSum+"\n"); + sb.append("interpad\t"+MID_PADDING+"\n"); + if(genomeName!=null){sb.append("name\t"+genomeName+"\n");} + if(fname!=null){ + File f=new File(fname); + String cpath=null; + try { + cpath=f.getCanonicalPath(); + } catch (IOException e) { + cpath=f.getAbsolutePath(); + } + sb.append("source\t"+cpath+"\n"); + sb.append("bytes\t"+f.length()+"\n"); + sb.append("last modified\t"+f.lastModified()+"\n"); + } + if(scafNamePrefix){sb.append("scafprefixes\t"+scafNamePrefix+"\n");}//else{assert(false);} + if(NODISK){ + SUMMARY_LIST=new ArrayList(); + String[] split=sb.toString().split("\n"); + for(String s : split){SUMMARY_LIST.add(s);} + }else{ + ReadWrite.writeString(sb, outRoot+"summary.txt", false); + } + + if(infoWriter!=null){infoWriter.waitForFinish();} + if(scafWriter!=null){ + //System.err.println("*123 Waiting For ScafWriter; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ); + scafWriter.waitForFinish(); + //System.err.println("*123 ScafWriter Finished; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ); + } + + if(WAIT_FOR_WRITING && ReadWrite.countActiveThreads()>0){ + System.err.println("Waiting for writing to finish."); + ReadWrite.waitForWritingToFinish(); + System.err.println("Finished."); + //System.err.println("*123 countActiveThreads Finished; "+ReadWrite.countActiveThreads()+", "+ReadWrite.USE_GZIP+", "+ReadWrite.USE_PIGZ); + } + + if(infolist!=null){ + INFO_LIST=infolist; + LISTBUILD=Data.GENOME_BUILD; + }else{INFO_LIST=null;} + if(scaflist!=null){ + SCAF_LIST=scaflist; + LISTBUILD=Data.GENOME_BUILD; + }else{SCAF_LIST=null;} + + return chrom-1; + } + + private ChromosomeArray makeNextChrom(TextFile tf, int chrom, TextStreamWriter infoWriter, TextStreamWriter scafWriter, ArrayList infolist, ArrayList scaflist){ + ChromosomeArray ca=new ChromosomeArray(chrom, (byte)Gene.PLUS, 0, 120000+START_PADDING, false); + ca.maxIndex=-1; + for(int i=0; 
iMAX_LENGTH){break;} + if(scaffolds>0 && !MERGE_SCAFFOLDS){break;} + + if(scaffolds>0){ + for(int i=0; i=MIN_SCAFFOLD){ +// System.err.println("B: Writing a scaffold because currentScaffold = "+currentScaffold); + scaffoldSum++; + if(scafWriter!=null){scafWriter.print(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+lastHeader+"\n");} + if(scaflist!=null){ + scaflist.add(chrom+"\t"+scaffoldSum+"\t"+(ca.maxIndex+1)+"\t"+currentScaffold.length()+"\t"+lastHeader); + if(verbose){System.err.println("B: Added to scaflist: "+scaflist.get(scaflist.size()-1));} + } + ca.set(ca.maxIndex+1, currentScaffold); + scaffolds++; + } + + currentScaffold.setLength(0); + lastHeader=nextHeader; + } + + if(verbose){System.err.println("lastHeader="+lastHeader);} + + if(scaffolds==0){return null;} + + if(END_PADDING>0){ + int terminalN=0; + for(int i=ca.maxIndex; i>=0 && terminalN INFO_LIST, SCAF_LIST, SUMMARY_LIST; + +// public static boolean GENERATE_SCAFFOLD_INFO=true; + +} diff --git a/current/dna/Gene.java b/current/dna/Gene.java new file mode 100755 index 0000000..019b9d7 --- /dev/null +++ b/current/dna/Gene.java @@ -0,0 +1,1050 @@ +package dna; +import java.io.Serializable; +import java.util.HashSet; + + +public class Gene implements Comparable, Serializable{ + +// /** +// * +// */ + private static final long serialVersionUID = -1342555621377050981L; + + + public Gene(){ + chromosome=-1; +// nc_accession=null; + symbol=null; + proteinAcc=null; + id=-1; + mrnaAcc=null; + status=-1; + completeness=-1; + strand=-1; + codeStart=txStart=-1; + codeStop=txStop=-1; + exons=null; + cdsStartStat=-1; + cdsEndStat=-1; + exonFrames=null; + txLength=-1; + codeLength=-1; + exonLength=-1; + exonCodeLength=-1; + aaLength=-1; + utrLength5prime=-1; + utrLength3prime=-1; + readCorrectly=false; + untranslated=false; + pseudo=false; + description=null; + fullDescription=null; + valid=true; + primarySource=-1; + } + + public Gene(byte chrom, byte strand_, int txStart_, int 
txStop_, int cdStart_, int cdStop_, int gid, + String name_, String trans_, String protTrans_, String status_, String completeness_, + Exon[] exons_, boolean untran, boolean pseudo_, boolean valid_, + String primarySource_, String descript_, String fullDescript_){ + + chromosome=chrom; +// nc_accession=null; + symbol=name_; + id=gid; + mrnaAcc=((trans_==null || trans_.length()<1 || trans_.equals("-")) ? null : trans_); + proteinAcc=((protTrans_==null || protTrans_.length()<1 || protTrans_.equals("-")) ? null : protTrans_); + + primarySource=primarySource_==null ? -1 : (byte)find3(primarySource_, sourceCodes); + description=descript_; + fullDescription=fullDescript_; + + + status=status_==null ? -1 : (byte)find3(status_, statusCodes); + completeness=completeness_==null ? -1 : (byte)find3(completeness_, completenessCodes); + strand=strand_; + + exons=exons_; + + txStart=txStart_; + txStop=txStop_; //Assuming pure 0-based numbering. + codeStart=cdStart_; + codeStop=cdStop_; //Assuming pure 0-based numbering. + + assert(codeStart>=txStart) : "("+txStart+", "+txStop+"), ("+codeStart+", "+codeStop+") for "+mrnaAcc; + assert(codeStop<=txStop) : "("+txStart+", "+txStop+"), ("+codeStart+", "+codeStop+") for "+mrnaAcc; + + +// cdsStartStat=(byte)find("?", endStatCodes); +// cdsEndStat=(byte)find("?", endStatCodes); + cdsStartStat=-1; + cdsEndStat=-1; + + exonFrames=null; + + txLength=txStop-txStart+1; + codeLength=(codeStop==codeStart ? 0 : codeStop-codeStart+1); + + untranslated=untran; + pseudo=pseudo_; + + int eLen=0, ecLen=0, utr0=0, utr2=0; + + if(exons!=null){ + + for(Exon e : exons){ + + utr0+=max(0, min(e.b, codeStart)-e.a); + utr2+=max(0, e.b-max(e.a, codeStop)); + + int len=e.b-e.a+1; + eLen+=len; + len=(min(e.b, codeStop)-max(e.a, codeStart)); + len=max(0, len+1); + ecLen+=len; + } + } + + + exonLength=(eLen<2 ? 0 : eLen); + exonCodeLength=(codeLength<1 || exonLength<1 ? 
0 : ecLen); + aaLength=exonCodeLength/3-1; + + assert(exonLength>=exonCodeLength) : exonLength+", "+codeLength+", "+exonCodeLength+"\n"+this+"\n"; + assert(codeLength>=exonCodeLength) : exonLength+", "+codeLength+", "+exonCodeLength+"\n"+this+"\n"; + + //assert(exonCodeLength%3 == 0); //This should be true with a correct database + + if(strand==PLUS){ + utrLength5prime=untranslated ? 0 : utr0; + utrLength3prime=untranslated ? 0 : utr2; + }else{ + utrLength5prime=untranslated ? 0 : utr2; + utrLength3prime=untranslated ? 0 : utr0; + } + + //System.err.println(name+", "+exonLength+", "+exonCodeLength+(exons==null ? "" : ", "+exons.length)); + + readCorrectly=true; + valid=(readCorrectly && valid_); + } + + + public Gene merge(Gene g){ + + assert((exons==null && g.exons==null) || + (exons!=null && g.exons!=null && exons.length==g.exons.length)); +// assert(exonLength==g.exonLength); + assert(Math.abs(exonLength-g.exonLength)<=8) : "\n\n"+this+"\n\n"+g+"\n\n"; + assert(strand==g.strand); +// assert(codeStart==g.codeStart); +// assert(codeStop==g.codeStop); + + String Xsymbol=symbol; + String XproteinAcc=proteinAcc; + int Xid=id; + String XmrnaAcc=mrnaAcc; + int Xstatus=status; + int Xcompleteness=completeness; + int XcodeStart=codeStart; + int XcodeStop=codeStop; + int XtxStart=txStart; + int XtxStop=txStop; + int XcdsStartStat=cdsStartStat; + int XcdsEndStat=cdsEndStat; + byte[] XexonFrames=exonFrames; + int XtxLength=txLength; + int XcodeLength=codeLength; + int XexonLength=exonLength; + int XexonCodeLength=exonCodeLength; + int XaaLength=aaLength; + int XutrLength5prime=utrLength5prime; + int XutrLength3prime=utrLength3prime; +// boolean XreadCorrectly=readCorrectly; + boolean Xuntranslated=untranslated; + boolean Xpseudo=pseudo; + String Xdescription=description; + String XfullDescription=fullDescription; + boolean Xvalid=valid; + + assert(untranslated || g.untranslated || g.codeStart>=txStart) : "\n"+this+"\n\n"+g; + assert(untranslated || g.untranslated || 
g.codeStop<=txStop) : "\n"+this+"\n\n"+g; + + if(Xsymbol==null){Xsymbol=g.symbol;} + if(XproteinAcc==null){XproteinAcc=g.proteinAcc;} + if(Xid<0){Xid=g.id;} + if(XmrnaAcc==null){XmrnaAcc=g.mrnaAcc;} + if(Xstatus<0){Xstatus=g.status;} + if(Xcompleteness<0){Xcompleteness=g.completeness;} + + + if(XcodeStart==XcodeStop && g.codeStart=txStart); + assert(g.codeStop<=txStop); + XcodeStart=g.codeStart; + XcodeStop=g.codeStop; + } + + //These two should never happen... + if(XtxStart<0){XtxStart=g.txStart;} + if(XtxStop<0){XtxStop=g.txStop;} + + if(XcdsStartStat<0){XcdsStartStat=g.cdsStartStat;} + if(XcdsEndStat<0){XcdsEndStat=g.cdsEndStat;} + if(XexonFrames==null){XexonFrames=g.exonFrames;} + if(XtxLength<0){XtxLength=g.txLength;} + if(XcodeLength<0){XcodeLength=g.codeLength;} + if(XexonLength<0){XexonLength=g.exonLength;} + if(XexonCodeLength<0){XexonCodeLength=g.exonCodeLength;} + if(XaaLength<0){XaaLength=g.aaLength;} + if(XutrLength5prime<0){XutrLength5prime=g.utrLength5prime;} + if(XutrLength3prime<0){XutrLength3prime=g.utrLength3prime;} + if(Xdescription==null){Xdescription=g.description;} + if(XfullDescription==null){XfullDescription=g.fullDescription;} + +// if(XreadCorrectly){} +// if(Xuntranslated){} +// if(Xpseudo){} +// if(Xvalid){} + + //TODO Note that the readCorrectly field gets lost here + Gene out=new Gene(chromosome, strand, XtxStart, XtxStop, XcodeStart, XcodeStop, Xid, + symbol, XmrnaAcc, XproteinAcc, + Xstatus< 0 ? null : statusCodes[Xstatus], Xcompleteness<0 ? 
null : completenessCodes[Xcompleteness], + exons, Xuntranslated, Xpseudo, Xvalid, sourceCodes[primarySource], Xdescription, XfullDescription); + + return out; + } + + + public static byte toStrand(String s){ + byte r=2; + if("-".equals(s)){ + r=1; + }else if("+".equals(s)){ + r=0; + }else if("?".equals(s) || ".".equals(s)){ + r=2; + }else{ + throw new RuntimeException("Unknown strand: "+s); + } + return r; + } + + public static byte toChromosome(final String s){ +// assert(false) : s; + String s2=s; + if(s2.endsWith("random")){s2="U";} + if(s2.startsWith("chr")){s2=s2.substring(3);} + if(s2.equals("MT")){s2="M";} +// int loc=find2(s2.toUpperCase(), chromCodes); + int loc=find3(s2.toUpperCase(), chromCodes); + + if(loc<0){ + if(!Character.isDigit(s2.charAt(0))){ + loc=find3("U", chromCodes); + }else{ + try { + loc=Integer.parseInt(s2); + } catch (NumberFormatException e) { + throw new RuntimeException(e); + } + assert(loc>=23 && loc<=26) : loc+", "+s; + } + } + + assert(loc>=0) : s; + return (byte)loc; + } + + public static int toBuild(final String s){ + String s2=s.toLowerCase(); + if(s2.startsWith("build")){s2=s2.substring(5);} + else if(s2.startsWith("b")){s2=s2.substring(1);} + else if(s2.startsWith("hg")){s2=s2.substring(1);} + + if(s2.startsWith("=")){s2=s2.substring(1);} + + assert(Character.isDigit(s2.charAt(0))) : s; + + return Integer.parseInt(s2); + } + + private void fillExons(String eStarts, String eEnds, byte chr, byte str){ + String[] s1=eStarts.split(","); + String[] s2=eEnds.split(","); + + int last=-1; + + for(int i=0; ilast) : eStarts; + last=a; + + boolean cds=overlap(a, b, codeStart, codeStop); + boolean utr=(acodeStop); + + Exon key=new Exon(a, b, chr, str, utr, cds); + Exon value=Exon.table.get(key); + if(value==null){ + value=key; + Exon.table.put(key, key); + } + exons[i]=value; + } + } + + private Exon[] fillExonsCCDS(String estring, byte chr, byte str){ + String[] intervals=estring.replace("[","").replace("]","").replace(" 
","").split(","); + + int last=-1; + + Exon[] array=new Exon[intervals.length]; + + for(int i=0; ilast) : estring; + last=a; + + boolean cds=overlap(a, b, codeStart, codeStop); + boolean utr=(acodeStop); + + Exon key=new Exon(a, b, chr, str, utr, cds); + Exon value=Exon.table.get(key); + if(value==null){ + value=key; + Exon.table.put(key, key); + } + array[i]=value; + } + return array; + } + + public int toGeneRelativeOffset(int index){ + + int off=0; + + if(strand==PLUS){ + + // System.out.println(); + for(Exon e : exons){ + // System.out.print(e+" * "); + + int temp=0; + if(e.intersects(index)){ + temp=(int)(index-e.a); + }else if(e.a>index){ + break; + }else{ + temp=e.length(); + } + assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length(); + assert(temp>=0) : index+", "+e; + off+=temp; + } + + }else if(strand==MINUS){ + for(int i=exons.length-1; i>=0; i--){ + Exon e=exons[i]; + + int temp=0; + if(e.intersects(index)){ + temp=(int)(e.b-index); + }else if(e.b=0) : index+", "+e; + off+=temp; + } + + }else{assert false : strand;} + + return off; + } + + public int[] toExonRelativeOffset(int index){ + + int ex=0; + int off=0; + + if(strand==0){ + + // System.out.println(); + for(Exon e : exons){ + // System.out.print(e+" * "); + + int temp=0; + if(e.intersects(index)){ + temp=(int)(index-e.a); + }else if(e.a>index){ + break; + }else{ + ex++; + } + assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length(); + assert(temp>=0) : index+", "+e; + off=temp; + } + + }else if(strand==1){ + for(int i=exons.length-1; i>=0; i--){ + Exon e=exons[i]; + + int temp=0; + if(e.intersects(index)){ + temp=(int)(e.b-index); + }else if(e.b=0) : index+", "+e; + off=temp; + } + + }else{assert false : strand;} + +// if((index-143053138)>-3 && (index-143053138)<3){ +// assert(false) : ("\n\nLooking for "+index+" in\n"+this+ +// "\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n"); +// } +// +// if((index-143053111)>-10 && (index-143053111)<10){ 
+// assert(false) : ("\n\nLooking for "+index+" in\n"+this+ +// "\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n"); +// } + +// if(off==1 && exons[exons.length-1].b==143053111){ +// assert(false) : ("\n\nLooking for "+index+" in\n"+this+ +// "\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n"); +// } + + // System.out.println(); + return new int[] {ex, off}; + } + + + public boolean isHypothetical(){ + return isHypothetical(symbol); + } + + + public static boolean isHypothetical(String s){ + if(s==null){return false;} + if(s.startsWith("C") && s.contains("orf")){return true;} + if(s.length()>=4 && s.startsWith("LOC") && Character.isDigit(s.charAt(3))){return true;} + return false; + } + + + public boolean isNormalGene(){ + return valid && !untranslated && !pseudo && !isHypothetical(); + } + + + public boolean intersectsTx(int point){ + return point>=txStart && point<=txStop; + } + public boolean intersectsTr(int point){ + assert(!untranslated); + return (untranslated ? false : point>=translationStart() && point<=translationStop()); + } + public boolean intersectsCode(int point){ +// assert(!untranslated) : "point = "+point+"\ngene = "+this; +// return (untranslated ? false : point>=codeStart && point<=codeEnd); + return (untranslated ? intersectsTx(point) : point>=codeStart && point<=codeStop); + } + public boolean intersectsExon(int point){ + for(Exon e : exons){ + if(e.intersects(point)){return true;} + } + return false; + } + + /** Note that this skips code intersection checking for untranslated genes. */ + public boolean intersectsCodeAndExon(int point){ + if(!untranslated && !intersectsCode(point)){return false;} + for(Exon e : exons){ + if(e.intersects(point)){return true;} + } + return false; + } + + + /** Note that this skips code intersection checking for untranslated genes. 
*/ + public boolean intersectsCodeAndExon(int a, int b){ + if(!untranslated && !intersectsCode(a, b)){return false;} + for(Exon e : exons){ + if(e.intersects(a, b)){return true;} + } + return false; + } + + /** Note that this skips code intersection checking for untranslated genes. */ + public boolean intersectsIntron(int a, int b){ + if(exons==null || exons.length<2){return false;} + if(!overlap(a, b, exons[0].a, exons[exons.length-1].b)){return false;} + for(int i=1; i=e1.b+distFromEnds && b<=e2.a-distFromEnds){return true;} + } + return false; + } + + public boolean intersectsSplice(int a, int b){ + assert(b>=a); + if(exons==null || exons.length<2){return false;} + if(btxStop){return false;} + for(Exon e : exons){ + if(e.a>=a && e.a<=b){return true;} + if(e.b>=a && e.b<=b){return true;} + } + return false; + } + + public boolean intersectsNearby(int a, int b){ + return intersectsCodeAndExon(a-NEAR, b+NEAR); + } + + private static int closestToPoint(int a, int b, int point){ + int a2=(a>point ? a-point : point-a); + int b2=(b>point ? b-point : point-b); + return a2 + * nearest exon number (-1 means coding start or stop),
+ * side (0 means start, 1 means stop),
+ * position (1 means inside, 2 means outside, 3 means both),
+ * site coordinate + * } + */ + public int[] nearestSpliceSite(int a, int b){ + + int bestDist=999999999; + int nearestExon=-1; + int side=-1; + int position=0; + int bestSite=-1; + + boolean strictlyIntronic=this.isDeepIntronic(a, b, 1); + + if(!strictlyIntronic){ + { + int point=codeStart; + int x=Exon.distToPoint(a, b, point); + if(x=point){position|=1;} + side=(strand==PLUS ? 0 : 1); + if(strand==PLUS){ + side=0; + }else if(strand==MINUS){ + side=1; + } + } + + point=codeStop; + x=Exon.distToPoint(a, b, point); + if(xpoint){position|=2;} + if(a<=point){position|=1;} + side=(strand==PLUS ? 1 : 0); + } + } + } + + for(int i=0; i=point){position|=1;} + } + + point=e.b; + x=Exon.distToPoint(a, b, point); + if(xpoint){position|=2;} + if(a<=point){position|=1;} + } + } + + if(nearestExon>=0 && strand==MINUS){ + nearestExon=exons.length-nearestExon-1; + } + + return new int[] {bestDist, nearestExon, side, position, bestSite}; + } + + + + public boolean intersectsTx(int a, int b){ + assert(a<=b); + return overlap(a, b, txStart, txStop); + } + public boolean intersectsTr(int a, int b){ + assert(a<=b); + assert(!untranslated); + return (untranslated ? false : overlap(a, b, translationStart(), translationStop())); + } + public boolean intersectsCode(int a, int b){ + assert(a<=b); +// assert(!untranslated) : "a="+a+", b="+b+"\ngene = "+this; +// return (untranslated ? false : overlap(a, b, codeStart, codeEnd)); + return (untranslated ? 
intersectsTx(a, b) : overlap(a, b, codeStart, codeStop)); + } + public boolean intersectsExon(int a, int b){ +// if(!intersectsCode(a, b)){return false;} + assert(a<=b); + for(Exon e : exons){ + if(e.intersects(a, b)){return true;} + } + return false; + } + public boolean intersectsUTR(int a, int b){ + if(!intersectsTx(a,b)){return false;} + if(untranslated){return true;} + if(overlap(a, b, txStart, codeStart)){return true;} + if(overlap(a, b, codeStop, txStop)){return true;} + return false; + } + /** Downstream */ + public boolean intersectsUTR3(int a, int b){ + if(!intersectsTx(a,b)){return false;} + if(untranslated){return false;} + if(strand==MINUS){ + if(overlap(a, b, txStart, codeStart)){return true;} + }else{ + if(overlap(a, b, codeStop, txStop)){return true;} + } + return false; + } + /** Upstream */ + public boolean intersectsUTR5(int a, int b){ + if(!intersectsTx(a,b)){return false;} + if(untranslated){return false;} + if(strand==PLUS){ + if(overlap(a, b, txStart, codeStart)){return true;} + }else{ + if(overlap(a, b, codeStop, txStop)){return true;} + } + return false; + } + + private static boolean overlap(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=b1 && b2>=a1; + } + + public static final String header(){ + return "#chrom\tsymbol\tgeneId\tmrnaAcc\tproteinAcc" + + "\tstrand\tcodeStart\tcodeStop\ttxStart\ttxStop" + + "\t(UNTRANSLATED?)\t(PSEUDOGENE?)\tstatus\tcompleteness\tsource" + + "\t[exon0start-exon0stop, ...exonNstart-exonNstop]" + + "\tfullName\tdescription"; + } + +// public CharSequence toRefSeqFormat(){ +// return driver.ToRefGeneFormat.format(this); +// } + + public String toString(){ + + StringBuilder sb=new StringBuilder(256); + + sb.append(chromCodes[chromosome]+"\t"); + sb.append(symbol+"\t"); + sb.append(id+"\t"); + sb.append(mrnaAcc+"\t"); + assert(proteinAcc==null || !proteinAcc.equals("null")); + sb.append((proteinAcc==null ? 
"" : proteinAcc)+"\t"); + sb.append(strandCodes[strand]+"\t"); + sb.append(codeStart+"\t"); + sb.append(codeStop+"\t"); + sb.append(txStart+"\t"); + sb.append(txStop); + + sb.append("\t"+(untranslated ? "UNTRANS" : "")); + sb.append("\t"+(pseudo ? "PSEUDO" : "")); + + sb.append("\t"+(status>=0 ? statusCodes[status] : "")); + sb.append("\t"+(completeness>=0 ? completenessCodes[completeness] : "")); + sb.append("\t"+(primarySource>=0 ? sourceCodes[primarySource] : "")); + + sb.append("\t["); + String comma=""; + for(int i=0; exons!=null && i0)); + sb.append("\t"+(description==null ? "" : description)); + + assert(fullDescription==null || (!fullDescription.equals("null") && fullDescription.length()>0)); + sb.append("\t"+(fullDescription==null ? "" : fullDescription)); + + String s=sb.toString(); + return Character.isWhitespace(s.charAt(0)) ? s : s.trim(); + } + + public String toShortString(){ + + StringBuilder sb=new StringBuilder(256); + + sb.append("chr"+chromCodes[chromosome]+"\t"); + sb.append(symbol+"\t"); + sb.append(mrnaAcc+"\t"); + sb.append(strandCodes[strand]+"\t"); + sb.append("("+codeStart+" - "+codeStop+")"); + return sb.toString(); + } + + public int compareTo(Gene other){ + if(chromosomeother.chromosome){return 1;} + + if(txStartother.txStart){return 1;} + + if(txStopother.txStop){return 1;} + + if(codeStartother.codeStart){return 1;} + + if(codeStopother.codeStop){return 1;} + + if(exonLengthother.exonLength){return 1;} + + if(strandother.strand){return 1;} + + if(idother.id){return 1;} + + if(!symbol.equals(other.symbol)){return symbol.compareTo(other.symbol);} + return mrnaAcc.compareTo(other.mrnaAcc); + } + + public boolean isIdenticalTo(Gene other){ + if(chromosome!=other.chromosome){return false;} + if(strand!=other.strand){return false;} + if(txStart!=other.txStart){return false;} + if(txStop!=other.txStop){return false;} + if(codeStart!=other.codeStart){return false;} + if(codeStop!=other.codeStop){return false;} + 
if(exonLength!=other.exonLength){return false;} +// if(pseudo!=other.pseudo || untranslated!=other.untranslated){return false;} + if(exons==null){ + if(other.exons!=null){return false;} + }else{ + if(other.exons==null || (other.exons.length!=exons.length)){return false;} + for(int i=0; i=0); + return exnum; + } + + + public static final int find(String a, String[] array){ + for(int i=0; i asdf=new HashSet(); + + public static final int find2(String a, String[] array){ + for(int i=0; i=a2 && b2>=a1){r=0;} + else if(a1>b2){r=a1-b2;} + else{r=a2-b1;} + assert(r>=0) : r; + return r; + } + + + private static final int min(int x, int y){return xy ? x : y;} + + /** Transcription start position */ + public final int txStart; + /** Transcription end position */ + public final int txStop; + /** Coding region start */ + public final int codeStart; + /** Coding region end */ + public final int codeStop; + + /** Length of transcribed area */ + public final int txLength; + + /** Length of coding area */ + public final int codeLength; + + /** Length of exons (summed) */ + public final int exonLength; + + /** Length of exonic coding region */ + public final int exonCodeLength; + + /** Number of amino acids (excluding stop codon) */ + public final int aaLength; + + public final int utrLength5prime; + public final int utrLength3prime; + + /** Reference sequence chromosome or scaffold */ + public final byte chromosome; + /** + or - for strand */ + public final byte strand; + /** ? */ + public final byte cdsStartStat; + /** ? */ + public final byte cdsEndStat; + + public final boolean readCorrectly; + + /** Array of exons used by this gene */ + public final Exon[] exons; + + /** Exon frame {0,1,2}, or -1 if no frame for exon */ + public final byte[] exonFrames; + + /** Name of gene (usually transcript_id from GTF) */ + public final String mrnaAcc; + + /** Protein accession */ + public final String proteinAcc; + + /** Alternate name (e.g. 
gene_id from GTF) */ + public final String symbol; + + public final String description; + + public final String fullDescription; + + public final byte primarySource; + + /* CCDS file format: + * chromosome nc_accession gene gene_id ccds_id ccds_status cds_strand cds_from cds_to cds_locations match_type */ + + /* CCDS format stuff */ + +// public final String nc_accession; + public final byte status; + public final byte completeness; + public final int id; + + + public final boolean untranslated; + public final boolean pseudo; + public final boolean valid; + + public static final String[] sourceCodes={ + "seqGene", "knownGene", "refGene", "unionGene", + "reserved1", "reserved2", "reserved3", "reserved4" + }; + + /** Index with cdsStartStat and cdsEndStat */ + public static final String[] endStatCodes={"none", "unk", "incmpl", "cmpl"}; + + public static final String[] statusCodes={ + "Unknown","Reviewed","Validated","Provisional","Predicted","Inferred","Public" + +// "Public", "Reviewed, update pending", "Reviewed, withdrawal pending", +// "Withdrawn", "Withdrawn, inconsistent annotation", +// "Under review, withdrawal", "Under review, update", + + }; + + public static final String[] completenessCodes={ + "Unknown","Complete5End","Complete3End","FullLength","IncompleteBothEnds","Incomplete5End","Incomplete3End","Partial" + }; + + + /** Index with chromosome number */ + public static final String[] chromCodes={"A", "1", "2", "3", "4", "5", "6", + "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", + "19", "20", "21", "22", "X", "Y", "M", "U"}; + + /** Index with strand number */ + public static final String[] strandCodes={"+", "-", "?"}; + public static final char[] strandCodes2={'+', '-', '?'}; + + public static final byte PLUS=0; + public static final byte MINUS=1; + private static final int NEAR=Data.NEAR; + + public static final byte STAT_UNKNOWN=0; + public static final byte STAT_REVIEWED=1; + public static final byte STAT_VALIDATED=2; + public 
static final byte STAT_PROVISIONAL=3; + public static final byte STAT_PREDICTED=4; + public static final byte STAT_INFERRED=5; + public static final byte STAT_PUBLIC=6; + +} diff --git a/current/dna/GeneSet.java b/current/dna/GeneSet.java new file mode 100755 index 0000000..d08873e --- /dev/null +++ b/current/dna/GeneSet.java @@ -0,0 +1,132 @@ +package dna; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; + + +public class GeneSet implements Comparable{ + + public static void main(String[] args){ + Data.getGeneIDTable(); + } + + public GeneSet(String n, ArrayList g){ + name=n; + id=g.get(0).id; + genes=g; + chrom=g.get(0).chromosome; + transcripts=genes.size(); + assert(transcripts>0); + + byte st=-1; + + boolean pse=true, unt=true; + + for(int i=0; i genes; + public final int transcripts; + + /** True if all transcripts are untranslated */ + public final boolean untranslated; + /** True if all transcripts are psuedogenes */ + public final boolean pseudo; + + public int minStart=Integer.MAX_VALUE; + public int maxEnd=0; + + + public boolean intersects(int point){ + return point>=minStart && point<=maxEnd; + } + public boolean intersects(int point1, int point2){ + return point2>=minStart && point1<=maxEnd; + } + + + @Override + public int compareTo(GeneSet other) { + if(chrom!=other.chrom){ + return chrom>other.chrom ? 1 : -1; + } + int x=minStartother.minStart ? 1 : 0; + if(x!=0){return x;} + return x=name.compareTo(other.name); + } + + public boolean equals(Object other){ + return equals((GeneSet)other); + } + + public boolean equals(GeneSet other){ + return compareTo(other)==0; + } + + @Override + public int hashCode(){ + return Integer.rotateLeft(name.hashCode(), 5)^chrom; + } + + private static final int min(int x, int y){return xy ? x : y;} + private static final byte min(byte x, byte y){return xy ? x : y;} + private static final long min(long x, long y){return xy ? 
x : y;} + private static final float min(float x, float y){return xy ? x : y;} + +} \ No newline at end of file diff --git a/current/dna/IntMap.java b/current/dna/IntMap.java new file mode 100755 index 0000000..12f83ff --- /dev/null +++ b/current/dna/IntMap.java @@ -0,0 +1,106 @@ +package dna; +import java.util.Arrays; + + +public class IntMap { + + public static void main(String[] args){ + + } + + + public IntMap(int from, int to){ + reset(from, to); + } + + + public int get(int key){ + assert(key>=min && key<=max); + return array[key-min]; + } + + + public boolean containsKey(int key){ + assert(key>=min && key<=max); + return array[key-min]!=INVALID; + } + + + public int put(int key, int value){ + assert(key>=min && key<=max); + assert(value!=INVALID); + int index=key-min; + int old=array[index]; + array[index]=value; + return old; + } + + + public int remove(int key){ + assert(key>=min && key<=max); + int index=key-min; + int old=array[index]; + array[index]=INVALID; + return old; + } + + + public int size(){ + int sum=0; + for(int i=0; i=min); + assert(((long)max)-((long)min)=min && key<=max); + return array[key-min]; + } + + + public boolean containsKey(int key){ +// assert(key>=min && key<=max); + return array[key-min]!=INVALID; + } + + + public int put(int key, int value){ + assert(key>=min && key<=max); + assert(value!=INVALID); + int index=key-min; + int old=array[index]; + array[index]=value; + return old; + } + + + public int remove(int key){ + assert(key>=min && key<=max); + int index=key-min; + int old=array[index]; + array[index]=INVALID; + return old; + } + + + public int size(){ + int sum=0; + for(int i=0; i=min); + assert(((long)max)-((long)min)=size2){ + + }else{ + int[] oldArray=array; + array=new int[size2]; + + } + assert(false) : "TODO"; +// +// min=min2; +// max=max2; +// assert(max>=min); +// assert(((long)max)-((long)min) table=null; + + public static Set keys(){return table.keySet();} + + public static Matrix get(String s){ + 
if(table==null){ + table=new HashMap(64); +// fillTable("matrices.txt"); +// fillTable("matrices2.txt"); + +// fillTable("matrixN1.txt"); +// fillTable("matrixN2.txt"); +// fillTable("matrixN3.txt"); +// fillTable("matrixN4.txt"); + + fillTable("matrix_build37_N1.txt"); + fillTable("matrix_build37_N2.txt"); + fillTable("matrix_build37_N3.txt"); +// fillTable("matrix_build37_N4.txt"); + + + +// fillTable("asmGstart_sept9.txt"); +// fillTable("asmEstart_sept9.txt"); +// fillTable("asmTRstart_sept9.txt"); +// fillTable("asmGstop_sept9.txt"); +// fillTable("asmEstop_sept9.txt"); +// fillTable("asmTRstop_sept9.txt"); +// fillTable("asmEstop_sept16.txt"); + +// fillTable("SplicePercentiles_b37_Sept16.txt"); + fillTable("SplicePercentiles_b37_Nov24.txt"); + + } + Matrix m=table.get(s); + +// assert(table.containsKey(s)) : "\nCan't find "+s+" in\n\n"+table.keySet()+"\n"; +// assert(m!=null) : "\nValue for "+s+" is null\n"; + + if(!table.containsKey(s) || m==null){ + if(!table.containsKey(s)){throw new RuntimeException("Can't find "+s+" in\n\n"+table.keySet()+"\n");} + if(m==null){throw new RuntimeException("\nValue for "+s+" is null");} + } + + + return m; + } + + private static void fillTable(String fname){ + MatrixFile mf=new MatrixFile(fname); + Matrix mat=mf.nextMatrix(); + while(mat!=null){ +// System.out.println("Adding "+mat.name); + table.put(mat.name, mat); + table.put(mat.name.toLowerCase(), mat); + mat=mf.nextMatrix(); + } + mf.close(); + } + +} diff --git a/current/dna/Motif.java b/current/dna/Motif.java new file mode 100755 index 0000000..cd9134d --- /dev/null +++ b/current/dna/Motif.java @@ -0,0 +1,231 @@ +package dna; + +public abstract class Motif { + + public Motif(String name_, int length_, int center_){ + center=center_; + length=length_; + suffix=length-center-1; + name=name_; + +// assert(center>=0 && centerarray[pos]){pos=i;} + } + return pos; + } + + public String toString(){ + return name+", "+length+", "+center; + } + + + public final String name; 
+ public String commonLetters; + public final int center; + public final int length; + public final int suffix; + + + static final int min(int x, int y){return xy ? x : y;} + static final float min(float x, float y){return xy ? x : y;} + + static final byte[] numberToBase=AminoAcid.numberToBase; + static final byte[] numberToBaseExtended=AminoAcid.numberToBaseExtended; + static final byte[] baseToNumber=AminoAcid.baseToNumberACGTN; + static final byte[] baseToNumberExtended=AminoAcid.baseToNumberExtended; + + static final float[] baseProb1={0.256614f, 0.226617f, 0.238012f, 0.278756f}; + + //Within 200 of exon and gene ends only + static final float[] baseProb2={ + 0.076019f, 0.046405f, 0.071754f, 0.062437f, 0.067143f, 0.066057f, 0.020333f, 0.073085f, + 0.060553f, 0.054897f, 0.068741f, 0.053822f, 0.052896f, 0.059260f, 0.077188f, 0.089412f + }; + + //name: Overall Frequency MP3 + static final float[] baseProb3={ + 0.027343f, 0.011857f, 0.018295f, 0.018524f, 0.015942f, 0.012337f, 0.003792f, 0.014333f, + 0.019988f, 0.015837f, 0.020411f, 0.015518f, 0.014382f, 0.011355f, 0.016466f, 0.020234f, + 0.014364f, 0.014299f, 0.022875f, 0.015605f, 0.018893f, 0.019412f, 0.006677f, 0.021076f, + 0.003629f, 0.005854f, 0.006783f, 0.004067f, 0.010491f, 0.018413f, 0.024257f, 0.019924f, + 0.018029f, 0.010640f, 0.019427f, 0.012458f, 0.015158f, 0.017025f, 0.006167f, 0.016547f, + 0.018098f, 0.016891f, 0.020042f, 0.013710f, 0.010580f, 0.010773f, 0.018026f, 0.014443f, + 0.016281f, 0.009609f, 0.011157f, 0.015849f, 0.017150f, 0.017284f, 0.003696f, 0.021130f, + 0.018839f, 0.016316f, 0.021506f, 0.020527f, 0.017442f, 0.018720f, 0.018440f, 0.034811f + }; + +// protected static final Hashtable percentTable=makePercentTable(); +// +// private static final Hashtable makePercentTable(){ +// +// String[] keys={ +// "Exon Stops MP3", +// }; +// +// float[][] values={ +// { +// 0.00234f, 0.01071f, 0.02476f, 0.05155f, 0.08682f, 0.1453f, 0.22434f, 0.29615f, 0.36233f, 0.41034f, +// 0.46028f, 0.52224f, 
0.58198f, 0.63879f, 0.68356f, 0.70622f, 0.7268f, 0.75131f, 0.77065f, 0.79546f, +// 0.82445f, 0.85279f, 0.86899f, 0.88287f, 0.89197f, 0.90166f, 0.91405f, 0.93129f, 0.94708f, 0.95521f, +// 0.96106f, 0.96293f, 0.9663f, 0.97242f, 0.97662f, 0.97866f, 0.98017f, 0.98242f, 0.98459f, 0.98703f, +// 0.98957f, 0.99064f, 0.99157f, 0.99286f, 0.9952f, 0.99721f, 0.99858f, 0.99914f, 0.99967f, 0.9999f, 0.99998f +// }, +// }; +// +// Hashtable r= new Hashtable(); +// for(int i=0; i "+array[index]+" -> "+array[index+1]+" *** "); + + if(index>=array.length-1){return 1;} + + float a, b; + if(index==0){ + a=0; + b=array[0]; + }else{ + a=array[index]; + b=array[index+1]; + } + + float ratio=strength-(index/((float)array.length)); + + return ratio*b+(1-ratio)*a; + + } + +} diff --git a/current/dna/MotifMulti.java b/current/dna/MotifMulti.java new file mode 100755 index 0000000..89b6c3d --- /dev/null +++ b/current/dna/MotifMulti.java @@ -0,0 +1,59 @@ +package dna; +import java.util.Arrays; + + +public class MotifMulti extends Motif { + + public MotifMulti(String name_, Motif...args){ + super(name_, args[0].length, args[0].center); + commonLetters=Arrays.toString(args); + sub=args; + } + + + public boolean matchesExactly(byte[] source, int a){ + for(int i=0; i0){s1=args[0];} +// if(args.length>1){s2=args[1];} + + MotifProbsN m=makeMotif("Exon Stops MP3", 10, 3, 3); + + System.out.println("Made motif "+m.name); + + String source=s2; + + + int x=m.countExact(source); + System.out.println(x+" matches."); + + byte[] sbytes=source.getBytes(); + + for(int i=0; i\t%.5f", p, m.normalize(p))); + } + + } + + public static MotifProbsN makeMotif(String name_, int length_, int center_, int n_){ + Matrix mat=Matrix.get(name_); + assert(mat!=null) : "\nCan't find '"+name_+"' in:\n"+Matrix.keys()+"\n\n"; + float[][] sub=mat.subGrid(center_, length_); + +// System.out.println("Found "+name+":\n"+Arrays.toString(sub[preLen])); + + assert(sub[0].length==(1<<(2*n_))); + + MotifProbsN r=new MotifProbsN(name_, 
sub, center_, n_); + + Matrix percentMatrix=null; + + + try { + percentMatrix=Matrix.get(name_+", "+r.length+", "+r.center); + } catch (Exception e) { + // TODO Auto-generated catch block +// System.out.println("\nIgnoring missing percentMatrix:\n"+e); + } + + if(percentMatrix!=null){ + r.percentile=percentMatrix.grid[0]; + } +// r.percentile=percentTable.get(name); + + return r; + } + + public MotifProbsN(String name_, float[][] p, int cen, int n){ + super(name_, p.length, cen); + + N=n; + chunk=new byte[N]; + baseProb=Motif.baseProbN[N]; + + probs=p; + importance=positionImportance(probs); + + adjustForBaseProb(probs, baseProb); + + double pmin=1, pmax=1; + + double sum=0; + for(int i=0; i>(2*(N-1))]); + +// pmax*=probs[i][x]*4; //TODO Note the .25; could be an empirical inverse probability, but that causes complications +// pmin*=probs[i][y]*4; + + pmax*=probs[i][x]; + pmin*=probs[i][y]; + +// pmax*=Math.pow(probs[i][x], 1+importance[i]); +// pmin*=Math.pow(probs[i][y], 1+importance[i]); + +// pmax*=(probs[i][x]+(matrixAvg*importance[i]*.1f)); +// pmin*=(probs[i][y]+(matrixAvg*importance[i]*.1f)); + } + + + maxProb=(float)pmax; + minProb=(float)pmin; + + invProbDif=1f/(maxProb-minProb); + invLength=1f/(length); + + commonLetters=sb.toString(); + + lettersUpper=commonLetters.toUpperCase().getBytes(); + lettersLower=commonLetters.toLowerCase().getBytes(); + + numbers=new byte[commonLetters.length()]; + numbersExtended=new byte[commonLetters.length()]; + + for(int i=0; isource.length){return false;} + + for(int i=0; isource.length){return minProb;} + + float r=1; + + for(int i=0; ibaseProb.length){return minProb;} + +// float p1=(probs[i][n]+(matrixAvg*importance[i]*.1f)); + +// float p1=(float)Math.pow(probs[i][n], 1+importance[i]); //Note: Assumes (A,C,G,T) only. + float p1=probs[i][n]; //Note: Assumes (A,C,G,T) only. 
+ +// float p2=invBaseProb2[n]; +// float p2=4; //TODO +// +// r=r*p1*p2; + + r=r*p1; + } + return r; + } + + + public float[] positionImportance(float[][] rawProbs){ + float[] base=baseProb; + float[] out=new float[rawProbs.length]; + + double maxSum=0; + + for(int i=0; imaxSum){ + maxSum=sum; + } + } + + for(int i=0; i0){s1=args[0];} + if(args.length>1){s2=args[1];} + + MotifSimple m=new MotifSimple(s1, 0); + String source=s2; + + + int x=m.countExtended(source); + System.out.println(x+" matches."); + } + + public MotifSimple(String s, int cen){ + super(s, s.length(), cen); + + commonLetters=s; + lettersUpper=commonLetters.toUpperCase().getBytes(); + lettersLower=commonLetters.toLowerCase().getBytes(); + + boolean x=false; + for(int i=0; isource.length){return false;} + + for(int i=0; isource.length){return false;} + + for(int i=0; i{ + + /** A numeric range, assuming 0-based, base-centered numbering. */ + public Range(int aa, int bb){ + + assert(aa<=bb) : aa+">"+bb; + a=aa; + b=bb; + length=b-a+1; + } + + public static Range toRange(String s){ + String[] s2=s.replace("[","").replace("]","").replace("(","").replace(")","").replace(",","").split("-"); + + int a, b; + if(s2.length==1){ + a=b=Integer.parseInt(s2[0]); + }else{ + a=Integer.parseInt(s2[0]); + b=Integer.parseInt(s2[1]); + } + return new Range(a, b); + } + + @Override + public int compareTo(Range other) { + if(aother.a){return 1;} + + if(bother.b){return 1;} + + return 0; + } + + public boolean includes(int p){ + return p>=a && p<=b; + } + + public boolean intersects(int p1, int p2){ + return overlap(a, b, p1, p2); + } + + public boolean includes(int p1, int p2){ + assert(p1<=p2); + return p1>=a && p2<=b; + } + + public boolean intersects(Range other){ + return intersects(other.a, other.b); + } + + public boolean touches(Range other){ + if(intersects(other.a, other.b)){return true;} + return b==other.a-1 || a==other.b+1; + } + + public boolean includes(Range other){ + return includes(other.a, other.b); + 
} + + public boolean equals(Object other){ + return equals((Range)other); + } + + public Range merge(Range other){ + assert(touches(other)); + Range r=new Range(min(a, other.a), max(b, other.b)); + + assert(r.includes(this)); + assert(r.includes(other)); + assert(r.length<=length+other.length); + return r; + } + + public boolean equals(Range other){ + return a==other.a && b==other.b; + } + + public int hashCode(){ + return new Long(Long.rotateLeft(a, 16)^b).hashCode(); + } + + public String toString(){ + return "("+a+(a==b ? "" : (" - "+b))+")"; + } + + + public static boolean overlap(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=b1 && b2>=a1; + } + + + public static Range[] toRanges(int[] ...arrays){ + int len=0; + int[] combined=null; + + if(arrays.length==1){ + combined=arrays[0]; + len=combined.length; + }else{ + for(int i=0; i list=new ArrayList(16); + int start=combined[0], last=combined[0]; + +// System.out.println(Arrays.toString(combined)); + + for(int i=0; ilast+1){ + list.add(new Range(start, last)); + start=last=x; + }else{ + last=x; + } + } + list.add(new Range(start, last)); + return list.toArray(new Range[list.size()]); + } + + + private static final int min(int x, int y){return xy ? 
x : y;} + + public final int a; + public final int b; + public final int length; + + public Object obj1=null; + public Object obj2=null; +} diff --git a/current/dna/ScafLoc.java b/current/dna/ScafLoc.java new file mode 100755 index 0000000..18daeb0 --- /dev/null +++ b/current/dna/ScafLoc.java @@ -0,0 +1,20 @@ +package dna; + +/** + * @author Brian Bushnell + * @date Sep 24, 2013 + * + */ +public class ScafLoc { + + public ScafLoc(String name_, int chrom_, int loc_){ + name=name_; + chrom=chrom_; + loc=loc_; + } + + public String name; + public int chrom; + public int loc; + +} diff --git a/current/dna/Scaffold.java b/current/dna/Scaffold.java new file mode 100755 index 0000000..419a7c8 --- /dev/null +++ b/current/dna/Scaffold.java @@ -0,0 +1,71 @@ +package dna; + +/** + * @author Brian Bushnell + * @date Jan 4, 2013 + * + */ +public class Scaffold implements Comparable { + + public Scaffold(String name_, String assembly_, int length_){ + name=name_; + assembly=assembly_; + length=length_; + } + + /** Assumes SAM format. + * e.g.
@SQ SN:scaffold_0 LN:1785514 AS:build 9 */ + public Scaffold(String s){ + this(s.split("\t")); + } + + /** Assumes SAM format */ + public Scaffold(String[] split) { + assert(split.length>2 && split[0].equals("@SQ")); + for(String s : split){ + if(s.equals("@SQ")){ + //Do nothing + }else if(s.startsWith("SN:")){ + assert(name==null); + name=new String(s.substring(3)); //Data.forceIntern(s.substring(3)); + }else if(s.startsWith("LN:")){ + length=Integer.parseInt(s.substring(3)); + }else if(s.startsWith("AS:")){ + assembly=Data.forceIntern(s.substring(3)); + } + } + assert(length>-1); + assert(name!=null); + } + + public Scaffold(String name_, int length_) { + name=name_; + length=length_; + } + + @Override + public int hashCode(){ + return name.hashCode(); + } + + public int compareTo(Scaffold other){ + return name.compareTo(other.name); + } + + public String toString(){ + return "@SQ\tSN:"+name+"\tLN:"+length+(assembly==null ? "" : "\tAS:"+assembly); + } + + public String name; + public String assembly; + public int length=-1; + public long basehits=0; + + /** {A,C,G,T,N} */ + public long[] basecount; + public float gc; + + /** For attaching things */ + public Object obj; + +} diff --git a/current/dna/Timer.java b/current/dna/Timer.java new file mode 100755 index 0000000..15f72a3 --- /dev/null +++ b/current/dna/Timer.java @@ -0,0 +1,28 @@ +package dna; + +public class Timer { + + public Timer(){} + + public long start(){ + time1=time2=System.nanoTime(); + elapsed=0; + return time1; + } + + public long stop(){ + time2=System.nanoTime(); + elapsed=time2-time1; + return time2; + } + + public String toString(){ + return String.format("%.3f seconds.", elapsed/1000000000d); + } + + public long time1; + public long time2; + /** in nanos */ + public long elapsed; + +} diff --git a/current/driver/ClearRam.java b/current/driver/ClearRam.java new file mode 100755 index 0000000..e5ecaa9 --- /dev/null +++ b/current/driver/ClearRam.java @@ -0,0 +1,64 @@ +package driver; + +import 
java.util.ArrayList; + +import fileIO.ReadWrite; + +public class ClearRam { + + public static void main(String[] args){ + + for(int i=0; i<2; i++){ + + try { + System.gc(); + attempt(); + } catch(final java.lang.OutOfMemoryError e) { +// e.printStackTrace(); + System.err.println("Out of memory at "+((current*8)/(1<<20))+" MB"); + } + } + } + + public static void attempt(){ + ArrayList list=new ArrayList(8000); + current=0; + + while(true){ + long[] array=null; + + array=new long[1<<20]; + list.add(array); + +// for(int i=0; icha1.maxIndex ? cha1.maxIndex : cha2.maxIndex; + + for(int i=0; i1 ? args[1] : null); + if(new File(in).isDirectory()){ + try { + concatenateDirectory(in, out); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + }else{ + concatenatePattern(in, out); + } + t.stop(); + System.err.println(t); + + } + + public static void concatenatePattern(final String basename, final String out){ + assert(false) : "This is human-specific."; + String outname=(out==null ? 
basename.replace("#", "ALL") : out); + + TextStreamWriter tsw=new TextStreamWriter(outname, true, false, true); + tsw.start(); + + for(int chrom=1; chrom<26; chrom++){ + String fname=basename.replace("#", Gene.chromCodes[chrom]); + TextFile tf=new TextFile(fname, false, true); + + tsw.print(">chr"+Gene.chromCodes[chrom]+"\n"); + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + char c=s.charAt(0); + if(c!='>' && c!=';'){ + tsw.println(s); + } + } + System.err.print("."); + } + tsw.poison(); + tsw.waitForFinish(); + } + + public static void concatenateDirectory(final String in, String out) throws IOException{ + if(out==null){out="stdout";} + + final byte[] buf=new byte[32768]; + + final File dir=new File(in); + final File[] files=dir.listFiles(); + Arrays.sort(files); + + final File outfile=new File(out); + final OutputStream os=ReadWrite.getOutputStream(out, false, true, true); + + for(File f : files){ + if(f!=null && f.isFile() && !f.equals(outfile)){ + String fname=f.getAbsolutePath(); + System.err.println("Processing "+fname); + + InputStream is=ReadWrite.getInputStream(fname, false, false); + + for(int lim=is.read(buf); lim>0; lim=is.read(buf)){ + os.write(buf, 0, lim); + } + + is.close(); + System.err.print("."); + } + } + ReadWrite.close(os); + } + + +} diff --git a/current/driver/ConcatenateTextFiles.java b/current/driver/ConcatenateTextFiles.java new file mode 100755 index 0000000..e6e63ba --- /dev/null +++ b/current/driver/ConcatenateTextFiles.java @@ -0,0 +1,190 @@ +package driver; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.concurrent.ArrayBlockingQueue; + +import align2.Tools; + +import dna.Timer; + +import fileIO.ReadWrite; +import fileIO.TextFile; + +public class ConcatenateTextFiles { + + /** Format: infile1,infile2,...infileN,outfile */ + public static void main(String[] args){ + 
System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + Timer t=new Timer(); + t.start(); + + if(ReadWrite.ZIPLEVEL<6){ReadWrite.ZIPLEVEL=6;} + + for(int i=0; i1 ? split[1] : "true"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + System.out.println("Set ziplevel to "+b); + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=Tools.parseBoolean(b); + }else{ + concatenate(args[i].split(",")); + } + + } + t.stop(); + System.out.println(); + System.out.println("Time: \t"+t); + } + + private static void concatenate(String[] split) { + String outname=split[split.length-1]; + assert(OVERWRITE || !new File(outname).exists()) : outname+" exists."; +// OutputStream os=ReadWrite.getOutputStream(outname, false); +// PrintWriter writer=new PrintWriter(os); + + WriteThread wt=new WriteThread(outname); + wt.start(); + + + ArrayList[] bufferptr=new ArrayList[] {new ArrayList(LIST_SIZE)}; + + for(int i=0; i buffer=bufferptr[0]; + if(buffer==null){ + wt.add(new ArrayList(1)); + }else if(buffer.isEmpty()){ + wt.add(buffer); + }else{ + wt.add(buffer); + wt.add(new ArrayList(1)); + } + + } + + private static void processTerm(String term, ArrayList[] bufferptr, WriteThread wt){ + + System.out.println("Processing term "+term); + + File f=new File(term); + if(!f.isDirectory()){ + + TextFile tf=new TextFile(term, false, false); + + ArrayList buffer=bufferptr[0]; + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + buffer.add(s); + + // System.out.println("Added to buffer"); + if(buffer.size()>=LIST_SIZE){ + // System.out.println("Sent buffer"); + +// System.out.println("****** "+term+" ******"); +// for(String b : buffer){ +// System.out.println(b); +// } + + 
wt.add(buffer); + bufferptr[0]=buffer=new ArrayList(LIST_SIZE); + } + } + tf.close(); + }else{ + assert(f.isDirectory()); + File[] contents=f.listFiles(); + for(File c : contents){ + String abs=c.getAbsolutePath(); + if(!abs.equals(wt.fname)){ +// System.out.println(c+" == "+new File(wt.fname)+" : "+c.equals(new File(wt.fname))); + processTerm(abs, bufferptr, wt); + } + } + } + } + + private static class WriteThread extends Thread{ + + public WriteThread(String fname_){ + String temp=fname_; + try { + temp=new File(fname_).getCanonicalPath(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + fname=temp; + os=ReadWrite.getOutputStream(fname, false, true, true); + writer=new PrintWriter(os); + } + + public void add(ArrayList list){ + assert(list!=null); + while(list!=null){ +// System.out.println("Adding list to queue "+queue.size()); + try { + queue.put(list); + list=null; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + @Override + public void run(){ + + ArrayList list=null; + while(list==null){ +// System.out.println("Waiting for list..."); + try { + list=queue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } +// System.out.println("Took list of size "+(list==null ? 
"null" : list.size()+"")); + if(list!=null){ + if(list.isEmpty()){ + ReadWrite.finishWriting(writer, os, fname, allowSubprocess); + return; + } + for(String s : list){ + if(s!=null){writer.println(s);} + } + } + list=null; + } + } + + private final OutputStream os; + private final PrintWriter writer; + private final ArrayBlockingQueue> queue=new ArrayBlockingQueue>(MAX_LISTS); + private final String fname; + + } + + public static final int MAX_LISTS=8; + public static final int LIST_SIZE=100; + public static boolean OVERWRITE=true; + public static boolean allowSubprocess=true; + +} diff --git a/current/driver/Concatenator.java b/current/driver/Concatenator.java new file mode 100755 index 0000000..216d163 --- /dev/null +++ b/current/driver/Concatenator.java @@ -0,0 +1,60 @@ +package driver; + +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +public class Concatenator { + + + public static void main(String args[]){ + + assert(args.length==2 && !args[1].contains(",")); + TextStreamWriter tsw=new TextStreamWriter(args[1], false, false, true); + tsw.start(); + for(String s : args[0].split(",")){ + writeFile(s, tsw); + } + tsw.poison(); + } + + public static void writeFile(String fname, TextStreamWriter tsw){ + TextFile tf=new TextFile(fname, false, false); + if(tsw==null){ + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + System.out.println(s); + } + }else{ + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + tsw.println(s); + } + } + tf.close(); + } + + + public static StringBuilder merge(String[] fnames){ + StringBuilder sb=new StringBuilder(); + + for(int i=0; i1) : s; + + boolean success=true; + boolean nomap=false; + boolean reverse=false; + + int flag=-1; + String chrom=null; + int loc=-1; + + try { + flag=Integer.parseInt(line[1]); + chrom=line[2]; + loc=Integer.parseInt(line[3]); + nomap=((flag&0x4)!=0); + reverse=((flag&0x10)!=0); + } catch (NumberFormatException e) { + success=false; + } + + if(success && !nomap){ + String 
aln=chrom+"\t"+loc+"\t"+(reverse ? "R" : "F")+"\n"; + out.print(aln); + } + + + } + } + + tf.close(); + out.flush(); + out.close(); + + } + +} diff --git a/current/driver/CountNumberOfBaitedGenes.java b/current/driver/CountNumberOfBaitedGenes.java new file mode 100755 index 0000000..9aa6937 --- /dev/null +++ b/current/driver/CountNumberOfBaitedGenes.java @@ -0,0 +1,91 @@ +package driver; + +import java.util.HashMap; + +import dna.Data; +import dna.Exon; +import dna.Gene; +import dna.GeneSet; + +public class CountNumberOfBaitedGenes { + + + public static void main(String[] args){ + + Data.GENOME_BUILD=37; + Data.GENE_MAP="refGene"; + + int genes=0; + int partly=0; + int fully=0; + + Data.getBaits(); + + for(byte chrom=1; chrom<=24; chrom++){ + HashMap table=Data.geneNameTable(chrom); + + + + for(GeneSet gs : table.values()){ + boolean fullybaited=false; + boolean partlybaited=false; + boolean exists=false; + for(Gene g : gs.genes){ + if(!g.untranslated && g.codeLength>1){ + exists=true; + fullybaited=(fullybaited||isFullyBaited(g)); + partlybaited=(partlybaited||isPartlyBaited(g)); + } + if(fullybaited){break;} + } + if(fullybaited){fully++;} + if(partlybaited){partly++;} + if(exists){genes++;} + } + System.err.print("."); + } + System.err.println(); + System.err.println("Genes: \t"+genes); + System.err.println("Fully Baited: \t"+fully); + System.err.println("Partly Baited: \t"+partly); + + } + + + public static boolean isFullyBaited(Gene g){ + + int count=0; + + for(Exon e : g.exons){ + for(int i=e.a; i<=e.b; i++){ + if(i>=g.codeStart && i<=g.codeStop){ + if(!Data.isBaited(g.chromosome, i, 0)){return false;} + count++; + } + } + } + + if(count<1){return false;} //No content + return true; + } + + + public static boolean isPartlyBaited(Gene g){ + + int count=0; + + for(Exon e : g.exons){ + for(int i=e.a; i<=e.b; i++){ + if(i>=g.codeStart && i<=g.codeStop){ + if(Data.isBaited(g.chromosome, i, 0)){return true;} + count++; + } + } + } + + if(count<1){return false;} 
//No content + return false; + } + + +} diff --git a/current/driver/CountRNAs.java b/current/driver/CountRNAs.java new file mode 100755 index 0000000..74a0225 --- /dev/null +++ b/current/driver/CountRNAs.java @@ -0,0 +1,32 @@ +package driver; + +import dna.Data; +import dna.Gene; + +public class CountRNAs { + + public static void main(String[] args){ + Data.GENOME_BUILD=Integer.parseInt(args[0]); + Data.GENE_MAP=args[1]; + long coding=0; + long noncoding=0; + long pseudo=0; + for(byte chrom=1; chrom<=24; chrom++){ + Gene[] genes=Data.getGenes(chrom); + for(Gene g : genes){ + if(g.pseudo){ + pseudo++; + }else if(g.untranslated){ + noncoding++; + }else{ + coding++; + } + } + } + System.out.println("Gene map: "+Data.GENE_MAP); + System.out.println("Pseudogenes: "+pseudo); + System.out.println("Translated Genes: "+coding); + System.out.println("Untranslated Genes: "+noncoding); + } + +} diff --git a/current/driver/FindMotifs.java b/current/driver/FindMotifs.java new file mode 100755 index 0000000..e4b1641 --- /dev/null +++ b/current/driver/FindMotifs.java @@ -0,0 +1,359 @@ +package driver; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.Collections; +import java.util.HashSet; + +import dna.ChromosomeArray; +import dna.Data; +import dna.Gene; +import dna.GeneSet; +import dna.Motif; +import dna.MotifProbsN; + + +public class FindMotifs { + + + public static void main(String[] args){ + + int chrom=1; + if(args.length>0){ + chrom=Integer.parseInt(args[0]); + } + + int maxChrom=22; + + + float[][] grid={ + +// {0.19540f, 0.26751f, 0.34873f, 0.18835f}, +// {0.18987f, 0.33930f, 0.28953f, 0.18131f}, +// {0.19421f, 0.32921f, 0.28259f, 0.19399f}, + + {0.19519f, 0.23856f, 0.38961f, 0.17664f}, + {0.17382f, 0.33995f, 0.30720f, 0.17903f}, + {0.24452f, 0.38376f, 0.25710f, 0.11462f}, + + {0.46075f, 0.09954f, 0.38018f, 0.05953f}, + {0.29028f, 0.38874f, 0.19941f, 0.12156f}, + {0.17610f, 0.46129f, 0.28953f, 0.07309f}, + + {0.99859f, 
0.00108f, 0.00011f, 0.00022f}, + {0.00001f, 0.00001f, 0.00076f, 0.99924f}, + {0.00001f, 0.00001f, 0.99924f, 0.00076f}, +// + {0.20993f, 0.14877f, 0.51269f, 0.12861f}, + {0.26903f, 0.39861f, 0.18337f, 0.14899f}, +// {0.14812f, 0.26448f, 0.39286f, 0.19453f}, +// {0.23476f, 0.24073f, 0.35697f, 0.16753f}, +// {0.24886f, 0.32043f, 0.22511f, 0.20560f}, +// {0.16504f, 0.31956f, 0.34288f, 0.17252f}, +// {0.23444f, 0.26838f, 0.32531f, 0.17187f}, + }; + + + float[][] gridATG={ + {1, 0, 0, 0}, + {0, 0, 0, 1}, + {0, 0, 1, 0}, + }; + + + float[][] grid2={ +// {0.03793f, 0.04397f, 0.06643f, 0.02087f, 0.06272f, 0.11378f, 0.06177f, 0.07713f, 0.06526f, 0.10277f, 0.09535f, 0.04248f, 0.02564f, 0.06590f, 0.05943f, 0.05859f}, +// {0.04534f, 0.04238f, 0.07427f, 0.02956f, 0.06897f, 0.11463f, 0.06579f, 0.07702f, 0.05594f, 0.09821f, 0.08677f, 0.04206f, 0.02363f, 0.05943f, 0.05721f, 0.05880f}, +// {0.04397f, 0.04524f, 0.07766f, 0.02702f, 0.06537f, 0.10372f, 0.07130f, 0.07427f, 0.05234f, 0.09609f, 0.09397f, 0.04164f, 0.02808f, 0.05954f, 0.06399f, 0.05583f}, +// +// {0.04990f, 0.04164f, 0.07342f, 0.02479f, 0.06039f, 0.11294f, 0.06113f, 0.07013f, 0.06929f, 0.10234f, 0.09556f, 0.03973f, 0.02998f, 0.06102f, 0.05329f, 0.05445f}, +// {0.05075f, 0.04725f, 0.08391f, 0.02765f, 0.06590f, 0.11664f, 0.06388f, 0.07151f, 0.06770f, 0.10012f, 0.07840f, 0.03719f, 0.02458f, 0.06113f, 0.05424f, 0.04916f}, +// {0.04831f, 0.04005f, 0.08931f, 0.03125f, 0.06685f, 0.09249f, 0.09546f, 0.07035f, 0.05583f, 0.08359f, 0.10234f, 0.03867f, 0.02278f, 0.05287f, 0.06293f, 0.04693f}, +// +// {0.04587f, 0.05117f, 0.07045f, 0.02627f, 0.05488f, 0.09450f, 0.05541f, 0.06420f, 0.06664f, 0.13328f, 0.10478f, 0.04534f, 0.02087f, 0.06208f, 0.05912f, 0.04513f}, + {0.04598f, 0.04015f, 0.07321f, 0.02892f, 0.06261f, 0.12575f, 0.07331f, 0.07935f, 0.06282f, 0.10637f, 0.08370f, 0.03687f, 0.02246f, 0.05721f, 0.05318f, 0.04810f}, + {0.03952f, 0.03189f, 0.09704f, 0.02543f, 0.07639f, 0.08666f, 0.10266f, 0.06378f, 0.05424f, 0.07850f, 0.11166f, 
0.03899f, 0.02426f, 0.04270f, 0.07819f, 0.04810f}, + + {0.04312f, 0.04291f, 0.08878f, 0.01960f, 0.03666f, 0.09800f, 0.04577f, 0.05933f, 0.07098f, 0.14207f, 0.12395f, 0.05255f, 0.02129f, 0.05795f, 0.04990f, 0.04714f}, + {0.04662f, 0.04672f, 0.06335f, 0.01536f, 0.09069f, 0.14980f, 0.05488f, 0.04556f, 0.07670f, 0.11654f, 0.09026f, 0.02490f, 0.03009f, 0.07162f, 0.04895f, 0.02797f}, + {0.09503f, 0.01695f, 0.11802f, 0.01409f, 0.20945f, 0.03856f, 0.11622f, 0.02045f, 0.11929f, 0.02998f, 0.09440f, 0.01377f, 0.03464f, 0.01504f, 0.05255f, 0.01155f}, + + {0.16580f, 0.13751f, 0.11325f, 0.04185f, 0.02532f, 0.03687f, 0.01409f, 0.02426f, 0.08645f, 0.19578f, 0.05700f, 0.04195f, 0.01165f, 0.01992f, 0.01451f, 0.01377f}, + {0.07257f, 0.06929f, 0.13042f, 0.01695f, 0.04693f, 0.25532f, 0.05943f, 0.02839f, 0.04079f, 0.06918f, 0.07649f, 0.01240f, 0.01420f, 0.06876f, 0.02373f, 0.01515f}, + {0.17417f, 0.00021f, 0.00011f, 0.00001f, 0.46213f, 0.00021f, 0.00001f, 0.00021f, 0.28944f, 0.00064f, 0.00001f, 0.00001f, 0.07289f, 0.00001f, 0.00001f, 0.00001f}, + +/* */ {0.00001f, 0.00001f, 0.00074f, 0.99788f, 0.00001f, 0.00001f, 0.00001f, 0.00106f, 0.00001f, 0.00001f, 0.00001f, 0.00011f, 0.00001f, 0.00001f, 0.00001f, 0.00021f}, + {0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00074f, 0.00001f, 0.00001f, 0.99926f, 0.00001f}, + {0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.00001f, 0.20934f, 0.14769f, 0.51330f, 0.12893f, 0.00032f, 0.00001f, 0.00042f, 0.00001f}, + + {0.07416f, 0.04132f, 0.06346f, 0.03072f, 0.03252f, 0.05265f, 0.01843f, 0.04407f, 0.15447f, 0.23382f, 0.07946f, 0.04598f, 0.00795f, 0.07194f, 0.02182f, 0.02723f}, +// {0.04428f, 0.05859f, 0.11855f, 0.04767f, 0.06155f, 0.11622f, 0.12681f, 0.09514f, 0.02829f, 0.05922f, 0.06621f, 0.02945f, 0.01335f, 0.03157f, 0.08168f, 0.02140f}, +// {0.04015f, 0.02691f, 0.06166f, 0.01875f, 0.07850f, 0.06876f, 0.06314f, 0.05520f, 0.08910f, 0.09916f, 0.15319f, 0.05181f, 
0.02712f, 0.04534f, 0.08020f, 0.04100f}, +// {0.07522f, 0.05785f, 0.06282f, 0.03899f, 0.05244f, 0.07151f, 0.04428f, 0.07194f, 0.10965f, 0.11601f, 0.08306f, 0.04948f, 0.01165f, 0.07416f, 0.03644f, 0.04450f}, +// {0.05149f, 0.06187f, 0.09959f, 0.03602f, 0.05996f, 0.11749f, 0.07787f, 0.06420f, 0.03094f, 0.08083f, 0.07342f, 0.04142f, 0.02214f, 0.06018f, 0.09185f, 0.03072f}, +// {0.04195f, 0.03380f, 0.06717f, 0.02161f, 0.08656f, 0.09164f, 0.07342f, 0.06876f, 0.07660f, 0.09810f, 0.12003f, 0.04799f, 0.02871f, 0.04460f, 0.06568f, 0.03337f}, +// {0.07819f, 0.05318f, 0.05710f, 0.04534f, 0.05668f, 0.08232f, 0.04471f, 0.08444f, 0.09948f, 0.10520f, 0.06590f, 0.05573f, 0.01685f, 0.06378f, 0.03697f, 0.05414f}, + }; + + + Motif gstartMotif=new MotifProbsN("Gene Starts MP1", grid, 6, 1); + Motif gstartATG=new MotifProbsN("ATG Gene Starts MP1", gridATG, 0, 1); + + Motif gstartMotif2=new MotifProbsN("Gene Starts MP2", grid2, 8, 2); +// +// Motif estartMotif_ag=new MotifProbs(grid_ag, 9); +// Motif estartMotif_ac=new MotifProbs(grid_ac, 9); +// Motif estartMotif_atg=new MotifProbs(grid_atg, 9); +// Motif estartMotif_nonagac=new MotifProbs(grid_nonagac, 9); +// +// Motif estartMotif2_ag=new MotifProbsN(grid2_ag, 10); +// Motif estartMotif2_ac=new MotifProbsN(grid2_ac, 9); +// Motif estartMotif2_nonagac=new MotifProbsN(grid2_nonagac, 10); + +// Motif estartMotif_multi=new MotifMulti(estartMotif_ag, estartMotif_ac, estartMotif_nonagac); +// Motif estartMotif2_multi=new MotifMulti(estartMotif2_ag, estartMotif2_ac, estartMotif2_nonagac); + + Motif m=gstartMotif2; + + + ArrayList firstBeaten=new ArrayList(); + + long count=0; + for(chrom=1; chrom<=maxChrom; chrom++){ +// count+=analyzeChromosomeGStarts(chrom, m, locations); +// count+=analyzeChromosomeGStartsStronger(chrom, m, locations, firstBeaten); +// count+=analyzeChromosomeGStartsStrongerInFrame(chrom, m, locations, firstBeaten, true, Gene.PLUS); + count+=analyzeChromosomeGStartsStrongerInFrame(chrom, m, locations, firstBeaten, true, 
Gene.MINUS); + Data.unload(chrom, true); + } + + Collections.sort(locations); + + int[] histogram=new int[CLEN+1]; + int[] histogramBeaten=new int[CLEN+1]; + for(Integer i : locations){ + histogram[i]++; + } + for(Integer i : firstBeaten){ + histogramBeaten[i]++; + } + + System.out.println(count+" sites analyzed. ATG occurances:"); + for(int i=0; i list, byte strand){ + GeneSet[] genes=Data.getGeneSets(chrom); + ChromosomeArray ca=Data.getChromosome(chrom, strand); + + HashSet eset=new HashSet(); + for(GeneSet g : genes){ + if(g.strand==strand){ + if(strand==Gene.PLUS){ + eset.add(g.minStart); + }else{ + eset.add(ca.maxIndex-g.maxEnd); + } + } + } + + ArrayList list2=new ArrayList(eset.size()); + list2.addAll(eset); + Collections.sort(list2); + + for(Integer x : list2){ + + for(int i=CLEN; i>=0; i--){ + int pos=x-i; + float f=analyze(pos, m, ca); + if(f>=THRESH){ + list.add(i); + } + } + } + return list2.size(); + } + + + public static long analyzeChromosomeGStartsStronger(int chrom, Motif m, ArrayList list, ArrayList listBeat, byte strand){ + GeneSet[] genes=Data.getGeneSets(chrom); + ChromosomeArray ca=Data.getChromosome(chrom, strand); + + HashSet eset=new HashSet(); + for(GeneSet g : genes){ + if(g.strand==strand){ + if(strand==Gene.PLUS){ + eset.add(g.minStart); + }else{ + eset.add(ca.maxIndex-g.maxEnd); + } + } + } + + ArrayList list2=new ArrayList(eset.size()); + list2.addAll(eset); + Collections.sort(list2); + + for(Integer x : list2){ + +// for(int i=CLEN; i>=0; i--){ +// int pos=x-i; +// float f=analyze(pos, list, m, ca); +// if(f>=THRESH){ +// list.add(i); +// } +// } + + int firstBeaten=CLEN+1; + float basis=analyze(x, m, ca); + for(int i=0; i<=CLEN; i++){ + int pos=x-i; + float f=analyze(pos, m, ca); + if(f>=basis){ + if(i>0 && i list, ArrayList listBeat, boolean in, byte strand){ + GeneSet[] genes=Data.getGeneSets(chrom); + ChromosomeArray ca=Data.getChromosome(chrom, strand); + + HashSet eset=new HashSet(); + for(GeneSet g : genes){ + 
if(g.strand==strand){ + if(strand==Gene.PLUS){ + eset.add(g.minStart); + }else{ + eset.add(ca.maxIndex-g.maxEnd); + } + } + } + + ArrayList list2=new ArrayList(eset.size()); + list2.addAll(eset); + Collections.sort(list2); + + for(Integer x : list2){ + +// for(int i=CLEN; i>=0; i--){ +// int pos=x-i; +// float f=analyze(pos, list, m, ca); +// if(f>=THRESH){ +// list.add(i); +// } +// } + + int firstBeaten=CLEN+1; + float basis=analyze(x, m, ca); + for(int i=0; i<=CLEN; i++){ + int pos=x-i; + + + if((in && i%3==0) || (!in && i%3==1)){ + float f=analyze(pos, m, ca); + if(f>=basis){ + if(i>0 && i locations=new ArrayList(); + + + + private static final int min(int x, int y){return xy ? x : y;} + +} diff --git a/current/driver/GenerateNoCallsFromCoverage.java b/current/driver/GenerateNoCallsFromCoverage.java new file mode 100755 index 0000000..2c89f16 --- /dev/null +++ b/current/driver/GenerateNoCallsFromCoverage.java @@ -0,0 +1,426 @@ +package driver; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import var.VarLine; +import var.Variation; + +import dna.ChromosomeArray; +import dna.CoverageArray1; +import dna.Data; +import dna.Gene; + +public class GenerateNoCallsFromCoverage { + +// @Deprecated +// public static ArrayList generateOld(byte chrom, CoverageArray ca, int build, char gender){ +// +// ArrayList lines=new ArrayList(256); +// +// assert(Data.GENOME_BUILD==build); +// ChromosomeArray chra=Data.getChromosome(chrom); +// +// int start=-1; +// int stop=-1; +// +// for(int i=chra.minIndex; i-1){ +// stop=i-1; +// +// VarLine v1=new VarLine(); +// v1.ploidy=(chrom<=22 ? 2 : chrom>=24 ? 
1 : (Byte)Variation.ploidyMap.get("?")); +// +// v1.haplotype=1; +// v1.chromosome=chrom; +// v1.beginLoc=start; +// v1.endLoc=stop; +// +// v1.ref="="; +// v1.call=null; +// +// v1.totalScore=-1; +// v1.xRef=-2; +// v1.xRefArray=null; +// v1.hapLink=-1; +// v1.varType=Variation.NOCALL; +// +// VarLine v2; +// if((chrom==23 && gender=='M') || chrom==24 || chrom==25){ +// v2=null; +// }else{ +// v2=new VarLine(); +// v2.ploidy=(chrom<=22 ? 2 : chrom>=24 ? 1 : (Byte)Variation.ploidyMap.get("?")); +// +// v2.haplotype=2; +// v2.chromosome=chrom; +// v2.beginLoc=start; +// v2.endLoc=stop; +// +// v2.ref="="; +// v2.call=null; +// +// v2.totalScore=-1; +// v2.xRef=-2; +// v2.xRefArray=null; +// v2.hapLink=-1; +// v2.varType=Variation.NOCALL; +// } +// +// +// start=-1; +// stop=-1; +// lines.add(v1); +// if(v2!=null){lines.add(v2);} +// } +// +// +// } +// +// return lines; +// } + + + + public static ArrayList generate(byte chrom, CoverageArray1 ca, int build, char gender){ + + assert(minCovered>=1); + assert(minHalfCovered>=1); + assert(minCovered>=minHalfCovered); + + + ArrayList lines=new ArrayList(256); + + assert(Data.GENOME_BUILD==build); + ChromosomeArray chra=Data.getChromosome(chrom); + + int start=-1; + int stop=-1; + + + byte level=-1; + + boolean haploid=(chrom==23 && gender=='M') || chrom==24 || chrom==25; + + for(int i=chra.minIndex; i=24 ? 1 : gender=='M' ? 1 : gender=='F' ? 2 : (Byte)Variation.ploidyMap.get("?")); + v1.haplotype=1; + v1.chromosome=chrom; + v1.beginLoc=start; + v1.endLoc=stop; + + v1.ref="="; + v1.call=null; + + v1.totalScore=-1; + v1.hapLink=-1; + v1.varType=Variation.NOCALL; + + lines.add(v1); + } + + if(level==0 || (level==1 && !haploid)){ + + VarLine v2=new VarLine(); + v2.ploidy=(chrom<=22 ? 2 : chrom>=24 ? 1 : gender=='M' ? 1 : gender=='F' ? 
2 : (Byte)Variation.ploidyMap.get("?")); + + v2.haplotype=2; + v2.chromosome=chrom; + v2.beginLoc=start; + v2.endLoc=stop; + + v2.ref="="; + v2.call=null; + + v2.totalScore=-1; + v2.hapLink=-1; + v2.varType=Variation.NOCALL; + + lines.add(v2); + } + + +// start=-1; + stop=-1; + level=newLevel; + start=i; + } + } + + return lines; + } + + + public static ArrayList removeDuplicateNocalls(List input, int copies){ + ArrayList[] haplo=splitHaplotypes(input, copies); + + ArrayList output=new ArrayList(256); +// System.err.println("A: copies="+copies+"; input.size="+input.size()+"; haplo="+haplo[0].size()+", "+haplo[1].size()); + for(ArrayList alv : haplo){ + VarLine temp=alv.size()==0 ? null : alv.get(0); + for(VarLine vl : alv){assert(vl.haplotype==temp.haplotype);} + ArrayList alv2=removeDuplicateNocallsHaplotyped(alv); +// assert(checkCopyCountHaplotyped(alv2)); //Very slow + +// output.addAll(removeDuplicateNocallsHaplotyped(alv2)); //This MUST be incorrect. + + output.addAll(alv2); + } + + Collections.sort(output); + + return output; + } + + public static boolean checkCopyCountHaplotyped(List list){ + + int max=0; + for(VarLine vl : list){ + if(vl.endLoc>max){max=vl.endLoc;} + } + + byte[] sum=new byte[max+1]; +// byte[] vars=new byte[max+1]; + byte[] nocalls=new byte[max+1]; + + for(VarLine vl : list){ + for(int i=vl.beginLoc; i<=vl.endLoc; i++){ + sum[i]++; + if(vl.isNoCall()){nocalls[i]++;} +// else{vars[i]++;} + } + } + + for(int i=0; i1){ + assert(false) : "chr"+list.get(0).chromosome+", "+i; + return false; + } + if(sum[i]>1){ + assert(false) : "chr"+list.get(0).chromosome+", "+i; + return false; + } + } + + return true; + } + + + /** All elements of input should share haplotype */ + public static ArrayList removeDuplicateNocallsHaplotyped(ArrayList input){ + + +// System.err.println("B: input.size="+input.size()); + + Collections.sort(input); + + ArrayList output=new ArrayList(256); + + boolean needToReprocess=false; + + VarLine prev=null; + + final boolean 
verbose=false; + + for(int i=0; i=current.beginLoc) : current; + +// final VarLine current2=current; + final VarLine prev2=prev; + +// if(current.chromosome==2 && (current.touches(8890433) || (prev!=null && prev.touches(8890433)))){ +// verbose=true; +// System.err.println("current="+current); +// System.err.println("touches? "+current.touches(8890433)); +// System.err.println("intersects? "+current.intersects(8890433)); +// }else if(prev==null && verbose){ +// System.err.println("current="+current); +// System.err.println("touches? "+current.touches(8890433)); +// System.err.println("intersects? "+current.intersects(8890433)); +// }else{ +// verbose=false; +// } + + boolean problem=prev!=null && prev.intersects(current); + if(problem){ + if(prev.isPoint() && (current.endLoc==prev.beginLoc || current.beginLoc==prev.beginLoc)){ + problem=false; + } + if(current.isPoint() && (prev.endLoc==current.beginLoc || prev.beginLoc==current.beginLoc)){ + problem=false; + } + } + + if(problem){ + boolean ncc=current.isNoCall(); + boolean ncp=prev.isNoCall(); + boolean refc=current.isRef(); + boolean refp=prev.isRef(); + boolean varc=current.isTrueVariation(); + boolean varp=prev.isTrueVariation(); + if(!needToReprocess){ +// System.err.println("\nNeed to reprocess because:"); +// System.err.println("\n"+prev); +// System.err.println("\n"+current); + } + needToReprocess=true; + + if((ncc && ncp) || (refc && refp) || (refc && ncp)/* || (refc && varp) || (ncp && varp)*/){ //Un-intersect them + current=current.clone(); + { + current.ref="="; + if(refc){current.call="=";} + else if(ncc){current.call=null;} + } + current.beginLoc=prev.endLoc+1; + if(current.beginLoc>current.endLoc){current=null;} + else{ + assert(!prev.intersects(current) + || (prev.isPoint() && (current.endLoc==prev.beginLoc || current.beginLoc==prev.beginLoc)) + || (current.isPoint() && (prev.endLoc==current.beginLoc || prev.beginLoc==current.beginLoc))) : + refp+", "+ncp+", "+refc+", "+ncc+"\n"+prev+"\n"+current; 
+ } + }else if(ncc || refc){ + current=current.clone(); + { + current.ref="="; + if(refc){current.call="=";} + else if(ncc){current.call=null;} + } + current.beginLoc=prev.endLoc+(prev.isPoint() ? 0 : 1); + if(current.beginLoc>current.endLoc){current=null;} + else{ + assert(!prev.intersects(current) + || (prev.isPoint() && (current.endLoc==prev.beginLoc || current.beginLoc==prev.beginLoc)) + || (current.isPoint() && (prev.endLoc==current.beginLoc || prev.beginLoc==current.beginLoc))) : + refp+", "+ncp+", "+refc+", "+ncc+"\n"+prev+"\n"+current; + } + }else if(ncp || refp){ + prev=prev.clone(); + { + prev.ref="="; + if(refp){prev.call="=";} + else if(ncp){prev.call=null;} + } + prev.endLoc=current.beginLoc-1; + if(prev.beginLoc>prev.endLoc){prev=null;} + else{ + assert(!prev.intersects(current) || + (prev.isNoCall() && prev.lengthRef()==1 && current.isPoint()) //Corner case for intersection + ) : "\n"+prev+"\n\n"+current+"\n"; + } + + if(prev2.endLoc>current.endLoc || (prev2.endLoc==current.endLoc && current.isPoint())){ + VarLine temp=prev2.clone(); + { + temp.ref="="; + if(temp.isRef()){temp.call="=";} + else if(temp.isNoCall()){temp.call=null;} + } + temp.beginLoc=current.endLoc+(current.isPoint() ? 0 : 1); + if(temp.beginLoc<=temp.endLoc){ + + assert(prev==null || !temp.intersects(prev)); + assert(!temp.intersects(current) + || (temp.isPoint() && (current.endLoc==temp.beginLoc || current.beginLoc==temp.beginLoc)) + || (current.isPoint() && (temp.endLoc==current.beginLoc || temp.beginLoc==current.beginLoc))) : + refp+", "+ncp+", "+refc+", "+ncc+"\n"+temp+"\n"+current; + + if(verbose){System.err.println("Current="+current+"\nprev="+prev+"\nAdding "+temp+"\n");} + + output.add(temp); +// needToReprocess=true; + } + } + }else{ + System.out.println("Warning: Deleted variation due to conflict! 
\n"+prev+"\n"+current+"\n"); +// assert(false) : "\n"+prev+"\n"+current+"\n"; + current=null; + } + } + + if(prev!=null){ + if(verbose){System.err.println("Current="+current+"\nAdding "+prev+"\n");} + output.add(prev); + } + prev=current; + } + if(prev!=null){output.add(prev);} + + if(needToReprocess){return removeDuplicateNocallsHaplotyped(output);} + + Collections.sort(output); + return output; + } + + + public static ArrayList[] splitHaplotypes(List input, int copies){ + ArrayList[] haplo=new ArrayList[2]; + for(int i=0; i(); + } + for(VarLine vl : input){ + if(vl.haplotype==1){ + haplo[0].add(vl); + }else if(vl.haplotype==2){ + haplo[1].add(vl); + }else{ + assert(vl.haplotype==3); + if(copies>1){ + VarLine[] vl2=vl.splitLine(); + haplo[0].add(vl2[0]); + haplo[1].add(vl2[1]); + }else{ + haplo[0].add(vl); + } + } + } + + return haplo; + } + + + public static int minCovered=2; + public static int minHalfCovered=1; + +} diff --git a/current/driver/GetReads.java b/current/driver/GetReads.java new file mode 100755 index 0000000..a8f9ada --- /dev/null +++ b/current/driver/GetReads.java @@ -0,0 +1,404 @@ +package driver; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashSet; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; + +import align2.ListNum; +import align2.Tools; +import dna.Data; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ByteFile1; +import fileIO.ReadWrite; +import fileIO.FileFormat; +import fileIO.TextStreamWriter; + +/** + * Grab reads with specified numbers from a file. + * TODO Note that much of this is ripped directly from ReformatReads, but is incorrect, because this class does not support dual output files. 
+ * @author Brian Bushnell + * @date Jul 10, 2013 + * + */ +public class GetReads { + + public static void main(String[] args){ + new GetReads(args); + } + + public GetReads(String[] args){ + if(args==null || args.length==0){ + throw new RuntimeException("No arguments."); + } + + for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} + outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + Timer t=new Timer(); + t.start(); + + String in1=null; + String in2=null; + + String qfin1=null; + String qfin2=null; + + String out1=null; + String out2=null; + + String qfout1=null; + String qfout2=null; + + boolean parsecustom=false; + boolean errorState=false; + long maxReads=-1; + int passes=1; + boolean testsize=false; + boolean overwrite=false; + float samplerate=1f; + long sampleseed=1; + + boolean setInterleaved=false; //Whether it was explicitly set. + + byte qin=-1; + byte qout=-1; + + FastaReadInputStream.SPLIT_READS=false; + stream.FastaReadInputStream.MIN_READ_LEN=1; + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + + HashSet table=new HashSet(); + + for(int i=0; i1 ? 
split[1] : "true"); + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null") || a.equals(in2)){ + // do nothing + }else if(a.equals("id") || a.equals("number")){ + String[] b2=b.split(","); + for(String c : b2){ + final long x, y; + if(c.indexOf('-')>=0){ + String[] c2=c.split("-"); + assert(c2.length==2) : c; + x=Long.parseLong(c2[0]); + y=Long.parseLong(c2[1]); + }else{ + x=y=Long.parseLong(c); + } + for(long z=x; z<=y; z++){ + table.add(z); + } + } + }else if(a.equals("passes")){ + passes=Integer.parseInt(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; +// align2.FastaReadInputStream2.verbose=verbose; +// align2.FastqReadInputStream.verbose=verbose; + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + ReadWrite.USE_PIGZ=Tools.parseBoolean(b); + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("build") || a.equals("genome")){ + Data.setGenome(Integer.parseInt(b)); + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + if(b.indexOf('#')>-1 && !new File(b).exists()){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else 
if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){ + out1=b; + if(b.indexOf('#')>-1){ + out1=b.replace("#", "1"); + out2=b.replace("#", "2"); + } + }else if(a.equals("out2") || a.equals("output2")){ + out2=b; + }else if(a.equals("qfin") || a.equals("qfin1")){ + qfin1=b; + }else if(a.equals("qfout") || a.equals("qfout1")){ + qfout1=b; + }else if(a.equals("qfin2")){ + qfin2=b; + }else if(a.equals("qfout2")){ + qfout2=b; + }else if(a.equals("parsecustom")){ + parsecustom=Tools.parseBoolean(b); + }else if(a.equals("testsize")){ + testsize=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.startsWith("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else 
if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + setInterleaved=true; + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + setInterleaved=true; + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + setInterleaved=true; + } + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("samplerate")){ + samplerate=Float.parseFloat(b); + assert(samplerate<=1f && samplerate>=0f) : "samplerate="+samplerate+"; should be between 0 and 1"; + }else if(a.equals("sampleseed")){ + sampleseed=Long.parseLong(b); + }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){ + int x=Integer.parseInt(b); + stream.FastaReadInputStream.MIN_READ_LEN=(x>0 ? 
x : Integer.MAX_VALUE); + }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + in1=arg; + if(arg.indexOf('#')>-1 && !new File(arg).exists()){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else if(out1==null && i==1 && !arg.contains("=")){ + out1=arg; + if(arg.indexOf('#')>-1){ + out1=b.replace("#", "1"); + out2=b.replace("#", "2"); + } + }else{ + System.err.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); +// if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;} + + if(in1==null){ + throw new RuntimeException("Error - at least one input file is required."); + } + + if(out1==null){ + if(out2!=null){ + throw new RuntimeException("Error - cannot define out2 without defining out1."); + } + out1="stdout"; + } + + if(!setInterleaved){ + assert(in1!=null && out1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n"; + if(in2!=null){ //If there are 2 input streams. + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else{ //There is one input stream. 
+ if(out2!=null){ + FASTQ.FORCE_INTERLEAVED=true; + FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + } + } + + if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} + if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;} + + if(!Tools.testOutputFiles(overwrite, false, out1, out2)){ + throw new RuntimeException("\n\nOVERWRITE="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n"); + } + + FASTQ.PARSE_CUSTOM=parsecustom; + + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + + FileFormat ffin=FileFormat.testInput(in1, 0, null, true, true); + FileFormat ffout=FileFormat.testOutput(out1, 0, null, true, overwrite, false); + + + final boolean useSharedHeader=(ffin!=null && ffout!=null && ffin.samOrBam() && ffout.samOrBam()); + + if(ffin!=null && ffout!=null && ffin.samOrBam() && (ffout.samOrBam() || ffout.bread())){ + throw new RuntimeException("\nDirect conversion of sam to sam or bread are not currently supported.\nAll other conversions are possible."); + } + + + ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, useSharedHeader, ff1, ff2); + } + + cris.setSampleRate(samplerate, sampleseed); + outstream.println("Input is "+(cris.paired() ? 
"paired" : "unpaired")); + Thread cristhread=new Thread(cris); + cristhread.start(); + + TextStreamWriter tsw=new TextStreamWriter(out1, overwrite, false, false); + tsw.start(); + + + long readsProcessed=0; + long basesProcessed=0; + + for(int pass=1; pass<=passes; pass++){ +// outstream.println("pass="+pass); + if(pass>1){ + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, useSharedHeader, ff1, ff2); + cris.setSampleRate(samplerate, sampleseed); + cristhread=new Thread(cris); + cristhread.start(); + } + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(ffin.samOrBam() || (r.mate!=null)==cris.paired()); + } + + while(reads!=null && reads.size()>0 && !table.isEmpty()){ + + for(Read r1 : reads){ + { + readsProcessed++; + basesProcessed+=r1.bases==null ? 0 : r1.bases.length; + } + Read r2=r1.mate; + if(r2!=null){ + readsProcessed++; + basesProcessed+=r2.bases==null ? 0 : r2.bases.length; + } + + if(table.remove(r1.numericID)){ + tsw.println(r1); + if(r2!=null){tsw.println(r2);} + if(table.isEmpty()){break;} + } + } + + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + errorState|=ReadWrite.closeStream(cris); + } + + if(tsw!=null){ + tsw.poisonAndWait(); + } + + errorState|=(cris.errorState()); + + t.stop(); + + double rpnano=readsProcessed/(double)(t.elapsed); + double bpnano=basesProcessed/(double)(t.elapsed); + + String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m"); + String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? 
(basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m"); + + while(rpstring.length()<8){rpstring=" "+rpstring;} + while(bpstring.length()<8){bpstring=" "+bpstring;} + + outstream.println("Time: \t"+t); + outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000)); + outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + if(testsize){ + long bytesProcessed=(new File(in1).length()+(in2==null ? 0 : new File(in2).length()))*passes; + double xpnano=bytesProcessed/(double)(t.elapsed); + String xpstring=(bytesProcessed<100000 ? ""+bytesProcessed : bytesProcessed<100000000 ? (bytesProcessed/1000)+"k" : (bytesProcessed/1000000)+"m"); + while(xpstring.length()<8){xpstring=" "+xpstring;} + outstream.println("Bytes Processed: "+xpstring+" \t"+String.format("%.2fm bytes/sec", xpnano*1000)); + } + + if(errorState){ + throw new RuntimeException("GetReads terminated in an error state; the output may be corrupt."); + } + + } + + private PrintStream outstream=System.err; + public static boolean verbose=false; + +} diff --git a/current/driver/GetSequence.java b/current/driver/GetSequence.java new file mode 100755 index 0000000..e3c8676 --- /dev/null +++ b/current/driver/GetSequence.java @@ -0,0 +1,99 @@ +package driver; + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.Data; +import dna.Gene; +import dna.Range; + +public class GetSequence { + + public static void main(String[] args){ + + byte chrom=-1; + byte strand=Gene.PLUS; + + /** Change base to zero or one for the coordinates mode */ + int base=0; + +// char c=args[1].charAt(0); +// if(c=='+'){strand=Gene.PLUS;} +// else if(c=='-'){strand=Gene.MINUS;} +// else{assert(false) : "Invalid strand: "+args[1];} + + int firstLoc=-1; + for(int i=0; i=Integer.MAX_VALUE){ + System.out.println("Found overflow ID "+ssr.numericID+" at line "+line); + System.out.println("ssr="+ssr.toText()); + System.out.println("raw="+s2); + 
System.out.println("All:\n"+Arrays.toString(split)); + System.out.println(); + break; + } + } + line++; + } + tf.close(); + System.out.println("Max ID was "+max); + + } + +} diff --git a/current/driver/MakeTestScript.java b/current/driver/MakeTestScript.java new file mode 100755 index 0000000..140b54f --- /dev/null +++ b/current/driver/MakeTestScript.java @@ -0,0 +1,368 @@ +package driver; + +public class MakeTestScript { + + + public static void main(String[] args){ + + assert(args.length>=1) : "Please enter number of reads."; + numReads=Integer.parseInt(args[0]); + readlen=Integer.parseInt(args[1]); + + String mode=args[2]; + String extra=(args.length>3 ? args[3] : "EXTRA"); + + String printtime="java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime "; + String gradesam="java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.GradeSamFile "; + String time=mode+"Time.txt"; + + String[] strings=null; + +// strings=new String[] { +// "/house/homedirs/b/bushnell/ssaha2/ssaha2 -solexa -outfile #S.sam -best -1 -output sam_soft -save hg37 " + +// "reads_B1_#Rx#Lbp_#S.fastq", +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime defaultTime.txt", +// gradesam+"#S.sam #R ssaha2", +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime defaultTime.txt" +// }; + + if(mode.equalsIgnoreCase("bwa")){ + strings=new String[] { +// printtime+time+" false", +// "memtime /house/homedirs/b/bushnell/bwa/bwa aln -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>temp_bwa.sai", +// "memtime /house/homedirs/b/bushnell/bwa/bwa samse "+extra+" temp_bwa.sai reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sam", +// printtime+time, +// gradesam+"bwa_#S_r#Rx#L.sam #R", + + printtime+time+" false", + "/house/homedirs/b/bushnell/bwa/bwa aln -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sai", + "/house/homedirs/b/bushnell/bwa/bwa samse "+extra+" bwa_#S_r#Rx#L.sai reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sam", + 
printtime+time, + gradesam+"bwa_#S_r#Rx#L.sam #R", + }; + } + + if(mode.equalsIgnoreCase("bwamem")){ + strings=new String[] { +// printtime+time+" false", +// "memtime /house/homedirs/b/bushnell/bwa/bwa aln -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>temp_bwa.sai", +// "memtime /house/homedirs/b/bushnell/bwa/bwa samse "+extra+" temp_bwa.sai reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sam", +// printtime+time, +// gradesam+"bwa_#S_r#Rx#L.sam #R", + + printtime+time+" false", + "/house/homedirs/b/bushnell/bwa74/bwa mem -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>bwamem_#S_r#Rx#L.sam", + printtime+time, + gradesam+"bwamem_#S_r#Rx#L.sam #R", + }; + } + + if(mode.equalsIgnoreCase("bwasw")){ + strings=new String[] { +// printtime+time+" false", +// "memtime /house/homedirs/b/bushnell/bwa/bwa aln -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fastq 1>temp_bwa.sai", +// "memtime /house/homedirs/b/bushnell/bwa/bwa samse "+extra+" temp_bwa.sai reads_B1_#Rx#Lbp_#S.fastq 1>bwa_#S_r#Rx#L.sam", +// printtime+time, +// gradesam+"bwa_#S_r#Rx#L.sam #R", + + printtime+time+" false", + "/house/homedirs/b/bushnell/bwa/bwa bwasw -b5 -q2 -r1 -z10 -t 32 "+extra+" reads_B1_#Rx#Lbp_#S.fasta 1>bwa_#S_r#Rx#L.sam", + printtime+time, + gradesam+"bwa_#S_r#Rx#L.sam #R", + }; + } + + if(mode.startsWith("bbmap")){ + int k=13; + String s2=mode.replaceFirst("bbmap", ""); + if(s2.length()>0){ + k=Integer.parseInt(s2); + } + strings=new String[] { + printtime+time+" false", + "memtime java -ea -Xmx106g -cp /house/homedirs/b/bushnell/beta18/ " + + "align2.BBMap in=reads_B1_#Rx#Lbp_#S.fastq out=bbmap"+k+"_#S_r#Rx#L.sam overwrite k="+k+" printtoerr", + printtime+time, + gradesam+"bbmap"+k+"_#S_r#Rx#L.sam #R", + }; + } + + if(mode.equalsIgnoreCase("bowtie2")){ + strings=new String[] { + printtime+time+" false", + "memtime bowtie2 -x bow2ref -U reads_B1_#Rx#Lbp_#S.fastq -S bowtie2_#S_r#Rx#L.sam --phred33 -p 32", + printtime+time, + gradesam+"bowtie2_#S_r#Rx#L.sam #R", + }; + } + + if(mode.equalsIgnoreCase("gsnap")){ + 
strings=new String[] { + printtime+time+" false", + "memtime /house/homedirs/b/bushnell/gsnap/bin/gsnap -t 32 -d "+extra+" -A sam reads_B1_#Rx#Lbp_#S.fastq > gsnap_#S_r#Rx#L.sam", + printtime+time, + gradesam+"gsnap_#S_r#Rx#L.sam #R", + }; + } + + +// strings=new String[] { +// "bowtie --best -y --chunkmbs 1024 --strata -m 1 -k 2 -v 3 -p 24 -t -q -S HG37" + +// " reads_B1_#Rx#Lbp_#S.fastq #S_bowtie.sam", +// +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime bowtieTime.txt", +// gradesam+"#S_bowtie.sam #R", +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime bowtieTime.txt" +// }; + + +// strings=new String[] { +// "bfast match -T $TMPDIR/ -n 16 -f hg19.fa -r reads_B1_#Rx#Lbp_#S.fastq > $TMPDIR/#S.bmf", +// "bfast localalign -n 16 -f hg19.fa -m $TMPDIR/#S.bmf > $TMPDIR/#S.baf", +//// "bfast postprocess -n 16 -a 3 -f hg19.fa -i $TMPDIR/#S.baf > #S.sam", +//// "bfast postprocess -n 16 -a 3 -m 20 -f hg19.fa -i $TMPDIR/#S.baf > #S_r#Rx#L.sam", +// "bfast postprocess -n 16 -M 20 -f hg19.fa -i $TMPDIR/#S.baf > #S_r#Rx#L.sam", +// +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime bfastTime.txt", +// gradesam+"#S_r#Rx#L.sam #R", +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime bfastTime.txt" +// }; + + if(mode.equalsIgnoreCase("smalt")){ + strings=new String[] { + printtime+time+" false", + "memtime /house/homedirs/b/bushnell/smalt/smalt_x86_64 map -n 32 -f sam -o smalt_#S_r#Rx#L.sam smaltindex reads_B1_#Rx#Lbp_#S.fastq", + printtime+time, + gradesam+"smalt_#S_r#Rx#L.sam #R ssaha2", + }; + } + + if(mode.equalsIgnoreCase("snap")){ + strings=new String[] { + printtime+time+" false", + "memtime /house/homedirs/b/bushnell/snap/snap single snapref reads_B1_#Rx#Lbp_#S.fastq -o snap_#S_r#Rx#L.sam -t 32 -b", + printtime+time, + gradesam+"snap_#S_r#Rx#L.sam #R", + }; + } + + if(mode.equalsIgnoreCase("masai")){ + strings=new String[] { + printtime+time+" false", + "memtime 
/house/homedirs/b/bushnell/masai/masai_mapper --output-format sam "+extra+" reads_B1_#Rx#Lbp_#S.fastq", + printtime+time, + gradesam+"reads_B1_#Rx#Lbp_#S.sam #R", + }; + } + + if(mode.equalsIgnoreCase("blasr")){ + System.out.println("source /house/sdm/pacbio/smrtanalysis-installs/smrtanalysis-2.0.0/etc/setup.sh\n"); + strings=new String[] { + printtime+time+" false", + "memtime blasr reads_B1_#Rx#Lbp_#S.fastq "+extra+" -sam -out blasr_#S_r#Rx#L.sam -bestn 1 -nproc 32", + printtime+time, + gradesam+"blasr_#S_r#Rx#L.sam #R blasr", + }; + } + +// strings=new String[] { +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt", +// "./soap -p 24 -a reads_B1_#Rx#Lbp_#S.fastq -D hg37.fa.index -o #S_r#Rx#L.soap", +// "perl soap2sam.pl -p #S_r#Rx#L.soap > #S_r#Rx#L.sam", +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt", +// gradesam+"#S_r#Rx#L.sam #R", +// }; + +// strings=new String[] { +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt", +// "./bin/gmapper-ls reads_B1_#Rx#Lbp_#S.fastq --single-best-mapping --qv-offset 33 -L hg37 -N 24 -o 5 -h 80% > #S_r#Rx#L.sam", +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt", +// gradesam+"#S_r#Rx#L.sam #R", +// }; + +// strings=new String[] { +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt", +// "./bin/MosaikBuild -q reads_B1_#Rx#Lbp_#S.fastq -out $TMPDIR/reads_B1_#Rx100bp_#S_chr1-25.dat -st illumina", +// "./bin/MosaikAligner -in $TMPDIR/reads_B1_#Rx100bp_#S_chr1-25.dat -out $TMPDIR/reads_B1_#Rx100bp_#S_chr1-25_aligned.dat -ia hg37_ref.dat -hs 15 -bw=29 -j hg37_jumpdb -act 20 -mm 32 -mhp 100 -p 32 -m unique", +// "./bin/MosaikText -in $TMPDIR/reads_B1_#Rx100bp_#S_chr1-25_aligned.dat -sam #S_r#Rx#L.sam", +// "java -ea -Xmx96m -cp /house/homedirs/b/bushnell/beta18/ align2.PrintTime mapTime.txt", +// gradesam+"#S_r#Rx#L.sam #R", +// }; + + 
int[] blank=new int[] {0, 0, 0, 0, 0}; + + int preload=100; + if(mode.equalsIgnoreCase("masai")){ + preload=1000; + } + print(strings, blank, preload); + print(strings, blank, preload); + print(strings, blank, preload); + print(strings, blank, preload); + for(int[] array : sets){ + print(strings, array, numReads); + } + + } + + private static void print(String[] array, int[] blank, int x) { + + int rl=readlen; + if(blank.length>5){rl=blank[5];} + + String counts=(blank[0]+"S_"+blank[1]+"I_"+blank[2]+"D_"+blank[3]+"U_"+blank[4]+"N"); + String reads=""+x; + String len=""+rl; + + for(String s : array){ + String s2=s.replaceAll("#S", counts).replaceAll("#R", reads).replaceAll("#L", len); + System.out.println(s2); + } + System.out.println(); + + } + + public static int numReads=400000; + public static int readlen=150; + + public static final int[][] sets=new int[][] { + {0, 0, 0, 0, 0}, + {1, 0, 0, 0, 0}, + {2, 0, 0, 0, 0}, + {3, 0, 0, 0, 0}, + {4, 0, 0, 0, 0}, + {5, 0, 0, 0, 0}, + {6, 0, 0, 0, 0}, + {7, 0, 0, 0, 0}, + {8, 0, 0, 0, 0}, + {10, 0, 0, 0, 0}, + {12, 0, 0, 0, 0}, + {14, 0, 0, 0, 0}, + {16, 0, 0, 0, 0}, + {18, 0, 0, 0, 0}, + {20, 0, 0, 0, 0}, + {24, 0, 0, 0, 0}, + {28, 0, 0, 0, 0}, + {32, 0, 0, 0, 0}, + {36, 0, 0, 0, 0}, + {40, 0, 0, 0, 0}, + + {0, 1, 0, 0, 0}, + {0, 2, 0, 0, 0}, + {0, 3, 0, 0, 0}, + {0, 4, 0, 0, 0}, + {0, 5, 0, 0, 0}, + {0, 6, 0, 0, 0}, + {0, 7, 0, 0, 0}, + {0, 8, 0, 0, 0}, + {0, 10, 0, 0, 0}, + {0, 12, 0, 0, 0}, + {0, 14, 0, 0, 0}, + {0, 16, 0, 0, 0}, + {0, 18, 0, 0, 0}, + {0, 20, 0, 0, 0}, + {0, 24, 0, 0, 0}, + {0, 28, 0, 0, 0}, + {0, 32, 0, 0, 0}, + {0, 36, 0, 0, 0}, + {0, 40, 0, 0, 0}, + + {0, 0, 1, 0, 0}, + {0, 0, 2, 0, 0}, + {0, 0, 3, 0, 0}, + {0, 0, 4, 0, 0}, + {0, 0, 5, 0, 0}, + {0, 0, 6, 0, 0}, + {0, 0, 7, 0, 0}, + {0, 0, 8, 0, 0}, + {0, 0, 10, 0, 0}, + {0, 0, 12, 0, 0}, + {0, 0, 14, 0, 0}, + {0, 0, 16, 0, 0}, + {0, 0, 18, 0, 0}, + {0, 0, 20, 0, 0}, + {0, 0, 24, 0, 0}, + {0, 0, 28, 0, 0}, + {0, 0, 32, 0, 0}, + {0, 0, 36, 0, 0}, + 
/**
 * Prints a shell script that runs an aligner command template over a grid of
 * synthetic read files and grades each resulting SAM file.
 *
 * Every command template may contain two placeholders:
 * "#S" is replaced by an error-profile tag built from one row of {@link #sets}
 * (for example "2S_0I_0D_0U"), and "#R" is replaced by the number of reads.
 */
public class MakeTestScriptScoreOnly {

	/** Number of times the template is printed for the error-free set before the real grid. */
	private static final int LEADING_RUNS=4;

	/** Read count used for those leading runs. */
	private static final int LEADING_READS=100;

	/**
	 * Writes the script to standard out.
	 *
	 * @param args Exactly one element: the number of reads per synthetic file.
	 * @throws IllegalArgumentException if the argument count is not exactly one.
	 * @throws NumberFormatException if the argument is not an integer.
	 */
	public static void main(String[] args){

		//An explicit check rather than an assert, so the usage error is reported
		//even when the JVM runs without -ea.
		if(args==null || args.length!=1){
			throw new IllegalArgumentException("Please enter number of reads.");
		}
		numReads=Integer.parseInt(args[0]);

		//Alternative command templates for other aligners, kept for reference.
		//Swap one in for the active array below to generate a script for that tool.

//		String[] strings=new String[] {
//				"/work/bbushnell/ssaha2/ssaha2 -solexa -outfile #S.sam -best -1 -output sam_soft -save hg37 " +
//				"/work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime defaultTime.txt",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S.sam #R ssaha2",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime defaultTime.txt"
//		};


//		String[] strings=new String[] {
//				"bwa aln -t 22 bs_ /work/bbushnell/synth/reads_B1_100000x100bp_#S_chr1-25.fq > temp_default.sai",
//				"bwa samse bs_ temp_default.sai /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq > #S_default.sam",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime defaultTime.txt",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_default.sam #R",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime defaultTime.txt"
//		};


//		String[] strings=new String[] {
//				"java -ea -Xms24g -Xmx31g -server -XX:+UseNUMA -XX:+AggressiveOpts -XX:+UseCompressedOops " +
//				"align.TestIndex11f 1 25 100 0 /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq null " +
//				"outfile=#S_bbmap11f.sam cs=false threads=22 paired=false pairlen=100 build=37 match=short " +
//				"removeambiguous=false fastqparsecustom overwrite savepar=false",
//
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bbmap11fTime.txt",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_bbmap11f.sam #R",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bbmap11fTime.txt"
//		};


//		String[] strings=new String[] {
//				"bowtie --best -y --chunkmbs 1024 --strata -m 1 -k 2 -v 3 -p 24 -t -q -S HG37" +
//				" /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq #S_bowtie.sam",
//
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bowtieTime.txt",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_bowtie.sam #R",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bowtieTime.txt"
//		};


//		String[] strings=new String[] {
//				"bfast match -T $TMPDIR/ -n 16 -f hg19.fa -r /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq > $TMPDIR/#S.bmf",
//				"bfast localalign -n 16 -f hg19.fa -m $TMPDIR/#S.bmf > $TMPDIR/#S.baf",
////				"bfast postprocess -n 16 -a 3 -f hg19.fa -i $TMPDIR/#S.baf > #S.sam",
////				"bfast postprocess -n 16 -a 3 -m 20 -f hg19.fa -i $TMPDIR/#S.baf > #S_r#R.sam",
//				"bfast postprocess -n 16 -M 20 -f hg19.fa -i $TMPDIR/#S.baf > #S_r#R.sam",
//
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bfastTime.txt",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_r#R.sam #R",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime bfastTime.txt"
//		};

//		String[] strings=new String[] {
//				"smalt_x86_64 map -n 8 -a -f samsoft -o #S_r#R.sam hg37 /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_r#R.sam #R ssaha2",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
//		};

//		String[] strings=new String[] {
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
//				"./soap -p 24 -a /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq -D hg37.fa.index -o #S_r#R.soap",
//				"perl soap2sam.pl -p #S_r#R.soap > #S_r#R.sam",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
//				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_r#R.sam #R",
//		};

		//Active template.
		String[] strings=new String[] {
				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
				"./bin/gmapper-ls /work/bbushnell/synth/reads_B1_#Rx100bp_#S_chr1-25.fq --single-best-mapping --qv-offset 33 -L hg37 -N 24 -o 5 -h 80% > #S_r#R.sam",
				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.PrintTime mapTime.txt",
				"java -ea -Xmx96m -cp /work/bbushnell/java/ align.GradeSamFile #S_r#R.sam #R",
		};

		int[] blank=new int[] {0, 0, 0, 0};

		//The template is emitted several times for the small error-free set first.
		//NOTE(review): presumably warm-up runs; the source does not state the reason.
		for(int i=0; i<LEADING_RUNS; i++){
			print(strings, blank, LEADING_READS);
		}
		for(int[] array : sets){
			print(strings, array, numReads);
		}

	}

	/**
	 * Prints one copy of the command template followed by a blank line.
	 *
	 * @param array Command templates containing the "#S" and "#R" placeholders.
	 * @param blank Four counts used to build the "#S" tag, formatted as
	 *              {@code <0>S_<1>I_<2>D_<3>U}.
	 * @param x Read count substituted for "#R".
	 */
	private static void print(String[] array, int[] blank, int x) {

		String counts=(blank[0]+"S_"+blank[1]+"I_"+blank[2]+"D_"+blank[3]+"U");
		String reads=""+x;

		for(String s : array){
			//Literal replacement: the placeholders are not regular expressions.
			String s2=s.replace("#S", counts).replace("#R", reads);
			System.out.println(s2);
		}
		System.out.println();

	}

	/** Number of reads per file for the main grid; overwritten by {@link #main}. */
	public static int numReads=400000;

	/**
	 * Error-profile grid. Each row supplies the four counts that form the "#S" tag
	 * (suffixes S, I, D and U, in that order); only one column is non-zero per row.
	 */
	public static final int[][] sets=new int[][] {
		{0, 0, 0, 0},
		{1, 0, 0, 0},
		{2, 0, 0, 0},
		{3, 0, 0, 0},
		{4, 0, 0, 0},
		{5, 0, 0, 0},
		{6, 0, 0, 0},
		{7, 0, 0, 0},
		{8, 0, 0, 0},
		{10, 0, 0, 0},
		{12, 0, 0, 0},
		{14, 0, 0, 0},
		{16, 0, 0, 0},
		{18, 0, 0, 0},
		{20, 0, 0, 0},
		{24, 0, 0, 0},
		{28, 0, 0, 0},
		{32, 0, 0, 0},
		{36, 0, 0, 0},
		{40, 0, 0, 0},

		{0, 1, 0, 0},
		{0, 2, 0, 0},
		{0, 3, 0, 0},
		{0, 4, 0, 0},
		{0, 5, 0, 0},
		{0, 6, 0, 0},
		{0, 7, 0, 0},
		{0, 8, 0, 0},
		{0, 10, 0, 0},
		{0, 12, 0, 0},
		{0, 14, 0, 0},
		{0, 16, 0, 0},
		{0, 20, 0, 0},
		{0, 24, 0, 0},
		{0, 28, 0, 0},
		{0, 32, 0, 0},
		{0, 36, 0, 0},
		{0, 40, 0, 0},

		{0, 0, 1, 0},
		{0, 0, 2, 0},
		{0, 0, 3, 0},
		{0, 0, 4, 0},
		{0, 0, 5, 0},
		{0, 0, 6, 0},
		{0, 0, 7, 0},
		{0, 0, 8, 0},
		{0, 0, 10, 0},
		{0, 0, 12, 0},
		{0, 0, 14, 0},
		{0, 0, 16, 0},
		{0, 0, 20, 0},
		{0, 0, 24, 0},
		{0, 0, 28, 0},
		{0, 0, 32, 0},
		{0, 0, 48, 0},
		{0, 0, 64, 0},
		{0, 0, 128, 0},
		{0, 0, 192, 0},
		{0, 0, 256, 0},
		{0, 0, 512, 0},
		{0, 0, 1000, 0},
		{0, 0, 2000, 0},
		{0, 0, 3000, 0},
		{0, 0, 4000, 0},
		{0, 0, 6000, 0},
		{0, 0, 8000, 0},
		{0, 0, 10000, 0},
		{0, 0, 12000, 0},
		{0, 0, 14000, 0},
		{0, 0, 16000, 0},
		{0, 0, 20000, 0},
		{0, 0, 24000, 0},
		{0, 0, 28000, 0},
		{0, 0, 32000, 0},

		{0, 0, 0, 1},
		{0, 0, 0, 2},
		{0, 0, 0, 3},
		{0, 0, 0, 4},
		{0, 0, 0, 5},
		{0, 0, 0, 6},
		{0, 0, 0, 7},
		{0, 0, 0, 8},
		{0, 0, 0, 10},
		{0, 0, 0, 12},
		{0, 0, 0, 14},
		{0, 0, 0, 16},
		{0, 0, 0, 20},
		{0, 0, 0, 24},
		{0, 0, 0, 28},
		{0, 0, 0, 32},
		{0, 0, 0, 36},
		{0, 0, 0, 40}
	};

}
+ + for(int i=start; i3); + number=((number<<2)|code); + } + if(!invalid){ + count++; + sum+=freqDif[number]; + }else{ + return 0; + } + } + + return count>0 ? (float)(sum/count) : 0; + } + + + + + + private static final MotifProbsN mAG=MotifProbsN.makeMotif("AG Exon Starts MP2", 13, 11, 2); + private static final MotifProbsN mAC=MotifProbsN.makeMotif("AC Exon Starts MP2", 13, 11, 2); + private static final MotifProbsN mATG=MotifProbsN.makeMotif("ATG Exon Starts MP2", 13, 11, 2); + + private static final MotifProbsN mGT=MotifProbsN.makeMotif("GT Exon Stops MP2", 10, 3, 2); + private static final MotifProbsN mGC=MotifProbsN.makeMotif("GC Exon Stops MP2", 10, 3, 2); + + private static final MotifProbsN mGStartATG=MotifProbsN.makeMotif("Gene Starts MP2", 13, 11, 2); + + private static final MotifProbsN mGStopTAA=MotifProbsN.makeMotif("TAA Gene Stops MP2", 13, 11, 2); + private static final MotifProbsN mGStopTAG=MotifProbsN.makeMotif("TAG Gene Stops MP2", 13, 11, 2); + private static final MotifProbsN mGStopTGA=MotifProbsN.makeMotif("TGA Gene Stops MP2", 13, 11, 2); + + private static final MotifMulti mGStart=new MotifMulti("Gene Starts MP2", mGStartATG); + private static final MotifMulti mEStart=new MotifMulti("Exon Starts MP2", mAG, mAC); + private static final MotifMulti mEStop=new MotifMulti("Exon Stops MP2", mGT, mGC); + private static final MotifMulti mGStop=new MotifMulti("Gene Stops MP2", mGStopTAA, mGStopTAG, mGStopTGA); + + + private static final int length=2; + + //Overall Frequency Exonic + + public static final float[] exonicFreq1={0.259195f, 0.260530f, 0.260441f, 0.219835f}; + + //Overall Frequency Non-Exonic + + public static final float[] nonExonicFreq1={0.277111f, 0.204189f, 0.213443f, 0.305257f}; + + //Overall Frequency Exonic + + public static final float[] exonicFreq2={0.071395f, 0.055355f, 0.077256f, 0.052618f, + 0.079593f, 0.077505f, 0.032685f, 0.071248f, 0.075189f, 0.070017f, 0.070666f, + 0.045554f, 0.032210f, 0.057977f, 0.079080f, 0.051651f}; + 
+ //Overall Frequency Non-Exonic + + public static final float[] nonExonicFreq2={0.086472f, 0.047310f, 0.070451f, 0.072291f, + 0.069003f, 0.055260f, 0.011722f, 0.071913f, 0.058469f, 0.045772f, 0.056984f, + 0.054175f, 0.062555f, 0.059560f, 0.076273f, 0.101790f}; + + public static final float[] freqDif=( + length==2 ? makeDif(exonicFreq2, nonExonicFreq2) : + length==1 ? makeDif(exonicFreq1, nonExonicFreq1) : + null); + + public static final float[] makeDif(float[] a, float[] b){ + float[] dif=new float[a.length]; + for(int i=0; i table1=makeTable(lines1, col1, 1); + Hashtable table2=makeTable(lines2, col2, 1); + + HashSet keySet=new HashSet(); + keySet.addAll(table1.keySet()); + keySet.addAll(table2.keySet()); + String[] keys=keySet.toArray(new String[0]); + Arrays.sort(keys); + + StringBuilder sb=new StringBuilder(); + sb.append(toString(lines1[0], lines2[0], maxWidth1, maxWidth2)); + sb.append('\n'); + + for(String key : keys){ + String[] line1=table1.get(key); + String[] line2=table2.get(key); + + if(line1==null){ + line1=new String[col1+1]; + line1[col1]=line2[col2]; + } + + sb.append(toString(line1, line2, maxWidth1, maxWidth2)); + sb.append('\n'); + } + + return sb; + } + + private static StringBuilder toString(String[] a, String[] b, int alen, int blen){ + StringBuilder sb=new StringBuilder(); + for(int i=0; ii && a[i]!=null){ + sb.append(a[i]); + } + sb.append('\t'); + } + for(int i=0; ii && b[i]!=null){ + sb.append(b[i]); + } + sb.append('\t'); + } + return sb; + } + + private static Hashtable makeTable(String[][] lines, int col, int firstLine) { + Hashtable table=new Hashtable(); + for(int i=firstLine; i table1=makeTable(lines1, col1, 1); + Hashtable table2=makeTable(lines2, col2, 1); + + HashSet keySet=new HashSet(); + keySet.addAll(table1.keySet()); + keySet.addAll(table2.keySet()); + String[] keys=keySet.toArray(new String[0]); + Arrays.sort(keys); + + StringBuilder sb=new StringBuilder(); + sb.append(toString(lines1[0], lines2[0], maxWidth1, 
maxWidth2)); + sb.append('\n'); + + for(String key : keys){ + String[] line1=table1.get(key); + String[] line2=table2.get(key); + + if(line1==null){ + line1=new String[col1+1]; + line1[col1]=line2[col2]; + } + + sb.append(toString(line1, line2, maxWidth1, maxWidth2)); + sb.append('\n'); + } + + return sb; + } + + private static StringBuilder toString(String[] a, String[] b, int alen, int blen){ + StringBuilder sb=new StringBuilder(); + for(int i=0; ii && a[i]!=null){ + sb.append(a[i]); + } + sb.append('\t'); + } + for(int i=0; ii && b[i]!=null){ + sb.append(b[i]); + } + sb.append('\t'); + } + return sb; + } + + private static Hashtable makeTable(String[][] lines, int col, int firstLine) { + Hashtable table=new Hashtable(); + for(int i=firstLine; i1 && !Character.isDigit(name.charAt(name.length()-1))){ + name=name.substring(0, name.length()-1); + } + name=name.toLowerCase(); + + if(f.isFile() && name.endsWith("chr"+chrom)){ + copyFile(f.getAbsolutePath(), dest.getAbsolutePath()+"/"+name2); + } + } + } + + } + + + /** + * @param srFile + * @param dtFile + * {@link from http://www.roseindia.net/java/beginners/CopyFile.shtml} + */ + private static void copyFile(String src, String dst){ +// assert(false) : src+" -> "+dst; + try{ + File f1 = new File(src); + File f2 = new File(dst); + InputStream in = new FileInputStream(f1); + //For Append the file. + // OutputStream out = new FileOutputStream(f2,true); + + //For Overwrite the file. 
+ OutputStream out = new FileOutputStream(f2); + + byte[] buf = new byte[16384]; + int len; + while ((len = in.read(buf)) > 0){ + out.write(buf, 0, len); + } + in.close(); + out.close(); + }catch(FileNotFoundException e){ + throw new RuntimeException(e); + }catch(IOException e){ + throw new RuntimeException(e); + } + } + + +} diff --git a/current/driver/PrintEnv.java b/current/driver/PrintEnv.java new file mode 100755 index 0000000..03099e4 --- /dev/null +++ b/current/driver/PrintEnv.java @@ -0,0 +1,36 @@ +package driver; + +import java.net.UnknownHostException; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Date; +import java.util.Map; + +/** + * @author Brian Bushnell + * @date Apr 4, 2013 + * + */ +public class PrintEnv { + + public static void main(String[] args){ + + Date d=new Date(); + System.out.println("Time: "+d.getTime()+" = "+d+"\n"); + + Map env=System.getenv(); + ArrayList keys=new ArrayList(env.keySet()); + Collections.sort(keys); + for(String s : keys){ + System.out.println(s+"\t"+env.get(s)); + } + try { + java.net.InetAddress localMachine = java.net.InetAddress.getLocalHost(); + System.out.println("Hostname of local machine: " + localMachine.getHostName()); + } catch (UnknownHostException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + +} diff --git a/current/driver/ReverseComplement.java b/current/driver/ReverseComplement.java new file mode 100755 index 0000000..4e07b69 --- /dev/null +++ b/current/driver/ReverseComplement.java @@ -0,0 +1,231 @@ +package driver; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.FileFormat; +import 
fileIO.ReadWrite; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Jul 19, 2013 + * + */ +public class ReverseComplement { + +public static void main(String[] args){ + + if(args==null || args.length==0 || (args.length==1 && + (args[0].equalsIgnoreCase("-h") || args[0].equals("-help") || args[0].equals("--help") || args[0].equals("-?") || args[0].equals("?")))){ + printOptions(); + System.exit(0); + } + ReverseComplement rc=new ReverseComplement(args); + rc.process(); + } + + private static void printOptions(){ + outstream.println("Syntax:\n"); + outstream.println("\njava -ea -Xmx100m -cp jgi.ReverseComplement "); + outstream.println("\nOptional flags:"); + outstream.println("in= \tThe 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in."); + outstream.println("out= \tThe 'out=' flag is needed if the output file is not the second parameter. 'out=stdout' will pipe to standard out."); + outstream.println("showspeed=t \tSet to 'f' to suppress display of processing speed."); + } + + public ReverseComplement(String[] args){ + for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}} + outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + ReadWrite.USE_UNPIGZ=true; + ReadWrite.USE_PIGZ=true; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + boolean setOut=false; + + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in")){ + in=b; + }else if(a.equals("out")){ + out=b; + setOut=true; + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("showspeed")){ + showspeed=Tools.parseBoolean(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(i==0 && in==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + in=args[i]; + }else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + out=args[i]; + setOut=true; + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + 
assert(FastaReadInputStream.settingsOK()); + + if(in==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + + if(!setOut){out="stdout.fa";} + if("stdout".equalsIgnoreCase(out) || "standarddout".equalsIgnoreCase(out)){ + out="stdout.fa"; + outstream=System.err; + } + if(!overwrite){ + if(out!=null && new File(out).exists()){throw new RuntimeException("Output file "+out+" already exists, and overwrite="+overwrite);} + } + assert(!in.equalsIgnoreCase(out)); + } + + public void process(){ + + Timer t=new Timer(); + t.start(); + + boolean dq0=FASTQ.DETECT_QUALITY; + boolean ti0=FASTQ.TEST_INTERLEAVED; + int rbl0=Shared.READ_BUFFER_LENGTH; + FASTQ.DETECT_QUALITY=false; + FASTQ.TEST_INTERLEAVED=false; + Shared.READ_BUFFER_LENGTH=8; + + process2(); + + FASTQ.DETECT_QUALITY=dq0; + FASTQ.TEST_INTERLEAVED=ti0; + Shared.READ_BUFFER_LENGTH=rbl0; + + t.stop(); + + double rpnano=readsProcessed/(double)(t.elapsed); + double bpnano=basesProcessed/(double)(t.elapsed); + + String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m"); + String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m"); + + while(rpstring.length()<8){rpstring=" "+rpstring;} + while(bpstring.length()<8){bpstring=" "+bpstring;} + + outstream.println("Time: \t"+t); + outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000)); + outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + + if(errorState){ + throw new RuntimeException(this.getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + public void process2(){ + + final TextStreamWriter tsw=(out==null ? 
null : new TextStreamWriter(out, overwrite, false, true)); + if(tsw!=null){tsw.start();} + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + FileFormat ff1=FileFormat.testInput(in, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, null); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + boolean paired=cris.paired(); + assert(paired); + if(verbose){System.err.println("Paired: "+paired);} + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + + for(Read r : reads){ + assert(r.mate==null); + readsProcessed++; + basesProcessed+=r.bases==null ? 0 : r.bases.length; + r.reverseComplement(); + if(tsw!=null){tsw.println(r);} + } + + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + + ReadWrite.closeStream(cris); + + if(tsw!=null){tsw.poisonAndWait();} + + errorState|=(cris.errorState() /*|| (tsw!=null && tsw.errorState())*/); + } + + private String in, out; + private long maxReads=-1; + private long readsProcessed=0; + private long basesProcessed=0; + public boolean errorState=false; + + private static PrintStream outstream=System.err; + public static boolean overwrite=false; + public static boolean showspeed=true; + public static boolean verbose=false; + +} diff --git a/current/driver/Search.java b/current/driver/Search.java new file mode 100755 index 0000000..6170a7c --- /dev/null +++ b/current/driver/Search.java @@ -0,0 +1,167 @@ +package driver; +import java.util.ArrayList; +import java.util.List; + +import dna.Data; +import dna.Gene; +import dna.Range; + + +public class Search { + + /** Find genes in the array that overlap point "p" */ + public static List findGenes(int p, Gene[] genes){ + ArrayList list=new ArrayList(16); + + for(int i=0; i 
findGenesBinary(int p, Range[] ranges, boolean nearby){ + ArrayList list=null; + int a=findPointBinary(p, ranges); + + Range r=ranges[a]; + +// System.out.println("Searching for "+p+" in "+r+"; previous range was "+ranges[a-1]); + if(!r.includes(p)){return list;} + + list=new ArrayList(16); + + Gene[] genes2=(Gene[])r.obj1; + assert(genes2.length>0); + +// System.out.println("Found "+genes2.length+" to consider."); + + + //TODO: Specify whether tx or code (etc) coverage is needed. + for(int i=0; ir.b+Data.NEAR){break;} + if(nearby){ + if(g.intersectsNearby(p, p)){list.add(g);} + }else{ + if(g.intersectsCode(p)){list.add(g);} + } + } + + return list; + } + + /** Find genes in the array that overlap point "p" */ + public static List findGenesLinear(int p, Gene[] genes, Range[] ranges){ + ArrayList list=null; + int a=findPointLinear(p, ranges); + + Range r=ranges[a]; + +// System.out.println("Searching for "+p+" in "+r+"; previous range was "+ranges[a-1]); + if(!r.includes(p)){return list;} + + list=new ArrayList(16); + + Gene[] genes2=(Gene[])r.obj1; + assert(genes2.length>0); + +// System.out.println("Found "+genes2.length+" to consider."); + + + //TODO: Specify whether tx or code (etc) coverage is needed. + for(int i=0; ir.b){break;} + if(g.intersectsCode(p)){ + list.add(g); +// System.out.println(" Yes."); + } + } + + return list; + } + + public static int findPointLinear(int p, Range[] array){ + for(int i=0; ip){return i;} //Fail. + if(r.includes(p)){return i;} //Success. 
+ } + return array.length-1; + } + + public static int findPointBinary(int p, Range[] array){ + assert(array!=null); + if(array.length==0){return 0;} + int result=findPointBinary(p, 0, max(0, array.length-1), array); + + //TODO: Assertions + + return result; + } + + public static boolean containsPointBinary(int p, Range[] array, int thresh){ + assert(array!=null); + if(array.length==0){return false;} + int rnum=findPointBinary(p, 0, max(0, array.length-1), array); + + int p1=p-thresh, p2=p+thresh; + Range r=array[rnum]; + if(p2>=r.a && p1<=r.b){return true;} + + if(rnum==0 && pr.b) : "\n\n"+p+"\t"+rnum+"/"+array.length+"\t"+r+"\n\n"; //Otherwise, it violated the search contract. + if(array.length<=rnum+1){return false;} + + Range r2=array[rnum+1]; + assert(r2.a>p) : "\n\n"+p+"\t"+rnum+"/"+array.length+"\t"+r+"\n\n"; //Otherwise, it violated the search contract. + return (p2>=r.a && p1<=r.b); + } + + public static int findPointBinary(int p, int a, int b, Range[] array){ + if(a>=b){ + + //This line should ensure that p>array[a] when p is not within any range. + //Except, of course, when p<(all ranges). + //In other words, the return is strictly the LEFT (LOWER) index when p is between two ranges. 
+ if(a>0 && p=0); + assert(aarray[a].b && (a==array.length-1 || p "; + assert(!args[0].equalsIgnoreCase(args[1])) : "File names must be different."; + + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + + int minlen=1; + long reads=Long.MAX_VALUE; + char symbol='D'; + if(args.length>2){symbol=(char)args[2].charAt(0);} + if(args.length>3){minlen=Integer.parseInt(args[3]);} + if(args.length>4){reads=Long.parseLong(args[4]);} + + symbol=Character.toUpperCase(symbol); + if(symbol=='='){symbol='M';} + if(symbol=='X'){symbol='S';} + if(symbol=='N'){symbol='D';} + if(symbol=='S' || symbol=='H' || symbol=='P'){symbol='C';} + + final int index=Tools.indexOf(new char[] {'M','S','D','I','C'}, symbol); + assert(index>=0) : "Symbol (3rd argument) must be M, S, D, I, C (for match string symbols) or M, =, X, D, N, I, S, H, P (for cigar symbols)."; + + TextFile tf=new TextFile(args[0], true, false); + TextStreamWriter tsw=new TextStreamWriter(args[1], false, false, true); + tsw.start(); + + for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ + if(line.charAt(0)=='@'){ + tsw.println(line); + }else{ + if((reads=reads-1)<0){break;} + SamLine sl=new SamLine(line); + if(testLine(sl, minlen, index)){ + tsw.println(line); + } + } + } + tf.close(); + tsw.poison(); + tsw.waitForFinish(); + + } + + + private static boolean testLine(SamLine sl, int minlen, int index){ + assert(sl!=null); + if(!sl.mapped() || sl.cigar==null){return false;} + int[] msdic=sl.cigarToMdsiMax(sl.cigar); + return (msdic!=null && msdic[index]>=minlen); + } + +} diff --git a/current/driver/SniffSplices.java b/current/driver/SniffSplices.java new file mode 100755 index 0000000..4dad5d2 --- /dev/null +++ b/current/driver/SniffSplices.java @@ -0,0 +1,198 @@ +package driver; + +import java.util.ArrayList; + +import dna.AminoAcid; +import dna.Motif; +import dna.MotifProbsN; + +public class SniffSplices { + + public static void main(String[] args){ 
+ +// MotifProbsN mAG=MotifProbsN.makeMotif("AG Exon Starts MP2", 11, 13, 11, 2); +// MotifProbsN mGT=MotifProbsN.makeMotif("GT Exon Stops MP2", 3, 10, 3, 2); +// +// MotifProbsN eStarts2=MotifProbsN.makeMotif("Exon Starts MP2", 9, 11, 9, 2); +// MotifProbsN eStops2=MotifProbsN.makeMotif("Exon Stops MP2", 3, 10, 3, 2); +// +// MotifProbsN gStarts2=MotifProbsN.makeMotif("Gene Starts MP2", 9, 11, 9, 2); +// MotifProbsN gStops2=MotifProbsN.makeMotif("Gene Stops MP2", 3, 10, 3, 2); + + + Motif m=eStops2; +// Motif m=eStarts2; +// Motif m=eStarts2_15; + + + ArrayList list=new ArrayList(); + + boolean rcomp=false; + if(args.length>0){ + for(String s1 : args){ + String s=s1.toLowerCase(); + if(s.equalsIgnoreCase("rcomp")){rcomp=true;} + + if(s.contains("estart_ac")){m=eStarts2_AC;} + else if(s.contains("estart_15")){m=eStarts2_15;} + else if(s.contains("estart")){m=eStarts2;} + else if(s.contains("estop_gc")){m=eStops2_GC;} + else if(s.contains("estop")){m=eStops2;} + else if(s.contains("gstart")){m=gStarts2;} + else if(s.contains("gstop")){m=gStops2;} + else{list.add(s.toUpperCase());} + } + } + + + System.out.println("Using motif "+m); + + int initialLoc=0; + int increment=1; //1 for plus strand, -1 for minus strand + +// String s="NNNNNNNNAGCGGGAATCGGGGGGTCCTTCTGCTCCCCTGAGCGTCCTTCCTGTGTTCCCAGGC"+ +// "ACTATCGCCTACCTGTTTTTCACCAACCGCCACGAGGTGAGGAAGATGACCCTGGACCGAAGCGAATACACCAGCCTCAT"+ +// "CCCAAACTTGAAGAACGTGGTCGCCCTGGACACCGAGGTGGCCAGCAACAGAATATACTGGTCCGACCTGTCCCAAAGGA"+ +// "AGATCTACAGGTGAGCCTTGGAGCCACACCCAGCGCTCAACCCCCGGTGGCGCGGGGGCCCCTCTCACTGACGCTCTCCT"+ +// "TCCCCTGCTCCTCCCCCTCAGCACCCAAATCGACAGAGCCCCCGGCTTCTCCTCCTATGACACCGTCGTCAGCGAGGACC"+ +// "TCCAGGCCCCTGATGGGCTGGCGGTGGACTGGATCCACAGCAACATATACTGGACAGACTCCATCCTGGGCACCGTCTCC"+ +// "GTGGCCGACACCAAGGGCGTGAAGAGAAAGACGCTCTTCAAGGAGAAAGGCTCTAAGCCACGTGCCATCGTGGTGGATCC"+ +// "CGTTCACGGGTGGGTGCTGCTAAAGCCGAGGGCCACGGAAGGAANNNNNNNN"; + + // "AAGTACAGGAATTATATGCCCCCAGGTAA * AGTACAGGAATTATATGCCCCCAGGTAAC" +// String[] array={ +// 
"GCCTACTTTGTATGATGACCCTGTCCT", +// "AGCCCTGGCCGCCTACTTTGTATGATGACCCTGTCCTCCCTCACCCA", +// }; +// String[] array={ +// "TGGCCGCCGCCGACCGTAAGTTTTGCGCGCAAACTCCC", +// "TGGCCGCCGCCGACCGTTAAGTTTTGCGCGCAAACTCCC", +// }; +// String[] array={ +// "CAACTGCCAAGGGAAGGGCACGGTTAGCGGCACCCTCATAGGTAAGTGATGGCCCCAGACGCTGGTCTCTCTCCATCTGGACCTGGCCTGGGAGGTGGCTTGG", +// "CAACTGCCAAGGGAAGGGCACGGTTAGCGGCACCCTCATAGGTGAGTGATGGCCCCAGACGCTGGTCTCTCTCCATCTGGACCTGGCCTGGGAGGTGGCTTGG", +// }; + +// String[] array={ +// "GTCTTTCTCATGTGGTCCTTGTGTTCGTCGAGCAGGCCAGCAAGTGTGACAGTCATGGCACCCACCTGGCAGGGG", +// "GTCTTTCTCATGTGGTCCTTGTGTTCGTTGAGCAGGCCAGCAAGTGTGACAGTCATGGCACCCACCTGGCAGGGG", +// }; + +// String[] array={ +// "GCAGGGTCATGGTCACCGACTTCGAGAATGTGCCCGAGGAGGACGGGACCCGCCTCCACAGACAGGTAAGCACAGCCGTCTGATGGGAGGGCTGCCTCTGCCCATATCCCCATCCTGGAG", +// "GCAGGGTCATGGTCACCGACTTCGAGAATGTGCCCGAGGAGGACGGGACCCGCTTCCACAGACAGGTAAGCACGGCCGTCTGATGGGAGGGCTGCCTCTGCCCATATCCCCATCCTGGAG", +// }; + + +// String[] array={ +// "RTGTTTTCACTCCAGCCACGGAGCTGGGTCTCTGGTCTCGGGGGCAGCTGTGTGACAGAGCGT" + +// "GCCTCTCCCTACAGTGCTCTTCGTCTTCCTTTGCCTGGGGGTCTTCCTTCTATGGAAGAACTG", +// "RTGTTTTCACTCCAGCCACGGAGCTGGGTCTCTGGTCTCGGGGGCAGCTGTGTGACAGAGCGT" + +// "GCCTCTCCTTACAGTGCTCTTCGTCTTCCTTTGCCTGGGGGTCTTCCTTCTATGGAAGAACTG", +// }; + +// String[] array={ +//// "CAGCGAAGATGCGAAGGTGATTCCCGGGTGGG", +//// "CAGCGAAGATGCGAAGGTGATTTCCGGGTGGG", +// "GCGGCCGAAGCGGGCCATGGACGCGCTCAAGT", +// "GCGGCCGGAGCGGGCCATGGACGCGCTCAAGT", +// }; + + +// String[] array={ +// "AAGTATGTTTTTGCTTTTAGGAGGATTCTCT", +// "AAGTATGTTTTTGTTTTTAGGAGGATTCTCT", +// }; + +// String[] array={ +// "TTAGGTTGCTGGTGTCTGTATAATGTGTGT"+ +// "A"+ +// "TCTTTGTTGCAGGTTTGTTTTTTATTCTGC", +// +// "TTAGGTTGCTGGTGTCTGTATAATGTGTGT"+ +// "G"+ +// "TCTTTGTTGCAGGTTTGTTTTTTATTCTGC" +// }; + +// ATGTATTCTACTTTT[TCTTTT]AAGTATGTTTTTGTTTTTAGGAGGATTCTCTATGG + +// String[] array={ +// "CAGGTCCTCGAGATCCTGGGATACAGGAAA", +// "CAGGTCCTCGAGATCCTGGGATATAGGAAA" +// }; + +// String[] array={ +// 
"TGTTTTTGCTTTTAGGAGGATTCTCTATG", +// "TGTTTTTGTTTTTAGGAGGATTCTCTATG" +// }; + + + + for(String s : list){ + if(rcomp){s=AminoAcid.reverseComplementBases(s);} + System.out.println("For string "+s+":"); + + if(!s.startsWith("N") || !s.endsWith("N")){ + s="NNNN"+s+"NNNN"; + } + byte[] code=s.getBytes(); + + for(int i=0; i %.4f ",percent));} + float norm2=norm; + while(norm2>0.1f){ + norm2-=.1f; + System.out.print("*"); + } + +// System.out.print("\t"+String.format("%.3f ",m.percentile(norm))); + + System.out.println(); + + } + + } + + } + + + private static final int N_MOTIF=2; + +// private static final MotifProbsN eStarts2=MotifProbsN.makeMotif("Exon Starts MP"+N_MOTIF, 12, 9, 2); +//// private static final MotifProbsN eStops2=MotifProbsN.makeMotif("Exon Stops MP"+N_MOTIF, 3, 11, 3, 2); +// private static final MotifProbsN eStops2=MotifProbsN.makeMotif("Exon Stops MP"+N_MOTIF, 12, 3, 2); +// +// private static final MotifProbsN gStarts2=MotifProbsN.makeMotif("Gene Starts MP"+N_MOTIF, 13, 9, 2); +// private static final MotifProbsN gStops2=MotifProbsN.makeMotif("Gene Stops MP"+N_MOTIF, 11, 3, 2); +// +// private static final MotifProbsN trStarts2=MotifProbsN.makeMotif("Tr Starts MP"+N_MOTIF, 12, 7, 2); +// private static final MotifProbsN trStops2=MotifProbsN.makeMotif("Tr Stops MP"+N_MOTIF, 11, 6, 2); + + private static final MotifProbsN eStarts2=MotifProbsN.makeMotif("Exon Starts MP"+N_MOTIF, 13, 9, 2); + private static final MotifProbsN eStarts2_AC=MotifProbsN.makeMotif("AC Exon Starts MP"+N_MOTIF, 13, 9, 2); + private static final MotifProbsN eStarts2_15=MotifProbsN.makeMotif("Exon Starts MP"+N_MOTIF, 19, 15, 2); + private static final MotifProbsN eStops2=MotifProbsN.makeMotif("Exon Stops MP"+N_MOTIF, 13, 4, 2); + private static final MotifProbsN eStops2_GC=MotifProbsN.makeMotif("GC Exon Stops MP"+N_MOTIF, 13, 4, 2); + + private static final MotifProbsN gStarts2=MotifProbsN.makeMotif("Gene Starts MP"+N_MOTIF, 13, 9, 2); + private static final MotifProbsN 
gStops2=MotifProbsN.makeMotif("Gene Stops MP"+N_MOTIF, 13, 4, 2); + + private static final MotifProbsN trStarts2=MotifProbsN.makeMotif("Tr Starts MP"+N_MOTIF, 13, 7, 2); + private static final MotifProbsN trStops2=MotifProbsN.makeMotif("Tr Stops MP"+N_MOTIF, 13, 7, 2); + + +} diff --git a/current/driver/SummarizeMSDIN.java b/current/driver/SummarizeMSDIN.java new file mode 100755 index 0000000..e749427 --- /dev/null +++ b/current/driver/SummarizeMSDIN.java @@ -0,0 +1,122 @@ +package driver; + +import fileIO.TextFile; + +/** + * Summarizes match/sub/ins/del/N rates for consecutive BBMap runs + * @author Brian Bushnell + * @date Jan 8, 2014 + * + */ +public class SummarizeMSDIN { + + public static void main(String[] args){ + String fname=args[0]; + boolean M=false; + boolean E=false; + boolean S=true; + boolean D=false; + boolean I=false; + boolean N=false; + boolean B=false; + boolean MS=true; + + long mcount=0; + long ecount=0; + long scount=0; + long dcount=0; + long icount=0; + long ncount=0; + long bcount=0; + + TextFile tf=new TextFile(fname); + StringBuilder sb=new StringBuilder(); + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + String[] split=s.split("\t"); + if(s.startsWith("Total time:")){ + if(B){ + if(sb.length()>0){sb.append('\t');} + sb.append(bcount); + } + if(MS){ + if(sb.length()>0){sb.append('\t');} + sb.append((mcount+scount)); + } + if(M){ + if(sb.length()>0){sb.append('\t');} + sb.append(mcount); + } + if(E){ + if(sb.length()>0){sb.append('\t');} + sb.append(ecount); + } + if(S){ + if(sb.length()>0){sb.append('\t');} + sb.append(scount); + } + if(D){ + if(sb.length()>0){sb.append('\t');} + sb.append(dcount); + } + if(I){ + if(sb.length()>0){sb.append('\t');} + sb.append(icount); + } + if(N){ + if(sb.length()>0){sb.append('\t');} + sb.append(ncount); + } + System.out.println(sb); + sb.setLength(0); + mcount=ecount=scount=dcount=icount=ncount=bcount=0; + }else if(s.startsWith("Match Rate:")){ + String x=split[split.length-1]; + 
try{mcount=(Long.parseLong(x));}catch(Exception e){} + }else if(E && s.startsWith("Error Rate:")){ + String x=split[split.length-1]; +// if(E){ +// if(sb.length()>0){sb.append('\t');} +// sb.append(x); +// } + try{ecount=(Long.parseLong(x));}catch(Exception e){} + }else if(s.startsWith("Sub Rate:")){ + String x=split[split.length-1]; +// if(S){ +// if(sb.length()>0){sb.append('\t');} +// sb.append(x); +// } + try{scount=(Long.parseLong(x));}catch(Exception e){} + }else if(s.startsWith("Del Rate:")){ + String x=split[split.length-1]; +// if(D){ +// if(sb.length()>0){sb.append('\t');} +// sb.append(x); +// } + try{dcount=(Long.parseLong(x));}catch(Exception e){} + }else if(s.startsWith("Ins Rate:")){ + String x=split[split.length-1]; +// if(I){ +// if(sb.length()>0){sb.append('\t');} +// sb.append(x); +// } + try{icount=(Long.parseLong(x));}catch(Exception e){} + }else if(s.startsWith("N Rate:")){ + String x=split[split.length-1]; +// if(N){ +// if(sb.length()>0){sb.append('\t');} +// sb.append(x); +// } + try{ncount=(Long.parseLong(x));}catch(Exception e){} + }else if(s.startsWith("Reads Used:")){ + String x=split[split.length-1].replace("(", "").replace(" bases)", ""); +// if(B){ +// if(sb.length()>0){sb.append('\t');} +// sb.append(x); +// } + try{bcount=(Long.parseLong(x));}catch(Exception e){} + } + } + + } + +} diff --git a/current/driver/TestCompressionSpeed.java b/current/driver/TestCompressionSpeed.java new file mode 100755 index 0000000..2eca4c4 --- /dev/null +++ b/current/driver/TestCompressionSpeed.java @@ -0,0 +1,79 @@ +package driver; + +import java.io.File; +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.zip.ZipOutputStream; + +import dna.Timer; + +import fileIO.ReadWrite; +import fileIO.TextFile; + +public class TestCompressionSpeed { + + + public static void main(String[] args){ + + TextFile tf=new TextFile(args[0], false, false); + String[] lines=tf.toStringLines(); + tf.close(); + + Timer 
t=new Timer(); + + for(int i=0; i<=9; i++){ + t.start(); + String fname=args[1].replaceFirst("#", ""+i); + compress(lines, fname, i); + t.stop(); + + System.out.println("Level "+i+" compress: "+t+" \tsize: "+new File(fname).length()); + } + + for(int i=0; i<=9; i++){ + t.start(); + String fname=args[1].replaceFirst("#", ""+i); + String[] lines2=read(fname); + assert(lines2.length>=lines.length); + t.stop(); + + System.out.println("Level "+i+" decompress: "+t); + } + + } + + + public static void compress(String[] text, String fname, int level){ + ReadWrite.ZIPLEVEL=level; + OutputStream os=ReadWrite.getOutputStream(fname, false, true, true); + PrintWriter writer=new PrintWriter(os); + + for(String s : text){writer.println(s);} + for(String s : text){writer.println(s);} + for(String s : text){writer.println(s);} + for(String s : text){writer.println(s);} + + try { + writer.flush(); + if(os.getClass()==ZipOutputStream.class){ + ZipOutputStream zos=(ZipOutputStream)os; + zos.closeEntry(); + zos.finish(); + } + writer.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + + public static String[] read(String fname){ + TextFile tf=new TextFile(fname, false, false); + String[] s=tf.toStringLines(); + tf.close(); + return s; + } + +} diff --git a/current/driver/Translator.java b/current/driver/Translator.java new file mode 100755 index 0000000..8c2e781 --- /dev/null +++ b/current/driver/Translator.java @@ -0,0 +1,213 @@ +package driver; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +import var.VarLine; +import var.Variation; + +import dna.AminoAcid; +import dna.Data; +import dna.Gene; +import dna.Timer; +import fileIO.ChainLine; +import fileIO.ReadWrite; + +public class Translator { + + + + + public Translator(int from_, int to_){ + fromBuild=from_; + toBuild=to_; + lines=Data.getChainLines(fromBuild, toBuild); + } + + + public VarLine[][] translate(VarLine[][] in){ + ArrayList[] 
/**
 * Translates a single Variation from the source build to the destination build
 * using the chain lines loaded for this Translator.
 * @param v Variation in source-build coordinates
 * @return A translated clone of v, or null when no single chain line contains the whole
 * range beginLoc..endLoc or when either endpoint cannot be translated
 */
public Variation translate(Variation v){

	//Exact-class dispatch: VarLines have their own translation routine.
	if(v.getClass()==VarLine.class){
		return translate((VarLine)v);
	}
	assert(v.getClass()==Variation.class);

	//Find the chain line containing the start, then require it to contain the end too.
	ChainLine[] array=lines[v.chromosome];
	int index=ChainLine.binarySearch(v.beginLoc, array);
	if(index<0){return null;}
	ChainLine cl=array[index];
	if(!cl.contains(v.beginLoc, v.endLoc)){return null;}

	//Each dest array is indexed as {chrom, strand, loc}; only [0] and [2] are used here.
	int[] dest1=cl.translate(v.beginLoc);
	int[] dest2=cl.translate(v.endLoc);
	if(dest1==null || dest2==null){return null;}

	Variation v2=v.clone();

	if(cl.qStrand==Gene.PLUS){
		v2.chromosome=(byte)dest1[0];
		v2.beginLoc=dest1[2];
		v2.endLoc=dest2[2];
	}else{
//		assert(false) : "TODO";

		//Minus strand: the translated endpoints arrive in reverse order, so swap them.
		v2.chromosome=(byte)dest1[0];
		if(v.isPoint()){
			//NOTE(review): point variations are shifted down by one on the minus strand - confirm the coordinate convention.
			v2.beginLoc=v2.endLoc=dest1[2]-1;
		}else{
			v2.beginLoc=dest2[2];
			v2.endLoc=dest1[2];
		}

		//Reverse-complement literal base strings; the placeholder "ref" and non-letter values are left alone.
		if(v2.call!=null && Character.isLetter(v2.call.charAt(0)) && !v2.call.equalsIgnoreCase("ref")){
			v2.call=AminoAcid.reverseComplementBases(v2.call);
		}

		if(v2.ref!=null && Character.isLetter(v2.ref.charAt(0)) && !v2.ref.equalsIgnoreCase("ref")){
			v2.ref=AminoAcid.reverseComplementBases(v2.ref);
		}

	}

	//Translation within one chain line must preserve the length of the variation.
	assert(v2.endLoc-v2.beginLoc==v.endLoc-v.beginLoc) : "\n\n"+v.toSourceString()+"\n\n"+v2.toSourceString()+
		"\n\n"+v.beginLoc+" -> "+Arrays.toString(dest1)+
		"\n\n"+v.endLoc+" -> "+Arrays.toString(dest2)+
		"\n\n"+cl+"\n\n";

	//The translated endpoints must still be in ascending order.
	assert(v2.beginLoc<=v2.endLoc) : "\n\n"+v.toSourceString()+"\n\n"+v2.toSourceString()+
		"\n\n"+v.beginLoc+" -> "+Arrays.toString(dest1)+
		"\n\n"+v.endLoc+" -> "+Arrays.toString(dest2)+
		"\n\n"+cl+"\n\n";

	//NOTE(review): intern() is defined on Variation, outside this view - presumably canonicalizes fields; confirm.
	v2.intern();
	return v2;
}
"null" : + "(build"+to+", chr"+Gene.chromCodes[result[0]]+", "+Gene.strandCodes[result[1]]+", "+result[2]+")"); + } + +// Translator2 tr=new Translator2(from, to); +// +// ChainLine[] array=lines[chrom]; +// int index=ChainLine.binarySearch(loc, array); +//// if(index<0){return null;} +// ChainLine cl=array[index]; +// +//// System.out.println(cl); +// +// int[] dest=cl.translate(loc); +// +//// {qChrom, qStrand, qStart+loc-tStart}; +// +// System.out.println(chrom+", +, "+loc+" -> "+dest[0]+", "+Gene.strandCodes[dest[1]]+", "+dest[2]); + } + + /** chrom, strand, loc */ + public static final int[] translate(int fromBuild, int toBuild, int chrom, int strand, int loc){ + ChainLine[][] lines=Data.getChainLines(fromBuild, toBuild); + int[] result=ChainLine.translate(loc, lines[chrom]); + if(result==null){return null;} + int strand2=result[1]; + if(strand2==strand){ + result[1]=Gene.PLUS; + }else{ + result[1]=Gene.MINUS; + } + return result; + } + +} diff --git a/current/driver/TransposeTextFile.java b/current/driver/TransposeTextFile.java new file mode 100755 index 0000000..271e440 --- /dev/null +++ b/current/driver/TransposeTextFile.java @@ -0,0 +1,53 @@ +package driver; + +import fileIO.ReadWrite; +import fileIO.TextFile; + +public class TransposeTextFile { + + public static void main(String[] args){ + + int skipLines=args.length>1 ? 
Integer.parseInt(args[1]) : 0; + + int minChrom=1; + int maxChrom=22; + + for(int i=minChrom; i<=maxChrom; i++){ + if(args[0].contains("#")){ + process(args[0].replace("#", ""+i), skipLines); + }else{ + process(args[0], skipLines); + break; + } + } + + } + + public static void process(String fname, int skipLines){ + TextFile tf=new TextFile(fname, false, false); + String[] lines=tf.toStringLines(); + tf.close(); + String[][] lines2=TextFile.doublesplitWhitespace(lines, true); + + StringBuilder sb=new StringBuilder(4096); + + int columns=lines2[skipLines].length; + + for(int column=0; column set=findBadLines(tf, scaf, from, to); + tf.reset(); + printExcludingSet(tf, set); + } + + + public static HashSet findBadLines(TextFile tf, String scafS, int from, int to){ + byte[] scaf=scafS.getBytes(); + HashSet set=new HashSet(16000); + + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='@'){//header + SamLine sl=new SamLine(s); + + if(sl.pos>=from && sl.pos<=to && Tools.equals(sl.rname(), scaf)){ + set.add(sl.qname); + }else if(sl.pnext>=from && sl.pnext<=to && Tools.equals(sl.rnext(), scaf)){ + set.add(sl.qname); + }else if(Tools.equals(sl.rname(), scaf) && Tools.equals(sl.rnext(), scaf) && (sl.pos set){ + + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)=='@'){//header + System.out.println(s); + }else{ + SamLine sl=new SamLine(s); + + if(!set.contains(sl.qname)){ + System.out.println(s); + } + } + } + } + + +} + diff --git a/current/fileIO/ArrayFile.java b/current/fileIO/ArrayFile.java new file mode 100755 index 0000000..3a12157 --- /dev/null +++ b/current/fileIO/ArrayFile.java @@ -0,0 +1,73 @@ +package fileIO; + + +public class ArrayFile extends TextFile{ + + public static void main(String[] args){ + + try { + //Name of mat file + String name=args[0]; + + ArrayFile mat=new ArrayFile(name); + + String s=null; + + for(s=mat.readLine(); s!=null; s=mat.readLine()){ + System.out.println(s); + } + } catch (Exception e) { + throw 
new RuntimeException(e); + } + + } + + + public ArrayFile(String name){super(name, false, false);} + + public String nextLine(){ + String line=readLine(); + char c=line.charAt(0); + + while(line!=null && c!='{' && c!='/'){ + line=readLine(); + c=line.charAt(0); + } + return line; + } + + public float[] nextArray(){ + String line; + String[] split; + + line=nextLine(); + if(line==null || line.startsWith("//end")){return null;} + + assert(line.startsWith("//name: ")) : line; + String name=line.replace("//name: ","").trim(); + + line=nextLine(); + assert(line.startsWith("//size: ")) : line; + line=line.replace("//size: ",""); + int length=Integer.parseInt(line); + + + float[] grid=new float[length]; + + line=nextLine(); + assert(line.startsWith("{")); + if(line.endsWith(",")){line=line.substring(0, line.length()-1);} + assert(line.endsWith("}")); + line=line.replace("{", "").replace("}", "").replace(" ", ""); + split=line.split(","); + assert(split.length==length); + for(int i=0; i table=new HashMap(8000); + + for(s=nextLine(); s!=null; s=nextLine()){ + BaitLine b=new BaitLine(s); + + + if(buildIn!=buildOut && b.chrom<26){ + b=b.translate(buildIn, buildOut); + } + + if(b!=null && b.chrom<26){ + + BaitLine old=table.get(b); + if(old==null){ + table.put(b, b); + }else{ + old.add(b); + } + } + } + + ArrayList master=new ArrayList(table.size()); + master.addAll(table.values()); + table=null; + + if(condense){condense(master);} + + ArrayList[] lists=new ArrayList[26]; + for(BaitLine b : master){ + if(lists[b.chrom]==null){lists[b.chrom]=new ArrayList();} + lists[b.chrom].add(b); + } + + int[][][] out=new int[26][3][]; + for(int i=0; i master){ + Collections.sort(master); + + int merged=0; + BaitLine prev=null; + for(int i=0; i=(a1-1); + } + + private static class BaitLine implements Comparable { + + public BaitLine(String s){ + try{ + String[] split=s.split("\t", -1); + chrom=Gene.toChromosome(split[0]); +// assert(chrom<26) : s; + start=Integer.parseInt(split[1])-1; + 
stop=Integer.parseInt(split[2])-1; + if(split.length>3){names.add(split[3]);} + }catch(Exception e){ + System.err.println(s); + throw new RuntimeException(e); + } + } + + public BaitLine(int chr, int sta, int sto, ArrayList nam){ + chrom=chr; + start=sta; + stop=sto; + names.addAll(nam); + } + + private BaitLine translate(int buildIn, int buildOut){ + + int[] startTrans=Translator2.translate(buildIn, buildOut, chrom, Gene.PLUS, start); + int[] stopTrans=Translator2.translate(buildIn, buildOut, chrom, Gene.PLUS, stop); + + if(startTrans==null || stopTrans==null){return null;} + + if(startTrans[0]!=stopTrans[0]){return null;}//different chromosomes + + int chrom2=startTrans[0]; + int start2=startTrans[2]; + int stop2=stopTrans[2]; + if(start2>stop2){ + int temp=start2; + start2=stop2; + stop2=temp; + } + + int len=stop-start; + int len2=stop2-start2; + int dif=(len>len2 ? len-len2 : len2-len); + +// assert(len>0 && len<3000) : this; //Baits should be 120 long, IIRC. (**No longer true!**) + if(dif>50){return null;} + + return new BaitLine(chrom2, start2, stop2, names); + } + + public int[] toInt(){ + return new int[] {chrom, start, stop, names.size()}; + } + + public String toString(){ + return chrom+"\t"+start+"\t"+stop; + } + + final int chrom; + int start; + int stop; + final ArrayList names=new ArrayList(2); + + public void add(BaitLine other){ + assert(this.equals(other)); + assert(this!=other); + names.addAll(other.names); + } + + @Override + public int compareTo(BaitLine other) { + int r; + + r=chrom-other.chrom; + if(r!=0){return r;} + + r=start-other.start; + if(r!=0){return r;} + + r=stop-other.stop; + if(r!=0){return r;} + + return 0; + } + + public void merge(BaitLine b){ + assert(touches(b)); + assert(chrom==b.chrom); + assert(start<=b.start) : this+", "+b; + start=Data.min(start, b.start); + stop=Data.max(stop, b.stop); + } + + public boolean touches(BaitLine b){ + return (chrom==b.chrom && b.stop>=start-1 && b.start<=stop+1); + } + + @Override + public 
boolean equals(Object other){ + return equals((BaitLine)other); + } + + public boolean equals(BaitLine other){ + return compareTo(other)==0; + } + + @Override + public int hashCode(){ + return start^chrom; + } + + } + + +} diff --git a/current/fileIO/ByteFile.java b/current/fileIO/ByteFile.java new file mode 100755 index 0000000..05f5084 --- /dev/null +++ b/current/fileIO/ByteFile.java @@ -0,0 +1,96 @@ +package fileIO; +import java.io.File; +import java.io.InputStream; +import java.util.ArrayList; + +import align2.Shared; + + +public abstract class ByteFile { + +// public static final ByteFile makeByteFile(String fname){ +// return makeByteFile(fname, false, true); +// } + + public static final ByteFile makeByteFile(String fname, boolean tryAllExtensions, boolean allowSubprocess){ + FileFormat ff=FileFormat.testInput(fname, FileFormat.TEXT, null, allowSubprocess, false); + return makeByteFile(ff, tryAllExtensions); + } + + public static final ByteFile makeByteFile(FileFormat ff, boolean tryAllExtensions){ + if(FORCE_MODE_BF2 || (!FORCE_MODE_BF1 && Shared.THREADS>4/* && (ReadWrite.isCompressed(fname) || ReadWrite.isSam(fname))*/)){ +// if(allowSubprocess && ((ReadWrite.USE_UNPIGZ || ReadWrite.USE_GUNZIP) && (fname.endsWith(".gz") || fname.endsWith(".gzip")))){} + return new ByteFile2(ff, tryAllExtensions); + } + return new ByteFile1(ff, tryAllExtensions); + } + +// protected ByteFile(String fname, boolean tryAllExtensions, boolean allowSubprocess_){ +// allowSubprocess=allowSubprocess_; +// fname=fname.replace('\\', '/'); +// File f=new File(fname); +// +// if(tryAllExtensions && !fname.startsWith("jar:") && !f.exists()){ +// name=ReadWrite.findFileExtension(fname); +// f=new File(name); +// }else{ +// name=fname; +// } +// } + + protected ByteFile(FileFormat ff_, boolean tryAllExtensions){ + ff=ff_; + assert(ff.read()) : ff; + } + + public final ArrayList toByteLines(){ + + byte[] s=null; + ArrayList list=new ArrayList(4096); + + for(s=nextLine(); s!=null; 
s=nextLine()){ + list.add(s); + } + + return list; + } + + public final long countLines(){ + byte[] s=null; + long count=0; + for(s=nextLine(); s!=null; s=nextLine()){count++;} + reset(); + + return count; + } + + public abstract void reset(); + + public final boolean exists(){ + return name().equals("stdin") || name().startsWith("stdin.") || name().startsWith("jar:") || new File(name()).exists(); //TODO Ugly and unsafe hack for files in jars + } + + public abstract InputStream is(); + public abstract long lineNum(); + + /** Returns true if there was an error */ + public abstract boolean close(); + + public abstract byte[] nextLine(); + + public abstract byte[] readLine(); + + public abstract boolean isOpen(); + + public final String name(){return ff.name();} + public final boolean allowSubprocess(){return ff.allowSubprocess();} + +// public final String name; + public final FileFormat ff; + + public static boolean FORCE_MODE_BF1=false; + public static boolean FORCE_MODE_BF2=false; + + protected final static byte slashr='\r', slashn='\n', carrot='>', plus='+', at='@';//, tab='\t'; + +} diff --git a/current/fileIO/ByteFile1.java b/current/fileIO/ByteFile1.java new file mode 100755 index 0000000..db48164 --- /dev/null +++ b/current/fileIO/ByteFile1.java @@ -0,0 +1,239 @@ +package fileIO; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +import dna.Data; +import dna.Timer; + + +/** + * @author Brian Bushnell + * + */ +public class ByteFile1 extends ByteFile { + + + public static void main(String[] args) throws IOException{ + ByteFile1 tf=new ByteFile1(args.length>0 ? 
args[0] : "stdin", false, true); + long first=0, last=100; + boolean speedtest=false; + if(args.length>1){ + if(args[1].equalsIgnoreCase("speedtest")){ + speedtest=true; + first=0; + last=Long.MAX_VALUE; + }else{ + first=Integer.parseInt(args[1]); + last=first+100; + } + } + if(args.length>2){ + last=Integer.parseInt(args[2]); + } + speedtest(tf, first, last, !speedtest); + + tf.close(); + tf.reset(); + tf.close(); + } + + private static void speedtest(ByteFile1 tf, long first, long last, boolean reprint){ + Timer t=new Timer(); + t.start(); + long lines=0; + long bytes=0; + for(long i=0; i0){lasteol=buffer[len];} + return len; + } + + private final synchronized InputStream open(){ + if(open){ + throw new RuntimeException("Attempt to open already-opened TextFile "+name()); + } + open=true; + is=ReadWrite.getInputStream(name(), false, allowSubprocess()); + bstart=-1; + bstop=-1; + lasteol=-1; + return is; + } + + public boolean isOpen(){return open;} + + public final InputStream is(){return is;} + + public final long lineNum(){return lineNum;} + + private boolean open=false; + private byte[] buffer=new byte[16384]; + private int bstart=0, bstop=0; + public InputStream is; + public long lineNum=-1; + + private byte lasteol=-1; + + public static boolean verbose=false; + + private boolean errorState=false; + +} diff --git a/current/fileIO/ByteFile2.java b/current/fileIO/ByteFile2.java new file mode 100755 index 0000000..d80d7e6 --- /dev/null +++ b/current/fileIO/ByteFile2.java @@ -0,0 +1,400 @@ +package fileIO; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; +import java.util.concurrent.ArrayBlockingQueue; + +import dna.Timer; + + +/** + * Runs a ByteFile1 in a separate thread. Can speed up disk reading, particularly of compressed files, at cost of slightly more work done. + * Drop-in compatible with ByteFile1. 
+ * @author Brian Bushnell + * @date Sep 23, 2013 + * + */ +public class ByteFile2 extends ByteFile { + + + public static void main(String[] args) throws IOException{ + ByteFile2 tf=new ByteFile2(args.length>0 ? args[0] : "stdin", false, true); + long first=0, last=100; + boolean speedtest=false; + if(args.length>1){ + if(args[1].equalsIgnoreCase("speedtest")){ + speedtest=true; + first=0; + last=Long.MAX_VALUE; + }else{ + first=Integer.parseInt(args[1]); + last=first+100; + } + } + if(args.length>2){ + last=Integer.parseInt(args[2]); + } + speedtest(tf, first, last, !speedtest); + + tf.close(); + tf.reset(); + tf.close(); + } + + private static void speedtest(ByteFile2 tf, long first, long last, boolean reprint){ + Timer t=new Timer(); + t.start(); + long lines=0; + long bytes=0; + for(long i=0; i=currentList.length || currentList[currentLoc]==null){ + boolean b=getBuffer(); + if(!b){ + if(verbose2){System.err.println("readLine()->getBuffer() returned false.");} + return null; + } + } + assert(currentList!=null && currentList!=poison); + assert(currentLoc(buffs+2); +// qEmpty=new ArrayBlockingQueue(buffs+2); +// for(int i=0; i(buffs+2); + qEmpty=new ArrayBlockingQueue(buffs+2); + for(int i=0; i6270700; + bases+=s.length; + list[loc]=s; + loc++; +// numIn++; +// if(verbose){System.err.println("Added line "+numIn);} + if(loc>=bufflen || bases>=buffcapacity){ + if(verbose2){System.err.println("Capacity exceeded.");} + while(list!=null){ + try { +// synchronized(this){ +// if(!shutdown){ + if(verbose2){ + System.err.println("A: Adding to qFull list of size "+loc); + System.err.println(ByteFile2.toString(list)); + } + cntr+=list.length; + qFull.put(list); + if(verbose2){System.err.println("A: qFull.size()="+qFull.size());} +// } +// } + list=null; + loc=0; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(shutdown){ + if(verbose2){System.err.println("Break 1");} + break; + } + while(list==null){ + 
/**
 * Marks this reader thread as shut down and places the poison sentinel on both queues.
 * Only the first call has any effect; later calls return after the optional log line.
 */
private synchronized void shutdown(){
	if(verbose || verbose2){System.err.println("ByteFile2("+name()+").shutdown()");}
	if(shutdown){return;} //Already shut down; do not enqueue the sentinel twice.
	shutdown=true;
	if(verbose2){System.err.println("Adding poison.");}
	//add() does not block: it throws IllegalStateException when the queue is full.
	//NOTE(review): presumably the queue capacity leaves room for the sentinel - confirm against the constructor.
	qFull.add(poison);
	qEmpty.add(poison);
	if(verbose2){System.err.println("D: qFull.size()="+qFull.size());}
	if(verbose || verbose2){System.err.println("ByteFile2("+name()+").shutdown() finished");}
}
qName=head[7]; + qChrom=toChromosome(head[7]); + qSize=Integer.parseInt(head[8]); + qStrand=Gene.toStrand(head[9]); + qStart=Integer.parseInt(head[10]); + qStop=Integer.parseInt(head[11]); + + chainID=Integer.parseInt(head[12]); + + chunks=new int[list.size()-1][]; + for(int i=1; i1){ + tloc=tloc2+chunk[1]+1; + qloc=qloc2+chunk[2]+1; + } + } + }else{ + + int tloc=tStart, qloc=qStop-1; + for(int i=0; i1){ + tloc=tloc2+chunk[1]+1; + qloc=qloc2-chunk[2]-1; + } + } + } + + return out; + } + + + public static ChainLine[][] loadChainLines(String fname){ + ArrayList list=loadChainBlocks(fname); + ChainBlock[][] blocks=splitChain(list); + ChainLine[][] out=new ChainLine[blocks.length][]; + ArrayList temp=new ArrayList(); + for(int chrom=0; chrom0){ + for(ChainBlock block : cblocks){ + ChainLine[] blines=block.toLines(); + for(ChainLine line : blines){ + temp.add(line); + } + } + } + if(temp.size()>0){ + out[chrom]=temp.toArray(new ChainLine[temp.size()]); + Arrays.sort(out[chrom]); + } + } + return out; + } + + + public static ArrayList loadChainBlocks(String fname){ + TextFile tf=new TextFile(fname, false, true); + String[] lines=tf.toStringLines(); + tf.close(); + String[][] text=TextFile.doublesplitWhitespace(lines, true); + + ArrayList out=new ArrayList(); + ArrayList current=new ArrayList(40); + for(int i=0; i list){ + int[] size=new int[Gene.chromCodes.length]; + + for(ChainBlock cb : list){size[cb.tChrom]++;} + + ChainBlock[][] out=new ChainBlock[size.length][]; + for(int i=0; i { + + + public static void main(String[] args){ + + byte chrom=Gene.toChromosome(args[0]); + + ChainLine[][] lines=ChainBlock.loadChainLines(Data.ROOT_CHAIN+"hg18ToHg19.over.chain"); + + for(int i=1; i\t"); + System.out.println(result==null ? 
"null" : Gene.chromCodes[result[0]]+"\t"+Gene.strandCodes[result[1]]+"\t"+result[2]); + } + + } + + + public ChainLine(byte chromT, byte strandT, int startT, int stopT, byte chromQ, byte strandQ, int startQ, int stopQ){ + tChrom=chromT; + tStrand=strandT; + tStart=startT; + tStop=stopT; + + qChrom=chromQ; + qStrand=strandQ; + qStart=startQ; + qStop=stopQ; + } + + + public String toString(){ + return Gene.chromCodes[tChrom]+"\t"+Gene.strandCodes[tStrand]+"\t"+tStart+"\t"+tStop+"\t"+ + Gene.chromCodes[qChrom]+"\t"+Gene.strandCodes[qStrand]+"\t"+qStart+"\t"+qStop; + } + + + public static int binarySearch(int loc, ChainLine[] array){ + return binarySearch(loc, array, 0, array.length-1); + } + + + public static int binarySearch(int loc, ChainLine[] array, int first, int last){ +// if(first>=last){ +// if(first>last){return -1;} +// assert(first==last && first=loc) ? first : -1; +// } +// System.out.println("BinarySearch "+loc+", "+first+", "+last); + if(first>last){return -1;} + int mid=(first+last)/2; + ChainLine midcl=array[mid]; +// System.out.println("mid = "+midcl); + if(locmidcl.tStop){return binarySearch(loc, array, mid+1, last);} + return mid; + } + + /** Returns {chrom, strand, loc} */ + public static int[] translate(int loc, ChainLine[] array){ + int index=binarySearch(loc, array); + if(index<0){return null;} + ChainLine cl=array[index]; + return cl.translate(loc); + } + + public int[] translate(int loc){ + if(loctStop){return null;} +// assert(loc>=tStart && loc<=tStop); + if(qChrom<1 || qChrom>25){return null;} + if(qStrand==Gene.PLUS){ + return new int[] {qChrom, qStrand, qStart+loc-tStart}; + }else{ + assert(qStart>=qStop) : this; + return new int[] {qChrom, qStrand, qStart-(loc-tStart)}; + } + } + + + public boolean contains(int a, int b){ + assert(b>=a); + return a>=tStart && b<=tStop; + } + + + public boolean contains(int a){ + return a>=tStart && a<=tStop; + } + + + @Override + public int compareTo(ChainLine other) { + int temp; + + 
temp=tChrom-other.tChrom; + if(temp!=0){return temp;} + + assert(tStrand==other.tStrand); + + temp=tStart-other.tStart; + if(temp!=0){return temp;} + + temp=tStop-other.tStop; + return temp; + } + + public byte tChrom; + public byte tStrand; + public int tStart; + public int tStop; + + public byte qChrom; + public byte qStrand; + public int qStart; + public int qStop; + +} diff --git a/current/fileIO/CompressFiles.java b/current/fileIO/CompressFiles.java new file mode 100755 index 0000000..feedeba --- /dev/null +++ b/current/fileIO/CompressFiles.java @@ -0,0 +1,73 @@ +package fileIO; + +import java.io.File; + +import dna.Data; + + +public class CompressFiles { + + + public static void main(String[] args){ + for(String s : args){ + if(s.equalsIgnoreCase("zip")){ + zip=true; + gzip=false; + }else if(s.equalsIgnoreCase("gzip") || s.equalsIgnoreCase("gz")){ + zip=false; + gzip=true; + }else{ + compressFiles(s); + } + } + } + + + public static void compressFiles(String path){ + File f=new File(path); + compressFiles(f); + } + + public static void compressFiles(File path){ + + if(path.isDirectory()){ + File[] array=path.listFiles(); + for(File f : array){compressFiles(f);} + }else{ + compress(path); + } + + } + + public static void compress(File in){ + assert(in.exists()); + assert(in.isFile()); + String abs=in.getAbsolutePath(); +// System.out.println("Considering "+abs); + if(abs.endsWith(".gz") || abs.endsWith(".zip") || abs.endsWith(".bz2")){return;} + +// if(!abs.contains("custom_summary_") || !abs.endsWith("Gene_build36.txt")){return;} //TODO ***TEMPORARY*** + System.err.println(abs); +// if(!abs.endsWith(".gvla")){return;} //TODO ***TEMPORARY*** +// if(!abs.endsWith(".gvla") || +// !(abs.contains("seqGene") || abs.contains("refGene") || abs.contains("unionGene"))){return;} //TODO ***TEMPORARY*** + if(abs.toLowerCase().contains("familytree")){return;} //TODO ***TEMPORARY*** + + if(PRINT_7Z_BATCH){ + //-mx=4 is fast; -mx=5 or 6 is slow; 7+ is very slow. 
+// System.out.println("C:"+Data.SLASH+"\"Program Files\""+Data.SLASH+"7-Zip"+Data.SLASH+"7z a -mx=4 "+abs+".zip "+abs); + System.out.println("C:\\\"Program Files\"\\7-Zip\\7z a -mx=4 "+abs+".gz "+abs); + }else{ + System.out.println("Compressing "+abs+" to "+(zip ? "zip" : "gz")); + ReadWrite.copyFile(abs, abs+(zip ? ".zip" : ".gz")); + } + + } + + + public static boolean zip=true; + public static boolean gzip=!zip; + + public static boolean PRINT_7Z_BATCH=true; + +} diff --git a/current/fileIO/CopyFile.java b/current/fileIO/CopyFile.java new file mode 100755 index 0000000..7f26729 --- /dev/null +++ b/current/fileIO/CopyFile.java @@ -0,0 +1,128 @@ +package fileIO; + +import java.io.File; +import java.io.FileNotFoundException; +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; +import java.util.zip.ZipOutputStream; + +import align2.Tools; + +import dna.Timer; + + +/** + * Unlike ReadWrite's version, this one forces compression and decompression even with same extensions. + * Mainly for benchmarking. + * @author Brian Bushnell + * @date Jan 23, 2013 + * + */ +public class CopyFile { + + public static void main(String[] args){ + + String in=null, out=null; + boolean overwrite=true; + + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in")){ + in=b; + }else if(a.equals("out")){ + out=b; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF1=!(ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b)); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b);; + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(in==null && i==0 && !args[i].contains("=")){ + in=args[i]; + }else if(out==null && i==1 && !args[i].contains("=")){ + out=args[i]; + } + } + } + assert(in!=null && out!=null); + long bytes=new File(in).length(); + Timer t=new Timer(); + t.start(); + copyFile(in, out, false, overwrite); + t.stop(); + double mbps1=bytes*1000d/t.elapsed; + System.err.println("Time: \t"+t); + System.err.println(String.format("Speed: \t%.2f MB/s", mbps1)); + } + + + public static synchronized void copyFile(String source, String dest, boolean createPathIfNeeded, boolean overwrite){ + + assert(overwrite || !new File(dest).exists()) : "Destination file already exists: "+dest; + if(createPathIfNeeded){ + File parent=new File(dest).getParentFile(); + if(parent!=null && !parent.exists()){ + parent.mkdirs(); + 
} + } + + try{ + InputStream in=ReadWrite.getInputStream(source, false, true); + OutputStream out=ReadWrite.getOutputStream(dest, false, false, true); + + final byte[] buffer=new byte[16384]; + int len; + + while((len = in.read(buffer)) > 0){ + out.write(buffer, 0, len); + } + + in.close(); + out.flush(); + if(out.getClass()==ZipOutputStream.class){ + ZipOutputStream zos=(ZipOutputStream)out; + zos.closeEntry(); + zos.finish(); + } + // else if(PROCESS_XZ && out.getClass()==org.tukaani.xz.XZOutputStream.class){ + // org.tukaani.xz.XZOutputStream zos=(org.tukaani.xz.XZOutputStream)out; + // zos.finish(); + // } + out.close(); + + }catch(FileNotFoundException e){ + throw new RuntimeException(e); + }catch(IOException e){ + throw new RuntimeException(e); + } + } + +} diff --git a/current/fileIO/CopyFiles.java b/current/fileIO/CopyFiles.java new file mode 100755 index 0000000..071f43e --- /dev/null +++ b/current/fileIO/CopyFiles.java @@ -0,0 +1,64 @@ +package fileIO; + +import java.io.File; + +import dna.Data; + + +public class CopyFiles { + + + public static void main(String[] args){ + for(String s : args){ + renameFiles(s); + } + } + + + public static void renameFiles(String path){ + File f=new File(path); + renameFiles(f); + } + + public static void renameFiles(File path){ + + if(path.isDirectory()){ + File[] array=path.listFiles(); + for(File f : array){renameFiles(f);} + }else{ + rename(path); + } + + } + + public static void rename(File in){ + assert(in.exists()); + assert(in.isFile()); + String abs=in.getAbsolutePath(); + + + int dot=abs.lastIndexOf('.'); + int slash=abs.lastIndexOf('/'); + +// String[] split=Person.parsePath(abs.substring(0, slash)); +// String name=split[0]; +// String out=abs.substring(0, dot)+"_"+name+".txt"; + + + + String fname=abs.substring(slash+1); + +// System.out.println(fname); + + + if(fname.startsWith("chr") && fname.endsWith(".txt")){ + + String out=abs.replace(".txt", ".flow"); + assert(!out.equals(abs)) : out+", "+abs; + + 
System.out.println("Renaming "+abs+" to "+out); + ReadWrite.copyFile(abs, out); + } + } + +} diff --git a/current/fileIO/CopyFiles2.java b/current/fileIO/CopyFiles2.java new file mode 100755 index 0000000..5eeaec5 --- /dev/null +++ b/current/fileIO/CopyFiles2.java @@ -0,0 +1,163 @@ +package fileIO; + +import java.io.File; + +import dna.Data; +import dna.Timer; + + +public class CopyFiles2 { + + + public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + + if(args.length>0){ + assert(args.length==2); + inRoots=new String[] {args[0]}; + outRoot=args[1]; + } + + for(String inRoot : inRoots){ + copyFiles(inRoot, outRoot); + } + + t.stop(); + System.out.println("Time:\t"+t); + } + + + public static void copyFiles(String in, String out){ + File fin=new File(in); + File fout=new File(out); + copyFiles(fin, fout); + } + + public static void copyFiles(File in, File out){ + + String abs=in.getAbsolutePath(); + for(String s : badNames){ + if(abs.matches(s)){ + return; + } + } + + { + String temp=out.getAbsolutePath(); + if(temp.endsWith("\\ASM")){ + temp=temp.replace("\\ASM", ""); + }else if(temp.contains("\\ASM\\")){ + temp=temp.replace("\\ASM\\", ""); + } + out=new File(temp); + } + + if(in.isDirectory()){ +// System.out.println("PATH: "+in.getAbsolutePath()); + if(!out.exists()){ + out.mkdir(); + } + + File[] array=in.listFiles(); + for(File f : array){ +// String outname=f.getAbsolutePath().replace(inRoot, outRoot); + + String outname=out.getAbsolutePath()+"\\"+f.getName(); + + File f2=new File(outname); + copyFiles(f, f2); + } + } + + else{ + copyFile(in, out); + } + + } + + public static void copyFile(File in, File out){ + assert(in.exists()); + assert(in.isFile()); + + if(out.exists()){ + System.out.println("Skipping existing file "+out.getAbsolutePath()); + return; + } + + String abs=in.getAbsolutePath(); + String fname=in.getName(); + + boolean valid=false; + + for(String s : badNames){ + if(fname.matches(s)){ + valid=false; + return; + } + } + 
+ for(String s : dirNames){ + if(abs.contains(s)){ + valid=true; + break; + } + } + + for(String s : fileNames){ + if(valid){break;} + if(fname.matches(s)){ + valid=true; + } + } + + if(!valid){return;} + + if(abs.endsWith(".tsv")/* && in.length()>4000000*/){ + out=new File(out.getAbsolutePath()+".zip"); + } + +// if(abs.endsWith(".bz2")){ +// out=new File(out.getAbsolutePath().replace(".bz2", ".zip")); +// } + + System.out.println("Copying file to "+out.getAbsolutePath()); + ReadWrite.copyFile(in.getAbsolutePath(), out.getAbsolutePath()); + + } + +// public static String[] inRoots={"F:\\UTSW_batch_1\\", "F:\\UTSW_batch_2\\"}; + public static String[] inRoots={"F:\\UTSW_second_set\\"}; + public static String outRoot="C:\\Data\\OCT_8\\"; + + public static final String[] dirNames={"\\CNV\\", "\\SV\\"}; + + public static final String[] fileNamesAbsolute={ + ".*\\\\gene-GS.+-ASM.*\\.tsv.*", + ".*\\\\geneVarSummary-GS.+-ASM.*\\.tsv.*", + ".*\\\\summary-GS.+-ASM.*\\.tsv.*", + ".*\\\\var-GS.+-ASM.*\\.tsv.*", + ".*\\\\manifest\\.all", + ".*\\\\README\\..*", + ".*\\\\version", + }; + + public static final String[] fileNames={ + "gene-GS.+-ASM.*\\.tsv.*", + "geneVarSummary-GS.+-ASM.*\\.tsv.*", + "summary-GS.+-ASM.*\\.tsv.*", + "var-GS.+-ASM.*\\.tsv.*", + "manifest\\.all", + "README\\..*", + "version", + }; + + public static final String[] badNames={ + ".*AppleDouble.*", + ".*DS_Store.*", + ".*EVIDENCE.*" + }; + + +} diff --git a/current/fileIO/FileFormat.java b/current/fileIO/FileFormat.java new file mode 100755 index 0000000..dd85e86 --- /dev/null +++ b/current/fileIO/FileFormat.java @@ -0,0 +1,387 @@ +package fileIO; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.util.Arrays; + +/** + * @author Brian Bushnell + * @date Dec 19, 2012 + * + */ +public final class FileFormat { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + 
/*--------------------------------------------------------------*/ + + public static FileFormat testInput(String fname, String overrideExtension, boolean allowSubprocess){ + return testInput(fname, FASTQ, overrideExtension, allowSubprocess, true); + } + + public static FileFormat testInput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean allowFileRead){ + if(fname==null){return null;} + int overrideFormat=0; + int overrideCompression=0; + if(overrideExtension!=null && overrideExtension.length()>0){ + int[] a=testFormat(overrideExtension, false); + if(a!=null){ + overrideFormat=a[0]; + if(a[1]!=RAW){overrideCompression=a[1];} + } + } + return testInput(fname, defaultFormat, overrideFormat, overrideCompression, allowFileRead, allowSubprocess); + } + + public static FileFormat testInput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean allowFileRead){ + if(fname==null){return null;} + return new FileFormat(fname, READ, defaultFormat, overrideFormat, overrideCompression, allowFileRead, false, allowSubprocess, false); + } + + public static FileFormat testOutput(String fname, int defaultFormat, String overrideExtension, boolean allowSubprocess, boolean overwrite, boolean ordered){ + if(fname==null){return null;} + int overrideFormat=0; + int overrideCompression=0; + if(overrideExtension!=null && overrideExtension.length()>0){ + int[] a=testFormat(overrideExtension, false); + if(a!=null){ + overrideFormat=a[0]; + if(a[1]!=RAW){overrideCompression=a[1];} + } + } + return testOutput(fname, defaultFormat, overrideFormat, overrideCompression, allowSubprocess, overwrite, ordered); + } + + public static FileFormat testOutput(String fname, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowSubprocess, boolean overwrite, boolean ordered){ + if(fname==null){return null;} + return new FileFormat(fname, WRITE, defaultFormat, overrideFormat, 
overrideCompression, false, overwrite, allowSubprocess, ordered); + } + + /*--------------------------------------------------------------*/ + /*---------------- Constructor ----------------*/ + /*--------------------------------------------------------------*/ + + private FileFormat(String fname, int mode_, int defaultFormat, int overrideFormat, int overrideCompression, boolean allowFileRead + , boolean overwrite_, boolean allowSubprocess_, boolean ordered_){ +// , boolean interleaved_, boolean colorspace_, long maxReads_){ + + if(verbose){ + new Exception().printStackTrace(System.err); + System.err.println("FileFormat(fname="+fname+", mode="+mode_+", dFormat="+defaultFormat+", oFormat="+overrideFormat+", oCompression="+overrideCompression+ + ", allowRead="+allowFileRead+", ow="+overwrite_+", allowSub="+allowSubprocess_+", ordered="+ordered_+")"); + } + + assert(fname!=null); + fname=fname.trim().replace('\\', '/'); + assert(fname.trim().length()>0) : fname; + + if(defaultFormat<1){defaultFormat=FQ;} + allowFileRead&=(mode_==READ); + int[] a=testFormat(fname, allowFileRead); + + if(verbose){System.err.println(Arrays.toString(a));} + + if(a[0]==UNKNOWN && overrideFormat<1){ + a[0]=defaultFormat; + if(defaultFormat!=TEXT){ + System.err.println("Unspecified format for "+(mode_==READ ? "input" : "output")+" "+(fname==null ? "stream" : fname)+"; defaulting to "+FORMAT_ARRAY[a[0]]+"."); + } + } + if(verbose){System.err.println(Arrays.toString(a));} + + if(overrideFormat>0){a[0]=overrideFormat;} + if(overrideCompression>0){a[1]=overrideCompression;} + + if(verbose){System.err.println(Arrays.toString(a));} + + name=fname; + format=a[0]; + compression=a[1]; + type=a[2]; + mode=mode_; + + overwrite=overwrite_; + allowSubprocess=allowSubprocess_; + ordered=ordered_; + +// interleaved=interleaved_; +// colorspace=colorspace_; +// maxReads=write() ? 
-1 : maxReads_; + + assert(!unknownFormat()) : "Unknown file format for "+fname+"\n"+ + mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowFileRead+", "+overwrite_+", "+allowSubprocess_; + assert(!unknownCompression()) : "Unknown compression for "+fname+"\n"+ + mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowFileRead+", "+overwrite_+", "+allowSubprocess_; + assert(!unknownType()) : "Unknown stream type for "+fname+"\n"+ + mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowFileRead+", "+overwrite_+", "+allowSubprocess_; + assert(!unknownMode()) : "Unknown I/O mode for "+fname+"\n"+ + mode_+", "+defaultFormat+", "+overrideFormat+", "+overrideCompression+", "+allowFileRead+", "+overwrite_+", "+allowSubprocess_; + } + + /*--------------------------------------------------------------*/ + /*---------------- Methods ----------------*/ + /*--------------------------------------------------------------*/ + + @Override + public String toString(){ + StringBuilder sb=new StringBuilder(); + sb.append(name).append(','); + sb.append(format+"("+FORMAT_ARRAY[format]+")").append(','); + sb.append(compression+"("+COMPRESSION_ARRAY[compression]+")").append(','); + sb.append(type+"("+TYPE_ARRAY[type]+")").append(','); + sb.append(mode+"("+MODE_ARRAY[mode]+")").append(','); + sb.append("ow="+(overwrite ? "t" : "f")).append(','); + sb.append("sub="+(allowSubprocess ? "t" : "f")).append(','); + sb.append("ordered="+(ordered ? 
"t" : "f")); + return sb.toString(); + } + + + /*--------------------------------------------------------------*/ + /*---------------- Static Methods ----------------*/ + /*--------------------------------------------------------------*/ + + private static int[] testFormat(String fname, boolean allowFileRead){ + int[] r=new int[] {UNKNOWN, RAW, FILE}; + if(fname==null || fname.length()<1){ + r[2]=STDIO; + return r; + } + String slc=fname.trim().toLowerCase(); + if(slc.indexOf('/')<0){slc=slc.substring(slc.lastIndexOf('/')+1);} + if(slc.indexOf('.')<0){slc="."+slc;} + String comp=ReadWrite.compressionType(slc); + String ext=ReadWrite.rawExtension(slc); + + if(ext==null){} + else if(ext.equals("fq") || ext.equals("fastq")){r[0]=FASTQ;} + else if(ext.equals("fa") || ext.equals("fasta") || ext.equals("fas") || ext.equals("fna") || ext.equals("ffn") + || ext.equals("frn") || ext.equals("seq")|| ext.equals("fsa")){r[0]=FASTA;} + else if(/*ext.equals("txt") || */ext.equals("bread")){r[0]=BREAD;} + else if(ext.equals("sam")){r[0]=SAM;} + else if(ext.equals("csfasta")){r[0]=CSFASTA;} + else if(ext.equals("qual")){r[0]=QUAL;} + else if(ext.equals("bam")){r[0]=BAM;} + else if(ext.equals("sites") || ext.equals("sitesonly")){r[0]=SITES;} + else if(ext.equals("info") || ext.equals("attachment")){r[0]=ATTACHMENT;} + else if(ext.equals("scarf")){r[0]=SCARF;} + + if(comp==null){} + else if(comp.equals("gz")){r[1]=GZ;} + else if(comp.equals("zip")){r[1]=ZIP;} + else if(comp.equals("bz2")){r[1]=BZ2;} + else if(comp.equals("xz")){r[1]=XZ;} + +// assert(false) : Arrays.toString(r); + + + if(slc.length()>2 && slc.charAt(0)=='s' && slc.charAt(1)=='t'){ + if(slc.equals("stdin") || slc.startsWith("stdin.") || slc.equals("standardin")){r[2]=STDIO;} + else if(slc.equals("stdout") || slc.startsWith("stdout.") || slc.equals("standardout")){r[2]=STDIO;} + }else if("/dev/null".equalsIgnoreCase(slc)){ + r[2]=DEVNULL; + } + + if(r[0]==UNKNOWN){ + File f=(allowFileRead && r[2]==FILE ? 
new File(fname) : null); + if(f!=null && f.exists() && !f.isDirectory()){ + InputStream is=ReadWrite.getInputStream(fname, false, false); + int b=-1; + try { + b=is.read(); + is.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(b=='>'){r[0]=FA;} + else if(b=='@'){r[0]=FQ;} //TODO: Note - could be sam + else{r[0]=BREAD;} + }else{ + if(fname.equals("sequential")){r[0]=SEQUENTIAL;} + else if(fname.equals("random")){r[0]=RANDOM;} + else if(fname.equals("sitesonly")){r[0]=SITES;} + } + } + + + if(r[2]==STDIO && allowFileRead){ + File f=new File(fname); + if(f.exists() && !f.isDirectory()){r[2]=FILE;} + } +// else{ +// r[2]=FILE; //What is this for? +// } + + return r; + } + + public static boolean hasFastaExtension(String fname){ + int[] r=testFormat(fname, false); + return r[0]==FA; + } + + public static boolean hasFastqExtension(String fname){ + int[] r=testFormat(fname, false); + return r[0]==FQ; + } + + /*--------------------------------------------------------------*/ + /*---------------- Getters ----------------*/ + /*--------------------------------------------------------------*/ + + public final String name(){return name;} + public final int format(){return format;} + public final int compression(){return compression;} + public final int type(){return type;} + public final int mode(){return mode;} + + public final boolean hasName(){return name!=null;} + public final boolean canWrite(){ + assert(write()); + if(stdio() || devnull()){return true;} + assert(hasName()); + File f=new File(name); + if(!f.exists()){return true;} + if(!f.canWrite()){return false;} + return overwrite(); + } + public final boolean canRead(){ + assert(read()); + if(stdio()){return true;} + assert(hasName()); + File f=new File(name); + return f.canRead(); + } + + public final boolean unknownField(){return unknownFormat() || unknownCompression() || unknownType() || unknownMode();} + + public final boolean unknownFormat(){return 
format<=UNKNOWN;} + public final boolean fasta(){return format==FASTA;} + public final boolean fastq(){return format==FASTQ;} + public final boolean bread(){return format==BREAD;} + public final boolean sam(){return format==SAM;} + public final boolean samOrBam(){return format==SAM || format==BAM;} + public final boolean csfasta(){return format==CSFASTA;} + public final boolean qual(){return format==QUAL;} + public final boolean sequential(){return format==SEQUENTIAL;} + public final boolean random(){return format==RANDOM;} + public final boolean sites(){return format==SITES;} + public final boolean attachment(){return format==ATTACHMENT;} + public final boolean bam(){return format==BAM;} + public final boolean scarf(){return format==SCARF;} + public final boolean text(){return format==TEXT;} + + public final boolean unknownCompression(){return compression<=UNKNOWN;} + public final boolean raw(){return compression==RAW;} + public final boolean gzip(){return compression==GZIP;} + public final boolean zip(){return compression==ZIP;} + public final boolean bz2(){return compression==BZ2;} + public final boolean xz(){return compression==XZ;} + public final boolean sevenz(){return compression==SEVENZ;} + + public final boolean unknownType(){return type<=UNKNOWN;} + public final boolean file(){return type==FILE;} + public final boolean stdio(){return type==STDIO;} + public final boolean devnull(){return type==DEVNULL;} + + public final boolean unknownMode(){return mode<=UNKNOWN;} + public final boolean read(){return mode==READ;} + public final boolean write(){return mode==WRITE;} + + public final boolean overwrite(){return overwrite;} + public final boolean allowSubprocess(){return allowSubprocess;} + public final boolean ordered(){return ordered;} + +// public final boolean interleaved(){return interleaved;} +// public final boolean colorspace(){return colorspace;} +// public final long maxReads(){return maxReads;} + + 
/*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + private final String name; + private final int format; + private final int compression; + private final int type; + private final int mode; + + private final boolean overwrite; + private final boolean allowSubprocess; + private final boolean ordered; + +// private final boolean interleaved; +// private final boolean colorspace; +// private final long maxReads; + + /*--------------------------------------------------------------*/ + /*---------------- Statics ----------------*/ + /*--------------------------------------------------------------*/ + + public static boolean verbose=false; + + /*--------------------------------------------------------------*/ + /*---------------- Constants ----------------*/ + /*--------------------------------------------------------------*/ + + private static final int UNKNOWN=0; + + /* Format */ + + public static final int FA=1, FASTA=1; + public static final int FQ=2, FASTQ=2; + public static final int BREAD=3; + public static final int SAM=4; + public static final int CSFASTA=5; + public static final int QUAL=6; + public static final int SEQUENTIAL=7; + public static final int RANDOM=8; + public static final int SITES=9; + public static final int ATTACHMENT=10; + public static final int BAM=11; + public static final int SCARF=12; + public static final int TEXT=13; + + private static final String[] FORMAT_ARRAY=new String[] { + "unknown", "fasta", "fastq", "bread", "sam", "csfasta", + "qual", "sequential", "random", "sites", "attachment", + "bam", "scarf", "text" + }; + + /* Compression */ + + public static final int RAW=1; + public static final int GZ=2, GZIP=2; + public static final int ZIP=3; + public static final int BZ2=4; + public static final int XZ=5; + public static final int c4=6; + public static final int SEVENZ=7; + + private static 
final String[] COMPRESSION_ARRAY=new String[] { + "unknown", "raw", "gz", "zip", "bz2", "xz", + "c4", "7z" + }; + + /* Type */ + + public static final int FILE=1; + public static final int STDIO=2, STDIN=2, STDOUT=2; + public static final int DEVNULL=3; +// public static final int NULL=4; + + private static final String[] TYPE_ARRAY=new String[] { + "unknown", "file", "stdio", "devnull" + }; + + /* Mode */ + + public static final int READ=1, WRITE=2; + + private static final String[] MODE_ARRAY=new String[] { + "unknown", "read", "write" + }; + +} diff --git a/current/fileIO/FindFiles.java b/current/fileIO/FindFiles.java new file mode 100755 index 0000000..e22e70a --- /dev/null +++ b/current/fileIO/FindFiles.java @@ -0,0 +1,113 @@ +package fileIO; + +import java.io.File; +import java.util.ArrayList; + + +public class FindFiles { + + + public static void main(String[] args){ + + String root=args[0]; +// if(root.equals(".")){root=null;} + String prefix=args[1]; + String suffix=(args[2].equals("null") ? null : args[2]); + String middle=null; + + if(args.length>3){ + middle=(args[3].equals("null") ? null : args[3]); + } + + boolean NEWLINE=true; + boolean BOTH=true; + + ArrayList results=findFiles(root, prefix, suffix, middle); + for(String s : results){ + if(NEWLINE){ + System.out.println(s); + }else{ + System.out.print(s+" "); + } + } + + + if(BOTH){ + System.out.println(); + NEWLINE=!NEWLINE; + for(String s : results){ + if(NEWLINE){ + System.out.println(s); + }else{ + System.out.print(s+" "); + } + } + } + } + + + public FindFiles(String pre, String suf, String mid){ + assert(!"*".equals(pre)) : "Use # instead of *, which has problems from the command line"; + assert(!"*".equals(suf)) : "Use # instead of *, which has problems from the command line"; + prefix=((pre==null || pre.equals("*") || pre.equals("#")) ? null : pre.toLowerCase()); + suffix=((suf==null || suf.equals("*") || suf.equals("#")) ? 
null : suf.toLowerCase()); + middle=((mid==null || mid.equals("*") || mid.equals("#")) ? null : mid.toLowerCase()); + } + + public static ArrayList findFiles(String root, String prefix, String suffix){ + return findFiles(root, prefix, suffix, null); + } + + public static ArrayList findFiles(String root, String prefix, String suffix, String mid){ + FindFiles ff=new FindFiles(prefix, suffix, mid); + return ff.findFiles(root); + } + + public ArrayList findFiles(String path){ + findFiles(new File(path==null ? "." : path)); + return results; + } + + public ArrayList findFiles(File path){ + + if(path.isDirectory()){ + File[] array=path.listFiles(); + if(array==null){System.err.println("null contents for "+path.getAbsolutePath());} + else{for(File f : array){findFiles(f);}} + }else{ + consider(path); + } + return results; + } + + public void consider(File in){ +// System.out.println("Considering "+in.getAbsolutePath()+" versus '"+prefix+"' '"+suffix+"'"); + if(!in.exists()){return;} + assert(in.exists()) : in; + assert(in.isFile()); + String abs=in.getAbsolutePath(); +// System.out.println("Considering "+abs); + String abs2=abs.toLowerCase(); + int slashLoc=abs2.lastIndexOf(slash); + if(slashLoc>-1){ + abs2=abs2.substring(slashLoc+1); + } +// System.out.println("a"); + if(prefix!=null && !abs2.startsWith(prefix)){return;} +// System.out.println("b"); + if(suffix!=null && !abs2.endsWith(suffix)){return;} +// System.out.println("c"); + + if(middle!=null && !abs2.contains(middle)){return;} + + results.add(abs); + } + + + public ArrayList results=new ArrayList(); + public String prefix; + public String suffix; + public String middle; + public static final char slash=System.getProperty("file.separator").charAt(0); + +} diff --git a/current/fileIO/GenericTextFile.java b/current/fileIO/GenericTextFile.java new file mode 100755 index 0000000..0b7cb2b --- /dev/null +++ b/current/fileIO/GenericTextFile.java @@ -0,0 +1,36 @@ +package fileIO; + +import java.util.ArrayList; + +public 
class GenericTextFile extends TextFile { + + public GenericTextFile(String name) { + super(name, false, false); + } + + + + + public String[] toLines(){ + + String s=null; + ArrayList list=new ArrayList(4096); + + for(s=nextLine(); s!=null; s=nextLine()){ + list.add(s); + } + + return list.toArray(new String[list.size()]); + + } + + public String nextLine(){ + String line=readLine(); + while(line!=null && false){ + line=readLine(); + } + return line; + } + + +} diff --git a/current/fileIO/LoadThread.java b/current/fileIO/LoadThread.java new file mode 100755 index 0000000..2a9945b --- /dev/null +++ b/current/fileIO/LoadThread.java @@ -0,0 +1,128 @@ +package fileIO; + +import java.util.Arrays; + +import align2.Shared; +import align2.Tools; + +/** + * @author Brian Bushnell + * @date Jan 2, 2013 + * + */ +public class LoadThread extends Thread{ + + public static LoadThread load(String fname, Class c){ + LoadThread lt=new LoadThread(fname, c); + lt.start(); + return lt; + } + + private LoadThread(String fname_, Class c_){ + fname=fname_; + c=c_; + addThread(1); + } + + @Override + public void run(){ + addRunningThread(1); + output=ReadWrite.read(c, fname); + addRunningThread(-1); + synchronized(this){this.notify();} + } + + + private static final int addThread(int x){ + synchronized(activeThreads){ + assert(x!=0); + if(x>0){ + activeThreads[0]+=x; + activeThreads[1]+=x; + }else{ + addRunningThread(x); + } + assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 && + activeThreads[2]>=0 && activeThreads[2]<=LIMIT) : Arrays.toString(activeThreads); + + return activeThreads[0]; + } + } + + private static final int addRunningThread(int x){ + synchronized(activeThreads){ + assert(x!=0); + if(x>0){ + assert(activeThreads[1]>=x); + while(activeThreads[2]>=LIMIT){ + try { + activeThreads.wait(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + activeThreads[1]-=x; 
//Remove from waiting + }else{ + activeThreads[0]+=x; //Remove from active + } + activeThreads[2]+=x; //Change number running + + assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 && + activeThreads[2]>=0 && activeThreads[2]<=LIMIT) : Arrays.toString(activeThreads); + + if(activeThreads[2]==0 || (activeThreads[2]0)){activeThreads.notify();} + return activeThreads[2]; + } + } + + public static final int countActiveThreads(){ + synchronized(activeThreads){ + assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 && + activeThreads[2]>=0 && activeThreads[2]<=LIMIT) : Arrays.toString(activeThreads); + return activeThreads[0]; + } + } + + public static final void waitForReadingToFinish(){ + synchronized(activeThreads){ + while(activeThreads[0]>0){ + assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 && + activeThreads[2]>=0 && activeThreads[2]<=LIMIT) : Arrays.toString(activeThreads); + try { + activeThreads.wait(8000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(activeThreads[2]==0 || (activeThreads[2]0)){activeThreads.notify();} + } + } + } + + public final void waitForThisToFinish(){ + if(output==null){ + while(this.getState()!=State.TERMINATED){ + try { + this.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + } + + /** {active, waiting, running}
+ * Active means running or waiting. + */ + public static int[] activeThreads={0, 0, 0}; + + private final String fname; + private final Class c; + public X output=null; + + private static final int[] RUNNING=new int[1]; + public static int LIMIT=Tools.min(8, Tools.max(Shared.THREADS, 1)); + +} diff --git a/current/fileIO/MatrixFile.java b/current/fileIO/MatrixFile.java new file mode 100755 index 0000000..1a4b0c1 --- /dev/null +++ b/current/fileIO/MatrixFile.java @@ -0,0 +1,89 @@ +package fileIO; +import dna.Matrix; + + + +public class MatrixFile extends TextFile{ + + public static void main(String[] args){ + + try { + //Name of mat file + String name=args[0]; + + MatrixFile mat=new MatrixFile(name); + + String s=null; + + for(s=mat.readLine(); s!=null; s=mat.readLine()){ + System.out.println(s); + } + } catch (Exception e) { + throw new RuntimeException(e); + } + + } + + + public MatrixFile(String name){super(name, false, false);} + + public String nextLine(){ + String line=readLine(); + + while(line!=null && line.charAt(0)!='{' && line.charAt(0)!='/'){ + line=readLine(); + } + return line; + } + + public Matrix nextMatrix(){ + String line; + String[] split; + + line=nextLine(); + if(line==null || line.startsWith("//end")){return null;} + + assert(line.startsWith("//name: ")) : line; + String name=line.replace("//name: ","").trim(); + + line=nextLine(); + assert(line.startsWith("//size: ")) : line; + line=line.replace("//size: ",""); + split=line.split("x"); + int length=Integer.parseInt(split[0]); + int width=Integer.parseInt(split[1]); + + line=nextLine(); + assert(line.startsWith("//prefix: ")) : line; + line=line.replace("//prefix: ",""); + int prefix=Integer.parseInt(line); + + line=nextLine(); + assert(line.startsWith("//count: ")) : line; + line=line.replace("//count: ",""); + int count=Integer.parseInt(line); + + + float[][] grid=new float[length][width]; + for(int i=0; i0; len=is.read(buf)){ + os.write(buf, 0, len); + } + } catch (IOException e) { + // 
TODO Auto-generated catch block + e.printStackTrace(); + } + + if(is!=System.in){ + try { + is.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + if(os!=System.out && os!=System.err){ + ReadWrite.close(os); + } + + synchronized(this){ + finished=true; + this.notify(); + } + } + + public boolean finished(){ + synchronized(this){ + return finished; + } + } + + public void terminate(){ + synchronized(this){ + if(!finished){ + finished=true; + interrupt(); + } + } + } + +// public static void killList(){ +// System.err.println("Kill list."); +// synchronized(list){ +// for(PipeThread pt : list){ +// if(!pt.finished){ +// pt.terminate(); +// } +// } +// } +// } + + public final InputStream is; + public final OutputStream os; + private volatile boolean finished=false; + +// private static ArrayList list=new ArrayList(8); + +} diff --git a/current/fileIO/ReadWrite.java b/current/fileIO/ReadWrite.java new file mode 100755 index 0000000..ab1f2c7 --- /dev/null +++ b/current/fileIO/ReadWrite.java @@ -0,0 +1,1562 @@ +package fileIO; + +import java.io.BufferedInputStream; +import java.io.BufferedOutputStream; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileNotFoundException; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.io.ObjectInputStream; +import java.io.ObjectOutputStream; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.io.Reader; +import java.lang.ProcessBuilder.Redirect; +import java.net.MalformedURLException; +import java.net.URL; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; +import java.util.zip.GZIPInputStream; +import java.util.zip.GZIPOutputStream; +import java.util.zip.ZipEntry; +import java.util.zip.ZipInputStream; +import java.util.zip.ZipOutputStream; + +import 
stream.ConcurrentReadStreamInterface; +import stream.RTextOutputStream3; + +import align2.Shared; +import align2.Tools; + +import dna.Data; + +public class ReadWrite { + + + public static void main(String[] args){ + File f=new File(args[1]); + assert(!f.exists()) : "Destination file already exists."; + copyFile(args[0], args[1]); + } + + public static void writeStringInThread(CharSequence x, String fname){ + writeStringInThread(x, fname, false); + } + + public static void writeStringInThread(CharSequence x, String fname, boolean append){ + addThread(1); + new Thread(new WriteStringThread(x, fname, append)).start(); + } + + public static void writeObjectInThread(Object x, String fname, boolean allowSubprocess){ + addThread(1); + new Thread(new WriteObjectThread(x, fname, allowSubprocess)).start(); + } + + private static class WriteStringThread implements Runnable{ + + private final CharSequence x; + private final String fname; + private final boolean append; + WriteStringThread(CharSequence x_, String fname_, boolean append_){ + x=x_; + fname=fname_; + append=append_; + } + + @Override + public void run() { + if(verbose){System.err.println("WriteStringThread.run() started for fname "+fname);} + addRunningThread(1); + writeStringAsync(x, fname, append); + addThread(-1); + if(verbose){System.err.println("WriteStringThread.run() finished for fname "+fname);} + } + + } + + private static class WriteObjectThread implements Runnable{ + + private final Object x; + private final String fname; + private final boolean allowSubprocess; + WriteObjectThread(Object x_, String fname_, boolean allowSubprocess_){ + x=x_; + fname=fname_; + allowSubprocess=allowSubprocess_; + } + + @Override + public void run() { + if(verbose){System.err.println("WriteObjectThread.run() started for fname "+fname);} + addRunningThread(1); +// System.out.println(fname+" began writing."); + writeAsync(x, fname, allowSubprocess); +// System.out.println(fname+" finished writing."); + addThread(-1); +// 
System.out.println(fname+" reports "+countActiveThreads()+" active threads."); + if(verbose){System.err.println("WriteObjectThread.run() finished for fname "+fname);} + } + + } + + public static boolean setPermissions(String fname, boolean read, boolean write, boolean execute, boolean ownerOnly){ + File f=new File(fname); + if(!f.exists()){return false;} + try { + f.setReadable(read, ownerOnly); + f.setWritable(write, ownerOnly); + f.setExecutable(execute, ownerOnly); + } catch (Exception e) { + return false; + } + return true; + } + + public static void writeString(CharSequence x, String fname){writeString(x, fname, false);} + public static void writeString(CharSequence x, String fname, boolean append){ + if(verbose){System.err.println("writeString(x, "+fname+", "+append+")");} + OutputStream os=getOutputStream(fname, append, true, false); + + try { + + synchronized(diskSync){ + PrintWriter out=new PrintWriter(os); + out.print(x); + out.flush(); + + if(os.getClass()==ZipOutputStream.class){ + ZipOutputStream zos=(ZipOutputStream)os; + zos.closeEntry(); + zos.finish(); + } +// else if(PROCESS_XZ && os.getClass()==org.tukaani.xz.XZOutputStream.class){ +// org.tukaani.xz.XZOutputStream zos=(org.tukaani.xz.XZOutputStream)os; +// zos.finish(); +// } + out.close(); + } +// System.out.println("Wrote to "+fname); + +// String read=readString(fname); +// assert(x.equals(read)) : x.length()+", "+read.length(); + + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static void writeStringAsync(CharSequence x, String fname){writeStringAsync(x, fname, false);} + public static void writeStringAsync(CharSequence x, String fname, boolean append){ + if(verbose){System.err.println("writeStringAsync(x, "+fname+", "+append+")");} + + OutputStream os=getOutputStream(fname, append, true, false); + + try { + + synchronized(diskSync){ + PrintWriter out=new PrintWriter(os); + out.print(x); + 
out.flush(); + + if(os.getClass()==ZipOutputStream.class){ + ZipOutputStream zos=(ZipOutputStream)os; + zos.closeEntry(); + zos.finish(); + } +// else if(PROCESS_XZ && os.getClass()==org.tukaani.xz.XZOutputStream.class){ +// org.tukaani.xz.XZOutputStream zos=(org.tukaani.xz.XZOutputStream)os; +// zos.finish(); +// } + out.close(); + } +// System.out.println("Wrote to "+fname); + +// String read=readString(fname); +// assert(x.equals(read)) : x.length()+", "+read.length(); + + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static void write(X x, String fname, boolean allowSubprocess){ + if(verbose){System.err.println("write(x, "+fname+", "+allowSubprocess+")");} + + OutputStream os=getOutputStream(fname, false, true, allowSubprocess); + + try { + + synchronized(diskSync){ + ObjectOutputStream out=new ObjectOutputStream(os); + out.writeObject(x); + close(out); + } + + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static void writeAsync(X x, String fname, boolean allowSubprocess){ + if(verbose){System.err.println("writeAsync(x, "+fname+", "+allowSubprocess+")");} + + OutputStream os=getOutputStream(fname, false, true, allowSubprocess); + + try { + + ObjectOutputStream out=new ObjectOutputStream(os); + out.writeObject(x); + close(out); + + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public static final boolean finishReading(InputStream is, String fname, boolean killProcess, Reader...ra){ + if(verbose){System.err.println("finishReading("+is+", "+fname+", "+killProcess+", "+ra.length+")");} + boolean error=false; + if(ra!=null){ + for(Reader r : ra){ + try { + r.close(); + } catch (IOException e) { + error=true; + // TODO Auto-generated catch block + e.printStackTrace(); + 
} + } + } + error|=finishReading(is, fname, killProcess); + if(verbose){System.err.println("finishReading("+is+", "+fname+", "+killProcess+", "+ra.length+") returned "+error);} + return error; + } + + public static final boolean finishReading(InputStream is, String fname, boolean killProcess){ + if(verbose){System.err.println("finishReading("+is+", "+fname+", "+killProcess+")");} + boolean error=false; + if(is!=System.in){ + try { + is.close(); + } catch (IOException e) { + error=true; + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(killProcess && fname!=null && is!=System.in){error|=ReadWrite.killProcess(fname);} + if(verbose){System.err.println("finishReading("+is+", "+fname+", "+killProcess+") returned "+error);} + return error; + } + +// public static final boolean finishWriting(PrintWriter writer, OutputStream outStream, String fname){ +// return finishWriting(writer, outStream, fname, fname!=null); +// } + + public static final boolean finishWriting(PrintWriter writer, OutputStream outStream, String fname, boolean killProcess){ + if(verbose){System.err.println("finishWriting("+writer+", "+outStream+" , "+fname+", "+killProcess+")");} + boolean error=false; + if(writer!=null){writer.flush();} + if(outStream!=System.out && outStream!=System.err){ + close(outStream); + } + if(writer!=null){writer.close();} + if(killProcess && fname!=null && outStream!=System.err && outStream!=System.out){error|=ReadWrite.killProcess(fname);} + if(verbose){System.err.println("finishWriting("+writer+", "+outStream+" , "+fname+", "+killProcess+") returned "+error);} + return error; + } + + public static final boolean close(OutputStream os, String fname){ + if(verbose){System.err.println("close("+os+", "+fname+")");} + boolean error=false; + if(os!=null){error|=close(os);} + if(fname!=null && os!=System.err && os!=System.out){error|=killProcess(fname);} + if(verbose){System.err.println("close("+os+", "+fname+") returned "+error);} + return error; + } + + 
public static final boolean close(OutputStream os){ + if(verbose){System.err.println("close("+os+")");} + boolean error=false; + try { + os.flush(); + } catch (IOException e1) { + // TODO Auto-generated catch block + e1.printStackTrace(); + error=true; + } + if(os.getClass()==ZipOutputStream.class){ + ZipOutputStream zos=(ZipOutputStream)os; + try { + zos.closeEntry(); + zos.finish(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + error=true; + } + } +// else if(PROCESS_XZ && os.getClass()==org.tukaani.xz.XZOutputStream.class){ +// org.tukaani.xz.XZOutputStream zos=(org.tukaani.xz.XZOutputStream)os; +// try { +// zos.finish(); +// } catch (IOException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } +// } + try { + os.close(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + error=true; + } + if(verbose){System.err.println("close("+os+") returned "+error);} + return error; + } + + + + @Deprecated + public static OutputStream getOutputStream_old(String fname, boolean append){ + +// fname=fname.replaceAll("\\\\", "/"); + fname=fname.replace('\\', '/'); + assert(fname.indexOf('\\')<0); +// assert(!fname.contains("//")); + + boolean xz=fname.endsWith(".xz"); + boolean gzipped=fname.endsWith(".gz"); + boolean zipped=fname.endsWith(".zip"); + boolean bzipped=PROCESS_BZ2 && fname.endsWith(".bz2"); + final String basename=basename(fname); + + OutputStream out=null; + + try { + if(fname.equals("stdout") || fname.startsWith("stdout.")){ + out=System.out; + }else{ + FileOutputStream fos=new FileOutputStream(fname, append); + out=new BufferedOutputStream(fos); + } + if(RAWMODE){return out;} + + if(zipped){ + ZipOutputStream zos=new ZipOutputStream(out); + zos.setLevel(ZIPLEVEL); + zos.putNextEntry(new ZipEntry(basename)); + out=zos; + }else if(gzipped){ + GZIPOutputStream gos=new GZIPOutputStream(out, 8192){ + { + // def.setLevel(Deflater.DEFAULT_COMPRESSION); + 
def.setLevel(ZIPLEVEL); + } + }; + + out=gos; + }else if(bzipped){ + throw new RuntimeException("bz2 compression not supported in public version."); +// out.write('B'); +// out.write('Z'); +// CBZip2OutputStream zos=new CBZip2OutputStream(out, 8192); +// out=zos; + } + // else if(PROCESS_XZ && xz){ + // org.tukaani.xz.LZMA2Options options = new org.tukaani.xz.LZMA2Options(); + // options.setPreset(ZIPLEVEL); + // org.tukaani.xz.XZOutputStream zos=new org.tukaani.xz.XZOutputStream(out, options); + // out=zos; + // } + + + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + assert(out!=null); + return out; + } + + + public static OutputStream getOutputStream(String fname, boolean append, boolean buffered, boolean allowSubprocess){ + + if(verbose){ + System.err.println("getOutputStream("+fname+", "+append+", "+buffered+", "+allowSubprocess+")"); + new Exception().printStackTrace(System.err); + } + +// assert(false) : fname; //TODO: for testing +// fname=fname.replaceAll("\\\\", "/"); + fname=fname.replace('\\', '/'); + assert(fname.indexOf('\\')<0); +// assert(!fname.contains("//")); + + boolean gzipped=fname.endsWith(".gz") || fname.endsWith(".gzip"); + boolean zipped=fname.endsWith(".zip"); + boolean bzipped=PROCESS_BZ2 && fname.endsWith(".bz2"); + boolean xz=PROCESS_XZ && fname.endsWith(".xz"); + +// assert(false) : fname; + + allowSubprocess=(allowSubprocess && Shared.THREADS>1); + + if(gzipped){ + assert(!append); + return getGZipOutputStream(fname, allowSubprocess); + }else if(zipped){ + assert(!append); + return getZipOutputStream(fname, buffered, allowSubprocess); + }else if(bzipped){ + assert(!append); + return getBZipOutputStream(fname, buffered, allowSubprocess); + }else if(xz){ + assert(!append); + return getXZOutputStream(fname, buffered, allowSubprocess); + } + return getRawOutputStream(fname, append, buffered); + } + + public static OutputStream getRawOutputStream(String 
fname, boolean append, boolean buffered){ + + if(verbose){System.err.println("getRawOutputStream("+fname+", "+append+", "+buffered+")");} + + if(fname.equals("stdout") || fname.startsWith("stdout.")){ + return System.out; + }else if(fname.equals("stderr") || fname.startsWith("stderr.")){ + return System.err; + } + FileOutputStream fos=null; + try { + fos = new FileOutputStream(fname, append); + } catch (FileNotFoundException e) { + synchronized(ReadWrite.class){ + try { + File f=new File(fname); + String parent=f.getParent(); + f=new File(parent); + if(!f.exists()){f.mkdirs();} + fos = new FileOutputStream(fname, append); + } catch (Exception e2) { + throw new RuntimeException(e2); + } + } + } + assert(fos!=null); + if(buffered){return new BufferedOutputStream(fos);} + return fos; + } + + public static OutputStream getXZOutputStream(String fname, boolean buffered, boolean allowSubprocess){ + final OutputStream raw=getRawOutputStream(fname, false, buffered); + if(RAWMODE){return raw;} + throw new RuntimeException("Unsupported format: XZ"); +// try { +// org.tukaani.xz.LZMA2Options options = new org.tukaani.xz.LZMA2Options(); +// options.setPreset(ZIPLEVEL); +// org.tukaani.xz.XZOutputStream out=new org.tukaani.xz.XZOutputStream(raw, options); +// return out; +// } catch (IOException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } +// assert(false); +// return null; + } + + public static OutputStream getBZipOutputStream(String fname, boolean buffered, boolean allowSubprocess){ + if(verbose){System.err.println("getBZipOutputStream("+fname+", "+buffered+", "+allowSubprocess+")");} + final OutputStream raw=getRawOutputStream(fname, false, RAWMODE); + if(RAWMODE){return raw;} + throw new RuntimeException("bz2 compression not supported in public version."); +// try { +// raw.write('B'); +// raw.write('Z'); +// CBZip2OutputStream out=new CBZip2OutputStream(raw, 8192); +// return out; +// } catch (IOException e) { +// // TODO Auto-generated catch 
block +// e.printStackTrace(); +// } +// assert(false); +// return null; + } + + public static OutputStream getZipOutputStream(String fname, boolean buffered, boolean allowSubprocess){ + if(verbose){System.err.println("getZipOutputStream("+fname+", "+buffered+", "+allowSubprocess+")");} + final OutputStream raw=getRawOutputStream(fname, false, buffered); + if(RAWMODE){return raw;} + try { + ZipOutputStream out=new ZipOutputStream(raw); + out.setLevel(ZIPLEVEL); + final String basename=basename(fname); + out.putNextEntry(new ZipEntry(basename)); + return out; + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + assert(false); + return null; + } + + public static OutputStream getGZipOutputStream(String fname, boolean allowSubprocess){ + if(verbose){System.err.println("getGZipOutputStream("+fname+", "+allowSubprocess+")");} + if(allowSubprocess && Shared.THREADS>2){ + if(USE_PIGZ && Data.PIGZ() && (Data.SH() /*|| fname.equals("stdout") || fname.startsWith("stdout.")*/)){return getPigzStream(fname);} + if(USE_GZIP && Data.GZIP() && (Data.SH() /*|| fname.equals("stdout") || fname.startsWith("stdout.")*/)){return getGzipStream(fname);} + } + + final OutputStream raw=getRawOutputStream(fname, false, false); + if(RAWMODE){return raw;} + try { + final GZIPOutputStream out=new GZIPOutputStream(raw, 8192){ + { + // def.setLevel(Deflater.DEFAULT_COMPRESSION); + def.setLevel(ZIPLEVEL); + } + }; + return out; + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + assert(false); + return null; + } + + public static OutputStream getPigzStream(String fname){ + if(verbose){System.err.println("getPigzStream("+fname+")");} + int threads=Tools.min(MAX_ZIP_THREADS, Tools.max(Shared.THREADS/Tools.max(ZIP_THREAD_DIVISOR, 1), 1)); + threads=Tools.max(1, threads); + int zl=ZIPLEVEL; + if(threads>=4 && zl>0 && zl<4){zl=4;} + OutputStream out=getOutputStreamFromProcess(fname, "pigz -c -p "+threads+" -"+zl, true); 
+ return out; + } + + public static OutputStream getGzipStream(String fname){ + if(verbose){System.err.println("getGzipStream("+fname+")");} + OutputStream out=getOutputStreamFromProcess(fname, "gzip -c -"+ZIPLEVEL, true); + return out; + } + + public static OutputStream getOutputStreamFromProcess(String fname, String command, boolean sh){ + if(verbose){System.err.println("getOutputStreamFromProcess("+fname+", "+command+", "+sh+")");} + + OutputStream out=null; + Process p=null; + + boolean useProcessBuilder=false; + + if(useProcessBuilder){ + ProcessBuilder pb=new ProcessBuilder(); + pb.redirectError(Redirect.INHERIT); + + if(fname.equals("stdout") || fname.startsWith("stdout.")){ + pb.redirectOutput(Redirect.INHERIT); + pb.command(command.split(" ")); + }else{ + + if(fname!=null){ + pb.redirectOutput(new File(fname)); + } + + pb.command(command.split(" ")); + +// if(sh){ +// pb.command(("sh -c "+command+" 1>"+fname).split(" ")); +// }else{ +// pb.command(command.split(" ")); +// } + } + try { + p=pb.start(); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + + addProcess(fname, p); + out=p.getOutputStream(); + return out; + } + + if(fname.equals("stdout") || fname.startsWith("stdout.")){ + try { + p = Runtime.getRuntime().exec(command); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + InputStream is=p.getInputStream(); + PipeThread it=new PipeThread(is, System.out); + addPipeThread(fname, it); + it.start(); +// }else if(fname.equals("stderr") || fname.startsWith("stderr.")){ +// try { +// p = Runtime.getRuntime().exec(command); +// } catch (IOException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } +// InputStream is=p.getErrorStream(); +// PipeThread it=new PipeThread(is, System.err); +// it.start(); + }else{ + try { + if(sh){ + String[] cmd = { + "sh", + "-c", + command+" 1>"+fname + }; + p=Runtime.getRuntime().exec(cmd); + }else{ + 
p=Runtime.getRuntime().exec(command); + } +// p = Runtime.getRuntime().exec("gzip -c -"+ZIPLEVEL+" 1>"+fname); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + addProcess(fname, p); + out=p.getOutputStream(); + InputStream es=p.getErrorStream(); + assert(es!=null); + PipeThread et=new PipeThread(es, System.err); + addPipeThread(fname, et); + et.start(); + + return out; + } + + public static String readString(String fname){ + if(verbose){System.err.println("readString("+fname+")");} + String x=null; + InputStream is=getInputStream(fname, false, false); + + try { + + StringBuilder sb=new StringBuilder(); + +// synchronized(diskSync){ + BufferedReader in=new BufferedReader(new InputStreamReader(is), INBUF); + String temp=in.readLine(); + while(temp!=null){ + sb.append(temp).append('\n'); + temp=in.readLine(); + } + in.close(); +// } + + x=sb.toString(); + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } catch (IOException e) { + throw new RuntimeException(e); + } + + return x; + } + + public static Object readObject(String fname){ + if(verbose){System.err.println("readObject("+fname+")");} + Object x=null; + InputStream is=getInputStream(fname, true, false); + + try { +// synchronized(diskSync){ + ObjectInputStream in=new ObjectInputStream(is); + x=in.readObject(); + in.close(); +// } + } catch (IOException e) { + throw new RuntimeException(e); + } catch (ClassNotFoundException e) { + throw new RuntimeException(e); + } + + return x; + } + +// public static InputStream getInputStream(String fname){ +// +// boolean gzipped=fname.endsWith(".gz"); +// boolean zipped=fname.endsWith(".zip"); +// boolean bzipped=false;//fname.endsWith(".bz2"); +// final String basename=basename(fname); +// +// InputStream in=null; +// try { +// +// FileInputStream fis=new FileInputStream(fname); +// BufferedInputStream bis=new BufferedInputStream(fis, INBUF); +// +// if(zipped && !RAWMODE){ +// ZipInputStream zis=new 
ZipInputStream(bis); +// ZipEntry ze=zis.getNextEntry(); +// assert(ze!=null); +// assert(basename.equals(ze.getName())) : basename+" != "+ze.getName(); +// in=zis; +// }else if(gzipped && !RAWMODE){ +// in=new GZIPInputStream(bis, 4096); +// }else if(bzipped && !RAWMODE){ +// +// +// in=new CBZip2InputStream(bis); +// +// /* +// From http://www.kohsuke.org/bzip2/: +// +// Note +// +// Jacek Bilski told me that he had to read two bytes from the stream +// before he uses CBZip2InputStream. Those two bytes ('B' and 'Z') +// are used by the command line bzip program to mark the stream. +// */ +// +// +// }else{ +// in=bis; +// } +// } catch (FileNotFoundException e) { +// throw new RuntimeException(e); +// } catch (IOException e) { +// throw new RuntimeException(e); +// } +// +// return in; +// } + +// public static InputStream getInputStream(String fname){ +// return getInputStream(fname, true); +// } +// +// public static InputStream getInputStream(String fname, boolean buffer){ +// return getInputStream(fname, buffer, true); +// } + + public static InputStream getInputStream(String fname, boolean buffer, boolean allowSubprocess){ + if(verbose){System.err.println("getInputStream("+fname+", "+buffer+", "+allowSubprocess+")");} + boolean xz=fname.endsWith(".xz"); + boolean gzipped=fname.endsWith(".gz") || fname.endsWith(".gzip"); + boolean zipped=fname.endsWith(".zip"); + boolean bzipped=PROCESS_BZ2 && fname.endsWith(".bz2"); + boolean bam=fname.endsWith(".bam") && Data.SAMTOOLS(); + + allowSubprocess=(allowSubprocess && Shared.THREADS>1); + + if(!RAWMODE){ + if(zipped){return getZipInputStream(fname);} + if(gzipped){return getGZipInputStream(fname, allowSubprocess);} + if(bzipped){return getBZipInputStream(fname);} + if(bam){return getInputStreamFromProcess(fname, "samtools view -h", false);} + } + + return getRawInputStream(fname, buffer); + } + + public static InputStream getRawInputStream(String fname, boolean buffer){ + 
if(verbose){System.err.println("getRawInputStream("+fname+", "+buffer+")");} + + assert(fname!=null); + fname=fname.replace('\\', '/'); + assert(fname.indexOf('\\')<0); + assert(!fname.contains("\\\\")); +// assert(!fname.contains("//")) : fname; + + final boolean jar=fname.startsWith("jar:"); + + if(!jar){ + File f=new File(fname); + if(!f.exists()){ + String f2=fname.toLowerCase(); + if(f2.equals("stdin") || f2.startsWith("stdin.")){ + // System.err.println("Returning stdin: A"); + return System.in; + } + throw new RuntimeException("Can't find file "+fname); + } + } + +// System.err.println("Getting input stream for "+fname); +// assert(!fname.contains("\\")); +// assert(!loadedFiles.contains(fname)) : "Already loaded "+fname; +// loadedFiles.add(fname); +// assert(!fname.contains("custom_summary_unionGene_build36.txt")); + + + InputStream in=null; + if(jar){ + try { + + URL url=new URL(fname); + + InputStream is=url.openStream(); + + if(buffer){ + BufferedInputStream bis=new BufferedInputStream(is, INBUF); + in=bis; + }else{ + in=is; + } + + } catch (FileNotFoundException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + } catch (MalformedURLException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + } catch (IOException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + } + }else{ + try { + + FileInputStream fis=new FileInputStream(fname); + + if(buffer){ + BufferedInputStream bis=new BufferedInputStream(fis, INBUF); + in=bis; + }else{ + in=fis; + } + + } catch (FileNotFoundException e) { + throw new RuntimeException(e); + } + } + + return in; + } + + public static InputStream getZipInputStream(String fname){return getZipInputStream(fname, true);} + public static InputStream getZipInputStream(String fname, boolean buffer){ + if(verbose){System.err.println("getZipInputStream("+fname+", "+buffer+")");} + 
InputStream raw=getRawInputStream(fname, buffer); + InputStream in=null; + + final String basename=basename(fname); + + try { + + ZipInputStream zis=new ZipInputStream(raw); + ZipEntry ze=zis.getNextEntry(); + assert(ze!=null); + assert(basename.equals(ze.getName())) : basename+" != "+ze.getName(); + in=zis; + + } catch (FileNotFoundException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + } catch (IOException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + } + + return in; + } + + public static InputStream getGZipInputStream(String fname, boolean allowSubprocess){ + if(verbose){System.err.println("getGZipInputStream("+fname+", "+allowSubprocess+")");} + if(allowSubprocess && Shared.THREADS>2){ + if(!fname.startsWith("jar:")){ + if(verbose){System.err.println("Fetching gzip input stream: "+fname+", "+allowSubprocess+", "+USE_UNPIGZ+", "+Data.UNPIGZ());} + if(USE_UNPIGZ && Data.UNPIGZ()){return getUnpigzStream(fname);} + if(USE_GUNZIP && Data.GUNZIP()){return getGunzipStream(fname);} + } + } + + InputStream raw=getRawInputStream(fname, false); + InputStream in=null; + + try { + in=new GZIPInputStream(raw, INBUF); + } catch (FileNotFoundException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + } catch (IOException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + } + + return in; + } + +// public static InputStream getGunzipStream(String fname){ +// +// //InputStream raw=getRawInputStream(fname, false); +// InputStream in=null; +// +// Process p=null; +// if(fname.equals("stdin") || fname.startsWith("stdin.")){ +// try { +// p = Runtime.getRuntime().exec("gzip -c -d"); +// } catch (IOException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } +// OutputStream os=p.getOutputStream(); +// PipeThread it=new PipeThread(System.in, os); 
+// it.start(); +// }else{ +// try { +// p = Runtime.getRuntime().exec("gzip -c -d "+fname); +// } catch (IOException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } +// } +// +// in=p.getInputStream(); +// InputStream es=p.getErrorStream(); +// assert(es!=null); +// PipeThread et=new PipeThread(es, System.err); +// et.start(); +// +// return in; +// } + + public static InputStream getGunzipStream(String fname){ + if(verbose){System.err.println("getGunzipStream("+fname+")");} + return getInputStreamFromProcess(fname, "gzip -c -d", false); + } + + public static InputStream getUnpigzStream(String fname){ + if(verbose){System.err.println("getUnpigzStream("+fname+")");} + return getInputStreamFromProcess(fname, "pigz -c -d", false); + } + + public static InputStream getInputStreamFromProcess(String fname, String command, boolean cat){ + if(verbose){System.err.println("getInputStreamFromProcess("+fname+", "+command+", "+cat+")");} + + //InputStream raw=getRawInputStream(fname, false); + InputStream in=null; + + Process p=null; + if(fname.equals("stdin") || fname.startsWith("stdin.")){ + try { + if(cat){ + throw new RuntimeException(); + }else{ + p=Runtime.getRuntime().exec(command); + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + OutputStream os=p.getOutputStream(); + PipeThread it=new PipeThread(System.in, os); + addPipeThread(fname, it); + it.start(); + }else{ + try { + if(cat){ + assert(false) : "This mode is untested."; + String[] cmd = { + "sh","cat "+fname, + " | "+command + }; + p=Runtime.getRuntime().exec(cmd); + }else{ + p = Runtime.getRuntime().exec(command+" "+fname); + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + addProcess(fname, p); + in=p.getInputStream(); + InputStream es=p.getErrorStream(); + assert(es!=null); + PipeThread et=new PipeThread(es, System.err); + addPipeThread(fname, et); + et.start(); + + return in; + } + + 
+ public static InputStream getBZipInputStream(String fname){ + if(verbose){System.err.println("getBZipInputStream("+fname+")");} + InputStream in=null; + + try {in=getBZipInputStream(fname, true);} + catch (IOException e) {} + + if(in==null){ + try {in=getBZipInputStream(fname, false);} + catch (IOException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + }catch (NullPointerException e) { + System.err.println("Error when attempting to read "+fname); + throw new RuntimeException(e); + } + } + + assert(in!=null); + return in; + } + + + private static InputStream getBZipInputStream(String fname, boolean stripBZ) throws IOException{ + if(verbose){System.err.println("getBZipInputStream("+fname+", "+stripBZ+")");} + throw new RuntimeException("bz2 compression not supported in public version."); +// InputStream raw=getRawInputStream(fname, true); +// InputStream in=null; +// +// if(stripBZ){ +//// System.err.println("Attempting to strip BZ"); +// byte[] header=new byte[2]; +// try {raw.read(header);} +// catch (IOException e) {throw new RuntimeException(e);} +// +// if(header[0]!='B' || header[1]!='Z'){ +// throw new IOException("Found BZ2 file that does not start with BZ: "+"("+header[0]+", "+header[1]+")"); +// } +// } +// +// try { +// +// in=new CBZip2InputStream(raw); +// +// /* +// From http://www.kohsuke.org/bzip2/: +// +// Note +// +// Jacek Bilski told me that he had to read two bytes from the stream +// before he uses CBZip2InputStream. Those two bytes ('B' and 'Z') +// are used by the command line bzip program to mark the stream. 
+// */ +// +// } catch (Exception e) { +// +// try { +// in=null; +// raw.close(); +// raw=null; +// } catch (IOException e1) { +// System.err.println("Error when attempting to read "+fname); +// e.printStackTrace(); +// e1.printStackTrace(); +// throw new RuntimeException(e1); +// } +// } +// +// return in; + } + + public static InputStream getXZInputStream(String fname){ + + InputStream in=null; + +// if(PROCESS_XZ){ +// InputStream raw=getRawInputStream(fname, true); +// try { +// in=new org.tukaani.xz.XZInputStream(raw); +// } catch (FileNotFoundException e) { +// throw new RuntimeException(e); +// } catch (IOException e) { +// throw new RuntimeException(e); +// } +// } + + return in; + } + + + public static X read(Class cx, String fname){ + X x=(X)readObject(fname); + return x; + } + + public static X[] readArray(Class cx, String fname){ + X[] x=(X[])readObject(fname); + return x; + } + + public static X[][] readArray2(Class cx, String fname){ + X[][] x=(X[][])readObject(fname); + return x; + } + + public static X[][][] readArray3(Class cx, String fname){ + X[][][] x=(X[][][])readObject(fname); + return x; + } + + + private static String basename(String fname){ + fname=fname.replace('\\', '/'); + boolean xz=fname.endsWith(".xz"); + boolean gzipped=fname.endsWith(".gz"); + boolean zipped=fname.endsWith(".zip"); + boolean bzipped=PROCESS_BZ2 && fname.endsWith(".bz2"); + String basename=fname; +// if(basename.contains("\\")){basename=basename.substring(basename.lastIndexOf("\\")+1);} + if(basename.contains("/")){basename=basename.substring(basename.lastIndexOf('/')+1);} + if(zipped || bzipped){basename=basename.substring(0, basename.length()-4);} + else if(gzipped){basename=basename.substring(0, basename.length()-3);} + return basename; + } + + public static String rawName(String fname){ + for(String s : extensions){ + while(fname.endsWith(s)){fname=fname.substring(0, fname.length()-s.length());} + } + return fname; + } + + public static String 
compressionType(String fname){ + fname=fname.toLowerCase(); + for(String s : extensions){ + if(fname.endsWith(s)){return s.substring(1);} + } + return null; + } + + public static boolean isCompressed(String fname){ + return compressionType(fname)!=null; + } + + public static boolean isSam(String fname){ + if(fname.endsWith(".sam")){return true;} + String s=compressionType(fname); + if(s==null){return false;} + return fname.substring(0, fname.lastIndexOf('.')).endsWith(".sam"); + } + + public static String rawExtension(String fname){ + fname=rawName(fname); + int x=fname.lastIndexOf('.'); + if(x<0){return "";} + return fname.substring(x+1); + } + + public static String parseRoot(String path){ + File f=new File(path); + if(f.isDirectory()){ + if(!path.endsWith(FILESEP)){ + path=path+FILESEP; + } + return path; + }else if(f.isFile()){ + int slash=path.lastIndexOf(FILESEP); + if(slash<0){ + return ""; + }else{ + return path.substring(0, slash+1); + } + }else{ + throw new RuntimeException("Can't find "+path); //Try using parseRoot2 instead. 
+ } + } + + /** This one does not throw an exception for non-existing paths */ + public static String parseRoot2(String path){ + File f=new File(path); + + if(!f.exists()){ + if(path.endsWith(FILESEP)){return path;} + int slash=path.lastIndexOf(FILESEP); + if(slash<0){ + return ""; + }else{ + return path.substring(0, slash+1); + } + } + + if(f.isDirectory()){ + if(!path.endsWith(FILESEP)){ + path=path+FILESEP; + } + return path; + }else if(f.isFile()){ + int slash=path.lastIndexOf(FILESEP); + if(slash<0){ + return ""; + }else{ + return path.substring(0, slash+1); + } + }else{ + throw new RuntimeException("Can't find "+path); + } + } + + public static String findFileExtension(final String fname){ + + File file=new File(fname); + if(file.exists()){return fname;} + + String basename=fname, temp; + if(fname.endsWith(".zip") || fname.endsWith(".gz") || (PROCESS_BZ2 && fname.endsWith(".bz2")) || (PROCESS_XZ && fname.endsWith(".xz"))){ + basename=fname.substring(0, fname.lastIndexOf('.')); + } + temp=basename; + file=new File(temp); + if(!file.exists()){ + temp=basename+".gz"; + file=new File(temp); + } +// System.err.println(temp+" "+(file.exists() ? " exists" : " does not exist")); + if(!file.exists()){ + temp=basename+".zip"; + file=new File(temp); + } +// System.err.println(temp+" "+(file.exists() ? " exists" : " does not exist")); + if(!file.exists() && PROCESS_BZ2){ + temp=basename+".bz2"; + file=new File(temp); + } +// System.err.println(temp+" "+(file.exists() ? " exists" : " does not exist")); + if(!file.exists() && PROCESS_XZ){ + temp=basename+".xz"; + file=new File(temp); + } +// System.err.println(temp+" "+(file.exists() ? 
" exists" : " does not exist")); + if(!file.exists()){temp=fname;} + + return temp; + } + + public static synchronized void copyFile(String source, String dest){copyFile(source, dest, false);} + public static synchronized void copyFile(String source, String dest, boolean createPathIfNeeded){ + + assert(!new File(dest).exists()) : "Destination file already exists: "+dest; + if(createPathIfNeeded){ + File parent=new File(dest).getParentFile(); + if(parent!=null && !parent.exists()){ + parent.mkdirs(); + } + } + + final boolean oldRawmode=RAWMODE; + if((source.endsWith(".zip") && dest.endsWith(".zip")) + || (source.endsWith(".gz") && dest.endsWith(".gz") + || (source.endsWith(".bz2") && dest.endsWith(".bz2")) + || (source.endsWith(".xz") && dest.endsWith(".xz")))){ + RAWMODE=true; + } + + try{ + InputStream in=getInputStream(source, false, false); + OutputStream out=getOutputStream(dest, false, false, true); + + byte[] buffer=new byte[INBUF]; + int len; + + while((len = in.read(buffer)) > 0){ + out.write(buffer, 0, len); + } + + in.close(); + out.flush(); + if(out.getClass()==ZipOutputStream.class){ + ZipOutputStream zos=(ZipOutputStream)out; + zos.closeEntry(); + zos.finish(); + } +// else if(PROCESS_XZ && out.getClass()==org.tukaani.xz.XZOutputStream.class){ +// org.tukaani.xz.XZOutputStream zos=(org.tukaani.xz.XZOutputStream)out; +// zos.finish(); +// } + out.close(); + + }catch(FileNotFoundException e){ + RAWMODE=oldRawmode; + throw new RuntimeException(e); + }catch(IOException e){ + RAWMODE=oldRawmode; + throw new RuntimeException(e); + } + + RAWMODE=oldRawmode; + } + + public static void copyDirectoryContents(String from, String to){ + assert(!from.equalsIgnoreCase(to)); + + if(to.indexOf('\\')>0){to=to.replace('\\', '/');} + + File d1=new File(from); + assert(d1.exists()); + assert(d1.isDirectory()); + + File d2=new File(to); + assert(!d1.equals(d2)); + if(d2.exists()){ + assert(d2.isDirectory()); + }else{ + d2.mkdirs(); + } + if(!to.endsWith("/")){to=to+"/";} 
+ + File[] array=d1.listFiles(); + + for(File f : array){ + String name=f.getName(); + String dest=to+name; + if(f.isFile()){ + copyFile(f.getAbsolutePath(), dest); + }else{ + assert(f.isDirectory()); + File f2=new File(dest); + if(!f2.exists()){ + f2.mkdir(); + }else{ + assert(f2.isDirectory()); + } + copyDirectoryContents(f.getAbsolutePath(), f2.getAbsolutePath()); + } + } + + } + + + private static final int addThread(int x){ + if(verbose){System.err.println("addThread("+x+")");} + synchronized(activeThreads){ + assert(x!=0); + if(x>0){ + activeThreads[0]+=x; + activeThreads[1]+=x; + }else{ + addRunningThread(x); + } + assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 && + activeThreads[2]>=0 && activeThreads[2]<=maxWriteThreads) : Arrays.toString(activeThreads); + + return activeThreads[0]; + } + } + + private static final int addRunningThread(int x){ + if(verbose){System.err.println("addRunningThread("+x+")");} + synchronized(activeThreads){ + assert(x!=0); + if(x>0){ + assert(activeThreads[1]>=x); + while(activeThreads[2]>=maxWriteThreads){ + try { + activeThreads.wait(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + activeThreads[1]-=x; //Remove from waiting + }else{ + activeThreads[0]+=x; //Remove from active + } + activeThreads[2]+=x; //Change number running + + assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 && + activeThreads[2]>=0 && activeThreads[2]<=maxWriteThreads) : Arrays.toString(activeThreads); + + if(activeThreads[2]==0 || (activeThreads[2]0)){activeThreads.notify();} + return activeThreads[2]; + } + } + + public static final int countActiveThreads(){ + if(verbose){System.err.println("countActiveThreads()");} + synchronized(activeThreads){ + assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 && + activeThreads[2]>=0 
&& activeThreads[2]<=maxWriteThreads) : Arrays.toString(activeThreads); + return activeThreads[0]; + } + } + + public static final void waitForWritingToFinish(){ + if(verbose){System.err.println("waitForWritingToFinish()");} + synchronized(activeThreads){ + while(activeThreads[0]>0){ + assert(activeThreads[0]==(activeThreads[1]+activeThreads[2]) && activeThreads[0]>=0 && activeThreads[1]>=0 && + activeThreads[2]>=0 && activeThreads[2]<=maxWriteThreads) : Arrays.toString(activeThreads); + try { + activeThreads.wait(8000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(activeThreads[2]==0 || (activeThreads[2]0)){activeThreads.notify();} + } + } + } + + + public static final boolean closeStream(ConcurrentReadStreamInterface cris){return closeStreams(cris, (RTextOutputStream3[])null);} + public static final boolean closeStream(RTextOutputStream3 ross){return closeStreams(null, ross);} + + public static final boolean closeStreams(ConcurrentReadStreamInterface cris, RTextOutputStream3...ross){ + if(verbose){ + System.err.println("closeStreams("+cris+", "+(ross==null ? "null" : ross.length)+")"); + new Exception().printStackTrace(System.err); + } + boolean errorState=false; + if(cris!=null){ + if(verbose){System.err.println("Closing cris; error="+errorState);} + cris.close(); + errorState|=cris.errorState(); +// Object[] prods=cris.producers(); +// for(Object o : prods){ +// if(o!=null && o.getClass()==ReadInputStream.class){ +// ReadInputStream ris=(ReadInputStream)o; +// ris. 
+// } +// } + if(verbose){System.err.println("Closed cris; error="+errorState);} + } + if(ross!=null){ + for(RTextOutputStream3 ros : ross){ + if(verbose){System.err.println("Closing ros "+ros+"; error="+errorState);} + if(ros!=null){ + ros.close(); + ros.join(); + errorState|=(ros.errorState() || !ros.finishedSuccessfully()); + } + if(verbose){System.err.println("Closed ros; error="+errorState);} + } + } + return errorState; + } + + public static boolean killProcess(String fname){ + if(verbose){ + System.err.println("killProcess("+fname+")"); + new Exception().printStackTrace(System.err); + } + if(fname==null || (!isCompressed(fname) && !fname.endsWith(".bam"))){return false;} + + boolean error=false; + synchronized(processMap){ + Process p=processMap.remove(fname); + if(p!=null){ + if(verbose){System.err.println("Found Process for "+fname);} + int x=-1, tries=0; + for(; tries<20; tries++){ + if(verbose){System.err.println("Trying p.waitFor()");} + try { + x=p.waitFor(); + if(verbose){System.err.println("success; return="+x);} + break; + } catch (InterruptedException e) { + if(verbose){System.err.println("Failed.");} + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + error|=(tries>=20 || x!=0); + if(tries>=20){ + if(verbose){System.err.println("Calling p.destroy because tries=="+tries+"; error="+error);} + p.destroy(); + if(verbose){System.err.println("destroyed");} + } + } + } + synchronized(pipeThreadMap){ + ArrayList atp=pipeThreadMap.remove(fname); + if(atp!=null){ + for(PipeThread p : atp){ + if(p!=null){ + if(verbose){System.err.println("Found PipeThread for "+fname);} + p.terminate(); + if(verbose){System.err.println("Terminated PipeThread");} + } + } + } + } + if(verbose){System.err.println("killProcess("+fname+") returned "+error);} + return error; + } + + private static void addProcess(String fname, Process p){ + if(verbose){System.err.println("addProcess("+fname+", "+p+")");} + synchronized(processMap){ +// System.err.println("Adding 
Process for "+fname); + Process old=processMap.put(fname, p); + if(old!=null){ + old.destroy(); + throw new RuntimeException("Duplicate process for file "+fname); + } + } + } + + private static void addPipeThread(String fname, PipeThread pt){ + if(verbose){System.err.println("addPipeThread("+fname+", "+pt+")");} + synchronized(pipeThreadMap){ +// System.err.println("Adding PipeThread for "+fname); + ArrayList atp=pipeThreadMap.get(fname); + if(atp==null){ + atp=new ArrayList(2); + pipeThreadMap.put(fname, atp); + } + atp.add(pt); + } + } + + /** {active, waiting, running}
/**
 * One-off maintenance utility that walks files and directories and repairs
 * two known-bad naming patterns: "READMEtxt" becomes "README.txt", and
 * "-1.8.0." becomes ".".
 * Only the file name is rewritten; directory components are left untouched.
 */
public class RenameFiles {

	/**
	 * Processes every path given on the command line.
	 * @param args Files and/or directories to process
	 */
	public static void main(String[] args){
		for(String s : args){
			renameFiles(s);
		}
	}

	/**
	 * Processes a single path given as a string.
	 * @param path File or directory to process
	 */
	public static void renameFiles(String path){
		renameFiles(new File(path));
	}

	/**
	 * Recursively processes a directory, or renames a single file.
	 * @param path File or directory to process
	 */
	public static void renameFiles(File path){
		if(path.isDirectory()){
			File[] array=path.listFiles();
			if(array==null){
				//listFiles returns null when the directory cannot be read.
				System.err.println("Unable to list directory "+path);
				return;
			}
			for(File f : array){renameFiles(f);}
		}else{
			rename(path);
		}
	}

	/**
	 * Renames one file if its name contains a known-bad pattern.
	 * All applicable replacements are combined into a single rename, so a name
	 * matching both patterns ends up fully corrected.
	 * @param in An existing regular file
	 */
	public static void rename(File in){
		assert(in.exists()) : in.toString();
		assert(in.isFile()) : in.toString();

		final File absFile=in.getAbsoluteFile();
		final String fname=absFile.getName();

		//Replacements touch the file name only, never the parent directories.
		String fixed=fname;
		fixed=fixed.replace("READMEtxt", "README.txt");
		fixed=fixed.replace("-1.8.0.", ".");

		if(fixed.equals(fname)){return;}

		final File out=new File(absFile.getParentFile(), fixed);
		System.out.println("Renaming "+absFile.getPath()+" to "+out.getPath());
		if(!in.renameTo(out)){
			System.err.println("Failed to rename "+absFile.getPath()+" to "+out.getPath());
		}
	}

}
"); + System.exit(0); + } + + String summary=null, ref=null; + + for(int i=0; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("summary")){ + summary=b; + }else if(a.equals("ref") || a.equals("reference")){ + ref=b; + }else{ + throw new RuntimeException("Unknown parameter: "+args[i]); + } + + }else{ + if(args[i].endsWith("summary.txt")){ + summary=args[i]; + }else{ + ref=args[i]; + } + } + } + + if(summary==null && args.length>0){ + summary=args[0]; + } + + if(summary==null){ + System.out.println("Usage: SummaryFile "); + System.exit(0); + } + + if(ref==null){ + + } + } + + public boolean compare(final String refName){ + try { + File ref=new File(refName); + if(!ref.exists()){ + if(refName.startsWith("stdin")){return false;} + else{ + assert(false) : "No such file: "+refName; + } + } +// if(!refName.equals(source) && !Files.isSameFile(ref.toPath(), new File(source).toPath())){ //This is Java-7 specific. 
+//// assert(false) : refName+", "+source+": "+(Files.isSameFile(ref.toPath(), new File(source).toPath()))+ +//// "\n"+ref.getCanonicalPath()+", "+new File(source).getCanonicalPath()+": "+(ref.getCanonicalPath().equals(new File(source).getCanonicalPath())); +// return false; +// +// } + if(!refName.equals(source) && !ref.getCanonicalPath().equals(new File(source).getCanonicalPath())){ +// assert(false) : refName+", "+source+": "+(Files.isSameFile(ref.toPath(), new File(source).toPath()))+ +// "\n"+ref.getCanonicalPath()+", "+new File(source).getCanonicalPath()+": "+(ref.getCanonicalPath().equals(new File(source).getCanonicalPath())); + return false; + + } + if(bytes!=ref.length()){ +// assert(false) : bytes+", "+ref.length(); + return false; + } + if(modified!=ref.lastModified()){ +// assert(false) : modified+", "+ref.lastModified(); + return false; + } + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + return false; + } + return true; + } + + public static boolean compare(final String summaryName, final String refName){ + assert(refName!=null) : "Null reference file name."; + if(!new File(summaryName).exists()){ +// assert(false); + return false; + } + SummaryFile sf=new SummaryFile(summaryName); + return sf.compare(refName); + } + + public static String getName(){ + return getName(Data.GENOME_BUILD); + } + + public static String getName(int build){ + return Data.ROOT_GENOME+build+"/summary.txt"; + } + + public SummaryFile(String path){ + summaryFname=path; + String s; + TextFile tf=new TextFile(summaryFname, false, false); + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)=='#'){ + if(s.startsWith("#Version")){ + String[] split=s.split("\t"); + version=(split.length>1 ? 
Integer.parseInt(split[1]) : 0); + } + }else{ + String[] split=s.split("\t"); + String a=split[0]; + String b=split[1]; + if(a.equalsIgnoreCase("chroms")){chroms=(int)Long.parseLong(b);} + else if(a.equalsIgnoreCase("bases")){bases=Long.parseLong(b);} + else if(a.equalsIgnoreCase("version")){version=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("defined")){definedBases=Long.parseLong(b);} + else if(a.equalsIgnoreCase("contigs")){contigs=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("scaffolds")){scaffolds=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("interpad")){interpad=Integer.parseInt(b);} + else if(a.equalsIgnoreCase("undefined")){undefinedBases=Long.parseLong(b);} + else if(a.equalsIgnoreCase("name")){name=b;} + else if(a.equalsIgnoreCase("source")){source=b;} + else if(a.equalsIgnoreCase("bytes")){bytes=Long.parseLong(b);} + else if(a.equalsIgnoreCase("last modified")){modified=Long.parseLong(b);} + else if(a.equalsIgnoreCase("scafprefixes")){scafprefixes=Tools.parseBoolean(b);} + else{throw new RuntimeException("In file "+tf.name+": Unknown term "+s);} + } + } + tf.close(); + } + + public final String summaryFname; + + public int chroms; + public long contigs; + public long scaffolds; + public int interpad; + public long bases; + public long definedBases; + public long undefinedBases; + public String name; + public String source; + public int version; + public long bytes; + public long modified; + public boolean scafprefixes; + +} diff --git a/current/fileIO/TextFile.java b/current/fileIO/TextFile.java new file mode 100755 index 0000000..2adb1ba --- /dev/null +++ b/current/fileIO/TextFile.java @@ -0,0 +1,307 @@ +package fileIO; +import java.io.BufferedReader; +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; + +import dna.Data; + + +public class TextFile { + + + public static void main(String[] args){ + TextFile tf=new TextFile(args.length>0 ? 
args[0] : "stdin", false, false); + int first=0, last=100; + long lines=0; + long bytes=0; + if(args.length>1){ + first=Integer.parseInt(args[1]); + last=first+100; + } + if(args.length>2){ + last=Integer.parseInt(args[2]); + } + + for(int i=0; i list=new ArrayList(4096); + + for(s=nextLine(); s!=null; s=nextLine()){ + list.add(s); + } + + return list.toArray(new String[list.size()]); + + } + + public final long countLines(){ + + String s=null; + long count=0; + + for(s=nextLine(); s!=null; s=nextLine()){count++;} + + reset(); + + return count; + + } + + public static String[][] doublesplitTab(String[] lines, boolean trim){ + String[][] lines2=new String[lines.length][]; + for(int i=0; i>(5); + buffer=new ArrayList(buffersize); + } + + @Override + public void run() { + assert(open) : fname; + synchronized(this){ + started=true; + this.notify(); + } + + ArrayList job=null; + + while(job==null){ + try { + job=queue.take(); +// job.list=queue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + while(job!=null && job!=POISON2){ + if(!job.isEmpty()){ + for(final CharSequence cs : job){ + assert(cs!=POISON); + myWriter.print(cs); + } + } + + job=null; + while(job==null){ + try { + job=queue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } +// assert(false); + open=false; + ReadWrite.finishWriting(myWriter, myOutstream, fname, allowSubprocess); + synchronized(this){notifyAll();} + } + + public void print(CharSequence cs){ +// System.err.println("Added line '"+cs+"'"); + assert(open) : cs; + buffer.add(cs); + bufferLen+=cs.length(); + if(buffer.size()>=buffersize || bufferLen>=maxBufferLen){ + addJob(buffer); + buffer=new ArrayList(buffersize); + bufferLen=0; + } + } + + public void println(CharSequence cs){ + print(cs); + print("\n"); + } + + public void println(Read r){ + assert(!OTHER); + StringBuilder sb=(FASTQ ? r.toFastq() : FASTA ? 
r.toFasta() : SAM ? r.toSam() : + SITES ? r.toSites() : INFO ? r.toInfo() : r.toText(true)).append('\n'); + print(sb); + } + + public void print(Read r){ + assert(!OTHER); + StringBuilder sb=(FASTQ ? r.toFastq() : FASTA ? r.toFasta() : SAM ? r.toSam() : + SITES ? r.toSites() : INFO ? r.toInfo() : r.toText(true)); + print(sb); + } + + public synchronized void poison(){ + if(!open){return;} + addJob(buffer); + buffer=null; +// System.err.println("Poisoned!"); +// assert(false); + + //Don't allow thread to shut down before it has started + while(this.getState()==Thread.State.NEW || !started){ + try { + this.wait(20); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } +// assert(false) : open+", "+this.getState()+", "+started; + open=false; + addJob(POISON2); + } + + public void waitForFinish(){ + while(this.getState()!=Thread.State.TERMINATED){ + try { + this.join(1000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + } + } + + public void poisonAndWait(){ + poison(); + waitForFinish(); + } + + //TODO Why is this synchronized? + public synchronized void addJob(ArrayList j){ +// System.err.println("Got job "+(j.list==null ? "null" : j.list.size())); + boolean success=false; + while(!success){ + try { + queue.put(j); + success=true; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + assert(!queue.contains(j)); //Hopefully it was not added. 
+ } + } + } + + private ArrayList buffer; + + public int buffersize=100; + public int maxBufferLen=60000; + private int bufferLen=0; + public final boolean overwrite; + public final boolean allowSubprocess; + public final String fname; + private final OutputStream myOutstream; + private final PrintWriter myWriter; + private final ArrayBlockingQueue> queue; + private boolean open=true; + private volatile boolean started=false; + + /** TODO */ + public boolean errorState=false; + + private final boolean BAM; + private final boolean SAM; + private final boolean FASTQ; + private final boolean FASTA; + private final boolean BREAD; + private final boolean SITES; + private final boolean INFO; + private final boolean OTHER; + + private static final String POISON=new String("POISON_TextStreamWriter"); + private static final ArrayList POISON2=new ArrayList(1); + +} diff --git a/current/jgi/AddStopTags.java b/current/jgi/AddStopTags.java new file mode 100755 index 0000000..15362f3 --- /dev/null +++ b/current/jgi/AddStopTags.java @@ -0,0 +1,53 @@ +package jgi; + +import java.util.ArrayList; + +import stream.SamLine; + +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * Add s + * @author Brian Bushnell + * @date Jul 3, 2013 + * + */ +public class AddStopTags { + + public static void main(String[] args){ + String in=args[0]; + String out=args[1]; + TextStreamWriter tsw=new TextStreamWriter(out, false, false, true); + tsw.start(); + + TextFile tf=new TextFile(in, true, false); + String line; + for(line=tf.nextLine(); line!=null; line=tf.nextLine()){ + if(line.charAt(0)=='@'){ + tsw.println(line); + }else{ + SamLine sl=new SamLine(line); + ArrayList list=sl.optional; + if(list==null){sl.optional=list=new ArrayList(1);} + + for(int i=0; i contigs.fa + e.g. 
+ + fasta_stats2.linux -n 0 contigs.fa # for aplg assemblies + fasta_stats2.linux -n 10 contigs.fa # for velvet + + + + Main genome scaffold total: 1610 + Main genome contig total: 7844 + Main genome scaffold sequence total: 726.6 MB + Main genome contig sequence total: 689.4 MB (-> 5.1% gap) + Main genome scaffold N/L50: 6/62.2 MB + Main genome contig N/L50: 331/429.0 KB + Number of scaffolds > 50 KB: 122 + % main genome in scaffolds > 50 KB: 98.9% + + Minimum Number Number Total Total Scaffold + Scaffold of of Scaffold Contig Contig + Length Scaffolds Contigs Length Length Coverage + -------- --------- ------- ----------- ----------- -------- + All 1,610 7,844 726,616,606 689,442,341 94.88% + 1 kb 1,610 7,844 726,616,606 689,442,341 94.88% + 2.5 kb 1,468 7,677 726,334,758 689,171,164 94.88% + 5 kb 537 6,496 723,058,922 685,949,825 94.87% + 10 kb 321 6,176 721,557,480 684,511,419 94.87% + 25 kb 138 5,900 718,873,396 681,879,275 94.85% + 50 kb 122 5,854 718,322,923 681,420,273 94.86% + 100 kb 83 5,660 715,543,850 679,452,337 94.96% + 250 kb 47 5,326 709,779,897 675,162,461 95.12% + 500 kb 32 5,073 704,645,704 671,472,605 95.29% + 1 mb 19 4,735 695,996,631 664,862,860 95.53% + 2.5 mb 15 4,587 689,883,367 659,102,480 95.54% + 5 mb 13 4,463 681,669,379 651,024,951 95.50% + */ + + public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + if(args.length==0 || (args.length==1 && + (args[0].equalsIgnoreCase("-h") || args[0].equals("-help") || args[0].equals("--help") || args[0].equals("-?") || args[0].equals("?")))){ + System.out.println("\nUsage: java -Xmx120m jgi.AssemblyStats3 "); + System.out.println("\nOptional flags:"); + System.out.println("in= \tThe 'in=' flag is only needed if the input file is not the first parameter. 
'in=stdin' will pipe from standard in."); + System.out.println("format=1 \tUses variable units like MB and KB, and is designed for compatibility with existing tools."); + System.out.println("format=2 \tUses only whole numbers of bases, with no commas in numbers, and is designed for machine parsing."); + System.out.println("format=3 \tOutputs stats in 2 rows of tab-delimited columns: a header row and a data row."); + System.out.println("format=4 \tLike 3 but with scaffold data only."); + System.out.println("format=5 \tLike 3 but with contig data only."); + System.out.println("format=6 \tLike 3 but the header starts with a #."); + System.out.println("gc= \tPrint gc statistics per scaffold to a file (or stdout)."); + System.out.println("gcformat=1 \tid start stop A C G T N GC"); + System.out.println("gcformat=2 \tid gc"); + System.out.println("gcformat=3 \tid length A C G T N GC"); + System.out.println("gcformat=4 \tid length gc"); + System.out.println("gchist=\tPrint gc content histogram to this file."); + System.out.println("gcbins=200 \tNumber of bins in gc histogram."); + System.out.println("n=10 \tMinimum number of consecutive Ns between contigs."); + System.out.println("k=13 \tDisplay BBMap's estimated memory usage for this genome with specified kmer length."); + System.out.println("showspeed=t \tSet to 'f' to suppress display of processing speed."); + System.out.println("minscaf=0 \tIgnore scaffolds shorter than this."); + System.out.println("n_=t \tThis flag will prefix the terms 'contigs' and 'scaffolds' with 'n_' in formats 3-6."); +// System.out.println("verbose=t \tSet to false to remove superfluous info."); +// System.out.println("Output is always tab-delimited. AGCT are fractions of defined bases; N is fraction of total bases."); + System.exit(0); + } + + boolean benchmark=false; + ReadWrite.USE_UNPIGZ=true; + + String in=null, out=null, gc=null, gchist=null, scaffoldHistFile=null; + int maxNs=-1; + + int gchistdecimals=-1; + + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + if(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>0)){a=a.substring(1);} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(arg.contains("=") && (a.equals("in") || a.equals("ref"))){ + in=b; + }else if(a.equals("gc")){ + gc=b; + if(b==null || "summaryonly".equalsIgnoreCase(b) || "none".equalsIgnoreCase(b)){ + gc=null; + } + }else if(a.equals("gchist")){ + gchist=b; + if(b==null || "none".equalsIgnoreCase(b)){ + gchist=null; + } + }else if(a.equals("gchistdecimals")){ + gchistdecimals=Integer.parseInt(b); + }else if(a.equals("gcbins")){ + int x=Integer.parseInt(b); + if(x>0){GCBINS=Integer.parseInt(b);} + }else if(a.equals("shist") || a.equals("scaffoldhist")){ + scaffoldHistFile=b; + if(b==null || "none".equalsIgnoreCase(b)){ + scaffoldHistFile=null; + } + }else if(a.equals("out")){ + out=b; + if(b==null || "summaryonly".equalsIgnoreCase(b) || "none".equalsIgnoreCase(b)){ + out=null; + }else if("benchmark".equalsIgnoreCase(b)){ + benchmark=true; + out=null; + gc=null; + } + }else if(a.equals("benchmark")){ + benchmark=Tools.parseBoolean(b); + if(benchmark){ + out=null; + gc=null; + } + }else if(a.equals("format")){ + FORMAT=Integer.parseInt(b); + if(FORMAT<1 || FORMAT>6){ + throw new RuntimeException("\nUnknown format: "+FORMAT+"; valid values are 1 and 2.\n"); + } + }else if(a.equals("gcformat")){ + GCFORMAT=Integer.parseInt(b); + if(GCFORMAT!=1 && GCFORMAT!=2 && GCFORMAT!=3 && GCFORMAT!=4){ + throw new RuntimeException("\nUnknown gcformat: "+GCFORMAT+"; valid values are 1 and 2.\n"); + } + }else if(a.equals("cutoff")){ + cutoff=Long.parseLong(b); + }else if(a.equals("k") || a.equals("bbmapkmer")){ + bbmapkmer=Integer.parseInt(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("n_")){ + N_UNDERSCORE=Tools.parseBoolean(b); + }else if(a.equals("bf2")){ + 
ByteFile.FORCE_MODE_BF1=!(ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b)); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("header") || a.equals("useheader")){ + useheader=Tools.parseBoolean(b); + }else if(a.equals("addfilename") || a.equals("addname")){ + addfilename=Tools.parseBoolean(b); + }else if(a.equals("minscaf")){ + MINSCAF=Integer.parseInt(b); + }else if(a.equals("showspeed") || a.equals("ss")){ + showspeed=Tools.parseBoolean(b); + }else if(a.equals("printheadersize") || a.equals("phs")){ + printheadersize=Tools.parseBoolean(b); + }else if(a.equals("skipduplicatelines") || a.equals("sdl")){ + skipDuplicateLines=Tools.parseBoolean(b); + }else if(a.equals("printduplicatelines") || a.equals("pdl")){ + skipDuplicateLines=!Tools.parseBoolean(b); + }else if(a.equals("showbbmap")){ + if(!Tools.parseBoolean(b)){bbmapkmer=0;} + }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){ + maxNs=Integer.parseInt(b); + }else if(i>0 && (a.equals("n") || a.equals("-n")) && b!=null){ + maxNs=Integer.parseInt(b); + }else if(a.equals("-n") && b==null && args.length>i+1){ + maxNs=Integer.parseInt(args[i+1]); + }else if(in==null && i==0 && !args[i].contains("=")){ + in=args[i]; + }else if(maxNs<0 && i==1 && !args[i].contains("=") && 
Character.isDigit(args[i].charAt(0))){ + maxNs=Integer.parseInt(args[i]); + }else if(gc==null && i==2 && !args[i].contains("=")){ + gc=args[i]; + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + } + + if(gchistdecimals<1){ + if(GCBINS==2 || GCBINS==5 || GCBINS==10){ + GCHIST_DECIMALS=1; + }else if(GCBINS==20 || GCBINS==25 || GCBINS==50 || GCBINS==100){ + GCHIST_DECIMALS=2; + } + if(GCBINS>1000 && GCHIST_DECIMALS<4){GCHIST_DECIMALS=4;} + if(GCBINS>10000 && GCHIST_DECIMALS<5){GCHIST_DECIMALS=5;} + }else{ + GCHIST_DECIMALS=gchistdecimals; + } + + if(maxNs<0){maxNs=10;} + + long[] counts=null; + long sum=0; + + if(out==null || out.equalsIgnoreCase("stdout") || out.equalsIgnoreCase("standardout")){out=null;} + + InputStream is=null; + { + if(in==null){throw new RuntimeException("No input file.");} + if(in.equalsIgnoreCase("stdin") || in.equalsIgnoreCase("standardin")){ + is=System.in; + }else{ + File f=new File(in); + if((!f.exists() || f.isDirectory()) && !in.toLowerCase().startsWith("stdin")){ + throw new RuntimeException("Input file does not appear to be valid: "+in); + } + } + } + + + if(is==null){is=ReadWrite.getInputStream(in, false, true);} + try { + if(benchmark){sum=bench(is);} + else{counts=countFasta(is, gc, maxNs);} + } catch (IOException e) { + e.printStackTrace(); + } + try { + if(is!=System.in){is.close();} + } catch (IOException e) { + e.printStackTrace(); + } + + if(tlistS!=null && tlistS.size()>0){Collections.sort(tlistS);} + if(llistS!=null && llistS.size>0){Arrays.sort(llistS.array, 0, llistS.size);} + + t.stop(); + + if(benchmark){ + printBenchResults(t, counts, sum, in); + }else{ +// System.err.println("\nclistS="+clistS+"\nslistS="+slistS+"\nsclist1S="+sclist1S+"\nsclist2S="+sclist2S+"\nllistS="+llistS+"\ntlistS="+tlistS+"\n"); //*** +// 
System.err.println("\nclistS.size="+clistS.size+"\nslistS.size="+slistS.size+"\nsclist1S.size="+sclist1S.size+"\nsclist2S.size="+sclist2S.size+"\nllistS.size="+llistS.size+"\ntlistS.size()="+tlistS.size()+"\n"); //*** + printResults(t, counts, sum, in, clistS, slistS, sclist1S, sclist2S, llistS, tlistS, out); + + writeHistFile(scaffoldHistFile, slistS, tlistS, false); + + if(gchist!=null){printGCHist(gchist);} + } + + } + + + private static void printGCHist(String gchist){ + if(!Tools.canWrite(gchist, overwrite)){ + System.err.println("Can't write gc histogram because file exists and overwrite="+overwrite); + assert(false); + }else{ + long gchistsum=Tools.sum(gchistS); + long gchistsumbb=Tools.sum(gchist_bb_S); + double invsum=(gchistsum==0 ? 0 : 1.0/gchistsum); + double invsumbb=(gchistsum==0 ? 0 : 1.0/gchistsumbb); + double invbins=1.0/(GCBINS==0 ? 1 : GCBINS); +// assert(false) : Arrays.toString(gchistS); + StringBuilder sb=new StringBuilder(); + sb.append(String.format("#GC\tscaffolds\tfraction\tlength\tlen_fraction\n")); + for(int i=0; i tlist, String out){ + + String name=in; + if(in!=null && !in.toLowerCase().startsWith("stdin")){ + try { + File f=new File(in); + name=f.getCanonicalPath(); + } catch (IOException e) {} + } + + long contigs=0; + long scaffolds=0; + long contiglen=0; + long scaflen=0; + long contigs1; + long contiglen2; + long maxScaf=0, maxContig=0; + long[] carray=clist.array; + long[] sarray=slist.array; + long[] scarray1=sclist1.array; + long[] scarray2=sclist2.array; + + long[] larray=llist.array; + + StringBuilder sb=new StringBuilder(), sb2=new StringBuilder(); + + for(int i=0; i0){ + contigs+=x; + contiglen+=(x*i); + maxContig=i; + } + } + + for(int i=0; i0){ + scaffolds+=x; + scaflen+=(x*i); + maxScaf=i; + } + } + + contigs+=llist.size; + for(int i=0; i0); + contiglen+=x; + maxContig=Tools.max(maxContig, x); + } + + scaffolds+=tlist.size(); + for(Triple tp : tlist){ + scaflen+=tp.length; + maxScaf=Tools.max(maxScaf, tp.length); + } + + 
if(FORMAT<3){ + sb.append("Main genome scaffold total: \t"+scaffolds+"\n"); + sb.append("Main genome contig total: \t"+contigs+"\n"); + } + + if(FORMAT==1){ + sb.append("Main genome scaffold sequence total:\t"+String.format("%.3f MB",scaflen/1000000f)+"\n"); + sb.append("Main genome contig sequence total: \t"+String.format("%.3f MB \t%.3f%% gap",contiglen/1000000f,(scaflen-contiglen)*100f/scaflen)+"\n"); + }else if(FORMAT==2){ + sb.append("Main genome scaffold sequence total:\t"+scaflen+"\n"); + sb.append("Main genome contig sequence total: \t"+String.format("%d \t%.3f%% gap",contiglen,(scaflen-contiglen)*100f/scaflen)+"\n"); + }else if(FORMAT==3 || FORMAT==6){ + + }else if(FORMAT==4){ + + }else if(FORMAT==5){ + + }else{throw new RuntimeException("Unknown format");} + + if(FORMAT<3){ + sb2.append("\nMinimum \tNumber \tNumber \tTotal \tTotal \tScaffold\n"); + sb2.append("Scaffold\tof \tof \tScaffold \tContig \tContig \n"); + sb2.append("Length \tScaffolds \tContigs \tLength \tLength \tCoverage\n"); + sb2.append("--------\t--------------\t--------------\t--------------\t--------------\t--------\n"); + } + + int[] lims=new int[] {0, 50, 100, 250, 500, 1000, 2500, 5000, 10000, 25000, 50000, 100000, + 250000, 500000, 1000000, 2500000, 5000000, 10000000, 25000000, 50000000, 100000000, 250000000}; + + int lidx=0; + int next=0; + long csum=contigs; + long ssum=scaffolds; + long clen=contiglen; + long slen=scaflen; + + long cn50=-1; + long ln50=-1; + long cl50=-1; + long ll50=-1; + + long shalf=slen/2; + long chalf=clen/2; + + long numOver50=0; + float fractionOver50=0; + + + //Disable printing of 50~500 when not needed +// { +// boolean b=true; +// for(int i=0; i<500 && i=50){ + for(int i=1; i0; i++){ + // System.out.println("\n\ti="+i+", lidx="+lidx+", lims.length="+lims.length+", next="+next+", sarray.length="+sarray.length+", slen="+slen); + + if(i==next){ + prevLine=formatX(next, ssum, csum, slen, clen); +// if(!skipDuplicateLines || ssum!=prevSsum || csum!=prevCsum || 
slen!=prevSlen || clen!=prevClen){ + sb2.append(prevLine); + sb2.append('\n'); + prevLine=null; +// } + prevSsum=ssum; prevCsum=csum; prevSlen=slen; prevClen=clen; + + lidx++; + if(lidx\ti="+i+", lidx="+lidx+", lims.length="+lims.length+", next="+next+", sarray.length="+sarray.length+", slen="+slen); + // System.out.println(sb2); + } + // System.out.println("\ti="+i+", lidx="+lidx+", lims.length="+lims.length+", next="+next+", sarray.length="+sarray.length+", slen="+slen); + + if(i==50001){ + numOver50=ssum; + fractionOver50=slen*100f/scaflen; + } + + // long a=carray[i]; + long b=sarray[i]; + long c=scarray1[i]; + long d=scarray2[i]; + + if(b>0){ + csum-=c; + ssum-=b; + clen-=d; + slen-=(b*i); + } + + if(ln50==-1 && slen<=shalf){ + ln50=i; + ll50=ssum+b; + } + // System.out.println("\tb="+b+", c="+c+", d="+d+", csum="+csum+", ssum="+ssum+", clen="+clen+", slen="+slen); + + // System.out.println("\ti="+i+", lidx="+lidx+", lims.length="+lims.length+", next="+next+", sarray.length="+sarray.length+", slen="+slen); + + } + + for(Triple tp : tlist){ + // assert(false) : tlist; + while(tp.length>=next && lidx\n"+sb2+"\ni="+"?"+", lidx="+lidx+", lims.length="+lims.length+", next="+next); + // else{next=-1;} + } + + if(numOver50==0 && tp.length>50000){ + numOver50=ssum; + fractionOver50=slen*100f/scaflen; + } + + // long a=carray[i]; + long b=tp.length; + long c=tp.contigs; + long d=tp.contiglen; + + if(b>0){ + csum-=c; + ssum-=1; + clen-=d; + slen-=b; + } + + if(ln50==-1 && slen<=shalf){ + ln50=b; + ll50=ssum+1; + } + + } + if(prevLine!=null){ + sb2.append(prevLine); + prevLine=null; + } + } + + clen=contiglen; + csum=contigs; + for(int i=0; i0; i++){ + long a=carray[i]; + + csum-=a; + clen-=a*i; + + if(cn50==-1 && clen<=chalf){ + cn50=i; + cl50=csum+a; + } + } + + for(int i=0; i0; i++){ + long a=larray[i]; + + csum-=1; + clen-=a; + + if(cn50==-1 && clen<=chalf){ + cn50=a; + cl50=csum+1; + } + } + + cn50=Tools.max(cn50, 0); + ln50=Tools.max(ln50, 0); + 
cl50=Tools.max(cl50, 0); + ll50=Tools.max(ll50, 0); + + if(FORMAT<3){ + + if(addfilename){sb.append("Filename: \t"+name+"\n");} + sb.append("Main genome scaffold N/L50: \t"+ll50+"/"+formatKB(ln50, 3, 0)+"\n"); + sb.append("Main genome contig N/L50: \t"+cl50+"/"+formatKB(cn50, 3, 0)+"\n"); + sb.append("Max scaffold length: \t"+formatKB(maxScaf, 3, 0)+"\n"); + sb.append("Max contig length: \t"+formatKB(maxContig, 3, 0)+"\n"); + sb.append("Number of scaffolds > 50 KB: \t"+numOver50+"\n"); + sb.append("% main genome in scaffolds > 50 KB: \t"+String.format("%.2f%%", fractionOver50)+"\n"); + if(printheadersize){sb.append("Header:\t"+formatKB(HEADERLENSUM, 3, 0)+(HEADERLENSUM<1000 ? " bytes" : ""));} + + // System.out.println(); + // System.out.println("Scaffolds: "+Tools.sum(slist.array)); + // for(int i=0; i0){System.out.print(i+":"+slist.array[i]+", ");} + // } + // System.out.println(); + // System.out.println("Contigs:"+Tools.sum(clist.array)); + // for(int i=0; i0){System.out.print(i+":"+clist.array[i]+", ");} + // } + + if(GCFORMAT==1 || GCFORMAT==3 || GCFORMAT==4){ + System.out.println(" \tA\tC\tG\tT\tN\tGC\tGC_stdev"); + System.out.println(toString3(new StringBuilder("Base Content"), counts)); + }else{ + System.out.println(" \tGC\tGC_stdev"); + System.out.println(toString3(new StringBuilder("Base Content"), counts)); + } + + System.out.println(sb); + System.out.println(sb2); + }else if(FORMAT==3 || FORMAT==6){ + + if(useheader){ + if(FORMAT==6){sb.append('#');} + if(N_UNDERSCORE){sb.append("n_");} + sb.append("scaffolds\t"); + if(N_UNDERSCORE){sb.append("n_");} + sb.append("contigs\t"); + sb.append("scaf_bp\t"); + sb.append("contig_bp\t"); + sb.append("gap_pct\t"); + sb.append("scaf_N50\t"); + sb.append("scaf_L50\t"); + sb.append("ctg_N50\t"); + sb.append("ctg_L50\t"); + sb.append("scaf_max\t"); + sb.append("ctg_max\t"); + sb.append("scaf_n_gt50K\t"); + sb.append("scaf_pct_gt50K\t"); + sb.append("gc_avg\t"); + sb.append("gc_std"); + 
if(addfilename){sb.append("\tfilename");} + + sb.append("\n"); + } + + sb.append(scaffolds+"\t"); + sb.append(contigs+"\t"); + sb.append(scaflen+"\t"); + sb.append(String.format("%d",contiglen)+"\t"); + sb.append(String.format("%.3f",(scaflen-contiglen)*100f/scaflen)+"\t"); + sb.append(ll50+"\t"); + sb.append(formatKB(ln50, 3, 0)+"\t"); + sb.append(cl50+"\t"); + sb.append(formatKB(cn50, 3, 0)+"\t"); + sb.append(formatKB(maxScaf, 3, 0)+"\t"); + sb.append(formatKB(maxContig, 3, 0)+"\t"); + sb.append(numOver50+"\t"); + sb.append(String.format("%.3f", fractionOver50)+"\t"); + sb.append(String.format("%.5f", (counts[1]+counts[2])*1.0/(counts[0]+counts[1]+counts[2]+counts[3]))+"\t"); + sb.append(String.format("%.5f", gc_stdS)); + if(addfilename){sb.append('\t').append(name);} + + System.out.println(sb); + }else if(FORMAT==4){ + + if(useheader){ + + if(N_UNDERSCORE){sb.append("n_");} + sb.append("scaffolds\t"); +// sb.append("contigs\t"); + sb.append("scaf_bp\t"); +// sb.append("contig_bp\t"); +// sb.append("gap_pct\t"); + sb.append("scaf_N50\t"); + sb.append("scaf_L50\t"); +// sb.append("ctg_N50\t"); +// sb.append("ctg_L50\t"); + sb.append("scaf_max\t"); +// sb.append("ctg_max\t"); + sb.append("scaf_n_gt50K\t"); + sb.append("scaf_pct_gt50K"); +// sb.append("gc_avg"); +// sb.append("gc_std"); + if(addfilename){sb.append("\tfilename");} + + sb.append("\n"); + } + + sb.append(scaffolds+"\t"); +// sb.append(contigs+"\t"); + sb.append(scaflen+"\t"); +// sb.append(String.format("%d",contiglen)+"\t"); +// sb.append(String.format("%.3f",(scaflen-contiglen)*100f/scaflen)+"\t"); + sb.append(ll50+"\t"); + sb.append(formatKB(ln50, 3, 0)+"\t"); +// sb.append(cl50+"\t"); +// sb.append(formatKB(cn50, 3, 0)+"\t"); + sb.append(formatKB(maxScaf, 3, 0)+"\t"); +// sb.append(formatKB(maxContig, 3, 0)+"\t"); + sb.append(numOver50+"\t"); + sb.append(String.format("%.3f", fractionOver50)); +// sb.append(String.format("%.5f", (counts[1]+counts[2])*1.0/(counts[0]+counts[1]+counts[2]+counts[3]))); 
+ if(addfilename){sb.append('\t').append(name);} + System.out.println(sb); + }else if(FORMAT==5){ + + if(useheader){ +// sb.append("scaffolds\t"); + if(N_UNDERSCORE){sb.append("n_");} + sb.append("contigs\t"); +// sb.append("scaf_bp\t"); + sb.append("contig_bp\t"); + sb.append("gap_pct\t"); +// sb.append("scaf_N50\t"); +// sb.append("scaf_L50\t"); + sb.append("ctg_N50\t"); + sb.append("ctg_L50\t"); +// sb.append("scaf_max\t"); + sb.append("ctg_max\t"); +// sb.append("scaf_n_gt50K\t"); +// sb.append("scaf_pct_gt50K\t"); + sb.append("gc_avg\t"); + sb.append("gc_std"); + if(addfilename){sb.append("\tfilename");} + + sb.append("\n"); + } + +// sb.append(scaffolds+"\t"); + sb.append(contigs+"\t"); +// sb.append(scaflen+"\t"); + sb.append(String.format("%d",contiglen)+"\t"); + sb.append(String.format("%.3f",(scaflen-contiglen)*100f/scaflen)+"\t"); +// sb.append(ll50+"\t"); +// sb.append(formatKB(ln50, 3, 0)+"\t"); + sb.append(cl50+"\t"); + sb.append(formatKB(cn50, 3, 0)+"\t"); +// sb.append(formatKB(maxScaf, 3, 0)+"\t"); + sb.append(formatKB(maxContig, 3, 0)+"\t"); +// sb.append(numOver50+"\t"); +// sb.append(String.format("%.3f", fractionOver50)+"\t"); + sb.append(String.format("%.5f", (counts[1]+counts[2])*1.0/(counts[0]+counts[1]+counts[2]+counts[3]))+"\t"); + sb.append(String.format("%.5f", gc_stdS)); + if(addfilename){sb.append('\t').append(name);} + System.out.println(sb); + } + + if(showspeed){ + if(!printheadersize){System.err.println("Header:\t"+formatKB(HEADERLENSUM, 3, 0)+(HEADERLENSUM<1000 ? 
" bytes" : ""));} + System.err.println("Time: \t"+t); + long bytes=new File(in).length(); + if(bytes<1){bytes=LIMSUM;} + double mbps=bytes*1000d/t.elapsed; + double mbpps=Tools.sum(counts)*1000d/t.elapsed; + System.err.println(String.format("Speed:\t%.2f MBytes/s",mbps)); + System.err.println(String.format(" \t%.2f MBases/s",mbpps)); + } + + if(bbmapkmer>0){ + System.err.println("BBMap minimum memory estimate at k="+bbmapkmer+": "+estimateBBMapMemory(counts, scaffolds, HEADERLENSUM, bbmapkmer)); + } + + + } + + + /** + * @param counts + * @param scaffolds + * @param hEADERLENSUM2 + * @param i + * @return + */ + private static long bbmapMemoryBytes(long[] acgtn, long scaffolds, + long headerlen, int k) { + + long keyspace=(1L<<(2*k)); + long defined=acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3]; + long undefined=acgtn[4]; + long midpad=(scaffolds*(FastaToChromArrays.MID_PADDING)); + long total=defined+undefined+midpad; + int chromlen=FastaToChromArrays.MAX_LENGTH-FastaToChromArrays.END_PADDING-FastaToChromArrays.START_PADDING; + int chroms=(int)(total/chromlen); + int chromsperblock=Integer.MAX_VALUE/chromlen; + int blocks=(chroms+chromsperblock-1)/chromsperblock; + long memperblock=keyspace*4; + long memforcounts=keyspace*4; + + long mem=0; + mem+=total; //reference bulk, including inter-scaffold padding + mem+=(chroms*(FastaToChromArrays.END_PADDING+FastaToChromArrays.START_PADDING)); //reference tip padding + mem+=headerlen; //Header name byte arrays + mem+=(scaffolds*(4+4+4+16+8)); //Other structures for scaffold info + mem+=(blocks*(memperblock)); //start array for each block + mem+=memforcounts; //count array + mem+=(defined*4); //key lists + mem=(long)(mem/0.66); //Expand to compensate for garbage collection + if(k>13){mem=mem+1000000000;} + return mem; + } + + + /** + * @param counts + * @param scaffolds + * @param hEADERLENSUM2 + * @param i + * @return + */ + private static CharSequence estimateBBMapMemory(long[] acgtn, long scaffolds, + long headerlen, int k) { + 
long mem=180+bbmapMemoryBytes(acgtn, scaffolds, headerlen, k)/1000000; //in megabytes + if(mem>4000){ + return "-Xmx"+((mem+1500)/1000)+"g \t(at least "+(long)Math.ceil((((mem+1500)/0.85)/1000))+" GB physical RAM)"; + }else if(mem>2100){ + return "-Xmx"+((mem+999)/1000)+"g \t(at least "+(long)Math.ceil((((mem+999)/0.85)/1000))+" GB physical RAM)"; + }else{ + return "-Xmx"+(((((mem*11)/8+50))/10)*10)+"m \t(at least "+((((long)(((mem*10)/8+50)/0.82))/10)*10)+" MB physical RAM)"; + } + } + + + public static long bench(InputStream is) throws IOException{ + final byte[] buf=new byte[32768]; + long sum=0; + for(long len=is.read(buf); len>0; len=is.read(buf)){sum+=len;} + return sum; + } + + public static long[] countFasta(final InputStream is, final String out, final int maxNs) throws IOException{ + + GCBINS2=(GCBINS>=1000 ? GCBINS : GCBINS*10); + + long limsum=0; + long headerlen=0; + final byte[] buf=new byte[32768]; + final TextStreamWriter tsw=(out==null ? null : new TextStreamWriter(out, overwrite, false, false)); + if(tsw!=null){tsw.start();} + final int[] counts=new int[6]; + final long[] overall=new long[6]; + final StringBuilder hdr=(out==null ? null : new StringBuilder()); + boolean hdmode=false; + + final LongList clist=new LongList((int)Tools.min(1<<15, cutoff+1)); //Number of contigs of length x + final LongList slist=new LongList((int)Tools.min(1<<15, cutoff+1)); //Number of scaffolds of length x + final LongList sclist1=new LongList((int)Tools.min(1<<15, cutoff+1)); //Sum of contigs per scaffold of length x + final LongList sclist2=new LongList((int)Tools.min(1<<15, cutoff+1)); //Sum of contig lengths per scaffold of length x + + final LongList llist=new LongList(64); //List of contig lengths for contigs at least cutoff in length + final ArrayList tlist=new ArrayList(64); //List of scaf len, contigs, contig sum for scaffolds at least cutoff in length + +// final long[] gchist=(calcGChist ? 
new long[GCBINS] : null); + final long[] gchist=new long[GCBINS2]; + final long[] gchist_by_base=new long[GCBINS2]; + + int i=0; + int lim=is.read(buf); + limsum+=lim; + + int contigs=0; + int contiglen=0; +// int contiglensum=0; + int scaffoldlen=0; + int ns=0; + + final IntList currentContigs=new IntList(10000); + + while(lim>0){ + if(hdmode){ + if(hdr==null){ + while(i0){ + +// if(contiglen>0 || contigs==0){ +// contigs++; +// contiglensum+=contiglen; +// if(contiglen0 || contigs==0){ + currentContigs.set(contigs, contiglen); + contigs++; +// System.out.println("For header "+hdr+": added contig. len="+contiglen+", contigs="+contigs); +// contiglensum+=contiglen; + } + } + +// assert(false); + if(scaffoldlen>=MINSCAF){ + + int contiglensum=0; + {//NEW +// System.out.println("Dumping "+contigs+" contigs."); + for(int j=0; j0 || contigs==0){ + contiglensum+=cl; + if(cl0){ + int index=Tools.min((int)((gc*GCBINS2)/atgc),GCBINS2-1); + gchist[index]++; + gchist_by_base[index]+=scaffoldlen; +// assert(false); + } + } + for(int j=0; jslashr){ + counts[charToNum[c]]++; + scaffoldlen++; + + if(c!=noref && c!=noref2){ + ns=0; + contiglen++; + }else{ + ns++; + if(ns==maxNs && contiglen>0){ +// if(contiglen=lim){ + i=0; + lim=is.read(buf); + limsum+=lim; + } + } + + if(scaffoldlen>0){ + +// if(contiglen>0 || contigs==0){ +// contigs++; +// contiglensum+=contiglen; +// if(contiglen0 || contigs==0){ + currentContigs.set(contigs, contiglen); + contigs++; +// contiglensum+=contiglen; + } + } + + if(scaffoldlen>=MINSCAF){ + + int contiglensum=0; + {//NEW +// System.out.println("Dumping "+contigs+" contigs."); + for(int j=0; j0 || contigs==0){ + contiglensum+=cl; + if(cl0 || contigs==0){ +// contigs++; +// contiglensum+=contiglen; +// clist.increment(contiglen, 1); +// } +// sclist1.increment(scaffoldlen, contigs); +// sclist2.increment(scaffoldlen, contiglensum); + + if(hdr!=null){ + tsw.print(toString2(hdr, counts)); + hdr.setLength(0); + } + + { + long gc=counts[1]+counts[2]; + 
long atgc=gc+counts[0]+counts[3]; + if(atgc>0){ + int index=Tools.min((int)((gc*GCBINS2)/atgc),GCBINS2-1); + gchist[index]++; + gchist_by_base[index]+=scaffoldlen; + } + } + for(int j=0; j0){ + len+=(a*i); + num+=a; + sb.append(num); + sb.append('\t'); + sb.append(len); + sb.append('\n'); + tsw.print(sb.toString()); + sb.setLength(0); + } + } + + if(tlist!=null){ + for(Triple t : tlist){ + len+=t.length; + num++; + sb.append(num); + sb.append('\t'); + sb.append(len); + sb.append('\n'); + tsw.print(sb.toString()); + sb.setLength(0); + } + } + }else{ + + if(tlist!=null){ + for(int i=tlist.size()-1; i>=0; i--){ + Triple t=tlist.get(i); + len+=t.length; + num++; + sb.append(num); + sb.append('\t'); + sb.append(len); + sb.append('\n'); + tsw.print(sb.toString()); + sb.setLength(0); + } + } + for(int i=lim-1; i>=0; i--){ + final long a=array[i]; + if(a>0){ + len+=(a*i); + num+=a; + sb.append(num); + sb.append('\t'); + sb.append(len); + sb.append('\n'); + tsw.print(sb.toString()); + sb.setLength(0); + } + } + } + tsw.poisonAndWait(); + } + + private static String toString2(StringBuilder sb, int[] counts){ + long sum=(long)counts[0]+(long)counts[1]+(long)counts[2]+(long)counts[3]; + long sum2=sum+counts[4]; + float inv1=1f/sum; + if(GCFORMAT==1){ + return sb.append(String.format("\t0\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n", + sum, counts[0]*inv1, counts[1]*inv1, counts[2]*inv1, counts[3]*inv1, counts[4]*1f/sum2, (counts[1]+counts[2])*inv1)).toString(); + }else if(GCFORMAT==2){ + return sb.append(String.format("\t%.5f\n", (counts[1]+counts[2])*inv1)).toString(); + }else if(GCFORMAT==3){ + return sb.append(String.format("\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n", + sum, counts[0]*inv1, counts[1]*inv1, counts[2]*inv1, counts[3]*inv1, counts[4]*1f/sum2, (counts[1]+counts[2])*inv1)).toString(); + }else if(GCFORMAT==4){ + return sb.append(String.format("\t%d\t%.5f\n", sum, (counts[1]+counts[2])*inv1)).toString(); + }else{ + throw new RuntimeException("Unknown format."); + } 
+ } + + private static String toString2(StringBuilder sb, long[] counts){ + long sum=(long)counts[0]+(long)counts[1]+(long)counts[2]+(long)counts[3]; + long sum2=sum+counts[4]; + float inv1=1f/sum; + if(GCFORMAT==1){ + return sb.append(String.format("\t0\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n", + sum, counts[0]*inv1, counts[1]*inv1, counts[2]*inv1, counts[3]*inv1, counts[4]*1f/sum2, (counts[1]+counts[2])*inv1)).toString(); + }else if(GCFORMAT==2){ + return sb.append(String.format("\t%.5f\n", (counts[1]+counts[2])*inv1)).toString(); + }else if(GCFORMAT==3){ + return sb.append(String.format("\t%d\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\t%.5f\n", + sum, counts[0]*inv1, counts[1]*inv1, counts[2]*inv1, counts[3]*inv1, counts[4]*1f/sum2, (counts[1]+counts[2])*inv1)).toString(); + }else if(GCFORMAT==4){ + return sb.append(String.format("\t%d\t%.5f\n", sum, (counts[1]+counts[2])*inv1)).toString(); + }else{ + throw new RuntimeException("Unknown format."); + } + } + + private static String toString3(StringBuilder sb, long[] counts){ + long sum=(long)counts[0]+(long)counts[1]+(long)counts[2]+(long)counts[3]; + long sum2=sum+counts[4]; + float inv1=1f/sum; + if(GCFORMAT==1 || GCFORMAT==3 || GCFORMAT==4){ + return sb.append(String.format("\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\t%.4f\n", + counts[0]*inv1, counts[1]*inv1, counts[2]*inv1, counts[3]*inv1, counts[4]*1f/sum2, (counts[1]+counts[2])*inv1, gc_stdS)).toString(); + }else if(GCFORMAT==2){ + return sb.append(String.format("\t%.5f\t%.5f\n", (counts[1]+counts[2])*inv1, gc_stdS)).toString(); + }else{ + throw new RuntimeException("Unknown format."); + } + + } + + + private static final String formatX(int next, long ssum, long csum, long slen, long clen){ + float cov=clen*100f/slen; + + final String s; + if(FORMAT==1){ + s=formatKB_all(next, 1, 7)+" \t"+formatComma(ssum, 14)+"\t"+formatComma(csum, 14)+"\t"+formatComma(slen, 14)+"\t"+formatComma(clen, 14)+"\t"+formatPercent(cov); + }else if(FORMAT>=2){ + s=formatKB_all(next, 1, 7)+" 
\t"+formatComma(ssum, 14)+"\t"+formatComma(csum, 14)+"\t"+formatComma(slen, 14)+"\t"+formatComma(clen, 14)+"\t"+formatPercent(cov); + }else{ + throw new RuntimeException("Unknown format: "+FORMAT); + } + return s; + } + + private static final String formatKB(long x, int precision, int width){ + String s; + if(FORMAT>=2 || x<1000){ + s=Long.toString(x); + }else if(x<1000000){ + s=String.format("%."+precision+"f",x/1000f); + while(s.contains(".") && (s.endsWith("0") || s.endsWith("."))){s=s.substring(0, s.length()-1);} + s=s+" KB"; + }else{ + s=String.format("%."+precision+"f",x/1000000f); + while(s.contains(".") && (s.endsWith("0") || s.endsWith("."))){s=s.substring(0, s.length()-1);} + s=s+" MB"; + } + while(s.length()=2 || x<1000){ + s=Long.toString(x); + }else if(x<1000000){ + s=String.format("%."+precision+"f",x/1000f); + while(s.contains(".") && (s.endsWith("0") || s.endsWith("."))){s=s.substring(0, s.length()-1);} + s=s+" KB"; + }else{ + s=String.format("%."+precision+"f",x/1000000f); + while(s.contains(".") && (s.endsWith("0") || s.endsWith("."))){s=s.substring(0, s.length()-1);} + s=s+" MB"; + } + + while(s.length()0){ + while(sb.length()=2){ + sb.append(x); + }else{ + throw new RuntimeException("Unknown format: "+FORMAT); + } + while(sb.length()']=r['@']=r['+']=5; + return r; + } + + private static final byte[] charToNum=makeCharToNum(); + public static int GCFORMAT=1; + public static int FORMAT=1; + private static long cutoff=1000000; + + private static long LIMSUM=0; + private static long HEADERLENSUM=0; + private static int GCBINS=200; + private static int GCBINS2; + private static int MINSCAF=0; + private static int GCHIST_DECIMALS=3; + + private static int bbmapkmer=0;//13; + public static boolean overwrite=false; + public static boolean useheader=true; + public static boolean addfilename=false; + public static boolean showspeed=false;//true; + public static boolean printheadersize=false; + public static boolean skipDuplicateLines=true; + public static 
boolean N_UNDERSCORE=true; + + private final static byte slashr='\r', slashn='\n', carrot='>', at='@', noref='N', noref2='n'; + + /** Number of contigs of length x */ + private static LongList clistS=null; + + /** Number of scaffolds of length x */ + private static LongList slistS=null; + + /** Sum of contigs per scaffold of length x */ + private static LongList sclist1S=null; + + /** Sum of contig lengths per scaffold of length x */ + private static LongList sclist2S=null; + + /** List of contig lengths for contigs at least cutoff in length */ + private static LongList llistS=null; + + /** List of scaf len, contigs, contig sum for scaffolds at least cutoff in length */ + private static ArrayList tlistS=null; + + /** Downsampled gc histogram */ + private static long[] gchistS; + + /** gc standard deviation */ + private static double gc_stdS; + + /** Downsampled gc histogram, using base counts rather than scaffold counts */ + private static long[] gchist_bb_S; + + /** gc standard deviation, using base counts rather than scaffold counts */ + private static double gc_bb_stdS; + + private static class Triple implements Comparable{ + + Triple(long len_, long contigs_, long contiglen_){ + length=len_; + contigs=contigs_; + contiglen=contiglen_; + } + + @Override + public int compareTo(Triple o) { + if(length>o.length){return 1;} + if(length alist=new ArrayList(); + ArrayList ilist=new ArrayList(); + + alist.add(""); + alist.add("header=t"); + alist.add("showspeed=f"); + alist.add("addname=t"); + alist.add("k=0"); + + for(String arg : args){ + if(!arg.contains("=")){ + ilist.add("in="+arg); + }else{ + String[] split=arg.split("="); + if(split[0].equalsIgnoreCase("in") || split[0].equalsIgnoreCase("ref")){ + ilist.add(arg); + }else{ + alist.add(arg); + } + } + } + + + String[] args2=alist.toArray(new String[alist.size()]); + for(int i=0; i0){ + args2[1]="header=f"; + AssemblyStats2.reset(); + System.gc(); + synchronized(AssemblyStatsWrapper.class){ + try { + 
AssemblyStatsWrapper.class.wait(100); + } catch (InterruptedException e) {} + } + Thread.yield(); + } + AssemblyStats2.main(args2); + } + + } + +} diff --git a/current/jgi/BBDukF.java b/current/jgi/BBDukF.java new file mode 100755 index 0000000..75837d8 --- /dev/null +++ b/current/jgi/BBDukF.java @@ -0,0 +1,2212 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.atomic.AtomicIntegerArray; +import java.util.concurrent.atomic.AtomicLongArray; + +import kmer.AbstractKmerTable; +import kmer.HashArray; +import kmer.HashForest; +import kmer.KmerTable; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import align2.TrimRead; +import dna.AminoAcid; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ReadWrite; +import fileIO.FileFormat; +import fileIO.TextStreamWriter; + +/** + * Separates or trims reads based on matching kmers in a reference. + * Supports arbitrary K and inexact matches. + * Supercedes BBDuk by replacing Java's HashMap with HashArray and HashForest. + * @author Brian Bushnell + * @date Aug 30, 2013 + * + */ +public class BBDukF { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + + /** + * Code entrance from the command line. 
+ * @param args Command line arguments + */ + public static void main(String[] args){ + + if(Tools.parseHelp(args)){ + printOptions(); + System.exit(0); + } + + //Create a new BBDuk instance + BBDukF bbd=new BBDukF(args); + + ///And run it + bbd.process(); + } + + /** + * Display usage information. + */ + private static void printOptions(){ + outstream.println("Syntax:\n"); + outstream.println("\njava -ea -Xmx20g -cp jgi.BBDuk in= out= ref="); + outstream.println("\nOptional flags:"); + outstream.println("in= \tThe 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in."); + outstream.println("in2= \tUse this if 2nd read of pairs are in a different file."); + outstream.println("out= \t(outnonmatch) The 'out=' flag is needed if the output file is not the second parameter. 'out=stdout' will pipe to standard out."); + outstream.println("out2= \t(outnonmatch2) Use this to write 2nd read of pairs to a different file."); + outstream.println("outmatch= \t(outm or outb) Write 'bad' reads here (containing contaminant kmers)."); + outstream.println("outmatch2= \t(outm2 or outb2) Use this to write 2nd read of pairs to a different file."); + outstream.println("stats= \tWrite statistics about which contaminants were detected."); + outstream.println("duk= \tWrite duk-like output."); + outstream.println(""); + outstream.println("threads=auto \t(t) Set number of threads to use; default is number of logical processors."); + outstream.println("overwrite=t \t(ow) Set to false to force the program to abort rather than overwrite an existing file."); + outstream.println("showspeed=t \t(ss) Set to 'f' to suppress display of processing speed."); + outstream.println("interleaved=auto \t(int) If true, forces fastq input to be paired and interleaved."); + outstream.println("k=31 \tKmer length used for finding contaminants. 
Contaminants shorter than k will not be found."); + outstream.println("maskmiddle=t \t(mm) Treat the middle base of a kmer as a wildcard."); + outstream.println("maxbadkmers=0 \t(mbk) Reads with more than this many contaminant kmers will be discarded."); + outstream.println("minavgquality=0 \t(maq) Reads with average quality (before trimming) below this will be discarded."); + outstream.println("touppercase=f \t(tuc) Change all letters in reads and reference to upper-case."); + outstream.println("ktrim=f \tTrim reads to remove bases matching reference kmers. "); + outstream.println(" \tValues: f (don't trim), r (trim right end), l (trim left end), n (convert to N instead of trimming)."); + outstream.println("useshortkmers=f \t(usk) Look for shorter kmers at read tips (only for k-trimming)."); + outstream.println("mink=4 \tMinimum length of short kmers. Setting this automatically sets useshortkmers=t."); + outstream.println("qtrim=f \tTrim read ends to remove bases with quality below minq. Performed AFTER looking for kmers. "); + outstream.println(" \tValues: t (trim both ends), f (neither end), r (right end only), l (left end only)."); + outstream.println("minq=4 \tTrim quality threshold."); + outstream.println("minlength=2 \t(ml) Reads shorter than this after trimming will be discarded. Pairs will be discarded only if both are shorter."); + outstream.println("ziplevel=2 \t(zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster."); + outstream.println("fastawrap=100 \tLength of lines in fasta output"); + outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto"); + outstream.println("qout=auto \tASCII offset for output quality. 
May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)"); + outstream.println("rcomp=t \tLook for reverse-complements of kmers also."); + outstream.println("forest=t \tUse HashForest data structure"); + outstream.println("table=f \tUse KmerTable data structure"); + outstream.println("array=f \tUse HashArray data structure"); + } + + + /** + * Constructor. + * @param args Command line arguments + */ + public BBDukF(String[] args){ + for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}} + System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + /* Set global defaults */ + ReadWrite.ZIPLEVEL=2; + ReadWrite.USE_UNPIGZ=true; + ReadWrite.USE_PIGZ=true; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + FastaReadInputStream.SPLIT_READS=false; + ByteFile.FORCE_MODE_BF2=Shared.THREADS>2; + + /* Initialize local variables with defaults */ + boolean setOut=false, setOutb=false, qtrimRight_=false, qtrimLeft_=false; + boolean ktrimRight_=false, ktrimLeft_=false, ktrimN_=false, ktrimExclusive_=false; + boolean addTrimmedToBad_=true; + boolean rcomp_=true; + boolean forbidNs_=false; + boolean useForest_=false, useTable_=false, useArray_=true; + int k_=28, kbig_=-1; + int mink_=-1; + int ways_=-1; //Currently disabled + int maxBadKmers_=0; + long skipreads_=0; + byte qin=-1, qout=-1; + byte TRIM_SYMBOL_='N'; + + byte trimq_=4; + byte minAvgQuality_=0; + int minReadLength_=20; + float minLenFraction_=0f; + int maxNs_=-1; + boolean removePairsIfEitherBad_=true; + + scaffoldNames.add(""); //Necessary so that the first real scaffold gets an id of 1, not zero + + { + boolean b=false; + assert(b=true); + EA=b; + } + + /* Parse arguments */ + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in") || a.equals("in1")){ + in1=b; + }else if(a.equals("in2")){ + in2=b; + }else if(a.equals("out") || a.equals("out1") || a.equals("outu") || a.equals("outu1") || a.equals("outnonmatch") || + a.equals("outnonmatch1") || a.equals("outunnmatch") || a.equals("outunmatch1") || a.equals("outunnmatched") || a.equals("outunmatched1")){ + out1=b; + setOut=true; + }else if(a.equals("out2") || a.equals("outu2") || a.equals("outnonmatch2") || a.equals("outunmatch2") || + a.equals("outnonmatched2") || a.equals("outunmatched2")){ + out2=b; + }else if(a.equals("outb") || a.equals("outm") || a.equals("outb1") || a.equals("outm1") || a.equals("outbad") || + a.equals("outbad1") || a.equals("outmatch") || a.equals("outmatch1")){ + outb1=b; + setOut=true; + }else if(a.equals("outb2") || a.equals("outm2") || a.equals("outbad2") || a.equals("outmatch2")){ + outb2=b; + }else if(a.equals("stats")){ + outstats=b; + }else if(a.equals("duk") || a.equals("outduk")){ + outduk=b; + }else if(a.equals("rqc")){ + outrqc=b; + }else if(a.equals("ref")){ + ref=(b==null) ? null : (new File(b).exists() ? new String[] {b} : b.split(",")); + }else if(a.equals("literal")){ + literal=(b==null) ? 
null : b.split(","); +// assert(false) : b+", "+Arrays.toString(literal); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("forest")){ + useForest_=Tools.parseBoolean(b); + if(useForest_){useTable_=useArray_=false;} + }else if(a.equals("table")){ + useTable_=Tools.parseBoolean(b); + if(useTable_){useForest_=useArray_=false;} + }else if(a.equals("array")){ + useArray_=Tools.parseBoolean(b); + if(useArray_){useTable_=useForest_=false;} + }else if(a.equals("ways")){ + ways_=Integer.parseInt(b); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("k")){ + assert(b!=null) : 
"\nThe k key needs an integer value greater than 0, such as k=28\n"; + k_=Integer.parseInt(b); + if(k_>31){ + kbig_=k_; + k_=31; + }else{ + kbig_=-1; + } + assert(k_>0 && k_<32) : "k must be at least 1; default is 28."; + }else if(a.equals("mink") || a.equals("kmin")){ + mink_=Integer.parseInt(b); + assert(mink_<0 || (mink_>0 && mink_<32)) : "kmin must be between 1 and 31; default is 4, negative numbers disable it."; + }else if(a.equals("useshortkmers") || a.equals("shortkmers") || a.equals("usk")){ + useShortKmers=Tools.parseBoolean(b); + }else if(a.equals("dist") || a.equals("distance") || a.equals("hdist") || a.equals("hammingdistance")){ + hammingDistance=Integer.parseInt(b); + assert(hammingDistance>=0 && hammingDistance<4) : "hamming distance must be between 0 and 3; default is 0."; + }else if(a.equals("edits") || a.equals("edist") || a.equals("editdistance")){ + editDistance=Integer.parseInt(b); + assert(editDistance>=0 && editDistance<3) : "edit distance must be between 0 and 2; default is 0."; + }else if(a.equals("maxskip") || a.equals("mxs")){ + maxSkip=Integer.parseInt(b); + }else if(a.equals("minskip") || a.equals("mns")){ + minSkip=Integer.parseInt(b); + }else if(a.equals("skipreads")){ + skipreads_=Long.parseLong(b); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=(b==null || b.equalsIgnoreCase("auto") ? 
Shared.THREADS : Integer.parseInt(b)); + }else if(a.equals("maxbadkmers") || a.equals("mbk")){ + maxBadKmers_=Integer.parseInt(b); + }else if(a.equals("minavgquality") || a.equals("maq")){ + minAvgQuality_=(byte)Integer.parseInt(b); + }else if(a.equals("maxns")){ + maxNs_=Integer.parseInt(b); + }else if(a.equals("showspeed") || a.equals("ss")){ + showSpeed=Tools.parseBoolean(b); + }else if(a.equals("verbose")){ + assert(false) : "Verbose flag is currently static final; must be recompiled to change."; + assert(WAYS>1) : "WAYS=1 is for debug mode."; +// verbose=Tools.parseBoolean(b); //123 + if(verbose){outstream=System.err;} //For some reason System.out does not print in verbose mode. + }else if(a.equals("mm") || a.equals("maskmiddle")){ + maskMiddle=Tools.parseBoolean(b); + }else if(a.equals("rcomp")){ + rcomp_=Tools.parseBoolean(b); + }else if(a.equals("forbidns") || a.equals("forbidn") || a.equals("fn")){ + forbidNs_=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("removeifeitherbad") || a.equals("rieb")){ + removePairsIfEitherBad_=Tools.parseBoolean(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("fastaminlen") || a.equals("fastaminlength")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("ktrim")){ + if(b==null){b="";} + if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){ktrimLeft_=true;ktrimRight_=false;ktrimN_=false;} + else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){ktrimLeft_=false;ktrimRight_=true;ktrimN_=false;} + else if(b.equalsIgnoreCase("n")){ktrimLeft_=false;ktrimRight_=false;ktrimN_=true;} + else if(b.length()==1 && !b.equalsIgnoreCase("t") && !b.equalsIgnoreCase("f")){ + ktrimLeft_=false;ktrimRight_=false;ktrimN_=true; + TRIM_SYMBOL_=(byte)b.charAt(0); + }else{ + boolean x=Tools.parseBoolean(b); + assert(!x) : "\nInvalid setting for ktrim - values 
must be f (false), l (left), r (right), or n."; + ktrimRight_=ktrimLeft_=ktrimN_=x; + } + }else if(a.equals("ktrimright")){ + ktrimRight_=Tools.parseBoolean(b); + ktrimLeft_=ktrimN_=!(ktrimRight_); + }else if(a.equals("ktrimleft")){ + ktrimLeft_=Tools.parseBoolean(b); + ktrimRight_=ktrimN_=!(ktrimLeft_); + }else if(a.equals("ktrimn")){ + ktrimN_=Tools.parseBoolean(b); + ktrimLeft_=ktrimRight_=!(ktrimN_); + }else if(a.equals("ktrimexclusive")){ + ktrimExclusive_=Tools.parseBoolean(b); + }else if(a.equals("trim") || a.equals("qtrim")){ + if(b==null){qtrimRight_=qtrimLeft_=true;} + else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrimLeft_=true;qtrimRight_=false;} + else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrimLeft_=false;qtrimRight_=true;} + else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrimLeft_=qtrimRight_=true;} + else if(Character.isDigit(b.charAt(0))){ + if(!qtrimLeft_ && !qtrimRight_){qtrimLeft_=qtrimRight_=true;} + trimq_=Byte.parseByte(b); + }else{qtrimRight_=qtrimLeft_=Tools.parseBoolean(b);} + }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){ + if(b!=null && (b.charAt(0)=='.' 
|| Character.isDigit(b.charAt(0)))){ + TrimRead.optimalMode=true; + TrimRead.optimalBias=Float.parseFloat(b); + assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("trimright") || a.equals("qtrimright")){ + qtrimRight_=Tools.parseBoolean(b); + }else if(a.equals("trimleft") || a.equals("qtrimleft")){ + qtrimLeft_=Tools.parseBoolean(b); + }else if(a.equals("trimq") || a.equals("trimquality")){ + trimq_=Byte.parseByte(b); + }else if(a.equals("q102matrix") || a.equals("q102m")){ + CalcTrueQuality.q102matrix=b; + }else if(a.equals("qbpmatrix") || a.equals("bqpm")){ + CalcTrueQuality.qbpmatrix=b; + }else if(a.equals("adjustquality") || a.equals("adjq")){ + TrimRead.ADJUST_QUALITY=Tools.parseBoolean(b); + }else if(a.equals("otm") || a.equals("outputtrimmedtomatch")){ + addTrimmedToBad_=Tools.parseBoolean(b); + }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){ + minReadLength_=Integer.parseInt(b); + }else if(a.equals("mlf") || a.equals("minlenfrac") || a.equals("minlenfraction") || a.equals("minlengthfraction")){ + minLenFraction_=Float.parseFloat(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || 
a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + in1=args[i]; + }else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + out1=args[i]; + setOut=true; + }else if(i==2 && ref==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + ref=(new File(args[i]).exists() ? new String[] {args[i]} : args[i].split(",")); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(TrimRead.ADJUST_QUALITY){CalcTrueQuality.initializeMatrices();} + + /* Set final variables; post-process and validate argument combinations */ + + useForest=useForest_; + useTable=useTable_; + useArray=useArray_; + hammingDistance=Tools.max(editDistance, hammingDistance); + minSkip=Tools.max(1, Tools.min(minSkip, maxSkip)); + maxSkip=Tools.max(minSkip, maxSkip); + addTrimmedToBad=addTrimmedToBad_; + rcomp=rcomp_; + forbidNs=(forbidNs_ || hammingDistance<1); + trimSymbol=TRIM_SYMBOL_; + skipreads=skipreads_; + trimq=trimq_; + minAvgQuality=minAvgQuality_; + minReadLength=minReadLength_; + minLenFraction=minLenFraction_; + removePairsIfEitherBad=removePairsIfEitherBad_; + maxNs=maxNs_; +// if(maxReads>0){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;} + +// if(ways_==-1){ +// while(Shared.THREADSk_){ + System.err.println("*********************** WARNING ***********************"); + System.err.println("WARNING: When kmer-trimming, the maximum value of K is "+k_+"."); + System.err.println("K has been reduced from kbig_ to "+k_+"."); + System.err.println("***********************************************************"); + kbig_=k_; + } + } + + k=k_; + k2=k-1; + kbig=kbig_; + if(kbig>k){ + minSkip=maxSkip=0; + 
if(maskMiddle){ + System.err.println("maskMiddle was disabled because kbig>k"); + maskMiddle=false; + } + } + mink=Tools.min((mink_<1 ? 4 : mink_), k); + maxBadKmers=maxBadKmers_; + if(mink_>0 && mink_0); + + ktrimRight=ktrimRight_; + ktrimLeft=ktrimLeft_; + ktrimN=ktrimN_; + ktrimExclusive=ktrimExclusive_; + + assert(!useShortKmers || ktrimRight || ktrimLeft || ktrimN) : "\nSetting mink or useShortKmers also requires setting a ktrim mode, such as 'r', 'l', or 'n'\n"; + + qtrimRight=qtrimRight_; + qtrimLeft=qtrimLeft_; + + middleMask=maskMiddle ? ~(3L<<(2*(k/2))) : -1L; + + hitCounts=(outduk==null ? null : new AtomicLongArray(1001)); + + + /* Adjust I/O settings and filenames */ + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + assert(FastaReadInputStream.settingsOK()); + + if(in1==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(out1!=null && out1.contains("#")){ + int pound=out1.lastIndexOf('#'); + String a=out1.substring(0, pound); + String b=out1.substring(pound+1); + out1=a+1+b; + out2=a+2+b; + } + + if(outb1!=null && outb1.contains("#")){ + int pound=outb1.lastIndexOf('#'); + String a=outb1.substring(0, pound); + String b=outb1.substring(pound+1); + outb1=a+1+b; + outb2=a+2+b; + } + + 
if(!setOut){ + out1="stdout.fq"; + outstream=System.err; + out2=null; + }else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){ + out1="stdout.fq"; + outstream=System.err; + out2=null; + } + if(out1!=null && !Tools.canWrite(out1, overwrite)){throw new RuntimeException("Output file "+out1+" already exists, and overwrite="+overwrite);} + + assert(!in1.equalsIgnoreCase(out1)); + assert(!in1.equalsIgnoreCase(outb1)); + assert(!in1.equalsIgnoreCase(in2)); + assert(out1==null || !out1.equalsIgnoreCase(out2)); + assert(out1==null || !out1.equalsIgnoreCase(outb1)); + assert(outb1==null || !outb1.equalsIgnoreCase(outb2)); + assert(THREADS>0); + + assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1; + assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2; + assert((ref!=null || literal!=null) || qtrimLeft || qtrimRight || minAvgQuality>0 || maxNs>=0) : "No reference files specified, no trimming set, no min avg quality - nothing to do."; + if(ref!=null){ + for(String s0 : ref){ + assert(s0!=null) : "Specified a null reference."; + String s=s0.toLowerCase(); + assert(s==null || s.startsWith("stdin") || s.startsWith("standardin") || new File(s0).exists()) : "Can't find "+s0; + } + } + + //Initialize tables + for(int i=0; i=4){ +// fillSet_MT(); +// }else{ +// fillSet_ST(); //Still in BBDukF_old +// } + + /* Do kmer matching of input reads */ + findKmers(t); + + /* Write statistics to files */ + writeStats(t); + writeDuk(System.nanoTime()-startTime); + writeRqc(); + + outstream.println("Input: \t"+readsIn+" reads \t\t"+basesIn+" bases."); + + if(ref!=null || literal!=null){ + outstream.println("Contaminants: \t"+readsKFiltered+" reads ("+String.format("%.2f",readsKFiltered*100.0/readsIn)+"%) \t"+ + basesKFiltered+" bases 
("+String.format("%.2f",basesKFiltered*100.0/basesIn)+"%)"); + outstream.flush(); + } + if(qtrimLeft || qtrimRight){ + outstream.println("QTrimmed: \t"+readsQTrimmed+" reads ("+String.format("%.2f",readsQTrimmed*100.0/readsIn)+"%) \t"+ + basesQTrimmed+" bases ("+String.format("%.2f",basesQTrimmed*100.0/basesIn)+"%)"); + } + if(ktrimLeft || ktrimRight || ktrimN){ + outstream.println("KTrimmed: \t"+readsKTrimmed+" reads ("+String.format("%.2f",readsKTrimmed*100.0/readsIn)+"%) \t"+ + basesKTrimmed+" bases ("+String.format("%.2f",basesKTrimmed*100.0/basesIn)+"%)"); + } + if(minAvgQuality>0 || maxNs>=0){ + outstream.println("Low quality discards: \t"+readsQFiltered+" reads ("+String.format("%.2f",readsQFiltered*100.0/readsIn)+"%) \t"+ + basesQFiltered+" bases ("+String.format("%.2f",basesQFiltered*100.0/basesIn)+"%)"); + } + outstream.println("Result: \t"+readsOut+" reads ("+String.format("%.2f",readsOut*100.0/readsIn)+"%) \t"+ + basesOut+" bases ("+String.format("%.2f",basesOut*100.0/basesIn)+"%)"); + } + + /** + * Write statistics about how many reads matched each reference scaffold. + * @param t Phase timer + */ + private void writeStats(Timer t){ + if(outstats==null){return;} + final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false); + tsw.start(); + + /* Create StringNum list of scaffold names and hitcounts */ + ArrayList list=new ArrayList(); + for(int i=1; i0){ + final String s=scaffoldNames.get(i); + final StringNum sn=new StringNum(s, num); + list.add(sn); + } + } + Collections.sort(list); + final double rmult=100.0/(readsIn>0 ? 
readsIn : 1); + for(int i=0; i();} + if(evict || !RQC_MAP.containsKey(key)){RQC_MAP.put(key, value);} + } + + /** + * Helper method; formats statistics to be duk-compatible + * @param time Elapsed time, nanoseconds + * @return duk output string + */ + private String dukString(long time){ + StringBuilder sb=new StringBuilder(); + sb.append("##INPUT PARAMETERS##\n"); + sb.append("#Reference file: "+(ref==null || ref.length<1 ? null : ref.length==1 ? ref[0] : Arrays.toString(ref))+"\n"); + sb.append("#Query file: "+in1+(in2==null ? "" : ","+in2)+"\n"); + sb.append("#Not matched reads file: "+out1+(out2==null ? "" : ","+out2)+"\n"); + sb.append("#Matched reads file: "+outb1+(outb2==null ? "" : ","+outb2)+"\n"); + sb.append("#Output file (duk): "+outduk+"\n"); + sb.append("#Output file (stats): "+outstats+"\n"); + sb.append("#Mer size: "+k+"\n"); + long size=0; + for(AbstractKmerTable x : keySets){size+=x.size();} + sb.append("#Avg step size: "+String.format("%.1f", refKmers/(double)(Tools.max(1, size)))+"\n"); + sb.append("#Cut off: "+maxBadKmers+"\n"); + sb.append("#Mask middle: "+maskMiddle+"\n"); + sb.append("#Quality trim: "+((qtrimLeft || qtrimRight) ? 
trimq : "false")+"\n"); + sb.append("\n"); + + sb.append("##REFERECE STAT##\n"); + sb.append("#Total Reads: "+refReads+"\n"); + sb.append("#Total Bases: "+refBases+"\n"); + sb.append("#Total kmers: "+refKmers+"\n"); + sb.append("#Total stored kmers: "+size+"\n"); + sb.append("\n"); + + sb.append("## ELAPSED TIME##\n"); + sb.append("# Time: "+String.format("%.2f", time/1000000000.0)+" seconds\n"); + sb.append("\n"); + + sb.append("##QUERY FILE STAT##\n"); + sb.append("# Total number of reads: "+readsIn+"\n"); + sb.append("# Total number of matched reads: "+readsKFiltered+"\n"); + sb.append("# Match ratio: "+String.format("%.6f", readsKFiltered*1.0/readsIn)+"\n"); + sb.append("\n"); + + sb.append("##P-VALUE##\n"); + sb.append("#Avg number of Kmer for each read: "+((basesIn/(Tools.max(readsIn, 1)))-k)+"\n"); +// sb.append("# P value for the given threshold 1 is 4.05231e-14\n"); //duk prints a P value; not sure what it means + sb.append("\n"); + + sb.append("## Histogram of kmer occurance for reads with at least one occurance ##\n"); + sb.append("#NumOcc\tNumReads\tPercentage\n"); + + long sum=0; + for(int i=0; i0){ + sb.append(i).append('\t').append(x).append('\t').append(String.format("%.4f",(x*mult))).append('\n'); + } + } + + return sb.toString(); + } + + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + + /** + * Fills tables with kmers from references, using multiple LoadThread. + * @return Number of kmers stored. + */ + private long fillSet_MT(){ + Timer t=new Timer(); + t.start(); + if((ref==null || ref.length<1) && (literal==null || literal.length<1)){return 0;} + long added=0; + + /* Create load threads */ + LoadThread[] loaders=new LoadThread[WAYS]; + for(int i=0; i ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + /* Iterate through read lists from the input stream */ + while(reads!=null && reads.size()>0){ + { + /* Assign a unique ID number to each scaffold */ + ArrayList reads2=new ArrayList(reads); + for(Read r : reads2){ + final Integer id=scaffoldNames.size(); + scaffoldNames.add(r.id==null ? id.toString() : r.id); + r.obj=id; + if(r.mate!=null){r.mate.obj=id;} + } + + /* Send a pointer to the read list to each LoadThread */ + for(LoadThread lt : loaders){ + boolean b=true; + while(b){ + try { + lt.queue.put(reads2); + b=false; + } catch (InterruptedException e) { + //TODO: This will hang due to still-running threads. + throw new RuntimeException(e); + } + } + } + } + + /* Dispose of the old list and fetch a new one */ + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + /* Cleanup */ + cris.returnList(ln, ln.list.isEmpty()); + errorState|=ReadWrite.closeStream(cris); + } + } + + /* If there are literal sequences to use as references */ + if(literal!=null){ + ArrayList list=new ArrayList(literal.length); + if(verbose){System.err.println("Adding literals "+Arrays.toString(literal));} + + /* Assign a unique ID number to each literal sequence */ + for(int i=0; i0){ + long skipped=0; + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(skipped0){ + skipped+=reads.size(); + if(rosb!=null){ + rosb.add(new ArrayList(1), ln.id); + } + + if(ros!=null){ + ros.add(new ArrayList(1), ln.id); + } + + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + if(reads==null || reads.isEmpty()){ + ReadWrite.closeStreams(cris, ros, rosb); + System.err.println("Skipped all of the reads."); + System.exit(0); + } + } + + /* Create ProcessThreads */ + ArrayList alpt=new ArrayList(THREADS); + for(int i=0; i fetch(){ + ArrayList list=null; + while(list==null){ + try { + list=queue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + return list; + } + + @Override + public void run(){ + ArrayList reads=fetch(); + while(reads!=POISON){ + for(Read r : reads){ + assert(r.pairnum()==0); + final Read r2=r.mate; + + final int rblen=(r==null || r.bases==null ? 0 : r.bases.length); + final int rblen2=(r2==null || r2.bases==null ? 0 : r2.bases.length); + + added+=addToMap(r, rblen>10000000 ? k : rblen>1000000 ? 11 : rblen>100000 ? 2 : 0); + if(r.mate!=null){ + added+=addToMap(r.mate, rblen2>10000000 ? k : rblen2>1000000 ? 11 : rblen2>100000 ? 
2 : 0); + } + } + reads=fetch(); + } + + if(map.canRebalance() && map.size()>2L*map.arrayLength()){ + map.rebalance(); + } + } + + /** + * @param r The current read to process + * @param skip Number of bases to skip between kmers + * @return Number of kmers stored + */ + private long addToMap(Read r, int skip){ + skip=Tools.max(minSkip, Tools.min(maxSkip, skip)); + final byte[] bases=r.bases; + final long shift=2*k; + final long shift2=shift-2; + final long mask=~((-1L)<1){ //Process while skipping some kmers + for(int i=0; i>>2)|(x2<=mink; i--){ + kmer=kmer&rightMasks[i]; + rkmer=rkmer>>>2; + long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i]); + added+=x; + if(verbose){ + if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){ + System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); + System.err.println("i="+i+"; tnum="+tnum+"; Looking for left-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)); + final long value=toValue(kmer, rkmer, kMasks[i]); + if(map.contains(value)){System.err.println("Found "+value);} + } + } + } + return added; + } + + + /** + * Adds short kmers on the right end of the read. 
+ * @param kmer Forward kmer + * @param rkmer Reverse kmer + * @param id Scaffold number + * @return Number of kmers stored + */ + private long addToMapRightShift(long kmer, long rkmer, final int id){ + if(verbose){System.err.println("addToMapRightShift");} + long added=0; + for(int i=k-1; i>=mink; i--){ + long extraBase=kmer&3L; + kmer=kmer>>>2; + rkmer=rkmer&rightMasks[i]; +// assert(Long.numberOfLeadingZeros(kmer)>=2*(32-i)) : Long.numberOfLeadingZeros(kmer)+", "+i+", "+kmer+", "+kMasks[i]; +// assert(Long.numberOfLeadingZeros(rkmer)>=2*(32-i)) : Long.numberOfLeadingZeros(rkmer)+", "+i+", "+rkmer+", "+kMasks[i]; + long x=addToMap(kmer, rkmer, i, extraBase, id, kMasks[i]); + added+=x; + if(verbose){ + if((toValue(kmer, rkmer, kMasks[i]))%WAYS==tnum){ + System.err.println("added="+x+"; i="+i+"; tnum="+tnum+"; Added right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)+"; value="+(toValue(kmer, rkmer, kMasks[i]))+"; kmer="+kmer+"; rkmer="+rkmer+"; kmask="+kMasks[i]+"; rightMasks[i+1]="+rightMasks[i+1]); + System.err.println("i="+i+"; tnum="+tnum+"; Looking for right-shift kmer "+AminoAcid.kmerToString(kmer&~kMasks[i], i)); + final long value=toValue(kmer, rkmer, kMasks[i]); + if(map.contains(value)){System.err.println("Found "+value);} + } + } + } + return added; + } + + + /** + * Adds this kmer to the table, including any mutations implied by editDistance or hammingDistance. 
+ * @param kmer Forward kmer + * @param rkmer Reverse kmer + * @param extraBase Base added to end in case of deletions + * @param id Scaffold number + * @param kmask0 + * @return Number of kmers stored + */ + private long addToMap(final long kmer, final long rkmer, final int len, final long extraBase, final int id, final long kmask0){ + + assert(kmask0==kMasks[len]) : kmask0+", "+len+", "+kMasks[len]+", "+Long.numberOfTrailingZeros(kmask0)+", "+Long.numberOfTrailingZeros(kMasks[len]); + + if(verbose){System.err.println("addToMap_A; len="+len+"; kMasks[len]="+kMasks[len]);} + assert((kmer&kmask0)==0); + final long added; + if(hammingDistance==0){ + final long key=toValue(kmer, rkmer, kmask0); + if(key%WAYS!=tnum){return 0;} + if(verbose){System.err.println("addToMap_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);} + added=map.setIfNotPresent(key, id); + }else if(editDistance>0){ +// long extraBase=(i>=bases.length-1 ? -1 : AminoAcid.baseToNumber[bases[i+1]]); + added=mutate(kmer, rkmer, len, id, editDistance, extraBase); + }else{ + added=mutate(kmer, rkmer, len, id, hammingDistance, -1); + } + if(verbose){System.err.println("addToMap added "+added+" keys.");} + return added; + } + + /** + * Mutate and store this kmer through 'dist' recursions. 
+ * @param kmer Forward kmer + * @param rkmer Reverse kmer + * @param id Scaffold number + * @param dist Number of mutations + * @param extraBase Base added to end in case of deletions + * @return Number of kmers stored + */ + private long mutate(final long kmer, final long rkmer, final int len, final int id, final int dist, final long extraBase){ + long added=0; + + final long key=toValue(kmer, rkmer, kMasks[len]); + + if(verbose){System.err.println("mutate_A; len="+len+"; kmer="+kmer+"; rkmer="+rkmer+"; kMasks[len]="+kMasks[len]);} + if(key%WAYS==tnum){ + if(verbose){System.err.println("mutate_B: "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+" = "+key);} + int x=map.setIfNotPresent(key, id); + if(verbose){System.err.println("mutate_B added "+x+" keys.");} + added+=x; + assert(map.contains(key)); + } + + if(dist>0){ + final int dist2=dist-1; + + //Sub + for(int j=0; j<4; j++){ + for(int i=0; i0){ + //Del + if(extraBase>=0 && extraBase<=3){ + for(int i=1; i>2); + for(int j=0; j<4; j++){ + final long temp=temp0|setMasks[j][i-1]; + if(temp!=kmer){ + long rtemp=AminoAcid.reverseComplementBinaryFast(temp, len); + added+=mutate(temp, rtemp, len, id, dist2, eb2); + } + } + } + } + + } + + return added; + } + + /*--------------------------------------------------------------*/ + + /** Number of kmers stored by this thread */ + public long added=0; + /** Number of items encountered by this thread */ + public long refKmersT=0, refReadsT=0, refBasesT=0; + /** Thread number; used to determine which kmers to store */ + public final int tnum; + /** Buffer of input read lists */ + public final ArrayBlockingQueue> queue=new ArrayBlockingQueue>(32); + + /** Destination for storing kmers */ + private final AbstractKmerTable map; + + } + + /*--------------------------------------------------------------*/ + + /** + * Matches read kmers against reference kmers, performs binning and/or trimming, and writes output. 
+ */ + private class ProcessThread extends Thread{ + + /** + * Constructor + * @param cris_ Read input stream + * @param ros_ Unmatched read output stream (optional) + * @param rosb_ Matched read output stream (optional) + */ + public ProcessThread(ConcurrentReadStreamInterface cris_, RTextOutputStream3 ros_, RTextOutputStream3 rosb_){ + cris=cris_; + ros=ros_; + rosb=rosb_; + } + + @Override + public void run(){ + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + ArrayList bad=(rosb==null ? null : new ArrayList(Shared.READ_BUFFER_LENGTH)); + + //While there are more reads lists... + while(reads!=null && reads.size()>0){ + + int removed=0; + + //For each read (or pair) in the list... + for(int i=0; i0){ + if(r1!=null && r1.quality!=null && r1.avgQuality()=0){ + if(r1!=null && r1.countUndefined()>maxNs){r1.setDiscarded(true);} + if(r2!=null && r2.countUndefined()>maxNs){r2.setDiscarded(true);} + } + + if(removePairsIfEitherBad){remove=r1.discarded() || (r2!=null && r2.discarded());} + else{remove=r1.discarded() && (r2==null || r2.discarded());} + + if(remove){ + if(r1!=null){ + basesQFilteredT+=r1.bases.length; + readsQFilteredT++; + } + if(r2!=null){ + basesQFilteredT+=r2.bases.length; + readsQFilteredT++; + } + if(bad!=null){bad.add(r1);} + }else{ + //Process kmers + + if(ktrimLeft || ktrimRight || ktrimN){ + + int rlen1=0, rlen2=0; + int xsum=0; + int rktsum=0; + if(r1!=null){ + int x=ktrim(r1, keySets); + xsum+=x; + rktsum+=(x>0 ? 1 : 0); + rlen1=r1.bases==null ? 0 : r1.bases.length; + if(rlen10 ? 1 : 0); + rlen2=r2.bases==null ? 
0 : r2.bases.length; + if(rlen2maxBadKmers){r1.setDiscarded(true);} + if(r2!=null && b>maxBadKmers){r2.setDiscarded(true);} + + + if((removePairsIfEitherBad && (r1.discarded() || (r2!=null && r2.discarded()))) || + (r1.discarded() && (r2==null || r2.discarded()))){ + remove=true; + if(a>maxBadKmers){ + readsKFilteredT++; + basesKFilteredT+=r1.bases.length; + } + if(b>maxBadKmers){ + readsKFilteredT++; + basesKFilteredT+=r2.bases.length; + } + if(bad!=null){bad.add(r1);} + } +// assert(false) : "a="+a+", b="+b+", r1.discarded()="+r1.discarded()+", removePairsIfEitherBad="+removePairsIfEitherBad+ +// "\nremove="+remove+", readsKFilteredT="+readsKFilteredT; + } + } + + if(!remove){ + //Do quality trimming + + int rlen1=0, rlen2=0; + if(r1!=null){ + if(qtrimLeft || qtrimRight){ + int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1); + basesQTrimmedT+=x; + readsQTrimmedT+=(x>0 ? 1 : 0); + } + rlen1=r1.bases==null ? 0 : r1.bases.length; + if(rlen10 ? 1 : 0); + } + rlen2=r2.bases==null ? 0 : r2.bases.length; + if(rlen20 ? Tools.condenseNew(reads) : reads), ln.id); //Creates a new list if old one became empty, to prevent shutting down the cris. + } + + //Fetch a new read list + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + } + + /*--------------------------------------------------------------*/ + + /** Input read stream */ + private final ConcurrentReadStreamInterface cris; + /** Output read streams */ + private final RTextOutputStream3 ros, rosb; + + private long readsInT=0; + private long basesInT=0; + private long readsOutT=0; + private long basesOutT=0; + + private long readsQTrimmedT=0; + private long basesQTrimmedT=0; + private long readsQFilteredT=0; + private long basesQFilteredT=0; + + private long readsKTrimmedT=0; + private long basesKTrimmedT=0; + private long readsKFilteredT=0; + private long basesKFilteredT=0; + + } + + /*--------------------------------------------------------------*/ + /*---------------- Helper Methods ----------------*/ + /*--------------------------------------------------------------*/ + + + /** + * Trim a read to remove matching kmers and everything to their left or right. + * @param r Read to process + * @param sets Kmer tables + * @return Number of bases trimmed + */ + private final int ktrim(final Read r, final AbstractKmerTable[] sets){ + assert(ktrimLeft || ktrimRight || ktrimN); + if(r==null || r.bases==null){return 0;} + if(verbose){System.err.println("KTrimming read "+r.id);} + final byte[] bases=r.bases, quals=r.quality; + final int minlen=k-1; + final int minlen2=(maskMiddle ? 
k/2 : k); + final long shift=2*k; + final long shift2=shift-2; + final long mask=~((-1L)<>>2)|(x2<=0 && i>bases.length-k; i--){ + byte b=bases[i]; + long x=Dedupe.baseToNumber[b]; + long x2=Dedupe.baseToComplementNumber[b]; + kmer=kmer|(x<<(2*len)); + rkmer=((rkmer<<2)|x2)&mask; + len++; + if(verbose){System.err.println("Scanning5 i="+i+", kmer="+kmer+", rkmer="+rkmer+", bases="+new String(bases, Tools.max(0, i-k2), Tools.min(i+1, k)));} + if(len>=mink){ + + final long value=toValue(kmer, rkmer, kMasks[len]); + if(verbose){System.err.println("Looking for right kmer "+AminoAcid.kmerToString(kmer&~kMasks[len], len)+"; value="+value+"; kmask="+kMasks[len]);} + AbstractKmerTable set=sets[(int)(value%WAYS)]; + if(set.contains(value)){ + if(verbose){System.err.println("Found "+value);} + minLoc=i; + minLocExclusive=Tools.min(minLocExclusive, bases.length); + maxLoc=bases.length-1; + maxLocExclusive=Tools.max(maxLocExclusive, i-1); + found++; + } + } + } + } + } + + + if(verbose){System.err.println("found="+found+", minLoc="+minLoc+", maxLoc="+maxLoc+", minLocExclusive="+minLocExclusive+", maxLocExclusive="+maxLocExclusive);} + + + if(found==0){return 0;} + + if(ktrimN){ //Replace kmer hit zone with the trim symbol + Arrays.fill(bases, minLoc, maxLoc+1, trimSymbol); + if(quals!=null){Arrays.fill(quals, minLoc, maxLoc+1, (byte)0);} + return maxLoc-minLoc+1; + } + + if(ktrimLeft){ //Trim from the read start to the rightmost kmer base + if(verbose){System.err.println("Left trimming to "+(ktrimExclusive ? maxLocExclusive+1 : maxLoc+1)+", "+0);} + int x=TrimRead.trimToPosition(r, ktrimExclusive ? maxLocExclusive+1 : maxLoc+1, 0, 1); + if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));} + return x; + }else{ //Trim from the leftmost kmer base to the read stop + assert(ktrimRight); + if(verbose){System.err.println("Right trimming to "+0+", "+(ktrimExclusive ? minLocExclusive-1 : minLoc-1));} + int x=TrimRead.trimToPosition(r, 0, ktrimExclusive ? 
minLocExclusive-1 : minLoc-1, 1); + if(verbose){System.err.println("Trimmed "+x+" bases: "+new String(r.bases));} + return x; + } + } + + /** + * Counts the number of kmer hits for a read. + * @param r Read to process + * @param sets Kmer tables + * @return Number of hits + */ + private final int countSetKmers(final Read r, final AbstractKmerTable sets[]){ + if(r==null || r.bases==null){return 0;} + final byte[] bases=r.bases; + final int minlen=k-1; + final int minlen2=(maskMiddle ? k/2 : k); + final long shift=2*k; + final long shift2=shift-2; + final long mask=~((-1L)<>>2)|(x2<0){ + if(verbose){System.err.println("Found = "+(found+1)+"/"+maxBadKmers);} + if(found==maxBadKmers){ + int count=scaffoldCounts.addAndGet(id, 1); + if(count<0){scaffoldCounts.set(id, Integer.MAX_VALUE);} + if(hitCounts==null){ + return (found=found+1); + }//Early exit, but prevents generation of histogram that goes over maxBadKmers+1. + } + found++; + +// if(found>maxBadKmers){ +// Integer id=set.get(key); +// int count=scaffoldCounts.addAndGet(id, 1); +// if(count<0){scaffoldCounts.set(id, Integer.MAX_VALUE);} +// break; +// } + } + } + } + + if(hitCounts!=null){hitCounts.incrementAndGet(Tools.min(found, hitCounts.length()-1));} + return found; + } + + /** Estimates kmer hit counts for kmers longer than k using consecutive matches + * @param r + * @param sets + * @return Number of sets of consecutive hits of exactly length kbig + */ + private final int countSetKmersBig(final Read r, final AbstractKmerTable sets[]){ + assert(kbig>k); + final int sub=kbig-k-1; + assert(sub>=0) : kbig+", "+sub; + if(r==null || r.bases==null){return 0;} + final byte[] bases=r.bases; + final int minlen=k-1; + final int minlen2=(maskMiddle ? 
k/2 : k); + final long shift=2*k; + final long shift2=shift-2; + final long mask=~((-1L)<>>2)|(x2< scaffoldNames=new ArrayList(); + /** scaffoldCounts[id] stores the number of reads with kmer matches to that scaffold */ + private AtomicIntegerArray scaffoldCounts; + /** hitCounts[x] stores the number of reads with exactly x kmer matches */ + private final AtomicLongArray hitCounts; + /** Array of reference files from which to load kmers */ + private String[] ref=null; + /** Array of literal strings from which to load kmers */ + private String[] literal=null; + + /** Input reads */ + private String in1=null, in2=null; + /** Output reads (unmatched and at least minlen) */ + private String out1=null, out2=null; + /** Output reads (matched or shorter than minlen) */ + private String outb1=null, outb2=null; + /** Statistics output files */ + private String outstats=null, outduk=null, outrqc=null; + + /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */ + private long maxReads=-1; + /** Output reads in input order. May reduce speed. */ + private boolean ORDERED=false; + /** Attempt to match kmers shorter than normal k on read ends when doing kTrimming. */ + private boolean useShortKmers=false; + /** Make the middle base in a kmer a wildcard to improve sensitivity */ + private boolean maskMiddle=true; + + /** Store reference kmers with up to this many substitutions */ + private int hammingDistance=0; + /** Store reference kmers with up to this many edits (including indels) */ + private int editDistance=0; + /** Never skip more than this many consecutive kmers when hashing reference. */ + private int maxSkip=99; + /** Always skip at least this many consecutive kmers when hashing reference. + * Note that a skip of 1 means every kmer is used, 2 means every other, etc. 
*/ + private int minSkip=1; + + /*--------------------------------------------------------------*/ + /*---------------- Statistics ----------------*/ + /*--------------------------------------------------------------*/ + + long readsIn=0; + long basesIn=0; + long readsOut=0; + long basesOut=0; + + long readsQTrimmed=0; + long basesQTrimmed=0; + long readsQFiltered=0; + long basesQFiltered=0; + + long readsKTrimmed=0; + long basesKTrimmed=0; + long readsKFiltered=0; + long basesKFiltered=0; + + long refReads=0; + long refBases=0; + long refKmers=0; + + /*--------------------------------------------------------------*/ + /*---------------- Final Primitives ----------------*/ + /*--------------------------------------------------------------*/ + + /** Look for reverse-complements as well as forward kmers. Default: true */ + private final boolean rcomp; + /** Don't allow a read 'N' to match a reference 'A'. + * Reduces sensitivity when hdist>0 or edist>0. Default: false. */ + private final boolean forbidNs; + /** AND bitmask with 0's at the middle base */ + private final long middleMask; + /** Use HashForest data structure */ + private final boolean useForest; + /** Use KmerTable data structure */ + private final boolean useTable; + /** Use HashArray data structure (default) */ + private final boolean useArray; + + /** Normal kmer length */ + private final int k; + /** k-1; used in some expressions */ + private final int k2; + /** Emulated kmer greater than k */ + private final int kbig; + /** Shortest kmer to use for trimming */ + private final int mink; +// /** OR bitmask for full-length kmers, with a single 1 bit immediately to the left of the kmer */ +// private final long kmask; //TODO: Eliminate this and just use kmasks array + /** A read may contain up to this many kmers before being considered a match. 
Default: 0 */ + private final int maxBadKmers; + + /** Quality-trim the left side */ + private final boolean qtrimLeft; + /** Quality-trim the right side */ + private final boolean qtrimRight; + /** Trim bases at this quality or below. Default: 4 */ + private final byte trimq; + /** Throw away reads below this average quality before trimming. Default: 0 */ + private final byte minAvgQuality; + /** Throw away reads containing more than this many Ns. Default: -1 (disabled) */ + private final int maxNs; + /** Throw away reads shorter than this after trimming. Default: 20 */ + private final int minReadLength; + /** Toss reads shorter than this fraction of initital length, after trimming */ + private final float minLenFraction; + /** Trim matching kmers and all bases to the left */ + private final boolean ktrimLeft; + /** Trim matching kmers and all bases to the right */ + private final boolean ktrimRight; + /** Don't trim, but replace matching kmers with a symbol (default N) */ + private final boolean ktrimN; + /** Exclude kmer itself when ktrimming */ + private final boolean ktrimExclusive; + /** Replace bases covered by matched kmers with this symbol */ + private final byte trimSymbol; + /** Output over-trimmed reads to outbad (outmatch). If false, they are discarded. */ + private final boolean addTrimmedToBad; + + /** True iff java was launched with the -ea' flag */ + private final boolean EA; + /** Skip this many initial input reads */ + private final long skipreads; + + /** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad. + * Default: true. 
*/ + private final boolean removePairsIfEitherBad; + + /*--------------------------------------------------------------*/ + /*---------------- Static Fields ----------------*/ + /*--------------------------------------------------------------*/ + + public static int VERSION=8; + + /** Number of tables (and threads, during loading) */ + private static final int WAYS=5; //123 + /** Verbose messages */ + public static final boolean verbose=false; //123 + + /** Print messages to this stream */ + private static PrintStream outstream=System.err; + /** Permission to overwrite existing files */ + public static boolean overwrite=false; + /** Print speed statistics upon completion */ + public static boolean showSpeed=true; + /** Display progress messages such as memory usage */ + public static boolean DISPLAY_PROGRESS=true; + /** Number of ProcessThreads */ + public static int THREADS=Shared.THREADS; + /** Indicates end of input stream */ + private static final ArrayList POISON=new ArrayList(0); + /** Do garbage collection prior to printing memory usage */ + private static final boolean GC_BEFORE_PRINT_MEMORY=false; + + /** x&clearMasks[i] will clear base i */ + private static final long[] clearMasks; + /** x|setMasks[i][j] will set base i to j */ + private static final long[][] setMasks; + /** x&leftMasks[i] will clear all bases to the right of i (exclusive) */ + private static final long[] leftMasks; + /** x&rightMasks[i] will clear all bases to the left of i (inclusive) */ + private static final long[] rightMasks; + /** x|kMasks[i] will set the bit to the left of the leftmost base */ + private static final long[] kMasks; + + public static HashMap RQC_MAP=null; + + /*--------------------------------------------------------------*/ + /*---------------- Static Initializers ----------------*/ + /*--------------------------------------------------------------*/ + + static{ + clearMasks=new long[32]; + leftMasks=new long[32]; + rightMasks=new long[32]; + kMasks=new long[32]; + 
setMasks=new long[4][32]; + for(int i=0; i<32; i++){ + clearMasks[i]=~(3L<<(2*i)); + } + for(int i=0; i<32; i++){ + leftMasks[i]=((-1L)<<(2*i)); + } + for(int i=0; i<32; i++){ + rightMasks[i]=~((-1L)<<(2*i)); + } + for(int i=0; i<32; i++){ + kMasks[i]=((1L)<<(2*i)); + } + for(int i=0; i<32; i++){ + for(long j=0; j<4; j++){ + setMasks[(int)j][i]=(j<<(2*i)); + } + } + } + +} diff --git a/current/jgi/BBMask.java b/current/jgi/BBMask.java new file mode 100755 index 0000000..9050dc2 --- /dev/null +++ b/current/jgi/BBMask.java @@ -0,0 +1,657 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.LinkedHashMap; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; +import stream.SamLine; +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import dna.AminoAcid; +import dna.Data; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ByteFile1; +import fileIO.ByteFile2; +import fileIO.FileFormat; +import fileIO.ReadWrite; + +/** + * Masks a fasta file by inserting 'N' in place of low-complexity short repeats, + * and anything covered by mapped reads in a sam file. 
+ * + * @author Brian Bushnell + * @date Feb 18, 2014 + * + */ +public class BBMask{ + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + BBMask masker=new BBMask(args); + masker.process(t); + } + + public BBMask(String[] args){ + + if(args==null || args.length==0){ + printOptions(); + System.exit(0); + } + + for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} + outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + FastaReadInputStream.SPLIT_READS=false; + stream.FastaReadInputStream.MIN_READ_LEN=1; + Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH); + Shared.READ_BUFFER_NUM_BUFFERS=Tools.min(8, Shared.READ_BUFFER_NUM_BUFFERS); + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=16; + ReadWrite.ZIP_THREAD_DIVISOR=1; + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + + for(int i=0; i1 ? split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + ByteFile2.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + ConcurrentGenericReadInputStream.verbose=verbose; + // align2.FastaReadInputStream2.verbose=verbose; + stream.FastqReadInputStream.verbose=verbose; + ReadWrite.verbose=verbose; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + 
}else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("t") || a.equals("threads")){ + Shared.THREADS=Tools.max(Integer.parseInt(b), 1); + }else if(a.equals("build") || a.equals("genome")){ + Data.setGenome(Integer.parseInt(b)); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1") || a.equals("ref")){ + inRef=b; + }else if(a.equals("insam") || a.equals("samin") || a.equals("sam")){ + inSam=b.split(","); + }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1") || a.equals("output1")){ + outRef=b; + }else if(a.equals("qfin") || a.equals("qfin1")){ + qfinRef=b; + }else if(a.equals("qfout") || a.equals("qfout1")){ + qfoutRef=b; + }else if(a.equals("extin")){ + extinRef=b; + }else if(a.equals("extout")){ + extoutRef=b; + }else if(a.equals("mink") || a.equals("kmin")){ + mink=Integer.parseInt(b); + }else if(a.equals("maxk") || a.equals("kmax")){ + maxk=Integer.parseInt(b); + }else if(a.equals("k")){ + mink=maxk=Integer.parseInt(b); + }else if(a.equals("minlen")){ + minlen=Integer.parseInt(b); + }else if(a.equals("mincount")){ + mincount=Integer.parseInt(b); + }else if(a.equals("trd") || a.equals("trc") || a.equals("trimreaddescription")){ + Shared.TRIM_READ_COMMENTS=Tools.parseBoolean(b); + }else if(a.equals("parsecustom")){ + parsecustom=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); 
+ }else if(a.equals("fastareadlen") || a.equals("fastareadlength")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.equals("fastaminread") || a.equals("fastaminlen") || a.equals("fastaminlength")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("tossbrokenreads") || a.equals("tbr")){ + boolean x=Tools.parseBoolean(b); + Read.NULLIFY_BROKEN_QUALITY=x; + ConcurrentGenericReadInputStream.REMOVE_DISCARDED_READS=x; + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){ + int x=Integer.parseInt(b); + 
stream.FastaReadInputStream.MIN_READ_LEN=(x>0 ? x : Integer.MAX_VALUE); + }else if(inRef==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + inRef=arg; + }else if(outRef==null && i==1 && !arg.contains("=")){ + outRef=arg; + }else{ + System.err.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); + + if(inRef==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.THREADS>2){ + // if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;} + ByteFile.FORCE_MODE_BF2=true; + } + + if(outRef==null){ + outRef="stdout"; + } + + if(outRef!=null && outRef.equalsIgnoreCase("null")){outRef=null;} + + if(!Tools.testOutputFiles(overwrite, false, outRef)){ + throw new RuntimeException("\n\nOVERWRITE="+overwrite+"; Can't write to output file "+outRef+"\n"); + } + + FASTQ.PARSE_CUSTOM=parsecustom; + + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + ffoutRef=FileFormat.testOutput(outRef, FileFormat.FASTA, extoutRef, true, overwrite, false); + + ffinRef=FileFormat.testInput(inRef, FileFormat.FASTA, extinRef, true, true); + + if(inSam!=null && inSam.length>0){ + ffinSam=new FileFormat[inSam.length]; + for(int i=0; i0){ + t.start(); + repeats=maskRepeats(); + t.stop(); + + double rpnano=refReads/(double)(t.elapsed); + double bpnano=refBases/(double)(t.elapsed); + + String rpstring=""+refReads; + String bpstring=""+refBases; + String bmstring=""+repeats; + + while(rpstring.length()<12){rpstring=" "+rpstring;} + 
while(bpstring.length()<12){bpstring=" "+bpstring;} + while(bmstring.length()<12){bmstring=" "+bmstring;} + + outstream.println("Time: \t"+t); + outstream.println("Ref Scaffolds: "+rpstring+" \t"+String.format("%.2fk scafs/sec", rpnano*1000000)); + outstream.println("Ref Bases: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + outstream.println("Repeat Bases Masked: "+bmstring); + } + + if(ffinSam!=null){ + t.start(); + mapping=maskSam(); + t.stop(); + + double rpnano=samReads/(double)(t.elapsed); + double bpnano=samBases/(double)(t.elapsed); + + String rpstring=""+samReads; + String bpstring=""+samBases; + String bmstring=""+mapping; + + while(rpstring.length()<12){rpstring=" "+rpstring;} + while(bpstring.length()<12){bpstring=" "+bpstring;} + while(bmstring.length()<12){bmstring=" "+bmstring;} + + outstream.println("Time: \t"+t); + outstream.println("Sam Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000)); + outstream.println("Sam Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + outstream.println("Sam Bases Masked: "+bmstring); + } + long total=repeats+mapping, masked=0; + + if(total>0 || true){ + masked=maskFromBitsets(); + } + + assert(total==masked) : repeats+", "+mapping+", "+total+", "+masked; + + { + writeOutput(); + t0.stop(); + String tstring=""+total; + while(tstring.length()<12){tstring=" "+tstring;} + outstream.println("\nTotal Bases Masked: "+tstring+"/"+refBases); + outstream.println("Total Time: \t"+t); +// outstream.println("Total Time: "+t0); + } + + + + if(errorState){ + throw new RuntimeException("\nBBMask terminated in an error state; the output may be corrupt."); + } + } + + private long maskFromBitsets(){ + long sum=0; + for(Read r : map.values()){ + BitSet bs=((BitSet)r.obj); + byte[] bases=r.bases; + for(int i=0; i list=new ArrayList(1); + list.add(r); + ros.add(list, i); + i++; + } + errorState|=ReadWrite.closeStream(ros); + } + + private long 
maskRepeats(){ + long sum=0; + for(Read r : map.values()){ + sum+=maskRepeats(r, mink, maxk, mincount, minlen); + } + return sum; + } + + private long maskSam(){ + long before=0, after=0; + for(Read r : map.values()){ + before+=((BitSet)r.obj).cardinality(); + } + for(FileFormat ff : ffinSam){ + maskSam(ff); + } + for(Read r : map.values()){ + after+=((BitSet)r.obj).cardinality(); + } + return after-before; + } + + private void maskSam(FileFormat ff){ + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, colorspace, false, ff, null, null, null); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + + { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + + for(int idx=0; idx hashRef(){ + + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, colorspace, false, ffinRef, null, qfinRef, null); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + + final LinkedHashMap hmr=new LinkedHashMap(); + + { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + + for(int idx=0; idx15 ? 
-1 : ~((-1)<<(2*k))); + for(int loc=0; loc=minlen){ + int a=loc-k, b=loc-k+len; + bs.set(a, b); +// System.err.println("len="+len+", minlen="+minlen+", set "+(loc-k)+"-"+(loc-k+len)); + loc=Tools.max(loc, b-minlen); +// System.err.println("Reset loc to "+loc); + }else{ +// System.err.println("len="+len+" < minlen="+minlen); + } + } + + } + + + private static int repeatLength(final byte[] bases, final int k, final int mask, final int loc){ + + final int lim=bases.length; + final int key=getInitialKey(bases, loc, k); + if(key<0){return 0;} + int kmer=key; + int gap=0, last=-1; + for(int i=loc; i jgi.BBMask ref= sam= out="); + outstream.println("sam and out are optional.\n"); + outstream.println("Other parameters and their defaults:\n"); + outstream.println("overwrite=false \tOverwrites files that already exist"); + outstream.println("ziplevel=2 \tSet compression level, 1 (low) to 9 (max)"); + outstream.println("fastawrap=100 \tLength of lines in fasta output"); + outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto"); + outstream.println("qout=auto \tASCII offset for output quality. 
May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)"); + } + + + /*--------------------------------------------------------------*/ + + private LinkedHashMap map=null; + + private long refReads=0; + private long refBases=0; +// private long repeatsMasked=0; + + private long samReads=0; + private long samBases=0; +// private long samMasked=0; + + public boolean errorState=false; + + private String inRef=null; + private String inSam[]=null; + + private String qfinRef=null; + + private String outRef=null; + + private String qfoutRef=null; + + private String extinRef=null; + private String extoutRef=null; + + private boolean parsecustom=false; + private boolean overwrite=false; + private boolean colorspace=false; + + private long maxReads=-1; + + private byte qin=-1; + private byte qout=-1; + + private int mink=0; + private int maxk=0; + private int minlen=30; + private int mincount=3; + + private final FileFormat ffinRef; + private final FileFormat[] ffinSam; + + private final FileFormat ffoutRef; + + + /*--------------------------------------------------------------*/ + + private PrintStream outstream=System.err; + public static boolean verbose=false; + public static boolean CONVERT_NON_ACGTN=true; + +} diff --git a/current/jgi/BBQC.java b/current/jgi/BBQC.java new file mode 100755 index 0000000..c6c79eb --- /dev/null +++ b/current/jgi/BBQC.java @@ -0,0 +1,909 @@ +package jgi; + +import java.io.File; +import java.text.SimpleDateFormat; +import java.util.ArrayList; +import java.util.Date; +import java.util.TimeZone; + +import stream.FASTQ; +import stream.Read; + +import dna.Data; + +import align2.BBMap; +import align2.Shared; +import align2.Tools; +import align2.TrimRead; +import fileIO.ByteFile1; +import fileIO.ReadWrite; +import fileIO.TextStreamWriter; + +/** + * Wrapper for BBDukF to implement Rolling QC's filter stage. 
+ * @author Brian Bushnell + * @date Nov 26, 2013 + * + */ +public class BBQC { + + + /*--------------------------------------------------------------*/ + /*---------------- Initialization Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Program entrance from command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + + ReadWrite.USE_PIGZ=true; + ReadWrite.USE_UNPIGZ=true; + + //Create a filter instance + BBQC filter=new BBQC(args); + + ///...and execute it. + filter.process(); + } + + /** + * Constructor. + * @param args Command line arguments + */ + BBQC(String[] args){ + + //Optional default parameters to match current pipeline +// arglist.add("k=22"); +// arglist.add("maxbadkmers=2"); + + //Symbols to insert in output filename to denote operations performed; may be overriden from command line + String symbols_=null;//"filtered" + + //Parse argument list + for(int i=0; i1 ? split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null") || a.equals(in2)){ + // do nothing + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + primaryArgList.add(arg); + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){ + out1=b; + }else if(a.equals("out2") || a.equals("output2")){ + out2=b; + }else if(a.equals("qfin") || a.equals("qfin1")){ + qfin1=b; + }else if(a.equals("qfout") || a.equals("qfout1")){ + qfout1=b; + }else if(a.equals("qfin2")){ + qfin2=b; + }else if(a.equals("qfout2")){ + qfout2=b; + }else 
if(a.equals("ref")){ + if(b!=null){ + if(!b.contains(",") || new File(b).exists()){ + filterrefs.add(b); + }else{ + String[] split2=b.split(","); + for(String s2 : split2){ + filterrefs.add(s2); + } + } + } + }else if(a.equals("artifactdb")){ + mainArtifactFile=b; + }else if(a.equals("rnadb")){ + artifactFileRna=b; + }else if(a.equals("dnadb")){ + artifactFileDna=b; + }else if(a.equals("phixref")){ + phixRef=b; + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){ + minLen=Integer.parseInt(b); + }else if(a.equals("mlf") || a.equals("minlenfrac") || a.equals("minlenfraction") || a.equals("minlengthfraction")){ + minLenFraction=Float.parseFloat(b); + }else if(a.equals("path") || a.equals("outdir")){ + outDir=b; + }else if(a.equals("symbols")){ + symbols_=b; + }else if(a.equals("overallstats") || a.equals("stats")){ + rqcStatsName=b; + }else if(a.equals("scafstats")){ + scaffoldStatsName=b; + }else if(a.equals("kmerstats")){ + kmerStatsName=b; + }else if(a.equals("log")){ + logName=b; + }else if(a.equals("filelist")){ + fileListName=b; + }else if(a.equals("compress")){ + compress=Tools.parseBoolean(b); + }else if(a.equals("rna")){ + rnaFlag=Tools.parseBoolean(b); + }else if(a.equals("phix")){ + phixFlag=Tools.parseBoolean(b); + }else if(a.equals("ktrim")){ + ktrim=b; + }else if(a.equals("mink")){ + mink=Integer.parseInt(b); + }else if(a.equals("k")){ + k=Integer.parseInt(b); + }else if(a.equals("maq")){ + maq=Byte.parseByte(b); + }else if(a.equals("trimq")){ + trimq=Byte.parseByte(b); + }else if(a.equals("normalize") || a.equals("norm")){ + normalize=Tools.parseBoolean(b); + }else if(a.equals("ecc")){ + ecc=Tools.parseBoolean(b); + }else if(a.equals("aec")){ + aec=Tools.parseBoolean(b); + }else if(a.equals("markerrorsonly") || a.equals("meo")){ + meo=Tools.parseBoolean(b); + }else if(a.equals("tam")){ + tam=Tools.parseBoolean(b); + }else if(a.equals("taf")){ + 
trimAfterFiltering=Tools.parseBoolean(b); + }else if(a.equals("mue")){ + mue=Tools.parseBoolean(b); + }else if(a.equals("mw1")){ + mw1=Tools.parseBoolean(b); + }else if(a.equals("max") || a.equals("maxdepth")){ + maxdepth=Integer.parseInt(b); + }else if(a.equals("min") || a.equals("mindepth")){ + mindepth=Integer.parseInt(b); + }else if(a.equals("target") || a.equals("targetdepth")){ + target=Integer.parseInt(b); + }else if(a.equals("prehashes")){ + prehashes=Integer.parseInt(b); + }else if(a.equals("hashes")){ + hashes=Integer.parseInt(b); + }else if(a.equals("bits")){ + bits=Integer.parseInt(b); + }else if(a.equals("qtrim")){ + if(b==null){qtrim="rl";} + else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrim="l";} + else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrim="r";} + else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrim="lr";} + else if(Character.isDigit(b.charAt(0))){ + trimq=Byte.parseByte(b); + qtrim=(trimq>=0 ? "lr" : "f"); + }else{qtrim=""+Tools.parseBoolean(b);} + }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){ + if(b!=null && (b.charAt(0)=='.' 
|| Character.isDigit(b.charAt(0)))){ + TrimRead.optimalMode=true; + TrimRead.optimalBias=Float.parseFloat(b); + assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("maxns")){ + maxNs=Integer.parseInt(b); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + System.err.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + in1=arg; + if(arg.indexOf('#')>-1 && !new File(arg).exists()){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else{ + //Uncaptured arguments are passed to BBDuk + primaryArgList.add(arg); + } + } + + //Set final field 'symbols' + symbols=(symbols_==null ? 
abbreviation() : symbols_); + + //Pass overwrite flag to BBDuk + primaryArgList.add("ow="+overwrite); + + if(outDir!=null){ + outDir=outDir.trim().replace('\\', '/'); + if(outDir.length()>0 && !outDir.endsWith("/")){outDir=outDir+"/";} + }else{outDir="";} + + {//Prepend output directory to output files + if(logName!=null){logName=outDir+logName+".tmp";} //Add '.tmp' to log file + if(fileListName!=null){fileListName=outDir+fileListName;} + } + + {//Create unique output file names for second pass + if(rqcStatsName!=null){ + rqcStatsName_kt=outDir+"ktrim_"+rqcStatsName; + rqcStatsName=outDir+rqcStatsName; + } + if(kmerStatsName!=null){ + kmerStatsName_kt=outDir+"ktrim_"+kmerStatsName; + kmerStatsName=outDir+kmerStatsName; + } + if(scaffoldStatsName!=null){ + scaffoldStatsName_kt=outDir+"ktrim_"+scaffoldStatsName; + scaffoldStatsName=outDir+scaffoldStatsName; + } + } + + //Create output filename from input filename if no output filename is specified + if(out1==null && in1!=null){ + File f=new File(in1); + String name=f.getName(); + String raw=ReadWrite.rawName(name); + int x=raw.lastIndexOf('.'); + if(x>-1){ + out1=raw.substring(0, x)+"."+symbols+raw.substring(x)+(compress ? ".gz" : ""); + }else{ + out1=raw+"."+symbols+".fastq"+(compress ? ".gz" : ""); + } + } + + tempSalt=KmerNormalize.getSalt(out1, 0); + } + + + /*--------------------------------------------------------------*/ + /*---------------- Processing Methods ----------------*/ + /*--------------------------------------------------------------*/ + + + /** + * Primary method to fully execute the program. 
+ */ + public void process(){ + + //Create output directory + if(outDir!=null && outDir.length()>0){ + File f=new File(outDir); + if(!f.exists()){ + f.mkdirs(); + } + } + + //Create log file + if(logName!=null){ + boolean b=Tools.canWrite(logName, overwrite); + assert(b) : "Can't write to "+logName; + log("start", false); + } + + //Create file list file + if(fileListName!=null){ + boolean b=Tools.canWrite(fileListName, overwrite); + assert(b) : "Can't write to "+fileListName; + + StringBuilder sb=new StringBuilder(); + if(out1!=null){sb.append("filtered_fastq="+out1).append('\n');} + if(qfout1!=null){sb.append("filtered_qual="+qfout1).append('\n');} + if(out2!=null){sb.append("filtered_fastq_2="+out2).append('\n');} + if(qfout2!=null){sb.append("filtered_qual_2="+qfout2).append('\n');} + + if(sb.length()>0){ + ReadWrite.writeString(sb, fileListName, false); + } + } + + final String trimPrefix="TEMP_TRIM_"+tempSalt+"_"; + final String humanPrefix="TEMP_HUMAN_"+tempSalt+"_"; + final String filterPrefix="TEMP_FILTER_"+tempSalt+"_"; + + trim(in1, in2, out1, out2, qfin1, qfin2, qfout1, qfout2, trimPrefix); + filter(out1, out2, out1, out2, qfout1, qfout2, qfout1, qfout2, trimPrefix, filterPrefix, true); + delete(trimPrefix, out1, out2, qfout1, qfout2); + if(normalize || ecc){ + dehumanize(out1, out2, out1, out2, qfout1, qfout2, filterPrefix, humanPrefix, true, true); + delete(filterPrefix, out1, out2, qfout1, qfout2); + Data.unloadAll(); + normalize(out1, out2, out1, out2, qfout1, qfout2, qfout1, qfout2, humanPrefix, true); + delete(humanPrefix, out1, out2, qfout1, qfout2); + }else{ + dehumanize(out1, out2, out1, out2, qfout1, qfout2, filterPrefix, "", true, false); + delete(filterPrefix, out1, out2, qfout1, qfout2); + Data.unloadAll(); + } + + //Write combined stats file (number of reads/bases present/removed in each stage) + if(rqcStatsName!=null){ + final TextStreamWriter tsw=new TextStreamWriter(rqcStatsName, overwrite, false, false); + tsw.start(); + 
tsw.println(BBDukF.rqcString()); + tsw.poisonAndWait(); + } + + //Finish writing log file + if(logName!=null){ + log("complete", true); + if(logName.endsWith(".tmp")){ //Remove .tmp extension + String old=logName; + logName=logName.substring(0, logName.length()-4); + try { + new File(old).renameTo(new File(logName)); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + } + + /** + * Runs BBDuk to perform: + * Quality filtering, quality trimming, n removal, short read removal, artifact removal (via kmer filtering), phiX removal. + * + * @param in1 Primary input reads file (required) + * @param in2 Secondary input reads file + * @param out1 Primary output reads file (required) + * @param out2 Secondary output reads file + * @param qfin1 Primary input qual file + * @param qfin2 Secondary input qual file + * @param qfout1 Primary output qual file + * @param qfout2 Secondary output qual file + * @param inPrefix Append this prefix to input filenames + */ + private void filter(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2, String inPrefix, + String outPrefix, boolean prependIndir){ + + log("filter start", true); + + ArrayList argList=new ArrayList(); + + {//Fill list with BBDuk arguments + if(maq>-1){argList.add("maq="+maq);} + if(maxNs>=0){argList.add("maxns="+maxNs);} + if(minLen>0){argList.add("minlen="+minLen);} + if(minLenFraction>0){argList.add("minlenfraction="+minLenFraction);} + argList.add("k="+k); + argList.add("hdist=1"); + + if(qtrim!=null && trimAfterFiltering){ + argList.add("trimq="+trimq); + argList.add("qtrim="+qtrim); + } + + //Pass along uncaptured arguments + for(String s : primaryArgList){argList.add(s);} + + //Set read I/O files + if(in1!=null){argList.add("in1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in1);} + if(in2!=null){argList.add("in2="+(prependIndir ? (tmpDir==null ? 
outDir : tmpDir) : "")+inPrefix+in2);} + if(out1!=null){argList.add("out1="+(tmpDir==null ? outDir : tmpDir)+outPrefix+out1);} + if(out2!=null){argList.add("out2="+(tmpDir==null ? outDir : tmpDir)+outPrefix+out2);} + if(qfin1!=null){argList.add("qfin1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin1);} + if(qfin2!=null){argList.add("qfin2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin2);} + if(qfout1!=null){argList.add("qfout1="+(tmpDir==null ? outDir : tmpDir)+outPrefix+qfout1);} + if(qfout2!=null){argList.add("qfout2="+(tmpDir==null ? outDir : tmpDir)+outPrefix+qfout2);} + +// if(rqcStatsName!=null){al.add("rqc="+rqcStatsName);} //Old style for 2 log files + if(rqcStatsName!=null){argList.add("rqc=hashmap");} + if(kmerStatsName!=null){argList.add("outduk="+kmerStatsName);} + if(scaffoldStatsName!=null){argList.add("stats="+scaffoldStatsName);} + } + + {//Add BBDuk references + filterrefs.add(mainArtifactFile); + filterrefs.add(rnaFlag ? artifactFileRna : artifactFileDna); + if(phixFlag){filterrefs.add(phixRef);} + + + + StringBuilder refstring=new StringBuilder(); + for(String ref : filterrefs){ + if(ref!=null){ + refstring.append(refstring.length()==0 ? "ref=" : ","); + refstring.append(ref); + } + } + + if(refstring!=null && refstring.length()>0){ + argList.add(refstring.toString()); + } + } + + String[] dukargs=argList.toArray(new String[0]); + + {//Run BBDuk + BBDukF duk=new BBDukF(dukargs); + try { + duk.process(); + } catch (Exception e) { + e.printStackTrace(); + log("failed", true); + System.exit(1); + } + } + + //Optionally append files to file list here + + log("filter finish", true); + } + + /** + * Runs BBMap to perform: + * Removal of reads that map to human with high identity (~88%). 
+ * + * @param in1 Primary input reads file (required) + * @param in2 Secondary input reads file + * @param out1 Primary output reads file (required) + * @param out2 Secondary output reads file + * @param qfin1 Primary input qual file + * @param qfin2 Secondary input qual file + * @param qfout1 Primary output qual file + * @param qfout2 Secondary output qual file + * @param inPrefix Append this prefix to input filenames + */ + private void dehumanize(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String inPrefix, + String outPrefix, boolean prependIndir, boolean prependOutdir){ + + log("dehumanize start", true); + + ArrayList argList=new ArrayList(); + + { + +// argList.add("kfilter="+47); + argList.add("minratio=.75"); + argList.add("maxindel=20"); + argList.add("bw=20"); + argList.add("bwr=0.18"); + argList.add("minhits=2"); + argList.add("path="+humanPath); + argList.add("quickmatch"); + + //Pass along uncaptured arguments + for(String s : primaryArgList){argList.add(s);} + + //Set read I/O files + if(in1!=null){argList.add("in1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in1);} + if(in2!=null){argList.add("in2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in2);} + if(out1!=null){argList.add("outu1="+(prependOutdir ? (tmpDir==null ? outDir : tmpDir) : "")+outPrefix+out1);} + if(out2!=null){argList.add("outu2="+(prependOutdir ? (tmpDir==null ? outDir : tmpDir) : "")+outPrefix+out2);} + if(qfin1!=null){argList.add("qfin1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin1);} + if(qfin2!=null){argList.add("qfin2="+(prependIndir ? (tmpDir==null ? 
outDir : tmpDir) : "")+inPrefix+qfin2);} + + } + + String[] args=argList.toArray(new String[0]); + + {//Run BBMap + try { + BBMap.main(args); + } catch (Exception e) { + e.printStackTrace(); + log("failed", true); + System.exit(1); + } + } + + //Optionally append files to file list here + + log("dehumanize finish", true); + } + + /** + * Runs BBNorm to preform: + * Error correction, error marking, quality trimming, normalization + * + * @param in1 Primary input reads file (required) + * @param in2 Secondary input reads file + * @param out1 Primary output reads file (required) + * @param out2 Secondary output reads file + * @param qfin1 Primary input qual file + * @param qfin2 Secondary input qual file + * @param qfout1 Primary output qual file + * @param qfout2 Secondary output qual file + * @param inPrefix Append this prefix to input filenames + */ + private void normalize(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2, String inPrefix, boolean prependIndir){ + + log("filter start", true); + + ArrayList argList=new ArrayList(); + + {//Fill list with BBDuk arguments + if(qtrim!=null && !trimAfterFiltering){ + argList.add("trimq="+trimq); + argList.add("qtrim="+qtrim); + } + if(minLen>0){argList.add("minlen="+minLen);} + if(minLenFraction>0){argList.add("minlenfraction="+minLenFraction);} + + argList.add("ecc="+ecc); + argList.add("aec="+aec); + argList.add("meo="+meo); + argList.add("tam="+tam); + argList.add("mue="+mue); + argList.add("mw1="+mw1); + argList.add("prefilter=t"); + argList.add("prehashes="+prehashes); + argList.add("hashes="+hashes); + argList.add("bits="+bits); + if(normalize){ + if(target>0){ + argList.add("target="+target); + if(mindepth<0){mindepth=Tools.min(10, target/8);} + if(maxdepth<0){maxdepth=Tools.max(target, (int)((target*17L)/16L));} + } + if(mindepth>=0){argList.add("min="+mindepth);} + if(maxdepth>0){argList.add("max="+maxdepth);} + }else{ + argList.add("keepall"); + } + + 
//Set read I/O files + if(in1!=null){argList.add("in1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in1);} + if(in2!=null){argList.add("in2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+in2);} + if(out1!=null){argList.add("out="+outDir+out1);} +// if(out2!=null){argList.add("out2="+outDir+out2);} + if(qfin1!=null){argList.add("qfin1="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin1);} + if(qfin2!=null){argList.add("qfin2="+(prependIndir ? (tmpDir==null ? outDir : tmpDir) : "")+inPrefix+qfin2);} +// if(qfout1!=null){argList.add("qfout1="+outDir+qfout1);} +// if(qfout2!=null){argList.add("qfout2="+outDir+qfout2);} + + if(kmerHistName!=null){argList.add("hist="+kmerHistName);} + } + + String[] normargs=argList.toArray(new String[0]); + + {//Run BBNorm + try { + KmerNormalize.main(normargs); + } catch (Exception e) { + e.printStackTrace(); + log("failed", true); + System.exit(1); + } + } + + //Optionally append files to file list here + + log("normalization finish", true); + } + + + /** + * Runs BBDuk to perform: + * Kmer trimming, short read removal. + * + * @param in1 Primary input reads file (required) + * @param in2 Secondary input reads file + * @param out1 Primary output reads file (required) + * @param out2 Secondary output reads file + * @param qfin1 Primary input qual file + * @param qfin2 Secondary input qual file + * @param qfout1 Primary output qual file + * @param qfout2 Secondary output qual file + * @param outPrefix Append this prefix to output filenames + */ + private void trim(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2, String outPrefix){ + + log("ktrim start", true); + + ArrayList argList=new ArrayList(); + + {//Fill list with BBDuk arguments + argList.add("mink="+mink); + argList.add("ktrim="+(ktrim==null ? 
"f" : ktrim)); + if(minLen>0){argList.add("minlen="+minLen);} + if(minLenFraction>0){argList.add("minlenfraction="+minLenFraction);} + argList.add("k=23"); + argList.add("hdist=1"); + + //Pass along uncaptured arguments + for(String s : primaryArgList){argList.add(s);} + + //Set read I/O files + if(in1!=null){argList.add("in1="+in1);} + if(in2!=null){argList.add("in2="+in2);} + if(out1!=null){argList.add("out1="+(tmpDir==null ? outDir : tmpDir)+outPrefix+out1);} + if(out2!=null){argList.add("out2="+(tmpDir==null ? outDir : tmpDir)+outPrefix+out2);} + if(qfin1!=null){argList.add("qfin1="+qfin1);} + if(qfin2!=null){argList.add("qfin2="+qfin2);} + if(qfout1!=null){argList.add("qfout1="+(tmpDir==null ? outDir : tmpDir)+outPrefix+qfout1);} + if(qfout2!=null){argList.add("qfout2="+(tmpDir==null ? outDir : tmpDir)+outPrefix+qfout2);} + +// if(rqcStatsName!=null){al.add("rqc="+rqcStatsName_kt);} //Old style for 2 log files + if(rqcStatsName!=null){argList.add("rqc=hashmap");} + if(kmerStatsName!=null){argList.add("outduk="+kmerStatsName_kt);} + if(scaffoldStatsName!=null){argList.add("stats="+scaffoldStatsName_kt);} + } + + {//Add BBDuk references + trimrefs.add(fragArtifacts); + + StringBuilder refstring=new StringBuilder(); + for(String ref : trimrefs){ + if(ref!=null){ + refstring.append(refstring.length()==0 ? 
"ref=" : ","); + refstring.append(ref); + } + } + + if(refstring!=null && refstring.length()>0){ + argList.add(refstring.toString()); + } + } + + String[] dukargs=argList.toArray(new String[0]); + + {//run BBDuk + BBDukF duk=new BBDukF(dukargs); + try { + duk.process(); + } catch (Exception e) { + e.printStackTrace(); + log("failed", true); + System.exit(1); + } + } + + //Optionally append files to file list here + + log("ktrim finish", true); + } + + /*--------------------------------------------------------------*/ + /*---------------- Helper Methods ----------------*/ + /*--------------------------------------------------------------*/ + + + /** + * Log a message in the log file + * @param message Message to log + * @param append True to append, false to overwrite + */ + private void log(String message, boolean append){ + if(logName!=null){ + ReadWrite.writeString(message+", "+timeString()+"\n", logName, append); + } + } + + + /** + * Delete all non-null filenames. + * @param prefix Append this prefix to filenames before attempting to delete them + * @param names Filenames to delete + */ + private void delete(String prefix, String...names){ + log("delete temp files start", true); + if(names!=null){ + for(String s : names){ + if(s!=null){ + s=(tmpDir==null ? outDir : tmpDir)+prefix+s; + if(verbose){System.err.println("Trying to delete "+s);} + File f=new File(s); + if(f.exists()){ + f.delete(); + } + } + } + } + log("delete temp files finish", true); + } + + + /** + * Delete all non-null filenames. + * @param prefix Append this prefix to filenames before attempting to delete them + * @param names Filenames to delete + */ + private void move(String prefix, String...names){ + log("delete temp files start", true); + if(names!=null){ + for(String s : names){ + if(s!=null){ + s=(tmpDir==null ? 
outDir : tmpDir)+prefix+s; + if(verbose){System.err.println("Trying to delete "+s);} + File f=new File(s); + if(f.exists()){ + f.delete(); + } + } + } + } + log("delete temp files finish", true); + } + + /** + * @return String of symbols indicating which processes were applied to the input reads + */ + private String abbreviation(){ + StringBuilder sb=new StringBuilder(); + + if(mainArtifactFile!=null || (rnaFlag ? artifactFileRna!=null : artifactFileDna!=null)){sb.append("a");} + + if(maxNs>=0){sb.append("n");} +// if(qtrim!=null && !qtrim.equalsIgnoreCase("f") && !qtrim.equalsIgnoreCase("false")){sb.append("q");} + if(maq>0){sb.append("q");} + + if(rnaFlag){sb.append("r");} + else{sb.append("d");} + + if(phixFlag){sb.append("p");} + + return sb.toString(); + } + + /*--------------------------------------------------------------*/ + /*---------------- Static Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * TODO: Some machines are set to UTC rather than PST + * @return Timestamp in RQC's format + */ + public static String timeString(){ + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); +// sdf.setTimeZone(TimeZone.getTimeZone("PST")); + sdf.setTimeZone(TimeZone.getDefault()); + return sdf.format(new Date()); + } + + /*--------------------------------------------------------------*/ + /*---------------- BBNorm Parameters ----------------*/ + /*--------------------------------------------------------------*/ + + private boolean normalize=true; + private boolean ecc=true; + private boolean aec=false; + private boolean meo=false; + private boolean tam=false; + private boolean trimAfterFiltering=true; + private boolean mue=false; + private boolean mw1=false; + private int maxdepth=-1; + private int mindepth=6; + private int target=50; + private int prehashes=3; + private int hashes=4; + private int bits=16; + + /*--------------------------------------------------------------*/ + 
/*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Symbols to insert in output filename to denote operations performed */ + private final String symbols; + + /** True for rna artifacts, false for dna artifacts */ + private boolean rnaFlag=false; + /** True if phix should be filtered out */ + private boolean phixFlag=true; + /** Toss reads shorter than this */ + private int minLen=40; + /** Toss reads shorter than this fraction of initial length, after trimming */ + private float minLenFraction=0.6f; + /** Trim bases at this quality or below */ + private byte trimq=14; + /** Throw away reads below this average quality before trimming. Default: 8 */ + private byte maq=8; + /** Quality-trimming mode */ + private String qtrim="rl"; + /** Kmer-trimming mode */ + private String ktrim="r"; + /** Kmer to use for filtering */ + private int k=27; + /** Shortest kmer to use for trimming */ + private int mink=11; + /** Throw away reads containing more than this many Ns. 
Default: 0 (toss reads with any Ns) */ + private int maxNs=5; + + private boolean verbose=false; + private boolean overwrite=true; + private boolean compress=true; + + /** Arguments to pass to BBDuk */ + private ArrayList primaryArgList=new ArrayList(); + /** References to pass to BBDuk for artifact removal */ + private ArrayList trimrefs=new ArrayList(); + /** References to pass to BBDuk for artifact removal */ + private ArrayList filterrefs=new ArrayList(); + + /*--------------------------------------------------------------*/ + /*---------------- Read Data Files ----------------*/ + /*--------------------------------------------------------------*/ + + /** Directory in which to write all files */ + private String outDir=""; + + /** Directory in which to write all temp files */ + private String tmpDir=Shared.TMPDIR; + + private final String tempSalt; + + /** Primary input reads file (required) */ + private String in1=null; + /** Secondary input reads file */ + private String in2=null; + /** Primary output reads file (required) */ + private String out1=null; + /** Secondary output reads file */ + private String out2=null; + /** Primary input qual file */ + private String qfin1=null; + /** Secondary input qual file */ + private String qfin2=null; + /** Primary output qual file */ + private String qfout1=null; + /** Secondary output qual file */ + private String qfout2=null; + + /*--------------------------------------------------------------*/ + /*---------------- Log Files ----------------*/ + /*--------------------------------------------------------------*/ + + private String logName="status.log"; + private String fileListName="file-list.txt"; + + private String rqcStatsName="filterStats.txt"; + private String kmerStatsName="kmerStats.txt"; + private String scaffoldStatsName="scaffoldStats.txt"; + private String kmerHistName="khist.txt"; + + /** ktrim phase rqc stats file */ + private String rqcStatsName_kt; + /** ktrim phase stats file */ + private String 
kmerStatsName_kt; + /** ktrim phase scaffold stats file */ + private String scaffoldStatsName_kt; + + /*--------------------------------------------------------------*/ + /*---------------- Reference Files ----------------*/ + /*--------------------------------------------------------------*/ + + private String mainArtifactFile = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/Illumina.artifacts.2013.12.no_DNA_RNA_spikeins.fa"; + private String artifactFileRna = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/RNA_spikeins.artifacts.2012.10.NoPolyA.fa"; + private String artifactFileDna = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/DNA_spikeins.artifacts.2012.10.fa"; + private String phixRef = "/global/dna/shared/rqc/ref_databases/qaqc/databases/phix174_ill.ref.fa"; + + private String allArtifactsLatest = "/global/projectb/sandbox/rqc/qcdb/illumina.artifacts/Illumina.artifacts.fa"; + private String fragArtifacts = "/global/projectb/sandbox/gaag/bbtools/data/adapters.fa"; + private String humanPath = "/global/projectb/sandbox/gaag/bbtools/hg19/"; + + /*--------------------------------------------------------------*/ + /*---------------- Static Fields ----------------*/ + /*--------------------------------------------------------------*/ + +} diff --git a/current/jgi/CalcTrueQuality.java b/current/jgi/CalcTrueQuality.java new file mode 100755 index 0000000..19125ea --- /dev/null +++ b/current/jgi/CalcTrueQuality.java @@ -0,0 +1,1031 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FastaReadInputStream; +import stream.Read; +import stream.SamLine; +import align2.ListNum; +import align2.QualityTools; +import align2.ReadStats; +import align2.Shared; +import align2.Tools; +import dna.AminoAcid; +import dna.Data; 
+import dna.Gene; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ByteFile1; +import fileIO.ByteFile2; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Jan 13, 2014 + * + */ +public class CalcTrueQuality { + + /*--------------------------------------------------------------*/ + /*---------------- Initialization ----------------*/ + /*--------------------------------------------------------------*/ + + public static void main(String[] args){ + ReadStats.COLLECT_QUALITY_STATS=true; + CalcTrueQuality ctq=new CalcTrueQuality(args); + ReadStats.OVERWRITE=ctq.overwrite; + ctq.process(); + + ctq.writeMatrices(); + } + + public static void printOptions(){ + assert(false) : "No help available."; + } + + public CalcTrueQuality(String[] args){ + if(args==null || args.length==0){ + printOptions(); + System.exit(0); + } + + for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} + outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + FastaReadInputStream.SPLIT_READS=false; + stream.FastaReadInputStream.MIN_READ_LEN=1; + Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH); + Shared.READ_BUFFER_NUM_BUFFERS=Tools.min(8, Shared.READ_BUFFER_NUM_BUFFERS); + ReadWrite.USE_PIGZ=false; + ReadWrite.USE_UNPIGZ=true; + SamLine.CONVERT_CIGAR_TO_MATCH=true; + + for(int i=0; i1 ? 
split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + ByteFile2.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + ConcurrentGenericReadInputStream.verbose=verbose; +// align2.FastaReadInputStream2.verbose=verbose; + stream.FastqReadInputStream.verbose=verbose; + ReadWrite.verbose=verbose; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("t") || a.equals("threads")){ + Shared.THREADS=Tools.max(Integer.parseInt(b), 1); + }else if(a.equals("build") || a.equals("genome")){ + Data.setGenome(Integer.parseInt(b)); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + 
in=b.split(","); + }else if(a.equals("out") || a.equals("output") || a.equals("q102") || a.equals("q102out")){ + q102out=b; + }else if(a.equals("qbp") || a.equals("qbpout")){ + qbpout=b; + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(in==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + in=arg.split(","); + }else if(q102out==null && i==1 && !arg.contains("=")){ + q102out=arg; + }else{ + System.err.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); +// if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;} + + if(in==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.THREADS>2){ +// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;} + ByteFile.FORCE_MODE_BF2=true; + } + + if(!Tools.testOutputFiles(overwrite, false, q102out, qbpout, q10out, q12out, qb012out, qb234out, qpout, qout, pout)){ + throw new RuntimeException("\n\nOVERWRITE="+overwrite+"; Can't write to output file "+q102out+"\n"); + } + } + + /*--------------------------------------------------------------*/ + /*---------------- Outer Methods ----------------*/ + /*--------------------------------------------------------------*/ + + public void process(){ + Timer t=new Timer(); + t.start(); + for(String s : in){ + process(s); + } + + t.stop(); + + double rpnano=readsProcessed/(double)(t.elapsed); + double bpnano=basesProcessed/(double)(t.elapsed); + + String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? 
(readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m"); + String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m"); + + while(rpstring.length()<8){rpstring=" "+rpstring;} + while(bpstring.length()<8){bpstring=" "+bpstring;} + + outstream.println("Time: \t"+t); + outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000)); + outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + + rpstring=(readsUsed<100000 ? ""+readsUsed : readsUsed<100000000 ? (readsUsed/1000)+"k" : (readsUsed/1000000)+"m"); + bpstring=(basesUsed<100000 ? ""+basesUsed : basesUsed<100000000 ? (basesUsed/1000)+"k" : (basesUsed/1000000)+"m"); + + while(rpstring.length()<8){rpstring=" "+rpstring;} + while(bpstring.length()<8){bpstring=" "+bpstring;} + + outstream.println("Reads Used: "+rpstring); + outstream.println("Bases Used: "+bpstring); + + if(errorState){ + throw new RuntimeException(this.getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + public void process(String fname){ + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + FileFormat ff=FileFormat.testInput(fname, FileFormat.SAM, null, true, false); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, false, ff, null); + if(verbose){System.err.println("Starting cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + while(reads!=null && reads.size()>0){ + + for(int idx=0; idx0){ + sb.append(a); + sb.append('\t'); + sb.append(b); + sb.append('\t'); + sb.append(c); + sb.append('\t'); + sb.append(d); + sb.append('\t'); + sb.append(sum); + sb.append('\t'); + sb.append(bad); + sb.append('\n'); + } + } + if(sb.length()>0){ + tsw.print(sb.toString()); + sb.setLength(0); + } + } + } + } + System.err.println("Writing "+fname); + tsw.poisonAndWait(); + System.err.println("Done."); + } + + public static void writeMatrix(String fname, long[][][] goodMatrix, long[][][] badMatrix, boolean overwrite){ + assert(fname!=null) : "No file specified"; + FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, false); + TextStreamWriter tsw=new TextStreamWriter(ff, false); + System.err.println("Starting tsw for "+fname); + tsw.start(); + System.err.println("Started tsw for "+fname); + StringBuilder sb=new StringBuilder(); + + final int d0=goodMatrix.length, d1=goodMatrix[0].length, d2=goodMatrix[0][0].length; + for(int a=0; a0){ + sb.append(a); + sb.append('\t'); + sb.append(b); + sb.append('\t'); + sb.append(c); + sb.append('\t'); + sb.append(sum); + sb.append('\t'); + sb.append(bad); + sb.append('\n'); + } + } + if(sb.length()>0){ + tsw.print(sb.toString()); + sb.setLength(0); + } + } + } + System.err.println("Writing "+fname); + tsw.poisonAndWait(); + System.err.println("Done."); + } + + public static void writeMatrix(String fname, long[][] goodMatrix, long[][] badMatrix, boolean overwrite){ + assert(fname!=null) : "No file specified"; + FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, false); + TextStreamWriter tsw=new TextStreamWriter(ff, false); + System.err.println("Starting tsw for "+fname); + tsw.start(); + System.err.println("Started tsw for "+fname); + StringBuilder sb=new StringBuilder(); + + final int d0=goodMatrix.length, d1=goodMatrix[0].length; + for(int a=0; a0){ + sb.append(a); + sb.append('\t'); + 
sb.append(b); + sb.append('\t'); + sb.append(sum); + sb.append('\t'); + sb.append(bad); + sb.append('\n'); + } + } + if(sb.length()>0){ + tsw.print(sb.toString()); + sb.setLength(0); + } + } + System.err.println("Writing "+fname); + tsw.poisonAndWait(); + System.err.println("Done."); + } + + public static void writeMatrix(String fname, long[] goodMatrix, long[] badMatrix, boolean overwrite){ + assert(fname!=null) : "No file specified"; + FileFormat ff=FileFormat.testOutput(fname, FileFormat.TEXT, null, false, overwrite, false); + TextStreamWriter tsw=new TextStreamWriter(ff, false); + System.err.println("Starting tsw for "+fname); + tsw.start(); + System.err.println("Started tsw for "+fname); + StringBuilder sb=new StringBuilder(); + + final int d0=goodMatrix.length; + for(int a=0; a0){ + sb.append(a); + sb.append('\t'); + sb.append(sum); + sb.append('\t'); + sb.append(bad); + sb.append('\n'); + } + if(sb.length()>0){ + tsw.print(sb.toString()); + sb.setLength(0); + } + } + System.err.println("Writing "+fname); + tsw.poisonAndWait(); + System.err.println("Done."); + } + + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + + private void process(Read r){ + if(r==null){return;} + readsProcessed++; + basesProcessed+=r.bases==null ? 
0 : r.bases.length; + + if(verbose){outstream.println(r+"\n");} + + if(verbose){outstream.println("A");} + if(r.match!=null && r.shortmatch()){ + r.match=Read.toLongMatchString(r.match); + r.setShortMatch(false); + } + final byte[] quals=r.quality, bases=r.bases, match=r.match; + if(quals==null || bases==null || match==null){return;} + if(verbose){outstream.println("B");} + if(r.containsNonNMS() || r.containsConsecutiveS(4)){ + if(verbose){System.err.println("*************************************************** "+new String(match));} + return; + } + if(r.strand()==Gene.MINUS){ + Tools.reverseInPlace(match); + } + if(verbose){outstream.println("C");} + + final byte e='E'; + + readstats.addToQualityHistogram(r); + + readsUsed++; + for(int i=0, last=quals.length-1; i0 ? (byte)Tools.mid(QMAX, quals[i-1], 0) : QEND); + final byte q1=quals[i]; + final byte q2=(i1 ? bases[i-2] : e; + byte b1=i>0 ? bases[i-1] : e; + byte b2=bases[i]; + byte b3=i0 ? (byte)Tools.mid(QMAX, quals[pos-1], 0) : QEND); + final byte q1=quals[pos]; + final byte q2=(pos1 ? bases[pos-2] : e; + byte b1=pos>0 ? bases[pos-1] : e; + byte b2=bases[pos]; + byte b3=pos0 ? (byte)Tools.mid(QMAX, quals[pos-1], 0) : QEND); + final byte q1=quals[pos]; + final byte q2=(pos1 ? bases[pos-2] : e; + byte b1=pos>0 ? bases[pos-1] : e; + byte b2=bases[pos]; + byte b3=pos0 ? (byte)Tools.mid(QMAX, quals[pos-1], 0) : QEND); + final byte q1=quals[pos]; + final byte q2=(pos1 ? bases[pos-2] : e; + byte b1=pos>0 ? bases[pos-1] : e; + byte b2=bases[pos]; + byte b3=pos out="); + System.out.println("Alternately, 'out=stdout' will print to standard out."); + System.out.println("Optional flag, format:"); + System.out.println("format=1\tid start stop A C G T N GC"); + System.out.println("format=2\tid gc"); + System.out.println("format=4\tid length gc"); + System.out.println("Output is always tab-delimited. 
AGCT are fractions of defined bases; N is fraction of total bases."); + System.exit(0); + } + + boolean benchmark=false; + ReadWrite.USE_UNPIGZ=true; + + String in=null, out=null; + + for(int i=0; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in")){ + in=b; + }else if(a.equals("out")){ + out=b; + if(b==null || "summaryonly".equalsIgnoreCase(b) || "none".equalsIgnoreCase(b)){ + out=null; + SUMMARY_ONLY=true; + }else if("benchmark".equalsIgnoreCase(b)){ + benchmark=true; + out=null; + SUMMARY_ONLY=true; + } + }else if(a.equals("benchmark")){ + benchmark=Tools.parseBoolean(b); + if(benchmark){ + out=null; + SUMMARY_ONLY=true; + } + }else if(a.equals("format")){ + FORMAT=Integer.parseInt(b); + if(FORMAT!=1 && FORMAT!=2 && FORMAT!=4){ + throw new RuntimeException("\nUnknown format: "+FORMAT+"; valid values are 1, 2, and 4.\n"); + } + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF1=!(ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b)); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(in==null && i==0 && !args[i].contains("=")){ + in=args[i]; + }else if(out==null && i==1 && !args[i].contains("=")){ + out=args[i]; + } + 
} + } + + long[] counts=null; + long sum=0; + + if(out==null || out.equalsIgnoreCase("stdout") || out.equalsIgnoreCase("standardout")){out=null;} + + InputStream is=null; + { + if(in==null){throw new RuntimeException("No input file.");} + if(in.equalsIgnoreCase("stdin") || in.equalsIgnoreCase("standardin")){ + is=System.in; + }else{ + File f=new File(in); + if((!f.exists() || f.isDirectory()) && !in.toLowerCase().startsWith("stdin")){ + throw new RuntimeException("Input file does not appear to be valid: "+in); + } + } + } + + if(is==null){is=ReadWrite.getInputStream(in, false, true);} + try { + if(benchmark){sum=bench2(is);} + else{ + FileFormat ff=FileFormat.testInput(in, FileFormat.FASTA, null, true, true); + boolean fastq=ff.fastq(); + boolean fasta=!fastq; //Default. + if(fastq){counts=countFastq(is, out);} + else if(fasta){counts=countFasta(is, out);} + else{throw new RuntimeException("Unknown or unsupported file format.");} + } + } catch (IOException e) { + e.printStackTrace(); + } + try { + if(is!=System.in){is.close();} + } catch (IOException e) { + e.printStackTrace(); + } + + + t.stop(); + + if(benchmark){ + System.err.println("Time: \t"+t); + long bytes=new File(in).length(); + if(bytes<1){bytes=LIMSUM;} + double mbps1=bytes*1000d/t.elapsed; + double mbps2=sum*1000d/t.elapsed; + System.err.println(String.format("Raw Speed: \t%.2f MBytes/s",mbps1)); + System.err.println(String.format("Uncompressed Speed:\t%.2f MBytes/s",mbps2)); + }else{ + System.err.println(toString2(new StringBuilder("Overall"), counts)); + System.err.println("Time: \t"+t); + long bytes=new File(in).length(); + if(bytes<1){bytes=LIMSUM;} + double mbps=bytes*1000d/t.elapsed; + double mbpps=Tools.sum(counts)*1000d/t.elapsed; + System.err.println(String.format("Speed:\t%.2f MBytes/s",mbps)); + System.err.println(String.format(" \t%.2f MBases/s",mbpps)); + } + + } + + public static long bench2(InputStream is) throws IOException{ + final byte[] buf=new byte[32768]; + long sum=0; + for(long 
len=is.read(buf); len>0; len=is.read(buf)){sum+=len;} + return sum; + } + + public static long[] countFasta(InputStream is, String out) throws IOException{ + + long limsum=0; + final byte[] buf=new byte[32768]; + final TextStreamWriter tsw=(out==null ? null : new TextStreamWriter(out, true, false, false)); + if(tsw!=null){tsw.start();} + final int[] counts=new int[6]; + final long[] overall=new long[6]; + final StringBuilder hdr=new StringBuilder(); + boolean hdmode=false; + + int i=0; + int lim=is.read(buf); + limsum+=lim; + + while(lim>0){ + if(hdmode){ + while(i0 || Tools.sum(counts)>0){ + if(tsw!=null){tsw.print(toString2(hdr, counts));}else if(!SUMMARY_ONLY){System.out.print(toString2(hdr, counts));} + hdr.setLength(0); + for(int j=0; j=lim){ + i=0; + lim=is.read(buf); + limsum+=lim; + } + } + + if(hdr.length()>0 || Tools.sum(counts)>0){ + if(tsw!=null){tsw.print(toString2(hdr, counts));}else if(!SUMMARY_ONLY){System.out.println(toString2(hdr, counts));} + hdr.setLength(0); + for(int j=0; j0){ + while(i=lim){ + lim=is.read(buf); + limsum+=lim; + }else{ + assert(buf[i]==at); + mode=0; + } + } + + while(lim>0){ + if(mode==0){ + while(i0 || Tools.sum(counts)>0){ + if(tsw!=null){tsw.print(toString2(hdr, counts));}else if(!SUMMARY_ONLY){System.out.print(toString2(hdr, counts));} + hdr.setLength(0); + for(int j=0; j=lim){ + i=0; + lim=is.read(buf); + limsum+=lim; + } + } + + if(hdr.length()>0 || Tools.sum(counts)>0){ + if(tsw!=null){tsw.print(toString2(hdr, counts));}else if(!SUMMARY_ONLY){System.out.println(toString2(hdr, counts));} + hdr.setLength(0); + for(int j=0; j']=r['@']=r['+']=5; + return r; + } +} diff --git a/current/jgi/CountKmersExact.java b/current/jgi/CountKmersExact.java new file mode 100755 index 0000000..8810aa1 --- /dev/null +++ b/current/jgi/CountKmersExact.java @@ -0,0 +1,1048 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import kmer.AbstractKmerTable; +import 
kmer.HashArray; +import kmer.HashBuffer; +import kmer.HashForest; +import kmer.KCountArray; +import kmer.KmerCount7MTA; +import kmer.KmerTable; +import kmer.Primes; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import align2.TrimRead; +import dna.AminoAcid; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ReadWrite; +import fileIO.FileFormat; +import fileIO.TextStreamWriter; + +/** + * Loads kmers from input reads into kmer tables and reports how many unique kmers were stored. + * @author Brian Bushnell + * @date Nov 22, 2013 + * + */ +public class CountKmersExact { + + /** + * Code entrance from the command line. + * Prints the usage text and exits when Tools.parseHelp(args) returns true; + * otherwise constructs an instance from the arguments and calls process() on it. + * @param args Command line arguments + */ + public static void main(String[] args){ + + if(Tools.parseHelp(args)){ + printOptions(); + System.exit(0); + } + + //Create a new CountKmersExact instance + CountKmersExact cke=new CountKmersExact(args); + + //And run it + cke.process(); + } + + /** + * Display usage information. + */ + private static void printOptions(){ + outstream.println("Syntax:\n"); + outstream.println("\njava -ea -Xmx20g -cp jgi.CountKmersExact in="); + outstream.println("\nOptional flags:"); + outstream.println("in= \tThe 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in."); + outstream.println("in2= \tUse this if 2nd read of pairs are in a different file."); + outstream.println("out= \t(outnonmatch) The 'out=' flag is needed if the output file is not the second parameter. 
'out=stdout' will pipe to standard out."); + outstream.println("out2= \t(outnonmatch2) Use this to write 2nd read of pairs to a different file."); + outstream.println("stats= \tWrite statistics about which contaminants were detected."); + outstream.println(""); + outstream.println("threads=auto \t(t) Set number of threads to use; default is number of logical processors."); + outstream.println("overwrite=t \t(ow) Set to false to force the program to abort rather than overwrite an existing file."); + outstream.println("showspeed=t \t(ss) Set to 'f' to suppress display of processing speed."); + outstream.println("interleaved=auto \t(int) If true, forces fastq input to be paired and interleaved."); + outstream.println("k=28 \tKmer length used for finding contaminants. Contaminants shorter than k will not be found."); + outstream.println("minavgquality=0 \t(maq) Reads with average quality (before trimming) below this will be discarded."); + outstream.println("touppercase=f \t(tuc) Change all letters in reads and reference to upper-case."); + outstream.println("qtrim=f \tTrim read ends to remove bases with quality below minq. Performed AFTER looking for kmers. "); + outstream.println(" \tValues: t (trim both ends), f (neither end), r (right end only), l (left end only)."); + outstream.println("minq=4 \tTrim quality threshold."); + outstream.println("minlength=2 \t(ml) Reads shorter than this after trimming will be discarded. Pairs will be discarded only if both are shorter."); + outstream.println("ziplevel=2 \t(zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster."); + outstream.println("fastawrap=100 \tLength of lines in fasta output"); + outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto"); + outstream.println("qout=auto \tASCII offset for output quality. 
May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)"); + outstream.println("rcomp=t \tLook for reverse-complements of kmers also."); + outstream.println("forest=t \tUse HashForest data structure"); + outstream.println("table=f \tUse KmerTable data structure"); + outstream.println("array=f \tUse HashArray data structure"); + } + + + /** + * Constructor. + * @param args Command line arguments + */ + public CountKmersExact(String[] args){ + for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}} + System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + /* Set global defaults */ + ReadWrite.ZIPLEVEL=2; + ReadWrite.USE_UNPIGZ=true; + FastaReadInputStream.SPLIT_READS=false; + ByteFile.FORCE_MODE_BF2=Shared.THREADS>2; + + /* Initialize local variables with defaults */ + boolean setOut=false, setOutb=false, qtrimRight_=false, qtrimLeft_=false; + boolean rcomp_=true; + boolean useForest_=false, useTable_=false, useArray_=true; + long skipreads_=0; + int k_=28; + int ways_=-1; + byte qin=-1, qout=-1; + + byte trimq_=4; + byte minAvgQuality_=0; + int minReadLength_=20; + int maxNs_=-1; + boolean removePairsIfEitherBad_=true; + int filterMax_=2; + + { + boolean b=false; + assert(b=true); + EA=b; + } + + /* Parse arguments */ + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in") || a.equals("in1")){ + in1=b; + }else if(a.equals("in2")){ + in2=b; + }else if(a.equals("out") || a.equals("out1") || a.equals("outu") || a.equals("outu1") || a.equals("outnonmatch") || + a.equals("outnonmatch1") || a.equals("outunnmatch") || a.equals("outunmatch1") || a.equals("outunnmatched") || a.equals("outunmatched1")){ + out1=b; + setOut=true; + }else if(a.equals("out2") || a.equals("outu2") || a.equals("outnonmatch2") || a.equals("outunmatch2") || + a.equals("outnonmatched2") || a.equals("outunmatched2")){ + out2=b; + }else if(a.equals("stats")){ + outstats=b; + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + + }else if(a.equals("initialsize")){ + initialSize=Integer.parseInt(b); + }else if(a.equals("forest")){ + useForest_=Tools.parseBoolean(b); + if(useForest_){useTable_=useArray_=false;} + }else if(a.equals("table")){ + useTable_=Tools.parseBoolean(b); + if(useTable_){useForest_=useArray_=false;} + }else if(a.equals("array")){ + useArray_=Tools.parseBoolean(b); + if(useArray_){useTable_=useForest_=false;} + }else if(a.equals("ways")){ + ways_=Integer.parseInt(b); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + 
ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("buflen") || a.equals("bufflen") || a.equals("bufferlength")){ + buflen=Integer.parseInt(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("k")){ + assert(b!=null) : "\nThe k key needs an integer value from 1 to 31, such as k=28\n"; + k_=Integer.parseInt(b); + }else if(a.equals("skipreads")){ + skipreads_=Long.parseLong(b); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=(b==null || b.equalsIgnoreCase("auto") ? 
Shared.THREADS : Integer.parseInt(b)); + }else if(a.equals("minavgquality") || a.equals("maq")){ + minAvgQuality_=Byte.parseByte(b); + }else if(a.equals("maxns")){ + maxNs_=Byte.parseByte(b); + }else if(a.equals("showspeed") || a.equals("ss")){ + showSpeed=Tools.parseBoolean(b); + }else if(a.equals("verbose")){ + assert(false) : "Verbose flag is currently static final; must be recompiled to change."; +// verbose=Tools.parseBoolean(b); + }else if(a.equals("rcomp")){ + rcomp_=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("fastaminlen") || a.equals("fastaminlength")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("trim") || a.equals("qtrim")){ + if(b==null){qtrimRight_=qtrimLeft_=true;} + else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrimLeft_=true;qtrimRight_=false;} + else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrimLeft_=false;qtrimRight_=true;} + else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrimLeft_=qtrimRight_=true;} + else if(Character.isDigit(b.charAt(0))){ + if(!qtrimLeft_ && !qtrimRight_){qtrimLeft_=qtrimRight_=true;} + trimq_=Byte.parseByte(b); + }else{qtrimRight_=qtrimLeft_=Tools.parseBoolean(b);} + }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){ + if(b!=null && (b.charAt(0)=='.' 
|| Character.isDigit(b.charAt(0)))){ + TrimRead.optimalMode=true; + TrimRead.optimalBias=Float.parseFloat(b); + assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("trimright") || a.equals("qtrimright")){ + qtrimRight_=Tools.parseBoolean(b); + }else if(a.equals("trimleft") || a.equals("qtrimleft")){ + qtrimLeft_=Tools.parseBoolean(b); + }else if(a.equals("trimq") || a.equals("trimquality")){ + trimq_=Byte.parseByte(b); + }else if(a.equals("q102matrix") || a.equals("q102m")){ + CalcTrueQuality.q102matrix=b; + }else if(a.equals("qbpmatrix") || a.equals("bqpm")){ + CalcTrueQuality.qbpmatrix=b; + }else if(a.equals("adjustquality") || a.equals("adjq")){ + TrimRead.ADJUST_QUALITY=Tools.parseBoolean(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("prefilter")){ + if(b==null || b.length()<1 
|| !Character.isDigit(b.charAt(0))){ + prefilter=Tools.parseBoolean(b); + }else{ + filterMax_=Integer.parseInt(b); + prefilter=filterMax_>0; + } + }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + in1=args[i]; + }else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + out1=args[i]; + setOut=true; + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(TrimRead.ADJUST_QUALITY){CalcTrueQuality.initializeMatrices();} + + if(ways_<1){ + long maxKmers=Runtime.getRuntime().maxMemory()/12; + long minWays=Tools.min(10000, maxKmers/Integer.MAX_VALUE); + ways_=(int)Tools.max(31, THREADS*4, minWays); + ways_=(int)Primes.primeAtLeast(ways_); + assert(ways_>0); + System.err.println("ways="+ways_); + } + + /* Set final variables; post-process and validate argument combinations */ + + useForest=useForest_; + useTable=useTable_; + useArray=useArray_; + rcomp=rcomp_; + skipreads=skipreads_; + trimq=trimq_; + minAvgQuality=minAvgQuality_; + minReadLength=minReadLength_; + removePairsIfEitherBad=removePairsIfEitherBad_; + maxNs=maxNs_; + WAYS=ways_; + filterMax=Tools.min(filterMax_, 0x7FFFFFFF); + + k=k_; + k2=k-1; + + qtrimRight=qtrimRight_; + qtrimLeft=qtrimLeft_; + + keySets=new AbstractKmerTable[WAYS]; + + + /* Adjust I/O settings and filenames */ + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + assert(FastaReadInputStream.settingsOK()); + + if(in1==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null 
&& (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(out1!=null && out1.contains("#")){ + int pound=out1.lastIndexOf('#'); + String a=out1.substring(0, pound); + String b=out1.substring(pound+1); + out1=a+1+b; + out2=a+2+b; + } + + if(!setOut){ + out1="stdout.fq"; + outstream=System.err; + out2=null; + }else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){ + out1="stdout.fq"; + outstream=System.err; + out2=null; + } + if(out1!=null && !Tools.canWrite(out1, overwrite)){throw new RuntimeException("Output file "+out1+" already exists, and overwrite="+overwrite);} + + assert(!in1.equalsIgnoreCase(out1)); + assert(!in1.equalsIgnoreCase(in2)); + assert(out1==null || !out1.equalsIgnoreCase(out2)); + assert(THREADS>0); + + assert(in1==null || in1.toLowerCase().startsWith("stdin") || in1.toLowerCase().startsWith("standardin") || new File(in1).exists()) : "Can't find "+in1; + assert(in2==null || in2.toLowerCase().startsWith("stdin") || in2.toLowerCase().startsWith("standardin") || new File(in2).exists()) : "Can't find "+in2; + + //Initialize tables + for(int i=0; i=(1<100000){ + Timer ht=new Timer(); + ht.start(); + + KmerCount7MTA.CANONICAL=true; + ArrayList extra=null; + prefilterArray=KmerCount7MTA.makeKca(in1, in2, extra, k, cbits, 0, precells, prehashes, minq, true, maxReads, 1, 1, 1, 1, null); + assert(filterMax0.6){ + outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" : + uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy for kmers of depth under 3. Ideal load is under 60% used." 
+ + "\nFor better accuracy, run on a node with more memory; quality-trim or error-correct reads; " + + "or increase the values of the minprob flag to reduce spurious kmers."); + } + ht.stop(); + + if(DISPLAY_PROGRESS){ + outstream.println("Prefilter time:\t"+ht); + outstream.println("After prefilter:"); + printMemory(); + outstream.println(); + } + }else{prefilter=false;} + } + + /* Fill tables with kmers */ + long added=loadKmers(t); + + if(DISPLAY_PROGRESS){ + outstream.println("Final:"); + printMemory(); + outstream.println(); + } + + /* Write statistics to files */ + writeStats(System.nanoTime()-startTime); + + outstream.println("Input: \t"+readsIn+" reads \t\t"+basesIn+" bases."); + + if(qtrimLeft || qtrimRight){ + outstream.println("QTrimmed: \t"+readsTrimmed+" reads ("+String.format("%.2f",readsTrimmed*100.0/readsIn)+"%) \t"+ + basesTrimmed+" bases ("+String.format("%.2f",basesTrimmed*100.0/basesIn)+"%)"); + } + if(minAvgQuality>0 || maxNs>=0){ + outstream.println("Low quality discards: \t"+lowqReads+" reads ("+String.format("%.2f",lowqReads*100.0/readsIn)+"%) \t"+ + lowqBases+" bases ("+String.format("%.2f",lowqBases*100.0/basesIn)+"%)"); + } + outstream.println("Result: \t"+readsOut+" reads ("+String.format("%.2f",readsOut*100.0/readsIn)+"%) \t"+ + basesOut+" bases ("+String.format("%.2f",basesOut*100.0/basesIn)+"%)"); + outstream.println("Unique Kmers: \t"+added); + } + + /** + * Write processing statistics in DUK's format. 
+ * @param time Elapsed time, nanoseconds + */ + private void writeStats(long time){ + if(outstats==null){return;} /* no stats file was requested */ + final TextStreamWriter tsw=new TextStreamWriter(outstats, overwrite, false, false); + tsw.start(); + tsw.println(dukString(time)); + tsw.poisonAndWait(); + } + + /** + * Helper method; formats statistics to be duk-compatible. + * The report has five sections: input parameters (file names, kmer length, quality-trim setting), + * the total number of kmers stored across all tables in keySets, the elapsed time in seconds, + * the number of input reads, and the value labelled "Avg number of Kmer for each read", + * which is computed with integer division as (basesIn/max(readsIn, 1))-k. + * @param time Elapsed time, nanoseconds + * @return duk output string + */ + private String dukString(long time){ + StringBuilder sb=new StringBuilder(); + sb.append("##INPUT PARAMETERS##\n"); + sb.append("#Query file: "+in1+(in2==null ? "" : ","+in2)+"\n"); + sb.append("#Not matched reads file: "+out1+(out2==null ? "" : ","+out2)+"\n"); + sb.append("#Output file (stats): "+outstats+"\n"); + sb.append("#Mer size: "+k+"\n"); + long size=0; /* total kmers held by all tables; printed in the REFERENCE STAT section */ + for(AbstractKmerTable x : keySets){size+=x.size();} + sb.append("#Quality trim: "+((qtrimLeft || qtrimRight) ? trimq : "false")+"\n"); + sb.append("\n"); + + sb.append("##REFERENCE STAT##\n"); +// sb.append("#Total Reads: "+refReads+"\n"); +// sb.append("#Total Bases: "+refBases+"\n"); +// sb.append("#Total kmers: "+refKmers+"\n"); + sb.append("#Total stored kmers: "+size+"\n"); + sb.append("\n"); + + sb.append("## ELAPSED TIME##\n"); + sb.append("# Time: "+String.format("%.2f", time/1000000000.0)+" seconds\n"); + sb.append("\n"); + + sb.append("##QUERY FILE STAT##\n"); + sb.append("# Total number of reads: "+readsIn+"\n"); + sb.append("\n"); + + sb.append("##P-VALUE##\n"); + sb.append("#Avg number of Kmer for each read: "+((basesIn/(Tools.max(readsIn, 1)))-k)+"\n"); +// sb.append("# P value for the given threshold 1 is 4.05231e-14\n"); //duk prints a P value; not sure what it means + sb.append("\n"); + + return sb.toString(); + } + + + /*--------------------------------------------------------------*/ + /*---------------- Inner Methods ----------------*/ + /*--------------------------------------------------------------*/ + /** + * load reads into tables, using multiple ProcessThread. 
+ * @param t + */ + private long loadKmers(Timer t){ + + /* Create read input stream */ + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, false, ff1, ff2); + Thread cristhread=new Thread(cris); + cristhread.start(); + } + + /* Create read output streams */ + RTextOutputStream3 ros=null; + if(out1!=null){ + final int buff=(!ORDERED ? 12 : Tools.max(32, 2*Shared.THREADS)); + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, ORDERED); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, ORDERED); + ros=new RTextOutputStream3(ff1, ff2, null, null, buff, null, true); + ros.start(); + } + if(ros!=null){ + t.stop(); + outstream.println("Started output stream:\t"+t); + t.start(); + } + + /* Optionally skip the first reads, since initial reads may have lower quality */ + if(skipreads>0){ + long skipped=0; + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(skipped0){ + skipped+=reads.size(); + + if(ros!=null){ + ros.add(new ArrayList(1), ln.id); + } + + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + if(reads==null || reads.isEmpty()){ + ReadWrite.closeStreams(cris, ros); + System.err.println("Skipped all of the reads."); + System.exit(0); + } + } + + /* Create ProcessThreads */ + ArrayList alpt=new ArrayList(THREADS); + for(int i=0; i ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + //While there are more reads lists... + while(reads!=null && reads.size()>0){ + + //For each read (or pair) in the list... 
+ for(int i=0; i0){ + if(r1!=null && r1.quality!=null && r1.avgQuality()=0){ + if(r1!=null && r1.countUndefined()>maxNs){r1.setDiscarded(true);} + if(r2!=null && r2.countUndefined()>maxNs){r2.setDiscarded(true);} + } + + int rlen1=0, rlen2=0; + if(r1!=null){ + if(qtrimLeft || qtrimRight){ + int x=TrimRead.trimFast(r1, qtrimLeft, qtrimRight, trimq, 1); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + rlen1=r1.bases==null ? 0 : r1.bases.length; + if(rlen10 ? 1 : 0); + } + rlen2=r2.bases==null ? 0 : r2.bases.length; + if(rlen2>>2)|(x2<=k){ + final long key=toValue(kmer, rkmer); + if(!prefilter || prefilterArray.read(key)>filterMax){ + int temp=table.incrementAndReturnNumCreated(key); + created+=temp; + if(verbose){System.err.println("Added "+temp);} + } + } + } + return created; + } + + /*--------------------------------------------------------------*/ + + /** Input read stream */ + private final ConcurrentReadStreamInterface cris; + + private final HashBuffer table; + + public long added=0; + + private long readsInT=0; + private long basesInT=0; + private long readsOutT=0; + private long basesOutT=0; + private long readsTrimmedT=0; + private long basesTrimmedT=0; + private long lowqReadsT=0; + private long lowqBasesT=0; + + } + + /*--------------------------------------------------------------*/ + /*---------------- Helper Methods ----------------*/ + /*--------------------------------------------------------------*/ + + + public boolean dumpKmersAsText(String fname, int k){ + if(fname==null){return true;} + TextStreamWriter tsw=new TextStreamWriter(fname, overwrite, false, true); + tsw.start(); + for(AbstractKmerTable set : keySets){ + set.dumpKmersAsText(tsw, k); + } + tsw.poisonAndWait(); + return tsw.errorState; + } + + + /*--------------------------------------------------------------*/ + /*---------------- Static Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** Print statistics about current memory 
use and availability */ + private static final void printMemory(){ + if(GC_BEFORE_PRINT_MEMORY){ + System.gc(); + System.gc(); + } + Runtime rt=Runtime.getRuntime(); + long mmemory=rt.maxMemory()/1000000; + long tmemory=rt.totalMemory()/1000000; + long fmemory=rt.freeMemory()/1000000; + long umemory=tmemory-fmemory; + outstream.println("Memory: "+/*"max="+mmemory+"m, total="+tmemory+"m, "+*/"free="+fmemory+"m, used="+umemory+"m"); + } + + /** + * Transforms a kmer into a canonical value stored in the table. Expected to be inlined. + * @param kmer Forward kmer + * @param rkmer Reverse kmer + * @param lengthMask Bitmask with single '1' set to left of kmer + * @return Canonical value + */ + private final long toValue(long kmer, long rkmer){ + long value=(rcomp ? Tools.max(kmer, rkmer) : kmer); + return value; + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Has this class encountered errors while processing? */ + public boolean errorState=false; + + /** Use a count-min prefilter for low-depth kmers */ + public boolean prefilter=false; + + /** Initial size of data structures */ + private int initialSize=128000; + /** Hold kmers. A kmer X such that X%WAYS=Y will be stored in keySets[Y] */ + private final AbstractKmerTable[] keySets; + /** A scaffold's name is stored at scaffoldNames.get(id). + * scaffoldNames[0] is reserved, so the first id is 1. */ + private final ArrayList scaffoldNames=new ArrayList(); + + private KCountArray prefilterArray=null; + + /** Input reads */ + private String in1=null, in2=null; + /** Output reads */ + private String out1=null, out2=null; + /** Statistics output file */ + private String outstats=null; + + /** Maximum input reads (or pairs) to process. Does not apply to references. -1 means unlimited. */ + private long maxReads=-1; + /** Output reads in input order. May reduce speed. 
*/ + private boolean ORDERED=false; + + private int buflen=1000; + + long readsIn=0; + long basesIn=0; + long readsOut=0; + long basesOut=0; + long readsTrimmed=0; + long basesTrimmed=0; + long lowqReads=0; + long lowqBases=0; + +// long refReads=0; +// long refBases=0; +// long refKmers=0; + + /*--------------------------------------------------------------*/ + /*---------------- Final Primitives ----------------*/ + /*--------------------------------------------------------------*/ + + /** Number of tables (and threads, during loading) */ + private final int WAYS; + + /** Filter kmers up to this level; don't store them in primary data structure */ + private final int filterMax; + + /** Look for reverse-complements as well as forward kmers. Default: true */ + private final boolean rcomp; + /** Use HashForest data structure */ + private final boolean useForest; + /** Use KmerTable data structure */ + private final boolean useTable; + /** Use HashArray data structure (default) */ + private final boolean useArray; + + /** Normal kmer length */ + private final int k; + /** k-1; used in some expressions */ + private final int k2; + + /** Quality-trim the left side */ + private final boolean qtrimLeft; + /** Quality-trim the right side */ + private final boolean qtrimRight; + /** Trim bases at this quality or below. Default: 4 */ + private final byte trimq; + /** Throw away reads below this average quality before trimming. Default: 0 */ + private final byte minAvgQuality; + /** Throw away reads containing more than this many Ns. Default: -1 (disabled) */ + private final int maxNs; + /** Throw away reads shorter than this after trimming. Default: 20 */ + private final int minReadLength; + + /** True iff java was launched with the '-ea' flag */ + private final boolean EA; + /** Skip this many initial input reads */ + private final long skipreads; + + /** Pairs go to outbad if either of them is bad, as opposed to requiring both to be bad. + * Default: true. 
*/ + private final boolean removePairsIfEitherBad; + + + /*--------------------------------------------------------------*/ + /*---------------- Static Fields ----------------*/ + /*--------------------------------------------------------------*/ + + public static int VERSION=1; + + /** Print messages to this stream */ + private static PrintStream outstream=System.err; + /** Permission to overwrite existing files */ + public static boolean overwrite=false; + /** Print speed statistics upon completion */ + public static boolean showSpeed=true; + /** Display progress messages such as memory usage */ + public static boolean DISPLAY_PROGRESS=true; + /** Verbose messages */ + public static final boolean verbose=false; + /** Number of ProcessThreads */ + public static int THREADS=Shared.THREADS; + /** Indicates end of input stream */ + private static final ArrayList POISON=new ArrayList(0); + /** Do garbage collection prior to printing memory usage */ + private static final boolean GC_BEFORE_PRINT_MEMORY=false; + + /*--------------------------------------------------------------*/ + /*---------------- Static Initializers ----------------*/ + /*--------------------------------------------------------------*/ + +// static{ +// } + + + +} diff --git a/current/jgi/CountUniqueness.java b/current/jgi/CountUniqueness.java new file mode 100755 index 0000000..0146586 --- /dev/null +++ b/current/jgi/CountUniqueness.java @@ -0,0 +1,114 @@ +package jgi; + +import java.io.PrintStream; +import java.util.ArrayList; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.Read; +import align2.ListNum; +import align2.Tools; +import dna.AminoAcid; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.ReadWrite; + +/** + * TODO + * @author Brian Bushnell + * @date Jan 14, 2014 + * + */ +public class CountUniqueness { + + + public void process(){ + Timer t=new Timer(); + t.start(); + for(String s : in){ + process(s); + } + + 
t.stop(); + + double rpnano=readsProcessed/(double)(t.elapsed); + double bpnano=basesProcessed/(double)(t.elapsed); + + String rpstring=(readsProcessed<100000 ? ""+readsProcessed : readsProcessed<100000000 ? (readsProcessed/1000)+"k" : (readsProcessed/1000000)+"m"); + String bpstring=(basesProcessed<100000 ? ""+basesProcessed : basesProcessed<100000000 ? (basesProcessed/1000)+"k" : (basesProcessed/1000000)+"m"); + + while(rpstring.length()<8){rpstring=" "+rpstring;} + while(bpstring.length()<8){bpstring=" "+bpstring;} + + outstream.println("Time: \t"+t); + outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000)); + outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + + if(errorState){ + throw new RuntimeException(this.getClass().getName()+" terminated in an error state; the output may be corrupt."); + } + } + + private void process(Read r1, Read r2){ + if(r1==null || r2==null){return;} + readsProcessed++; + basesProcessed+=r1.bases==null ? 0 : r1.bases.length; + readsProcessed++; + basesProcessed+=r2.bases==null ? 0 : r2.bases.length; + assert(false) : "TODO"; + } + + public void process(String fname){ + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + FileFormat ff=FileFormat.testInput(fname, FileFormat.SAM, null, true, false); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, false, ff, null); + if(verbose){System.err.println("Starting cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + + for(int idx=0; idx jgi.Dedupe "); + outstream.println("\nOptional flags:"); + outstream.println("in= \tThe 'in=' flag is needed if the input file is not the first parameter. 
'in=stdin' will pipe from standard in."); + outstream.println("out= \tThe 'out=' flag is needed if the output file is not the second parameter. 'out=stdout' will pipe to standard out."); + outstream.println(""); + outstream.println("threads=auto \t(t) Set number of threads to use; default is number of logical processors."); + outstream.println("overwrite=t \t(ow) Set to false to force the program to abort rather than overwrite an existing file."); + outstream.println("showspeed=t \t(ss) Set to 'f' to suppress display of processing speed."); + outstream.println("minscaf=0 \t(ms) Ignore contigs/scaffolds shorter than this."); + outstream.println("interleaved=auto \tIf true, forces fastq input to be paired and interleaved."); + + outstream.println("absorbrc=t \t(arc) Absorb reverse-complements as well as normal orientation."); + outstream.println("absorbmatch=t \t(am) Absorb exact matches of contigs."); + outstream.println("absorbcontainment=t\t(ac) Absorb full containments of contigs."); + outstream.println("absorboverlap=f \t(ao) Absorb (merge) non-contained overlaps of contigs."); + + outstream.println("numaffixmaps=1 \t(nam) Set to 2 to index two prefixes and suffixes per contig."); + outstream.println("ignoreaffix1=f \t(ia1) Ignore first affix (for testing)."); + outstream.println("storesuffix=f \t(ss) Store suffix as well as prefix. 
Automatically set to true when doing inexact matches."); + + outstream.println("findoverlap=f \t(fo) Find overlaps between contigs (containments and non-containments)."); + outstream.println("cluster=f \t(c) Group overlapping contigs into clusters."); + outstream.println("fixmultijoins=t \t(fmj) Remove redundant overlaps between the same two contigs."); + outstream.println("removecycles=t \t(rc) Remove all cycles so clusters form trees."); + outstream.println("renameclusters=f \t(rnc) Rename contigs to indicate which cluster they are in."); + outstream.println("minclustersize=1 \t(mcs) Don't output clusters smaller than this."); + outstream.println("cc=t \t(canonicizeclusters) Flip contigs so clusters have a single orientation."); + outstream.println("fcc=f \t(fixcanoncontradictions) Truncate graph at nodes with canonization disputes."); + outstream.println("foc=f \t(fixoffsetcontradictions) Truncate graph at nodes with offset disputes."); + + outstream.println("storename=t \t(sn) Store contig names (set false to save memory)."); + outstream.println("storequality=t \t(sq) Store quality values for fastq assemblies (set false to save memory)."); + outstream.println("exact=t \t(ex) Only allow exact symbol matches. When false, an 'N' will match any symbol."); + outstream.println("touppercase=f \t(tuc) Change all input bases to upper case."); + outstream.println("uniquenames=t \t(un) Ensure all output contigs have unique names. Uses more memory."); + outstream.println("maxsubs=0 \t(s) Allow up to this many mismatches (substitutions only, no indels). May be set higher than maxedits."); + outstream.println("maxedits=0 \t(e) Allow up to this many edits (subs or indels). Higher is slower, so below 20 is suggested."); + //outstream.println("bandwidth=9 \t(bw) Width of banded alignment, if maxedits>0. To ensure correctness, set bandwidth=2*maxedits+1. 
Higher is slower."); + outstream.println("minidentity=100 \t(mid) Allow inter-sequence identity as low as this (subs only, no indels)."); + outstream.println("k=31 \tKmer length used for finding containments. Containments shorter than k will not be found."); + outstream.println("minlengthpercent=0 \t(mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed."); + outstream.println("minoverlappercent=0\t(mop) Overlap must be at least this percent of smaller contig's length to cluster and merge."); + outstream.println("minoverlap=200 \t(mo) Overlap must be at least this long to cluster and merge."); + + outstream.println("mopc=0 \t(minoverlappercentmerge) Overlap must be at least this percent of smaller contig's length to cluster."); + outstream.println("mopm=0 \t(minoverlappercentcluster) Overlap must be at least this percent of smaller contig's length to merge."); + outstream.println("moc=200 \t(minoverlapcluster) Overlap must be at least this long to cluster."); + outstream.println("mom=200 \t(minoverlapmerge) Overlap must be at least this long to merge."); + outstream.println("rt=f \t(rigoroustransitive) Ensure exact transitivity. Slow. For testing only."); + + outstream.println("ziplevel=2 \t(zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster."); + outstream.println("sort=f \tsort output by contig length (otherwise it will be random).\n" + + " \t'a' for ascending, 'd' for descending, 'f' for false (no sorting)."); + outstream.println(""); + outstream.println("Note! 
When allowing inexact alignments, if maxsubs is less than maxedits, maxsubs is set to maxedits."); + outstream.println("If maxsubs and minidentity yield different numbers for some contig, the more liberal is used for substitutions."); + outstream.println("For indels, minidentity is ignored and maxedits is always used (due to time and memory constraints)."); + outstream.println("Regardless of maxsubs, maxedits, or minidentity, no comparison will be made between two sequences unless "); + outstream.println("one contains the first or last k bases of the other, exactly, with no edits."); + + } + + public Dedupe(String[] args){ + for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}} + System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + ReadWrite.ZIPLEVEL=2; + //ReadWrite.USE_UNPIGZ=true; + FastaReadInputStream.SPLIT_READS=false; + boolean setOut=false; + int bandwidth_=-1; + int k_=31; + + { + boolean b=false; + assert(b=true); + EA=b; + } + + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in")){ + if(b.indexOf(',')>=0 && !new File(b).exists()){ + in=b.split(","); + }else{ + in=new String[] {b}; + } + }else if(a.equals("out")){ + out=b; + setOut=true; + }else if(a.equals("csf") || a.equals("clusterstatsfile")){ + csfOut=b; + }else if(a.equals("mcsfs") || a.equals("minclustersizeforstats")){ + minClusterSizeForStats=Integer.parseInt(b); + }else if(a.equals("mcs") || a.equals("minclustersize")){ + minClusterSize=Integer.parseInt(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("sort")){ + if(b==null){sort=true;} + else if(b.equalsIgnoreCase("a")){ + sort=true; + ascending=true; + }else if(b.equalsIgnoreCase("d")){ + sort=true; + ascending=false; + }else{ + sort=Tools.parseBoolean(b); + } + }else 
if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("arc") || a.equals("absorbrc") || a.equals("trc") || a.equals("testrc")){ + ignoreReverseComplement=!Tools.parseBoolean(b); + }else if(a.equals("ac") || a.equals("absorbcontainment") || a.equals("absorbcontainments") || a.equals("tc") || a.equals("testcontainment") || a.equals("containment")){ + absorbContainment=Tools.parseBoolean(b); + }else if(a.equals("am") || a.equals("absorbmatch") || a.equals("absorbmatches") || a.equals("tm") || a.equals("testmatch")){ + absorbMatch=Tools.parseBoolean(b); + }else if(a.equals("ao") || a.equals("absorboverlap") || a.equals("absorboverlaps") || a.equals("to") || a.equals("testoverlap")){ + absorbOverlap=Tools.parseBoolean(b); + }else if(a.equals("fo") || a.equals("findoverlap") || a.equals("findoverlaps")){ + findOverlaps=Tools.parseBoolean(b); + }else if(a.equals("c") || a.equals("cluster") || a.equals("clusters")){ + makeClusters=Tools.parseBoolean(b); + }else if(a.equals("fmj") || a.equals("fixmultijoin") || a.equals("fixmultijoins")){ + fixMultiJoins=Tools.parseBoolean(b); + }else if(a.equals("fcc") || a.equals("fixcanoncontradiction") || a.equals("fixcanoncontradictions")){ + fixCanonContradictions=Tools.parseBoolean(b); + }else if(a.equals("foc") || a.equals("fixoffsetcontradiction") || a.equals("fixoffsetcontradictions")){ + fixOffsetContradictions=Tools.parseBoolean(b); + }else if(a.equals("cc") || a.equals("canonicizecluster") || a.equals("canonicizeclusters")){ + canonicizeClusters=Tools.parseBoolean(b); + }else if(a.equals("pc") || a.equals("processcluster") || a.equals("processclusters")){ + processClusters=Tools.parseBoolean(b); + }else if(a.equals("rnc") || a.equals("renamecluster") || 
a.equals("renameclusters")){ + renameClusters=Tools.parseBoolean(b); + if(renameClusters){storeName=false;} + }else if(a.equals("rc") || a.equals("removecycles") || a.equals("removecycle")){ + removeCycles=Tools.parseBoolean(b); +// }else if(a.equals("tuc") || a.equals("touppercase")){ +// toUpperCase=Tools.parseBoolean(b); + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("k")){ + k_=Integer.parseInt(b); + assert(k_>0 && k_<32) : "k must be between 1 and 31; default is 31, and lower values are slower."; + }else if(a.equals("minscaf") || a.equals("ms")){ + MINSCAF=FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("mlp") || a.equals("minlengthpercent")){ + minLengthPercent=Float.parseFloat(b); + }else if(a.equals("mop") || a.equals("minoverlappercent")){ + minOverlapPercentCluster=minOverlapPercentMerge=Float.parseFloat(b); + }else if(a.equals("mopc") || a.equals("minoverlappercentcluster")){ + minOverlapPercentCluster=Float.parseFloat(b); + }else if(a.equals("mopm") || a.equals("minoverlappercentmerge")){ + minOverlapPercentMerge=Float.parseFloat(b); + }else if(a.equals("mo") || a.equals("minoverlap")){ + minOverlapCluster=minOverlapMerge=Integer.parseInt(b); + }else if(a.equals("moc") || a.equals("minoverlapcluster")){ + minOverlapCluster=Integer.parseInt(b); + }else if(a.equals("mom") || a.equals("minoverlapmerge")){ + minOverlapMerge=Integer.parseInt(b); + }else if(a.equals("rt") || a.equals("rigoroustransitive")){ + rigorousTransitive=Tools.parseBoolean(b); + }else if(a.equals("e") || a.equals("maxedits")){ + maxEdits=Integer.parseInt(b); + }else if(a.equals("s") || a.equals("maxsubs") || a.equals("maxsubstitutions")){ + maxSubs=Integer.parseInt(b); + }else if(a.equals("bw") || a.equals("bandwidth")){ + bandwidth_=Integer.parseInt(b); + }else if(a.equals("mid") || 
a.equals("minidentity")){ + minIdentity=Float.parseFloat(b); + minIdentityMult=(minIdentity==100f ? 0 : (100f-minIdentity)/100f); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=(b==null || b.equalsIgnoreCase("auto") ? Shared.THREADS : Integer.parseInt(b)); + }else if(a.equals("showspeed") || a.equals("ss")){ + showSpeed=Tools.parseBoolean(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); +// BandedAligner.verbose=verbose; + }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){ + maxNs=Integer.parseInt(b); + }else if(a.equals("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("sn") || a.equals("storename") || a.equals("storenames")){ + storeName=Tools.parseBoolean(b); + }else if(a.equals("ssx") || a.equals("storesuffix") || a.equals("storesuffixes")){ + storeSuffix=Tools.parseBoolean(b); + }else if(a.equals("numaffixmaps") || a.equals("nam")){ + numAffixMaps=Integer.parseInt(b); + }else if(a.equals("mac") || a.equals("maxaffixcopies")){ + maxAffixCopies=Integer.parseInt(b); + }else if(a.equals("me") || a.equals("maxedges")){ + maxEdges=Integer.parseInt(b); + maxEdges2=maxEdges*2; + if(maxEdges2<1){maxEdges2=Integer.MAX_VALUE-1;} + }else if(a.equals("ignoreaffix1") || a.equals("ia1")){ + ignoreAffix1=Tools.parseBoolean(b); + }else if(a.equals("parsedepth") || a.equals("pd")){ + parseDepth=Tools.parseBoolean(b); + }else if(a.equals("depthmult") || a.equals("depthratio") || a.equals("dr")){ + depthRatio=Float.parseFloat(b); + if(depthRatio<=0){ + parseDepth=false; + }else{ + parseDepth=true; + assert(depthRatio>0); + if(depthRatio<1){depthRatio=1/depthRatio;} + } + }else if(a.equals("storequality") || a.equals("sq")){ + storeQuality=Tools.parseBoolean(b); + }else if(a.equals("exact") || a.equals("ex")){ + exact=Tools.parseBoolean(b); + }else if(a.equals("uniquenames") || a.equals("un")){ + uniqueNames=Tools.parseBoolean(b); + }else if(a.equals("fastawrap")){ 
+ FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(i==0 && in==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + String c=args[i]; + if(c.indexOf(',')>=0 && !new File(c).exists()){ + in=c.split(","); + }else{ + in=new String[] {c}; + } + }else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + out=args[i]; + setOut=true; + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(verbose){ + ReadWrite.verbose=ConcurrentGenericReadInputStream.verbose=RTextOutputStream3.verbose=ByteFile1.verbose=ByteFile2.verbose=FastqReadInputStream.verbose=true; + } + verbose=false; + + k=k_; + k2=k-1; + + BandedAligner.penalizeOffCenter=true; + + if(absorbOverlap){processClusters=true;} + if(processClusters || renameClusters){makeClusters=true;} + if(makeClusters){findOverlaps=true;} + if(renameClusters){uniqueNames=/*storeName=*/false;} + + if(bandwidth_>-1){ + bandwidth=Tools.min(bandwidth_, 2*maxEdits+1); + }else{ + bandwidth=2*maxEdits+1; + } + maxSubs=Tools.max(maxSubs, maxEdits); + if(maxSubs>0 || minIdentity<100 || findOverlaps){storeSuffix=true;} + + assert(FastaReadInputStream.settingsOK()); + + if(in==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + + for(int i=0; i1; + for(int i=0; i alht=new ArrayList(THREADS); + for(int i=0; i list=new ArrayList((int)addedToMain); + for(ArrayList alu : codeMap.values()){ + for(Unit u : alu){ + assert(u.r.mate==null) : "Containments are not currently supported with paired reads."; + if(u.valid() && u.r.pairnum()==0){list.add(u.r);} + } + } + + // if(minLengthPercent>0){ + // if(verbose){System.err.println("Sorting.");} + // Collections.sort(list, ReadLengthComparator.comparator); + // Collections.reverse(list); + // assert(list.isEmpty() || list.get(0).bases.length<=list.get(list.size()-1).bases.length) : + // list.get(0).bases.length+", "+list.get(list.size()-1).bases.length; + // } + + crisa=new 
ConcurrentCollectionReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, -1)}; + Thread cristhread=new Thread(crisa[0]); + cristhread.start(); + + ArrayList alht=new ArrayList(THREADS); + for(int i=0; i list=new ArrayList((int)addedToMain); + for(ArrayList alu : codeMap.values()){ + for(Unit u : alu){ + if(u.valid() && u.r.pairnum()==0){list.add(u.r);} + } + } + + crisa=new ConcurrentCollectionReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, -1)}; + Thread cristhread=new Thread(crisa[0]); + cristhread.start(); + + ArrayList alht=new ArrayList(THREADS); + for(int i=0; i>(list.size()/4+1); + processedClusters=new ArrayList>(); + }else{ + assert(clusterQueue.isEmpty()); + } + makeClusters(t, list); + } + + list.clear(); + } + + private long makeTransitive(Timer t, ArrayList list, boolean rigorous){ + assert(false) : "No longer needed."; + long added=0; + for(Read r : list){ + assert(r!=null); + Unit u=(Unit) r.obj; + assert(u!=null); + assert(u.valid()); +// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); + if(u.valid()){ + + if(u.overlapList!=null){ + for(Overlap o : u.overlapList){ + Unit u2=(o.u1==u ? o.u2 : o.u1); + assert(u2!=u); + if(u2.overlapList==null){ + u2.overlapList=new ArrayList(2); + u2.overlapList.add(o); + }else{ + boolean found=false; + if(rigorous){ + found=u2.overlapList.contains(o); + }else{ + for(Overlap o2 : u2.overlapList){ + if(o2.u1==u || o2.u2==u){found=true; break;} + } + } + if(!found){ + added++; + u2.overlapList.add(o); + } + } + } + } + } + } + + for(Read r : list){ + Unit u=(Unit) r.obj; + if(u.valid()){ + assert(u.isTransitive()); + } + } + + if(DISPLAY_PROGRESS){ + t.stop(); + outstream.println("Added overlaps: "+added); + outstream.println("Made overlaps transitive. 
Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + return added; + } + + private int countIntransitive(Timer t, ArrayList list, boolean rigorous){ + if(!countTransitive){return 0;} + int transitive=0, intransitive=0; + for(Read r : list){ + assert(r!=null); + Unit u=(Unit) r.obj; + assert(u!=null); + assert(u.valid()); +// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); + if(u.valid()){ + if(rigorous ? u.isPerfectlyTransitive() : u.isTransitive()){ + transitive++; + }else{ + intransitive++; + } + } + } + + if(DISPLAY_PROGRESS){ + t.stop(); + outstream.println("Intransitive: "+intransitive+", \ttransitive: "+transitive); + outstream.println("Checked transitivity. Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + + return intransitive; + } + + private int countRedundant(Timer t, ArrayList list){ + if(!countRedundant){return 0;} + int redundant=0, nonredundant=0; + for(Read r : list){ + assert(r!=null); + Unit u=(Unit) r.obj; + assert(u!=null); + assert(u.valid()); +// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? "null" : u.overlapList.size())); + if(u.valid()){ + if(u.isNonRedundant()){ + nonredundant++; + }else{ + redundant++; + } + } + } + + if(DISPLAY_PROGRESS){ + t.stop(); + outstream.println("Redundant: "+redundant+", \tnonredundant: "+nonredundant); + outstream.println("Checked redundancy. Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + return redundant; + } + + private long countOverlaps(Timer t, ArrayList list){ + + long overlaps=0, length=0; + for(Read r : list){ + assert(r!=null); + Unit u=(Unit) r.obj; + assert(u!=null); + assert(u.valid()); +// Data.sysout.println("Considering "+r.id+"; valid="+u.valid()+", overlaps="+(u.overlapList==null ? 
"null" : u.overlapList.size())); + if(u.valid() && u.overlapList!=null){ + for(Overlap o : u.overlapList){ + overlaps++; + length+=o.overlapLen; + } + } + } + + if(DISPLAY_PROGRESS){ + t.stop(); + outstream.println("Overlaps: "+overlaps+", \tlength: "+length); + outstream.println("Counted overlaps. Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + return overlaps; + } + + private void makeClusters(Timer t, ArrayList list){ + + int[] clusterSize=new int[70000]; + int max=0; + for(Read r : list){ + Unit u=(Unit) r.obj; + + if(!u.clustered()){ + ArrayList cluster=u.makeCluster(); + if(cluster.size()>2){cluster.trimToSize();} + if(cluster.size()==1 || (!processClusters)){processedClusters.add(cluster);} + else{clusterQueue.add(cluster);} + clusterSize[Tools.min(clusterSize.length-1, cluster.size())]++; + max=Tools.max(max, cluster.size()); + } + } + + if(DISPLAY_PROGRESS){ + t.stop(); + outstream.println(toClusterSizeString(clusterSize)); + outstream.println("Largest: "+max); + outstream.println("Finished making clusters. Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + + + long x=removeInvalid(list); + + if(DISPLAY_PROGRESS){ + t.stop(); + outstream.println("Removed "+x+" invalid entries."); + outstream.println("Finished invalid removal. 
Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + } + + private String toClusterSizeString(int[] clusterSize){ + + long totalClusters=Tools.sum(clusterSize); + + final StringBuilder sb=new StringBuilder(100), sb2=new StringBuilder(1000); + sb2.append("Clusters: "+totalClusters); + final int spaces=19; + for(int i=0; i=clusterSize.length){ + long x=Tools.sum(clusterSize, a, clusterSize.length-1); + if(x>0){ + sb.append("\nSize "+a+"+:"); + while(sb.length()0){ + sb.append("\nSize "+a+"-"+b+":"); + while(sb.length() alu : processedClusters){ + for(int i=0; i alct=new ArrayList(THREADS); + for(int i=0; i cluster : processedClusters){ + clusterSize[Tools.min(clusterSize.length-1, cluster.size())]++; + max=Tools.max(max, cluster.size()); + } + outstream.println(toClusterSizeString(clusterSize)); + outstream.println("Largest: "+max); + + outstream.println("Finished processing. Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + } + + private long removeInvalid(ArrayList list){ + final LongM keym=new LongM(); + long removedC=0, removedP=0, removedS=0, invalid=0; + + for(int j=0, lim=list.size(); j alu=codeMap.get(key); + if(alu!=null){ + int valid=0; + for(int i=alu.size()-1; i>=0; i--){ + Unit u2=alu.get(i); + if(u2==null || !u2.valid()){ + alu.remove(i); + removedC++; + } + else{valid++;} + } + if(valid==0){codeMap.remove(key);} + } + } + + if(affixMap1!=null && !affixMap1.isEmpty()){ + { + keym.set(u.prefix1); + ArrayList alu=affixMap1.get(keym); + if(alu!=null){ + int valid=0; + for(int i=alu.size()-1; i>=0; i--){ + Unit u2=alu.get(i); + if(u2==null || !u2.valid()){ + alu.remove(i); + removedP++; + } + else{valid++;} + } + if(valid==0){affixMap1.remove(keym);} + } + } + if(storeSuffix){ + keym.set(u.suffix1); + ArrayList alu=affixMap1.get(keym); + if(alu!=null){ + int valid=0; + for(int i=alu.size()-1; i>=0; i--){ + Unit u2=alu.get(i); + if(u2==null || !u2.valid()){ + alu.remove(i); + removedS++; + } + else{valid++;} + } + 
if(valid==0){affixMap1.remove(keym);} + } + } + } + if(affixMap2!=null && !affixMap2.isEmpty()){ + if(u.prefix2!=-1){ + keym.set(u.prefix2); + ArrayList alu=affixMap2.get(keym); + if(alu!=null){ + int valid=0; + for(int i=alu.size()-1; i>=0; i--){ + Unit u2=alu.get(i); + if(u2==null || !u2.valid()){ + alu.remove(i); + removedP++; + } + else{valid++;} + } + if(valid==0){affixMap2.remove(keym);} + } + } + if(storeSuffix && u.suffix2!=-1){ + keym.set(u.suffix2); + ArrayList alu=affixMap2.get(keym); + if(alu!=null){ + int valid=0; + for(int i=alu.size()-1; i>=0; i--){ + Unit u2=alu.get(i); + if(u2==null || !u2.valid()){ + alu.remove(i); + removedS++; + } + else{valid++;} + } + if(valid==0){affixMap2.remove(keym);} + } + } + + } + + list.set(j, null); + } + } + + if(invalid>0){ + Tools.condenseStrict(list); + } + if(verbose){ + outstream.println("Removed invalids: "+removedC+", "+removedP+", "+removedS); + } + return invalid; + } + + + private static ArrayList addToArray(HashMap> codeMap, boolean sort, boolean ascending, boolean clear, long outNum){ + assert(outNum<=Integer.MAX_VALUE); + if(verbose){System.err.println("Making list.");} + ArrayList list=new ArrayList((int)outNum); + if(verbose){System.err.println("Adding.");} + for(ArrayList alu : codeMap.values()){ + for(Unit u : alu){ + if(u.valid() && u.r.pairnum()==0){list.add(u.r);} + } + if(clear){alu.clear();} + } + if(clear){codeMap.clear();} + + if(sort){ + if(verbose){System.err.println("Sorting.");} + Collections.sort(list, ReadLengthComparator.comparator); + if(ascending){ + Collections.reverse(list); + assert(list.isEmpty() || list.get(0).bases.length<=list.get(list.size()-1).bases.length) : + list.get(0).bases.length+", "+list.get(list.size()-1).bases.length; + }else{ + assert(list.isEmpty() || list.get(0).bases.length>=list.get(list.size()-1).bases.length) : + list.get(0).bases.length+", "+list.get(list.size()-1).bases.length; + } + } + assert(list.size()==outNum) : list.size()+", "+outNum; + return list; 
+ } + + private void writeOutput(TextStreamWriter tsw, String clusterStatsFile, Timer t){ +// verbose=true; + if(processedClusters==null || processedClusters.isEmpty()){ + ArrayList list=addToArray(codeMap, sort, ascending, true, addedToMain-containments); + codeMap=null; + + if(sort){ + if(DISPLAY_PROGRESS){ + t.stop(); + outstream.println("Sorted output. Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + } + + writeOutput(tsw, list); + }else{ + writeOutputClusters(tsw, clusterStatsFile, processedClusters); + } + + if(DISPLAY_PROGRESS){ + t.stop(); + outstream.println("Printed output. Time: "+t); + printMemory(); + outstream.println(); + t.start(); + } + } + + + + private void writeOutput(TextStreamWriter tsw, ArrayList list){ + + if(verbose){System.err.println("Writing from array.");} + tsw.start(); + + HashSet names=((uniqueNames && storeName) ? + new HashSet(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); + long rid=0; + for(int x=0; x> clist){ + + Collections.sort(clist, CLUSTER_LENGTH_COMPARATOR); + + if(verbose){System.err.println("Writing clusters.");} + tsw.start(); + + TextStreamWriter csf=null; + if(clusterStatsFile!=null){ + csf=new TextStreamWriter(clusterStatsFile, overwrite, false, false); + csf.start(); + csf.print("#Name\tsize\t"+nmerLength+"-mer frequencies\n"); + } + + HashSet names=((uniqueNames && storeName) ? 
+ new HashSet(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); + long rid=0; + final long[] nmerCounts=new long[maxNmer+1]; + + final StringBuilder sb=new StringBuilder(64); + + for(int cnum=0; cnum alu=clist.get(cnum); + clist.set(cnum, null); +// Collections.sort(alu); //TODO: Remove + + if(alu.size()=minClusterSizeForStats){ + float[] profile=makeNmerProfile(alu, nmerCounts); + sb.append("Cluster_"); + sb.append(cnum); + sb.append('\t'); + sb.append(alu.size()); + sb.append('\t'); + for(float f : profile){ + sb.append(String.format("%.5f ", f)); + } + sb.setCharAt(sb.length()-1, '\n'); + csf.print(sb.toString()); + sb.setLength(0); + } + + for(int contig=0; contig=0; i--){ + byte b=bases[i]; + assert(hashcodes[b]!=null) : "Invalid sequence character: '"+(char)b+"'"; + b=baseToComplementExtended[b]; + int mode=(int)(code&31); + code=code^hashcodes[b][mode]; + code=Long.rotateLeft(code, 1); + } + return code; + } + + + public static boolean isCanonical(byte[] bases){ + if(ignoreReverseComplement || bases==null || bases.length==0){return true;} + final int lim=(bases.length+1)/2; + for(int i=0, j=bases.length-1; i0 ? 
new BandedAligner(bandwidth) : null); + } + + public void run(){ + + final ArrayList temp=new ArrayList(1000); + + ArrayList cluster=null; + while((cluster=nextCluster())!=null){ + + if(EA){ + for(Unit u : cluster){assert(u.r.mate==null) : "Cluster processing/merging is not supported for paired reads, only cluster generation.";} + } + +// for(Unit u : cluster){assert(!u.visited());} + unvisit(cluster); + + reorderClusterBreadthFirst(cluster); + int multiJoinCount=findMultiJoinsInCluster(cluster, fixMultiJoinsT); + + if(EA){ + for(Unit u : cluster){assert(!u.visited());} + } + + boolean ok=true; + if(multiJoinCount!=0){ + assert(multiJoinCount>0); + multiJoinsFoundT+=multiJoinCount; + if(!fixMultiJoinsT){ + multiJoinFailuresT++; + ok=false; + } + } + + int canonContradictions=0; + if(ok && canonicizeT){ + if(EA){ + for(Unit u : cluster){ + assert(!u.visited()); + assert(!u.canonContradiction()); + assert(!u.canonicized()); + for(Overlap o : u.overlapList){ + assert(!o.invalid()); + assert(!o.canonContradiction()) : + o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+cluster.contains(o.u1)+", "+cluster.contains(o.u2); + } + } + } + canonContradictions=canonicizeClusterBreadthFirst(cluster, temp); +// System.err.println("Canonicized cluster of size "+cluster.size()+"; contradictions = "+canonContradictions+"; canonicized = "+temp.size()); + temp.clear(); + for(Unit u : cluster){assert(!u.visited());} + if(canonContradictions>0){ + canonContradictoryOverlapsT+=canonContradictions; + canonContradictoryClustersT++; + if(fixCanonContradictionsT){ + if(verbose){System.err.println("Pruning cluster to remove canonization contradictions.");} + fullyPruneCluster(cluster, temp); + if(verbose){System.err.println("Resulting size: "+cluster.size());} + if(EA){ + for(Unit u : cluster){ + assert(!u.visited()); + assert(!u.canonContradiction()); + assert(u.canonicized()); + for(Overlap o : u.overlapList){ + assert(!o.invalid()); + assert(!o.canonContradiction()); + 
assert(o.type==FORWARD) : "\n"+o+"\n"+ + o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+o.u1.canonicized()+", "+o.u2.canonicized()+ + "\n"+cluster.contains(o.u1)+", "+cluster.contains(o.u2)+", "+cluster.size(); + } + } + } + }else{ + ok=false; + } + } + } + + int cycleOverlaps=0; + if(ok){ + cycleOverlaps=findCycles(cluster, removeCycles); + for(Unit u : cluster){assert(!u.visited());} + if(cycleOverlaps>0){ + cycleOverlapsT+=cycleOverlaps; + cycleClustersT++; + } + } + + int offsetContradictions=0; + if(ok){ + if(EA){ + for(Unit u : cluster){ + assert(!u.visited()); + assert(!u.offsetContradiction()); + assert(!u.offsetValid()); + assert(u.canonicized()); + for(Overlap o : u.overlapList){ + assert(!o.invalid()); + assert(!o.offsetContradiction()); + assert(o.type==FORWARD) : o; + } + } + } + offsetContradictions=generateOffsetsBreadthFirst(cluster, temp); +// System.err.println("Made offsets for cluster of size "+cluster.size()+"; contradictions = "+offsetContradictions+"; set = "+temp.size()); + temp.clear(); + for(Unit u : cluster){assert(!u.visited());} + if(offsetContradictions>0){ + offsetContradictoryOverlapsT+=offsetContradictions; + offsetContradictoryClustersT++; + if(fixOffsetContradictionsT){ + if(verbose){System.err.println("Pruning cluster to remove offset contradictions.");} + fullyPruneCluster(cluster, temp); + if(verbose){System.err.println("Resulting size: "+cluster.size());} + if(EA){ + for(Unit u : cluster){ + assert(!u.visited()); + assert(!u.offsetContradiction()); + assert(u.offsetValid()); + for(Overlap o : u.overlapList){ + assert(!o.invalid()); + assert(!o.offsetContradiction()); + assert(o.type==FORWARD) : o; + } + } + } + }else{ + ok=false; + } + } + if(ok){Collections.sort(cluster, UNIT_OFFSET_COMPARATOR);} + } + + if(ok && absorbOverlap){ + mergeCluster(cluster); + } + + processedClustersT.add(cluster); + if(processedClustersT.size()>=threadMaxReadsToBuffer){ + synchronized(processedClusters){ + 
processedClusters.addAll(processedClustersT); + processedClustersT.clear(); + } + } + } + synchronized(processedClusters){ + processedClusters.addAll(processedClustersT); + processedClustersT.clear(); + } + } + + /** Returns next cluster larger than 1 element. + * Singleton clusters are added directly to 'processed'. */ + private ArrayList nextCluster(){ + synchronized(clusterQueue){ + ArrayList cluster=clusterQueue.poll(); + assert(cluster==null || cluster.size()>1); +// while(cluster!=null && cluster.size()<2){ +//// unmark(cluster); +// processedClustersT.add(cluster); +// cluster=clusterQueue.poll(); +// } + return cluster; + } + } + + private void fullyPruneCluster(ArrayList cluster, ArrayList temp){ + assert(cluster.size()>1) : cluster.size(); + ArrayList pruned=pruneCluster(cluster, true, true, temp); + assert(temp.isEmpty()); + assert(pruned==null || pruned.size()>0); + while(pruned!=null){ + ArrayList subcluster=pruned; + for(Unit u : subcluster){ + u.clearVolatileFlags(); + for(Overlap o : u.overlapList){ + o.clearVolatileFlags(); + } + } + assert(subcluster.size()>0); + pruned=pruneCluster(subcluster, false, false, temp); + assert(temp.isEmpty()); + assert(pruned==null || pruned.size()>0); + assert(subcluster.size()>0); + if(subcluster.size()==1){ + processedClustersT.add(subcluster); + }else{ + assert(subcluster.size()>1); + synchronized(clusterQueue){ + clusterQueue.add(subcluster); + } + } + } + } + + /** + * @param cluster + */ + private void mergeCluster(ArrayList cluster) { + if(cluster.size()==1){return;} + if(mergeLeavesT){ + mergeLeaves(cluster); + } + if(mergeInnerT){ + mergeInner(cluster); + } + } + + /** + * @param cluster + */ + private void unvisit(ArrayList cluster) { + for(Unit u : cluster){ + if(u.visited()){u.setVisited(false);} + } + } + + /** + * Finds places in the cluster where two Units are joined by multiple different Overlaps. + * Returns number of multijoins found. 
+ * @param cluster + */ + private int findMultiJoinsInCluster(ArrayList cluster, boolean resolveProblems) { + if(cluster.size()<2){return 0;} + int totalMultiJoins=0; + for(Unit ua : cluster){ + ArrayList list=ua.overlapList; + assert(list!=null); + if(list.size()>1){ + Collections.sort(list); + + int multiJoins=0; + for(int i=0; i0){ + totalMultiJoins+=multiJoins; + if(resolveProblems){Tools.condenseStrict(list);} + } + + for(int i=0; i cluster) { + if(verbose){System.err.println("reorderClusterBreadthFirst");} + + final int size=cluster.size(); + Collections.sort(cluster); //Now it is in descending length + final Unit root=cluster.get(0); + assert(root.length()>=cluster.get(size-1).length()) : root.length()+", "+cluster.get(size-1).length()+", "+root.compareTo(cluster.get(size-1)); + + ArrayList breadthFirst=new ArrayList(cluster.size()); + root.setVisited(true); +// System.err.println("root="+root.name()); + breadthFirst.add(root); + for(int i=0; i pruneCluster(ArrayList cluster, boolean pruneContradictoryNodes, boolean pruneContradictoryOverlaps, ArrayList visited){ + if(verbose){System.err.println("pruneCluster(size="+cluster.size()+", "+pruneContradictoryNodes+", "+pruneContradictoryOverlaps+")");} + + //pruneContradictoryOverlaps is less strict than pruneContradictoryNodes + assert(pruneContradictoryOverlaps || !pruneContradictoryNodes); + + for(Unit ua : cluster){ + assert(!ua.visited()); + assert(ua.isPerfectlyTransitive()) : ua; + if(ua.visited()){ua.setVisited(false);} + } + + int prunedOverlaps=0; + int visits=1; + + { + final Unit root=cluster.get(0); + assert(!root.contradiction()); + root.setVisited(true); + visited.add(root); + } + + for(int i=0; i list=ua.overlapList; + int removed=0; + for(int j=0; j0){Tools.condenseStrict(list);} + } + } + + if(verbose){System.err.println("cluster.size()="+cluster.size()+", visits="+visits+", visited.size()="+visited.size());} + +// if(visited.size()==11486){ //TODO: For testing. Remove. 
+// for(int i=0; i pruned=(numUnvisited==0 ? null : new ArrayList(numUnvisited)); + assert(visits==visited.size()); + assert(visits>=1 && visits<=cluster.size()); + + if(visits(cluster.size()-visits); + for(Unit ua : cluster){ + if(!ua.visited()){ + pruned.add(ua); + ArrayList list=ua.overlapList; + int removed=0; + for(int j=0; j0){Tools.condenseStrict(list);} + } + } + assert(pruned.size()==numUnvisited); + }else{ + assert(prunedOverlaps==0) : "If this fails then I may need to mark overlaps to remove."; + } + for(Unit u : cluster){ + assert(u.isPerfectlyTransitive()) : u; + if(EA){ + for(Overlap o : u.overlapList){assert(!o.invalid());} + } + if(u.visited()){u.setVisited(false);} + } + cluster.clear(); + cluster.addAll(visited); + cluster.trimToSize(); + +// for(Unit u : cluster){ +//// assert(u.canonicized()); +// for(Overlap o : u.overlapList){ +// assert(pruned==null || !pruned.contains(o.u1)); +// assert(pruned==null || !pruned.contains(o.u2)); +// assert(cluster.contains(o.u1)); +// assert(cluster.contains(o.u2)); +// } +// } +// if(pruned!=null){ +// for(Unit u : pruned){ +// for(Overlap o : u.overlapList){ +// assert(pruned.contains(o.u1)); +// assert(pruned.contains(o.u2)); +// assert(!cluster.contains(o.u1)); +// assert(!cluster.contains(o.u2)); +// } +// } +// } + + visited.clear(); + return pruned; + } + + /** + * Cluster should already be ordered breadth-first + * This may fail because removing cycles could change breadth-first traversal, but if it fails, an assertion will be thrown + * @param cluster + */ + private int findCycles(ArrayList cluster, boolean remove){ + + { + final Unit root=cluster.get(0); + assert(root.length()>=cluster.get(cluster.size()-1).length()); + root.setVisited(true); + } + int cycles=0; + + for(Unit ua : cluster){ + assert(ua.visited()); + ArrayList list=ua.overlapList; + int removed=0; + for(int i=0; i0){Tools.condenseStrict(list);} + } + + for(Unit u : cluster){ + if(u.visited()){u.setVisited(false);} + for(Overlap o : 
u.overlapList){ + if(o.visited()){o.setVisited(false);} + } + } + + return cycles; + } + + /** + * Cluster should already be ordered breadth-first + * @param cluster + */ + private int generateOffsetsBreadthFirst(ArrayList cluster, ArrayList temp){ + + + assert(temp!=null); + assert(temp.isEmpty()); + { + final Unit root=cluster.get(0); + assert(root.length()>=cluster.get(cluster.size()-1).length()); + root.setOffset(0); + temp.add(root); + } + + int contradictions=0; + for(int i=0; i0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp); + if(u.offsetValid() && !u.offsetContradiction()){ + contradictions+=setOffsetsNeighbors(u, temp); + assert(contradictions==0 || (i>0 && temp.size()>2)); + } + } + + int min=0; + for(Unit u : temp){ + if(u.visited()){u.setVisited(false);} + for(Overlap o : u.overlapList){ + if(o.visited()){o.setVisited(false);} + } + if(u.offsetValid() && !u.offsetContradiction()){ + min=Tools.min(min, u.offset()); + } + } + + if(verbose){ + System.err.println("min offset = "+min); + } + + for(Unit u : temp){ + if(u.offsetValid()){ + if(verbose){System.err.println("Set "+u.name()+" offset from "+u.offset+" to "+(u.offset-min));} + u.offset=u.offset-min; + } + } + + + return contradictions; + } + + /** + * @param root + */ + private int setOffsetsNeighbors(Unit root, ArrayList temp) { + if(verbose){System.err.println("\nsetOffsetsNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");} + assert(root.valid()); + assert(!root.visited()); + assert(root.offsetValid()); + assert(!root.offsetContradiction()); + root.setVisited(true); + if(root.overlapList==null){return 0;} + final int contradictions=countOffsetContradictions(root, false); + if(verbose){System.err.println("\ncontradictions="+contradictions);} + for(Overlap o : root.overlapList){ + Unit u=(o.u1==root ? 
o.u2 : o.u1); + assert(o.u1==root || o.u2==root); + assert(root!=u); + assert(u.valid()); + + if(verbose){System.err.println("\nProcessing Overlap "+o);} + if(!o.visited() && !o.offsetContradiction()){ + o.setVisited(true); + if(!u.offsetContradiction()){ + if(verbose){System.err.println("Calling setOffset: "+o);} + if(!u.offsetValid()){temp.add(u);} + boolean b=setOffset(root, u, o); + if(verbose){System.err.println("Finished setOffset: "+o);} + +// if(x>0){ +// if(verbose){System.err.println("\n*********************************************");} +// if(verbose){System.err.println("Problem detected with contig "+u.name());} +// if(verbose){System.err.println("*********************************************\n");} +// verbose=true; +// int y2=countOffsetContradictions(root, false); +// assert(contradictions==y2); +// } + + assert(b) : "\n"+contradictions+", "+o.offsetContradiction()+", "+root.offsetContradiction()+", "+u.offsetContradiction()+"\n" + +root.offsetValid()+", "+u.offsetValid()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b + +fixMultiJoins; //This assertion can fail if a multijoin is present + assert(u.offsetValid()); + } + } + } + return contradictions; + } + + private int countOffsetContradictions(Unit root, boolean includeKnown){ + if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");} + assert(root.valid()); + assert(root.visited()); + assert(root.offsetValid()); +// assert(!root.offsetContradiction()); + if(root.overlapList==null){return 0;} + int contradictions=0; + for(Overlap o : root.overlapList){ + Unit u=(o.u1==root ? 
o.u2 : o.u1); + assert(o.u1==root || o.u2==root); + assert(root!=u); + assert(u.valid()); + + if(verbose){System.err.println("\nOverlap "+o+"\nu="+u.name()+", offsetValid="+u.offsetValid());} + + boolean contradictory=(u.offsetValid() && u.offset()!=calcOffset(root, u, o)); + if(verbose){System.err.println("contradictory= \t"+contradictory);} + if(contradictory){ + if(includeKnown || !u.offsetContradiction()){ + contradictions++; + if(!root.offsetContradiction()){root.setOffsetContradiction(true);} + } + if(!o.offsetContradiction()){o.setOffsetContradiction(true);} + if(!u.offsetContradiction()){u.setOffsetContradiction(true);} + } + assert(contradictory==o.offsetContradiction()) : contradictory+", "+o.offsetContradiction(); + if(verbose){ + System.err.println("root.offsetContradiction()=\t"+root.offsetContradiction()); + System.err.println("u.offsetContradiction()= \t"+u.offsetContradiction()); + System.err.println("o.offsetContradiction()= \t"+o.offsetContradiction()); + System.err.println("contradictions= \t"+contradictions); + } + } + if(verbose){System.err.println("Final contradictions="+contradictions+"\n");} + return contradictions; + } + + /** + * Cluster should already be ordered breadth-first + * @param cluster + */ + private int canonicizeClusterBreadthFirst(ArrayList cluster, ArrayList temp) { + + assert(temp!=null); + assert(temp.isEmpty()); + { + final Unit root=cluster.get(0); + assert(root.length()>=cluster.get(cluster.size()-1).length()); + root.setCanonicized(true); + temp.add(root); + } + + int contradictions=0; + for(int i=0; i0) : i+", "+temp.size()+", "+contradictions+"\n"+toString(temp); + if(u.canonicized() && !u.canonContradiction()){ + contradictions+=canonicizeNeighbors(u, temp); + assert(contradictions==0 || (i>0 && temp.size()>2)); + + for(Overlap o : u.overlapList){ + assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) : + o+"\n"+contradictions+", "+o.canonContradiction()+", 
"+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+ + "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited(); + } + } + +// if(u.r.numericID==59462 || u.r.numericID==56439){ //TODO: remove +// System.err.println("\nid="+u.r.numericID+", canonicized="+u.canonicized()+", contradiction="+u.canonContradiction()+", visited="+u.visited()); +// for(Overlap o : u.overlapList){ +// Unit u2=(o.u1==u ? o.u2 : o.u1); +// assert(o.u1==u || o.u2==u); +// assert(u2!=u); +// assert(u2.valid()); +// System.err.println("o = "+o); +// System.err.println("o.contradiction="+o.canonContradiction()); +// System.err.println("u2.id="+u2.r.numericID+", canonicized="+u2.canonicized()+", contradiction="+u2.canonContradiction()+", visited="+u.visited()); +// } +// } + } + + for(Unit u : temp){ + if(u.visited()){u.setVisited(false);} + if(EA){ + for(Overlap o : u.overlapList){assert(!o.visited());} + } + } + + return contradictions; + } + + /** + * @param root + */ + private int canonicizeNeighbors(Unit root, ArrayList canonicized) { + if(verbose){System.err.println("\ncanonicizeNeighbors("+root.name()+")\nroot.code1="+root.code1+"\n");} + assert(root.valid()); + assert(!root.visited()); + assert(root.canonicized()); + assert(!root.canonContradiction()); + root.setVisited(true); + if(root.overlapList==null){return 0;} + final int contradictions=countCanonContradictions(root, false); + if(verbose){System.err.println("\ncontradictions="+contradictions);} + for(Overlap o : root.overlapList){ + Unit u=(o.u1==root ? 
o.u2 : o.u1); + assert(o.u1==root || o.u2==root); + assert(root!=u); + assert(u.valid()); + + if(verbose){System.err.println("\nProcessing Overlap "+o);} + if(!o.canonContradiction()){ + if(!u.canonContradiction()){ + boolean b=u.canonicized(); + int dir=o.type; + if(verbose){System.err.println("Calling canonicize: "+o);} + int x=canonicize(root, u, o); + if(verbose){System.err.println("Finished canonicize: "+o);} + +// if(x>0){ +// if(verbose){System.err.println("\n*********************************************");} +// if(verbose){System.err.println("Problem detected with contig "+u.name());} +// if(verbose){System.err.println("*********************************************\n");} +// verbose=true; +// int y2=countCanonContradictions(root, false); +// assert(contradictions==y2); +// } + + assert(x==0 || (u.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC))); + assert(x==0) : "\n"+x+", "+contradictions+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+u.canonContradiction()+"\n" + +root.canonicized()+", "+u.canonicized()+", "+OVERLAP_TYPE_NAMES[o.type]+"\n"+b+", "+dir + +fixMultiJoins; //This assertion can fail if a multijoin is present + if(!u.canonicized()){ + u.setCanonicized(true); + canonicized.add(u); + } + assert(u.canonicized()); + } + } + } + if(EA){ + for(Overlap o : root.overlapList){ + assert(o.type==FORWARD || o.canonContradiction() || o.u1.canonContradiction() || o.u2.canonContradiction()) : + o+"\n"+contradictions+", "+o.canonContradiction()+", "+o.u1.canonContradiction()+", "+o.u2.canonContradiction()+", "+root.canonContradiction()+ + "\n"+o.u1.canonicized()+", "+o.u2.canonicized()+", "+o.u1.visited()+", "+o.u2.visited(); + } + } + return contradictions; + } + + private int countCanonContradictions(Unit root, boolean includeKnown){ + if(verbose){System.err.println("\ncountContradictions("+root.name()+", "+includeKnown+")\nroot.code1="+root.code1+"\n");} + assert(root.valid()); + assert(root.visited()); + 
assert(root.canonicized()); +// assert(!root.canonContradiction()); + if(root.overlapList==null){return 0;} + int contradictions=0; + for(Overlap o : root.overlapList){ + Unit ub=(o.u1==root ? o.u2 : o.u1); + assert(o.u1==root || o.u2==root); + assert(root!=ub); + assert(ub.valid()); + + if(verbose){System.err.println("\nOverlap "+o+"\nu="+ub.name()+", canonicized="+ub.canonicized());} + + boolean contradictory=(ub.canonicized() && (o.type==FORWARDRC || o.type==REVERSERC)); + if(verbose){System.err.println("contradictory= \t"+contradictory);} + if(contradictory){ + if(!o.canonContradiction()){o.setCanonContradiction(true);} + if(includeKnown || !ub.canonContradiction()){ + contradictions++; + if(!root.canonContradiction()){root.setCanonContradiction(true);} + if(!ub.canonContradiction()){ub.setCanonContradiction(true);} + } + } + + assert(!o.canonContradiction() || (root.canonContradiction() || ub.canonContradiction())) : + "\n"+contradictory+", "+o.canonContradiction()+", "+root.canonContradiction()+", "+ub.canonContradiction(); + + assert(contradictory==o.canonContradiction()) : contradictory+", "+o.canonContradiction(); + if(verbose){ + System.err.println("root.canonContradiction()=\t"+root.canonContradiction()); + System.err.println("u.canonContradiction()= \t"+ub.canonContradiction()); + System.err.println("o.canonContradiction()= \t"+o.canonContradiction()); + System.err.println("contradictions= \t"+contradictions); + } + } + if(verbose){System.err.println("Final contradictions="+contradictions+"\n");} + return contradictions; + } + + private String toString(ArrayList cluster){ + for(int i=0; i"+u.name()+"\n"); + sb.append(new String(u.bases())); + sb.append("\n"); + } + sb.append("\n*****\n"); + for(Unit u : cluster){ + sb.append("\n"+u.name()+":"); + for(Overlap o : u.overlapList){ + Unit ub=(o.u1==u ? 
o.u2 : o.u1); + sb.append(" "+ub.name()); + } + } + sb.append("\n"); + return sb.toString(); + } + + private String toShortString(ArrayList cluster){ + for(int i=0; i cluster) { + assert(false) : "TODO"; + for(Unit u : cluster){ + + } + } + + /** + * @param cluster + */ + private void mergeInner(ArrayList cluster) { + assert(false) : "TODO"; + for(Unit u : cluster){ + + } + } + + private ArrayList> processedClustersT=new ArrayList>(threadMaxReadsToBuffer); + + long leafMergesT=0; + long innerMergesT=0; + long leafBaseMergesT=0; + long innerBaseMergesT=0; + + long multiJoinFailuresT=0; + long multiJoinsFoundT=0; + long multiJoinBasesFoundT=0; + long unitsFlippedT=0; + long overlapsFlippedT=0; + long canonContradictoryOverlapsT=0; + long canonContradictoryClustersT=0; + long offsetContradictoryOverlapsT=0; + long offsetContradictoryClustersT=0; + long cycleOverlapsT=0; + long cycleClustersT=0; + + private final boolean fixMultiJoinsT; + private final boolean canonicizeT; + private final boolean fixCanonContradictionsT; + private final boolean fixOffsetContradictionsT; + private final boolean mergeClustersT; + private final boolean mergeLeavesT; + private final boolean mergeInnerT; + private final BandedAligner bandy; + } + + + /** + * Creates Unit objects or uses ones already attached to reads. + * Places them in local storage and percolates them to shared storage (codeMap), removing exact duplicates. + * Also hashes tips and places these in shared affixMap. + * Looks for containments in the affix map. 
+ * @author Brian Bushnell + * @date Jul 24, 2013 + * + */ + private final class HashThread extends Thread{ + + public HashThread(boolean addToCodeMap_, boolean addToAffixMap_, boolean findMatches_, boolean findContainments_, boolean findOverlaps_){ + addToCodeMapT=addToCodeMap_; + addToAffixMapT=addToAffixMap_; + findContainmentsT=findContainments_; + findOverlapsT=findOverlaps_; + findMatchesT=findMatches_; + tid=getTid(); + crisq=new ArrayDeque(crisa.length); + for(int i=0; i0 && (findOverlapsT || findContainmentsT) ? new BandedAligner(bandwidth) : null); + +// assert(addToCodeMapT) : "addToCodeMapT="+addToCodeMapT+", addToAffixMapT="+addToAffixMapT+", findContainmentsT="+findContainmentsT+ +// ", findOverlapsT="+findOverlapsT+", findMatchesT="+findMatchesT+", convertToUpperCaseT="+convertToUpperCaseT+", numAffixMaps="+numAffixMaps; + } + + public void run(){ + + ConcurrentReadStreamInterface cris=crisq.poll(); + + while(cris!=null){ + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + // long xx=0; + while(reads!=null && reads.size()>0){ + + for(Read r : reads){ + assert(r.pairnum()==0); + processRead(r); + if(r.mate!=null){ + assert(r.mate.pairnum()==1) : cris.getClass()+", "+cris.producers()[0].getClass(); + processRead(r.mate); + } + } + + if(codeMapT!=null && (codeMapT.size()>threadMaxReadsToBuffer || basesStoredT>threadMaxBasesToBuffer)){ + assert(addToCodeMapT); + long added=mergeMaps(); + addedToMainT+=added; + } + + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + if(codeMapT!=null && !codeMapT.isEmpty()){ + long added=mergeMaps(); + addedToMainT+=added; + } + cris=crisq.poll(); + } + + codeMapT=null; + localConflictList=null; + sharedConflictList=null; + } + + private void processRead(Read r){ + + if(r.bases!=null && r.bases.length>=MINSCAF){ + if(!storeName){r.id=null;} + if(!storeQuality){r.quality=null;} +// if(convertToUpperCaseT){ +// if(r.obj==null && r.bases!=null){ +// for(int i=0; i list=codeMapT.get(codeL); + if(list==null){ + if(verbose){System.err.println("Unique.");} + list=new ArrayList(1); + list.add(u); + basesStoredT+=r.bases.length; + codeMapT.put(codeL, list); + }else{ + if(verbose){System.err.println("Exists.");} + boolean match=false; + if(findMatchesT){ + for(Unit u2 : list){ + if(pairedEqualsRC(u, u2)){ +// if(u.r.mate!=null){ +// verbose=true; +// +// Unit um=(Unit)u.r.mate.obj; +// Unit u2m=(Unit)u2.r.mate.obj; +// +// if(verbose){ +// System.err.println("********"); +// System.err.println(u.r.toFastq()); +// System.err.println(u.r.mate.toFastq()); +// System.err.println("********"); +// System.err.println(u2.r.toFastq()); +// System.err.println(u2.r.mate.toFastq()); +// System.err.println("********"); +// System.err.println(u); +// System.err.println(u2); +// System.err.println(um); +// System.err.println(u2m); +// System.err.println("********"); +// System.err.println(u.equals(u2)); +// System.err.println(u.compareTo(u2)); +// System.err.println("********"); +// System.err.println(um.equals(u2m)); +// System.err.println(um.compareTo(u2m)); +// System.err.println("********"); +// } +// +// verbose=false; +// } + assert(u.r.mate==null || pairedEqualsRC((Unit)u.r.mate.obj, (Unit)u2.r.mate.obj)) : + u.r.toFastq()+"\n"+u2.r.toFastq()+"\n"+u.r.mate.toFastq()+"\n"+u2.r.mate.toFastq()+ + "\n"+u+"\n"+u2+"\n"+u.r.mate.obj+"\n"+u2.r.mate.obj; + // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, 
r2.bases.length)));} + match=true; + u2.absorbMatch(u); + break; + } + } + } + if(match){ + matchesT++; + baseMatchesT+=r.bases.length; + // if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);} + }else{ + collisionsT++; + if(verbose){System.err.println("False collision; count = "+collisionsT);} + list.add(u); + basesStoredT+=r.bases.length; + } + } + } + + if(findContainmentsT){ + int x=findContainments(u); + } + + if(findOverlapsT){ + int x=findOverlaps(u); + } + } + } + + private int findContainments(final Unit u){ + if(minLengthPercent<=0 && maxSubs<=0 && minIdentity>=100 && !u.valid()){return 0;} + final byte[] bases=u.bases(); + final int minlen=k-1; + final long shift=2*k; + final long shift2=shift-2; + final long mask=~((-1L)<>>2)|(x2<(2);} + assert(!ub.overlapList.contains(o)); + ub.overlapList.add(o); + if(verbose || flag){System.err.println("Added overlap "+o);} + } + }else{ + if(verbose || flag){System.err.println("Already contained overlap "+o);} + } + + +// assert(ua.alreadyHas(o)); +// assert(ub.alreadyHas(o)); +// assert(ua.overlapList.contains(o)); +// assert(ub.overlapList.contains(o)); + if(verbose || flag){ + System.err.println("ua contains o? "+ua.alreadyHas(o)); + System.err.println("ub contains o? 
"+ub.alreadyHas(o)); + System.err.println("ua.list="+ua.overlapList); + System.err.println("ub.list="+ub.overlapList); + } + } + } + } + } + } + } + } + } + if(EA){ + synchronized(u){ + if(u.overlapList!=null && u.overlapList.isEmpty()){ + assert(false) : "Why would this happen?"; + u.overlapList=null; + } + } + } +// assert(false) : hits+", "+currentOverlaps+", "+baseOverlaps+"\n"+overlapMapT+"\n"; + +// assert(hits==currentOverlaps) : hits+", "+currentOverlaps; + + overlapCollisionsT+=(hits-currentOverlaps); +// outstream.println("hits="+hits+", currentOverlaps="+currentOverlaps); + overlapsT+=currentOverlaps; + return hits; + } + + private long mergeMaps(){ + if(verbose){System.err.println("Merging maps.");} + long novelReads=0, novelKeys=0; + long collisionReads=0; + long mergedReads=0; + + assert(localConflictList.isEmpty()); + assert(sharedConflictList.isEmpty()); + + synchronized(codeMap){ + for(Long key : codeMapT.keySet()){ + if(codeMap.containsKey(key)){ + localConflictList.add(codeMapT.get(key)); + sharedConflictList.add(codeMap.get(key)); + }else{ + ArrayList list=codeMapT.get(key); + codeMap.put(key, list); + addedList.addAll(list); + novelReads+=list.size(); + novelKeys++; + } + } + } + + if(verbose){System.err.println("Novel reads = "+novelReads+", conflicts = "+localConflictList.size());} + + for(int i=0; i listT=localConflictList.get(i); + ArrayList list=sharedConflictList.get(i); + synchronized(list){ + for(Unit u : listT){ + if(verbose){System.err.println("Processing novel unit "+u.name());} + boolean match=false; + if(findMatchesT){ + for(Unit u2 : list){ + if(pairedEqualsRC(u, u2)){ + // if(verbose){System.err.println("Matches "+new String(r2.bases, 0, Tools.min(40, r2.bases.length)));} + u2.absorbMatch(u); + match=true; + break; + } + } + } + if(match){ + mergedReads++; + baseMatchesT+=u.length(); + if(verbose){System.err.println("matchesT="+matchesT+", baseMatchesT="+baseMatchesT);} + }else{ + collisionReads++; + 
if(verbose){System.err.println("False collision; count = "+collisionReads);} + list.add(u); + addedList.add(u); + } + } + } + } + matchesT+=mergedReads; + collisionsT+=collisionReads; + if(verbose){System.err.println("Done Merging.");} + if(verbose){System.err.println("mapT.size="+codeMapT.size()+", basesStoredT="+basesStoredT);} + + codeMapT.clear(); + localConflictList.clear(); + sharedConflictList.clear(); + + if(!addedList.isEmpty()){ + if(addToAffixMapT){ + final LongM p=new LongM(-1, true); + assert(affixMap1!=null || affixMap2!=null); + if(affixMap1!=null && !ignoreAffix1){//Allows you to not use am1 + synchronized(affixMap1){ + for(Unit u : addedList){ + if(verbose){System.err.println("Processing affixes for "+u.name());} + if(u.prefix1!=-1 || u.prefix1!=u.suffix1){ + if(verbose){System.err.println("Using prefix "+u.prefix1);} + p.set(u.prefix1); + ArrayList alu=affixMap1.get(p); + if(alu==null){ + if(verbose){System.err.println("Made new alu for "+p);} + alu=new ArrayList(2); + affixMap1.put(p.iCopy(), alu); + } + if(alu.size() alu=affixMap1.get(p); + if(alu==null){ + if(verbose){System.err.println("Made new alu for "+p);} + alu=new ArrayList(2); + affixMap1.put(p.iCopy(), alu); + } + if(alu.size() alu=affixMap2.get(p); + if(alu==null){ + alu=new ArrayList(2); + affixMap2.put(p.iCopy(), alu); + } + if(alu.size() alu=affixMap2.get(p); + if(alu==null){ + alu=new ArrayList(2); + affixMap2.put(p.iCopy(), alu); + } + if(alu.size()> codeMapT=new HashMap>(threadMaxReadsToBuffer*8); + private ArrayList addedList=new ArrayList(threadMaxReadsToBuffer); + private ArrayList> localConflictList=new ArrayList>(threadMaxReadsToBuffer); + private ArrayList> sharedConflictList=new ArrayList>(threadMaxReadsToBuffer); + + long matchesT=0; + long baseMatchesT=0; + long baseContainmentsT=0; + long collisionsT=0; + long containmentsT=0; + long containmentCollisionsT=0; + long basesStoredT=0; + long addedToMainT=0; + long readsProcessedT=0; + long basesProcessedT=0; + long 
overlapsT=0; + long baseOverlapsT=0; + long overlapCollisionsT=0; + + private final boolean addToCodeMapT; + private final boolean addToAffixMapT; + private final boolean findContainmentsT; + private final boolean findOverlapsT; + private final boolean findMatchesT; +// private final boolean convertToUpperCaseT; + private final int tid; + private final ArrayDeque crisq; + private final BandedAligner bandy; + } + + public static boolean equalsRC(byte[] a, byte[] b){ + if(a==b){return true;} + if(a==null || b==null){return false;} + if(a.length!=b.length){return false;} + + boolean ca=isCanonical(a); + boolean cb=isCanonical(b); + + if(ca==cb){ + for(int i=0; iub.prefix1 ? 1 : -1;} + if(ua.suffix1!=ub.suffix1){return ua.suffix1>ub.suffix1 ? 1 : -1;} + }else{ + if(ua.prefix1!=ub.suffix1){return ua.prefix1>ub.suffix1 ? 1 : -1;} + if(ua.suffix1!=ub.prefix1){return ua.suffix1>ub.prefix1 ? 1 : -1;} + } + }else{ + if(verbose){System.err.println("d");} + if(ub.canonical()){ + if(ua.suffix1!=ub.prefix1){return ua.suffix1>ub.prefix1 ? 1 : -1;} + if(ua.prefix1!=ub.suffix1){return ua.prefix1>ub.suffix1 ? 1 : -1;} + }else{ + if(ua.suffix1!=ub.suffix1){return ua.suffix1>ub.suffix1 ? 1 : -1;} + if(ua.prefix1!=ub.prefix1){return ua.prefix1>ub.prefix1 ? 1 : -1;} + } + } + if(verbose){System.err.println("e");} + if(ua.code1!=ub.code1){return ua.code1>ub.code1 ? 1 : -1;} + if(ua.code2!=ub.code2){return ua.code2>ub.code2 ? 1 : -1;} + + return ua.pairnum()-ub.pairnum(); + } + if(verbose){System.err.println("f");} + final byte[] a=ua.r.bases, b=ub.r.bases; + if(a==b){return 0;} + if(a==null || b==null){return a==null ? 
-1 : 1;} + if(verbose){System.err.println("g");} + + if(ua.canonical()==ub.canonical()){ + if(verbose){System.err.println("h");} + if(ua.canonical() && ub.canonical()){ + for(int i=0; i=0; i--){ + final byte aa=baseToComplementExtended[a[i]], bb=baseToComplementExtended[b[i]]; + if(aa!=bb){return aa-bb;} + } + } + }else{ + if(verbose){System.err.println("i");} + if(ua.canonical()){ + for(int i=0, j=b.length-1; i=0; i--, j++){ + final byte aa=baseToComplementExtended[a[i]], bb=b[j]; + if(aa!=bb){return aa-bb;} + } + } + } + + if(verbose){System.err.println("j");} + return ua.pairnum()-ub.pairnum(); + } + + private static long hashTip(byte[] bases, boolean prefix, int k, int skipInitialBases){ + if(bases==null || bases.length{ + public Overlap(Unit u1_, Unit u2_, int type_, int start1_, int start2_, int stop1_, int stop2_, int len_, int mismatches_, int edits_, BandedAligner bandy){ + assert(u1_!=u2_); + if(verbose){System.err.println("\nCreating an overlap.");} + u1=u1_; + u2=u2_; + type=type_; + start1=start1_; + start2=start2_; + stop1=stop1_; + stop2=stop2_; + overlapLen=len_; + mismatches=mismatches_; + edits=edits_; + + assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start2, stop2))<=maxEdits) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) + +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) + +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); + + assert(start1>=0 && start1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); + assert(stop1>=0 && stop1<=u1.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); + assert(start2>=0 && start2<=u2.length()) : "type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); + assert(stop2>=0 && stop2<=u2.length()) : 
"type "+type+": "+start1+", "+stop1+", "+u1.length()+", "+start2+", "+stop2+", "+u2.length(); + + assert(type==FORWARD || type==FORWARDRC || type==REVERSE || type==REVERSERC); + + if(verbose){System.err.println(this);} + + assert(Tools.absdif(Tools.absdif(start1, stop1), Tools.absdif(start1, stop1))<=maxEdits); + + assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) + +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) + +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); + if(verbose){System.err.println("Passed test 1.");} + +// bandy.verbose=true; +// test(bandy); +// assert(false); + + assert(u1!=u2); + u1.firstInOverlap(u2); + u2.firstInOverlap(u1); + assert(u1.length()!=u2.length() || u1.code1!=u2.code1 || u1.code2!=u2.code2 || (u1.r!=null && u1.r.mate!=null)) : "Collision? \n"+this+"\n"+u1+"\n"+u2; + assert(u1.firstInOverlap(u2)!=u2.firstInOverlap(u1)) : + "\nu1.firstInOverlap(u2)="+u1.firstInOverlap(u2)+"\nu2.firstInOverlap(u1)="+u2.firstInOverlap(u1)+"\nu1="+u1+"\nu2="+u2; + + if(!u1.firstInOverlap(u2)){ + if(verbose){System.err.println("\nSwapping.");} + swap(); + if(verbose){System.err.println(this);} + + if(EA && !test(bandy, edits+maxEdits)){ + System.err.println("\n"+this); + swap(); + System.err.println("\n"+this); + assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; + System.err.println("Passed test 2a, "+bandy.lastEdits+" edits.\n"); + swap(); + System.err.println("\n"+this); + assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) + +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) + +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, 
stop2)+1); + System.err.println("Passed test 2b, "+bandy.lastEdits+" edits.\n"); + } + + assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) + +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) + +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); + if(verbose){System.err.println("Passed test 2.");} + } + + if(type==REVERSE || type==REVERSERC){ + if(verbose){System.err.println("\nReversing.");} + reverseDirection(); + if(verbose){System.err.println(this);} + + if(EA && !test(bandy, edits+maxEdits)){ + bandy.verbose=true; + System.err.println("\n********** Failed test 3, "+bandy.lastEdits+" edits. ***************\n"); + reverseDirection(); + System.err.println(this); + assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; + System.err.println("Passed test 3a, "+bandy.lastEdits+" edits.\n"); + reverseDirection(); + System.err.println(this); + assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases) + +"\n>1a\n"+new String(u1.r.bases, Tools.min(start1, stop1), Tools.max(start1, stop1)-Tools.min(start1, stop1)+1) + +"\n>2a\n"+new String(u2.r.bases, Tools.min(start2, stop2), Tools.max(start2, stop2)-Tools.min(start2, stop2)+1); + System.err.println("Passed test 3b, "+bandy.lastEdits+" edits.\n"); + } + + assert(test(bandy, edits+maxEdits)) : "\n"+this+"\n>1\n"+new String(u1.r.bases)+"\n>2\n"+new String(u2.r.bases)+"\n"; + if(verbose){System.err.println("Passed test 3.");} + } + //Now all overlaps should be FORWARD or FORWARDRC and u1 should be at least as big as u2 + assert(type==FORWARD || type==FORWARDRC); + assert(u1.length()>=u2.length()); + assert(u1.firstInOverlap(u2)); + assert(!u2.firstInOverlap(u1)); + if(verbose){System.err.println("Finished overlap 
initialization.");} + } + + public boolean test(BandedAligner bandy, int editLimit){ + final int last1=u1.length()-1, last2=u2.length()-1; + if(verbose){System.err.println("Testing "+OVERLAP_TYPE_NAMES[type]+", "+start1+", "+start2);} + if(type==FORWARD){ + assert(start1==0 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; + if(start2==0){ + if(verbose){System.err.println("A");} + return u1.overlapsForward(u2, start1, start2, bandy, false, editLimit);} + else{ + if(verbose){System.err.println("B");} + return u2.overlapsForward(u1, start2, start1, bandy, false, editLimit);} + } + if(type==FORWARDRC){ + assert(start1==0 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; + if(start2==last2){return u1.overlapsForwardRC(u2, start1, start2, bandy, false, editLimit);} + else{return u2.overlapsReverseRC(u1, start2, start1, bandy, false, editLimit);} + } + if(type==REVERSE){ + assert(start1==last1 || start2==last2) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; + if(start2==last2){return u1.overlapsReverse(u2, start1, start2, bandy, false, editLimit);} + else{return u2.overlapsReverse(u1, start2, start1, bandy, false, editLimit);} + } + if(type==REVERSERC){ + assert(start1==last1 || start2==0) : "start1="+start1+", stop1="+stop1+", last1="+last1+", start2="+start2+", stop2="+stop2+", last2="+last2; + if(start2==0){return u1.overlapsReverseRC(u2, start1, start2, bandy, false, editLimit);} + else{return u2.overlapsForwardRC(u1, start2, start1, bandy, false, editLimit);} + } + throw new RuntimeException(); + } + + public boolean equals(Object o){ + return equals((Overlap)o); + } + + public boolean equals(Overlap o){ + if(this==o){return true;} + assert(o!=null) : "*A*\n"+this+"\n"+o+"\n"+u1+"\n"+u2; + assert(u1!=null && u2!=null) : "*B*\n"+this+"\n"+o+"\n"+u1+"\n"+u2; + 
assert(u1!=o.u2 || u2!=o.u1) : "*C*\n"+this+"\n"+o+"\n"+u1.firstInOverlap(u2)+"\n"+o.u1.firstInOverlap(o.u2)+"\n"+u1+"\n"+u2; + return (u1==o.u1 && u2==o.u2 && type==o.type && start1==o.start1 && start2==o.start2 && stop1==o.stop1 && stop2==o.stop2) + ;//|| (u1==o.u2 && u2==o.u1 && type==reverseType(o.type) && start1==o.start2 && start2==o.start1); + } + +// public int compareTo(Overlap o){ +// int a=compareTo2(o); +// int b=o.compareTo2(this); +// assert(a==-b) : "\n"+this+"\n"+o+"\na="+a+", b="+b+", equals="+this.equals(o) +// +"\nu1.compareTo(o.u1)="+u1.compareTo(o.u1)+"\no.u1.compareTo(u1)="+o.u1.compareTo(u1) +// +"\nu2.compareTo(o.u2)="+u2.compareTo(o.u2)+"\no.u2.compareTo(u2)="+o.u2.compareTo(u2); +// return a; +// } + + public int compareTo(Overlap o){ + int score1=overlapLen-50*(mismatches+edits); + int score2=o.overlapLen-50*(o.mismatches+o.edits); + if(score1!=score2){return score2-score1;} + if(overlapLen!=o.overlapLen){return o.overlapLen-overlapLen;} + int x=u1.compareTo(o.u1); + if(x!=0){return -x;} + x=u2.compareTo(o.u2); + if(x!=0){return -x;} + if(type!=o.type){return type-o.type;} + if(u1!=o.u1 || u2!=o.u2){ + verbose=true; + System.err.println(this); + System.err.println(o); + System.err.println("********"); + System.err.println(u1); + System.err.println(u2); + System.err.println(o.u1); + System.err.println(o.u2); + System.err.println("********"); + System.err.println(u1.equals(o.u1)); + System.err.println("********"); + System.err.println(u2.equals(o.u2)); + System.err.println("********"); + System.err.println(u1.compareTo(o.u1)); + System.err.println("********"); + System.err.println(u2.compareTo(o.u2)); + System.err.println("********"); + verbose=false; + } + assert(u1==o.u1 && u2==o.u2) : "\n"+u1.r+"\n"+u2.r+"\n"+o.u1.r+"\n"+o.u2.r + +"\n\n"+u1.r.mate+"\n"+u2.r.mate+"\n"+o.u1.r.mate+"\n"+o.u2.r.mate; +// assert(false) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n"; + 
if(start1!=o.start1){return start1-o.start1;} + if(stop1!=o.stop1){return stop1-o.stop1;} + if(start2!=o.start2){return start2-o.start2;} + if(stop2!=o.stop2){return stop2-o.stop2;} + assert(this.equals(o)) : "\n"+this+"\n"+o+"\n>"+u1.name()+"\n"+new String(u1.bases())+"\n>"+u2.name()+"\n"+new String(u2.bases())+"\n"; + return 0; + } + + public int hashCode(){ + return u1.hashCode()^u2.hashCode()^overlapLen; + } + + public void flip(Unit changed, BandedAligner bandy){ + + if(changed==u2){ + if(type==FORWARD){type=FORWARDRC;} + else if(type==FORWARDRC){type=FORWARD;} + else if(type==REVERSE){type=REVERSERC;} + else if(type==REVERSERC){type=REVERSE;} + else{throw new RuntimeException("Unknown overlap type "+type);} + start2=u2.length()-start2-1; + stop2=u2.length()-stop2-1; + }else if(changed==u1){ + if(type==FORWARD){type=REVERSERC;} + else if(type==FORWARDRC){type=REVERSE;} + else if(type==REVERSE){type=FORWARDRC;} + else if(type==REVERSERC){type=FORWARD;} + else{throw new RuntimeException("Unknown overlap type "+type);} + start1=u1.length()-start1-1; + stop1=u1.length()-stop1-1; + }else{throw new RuntimeException("'changed' was not in the Overlap.");} + + assert(test(bandy, edits+maxEdits)); + } + + public void swap(){ + Unit tempu=u1; + u1=u2; + u2=tempu; + int temp=start1; + start1=start2; + start2=temp; + temp=stop1; + stop1=stop2; + stop2=temp; + if(type==FORWARDRC){type=REVERSERC;} + else if(type==REVERSERC){type=FORWARDRC;} + } + + public void reverseDirection(){ + type=reverseType(type); + int temp=start1; + start1=stop1; + stop1=temp; + temp=start2; + start2=stop2; + stop2=temp; + } + + public String toString(){ + StringBuilder sb=new StringBuilder(80); + sb.append("type="); + sb.append(OVERLAP_TYPE_NAMES[type]); + sb.append(", len="); + sb.append(overlapLen); + sb.append(", subs="); + sb.append(mismatches); + sb.append(", edits="); + sb.append(edits); + + sb.append(" ("); + sb.append(u1.name()==null ? 
u1.r.numericID+"" : u1.name()); + sb.append(", start1="); + sb.append(start1); + sb.append(", stop1="); + sb.append(stop1); + + sb.append(") ("); + sb.append(u2.name()==null ? u2.r.numericID+"" : u2.name()); + sb.append(", start2="); + sb.append(start2); + sb.append(", stop2="); + sb.append(stop2); + sb.append(")"); + return sb.toString(); + } + + + private void setCanonContradiction(boolean b){ + assert(b!=canonContradiction()) : b+", "+canonContradiction(); + if(b){flags|=CANON_CONTRADICTION_MASK;} + else{flags&=~CANON_CONTRADICTION_MASK;} + assert(b==canonContradiction()) : b+", "+canonContradiction(); + } + + private void setOffsetContradiction(boolean b){ + assert(b!=offsetContradiction()) : b+", "+offsetContradiction(); + if(b){flags|=OFFSET_CONTRADICTION_MASK;} + else{flags&=~OFFSET_CONTRADICTION_MASK;} + assert(b==offsetContradiction()) : b+", "+offsetContradiction(); + } + + private void setMultiJoin(boolean b){ + assert(b!=multiJoin()) : b+", "+multiJoin(); + if(b){flags|=MULTIJOIN_MASK;} + else{flags&=~MULTIJOIN_MASK;} + assert(b==multiJoin()) : b+", "+multiJoin(); + } + + private void setVisited(boolean b){ + assert(b!=visited()) : b+", "+visited(); + if(b){flags|=VISITED_MASK;} + else{flags&=~VISITED_MASK;} + assert(b==visited()) : b+", "+visited(); + } + + private void setCyclic(boolean b){ + assert(b!=cyclic()) : b+", "+cyclic(); + if(b){flags|=CYCLIC_MASK;} + else{flags&=~CYCLIC_MASK;} + assert(b==cyclic()) : b+", "+cyclic(); + } + + private void setInvalid(boolean b){ + assert(b!=invalid()) : b+", "+invalid(); + if(b){flags|=INVALID_MASK;} + else{flags&=~INVALID_MASK;} + assert(b==invalid()) : b+", "+invalid(); + } + + public void clearVolatileFlags(){ + flags=0; +// flags=flags&~(MULTIJOIN_MASK|VISITED_MASK|CANON_CONTRADICTION_MASK|CYCLIC_MASK|OFFSET_CONTRADICTION_MASK|INVALID_MASK); +// assert(!canonContradiction()); +// assert(!offsetContradiction()); +// assert(!multiJoin()); +// assert(!visited()); +// assert(!cyclic()); +// 
assert(!invalid()); + } + + public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;} + public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;} + public boolean multiJoin(){return (MULTIJOIN_MASK&flags)==MULTIJOIN_MASK;} + public boolean visited(){return (VISITED_MASK&flags)==VISITED_MASK;} + public boolean cyclic(){return (CYCLIC_MASK&flags)==CYCLIC_MASK;} + public boolean invalid(){return (INVALID_MASK&flags)==INVALID_MASK;} + public boolean contradiction(){return canonContradiction() || offsetContradiction();} + + private static final long VISITED_MASK=(1L<<0); + private static final long MULTIJOIN_MASK=(1L<<1); + private static final long CYCLIC_MASK=(1L<<2); + private static final long CANON_CONTRADICTION_MASK=(1L<<3); + private static final long OFFSET_CONTRADICTION_MASK=(1L<<4); + private static final long INVALID_MASK=(1L<<4); + + Unit u1; + Unit u2; + int type; + int start1; + int start2; + int stop1; + int stop2; + + long flags=0; + + final int overlapLen; + final int mismatches; + final int edits; + } + + + private class Unit implements Comparable{ + + public Unit(Read r_){ + this(r_, isCanonical(r_.bases)); + } + + public Unit(Read r_, boolean canonical_){ +// this(r_, canonical_, canonical_ ? 
hash(r_.bases) : hashReversed(r_.bases)); + this(r_, canonical_, hash(r_.bases), hashReversed(r_.bases)); + } + + public Unit(Read r_, boolean canonical_, long codeF_, long codeR_){ + r=r_; + code1=Tools.min(codeF_, codeR_); + code2=Tools.max(codeF_, codeR_); + long f=r.bases.length; + prefix1=hashTip(r.bases, true, k, 0); + suffix1=hashTip(r.bases, false, k, 0); + if(r.bases.length>2*k){ + prefix2=hashTip(r.bases, true, k, k); + suffix2=hashTip(r.bases, false, k, k); + } + if(canonical_){f|=CANON_MASK;} + if(r.pairnum()==1){f|=PAIRNUM_MASK;} + flags=f; + assert(canonical()==canonical_); + assert(length()==r.bases.length); + assert(pairnum()==r.pairnum()); + if(parseDepth){ + int[] quad=KmerNormalize.parseDepth(r.id, null); + if(quad!=null){depth=quad[r.pairnum()];} + } + } + + public void absorbMatch(Unit u){ + + assert(code1==u.code1 && code2==u.code2 && length()==u.length()); + if(r==null || u.r==null){return;} + u.r.setDiscarded(true); + final byte[] bases1=r.bases, bases2=u.r.bases; + final byte[] quals1=r.quality, quals2=u.r.quality; + + assert((r.mate==null) == (u.r.mate==null)); + + if(r.mate!=null && !u.r.mate.discarded()){ + ((Unit)r.mate.obj).absorbMatch((Unit)u.r.mate.obj); + } + if(quals1==null || quals2==null){return;} + + if(canonical()==u.canonical()){ + for(int i=0; i makeCluster() { + assert(!visited()); + assert(!clustered()); + assert(valid()); +// assert(set.isEmpty()); + ArrayList cluster=new ArrayList(overlapList==null ? 1 : overlapList.size()+1); + cluster.add(this); + setClustered(true); + + int added=1; + for(int i=0; i cluster) { + assert(!visited()); + assert(clustered()); + assert(valid()); +// assert(cluster.contains(this)); + setVisited(true); + int added=0; + + if(r!=null && r.mate!=null){ + Unit u2=(Unit)r.mate.obj; + assert(u2!=this); + assert(u2.valid()); + if(!u2.clustered()){ + u2.setClustered(true); + cluster.add(u2); + added++; + } + } + + if(overlapList!=null){ + for(Overlap o : overlapList){ + Unit u2=(o.u1==this ? 
o.u2 : o.u1); + assert(o.u1==this || o.u2==this); + assert(u2!=this); + assert(u2.valid()); + if(!u2.clustered()){ + u2.setClustered(true); + cluster.add(u2); + added++; + } + } + } + return added; + } + + public boolean isTransitive(){ + assert(valid()); + if(overlapList==null || overlapList.size()==0){return true;} + for(Overlap o : overlapList){ + assert(o.u1==this || o.u2==this); + Unit u2=(o.u1==this ? o.u2 : o.u1); + assert(u2!=this); + if(u2.overlapList==null){ + return false; + }else{ + boolean found=false; + for(Overlap o2 : u2.overlapList){ + if(o2.u1==this || o2.u2==this){ + found=true; break; + } + } + if(!found){return false;} + } + } + return true; + } + + public boolean isPerfectlyTransitive(){ + assert(valid()); + if(overlapList==null || overlapList.size()==0){return true;} + for(Overlap o : overlapList){ + assert(o.u1==this || o.u2==this); + Unit u2=(o.u1==this ? o.u2 : o.u1); + assert(u2!=this); + if(u2.overlapList==null){ + return false; + }else{ + boolean found=false; + for(Overlap o2 : u2.overlapList){ + if(o2==o){ + found=true; break; + } + } + if(!found){return false;} + } + } + return true; + } + + public boolean isNonRedundant(){ + assert(valid()); + if(overlapList==null || overlapList.size()==0){return true;} + for(int i=0; i0 && (u2.length()*100f/length())length() || start<0 || start>=length()){return false;} +// if(true){return false;} + if(u2.r!=null){ + final byte[] a=bases(), b=u2.bases(); + int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); + + for(int i=start, j=0; jmaxMismatches){ + if(bandy==null || maxEdits<1){return false;} + int edits=bandy.alignForward(b, a, 0, start, maxEdits, exact); + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + return edits<=maxEdits && bandy.score()>4*edits; + } + } + } + } + return true; + }else{ + assert(false) : "TODO: Verify by hashing and checking both tips"; + return false; + } + } + + private boolean containsForwardRC(Unit 
u2, int start, BandedAligner bandy, boolean earlyExit) { + if(ignoreReverseComplement){return false;} + if(start+u2.length()>length() || start<0 || start>=length()){return false;} +// if(true){return false;} + if(u2.r!=null){ + final byte[] a=bases(), b=u2.bases(); + int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); + + for(int i=start, j=b.length-1, iprefix=start+k2; j>=0; i++, j--){ + byte aa=a[i]; + byte bb=baseToComplementExtended[b[j]]; + if(aa!=bb){ + if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ + if(earlyExit && imaxMismatches){ + if(bandy==null || maxEdits<1){return false;} + int edits=bandy.alignForwardRC(b, a, b.length-1, start, maxEdits, exact); + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + return edits<=maxEdits && bandy.score()>4*edits; + } + } + } + } + return true; + }else{ + assert(false) : "TODO: Verify by hashing and checking both tips"; + return false; + } + } + + private boolean containsReverse(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { + if(start+1=length()){return false;} +// if(true){return false;} + if(u2.r!=null){ + final byte[] a=bases(), b=u2.bases(); + int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); + + for(int i=start, j=b.length-1, iprefix=start-k2; j>=0; i--, j--){ + byte aa=a[i]; + byte bb=b[j]; + if(aa!=bb){ + if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ + if(earlyExit && i>iprefix){return false;} + if((mismatches=mismatches+1)>maxMismatches){ + if(bandy==null || maxEdits<1){return false;} + int edits=bandy.alignReverse(b, a, b.length-1, start, maxEdits, exact); + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + return edits<=maxEdits && bandy.score()>4*edits; + } + } + } + } + return true; + }else{ + assert(false) : "TODO: Verify by hashing and checking both tips"; + return false; + } + } + + private 
boolean containsReverseRC(Unit u2, int start, BandedAligner bandy, boolean earlyExit) { + if(ignoreReverseComplement){return false;} + if(start+1=length()){return false;} +// if(true){return false;} + if(u2.r!=null){ + final byte[] a=bases(), b=u2.bases(); + int mismatches=0, maxMismatches=calcMaxEdits(maxSubs, minIdentityMult, b.length); + + for(int i=start, j=0; jmaxMismatches){ + if(bandy==null || maxEdits<1){return false;} + int edits=bandy.alignReverseRC(b, a, 0, start, maxEdits, exact); + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + return edits<=maxEdits && bandy.score()>4*edits; + } + } + } + } + return true; + }else{ + assert(false) : "TODO: Verify by hashing and checking both tips"; + return false; + } + } + + + public boolean depthCongruent(int aa, int bb){ + if(aa<5 && bb<5){return true;} + final int a=Tools.max(1, Tools.min(aa, bb)); + final int b=Tools.max(aa, bb); + return a*depthRatio>=b; + } + + + /** + * @param u2 + * @param loc + * @param key + * @return + */ + public boolean overlaps(Unit u2, int loc, LongM key, BandedAligner bandy, int tableNum, int editLimit) { +// return makeOverlap(u2, loc, key, bandy, earlyExit)!=null; + +// assert(false) : "TODO"; + if(verbose){System.err.println("overlaps: Considering key "+key+", unit "+u2);} + if(parseDepth && !depthCongruent(depth, u2.depth)){return false;} + if(minLengthPercent>0){ + final int len1=length(), len2=u2.length(); + if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)0){ + final int len1=length(), len2=u2.length(); + if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)maxMismatches){ + if(bandy==null || maxEdits<1){return false;} + if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} + int edits=bandy.alignForward(b, a, 0, start1, maxEdits, exact); + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + stop2=bandy.lastQueryLoc; + stop1=bandy.lastRefLoc; + return 
edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment + } + } + } + } + return true; + } + + private boolean overlapsForwardRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { + if(verbose){System.err.println("overlapsForwardRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} + + if(ignoreReverseComplement){return false;} + final int len1=length(), len2=u2.length(); + if(start1<0){ + start2+=start1; + start1=0; + if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} + } + final int overlapLength=Tools.min(len1-start1, start2+1); + final int overlapLength2=Tools.max(len1-start1, start2+1); + int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1; + if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} + + if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ + if(overlapLength20f && (overlapLength2*100f/Tools.min(len1, len2))maxMismatches){ + if(bandy==null || maxEdits<1){return false;} + if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} + int edits=bandy.alignForwardRC(b, a, b.length-1, start1, maxEdits, exact); + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + stop2=bandy.lastQueryLoc; + stop1=bandy.lastRefLoc; + return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment + } + } + } + } + return true; + } + + private boolean overlapsReverse(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { + if(verbose){System.err.println("overlapsReverse(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} + + final int len1=length(), len2=u2.length(); + 
if(start1>=len1){ + start2-=(start1-len1+1); + start1=len1-1; + if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} + } + final int overlapLength=Tools.min(start1+1, start2+1); + final int overlapLength2=Tools.max(start1+1, start2+1); + int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1; + if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} + + if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ + if(overlapLength20f && (overlapLength2*100f/Tools.min(len1, len2))maxMismatches){ + if(bandy==null || maxEdits<1){return false;} + if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} + int edits=bandy.alignReverse(b, a, b.length-1, start1, maxEdits, exact); + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + stop2=bandy.lastQueryLoc; + stop1=bandy.lastRefLoc; + return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment + } + } + } + } + return true; + } + + private boolean overlapsReverseRC(Unit u2, int start1, int start2, BandedAligner bandy, boolean earlyExit, int maxEdits) { + if(verbose){System.err.println("overlapsReverseRC(u1="+this.name()+", u2="+u2.name()+", start1="+start1+", start2="+start2+", earlyExit="+earlyExit+")");} + + if(ignoreReverseComplement){return false;} + final int len1=length(), len2=u2.length(); + if(start1>=len1){ + start2+=(start1-len1+1); + start1=len1-1; + if(verbose){System.err.println("Modified: start1="+start1+", start2="+start2);} + } + final int overlapLength=Tools.min(start1+1, len2-start2); + final int overlapLength2=Tools.max(start1+1, len2-start2); + int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1; + if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} + + if(!allowAllContainedOverlaps || 
overlapLength>Tools.min(len1, len2)){ + if(overlapLength20f && (overlapLength2*100f/Tools.min(len1, len2))maxMismatches){ + if(bandy==null || maxEdits<1){return false;} + if(verbose){System.err.println("Mismatches exceeded maximum, attempting banded alignment.");} + int edits=bandy.alignReverseRC(b, a, 0, start1, maxEdits, exact); + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + stop2=bandy.lastQueryLoc; + stop1=bandy.lastRefLoc; + return edits<=maxEdits && bandy.score()>2*edits; //Set at 2*edits instead of 4*edits to prevent assertion errors when reversing alignment + } + } + } + } + return true; + } + + + + private Overlap makeOverlapForward(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { + if(verbose){System.err.println("makeOverlapForward(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} + final int len1=length(), len2=u2.length(); + int start2=0; + if(start1<0){ + start2-=start1; + start1=0; + } + final int overlapLength=Tools.min(len1-start1, len2-start2); + final int overlapLength2=Tools.max(len1-start1, len2-start2); + int stop1=start1+overlapLength-1, stop2=start2+overlapLength-1; + if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} + + if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ + if(overlapLength0f && (overlapLength*100f/Tools.min(len1, len2))1 && bandy!=null){ + if(maxEdits<1){return null;} + if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} + int edits=bandy.alignForward(b, a, start2, start1, maxEdits, exact); + if(edits>maxEdits || bandy.score()<=4*edits){ + if(verbose){System.err.println((edits>maxEdits ? 
"Too many edits" : "Alignment score too low")+"; returning null.");} + return null; + } + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + stop2=bandy.lastQueryLoc; + stop1=bandy.lastRefLoc; +// if(bandy.lastOffset>0){//Ref longer than query +// for(int k=0; kbandy.lastOffset; k--){ +// if(stop2+1<=len2){stop2++;} +// else{stop1--;} +// } +// } + return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); + }else if(mismatches>maxMismatches){return null;} + } + } + } + return new Overlap(this, u2, FORWARD, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); + } + + private Overlap makeOverlapForwardRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { + if(verbose){System.err.println("makeOverlapForwardRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} + if(ignoreReverseComplement){return null;} + final int len1=length(), len2=u2.length(); + int start2=len2-1; + if(start1<0){ + start2+=start1; + start1=0; + } + final int overlapLength=Tools.min(len1-start1, start2+1); + final int overlapLength2=Tools.max(len1-start1, start2+1); + int stop1=start1+overlapLength-1, stop2=start2-overlapLength+1; + if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} + + if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ + if(overlapLength0f && (overlapLength*100f/Tools.min(len1, len2))1 && bandy!=null){ + if(maxEdits<1){return null;} + if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} + int edits=bandy.alignForwardRC(b, a, start2, start1, maxEdits, exact); + if(edits>maxEdits || bandy.score()<=4*edits){ + if(verbose){System.err.println((edits>maxEdits ? 
"Too many edits" : "Alignment score too low")+"; returning null.");} + return null; + } + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + stop2=bandy.lastQueryLoc; + stop1=bandy.lastRefLoc; +// if(bandy.lastOffset>0){//Ref longer than query +// for(int k=0; kbandy.lastOffset; k--){ +// if(stop2>0){stop2--;} +// else{stop1--;} +// } +// } + return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); + }else if(mismatches>maxMismatches){return null;} + } + } + } + return new Overlap(this, u2, FORWARDRC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); + } + + private Overlap makeOverlapReverse(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { + if(verbose){System.err.println("makeOverlapReverse(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} + + final int len1=length(), len2=u2.length(); + int start2=len2-1; + if(start1>=len1){ + start2-=(start1-len1+1); + start1=len1-1; + } + final int overlapLength=Tools.min(start1+1, start2+1); + final int overlapLength2=Tools.max(start1+1, start2+1); + int stop1=start1-overlapLength+1, stop2=start2-overlapLength+1; + if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} + + if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ + if(overlapLength0f && (overlapLength*100f/Tools.min(len1, len2))1 && bandy!=null){ + if(maxEdits<1){return null;} + if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} + int edits=bandy.alignReverse(b, a, start2, start1, maxEdits, exact); + if(edits>maxEdits || bandy.score()<=4*edits){ + if(verbose){System.err.println((edits>maxEdits ? 
"Too many edits" : "Alignment score too low")+"; returning null.");} + return null; + } + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + stop2=bandy.lastQueryLoc; + stop1=bandy.lastRefLoc; +// if(bandy.lastOffset>0){//Ref longer than query +// for(int k=0; k0){stop1--;} +// else{stop2++;}//I don't think this can happen +// } +// }else if(bandy.lastOffset<0){//Query longer than ref +// for(int k=0; k>bandy.lastOffset; k--){ +// if(stop2>0){stop2--;} +// else{stop1++;} +// } +// } + return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); + }else if(mismatches>maxMismatches){return null;} + } + } + } + return new Overlap(this, u2, REVERSE, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); + } + + private Overlap makeOverlapReverseRC(Unit u2, int start1, BandedAligner bandy, boolean earlyExit) { + if(verbose){System.err.println("makeOverlapReverseRC(u1="+this.name()+", u2="+u2.name()+", start="+start1+", earlyExit="+earlyExit+")");} + if(ignoreReverseComplement){return null;} + final int len1=length(), len2=u2.length(); + int start2=0; + if(start1>=len1){ + start2+=(start1-len1+1); + start1=len1-1; + } + final int overlapLength=Tools.min(start1+1, len2-start2); + final int overlapLength2=Tools.max(start1+1, len2-start2); + int stop1=start1-overlapLength+1, stop2=start2+overlapLength-1; + if(verbose){System.err.println("Calculated stop1="+stop1+", stop2="+stop2+", overlapLength="+overlapLength);} + + if(!allowAllContainedOverlaps || overlapLength>Tools.min(len1, len2)){ + if(overlapLength0f && (overlapLength*100f/Tools.min(len1, len2))1 && bandy!=null){ + if(maxEdits<1){return null;} + if(verbose){System.err.println("mismatches exceeded 1, attempting banded alignment.");} + int edits=bandy.alignReverseRC(b, a, start2, start1, maxEdits, exact); + if(edits>maxEdits || bandy.score()<=4*edits){ + if(verbose){System.err.println((edits>maxEdits ? 
"Too many edits" : "Alignment score too low")+"; returning null.");} + return null; + } + assert(b.length=k2 || edits>maxEdits) : b.length+", "+k+", "+bandy.lastRow+", "+edits; + stop2=bandy.lastQueryLoc; + stop1=bandy.lastRefLoc; +// if(bandy.lastOffset>0){//Ref longer than query +// for(int k=0; k0){stop1--;} +// else{stop2--;}//I don't think this can happen +// } +// }else if(bandy.lastOffset<0){//Query longer than ref +// for(int k=0; k>bandy.lastOffset; k--){ +// if(stop2+1<=len2){stop2++;} +// else{stop1++;} +// } +// } + return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, edits, edits, bandy); + }else if(mismatches>maxMismatches){return null;} + } + } + } + return new Overlap(this, u2, REVERSERC, start1, start2, stop1, stop2, overlapLength, mismatches, 0, bandy); + } + + @Override + public int compareTo(Unit b) { + int x=comparePairedRC(this, b); +// int y=comparePairedRC(b, this); +// boolean eq1=this.equals(b); +// boolean eq2=b.equals(this); +// +// assert((x==y)==(x==0)) : x+", "+y+"\n"+this+"\n"+b; +// assert((x>0 == y<0) || (x==0 && y==0)) : x+", "+y+"\n"+this+"\n"+b; +// +// assert(eq1==eq2): x+", "+y+"\n"+this+"\n"+b; +// assert(eq1==(x==0)): x+", "+y+"\n"+this+"\n"+b; +// +// assert(eq1 || this!=b); +// +// if(verbose){ //TODO: Remove +// System.err.println(this+"\n"+b+"\n"+this.r.toFastq()+"\n"+this.r.mate.toFastq()+"\n"+b.r.toFastq()+"\n"+b.r.mate.toFastq()+"\n"); +// System.err.println("\n"+x+", "+y+", "+eq1+", "+eq2); +// verbose=false; +// } + + return x; + } + + public boolean equals(Object b){return equals((Unit)b);} + public boolean equals(Unit b){ + boolean x=pairedEqualsRC(this, b); +// assert(x==pairedEqualsRC(b, this)); +// assert(x==(comparePairedRC(this, b)==0)); +// assert(x==(comparePairedRC(b, this)==0)); +// assert(x || this!=b); +// System.err.println("\n****EQUALS?****:\n"+this+"\n"+b+"\n**** ****"); //TODO: Remove + return x; + } + + @Override + public int hashCode(){ + return 
(int)((code1^(code1>>>32))&0xFFFFFFFFL); + } + + private synchronized void setValid(boolean b){ + assert(b!=valid()); +// if(!b){System.err.println("Setting invalid "+name());} + if(b){flags&=~INVALID_MASK;} + else{flags|=INVALID_MASK;} + assert(b==valid()); + } + + private synchronized void setClustered(boolean b){ + assert(b!=clustered()); + if(b){flags|=CLUSTER_MASK;} + else{flags&=~CLUSTER_MASK;} + assert(b==clustered()); + } + + private synchronized void setVisited(boolean b){ + assert(b!=visited()); + if(b){flags|=VISIT_MASK;} + else{flags&=~VISIT_MASK;} + assert(b==visited()); + } + + private synchronized void setCanonical(boolean b){ + assert(b!=canonical()); + if(b){flags|=CANON_MASK;} + else{flags&=~CANON_MASK;} + assert(b==canonical()); + assert(r==null || b==isCanonical(r.bases)); + } + + private synchronized void setCanonicized(boolean b){ + assert(b!=canonicized()); + if(b){flags|=CANONICIZED_MASK;} + else{flags&=~CANONICIZED_MASK;} + assert(b==canonicized()); + } + + private synchronized void setCanonContradiction(boolean b){ +// assert(b!=canonContradiction()); + if(b){flags|=CANON_CONTRADICTION_MASK;} + else{flags&=~CANON_CONTRADICTION_MASK;} + assert(b==canonContradiction()); + } + + private synchronized void setOffset(int x){ + offset=x; + setOffsetValid(true); + } + + private synchronized void setOffsetValid(boolean b){ + assert(!offsetValid()); + if(b){flags|=OFFSET_VALID_MASK;} + else{flags&=~OFFSET_VALID_MASK;} + assert(b==offsetValid()); + } + + private synchronized void setOffsetContradiction(boolean b){ +// assert(b!=offsetContradiction()); + assert(offsetValid()); + if(b){flags|=OFFSET_CONTRADICTION_MASK;} + else{flags&=~OFFSET_CONTRADICTION_MASK;} + assert(b==offsetContradiction()); + } + + private void reverseComplement(){ + assert(r!=null); + r.reverseComplement(); + long temp=prefix1; + prefix1=suffix1; + suffix1=temp; + temp=prefix2; + prefix2=suffix2; + suffix2=temp; + setCanonical(!canonical()); + } + + /** Return true if 'this' 
should be the first Unit in the overlap object */ + public boolean firstInOverlap(Unit u2){ + assert(this!=u2) : "\n"+this.r+"\n"+u2.r; + if(u2.length()!=length()){return u2.length()=0;} + return r.numericID>=u2.r.numericID; + } + + public byte[] bases(){return r==null ? null : r.bases;} + + public String name(){return r!=null ? r.id : null /*code+""*/;} + public String toString(){return "("+name()+","+code1+","+code2+","+length()+","+prefix1+","+suffix1+","+(canonical()?"c":"nc")+",d="+depth+")";} + + + public final Read r; + public final long code1; + public final long code2; + public long prefix1=-1; + public long suffix1=-1; + public long prefix2=-1; + public long suffix2=-1; + /** Distance of leftmost side of this read relative to leftmost side of root. + * Assumes everything is in 'forward' orientation. */ + public int offset=-999999999; + public int depth=1; +// private boolean valid=true; + + public ArrayList overlapList; + + private long flags; + /** True if the original read orientation was canonical */ + public final boolean canonical(){return (CANON_MASK&flags)!=0;} + /** True if this contig should be output, false if not */ + public final boolean valid(){return (INVALID_MASK&flags)==0;} + /** Length of this contig */ + public final int length(){return (int)(LEN_MASK&flags);} + /** Position of this contig relative to root */ + public final int offset(){ + assert(offsetValid()); + return offset; + } + public int pairnum(){return (PAIRNUM_MASK&flags)==PAIRNUM_MASK ? 
1 : 0;} + + public void clearVolatileFlags(){ + flags=flags&~(CANONICIZED_MASK|VISIT_MASK|CANON_CONTRADICTION_MASK|OFFSET_VALID_MASK|OFFSET_CONTRADICTION_MASK); + assert(!visited()); + assert(!canonicized()); + assert(!canonContradiction()); + assert(!offsetValid()); + assert(!offsetContradiction()); + } + + public boolean visited(){return (VISIT_MASK&flags)==VISIT_MASK;} + public boolean clustered(){return (CLUSTER_MASK&flags)==CLUSTER_MASK;} + public boolean canonicized(){return (CANONICIZED_MASK&flags)==CANONICIZED_MASK;} + public boolean canonContradiction(){return (CANON_CONTRADICTION_MASK&flags)==CANON_CONTRADICTION_MASK;} + public boolean offsetValid(){return (OFFSET_VALID_MASK&flags)==OFFSET_VALID_MASK;} + public boolean offsetContradiction(){return (OFFSET_CONTRADICTION_MASK&flags)==OFFSET_CONTRADICTION_MASK;} + public boolean contradiction(){return offsetContradiction() || canonContradiction();} + + private static final long LEN_MASK=0x7FFFFFFFL; + private static final long CANON_MASK=(1L<<33); + private static final long INVALID_MASK=(1L<<34); + private static final long VISIT_MASK=(1L<<35); + private static final long CLUSTER_MASK=(1L<<36); + private static final long CANONICIZED_MASK=(1L<<37); + private static final long CANON_CONTRADICTION_MASK=(1L<<38); + private static final long OFFSET_VALID_MASK=(1L<<39); + private static final long OFFSET_CONTRADICTION_MASK=(1L<<40); + private static final long PAIRNUM_MASK=(1L<<41); + } + + private static final class UnitOffsetComparator implements Comparator { + + UnitOffsetComparator(){} + + /* (non-Javadoc) + * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) + */ + @Override + public int compare(Unit a, Unit b) { + if(a.offsetValid() && b.offsetValid()){ + int x=a.offset()-b.offset(); + if(x!=0){return x;} + }else{ + if(a.offsetValid()){return -1;} + if(b.offsetValid()){return 1;} + } + return a.compareTo(b); + } + + } + + private static final class ClusterLengthComparator implements 
Comparator> { + + ClusterLengthComparator(){} + + /* (non-Javadoc) + * @see java.util.Comparator#compare(java.lang.Object, java.lang.Object) + */ + @Override + public int compare(ArrayList a, ArrayList b) { + if(a.size()!=b.size()){return b.size()-a.size();} + if(a.isEmpty() && b.isEmpty()){return 0;} + return a.get(0).compareTo(b.get(0)); + } + + } + + private static final int[] makeNmerIndex(int n){ + final int max=(1<<(2*n))-1; + int[] array=new int[max+1]; + + int count=0; + for(int i=0; i<=max; i++){ + final int a=i, b=AminoAcid.reverseComplementBinaryFast(i, n); + int min=Tools.min(a, b); + if(min==a){ + array[a]=array[b]=count; + count++; + } + } + return array; + } + + /** Makes a nmer (e.g., tetramer) profile of a cluster */ + private static final float[] makeNmerProfile(ArrayList alu, long[] array_){ + final int nbits=2*nmerLength; + final long[] array=(array_==null ? new long[maxNmer+1] : array_); + final int mask=~((-1)<<(nbits)); + + long keysCounted=0; + + for(Unit u : alu){ + byte[] bases=u.r.bases; + int len=0; + int kmer=0; + for(byte b : bases){ + int x=AminoAcid.baseToNumber[b]; + if(x<0){ + len=0; + kmer=0; + }else{ + kmer=((kmer<<2)|x)&mask; + len++; + if(len>=nmerLength){ + int rkmer=AminoAcid.reverseComplementBinaryFast(kmer, nmerLength); + keysCounted++; + array[nmerIndex[Tools.min(kmer, rkmer)]]++; + } + } + } + } + + if(keysCounted==0){keysCounted=1;} + final float mult=1f/keysCounted; + + float[] r=new float[array.length]; + for(int i=0; i> codeMap=new HashMap>(4000000); + private HashMap> affixMap1=null; + private HashMap> affixMap2=null; + private HashMap>[] affixMaps=null; + private ArrayDeque> clusterQueue=null; + private ArrayList> processedClusters=null; + + private static final UnitOffsetComparator UNIT_OFFSET_COMPARATOR=new UnitOffsetComparator(); + private static final ClusterLengthComparator CLUSTER_LENGTH_COMPARATOR=new ClusterLengthComparator(); + private static final long[][] hashcodes=makeCodes2(32); + public static final 
byte[] baseToNumber=new byte[128]; + public static final byte[] baseToComplementNumber=new byte[128]; + public static final byte[] baseToComplementExtended=new byte[128]; + public static final int nmerLength=4; + public static final int[] nmerIndex=makeNmerIndex(nmerLength); + public static final int maxNmer=Tools.max(nmerIndex); + private static PrintStream outstream=System.err; + public static boolean overwrite=false; + public static boolean showSpeed=true; + public static boolean verbose=false; + public static boolean ignoreReverseComplement=false; + public static boolean ignoreAffix1=false; + public static boolean parseDepth=false; + public static float depthRatio=2; + public static int MINSCAF=0; + public static int THREADS=Shared.THREADS; + public static int threadMaxReadsToBuffer=4000; + public static int threadMaxBasesToBuffer=32000000; + public static boolean DISPLAY_PROGRESS=true; + + private static int reverseType(int type){return (type+2)%4;} + public static final int FORWARD=0; + public static final int FORWARDRC=1; + public static final int REVERSE=2; + public static final int REVERSERC=3; + public static final String[] OVERLAP_TYPE_NAMES=new String[] {"FORWARD", "FORWARDRC", "REVERSE", "REVERSERC"}; + + static{//All others are 0 + baseToNumber['A']=baseToNumber['a']=0; + baseToNumber['C']=baseToNumber['c']=1; + baseToNumber['G']=baseToNumber['g']=2; + baseToNumber['T']=baseToNumber['t']=3; + baseToNumber['U']=baseToNumber['u']=3; + + baseToComplementNumber['A']=baseToComplementNumber['a']=3; + baseToComplementNumber['C']=baseToComplementNumber['c']=2; + baseToComplementNumber['G']=baseToComplementNumber['g']=1; + baseToComplementNumber['T']=baseToComplementNumber['t']=0; + baseToComplementNumber['U']=baseToComplementNumber['u']=0; + + for(int i=0; i jgi.Dedupe2 "); + outstream.println("\nOptional flags:"); + outstream.println("in= \tThe 'in=' flag is needed if the input file is not the first parameter. 
'in=stdin' will pipe from standard in."); + outstream.println("out= \tThe 'out=' flag is needed if the output file is not the second parameter. 'out=stdout' will pipe to standard out."); + outstream.println("showspeed=t \tSet to 'f' to suppress display of processing speed."); + outstream.println("minscaf=0 \tIgnore scaffolds shorter than this."); + outstream.println("testrc=t \t(trc) Test reverse-complements as well as normal orientation."); + outstream.println("testmatch=t \t(tm) Test for exact matches of scaffolds."); + outstream.println("testcontainment=t \t(tc) Test for full containments of scaffolds."); + outstream.println("storename=t \t(sn) Store scaffold names (set false to save memory)."); + outstream.println("storequality=t \t(sq) Store quality values for fastq assemblies (set false to save memory)."); + outstream.println("exact=t \t(ex) Only allow exact symbol matches. When false, an 'N' will match any symbol."); + outstream.println("uniquenames=t \t(un) Ensure all output scaffolds have unique names. Uses more memory."); + outstream.println("maxedits=0 \t(e) Absorb contained sequences with up to this many mismatches (subs only, no indels)."); + outstream.println("minidentity=100 \t(mid) Absorb contained sequences with percent identity of at least this (subs only, no indels)."); + outstream.println("k=31 \tKmer length used for finding containments. Containments shorter than k will not be found."); + outstream.println("ziplevel=2 \tSet to 1 (lowest) through 9 (max) to change compression level; lower compression is faster."); + outstream.println("sort=f \tsort output by scaffold length (otherwise it will be random).\n" + + " \t'a' for ascending, 'd' for descending, 'f' for false (no sorting)."); + outstream.println("minlengthpercent=0 \t(mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed." 
+ + " \tThis option is only for compatibility with vmatch and changing it is not recommended, " + + " \tas it may cause nondeterministic output."); + } + + public Dedupe2(String[] args){ + for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}} + System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + ReadWrite.ZIPLEVEL=2; + //ReadWrite.USE_UNPIGZ=true; + FastaReadInputStream.SPLIT_READS=false; + boolean setOut=false; + + for(int i=0; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in")){ + if(b.indexOf(',')>=0 && !new File(b).exists()){ + in=b.split(","); + }else{ + in=new String[] {b}; + } + }else if(a.equals("out")){ + out=b; + setOut=true; + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else 
if(a.equals("sort")){ + if(b==null){sort=true;} + else if(b.equalsIgnoreCase("a")){ + sort=true; + ascending=true; + }else if(b.equalsIgnoreCase("d")){ + sort=true; + ascending=false; + }else{ + sort=Tools.parseBoolean(b); + } + }else if(a.equals("trc") || a.equals("testrc")){ + ignoreReverseComplement=!Tools.parseBoolean(b); + }else if(a.equals("tc") || a.equals("testcontainment") || a.equals("containment")){ + testContainment=Tools.parseBoolean(b); + }else if(a.equals("tm") || a.equals("testmatch")){ + testMatch=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("k")){ + k=Integer.parseInt(b); + k2=k-1; + assert(k>0 && k<32) : "k must be between 1 and 31; default is 31, and lower values are slower."; + }else if(a.equals("minscaf")){ + MINSCAF=FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("mlp") || a.equals("minlengthpercent")){ + minLengthPercent=Float.parseFloat(b); + }else if(a.equals("e") || a.equals("maxedits")){ + maxEdits=Integer.parseInt(b); + }else if(a.equals("mid") || a.equals("minidentity")){ + minIdentity=Float.parseFloat(b); + minIdentityMult=(minIdentity==100f ? 
0 : (100f-minIdentity)/100f); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=Integer.parseInt(b); + }else if(a.equals("showspeed")){ + showSpeed=Tools.parseBoolean(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else if(a.equals("contigbreak") || (arg.contains("=") && (a.equals("n") || a.equals("-n")))){ + maxNs=Integer.parseInt(b); + }else if(a.equals("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("storename") || a.equals("sn")){ + storeName=Tools.parseBoolean(b); + }else if(a.equals("storesuffix") || a.equals("ss")){ + storeSuffix=Tools.parseBoolean(b); + }else if(a.equals("storequality") || a.equals("sq")){ + storeQuality=Tools.parseBoolean(b); + }else if(a.equals("exact") || a.equals("ex")){ + exact=Tools.parseBoolean(b); + }else if(a.equals("uniquenames") || a.equals("un")){ + uniqueNames=Tools.parseBoolean(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(i==0 && in==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + String c=args[i]; + if(c.indexOf(',')>=0 && !new File(c).exists()){ + in=c.split(","); + }else{ + in=new String[] {c}; + } + }else if(i==1 && out==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + out=args[i]; + setOut=true; + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(maxEdits>0 || minIdentity<100){storeSuffix=true;} + + assert(FastaReadInputStream.settingsOK()); + + if(in==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + + for(int i=0; i1; + for(int i=0; i alht=new ArrayList(THREADS); + for(int i=0; i list=new ArrayList((int)addedToMain); + for(Unit u : codeMap){ + if(u.valid()){list.add(u.r);} + } + +// if(minLengthPercent>0){ +// if(verbose){System.err.println("Sorting.");} +// Collections.sort(list, ReadLengthComparator.comparator); +// Collections.reverse(list); +// assert(list.isEmpty() || 
list.get(0).bases.length<=list.get(list.size()-1).bases.length) : +// list.get(0).bases.length+", "+list.get(list.size()-1).bases.length; +// } + + synchronized(this){ + t.stop(); + outstream.println("Allocated list. Time: "+t); + printMemory(); + outstream.println(); + Tools.pause(800); + t.start(); + } + crisa=new ConcurrentCollectionReadInputStream[] {new ConcurrentCollectionReadInputStream(list, null, maxReads)}; + Thread cristhread=new Thread(crisa[0]); + cristhread.start(); + + ArrayList alht=new ArrayList(THREADS); + for(int i=0; i addToArray(HashSet codeMap, boolean sort, boolean ascending, boolean clear, long outNum){ + assert(outNum<=Integer.MAX_VALUE); + if(verbose){System.err.println("Making list.");} + ArrayList list=new ArrayList((int)outNum); + if(verbose){System.err.println("Adding.");} + for(Unit u : codeMap){ + if(u.valid()){list.add(u.r);} + } + if(clear){codeMap.clear();} + + if(sort){ + if(verbose){System.err.println("Sorting.");} + Collections.sort(list, ReadLengthComparator.comparator); + if(ascending){ + Collections.reverse(list); + assert(list.isEmpty() || list.get(0).bases.length<=list.get(list.size()-1).bases.length) : + list.get(0).bases.length+", "+list.get(list.size()-1).bases.length; + }else{ + assert(list.isEmpty() || list.get(0).bases.length>=list.get(list.size()-1).bases.length) : + list.get(0).bases.length+", "+list.get(list.size()-1).bases.length; + } + } + assert(list.size()==outNum) : list.size()+", "+outNum; + return list; + } + + private void writeOutput(TextStreamWriter tsw, Timer t){ + + ArrayList list=addToArray(codeMap, sort, ascending, true, addedToMain-containments); + codeMap=null; + + if(sort){ + synchronized(this){ + t.stop(); + outstream.println("Sorted output. Time: "+t); + printMemory(); + outstream.println(); + Tools.pause(800); + t.start(); + } + } + + writeOutput(tsw, list); + + synchronized(this){ + t.stop(); + outstream.println("Printed output. 
Time: "+t); + printMemory(); + outstream.println(); + Tools.pause(800); + t.start(); + } + } + + + + private void writeOutput(TextStreamWriter tsw, ArrayList list){ + + if(verbose){System.err.println("Writing.");} + tsw.start(); + + HashSet names=((uniqueNames && storeName) ? + new HashSet(Tools.min(Integer.MAX_VALUE, Tools.max((int)addedToMain, (int)(addedToMain*1.35)))) : null); + long rid=0; + for(int x=0; x=0; i--){ + byte b=bases[i]; + b=AminoAcid.baseToComplementExtended[b]; + int mode=(int)(code&31); + code=code^hashcodes[b][mode]; + code=Long.rotateLeft(code, 1); + } + return code; + } + + + public static boolean isCanonical(byte[] bases){ + if(ignoreReverseComplement || bases==null || bases.length==0){return true;} + final int lim=(bases.length+1)/2; + for(int i=0, j=bases.length-1; i>(threadMaxReadsToBuffer*8);} + crisq=new ArrayDeque(crisa.length); + for(int i=0; i ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + // long xx=0; + while(reads!=null && reads.size()>0){ + + for(Read r : reads){ + if(r.bases!=null && r.bases.length>=MINSCAF){ + assert(r.mate==null); + if(!storeName){r.id=null;} + if(!storeQuality){r.quality=null;} + readsProcessedT++; + // xx++; + // outstream.println("Processing read "+r.id+", "+xx); + basesProcessedT+=r.bases==null ? 0 : r.bases.length; + +// final long code; +// final Unit u; +// if(r.obj==null){ +// final boolean canonical=isCanonical(r.bases); +// code=(canonical ? hash(r.bases) : hashReversed(r.bases)); +// u=(r.obj!=null ? (Unit)r.obj : new Unit(r, canonical, code)); +// u=(r.obj!=null ? (Unit)r.obj : new Unit(r)); +// r.obj=u; +// }else{ +// u=(Unit)r.obj; +// code=u.code; +// } +// assert(u.r==r && r.obj==u); + + final Unit u=(r.obj!=null ? 
(Unit)r.obj : new Unit(r)); + assert(u.r==r && (r.obj==u || r.obj==null)); + final long code=u.code1; + r.obj=u; + assert(u.r==r && r.obj==u); + + // if(verbose){System.err.println("Generated "+code+" for sequence "+new String(r.bases, 0, Tools.min(40, r.bases.length)));} + + if(addToCodeMap){ + boolean b=codeMapT.add(u); + if(b){ + basesStoredT+=r.bases.length; + }else{ + matchesT++; + basematchesT+=r.bases.length; + } + } + + if(findContainments){ + // System.err.println("\naffixMap:\n"+affixMap+"\n"); + int x=findContainments(u); + } + + // if(verbose){System.err.println("mapT.size="+mapT.size()+", basesStoredT="+basesStoredT);} + } + } + + if(codeMapT!=null && (codeMapT.size()>threadMaxReadsToBuffer || basesStoredT>threadMaxBasesToBuffer)){ + assert(addToCodeMap); + long added=mergeMaps(); + addedToMainT+=added; + } + + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + if(codeMapT!=null && !codeMapT.isEmpty()){ + long added=mergeMaps(); + addedToMainT+=added; + } + cris=crisq.poll(); + } + + codeMapT=null; + } + + private int findContainments(final Unit u){ + if(minLengthPercent<=0 && maxEdits<=0 && minIdentity>=100 && !u.valid()){return 0;} + final byte[] bases=u.bases(); + final int minlen=k-1; + final long shift=2*k; + final long shift2=shift-2; + final long mask=~((-1L)<>>2)|(x2<{ + + public Unit(Read r_){ + this(r_, isCanonical(r_.bases)); + } + + public Unit(Read r_, boolean canonical_){ +// this(r_, canonical_, canonical_ ? 
hash(r_.bases) : hashReversed(r_.bases)); + this(r_, canonical_, hash(r_.bases), hashReversed(r_.bases)); + } + + public Unit(Read r_, boolean canonical_, long codeF_, long codeR_){ + r=r_; + code1=Tools.min(codeF_, codeR_); + code2=Tools.max(codeF_, codeR_); + long f=r.bases.length; + prefix=hashTip(r.bases, true, k); + suffix=hashTip(r.bases, false, k); + if(canonical_){f|=CANON_MASK;} + flags=f; + assert(canonical()==canonical_); + assert(length()==r.bases.length); + } + + /** + * @param u2 + * @param loc + * @param key + * @return + */ + public boolean contains(Unit u2, int loc, LongM key) { + if(verbose){System.err.println("Considering key "+key+", unit "+u2);} + if(minLengthPercent>0 && (u2.length()*100f/length())length()){return false;} +// if(true){return false;} + if(u2.r!=null){ + final byte[] a=bases(), b=u2.bases(); + int mismatches=0, maxMismatches=calcMaxEdits(maxEdits, minIdentityMult, b.length); + + for(int i=start, j=0; jmaxMismatches){return false;} + } + } + } + return true; + }else{ + assert(false) : "TODO: Verify by hashing and checking both tips"; + return false; + } + } + + private boolean containsForwardRC(Unit u2, int start) { + if(ignoreReverseComplement){return false;} + if(start+u2.length()>length()){return false;} +// if(true){return false;} + if(u2.r!=null){ + final byte[] a=bases(), b=u2.bases(); + int mismatches=0, maxMismatches=calcMaxEdits(maxEdits, minIdentityMult, b.length); + + for(int i=start, j=b.length-1; j>=0; i++, j--){ + byte aa=a[i]; + byte bb=AminoAcid.baseToComplementExtended[b[j]]; + if(aa!=bb){ + if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ + if((mismatches=mismatches+1)>maxMismatches){return false;} + } + } + } + return true; + }else{ + assert(false) : "TODO: Verify by hashing and checking both tips"; + return false; + } + } + + private boolean containsReverse(Unit u2, int start) { + if(start+1=0; i--, j--){ + byte aa=a[i]; + byte bb=b[j]; + if(aa!=bb){ + if(exact || 
(AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ + if((mismatches=mismatches+1)>maxMismatches){return false;} + } + } + } + return true; + }else{ + assert(false) : "TODO: Verify by hashing and checking both tips"; + return false; + } + } + + private boolean containsReverseRC(Unit u2, int start) { + if(ignoreReverseComplement){return false;} + if(start+1maxMismatches){return false;} + } + } + } + return true; + }else{ + assert(false) : "TODO: Verify by hashing and checking both tips"; + return false; + } + } + + /** + * @param u2 + * @param loc + * @param key + * @return + */ + public boolean overlaps(Unit u2, int loc, LongM key) { + assert(false) : "TODO"; + if(verbose){System.err.println("Considering key "+key+", unit "+u2);} + if(minLengthPercent>0){ + final int len1=length(), len2=u2.length(); + if(Tools.min(len1, len2)*100f/Tools.max(len1, len2)0f && (overlapLength*100f/Tools.min(len1, len2))maxMismatches){return false;} + } + } + } + return true; + } + + private boolean overlapsForwardRC(Unit u2, int start) { + if(ignoreReverseComplement){return false;} + final int len1=length(), len2=u2.length(); + final int overlapLength=Tools.min(len1-start, len2); + + if(overlapLength0f && (overlapLength*100f/Tools.min(len1, len2))maxMismatches){return false;} + } + } + } + return true; + } + + private boolean overlapsReverse(Unit u2, int start) { + + final int len1=length(), len2=u2.length(); + final int overlapLength=Tools.min(len1-start, len2); + + if(overlapLength0f && (overlapLength*100f/Tools.min(len1, len2))=0 && j>=0; i--, j--){ + byte aa=a[i]; + byte bb=b[j]; + if(aa!=bb){ + if(exact || (AminoAcid.isFullyDefined(aa) && AminoAcid.isFullyDefined(bb))){ + if((mismatches=mismatches+1)>maxMismatches){return false;} + } + } + } + return true; + } + + private boolean overlapsReverseRC(Unit u2, int start) { + if(ignoreReverseComplement){return false;} + final int len1=length(), len2=u2.length(); + final int overlapLength=Tools.min(len1-start, len2); + + 
if(overlapLength0f && (overlapLength*100f/Tools.min(len1, len2))maxMismatches){return false;} + } + } + } + return true; + } + + @Override + public int compareTo(Unit b) { + return compareRC(this, b); + } + + public boolean equals(Object b){return equals((Unit)b);} + public boolean equals(Unit b){return equalsRC(this, b);} + + @Override + public int hashCode(){ + return (int)((code1^(code1>>>32))&0xFFFFFFFFL); + } + + private synchronized void setInvalid(){ + assert(valid()); + flags|=VALID_MASK; + } + + public byte[] bases(){return r==null ? null : r.bases;} + + public String name(){return r!=null ? r.id : null /*code+""*/;} + public String toString(){return "("+code1+","+code2+","+length()+","+prefix+","+suffix+","+(canonical()?"c":"nc")/*+","+(tipCanonical()?"tc":"ntc")*/+")";} + + + public final Read r; + public final long code1; + public final long code2; + public final long prefix; + public final long suffix; +// private boolean valid=true; + + private long flags; + /** True if the original read orientation was canonical */ + public final boolean canonical(){return (CANON_MASK&flags)!=0;} + /** True if the original read orientation was canonical */ + public final boolean valid(){return (VALID_MASK&flags)==0;} +// /** True if the original read tip orientation was canonical, false if the prefix and suffix were swapped */ +// public final boolean tipCanonical(){return (CANON_TIP_MASK&flags)!=0;} + /** True if the original read tip orientation was canonical, false if the prefix and suffix were swapped */ + public final int length(){return (int)(LEN_MASK&flags);} + + private static final long LEN_MASK=0x7FFFFFFFL; + private static final long CANON_MASK=(1L<<33); + private static final long VALID_MASK=(1L<<34); + } + + private ConcurrentReadStreamInterface crisa[]; + + private String[] in=null; + private String out=null; + private int maxNs=-1; + private long maxReads=-1; + public boolean errorState=false; + boolean sort=false; + boolean ascending=true; + boolean 
testContainment=true; + boolean testMatch=true; + boolean storeSuffix=false; + boolean storeName=true; + boolean storeQuality=true; + boolean exact=true; + boolean uniqueNames=true; + private boolean multipleInputFiles=false; + + int maxEdits=0; + float minIdentity=100; + float minIdentityMult=0; + float minLengthPercent=0; + int minOverlap=200; + float minOverlapPercent=0; + + long readsProcessed=0; + long basesProcessed=0; + long collisions=0; + long containments=0; + long containmentCollisions=0; + long matches=0; + long basematches=0; + long basecontainments=0; + long addedToMain=0; + + int k=31; + int k2=k-1; + + private static int tcount=0; + + private HashSet codeMap=new HashSet(4000000); + private HashMap> affixMap=null; + + private static final long[][] hashcodes=makeCodes2(32); + public static final byte[] baseToNumber=new byte[128]; + public static final byte[] baseToRcompNumber=new byte[128]; + public static final byte[] baseToRcomp=new byte[128]; + private static PrintStream outstream=System.err; + public static boolean overwrite=false; + public static boolean showSpeed=true; + public static boolean verbose=false; + public static boolean ignoreReverseComplement=false; + public static int MINSCAF=0; + public static int THREADS=Shared.THREADS; + public static int threadMaxReadsToBuffer=4000; + public static int threadMaxBasesToBuffer=32000000; + public static boolean PAUSE=false; + + static{//All others are 0 + baseToNumber['A']=baseToNumber['a']=0; + baseToNumber['C']=baseToNumber['c']=1; + baseToNumber['G']=baseToNumber['g']=2; + baseToNumber['T']=baseToNumber['t']=3; + + baseToRcompNumber['A']=baseToRcompNumber['a']=3; + baseToRcompNumber['C']=baseToRcompNumber['c']=2; + baseToRcompNumber['G']=baseToRcompNumber['g']=1; + baseToRcompNumber['T']=baseToRcompNumber['t']=0; + } + +} diff --git a/current/jgi/Difference.java b/current/jgi/Difference.java new file mode 100755 index 0000000..81941da --- /dev/null +++ b/current/jgi/Difference.java @@ -0,0 +1,39 
@@ +package jgi; + +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Oct 9, 2013 + * + */ +public class Difference { + + public static void main(String[] args){ + + TextFile tf1=new TextFile(args[0], false, false); + TextFile tf2=new TextFile(args[1], false, false); + + String s1=tf1.readLine(false); + String s2=tf2.readLine(false); + + int difs=0; + int i=1; + while(s1!=null && s2!=null){ + if(!s1.equals(s2)){ + difs++; + System.err.println("Line "+i+":\n"+s1+"\n"+s2+"\n"); + assert(difs<5); + } + i++; + s1=tf1.readLine(false); + s2=tf2.readLine(false); + } + + assert(s1==null && s2==null) : "Line "+i+":\n"+s1+"\n"+s2+"\n"; + + tf1.close(); + tf2.close(); + } + +} diff --git a/current/jgi/ErrorCorrect.java b/current/jgi/ErrorCorrect.java new file mode 100755 index 0000000..a0c4330 --- /dev/null +++ b/current/jgi/ErrorCorrect.java @@ -0,0 +1,852 @@ +package jgi; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; + +import kmer.KCountArray; +import kmer.KmerCount6; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.RTextOutputStream3; +import stream.Read; + +import dna.AminoAcid; +import dna.Data; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.ReadWrite; + +import align2.ListNum; +import align2.Tools; + +/** + * @author Brian Bushnell + * @date Aug 20, 2012 + * + */ +public class ErrorCorrect extends Thread{ + + public static void main(String[] args){ + + String reads1=args[0]; + String reads2=(args.length>1 ? args[1] : null); + + int k=23; + int cbits=4; + int gap=0; + int hashes=1; + int thresh1=1; + int thresh2=2; + int matrixbits=34; + long maxReads=-1; + int buildpasses=1; + long tablereads=-1; //How many reads to process when building the hashtable + int buildStepsize=4; + String output=null; + boolean ordered=true; + boolean overwrite=false; + + for(int i=2; i1 ? 
split[1] : "true"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.equals("initialthresh") || a.equals("thresh1")){ + thresh1=Integer.parseInt(b); + }else if(a.equals("thresh") || a.equals("thresh2")){ + thresh2=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else if(a.startsWith("matrixbits")){ + matrixbits=Integer.parseInt(b); + }else if(a.startsWith("hashes") || a.startsWith("multihash")){ + hashes=Integer.parseInt(b); + }else if(a.startsWith("maxerrors")){ + ERROR_CORRECTION_LIMIT=Integer.parseInt(b); + }else if(a.startsWith("passes")){ + buildpasses=Integer.parseInt(b); + }else if(a.startsWith("stepsize") || a.startsWith("buildstepsize")){ + buildStepsize=Integer.parseInt(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("threads") || a.equals("t")){ + System.err.println("Can't change threadcount for this class."); //THREADS=Integer.parseInt(b); + }else if(a.startsWith("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.startsWith("tablereads")){ + tablereads=Long.parseLong(b); + }else if(a.startsWith("build") || a.startsWith("genome")){ + Data.setGenome(Integer.parseInt(b)); + Data.sysout.println("Set genome to "+Data.GENOME_BUILD); + }else if(a.equals("outputinfo") || a.startsWith("info")){ + OUTPUT_INFO=Tools.parseBoolean(b); + }else if(a.startsWith("out")){ + output=b; + }else if(a.startsWith("verbose")){ + KCountArray.verbose=Tools.parseBoolean(b); +// verbose=KCountArray.verbose=Tools.parseBoolean(b); + }else if(a.equals("ordered") || a.equals("ord")){ + ordered=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || 
a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + KCountArray kca=makeTable(reads1, reads2, k, cbits, gap, hashes, buildpasses, matrixbits, tablereads, buildStepsize, thresh1, thresh2); + + detect(reads1, reads2, kca, k, thresh2, maxReads, output, ordered, overwrite); + + } + + public static KCountArray makeTable(String reads1, String reads2, int k, int cbits, int gap, int hashes, int buildpasses, int matrixbits, + long maxreads, int stepsize, int thresh1, int thresh2){ + + Timer thash=new Timer(); + + KmerCount6.maxReads=maxreads; + int kbits=2*k; + matrixbits=Tools.min(kbits, matrixbits); + + thash.start(); +// Data.sysout.println("kbits="+(kbits)+" -> "+(1L< "+(1L<1); + KCountArray trusted=null; + for(int i=1; i2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3) + int step=(stepsize==1 ? 1 : stepsize+i%2); +// if(!conservative){step=(step+3)/4;} + if(!conservative){step=Tools.min(3, (step+3)/4);} + + KmerCount6.countFastq(reads1, reads2, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative); + + kca.shutdown(); + Data.sysout.println("Trusted: \t"+kca.toShortString()); + trusted=kca; + kca=KCountArray.makeNew(1L< ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + long covered=0; + long uncovered=0; + + long coveredFinal=0; + long uncoveredFinal=0; + + long fullyCorrected=0; + long failed=0; + + long totalBases=0; + long totalReads=0; + + + while(reads!=null && reads.size()>0){ + for(Read r : reads){ + Read r2=r.mate; + { + +// if(r.numericID==23){verbose=true;} + + totalReads++; + if(verbose){System.err.println();} + totalBases+=r.bases.length; +// BitSet bs=detectErrors(r, kca, k, thresh); + BitSet bs=detectErrorsBulk(r, kca, k, thresh, 1); + if(verbose){System.err.println(toString(bs, r.bases.length));} +// Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.bases.length)); + if(verbose){System.err.println(toString(detectErrors(r, kca, k, thresh), r.bases.length-k+1));} + if(bs==null){//No errors, or can't detect errors + assert(false); + }else{ + int x=bs.cardinality(); + covered+=x; + uncovered+=(r.bases.length-x); + if(x0){return detectErrorsSplit(r, kca, k, thresh);} + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + + int bslen=r.bases.length-k-gap+1; + if(bslen<1){return null;} //Read is too short to detect errors + BitSet bs=new BitSet(bslen); + + int len=0; + long kmer=0; + byte[] bases=r.bases; + for(int i=0; i=k){ + int count=kca.read(kmer); + if(count>=thresh){ + bs.set(i+1-k); + } + } + } + } + return bs; + } + + public static BitSet detectErrorsBulk(final Read r, final KCountArray kca, final int k, final int thresh, final int stepsize/*, final int offset*/){ + if(kca.gap>0){return detectErrorsSplit(r, kca, k, thresh);} + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + + if(r.bases==null || r.bases.length=k && ((len-k)%stepsize==0 || i==bases.length-1)){ + int count=kca.read(kmer); + if(count>=thresh){ + bs.set(i+1-setlen, i+1); + } + } + } + } + + r.errors=bs.cardinality()-r.bases.length; + +// assert(bases.length==r.bases.length); + return bs; + } + + public static BitSet detectTrusted(final Read 
r, final KCountArray kca, final int k, final int thresh, final int detectStepsize){ + if(kca.gap>0){throw new RuntimeException("TODO");} + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + + if(r.bases==null || r.bases.length=k && (i%detectStepsize==0 || i==bases.length-1)){ + int count=kca.read(kmer); + if(count0){return detectErrorsSplit(r, kca, k, thresh);} + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + + if(r.bases==null || r.bases.length=k){ + int count=kca.read(kmer); + if(count>=thresh){ + bs.set(i+1-setlen); + bs.set(i); + } + } + } + } + return bs; + } + + /** + * @param r + * @param kca + * @param k + * @param thresh + * @return + */ + private static BitSet detectErrorsSplit(Read r, KCountArray kca, int k, + int thresh) { + assert(false) : "TODO"; + return null; + } + + + /** Assumes bulk mode was used; e.g., any '0' bit is covered by no correct kmers */ + public static BitSet correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst){ + if(kca.gap>0){assert(false) : "TODO";} + + assert(!OUTPUT_INFO) : "TODO: Outputting correction data is not yet supported."; + + int corrections=0; //Alternately, corrections=r.errorsCorrected + r.errors=0; + + if(bs.cardinality()==0){//Cannot be corrected + r.errors=r.bases.length; + return bs; + } + +// verbose=!bs.get(0); + if(verbose){ + Data.sysout.println(); + Data.sysout.println(toString(bs, r.bases.length)); + Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.bases.length)); + Data.sysout.println(toString(detectErrors(r, kca, k, thresh), r.bases.length-k+1)); + } + + + int lastloc=-99; + int burst=1; + while(!bs.get(0) && corrections\n"+toString(bs, r.bases.length));} + }else{ + r.errors=r.bases.length-bs.cardinality(); + r.errorsCorrected+=corrections; + if(verbose){System.err.println("Could not correct.");} + r.bases[errorLoc]='N'; + 
r.quality[errorLoc]=0; + return bs; + } + } + + burst=1; + while(bs.cardinality()\n"+toString(bs, r.bases.length));} + }else{ + r.errors=r.bases.length-bs.cardinality(); + r.errorsCorrected+=corrections; + r.bases[errorLoc]='N'; + r.quality[errorLoc]=0; + if(verbose){System.err.println("Could not correct.");} + return bs; + } + } + } + + r.errors=r.bases.length-bs.cardinality(); + r.errorsCorrected+=corrections; + assert(corrections<=maxCorrections); + return bs; + } + + + /** + * @param r + * @param kca + * @param k + * @param thresh + * @param bs + * @param errorLoc + * @return + */ + private static boolean correctFromLeft(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) { + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + final int setlen=k+gap; + final int startLoc=error-(setlen)+1; + final byte oldBase=r.bases[error]; + final byte[] bases=r.bases; + + final int minAdvance=Tools.min(MIN_ADVANCE, bases.length-error); + + long kmer=0; + int len=0; + for(int i=startLoc; i=minLoc; i--){ + if(!bs.get(i)){ + minLoc=i+1; + break; + } + } + } + + if(verbose){ + Data.sysout.println("correctFromRight. 
Error = "+error+", minloc="+minLoc); + Data.sysout.println(new String(r.bases)); + } + for(int bnum=0; bnum<4; bnum++){ + byte c=AminoAcid.numberToBase[bnum]; + bases[error]=c; + if(verbose){System.err.println("Considering "+(char)c);} + long key=kmer; + for(int loc=error; loc>=minLoc; loc--){ + c=bases[loc]; + int x=AminoAcid.baseToNumber[c]; + if(x<0){ + if(verbose){System.err.println("break: N");} + break; + } + key=((key>>2)|(((long)x)<max){ + max=array[i]; + maxIndex=i; + }else if(max==array[i]){ + maxIndex=-1; + } + } + return maxIndex; + } + + public static final String toString(BitSet bs, int len){ +// assert(verbose); + StringBuilder sb=new StringBuilder(len); + for(int i=0; i list){ + + for(int i=0; i0){ + if(r.mate==null || r.mate.errors>0){ + list.set(i, null); + } + } + } + + } + + public static boolean verbose=false; + /** Bails out if a read still has errors after correcting this many. */ + public static int ERROR_CORRECTION_LIMIT=6; + /** Max allowed number of nearby corrections. + * A long error burst indicates the read simply has low coverage, and is not being corrected correctly. */ + public static int MAX_ERROR_BURST=3; + /** Bursts have at most this distance between errors. E.G. '1' means errors are adjacent. */ + public static int BURST_THRESH=2; + /** Withhold uncorrectable reads from output. */ + public static boolean DONT_OUTPUT_BAD_READS=false; + /** Do not correct an error if it is at most this far from the next error. Instead, bail out. */ + public static int MIN_ADVANCE=1; + + /** Number of threads used for error correction. Does not control number of threads for creating the hash table. + * Additionally, up to 2 threads are used for reading and up to 2 for writing. For this (singlethreaded) class, the number does nothing. 
*/ + public static final int THREADS=1; + + /** Output correction data instead of the corrected read */ + public static boolean OUTPUT_INFO=false; + + +} diff --git a/current/jgi/ErrorCorrectMT.java b/current/jgi/ErrorCorrectMT.java new file mode 100755 index 0000000..acff60d --- /dev/null +++ b/current/jgi/ErrorCorrectMT.java @@ -0,0 +1,1819 @@ +package jgi; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; + +import kmer.KCountArray; +import kmer.KmerCount7MT; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; +import stream.SamLine; + +import dna.AminoAcid; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.FileFormat; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import align2.TrimRead; + +/** + * @author Brian Bushnell + * @date Aug 20, 2012 + * + */ +public class ErrorCorrectMT extends Thread{ + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + + if(args.length<1){throw new RuntimeException("No parameters.");} + + String reads1=args[0]; + String reads2=(args.length>1 ? 
args[1] : null); + if(reads2!=null && "null".equalsIgnoreCase(reads2)){reads2=null;} + + { + { + File f=new File(reads1); + if(!f.exists() || !f.isFile()){throw new RuntimeException(reads1+" does not exist.");} + } + if(reads2!=null){ + File f=new File(reads2); + if(!f.exists() || !f.isFile()){throw new RuntimeException(reads2+" does not exist.");} + if(reads1.equalsIgnoreCase(reads2)){ + throw new RuntimeException("Both input files are the same."); + } + } + } + + int k=31; + int cbits=2; + int gap=0; + int hashes=3; + int thresh1=2; + int thresh2=2; +// int matrixbits=-1; + long cells=-1; + long maxReads=-1; + int buildpasses=2; + long tablereads=-1; //How many reads to process when building the hashtable + int buildStepsize=4; + String output=null; + boolean ordered=true; + boolean overwrite=true; + int threads=-1; + + boolean auto=true; + + FastaReadInputStream.TARGET_READ_LEN=Integer.MAX_VALUE; + FASTQ.PARSE_CUSTOM=false; + + List extra=null; + + long memory=Runtime.getRuntime().maxMemory(); + long tmemory=Runtime.getRuntime().totalMemory(); +// assert(false) : memory+", "+tmemory; + + for(int i=2; i1 ? 
split[1] : "true"); + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.equals("initialthresh") || a.equals("thresh1")){ + thresh1=Integer.parseInt(b); + }else if(a.equals("thresh") || a.equals("thresh2") || a.equals("threshgood")){ + thresh2=Integer.parseInt(b); + }else if(a.equals("threshbad")){ + THRESH_BAD=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else if(a.startsWith("matrixbits")){ + int matrixbits=Integer.parseInt(b); + assert(matrixbits<63); + cells=1L<=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("trimleft")){ + TRIM_LEFT=Tools.parseBoolean(b); + }else if(a.equals("trimright")){ + TRIM_RIGHT=Tools.parseBoolean(b); + }else if(a.startsWith("trimq")){ + TRIM_QUAL=MAX_TRIM_QUAL=Byte.parseByte(b); + }else if(a.startsWith("mintrimq")){ + TRIM_QUAL=Byte.parseByte(b); + }else if(a.startsWith("maxtrimq")){ + MAX_TRIM_QUAL=Byte.parseByte(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("auto") || a.equals("automatic")){ + auto=Tools.parseBoolean(b); + }else if(a.equals("trybothsides")){ + TRY_BOTH_SIDES=Tools.parseBoolean(b); + }else if(a.equals("onlycorrectn")){ + ONLY_CORRECT_N=Tools.parseBoolean(b); + }else if(a.startsWith("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set 
FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + Data.sysout.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.startsWith("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.startsWith("canonical")){ + CANONICAL=KmerCount7MT.CANONICAL=Tools.parseBoolean(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.startsWith("extra")){ + if(b!=null && !b.equalsIgnoreCase("null")){ + extra=Arrays.asList(b.split(",")); + } + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); + if(k>31){CANONICAL=KmerCount7MT.CANONICAL=false;} + assert(CANONICAL==KmerCount7MT.CANONICAL); + + assert(THRESH_BAD0){ + THREADS=threads; + }else{ + THREADS=Data.LOGICAL_PROCESSORS; + } + assert(THREADS>0 && THREADS<100000); + + long cells2=cells; + if(auto && cells==-1){ + final long usable=(long)Tools.max(((memory-16000000)*.75), memory*0.45); + long mem=usable; + if(buildpasses>1){mem/=2;} + cells=(mem*8)/cbits; + cells2=cells; + +// long tablebytes=((1L<(1L<<(2*k))){cells=cells2=(1L<<(2*k));} + + Data.sysout.println("\nSettings:"); + Data.sysout.println("threads: \t"+THREADS); + Data.sysout.println("k: \t"+k); + Data.sysout.println("cbits: \t"+cbits); + Data.sysout.println("cells: \t"+Tools.toKMG(cells)); +// if(buildpasses>1){Data.sysout.println("cells2: \t"+Tools.toKMG(cells2));} + Data.sysout.println("hashes: \t"+hashes); + Data.sysout.println("passes: \t"+buildpasses); + 
Data.sysout.println("maxerrors: \t"+ERROR_CORRECTION_LIMIT); + Data.sysout.println("maxburst: \t"+MAX_ERROR_BURST); + if(buildpasses>1){ + Data.sysout.println("thresh1: \t"+thresh1); + Data.sysout.println("thresh2: \t"+thresh2); + }else{ + Data.sysout.println("thresh: \t"+thresh2); + } + Data.sysout.println("output bad reads: \t"+(!DONT_OUTPUT_BAD_READS)); + Data.sysout.println(); + +// KmerCount7MT.THREADS=Tools.max(THREADS/2, KmerCount7MT.THREADS); //Seems like 4 is actually optimal... + + FastaReadInputStream.MIN_READ_LEN=k; + + if(DONT_OUTPUT_BAD_PAIRS){DONT_OUTPUT_BAD_READS=true;} + + Timer t=new Timer(); + t.start(); +// assert(false) : cells+", "+cells2; + KCountArray kca=makeTable(reads1, reads2, extra, k, cbits, gap, hashes, buildpasses, cells, cells2, tablereads, buildStepsize, thresh1, thresh2); + + long bases=detect(reads1, reads2, kca, k, thresh2, maxReads, output, ordered, overwrite); + t.stop(); + Data.sysout.println("Total time: \t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec"); + + } + + public static KCountArray makeTable(String reads1, String reads2, Iterable extra, int k, int cbits, int gap, int hashes, int buildpasses, long cells1, + long cells2, long maxreads, int stepsize, int thresh1, int thresh2){ + + long extraMaxreads=-1; + + Timer thash=new Timer(); + + KmerCount7MT.maxReads=maxreads; + int kbits=Tools.max(2, Tools.min(2*k, 62)); + + long mcells=(buildpasses&1)==1 ? 
cells1 : cells2; + + thash.start(); +// Data.sysout.println("kbits="+(kbits)+" -> "+(1L< "+(1L<0){ + int slash=out1.lastIndexOf('/'); + String a=out1.substring(0, slash+1); + String b=out1.substring(slash+1); + outbad1=a+(b.replaceFirst("\\.", "_BAD.")); +// assert(false) : "\n"+a+"\n"+b+"\n"+outbad1+"\n"; + }else{ + outbad1=out1.replaceFirst("\\.", "_BAD."); + } +// assert(false) : outbad1; + + String outbad2=null; + if(out2!=null && out2.lastIndexOf('/')>0){ + int slash=out2.lastIndexOf('/'); + String a=out2.substring(0, slash+1); + String b=out2.substring(slash+1); + outbad2=a+(b.replaceFirst("\\.", "_BAD.")); +// assert(false) : "\n"+a+"\n"+b+"\n"; + }else if(out2!=null){ + outbad2=out2.replaceFirst("\\.", "_BAD."); + } + + FileFormat ffb1=FileFormat.testOutput(outbad1, FileFormat.FASTQ, OUTPUT_INFO_ONLY ? ".info" : null, true, overwrite, ordered); + FileFormat ffb2=FileFormat.testOutput(outbad2, FileFormat.FASTQ, OUTPUT_INFO_ONLY ? ".info" : null, true, overwrite, ordered); + ros=new RTextOutputStream3(ffb1, ffb2, buff, null, true); + } + } + + + if(ros!=null){ + ros.start(); + Data.sysout.println("Started output threads."); + } + if(rosbad!=null){ + rosbad.start(); + } + + long bases=detect(cris, kca, k, thresh, maxReads, ros, rosbad); + + ReadWrite.closeStreams(cris, ros, rosbad); + if(verbose){System.err.println("Closed stream");} + return bases; + } + + public static long detect(ConcurrentReadStreamInterface cris, KCountArray kca, int k, int thresh, long maxReads, RTextOutputStream3 ros, RTextOutputStream3 rosbad) { + Timer tdetect=new Timer(); + tdetect.start(); + + long covered=0; + long uncovered=0; + + long coveredFinal=0; + long uncoveredFinal=0; + + long fullyCorrected=0; + long failed=0; + + long errorsCorrected=0; + + long totalBases=0; + long totalReads=0; + long readsOut=0; + long basesOut=0; + long readsTrimmed=0; + long basesTrimmed=0; + + ProcessThread[] pta=new ProcessThread[THREADS]; + for(int i=0; i0){return detectErrorsSplit(r, kca, k, 
thresh);} + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + + int bslen=r.bases.length-k-gap+1; + if(bslen<1){return null;} //Read is too short to detect errors + BitSet bs=new BitSet(bslen); + + int len=0; + long kmer=0; + byte[] bases=r.bases; + for(int i=0; i=k){ + int count=kca.read(CANONICAL ? KCountArray.makeCanonical2(kmer, k) : kmer); + if(count>=thresh){ + bs.set(i+1-k); + } + } + } + } + return bs; + } + + public static BitSet detectErrorsBulk(final Read r, final KCountArray kca, final int k, final int thresh, final int stepsize/*, final int offset*/){ + if(ONLY_CORRECT_N){return detectNBulk(r);} + if(kca.gap>0){return detectErrorsSplit(r, kca, k, thresh);} + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + + if(r.bases==null || r.bases.length=k && ((len-k)%stepsize==0 || i==bases.length-1)){ + int count=kca.read(CANONICAL ? KCountArray.makeCanonical2(kmer, k) : kmer); + if(count>=thresh){ + bs.set(i+1-setlen, i+1); + } + } + } + } + + r.errors=r.bases.length-bs.cardinality(); + +// assert(bases.length==r.bases.length); + return bs; + } + + public static BitSet detectNBulk(final Read r){ + if(r.bases==null){return null;} //Read is too short to detect errors + BitSet bs=new BitSet(r.bases.length); + + for(int i=0; i0){throw new RuntimeException("TODO");} + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + + if(r.bases==null || r.bases.length=k && (i%detectStepsize==0 || i==bases.length-1)){ + int count=kca.read(CANONICAL ? KCountArray.makeCanonical2(kmer, k) : kmer); + if(count0){return detectErrorsSplit(r, kca, k, thresh);} + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + + if(r.bases==null || r.bases.length=k){ + int count=kca.read(CANONICAL ? 
KCountArray.makeCanonical2(kmer, k) : kmer); + if(count>=thresh){ + bs.set(i+1-setlen); + bs.set(i); + } + } + } + } + return bs; + } + + /** + * @param r + * @param kca + * @param k + * @param thresh + * @return + */ + private static BitSet detectErrorsSplit(Read r, KCountArray kca, int k, + int thresh) { + assert(false) : "TODO"; + return null; + } + + + /** Assumes bulk mode was used; e.g., any '0' bit is covered by no correct kmers */ + public static BitSet correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst){ + assert(!TRY_BOTH_SIDES); + if(kca.gap>0){assert(false) : "TODO";} + assert(!OUTPUT_INFO_ONLY) : "TODO: Outputting correction data is not yet supported."; + + int corrections=0; //Alternately, corrections=r.errorsCorrected + r.errors=0; + + int initialErrors=r.bases.length-bs.cardinality(); + if(initialErrors==0){return bs;} + + if(initialErrors>r.bases.length-k){//Cannot be corrected + r.errors=r.bases.length; + return bs; + } + + + byte[] bases0=Arrays.copyOf(r.bases, r.bases.length); + byte[] qual0=(r.quality==null ? 
null : Arrays.copyOf(r.quality, r.quality.length)); + +// verbose=!bs.get(0); + if(verbose){ + Data.sysout.println(); + Data.sysout.println(toString(bs, r.bases.length)); + Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.bases.length)); + Data.sysout.println(toString(detectErrors(r, kca, k, thresh), r.bases.length-k+1)); + } + + int lastloc=-99; + int burst=1; + while(!bs.get(0) && corrections0); + assert(initialErrors>corrections); +// Data.sysout.println("Could not correct."); +// return bs; + int errorLoc=bs.nextSetBit(0)-1;//Location to left of first '1' + if(Tools.absdif(errorLoc,lastloc)<=BURST_THRESH){burst++;} + else{burst=1;} + lastloc=errorLoc; + boolean success=(burst<=maxBurst) && correctFromRight(r, kca, k, thresh, bs, errorLoc); + if(success){ + corrections++; + bs=detectErrorsBulk(r, kca, k, thresh, 1); + if(verbose){System.err.println(">\n"+toString(bs, r.bases.length));} + }else{ + if(verbose){System.err.println("Could not correct.");} + + r.bases=bases0; + r.quality=qual0; + r.errors=initialErrors; + +// r.errors=r.bases.length-bs.cardinality(); +// r.errorsCorrected+=corrections; +// r.bases[errorLoc]='N'; +// r.quality[errorLoc]=0; + + return bs; + } + } + + burst=1; + while(bs.cardinality()0); + assert(initialErrors>corrections); + if(bs.get(0)){//First bit is a "1", can correct from the left + int errorLoc=bs.nextClearBit(0);//Location to left of first '0' + if(Tools.absdif(errorLoc,lastloc)<=BURST_THRESH){burst++;} + else{burst=1;} + lastloc=errorLoc; + boolean success=(burst<=maxBurst) && correctFromLeft(r, kca, k, thresh, bs, errorLoc); + if(success){ + corrections++; + bs=detectErrorsBulk(r, kca, k, thresh, 1); + if(verbose){System.err.println(">\n"+toString(bs, r.bases.length));} + }else{ + if(verbose){System.err.println("Could not correct.");} + + r.bases=bases0; + r.quality=qual0; + r.errors=initialErrors; + +// r.errors=r.bases.length-bs.cardinality(); +// r.errorsCorrected+=corrections; +// r.bases[errorLoc]='N'; 
+// r.quality[errorLoc]=0; + + return bs; + } + } + } + + if(corrections>=maxCorrections && bs.cardinality()0 || maxCorrections<1) : "\ncorrections="+corrections+", maxCorrections="+maxCorrections+",\n" + + "r.bases.length="+r.bases.length+", initialErrors="+initialErrors+", r.errors="+r.errors; + return bs; + } + + + + + /** Assumes bulk mode was used; e.g., any '0' bit is covered by no correct kmers. + * This function */ + public static BitSet correctErrorsBothSides(final Read r, final KCountArray kca, final int k, final int goodThresh, final int badThresh, BitSet bs, final int maxCorrections){ + +// verbose=r.numericID==405093; + + assert(goodThresh>badThresh) : goodThresh+", "+badThresh; + assert(goodThresh<=kca.maxValue) : goodThresh+", "+kca.maxValue+", "+kca.cellBits; + +// assert(false) : "TODO"; + if(kca.gap>0){assert(false) : "TODO";} + assert(!OUTPUT_INFO_ONLY) : "TODO: Outputting correction data is not yet supported."; + + int corrections=0; //Alternately, corrections=r.errorsCorrected + r.errors=0; + + byte[] bclone=r.bases.clone(); + + int initialErrors=r.bases.length-bs.cardinality(); + if(initialErrors>r.bases.length-k){//Cannot be corrected + r.errors=r.bases.length; + return bs; + } + if(initialErrors==0){return bs;} //Nothing to correct. 
+ + + int prevBlock=-1; + int blockStart=bs.nextClearBit(0); + int blockStop=bs.nextSetBit(blockStart)-1; + if(blockStop<0){blockStop=r.bases.length-1;} + +// verbose=!bs.get(0); + if(verbose){ + Data.sysout.println(); + Data.sysout.println(new String(r.bases)); + Data.sysout.println(toString(bs, r.bases.length)); + Data.sysout.println(toString(detectErrorsTips(r, kca, k, goodThresh), r.bases.length)); + Data.sysout.println(toString(detectErrors(r, kca, k, goodThresh), r.bases.length-k+1)); + Data.sysout.println("prevBlock="+prevBlock+", blockStart="+blockStart+", blockStop="+blockStop); + Data.sysout.println(); + } + + while(corrections=k+kca.gap){ + x+=correctFullyFromRight(r, kca, k, goodThresh, badThresh, bs, blockStop); + if(verbose){System.err.println("Right: "+x);} + } + } + + corrections+=x; + if(corrections>=blockStop-blockStart+1){//then the whole block was cleared + + }else{ + prevBlock=bs.nextClearBit(blockStart); + prevBlock=bs.nextSetBit(prevBlock)-1; + } + + blockStart=nextBlock; + blockStop=bs.nextSetBit(blockStart)-1; + if(blockStop<0){blockStop=r.bases.length-1;} + } + + + r.errors=0;//r.bases.length-bs.cardinality(); + r.errorsCorrected+=corrections; + assert(corrections<=initialErrors) : corrections+", "+initialErrors+"\n"+new String(r.bases)+"\n"+new String(bclone)+"\n"; +// assert(corrections<=maxCorrections); +// assert(corrections>0 || maxCorrections<1); + return bs; + } + + + + /** + * @param r + * @param kca + * @param k + * @param thresh + * @param bs + * @param errorLoc + * @return + */ + private static boolean correctFromLeft(Read r, KCountArray kca, int k, int thresh, BitSet bs, int error) { + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final int gap=kca.gap; + final int setlen=k+gap; + final int startLoc=error-(setlen)+1; + final byte oldBase=r.bases[error]; + final byte[] bases=r.bases; + + final int minAdvance=Tools.min(MIN_ADVANCE, bases.length-error); + + long kmer=0; + int len=0; + for(int i=startLoc; i=minLoc; 
i--){ + if(!bs.get(i)){ + minLoc=i+1; + break; + } + } + } + + if(verbose){ + Data.sysout.println("correctFromRight. Error = "+error+", minloc="+minLoc); + Data.sysout.println(new String(r.bases)); + } + for(int bnum=0; bnum<4; bnum++){ + byte c=AminoAcid.numberToBase[bnum]; + bases[error]=c; + if(verbose){System.err.println("Considering "+(char)c);} + long key=kmer; + for(int loc=error; loc>=minLoc; loc--){ + c=bases[loc]; + int x=AminoAcid.baseToNumber[c]; + if(x<0){ + if(verbose){System.err.println("break: N");} + break; + } + key=((key>>2)|(((long)x)<0 && !bs.get(minLoc)){minLoc--;} + if(bs.get(minLoc)){minLoc++;} + + if(verbose){ + Data.sysout.println("correctFullyFromRight. Error = "+error+", minloc="+minLoc); + Data.sysout.println(new String(bases)); + } + + boolean success=true; + int corrected=0; + for(int loc=error; loc>=minLoc; loc--){ + int lows=0, highs=0, zeros=0; + for(int bnum=0; bnum<4; bnum++){ + if(verbose){System.err.println("Considering "+(char)AminoAcid.numberToBase[bnum]);} + long key=kmer; + + if(bnum<0){ + if(verbose){System.err.println("break: N");} + break; + } + key=((key>>2)|(((long)bnum)<=threshGood){highs++;} + if(count==0){zeros++;} + counts[bnum]=count; + } + assert(zeros<=lows); + + if((highs==1 && lows==3) || (bases[loc]=='N' && zeros==3)){ + int x=Tools.maxIndex(counts); + if(verbose){System.err.println("Best="+x+": \t"+Arrays.toString(counts));} + bases[loc]=AminoAcid.numberToBase[x]; + if(r.quality!=null){r.quality[loc]=(byte)(20+zeros+(highs==1 ? 3 : 0));} + corrected++; + kmer=((kmer>>2)|(((long)x)<=bases.length){maxLoc=bases.length-1;} + if(bs.get(maxLoc)){maxLoc--;} + + if(verbose){ + Data.sysout.println("correctFullyFromLeft. 
Error = "+error+", minloc="+maxLoc); + Data.sysout.println(new String(bases)); + } + + boolean success=true; + int corrected=0; + for(int loc=error; loc<=maxLoc; loc++){ + +// Data.sysout.println(bs); +// Data.sysout.println(loc); +// Data.sysout.println(bases[loc]); +// Data.sysout.println(); + + int lows=0, highs=0, zeros=0; + for(int bnum=0; bnum<4; bnum++){ + if(verbose){System.err.println("Considering "+(char)AminoAcid.numberToBase[bnum]);} + long key=kmer; + + if(bnum<0){ + if(verbose){System.err.println("break: N");} + break; + } + key=((key<<2)|bnum)&mask; +// { +// String s=Long.toBinaryString(key); +// while(s.length()=threshGood){highs++;} + if(count==0){zeros++;} + counts[bnum]=count; + } + assert(zeros<=lows); +// assert(zeros>0); + + if((highs==1 && lows==3) || (bases[loc]=='N' && zeros==3)){ + assert(zeros<4); + int x=Tools.maxIndex(counts); + if(verbose){System.err.println("Best="+x+": \t"+Arrays.toString(counts));} + bases[loc]=AminoAcid.numberToBase[x]; + if(r.quality!=null){r.quality[loc]=(byte)(20+zeros+(highs==1 ? 
3 : 0));} + corrected++; + kmer=((kmer<<2)|x)&mask; + bs.set(loc); +// assert(highs==1) : "\nloc="+loc+", zeros="+zeros+", lows="+lows+", highs="+highs+ +// ", counts="+Arrays.toString(counts)+", bases["+loc+"]="+((char)bases[loc])+ +// "\n"+new String(bases)+"\n"+toString(bs, bases.length)+"\n"; + }else{ +// assert(zeros<3 || bases[bases.length-1]=='N') : "\nloc="+loc+", zeros="+zeros+", lows="+lows+", highs="+highs+ +// ", counts="+Arrays.toString(counts)+", bases["+loc+"]="+((char)bases[loc])+ +// "\n"+new String(bases)+"\n"+toString(bs, bases.length)+"\n"; +// assert(false) : threshGood+", "+threshBad; + success=false; + break; + } + } + + return corrected; + } + + /** returns index of highest value, if unique; else a negative number */ + private static int maxUniqueIndex(int[] array){ + int max=array[0]; + int maxIndex=0; + for(int i=1; imax){ + max=array[i]; + maxIndex=i; + }else if(max==array[i]){ + maxIndex=-1; + } + } + return maxIndex; + } + + public static final String toString(BitSet bs, int len){ +// assert(verbose); + StringBuilder sb=new StringBuilder(len); + for(int i=0; i removeBad(ArrayList list){ + + ArrayList bad=new ArrayList(); + + if(DONT_OUTPUT_BAD_PAIRS){ + for(int i=0; i0 || r.discarded()) || (r.mate!=null && (r.mate.errors>0 || r.mate.discarded()))){ + list.set(i, null); + bad.add(r); + } + } + }else{ + for(int i=0; i0 || r.discarded()) && (r.mate==null || (r.mate.errors>0 || r.mate.discarded()))){ + list.set(i, null); + bad.add(r); + } + } + } + + return bad; + } + + private static void trim(Read r, BitSet bs, byte minq, int maxTrim) { + if(bs==null){trim(r, minq, maxTrim);} + else{ + assert(false) : "TODO"; + trim(r, minq, maxTrim); + } + } + + private static int trim(Read r, byte minq, int maxTrim) { +// assert(r.bases.length>=MIN_LEN) : r.bases.length; + assert(maxTrim>0) : maxTrim; + if(maxTrim<1){return 0;} + +// Data.sysout.println(Arrays.toString(r.quality)); +// Data.sysout.println("TRIM_LEFT="+TRIM_LEFT+", 
TRIM_RIGHT="+TRIM_RIGHT+", minq="+minq+", TRIM_N="+TRIM_N+", TRIM_N_ONLY="+TRIM_N_ONLY); + + byte[] bases=r.bases; + byte[] quals=r.quality; + if(bases.length=MIN_LEN) : r.bases.length; + + if(TRIM_LEFT){ + for(int i=0; i=minsafe){break;} + } + } + } + while(left>maxTrim){left--;} + + safe=0; + + if(TRIM_RIGHT && left0; i--){ + byte b=bases[i]; + byte q=quals[i]; +// Data.sysout.println("Processing "+(char)b+" of q="+q); +// Data.sysout.println("safe="+safe+", minsafe="+minsafe+", right="+right); + if((b=='N' && TRIM_N) || (q=minsafe){break;} + } + } + } + + while(left+bases.length-right-1>maxTrim){right++;} + + if(left==0 && right==bases.length-1){ + //do nothing + return 0; + }else if(right-left+10 && right-left+1=MIN_LEN || r.discarded()); + return bases.length-r.bases.length; + } + + + private static class ProcessThread extends Thread{ + + ProcessThread(ConcurrentReadStreamInterface cris_, KCountArray kca_, int k_, int thresh_, RTextOutputStream3 ros_, RTextOutputStream3 rosbad_){ + cris=cris_; + kca=kca_; + k=k_; + thresh=thresh_; + ros=ros_; + rosbad=rosbad_; + } + + public void run(){ + detect(); + } + + void detect() { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + + while(reads!=null && reads.size()>0){ + for(Read r : reads){ + Read r2=r.mate; +// assert(ec==0); //*** for testing +// +// boolean cor=false; + { + +// if(r.numericID==23){verbose=true;} + + int initialErrorsCorrected=r.errorsCorrected; + int initialLen=r.bases.length; + totalReads++; + if(verbose){System.err.println();} + totalBases+=r.bases.length; +// BitSet bs=detectErrors(r, kca, k, thresh); + BitSet bs=detectErrorsBulk(r, kca, k, thresh, 1); + if(verbose){System.err.println(ErrorCorrectMT.toString(bs, r.bases.length));} +// Data.sysout.println(toString(detectErrorsTips(r, kca, k, thresh), r.bases.length)); + if(verbose){System.err.println(ErrorCorrectMT.toString(detectErrors(r, kca, k, thresh), r.bases.length-k+1));} + if(bs==null){//can't detect errors +// assert(false); + r.setDiscarded(true); + }else{ + int initialCorrect=bs.cardinality(); + covered+=initialCorrect; + uncovered+=(r.bases.length-initialCorrect); + if(initialCorrect0 || initialLen>r.bases.length); + }else{ + failed++; + assert(errorsNewlyCorrected==0 || TRY_BOTH_SIDES); + } + }else{ + assert(errorsNewlyCorrected==0 || initialLen>r.bases.length); + } + } + } + if(r2!=null){ +// assert(false); //*** + int initialErrorsCorrected=r2.errorsCorrected; + int initialLen=r2.bases.length; + totalReads++; + totalBases+=r2.bases.length; +// BitSet bs=detectErrors(r2, kca, k, thresh); + BitSet bs=detectErrorsBulk(r2, kca, k, thresh, 1); + if(verbose){System.err.println(ErrorCorrectMT.toString(bs, r2.bases.length));} +// Data.sysout.println(toString(detectErrorsTips(r2, kca, k, thresh), r2.bases.length)); + if(verbose){System.err.println(ErrorCorrectMT.toString(detectErrors(r2, kca, k, thresh), r2.bases.length-k+1));} + if(bs==null){//can't detect errors + r.setDiscarded(true); + }else{ + int initialCorrect=bs.cardinality(); + covered+=initialCorrect; + uncovered+=(r2.bases.length-initialCorrect); + if(initialCorrect0 || initialLen>r2.bases.length); + }else{ + failed++; + 
assert(errorsNewlyCorrected==0); + } + }else{ + assert(errorsNewlyCorrected==0 || initialLen>r2.bases.length); + } + } + } + } + + + if(DONT_OUTPUT_BAD_READS){ + ArrayList bad=removeBad(reads); + if(rosbad!=null){ + rosbad.add(bad, ln.id); + } + } + for(Read r : reads){ + if(r!=null){ + Read r2=r.mate; + readsOut++; + basesOut+=(r.bases==null ? 0 : r.bases.length); + r.obj=null; + assert(r.bases!=null); + if(r.sites!=null && r.sites.isEmpty()){r.sites=null;} + + if(r2!=null){ + readsOut++; + basesOut+=(r2.bases==null ? 0 : r2.bases.length); + r2.obj=null; + assert(r2.bases!=null); + if(r2.numSites()==0){r2.sites=null;} + } + } + } + //System.err.println("Adding list of length "+readlist.size()); + if(ros!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. + ros.add(reads, ln.id); + } + + + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + } + + + /** Assumes bulk mode was used; e.g., any '0' bit is covered by no correct kmers */ + private BitSet correctErrors(final Read r, final KCountArray kca, final int k, final int thresh, BitSet bs, final int maxCorrections, final int maxBurst, boolean trim){ + if(r.discarded()){r.setDiscarded(false);} + + if(!trim){return ErrorCorrectMT.correctErrors(r, kca, k, thresh, bs, maxCorrections, maxBurst);} + + byte q=TRIM_QUAL; + + byte[] qual=r.quality; + byte[] bases=r.bases; + int initialErrors=r.errors; + int initialCorrected=r.errorsCorrected; +// Data.sysout.println("A"); + assert(initialErrors>=0) : initialErrors; +// Data.sysout.println("initialErrors = "+initialErrors+", errors = "+r.errors+", corrected = "+r.errorsCorrected+", discarded = "+r.discarded()); + bs=ErrorCorrectMT.correctErrors(r, kca, k, thresh, bs, maxCorrections, maxBurst); + + 
int maxtrim=Tools.min(MAX_TRIM_BASES, (r.bases.length*1)/4); + for(int i=0; r.errors>0 && i<20 && q<=MAX_TRIM_QUAL && !r.discarded() && maxtrim>0; i++){ +// Data.sysout.println("B"); +// Data.sysout.println("Errors = "+r.errors+", corrected = "+r.errorsCorrected+", discarded = "+r.discarded()); + int x=trim(r, q, maxtrim); + maxtrim-=x; +// Data.sysout.println("trimmed: "+x); +// Data.sysout.println("Errors = "+r.errors+", corrected = "+r.errorsCorrected+", discarded = "+r.discarded()); + if(x>0){ +// Data.sysout.println("C"); + bs=detectErrorsBulk(r, kca, k, thresh, 1); +// Data.sysout.println("Errors = "+r.errors+", corrected = "+r.errorsCorrected+", discarded = "+r.discarded()); + bs=ErrorCorrectMT.correctErrors(r, kca, k, thresh, bs, maxCorrections, maxBurst); +// Data.sysout.println("Errors = "+r.errors+", corrected = "+r.errorsCorrected+", discarded = "+r.discarded()); + } + q=(byte) (q+2); + } +// Data.sysout.println("D"); +// Data.sysout.println("Errors = "+r.errors+", corrected = "+r.errorsCorrected+", discarded = "+r.discarded()); + + if(r.errors>0 || r.discarded()){ +// Data.sysout.println("E"); + r.bases=bases; + r.quality=qual; + r.errors=initialErrors; + r.errorsCorrected=initialCorrected; + }else{ +// Data.sysout.println("F"); + int x=bases.length-r.bases.length; + if(x>0){ +// Data.sysout.println("G"); + readsTrimmed++; + basesTrimmed+=x; + assert(x<=Tools.min(MAX_TRIM_BASES, (bases.length*3)/8)) : x+", "+bases.length+", "+r.bases.length+", "+Tools.min(MAX_TRIM_BASES, (bases.length*3)/8); +// assert(r.bases.length>38); + } + } +// Data.sysout.println("H"); +// Data.sysout.println("Errors = "+r.errors+", corrected = "+r.errorsCorrected+", discarded = "+r.discarded()); + return bs; + } + + private final ConcurrentReadStreamInterface cris; + private final KCountArray kca; + private final int k; + private final int thresh; + private final RTextOutputStream3 ros; + private final RTextOutputStream3 rosbad; + + long covered=0; + long uncovered=0; + + long 
coveredFinal=0; + long uncoveredFinal=0; + + long fullyCorrected=0; + long readsTrimmed=0; + long basesTrimmed=0; + long failed=0; + long readsOut=0; + long basesOut=0; + + long totalBases=0; + long totalReads=0; + + long errorsCorrected=0; + } + + public static boolean verbose=false; + /** Bails out if a read still has errors after correcting this many. */ + public static int ERROR_CORRECTION_LIMIT=4; + /** Max allowed number of nearby corrections. + * A long error burst indicates the read simply has low coverage, and is not being corrected correctly. */ + public static int MAX_ERROR_BURST=2; + /** Bursts have at most this distance between errors. E.G. '1' means errors are adjacent. */ + public static int BURST_THRESH=2; + /** Max frequency of kmer for an alternate (not chosen) base in TRY_BOTH_SIDES mode.*/ + public static int THRESH_BAD=0; + /** Withhold uncorrectable reads from output. */ + public static boolean DONT_OUTPUT_BAD_READS=false; + /** Withhold uncorrectable reads from output, if either the read or its mate is bad. */ + public static boolean DONT_OUTPUT_BAD_PAIRS=false; + /** Do not correct an error if it is at most this far from the next error. Instead, bail out. */ + public static int MIN_ADVANCE=1; + +// /** Trim bases of this quality or below when trimming the left side of a read */ +// public static byte TRIM_QUAL_LEFT=6; +// /** Trim bases of this quality or below when trimming the left side of a read */ +// public static byte TRIM_QUAL_RIGHT=6; + + /** Trim bases of this quality or below when trimming a read */ + public static byte TRIM_QUAL=5; + /** Trim bases of up to this quality if a read still has errors after trimming at a lower quality */ + public static byte MAX_TRIM_QUAL=15; + + public static boolean TRIM_LEFT=false; + public static boolean TRIM_RIGHT=false; + public static boolean TRIM_N=true; + public static boolean TRIM_N_ONLY=false; + public static boolean CANONICAL=false; + + /** Number of threads used for error correction. 
Does not control number of threads for creating the hash table. + * Additionally, up to 2 threads are used for reading and up to 2 for writing. */ + public static int THREADS=8; + + /** Output correction data instead of the corrected read. */ + public static boolean OUTPUT_INFO_ONLY=false; + /** Only detect/correct N, not called bases. Mainly for filling read middles. */ + public static boolean ONLY_CORRECT_N=false; + /** When an uncorrectable error is encountered, don't bail out, but instead try from the other side. This mode is far faster. */ + public static boolean TRY_BOTH_SIDES=false; + + public static int MIN_LEN=45; + public static int MIN_LEN_2=35; + + public static int MAX_TRIM_BASES=60; + +} diff --git a/current/jgi/FakeReads.java b/current/jgi/FakeReads.java new file mode 100755 index 0000000..665dd75 --- /dev/null +++ b/current/jgi/FakeReads.java @@ -0,0 +1,430 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; + +import dna.AminoAcid; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ByteFile1; +import fileIO.ByteFile2; +import fileIO.ReadWrite; +import fileIO.FileFormat; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +/** + * @author Brian Bushnell + * @date Sep 11, 2012 + * + */ +public class FakeReads { + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + FakeReads rr=new FakeReads(args); + rr.process(t); + } + + public FakeReads(String[] args){ + if(args==null || args.length==0){ + printOptions(); + System.exit(0); + } + + for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} + outstream.println("Executing "+getClass().getName()+" 
"+Arrays.toString(args)+"\n"); + + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + + FastaReadInputStream.SPLIT_READS=false; + stream.FastaReadInputStream.MIN_READ_LEN=1; + Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH); + Shared.READ_BUFFER_NUM_BUFFERS=Tools.min(2, Shared.READ_BUFFER_NUM_BUFFERS); + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + + for(int i=0; i1 ? split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("passes")){ + assert(false) : "'passes' is disabled."; +// passes=Integer.parseInt(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + ByteFile2.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + ConcurrentGenericReadInputStream.verbose=verbose; +// align2.FastaReadInputStream2.verbose=verbose; + stream.FastqReadInputStream.verbose=verbose; + ReadWrite.verbose=verbose; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("addspacer") || a.equals("addspace") || a.equals("usespacer")){ + addSpacer=Tools.parseBoolean(b); + }else if(a.equals("reads") || 
a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("t") || a.equals("threads")){ + Shared.THREADS=Tools.max(Integer.parseInt(b), 1); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){ + out1=b; + }else if(a.equals("out2") || a.equals("output2")){ + out2=b; + }else if(a.equals("identifier") || a.equals("id")){ + identifier=b; + }else if(a.equals("qfin") || a.equals("qfin1")){ + qfin1=b; + }else if(a.equals("qfout") || a.equals("qfout1")){ + qfout1=b; + }else if(a.equals("qfout2")){ + qfout2=b; + }else if(a.equals("extin")){ + extin=b; + }else if(a.equals("extout")){ + extout=b; + }else if(a.equals("trd") || a.equals("trc") || a.equals("trimreaddescription")){ + Shared.TRIM_READ_COMMENTS=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("fastareadlen") || a.equals("fastareadlength")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.equals("fastaminread") || a.equals("fastaminlen") || a.equals("fastaminlength")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + 
qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("tossbrokenreads") || a.equals("tbr")){ + boolean x=Tools.parseBoolean(b); + Read.NULLIFY_BROKEN_QUALITY=x; + ConcurrentGenericReadInputStream.REMOVE_DISCARDED_READS=x; + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){ + int x=Integer.parseInt(b); + stream.FastaReadInputStream.MIN_READ_LEN=(x>0 ? 
x : Integer.MAX_VALUE); + }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){ + minReadLength=Integer.parseInt(b); + }else if(a.equals("length")){ + desiredLength=Integer.parseInt(b); + }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + in1=arg; + if(arg.indexOf('#')>-1 && !new File(arg).exists()){ + in1=b.replace("#", "1"); +// in2=b.replace("#", "2"); + } + }else if(out1==null && i==1 && !arg.contains("=")){ + out1=arg; + }else{ + System.err.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(identifier==null){identifier="";} + else{identifier=identifier+"_";} + + if(!addSpacer){spacer="";} + +// if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ +// in2=in1.replace("#", "2"); +// in1=in1.replace("#", "1"); +// } + if(out1!=null && out2==null && out1.indexOf('#')>-1){ + out2=out1.replace("#", "2"); + out1=out1.replace("#", "1"); + } +// if(in2!=null){ +// if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} +// FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; +// } + + assert(FastaReadInputStream.settingsOK()); + + if(in1==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.THREADS>2){ +// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;} + ByteFile.FORCE_MODE_BF2=true; + } + + if(out1==null){ + if(out2!=null){ + printOptions(); + throw new RuntimeException("Error - cannot define out2 without defining out1."); + } + out1="stdout"; + } + + if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} + if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;} + + if(!Tools.testOutputFiles(overwrite, false, out1, out2)){ + throw 
new RuntimeException("\n\nOVERWRITE="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n"); + } + + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, false); + ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, false); + + ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); + } + + void process(Timer t){ + + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, colorspace, false, ffin1, null, qfin1, null); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));} + + RTextOutputStream3 ros=null; + if(out1!=null){ + final int buff=4; + + if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){ + outstream.println("Writing interleaved."); + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name."; + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(out1))) : "out1 and out2 have same name."; + + ros=new RTextOutputStream3(ffout1, ffout2, qfout1, qfout2, buff, null, false); + ros.start(); + } + + long readsProcessed=0; + long basesProcessed=0; + + { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); + } + + while(reads!=null && reads.size()>0){ + ArrayList fake=new ArrayList(reads.size()); + + for(int idx=0; idx jgi.FakeReads in= out= out2="); + outstream.println("\nout2 is optional. \nIf output is paired and there is only one output file, it will be written interleaved.\n"); + outstream.println("Other parameters and their defaults:\n"); + outstream.println("overwrite=false \tOverwrites files that already exist"); + outstream.println("ziplevel=5 \tSet compression level, 1 (low) to 9 (max)"); + outstream.println("interleaved=false\tDetermines whether input file is considered interleaved"); + outstream.println("fastawrap=100 \tLength of lines in fasta output"); + outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto"); + outstream.println("qout=auto \tASCII offset for output quality. 
May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)"); + } + + + /*--------------------------------------------------------------*/ + + public boolean errorState=false; + + public String identifier=null; + + private String in1=null; + + private boolean addSpacer=true; + private String spacer=" "; + + private String qfin1=null; + + private String out1=null; + private String out2=null; + + private String qfout1=null; + private String qfout2=null; + + private String extin=null; + private String extout=null; + + private boolean overwrite=false; + private boolean colorspace=false; + + private long maxReads=-1; + private int minReadLength=1; + private int desiredLength=250; + + private byte qin=-1; + private byte qout=-1; + + private final FileFormat ffin1; + + private final FileFormat ffout1; + private final FileFormat ffout2; + + + /*--------------------------------------------------------------*/ + + private PrintStream outstream=System.err; + public static boolean verbose=false; + +} + diff --git a/current/jgi/FastaQualToFastq.java b/current/jgi/FastaQualToFastq.java new file mode 100755 index 0000000..22d8b81 --- /dev/null +++ b/current/jgi/FastaQualToFastq.java @@ -0,0 +1,109 @@ +package jgi; + +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Oct 25, 2012 + * + */ +public class FastaQualToFastq extends Thread { + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + for(int i=0; i")){ + if(print){ + tsw.print("@"); + tsw.println(header); + sbf.append('\n'); + tsw.print(sbf); + sbf=new StringBuilder(sbf.length()); + + String s2=tfq.nextLine(); + s2=tfq.nextLine(); + String[] qs=s2.toString().split(" "); + + StringBuilder sbq=new StringBuilder(qs.length+3); + sbq.append('+').append('\n'); + for(int i=0; i0){ +// 
System.out.println(header); +// System.out.println(sbf); + tsw.print("@"); + tsw.println(header); + sbf.append('\n'); + tsw.print(sbf); + sbf=new StringBuilder(sbf.length()); + + String s2=tfq.nextLine(); + s2=tfq.nextLine(); +// System.out.println(s2); + String[] qs=s2.toString().split(" "); + StringBuilder sbq=new StringBuilder(qs.length+3); + sbq.append('+').append('\n'); + for(int i=0; i extraFiles=new ArrayList(); + + for(int i=3; i1 ? split[1] : "true"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("reads")){ + readsOut=Long.parseLong(b); + }else if(a.equals("readlen")){ + MIN_LEN=Integer.parseInt(b); + MIN_LEN_2=(MIN_LEN*2+2)/3; + }else if(a.contains("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + }else if(a.contains("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + }else if(a.contains("testkmer") || a.contains("kmertest")){ + TEST_KMERS=Tools.parseBoolean(b); + }else if(a.startsWith("rcomp") || a.startsWith("reverse")){ + RCOMP=Tools.parseBoolean(b); + }else if(a.equals("k") || a.equals("kmer") || a.equals("kmerlen")){ + K=Integer.parseInt(b); + }else if(a.equals("cbits")){ + CBITS=Integer.parseInt(b); + }else if(a.equals("hashes")){ + hashes=Integer.parseInt(b); + }else if(a.equals("passes")){ + passes=Integer.parseInt(b); + }else if(a.equals("matrixbits")){ + matrixbits=Integer.parseInt(b); + }else if(a.equals("minq") || a.equals("minquality")){ + MIN_QUALITY_READ=Byte.parseByte(b); + MIN_QUALITY_KMER=Byte.parseByte(b); + }else if(a.startsWith("minqr") || a.startsWith("minqualityr")){ + MIN_QUALITY_READ=Byte.parseByte(b); + }else if(a.startsWith("minqk") || a.startsWith("minqualityk")){ + MIN_QUALITY_KMER=Byte.parseByte(b); + }else if(a.startsWith("minavgq")){ + MIN_AVG_QUALITY=Byte.parseByte(b); + }else if(a.startsWith("maxerr")){ + MAX_EXPECTED_ERRORS=Float.parseFloat(b); + }else if(a.equals("lowthresh") 
|| a.equals("lowkmerthresh")){ + LOW_KMER_THRESH=Integer.parseInt(b); + }else if(a.startsWith("extrafile")){ + String[] sp2=b.split(","); + for(String s : sp2){ + extraFiles.add(s); + } + }else if(a.startsWith("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + }else if(a.equals("sections")){ + sections=Integer.parseInt(b); + assert(sections>0); + }else{ + throw new RuntimeException(args[i]+"\n"+a+"\n"+b); + } + } + if(readsOut<1){readsOut=Long.MAX_VALUE;} + if(sections>1){assert(outName.contains("SECTION#"));} + + KCountArray kca=null; + if(TEST_KMERS){ + Timer ht=new Timer(); + ht.start(); + + long maxreads=(readsOut==Long.MAX_VALUE ? -1 : Tools.max(readsOut, readsOut*8*sections)); + KmerCount6MT.maxReads=maxreads; + int kbits=2*K; + matrixbits=Tools.min(kbits, matrixbits); + + kca=KmerCount6MT.makeKca(fname1, fname2, extraFiles, K, CBITS, 0, matrixbits, hashes, MIN_QUALITY_KMER, RCOMP, maxreads, passes, 1, 1, 2); + + KmerCount6MT.printStatistics(kca); + MASK=~((-1L)<<(kbits)); + ht.stop(); + System.out.println("Finished hashing "+KmerCount4.keysCounted+" kmers."); + System.out.println("Hashing time: "+ht); + + } + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(fname1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(fname2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(-1, false, true, ff1, ff2); + Thread th=new Thread(cris); + th.start(); + } + +// assert(false) : cris.getClass(); + long kept=0; + + boolean sf=outName.contains("SECTION#"); + TextStreamWriter tswAll1=sf ? new TextStreamWriter(outName.replace("SECTION#", "ALL").replaceFirst("#", "1"), true, false, true) : null; + TextStreamWriter tswAll2=sf && cris.paired() ? new TextStreamWriter(outName.replace("SECTION#", "ALL").replaceFirst("#", "2"), true, false, true) : null; + TextStreamWriter tswBad1=sf ? 
new TextStreamWriter(outName.replace("SECTION#", "BAD").replaceFirst("#", "1"), true, false, true) : null; + TextStreamWriter tswBad2=sf && cris.paired() ? new TextStreamWriter(outName.replace("SECTION#", "BAD").replaceFirst("#", "2"), true, false, true) : null; + + if(tswAll1!=null){ + (tswAll1).start(); + tswAll1.print("#"+Read.header()+"\n"); + } + if(tswAll2!=null){ + (tswAll2).start(); + tswAll2.print("#"+Read.header()+"\n"); + } + if(tswBad1!=null){ + (tswBad1).start(); + tswBad1.print("#"+Read.header()+"\n"); + } + if(tswBad2!=null){ + (tswBad2).start(); + tswBad2.print("#"+Read.header()+"\n"); + } + + for(int i=1; i<=sections; i++){ + long x=process(cris, outName.replace("SECTION#", i+""), readsOut, kca, tswAll1, tswAll2, tswBad1, tswBad2); + kept+=x; + if(x<1){break;} + } + ReadWrite.closeStream(cris); + + if(tswAll1!=null){tswAll1.poison();} + if(tswAll2!=null){tswAll2.poison();} + if(tswBad1!=null){tswBad1.poison();} + if(tswBad2!=null){tswBad2.poison();} + + System.out.println("Removed reads: \t"+removed+String.format(" \t%.2f%%", removed*100d/(removed+kept))); + System.out.println("Removed short reads: \t"+removedShort); + System.out.println("Removed low avg quality reads: \t"+removedAvgQuality); + System.out.println("Removed low min quality reads: \t"+removedMinQuality); + System.out.println("Removed N-containing reads: \t"+removedNocall); + System.out.println("Removed error-prone reads: \t"+removedErrors); + System.out.println("Removed low-kmer reads: \t"+removedKmer); + + t.stop(); + System.out.println("Time:\t"+t); + } + + + public static long process(ConcurrentReadStreamInterface cris, String outfile, final long readsOut, final KCountArray kca2, + final TextStreamWriter tswAll1, final TextStreamWriter tswAll2, final TextStreamWriter tswBad1, final TextStreamWriter tswBad2){ + + + ListNum ln=cris.nextList(); + if(ln==null || ln.list==null || ln.list.isEmpty()){return 0;} + ArrayList reads=(ln!=null ? 
ln.list : null); + + TextStreamWriter tsw1=new TextStreamWriter(outfile.replaceFirst("#", "1"), true, false, true); + tsw1.start(); + tsw1.print("#"+Read.header()+"\n"); + TextStreamWriter tsw2=cris.paired() ? new TextStreamWriter(outfile.replaceFirst("#", "2"), true, false, true) : null; + if(tsw2!=null){ + assert(!tsw1.fname.equalsIgnoreCase(tsw2.fname)); + tsw2=(tsw2); + tsw2.start(); + tsw2.print("#"+Read.header()+"\n"); + } + + + long readsKept=0; + + while(reads!=null && reads.size()>0 && readsKept=MIN_LEN_2 && (r.mate==null || (r.mate.bases!=null && r.bases.length>=MIN_LEN_2))){ + r.setDiscarded(false); + assert(r!=null); + assert(r.toText(false)!=null); + StringBuilder sb=r.toText(false).append('\n'); + if(tswBad1!=null){tswBad1.print(sb);} + if(r.mate!=null){ + r.mate.setDiscarded(false); + sb=r.mate.toText(false).append('\n'); + if(tswBad2==null){if(tswBad1!=null){tswBad1.print(sb);}} + else{tswBad2.print(sb);} + } + } + } + if(readsKept>=readsOut){break;} + } + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } +// System.out.println("Returning last list."); + cris.returnList(ln, ln.list.isEmpty()); +// System.out.println("Closing."); +// cris.close(); +// System.out.println("Closed."); + +// ln=cris.nextList(); +// for(int i=0; i<100; i++){ +// ln=cris.nextList(); +// ln.list.clear(); +// System.out.println("Returned extra list."); +// cris.returnList(ln, true); +// } + + tsw1.poison(); + if(tsw2!=null){tsw2.poison();} + + if(tsw1.isAlive()){ + try { + synchronized(tsw1){ + tsw1.join(); + } + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + if(tsw2!=null && tsw2.isAlive()){ + try { + synchronized(tsw2){ + tsw2.join(); + } + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + return readsKept; + } + + + public static void trim(Read r) { + assert(r.bases.length>=MIN_LEN) : r.bases.length; + byte[] bases=r.bases; + byte[] quals=r.quality; + if(bases.length=MIN_LEN) : r.bases.length; + for(int i=0; i=minsafe){break;} + } + } + + safe=0; + + for(int i=bases.length-1; i>0; i--){ + byte b=bases[i]; + byte q=quals[i]; + if(b=='N' || q=minsafe){break;} + } + } + + if(right-left+1=MIN_LEN || r.discarded()); + } + + + public static boolean passesFilter(Read r, KCountArray kca2){ + boolean b=passesFilter2(r, kca2) && (r.mate==null || passesFilter2(r.mate, kca2)); + if(!b){removed++;} + return b; + } + + public static boolean passesFilter2(Read r, KCountArray kca2){ + + if(r.bases==null || r.bases.lengthMAX_N){removedNocall++; return false;} + final float errors=r.expectedErrors(); + if(errors>MAX_EXPECTED_ERRORS){removedErrors++; return false;} + + if(r.discarded()){removedShort++; return false;} //Removed for some unknown reason; probably too short. 
+ + if(TEST_KMERS && kca2!=null){ + int len=0; + long kmer=0; + int low=0, low2=0, high=0; + final int lim2=Tools.min(LOW_KMER_THRESH+3, kca2.maxValue-1); + byte[] bases=r.bases; + for(int i=0; i=K){ + int count=kca2.read(kmer); + if(count<=LOW_KMER_THRESH){low++;} + if(count<=lim2){low2++;} + else{high++;} + } + } + } + if(low>=MIN_LOW_KMERS_TO_TOSS){removedKmer++; return false;} + if(high<1){removedKmer++; return false;} //Too many 'N' to make a useful read + if(low2>0 && (r.expectedErrors()>0.20f || n>0 || minq<=14)){ + return false; + } + } + + return true; + } + + + public static int MIN_LEN=60; + public static int MIN_LEN_2=45; + public static int MAX_N=1; + public static int MIN_AVG_QUALITY=19; + + /** Quality for trimming/keeping reads */ + public static byte MIN_QUALITY_READ=6; + + /** Quality for generating trusted kmers */ + public static byte MIN_QUALITY_KMER=11; + + public static float MAX_EXPECTED_ERRORS=1.2f; + public static boolean TEST_KMERS=true; + public static int K=17; + public static int CBITS=2; + private static long MASK=0; + private static boolean RCOMP=true; + + /** Kmers with this frequency or less are considered "low" */ + public static int LOW_KMER_THRESH=1; + /** A read must have at least this many low kmers to toss it. Generally, a read with an error should have multiple low kmers, unless they are at the tip. */ + public static int MIN_LOW_KMERS_TO_TOSS=2; + + //TODO: RARE stuff is not implemented + + /** Kmers with at least RARE_KMER_MIN_THRESH and at most RARE_KMER_MAX_THRESH are considered "rare" */ + public static int RARE_KMER_MIN_THRESH=8; + /** Kmers with at least RARE_KMER_MIN_THRESH and at most RARE_KMER_MAX_THRESH are considered "rare" */ + public static int RARE_KMER_MAX_THRESH=32; + /** A read must have at least this many more rare kmers than low kmers to retain it despite being suspect. 
*/ + public static int MIN_RARE_KMERS_TO_KEEP=1; + public static boolean RETAIN_RARE_KMERS=false; + + private static long removedShort=0; + private static long removedAvgQuality=0; + private static long removedMinQuality=0; + private static long removedNocall=0; + private static long removedErrors=0; + private static long removedKmer=0; + private static long removed=0; + + +} diff --git a/current/jgi/FindString.java b/current/jgi/FindString.java new file mode 100755 index 0000000..7f34f47 --- /dev/null +++ b/current/jgi/FindString.java @@ -0,0 +1,25 @@ +package jgi; + +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Jun 18, 2013 + * + */ +public class FindString { + + public static void main(String[] args){ + String fname=args[0]; + TextFile tf=new TextFile(fname, true, false); + for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ + boolean b=false; + for(int i=1; i0){ + if(args.length==2 && Character.isDigit(args[1].charAt(0))){ + byte[] s=args[0].getBytes(); + int b=Integer.parseInt(args[1]); + int len=prefixForInfoBits(s, b); + if(len<0){ + System.out.println("Input string only contains "+String.format("%.2f",infoInBitsDouble(s, 0, s.length))+" bits."); + }else{ + System.out.println("Prefix needed for "+b+" bits is length "+len+": "+args[0].substring(0, len)); +// assert(false) : "TODO: This is clearly broken."; + } + }else{ + for(String s : args){ + printInfo(s); + System.out.println(); + } + } + System.exit(0); + } + + System.out.println(); + printInfo(""); + System.out.println(); + printInfo("A"); + System.out.println(); + printInfo("AG"); + System.out.println(); + printInfo("AGT"); + System.out.println(); + printInfo("AANAA"); + System.out.println(); + printInfo("GGGGGGGCGGG"); + System.out.println(); + printInfo("CGGGGGGGGGG"); + System.out.println(); + printInfo("AGTCAGTCCTAGNGTACGT"); + System.out.println(); + printInfo("AGTCAGTCAGTCAGTC"); + System.out.println(); + printInfo("GCGCGCGCGCGCGCGC"); + System.out.println(); + + 
String[] s=new String[] {"A", "G", "C", "T", ""}; + for(int i=0; i<40; i++){ + System.out.println(); + s[4]=s[4]+s[i%4]; + printInfo(s[4]); + } + + System.out.println("PrefixForBits for AAAATATATGAAATGCATGCAATATGTTATGAAA"); + for(int i=0; i<60; i+=2){ + System.out.println(i+"\t"+prefixForInfoBits("AAAATATATGAAATGCATGCAATATGTTATGAAA".getBytes(), i)); + } + + + System.out.println("PrefixForBits for GCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGC"); + for(int i=0; i<60; i+=2){ + System.out.println(i+"\t"+prefixForInfoBits("GCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCGC".getBytes(), i)); + } + + + System.out.println("PrefixForBits for ACGTACGTACGTACGTACGTACGTACGTACGTAC"); + for(int i=0; i<63; i+=2){ + System.out.println(i+"\t"+prefixForInfoBits("ACGTACGTACGTACGTACGTACGTACGTACGTAC".getBytes(), i)); + } + } + + public static void printInfo(String s){ + long r=info(s); + double bits=Math.log(r)/Math.log(2); + System.out.println(s+"\nlen="+s.length()+" \tinfo = "+String.format("%.2f", bits)+" bits. \t("+r+")"); + } + + public static long info(String s){ + return info(s.getBytes(), 0, s.length()); + } + + public static int infoInBits(final byte[] array, final int from, final int len){return 63-Long.numberOfLeadingZeros(info(array, from, len));} + public static double infoInBitsDouble(final byte[] array, final int from, final int len){return Math.log(info(array, from, len))*invlog2;} + public static long info(final byte[] array){return info(array, 0, array.length);} + public static long info(final byte[] array, final int from, final int len){ + short[] counts=new short[4]; + long r=1; + int used=0; + for(int i=from, lim=min(from+len, array.length); i "); + byte num=baseToNumber[array[i]]; +// System.out.println(num); + if(num>=0){ + counts[num]++; + used++; + + if(used>32 && used>MAX/r){//overflow +// System.out.println("***"); + return MAX; + } + r=r*used; + + /* alternate method */ +// long temp=r*used; +// if(used>32 && temp/used!=r){//overflow +// return MAX; +// } +// r=temp; + + 
r=r/counts[num]; + } + } + return r; + } + + public static int prefixForInfoBits(final byte[] array, final int bits){assert(bits>=0 && bits<63);return prefixForInfo(array, 1L<=0 && bits<63);return prefixForInfo(array, 1L<=0); + short[] counts=new short[4]; + long r=1; + int used=0; + int i=from; + for(; i "); + byte num=baseToNumber[array[i]]; +// System.out.println(num); + if(num>=0){ + counts[num]++; + used++; + + if(used>32 && used>MAX/r){//overflow +// System.out.println("***"); + return i; + } + r=r*used; + + /* alternate method */ +// long temp=r*used; +// if(used>32 && temp/used!=r){//overflow +// return MAX; +// } +// r=temp; + + r=r/counts[num]; +// +// { +// String s=new String(array).substring(0, i+1); +// System.out.println("\n"+s); +// System.out.println("For len "+i+": r="+r+", bits="+(63-Long.numberOfLeadingZeros(r))+"\t->\t"+(Math.log(r)*invlog2)); +// System.out.println(infoInBitsDouble(s.getBytes(), 0, i+1)); +// System.out.println(info(s.getBytes(), 0, i+1)); +// } + } + } + return ry ? 
x : y;} + + private static final long MAX=Long.MAX_VALUE; + private static final double invlog2=1.0/Math.log(2); +} diff --git a/current/jgi/KmerCoverage.java b/current/jgi/KmerCoverage.java new file mode 100755 index 0000000..dab1513 --- /dev/null +++ b/current/jgi/KmerCoverage.java @@ -0,0 +1,1227 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.lang.Thread.State; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.atomic.AtomicLong; + +import kmer.KCountArray; +import kmer.KmerCount7MTA; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; + +import align2.ListNum; +import align2.Tools; +import dna.AminoAcid; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.FileFormat; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Oct 11, 2012 + * + */ +public class KmerCoverage { + + public static void main(String[] args){ + for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} + outstream.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + + if(args.length<1){throw new RuntimeException("No parameters.");} + + String in1=(args[0].indexOf("=")>0 ? null : args[0]); + String in2=(in1!=null && args.length>1 ? 
args[1] : null); + if(in2!=null && "null".equalsIgnoreCase(in2)){in2=null;} + + { + if(in1!=null && !in1.contains(",")){ + File f=new File(in1); + if(!f.exists() || !f.isFile()){throw new RuntimeException(in1+" does not exist.");} + } + if(in2!=null && !in2.contains(",")){ + File f=new File(in2); + if(!f.exists() || !f.isFile()){throw new RuntimeException(in2+" does not exist.");} + if(in1.equalsIgnoreCase(in2)){ + throw new RuntimeException("Both input files are the same."); + } + } + } + + FASTQ.PARSE_CUSTOM=false; + KmerCount7MTA.minQuality=4; + KmerCount7MTA.minProb=0.1f; + + int k=31; + int cbits=16; + int gap=0; + int hashes=4; +// int matrixbits=-1; + long cells=-1; + long maxReads=-1; + int buildpasses=1; + long tablereads=-1; //How many reads to process when building the hashtable + int buildStepsize=4; + String output=null; + int prehashes=-1; + long precells=-1; + String histFile=null; + int threads=-1; + + int minq=KmerCount7MTA.minQuality; + KmerCount7MTA.CANONICAL=true; + + boolean auto=true; + + FastaReadInputStream.TARGET_READ_LEN=Integer.MAX_VALUE; + FASTQ.PARSE_CUSTOM=false; + + List extra=null; + + long memory=Runtime.getRuntime().maxMemory(); + long tmemory=Runtime.getRuntime().totalMemory(); +// assert(false) : memory+", "+tmemory; + + for(int i=(in1==null ? 0 : 1); i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.equals("in") || a.equals("in1")){ + in1=b; + }else if(a.equals("in2")){ + in2=b; + }else if(a.startsWith("bits") ||a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("histlen") ||a.startsWith("histogramlen")){ + HIST_LEN_PRINT=Tools.min(Integer.MAX_VALUE, Long.parseLong(b)+1); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else if(a.startsWith("matrixbits")){ + int matrixbits=Integer.parseInt(b); + assert(matrixbits<63); + cells=1L<0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("samplewhenreadingtable") || a.equals("sampleoutput")){ + DONT_SAMPLE_OUTPUT=!Tools.parseBoolean(b); + }else if(a.equals("dontsamplewhenreadingtable") || a.equals("dontsampleoutput")){ + DONT_SAMPLE_OUTPUT=Tools.parseBoolean(b); + }else if(a.startsWith("kmersample")){ + kmersamplerate=Integer.parseInt(b); + KmerCount7MTA.kmersamplerate=kmersamplerate; + }else if(a.startsWith("sample") || a.startsWith("readsample")){ + readsamplerate=Integer.parseInt(b); + KmerCount7MTA.readsamplerate=readsamplerate; + }else if(a.startsWith("canonical")){ + CANONICAL=KmerCount7MTA.CANONICAL=Tools.parseBoolean(b); + }else if(a.startsWith("fixspikes")){ + FIX_SPIKES=Tools.parseBoolean(b); + }else if(a.equals("printzerocoverage") || a.equals("pzc")){ + PRINT_ZERO_COVERAGE=Tools.parseBoolean(b); + }else if(a.equals("removeduplicatekmers") || a.equals("rdk")){ + KmerCount7MTA.KEEP_DUPLICATE_KMERS=!Tools.parseBoolean(b); + }else 
if(a.startsWith("extra")){ + if(b!=null && !b.equalsIgnoreCase("null")){ + if(new File(b).exists()){ + extra=new ArrayList(); + extra.add(b); + }else{ + extra=Arrays.asList(b.split(",")); + } + } + }else if(i>2 || (!args[i].equals(in1) && !args[i].equals(in2))){ +// assert(false) : "\n"+i+"\n"+args[i]+"\n"+reads1; + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); + if(k>31){CANONICAL=KmerCount7MTA.CANONICAL=false;} + assert(CANONICAL==KmerCount7MTA.CANONICAL); + +// if(output!=null && reads1.contains(",")){ +// throw new RuntimeException("\nLists of input files can only be used with histogram output, not full output.\n" + +// "Please set output=null or move the extra input files to 'extra=file1,file2,...fileN'"); +// } + + { + if(histFile==null){ + + + }else{ + USE_HISTOGRAM=true; + } + + final int maxCount=(int)(cbits>16 ? Integer.MAX_VALUE : (1L<0); + HIST_LEN_PRINT=Tools.max(1, Tools.min(HIST_LEN_PRINT, maxCount)); + assert(HIST_LEN_PRINT<=Integer.MAX_VALUE) : HIST_LEN_PRINT+", "+Integer.MAX_VALUE; + HIST_LEN=(int)Tools.min(maxCount, Tools.max(HIST_LEN_PRINT, HIST_LEN)); + + histogram_total=new long[HIST_LEN]; + } + + if(extra!=null){ + for(String s : extra){ + File f=new File(s); + if(!f.exists() || !f.isFile()){throw new RuntimeException(s+" does not exist.");} + assert(!s.equalsIgnoreCase(in1) && (in2==null || !s.equalsIgnoreCase(in2))) : "\nInput file "+s+" should not be included as an extra file.\n"; + } + } + +// outstream.println("ForceInterleaved = "+FASTQ.FORCE_INTERLEAVED); + +// assert(false) : reads1+", "+reads2+", "+output; +// if(FASTQ.FORCE_INTERLEAVED && in2==null){ +// outstream.println() +// } + + if(threads<=0){ + if(auto){THREADS=Data.LOGICAL_PROCESSORS;} + else{THREADS=8;} + }else{ + THREADS=threads; + } +// KmerCount7MTA.THREADS=Tools.min(THREADS,6); + KmerCount7MTA.THREADS=THREADS; + +// System.err.println("THREADS="+THREADS+", KmerCount7MTA.THREADS="+KmerCount7MTA.THREADS); 
+ + if(auto && cells==-1){ + final long usable=(long)Tools.max(((memory-96000000)*.73), memory*0.45); + long mem=usable-(USE_HISTOGRAM ? (HIST_LEN*8*(THREADS+1)) : 0); + if(buildpasses>1){mem/=2;} + cells=(mem*8)/cbits; +// +// long tablebytes=((1L<0 && prehashes>0 ? Tools.toKMG(precells) : "?")); + outstream.println("prefilter hashes: \t"+(precells>0 && prehashes>0 ? ""+prehashes : "?")); + } + outstream.println("base min quality: \t"+KmerCount7MTA.minQuality); + outstream.println("kmer min prob: \t"+KmerCount7MTA.minProb); + + outstream.println(); + outstream.println("remove duplicates:\t"+!KmerCount7MTA.KEEP_DUPLICATE_KMERS); + outstream.println("fix spikes: \t"+FIX_SPIKES); + if(USE_HISTOGRAM && HIST_LEN>0){ + outstream.println("histogram length: \t"+(USE_HISTOGRAM ? HIST_LEN : 0)); + } + if(histFile!=null){ + outstream.println("print zero cov: \t"+PRINT_ZERO_COVERAGE); + } + + outstream.println(); + } + + if(!prefilter && k<32 && cells>(1L<<(2*k))){cells=(1L<<(2*k));} + assert(cells>0); + +// KmerCount7MTA.THREADS=Tools.max(THREADS/2, KmerCount7MTA.THREADS); //Seems like 4 is actually optimal... 
+ + FastaReadInputStream.MIN_READ_LEN=k; + + Timer t=new Timer(); + Timer ht=new Timer(); + t.start(); + ht.start(); + KCountArray kca; + KCountArray prefilterArray=null; + outstream.println(); + if(prefilter){ + prefilterArray=KmerCount7MTA.makeKca(in1, in2, extra, k, 2, gap, precells, prehashes, minq, true, tablereads, 1, buildStepsize, 1, 1, null); + outstream.println("Made prefilter: \t"+prefilterArray.toShortString(prehashes)); + } + kca=KmerCount7MTA.makeKca(in1, in2, extra, k, cbits, gap, cells, hashes, minq, true, tablereads, buildpasses, buildStepsize, 2, 2, prefilterArray); + ht.stop(); + + outstream.println("Made hash table: \t"+kca.toShortString(hashes)); + + long estUnique; + outstream.println(); + if(prefilterArray!=null){ + int lim1=prefilterArray.maxValue, lim2=prefilterArray.maxValue+1; + double a=prefilterArray.estimateUniqueKmers(prehashes); + double b=kca.estimateUniqueKmers(hashes, lim2); + a=a-b; + if(CANONICAL){ +// a=(a*KCountArray.canonMask)/(KCountArray.canonMask+1); +// b=(b*KCountArray.canonMask)/(KCountArray.canonMask+1); + }else{ + a/=2; + b/=2; + } + estUnique=((long)((a+b))); + outstream.println("Estimated kmers of depth 1-"+lim1+": \t"+(long)a); + outstream.println("Estimated kmers of depth "+lim2+"+ : \t"+(long)b); + }else{ +// double est=kca.cells*(1-Math.pow(1-Math.sqrt(kca.usedFraction()), 1.0/hashes)); +// double est=kca.cells*(1-Math.pow(1-kca.usedFraction(), 1.0/hashes)); + double est=kca.estimateUniqueKmers(hashes); +// System.out.println("Used cells: "+kca.cellsUsed(1)); + if(CANONICAL){ +// est=(est*KCountArray.canonMask)/(KCountArray.canonMask+1); + }else{ + est/=2; + } + estUnique=((long)((est))); + + } + outstream.println("Estimated unique kmers: \t"+estUnique);//+", or "+estUnique+" counting forward kmers only."); +// outstream.println("(Includes forward and reverse kmers)"); + outstream.println(); + outstream.println("Table creation time:\t\t"+ht);//+" \t"+String.format("%.2f", totalBases*1000000.0/(ht.elapsed))+" 
kb/sec"); + + long bases=0; + + if(in1!=null && in1.contains(",") && !new File(in1).exists()){ + String[] list1=in1.split(","); + String[] list2=(in2==null ? null : in2.split(",")); + bases=count(list1, list2, kca, k, maxReads, output, ordered, overwrite, histFile, estUnique); + }else{ + bases=count(in1, in2, kca, k, maxReads, output, ordered, overwrite, histFile, estUnique); + } + printTopology(); + + t.stop(); + outstream.println("\nTotal time: \t\t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec"); + + } + + + public static void printTopology(){ + long total=peaks.get()+spikes.get()+flats.get()+valleys.get()+slopes.get(); + double mult=100.0/total; + + long sp=spikes.get(); + long pe=peaks.get(); + long va=valleys.get(); + long sl=slopes.get(); + long fl=flats.get(); + double dsp=mult*sp; + double dpe=mult*pe; + double dva=mult*va; + double dsl=mult*sl; + double dfl=mult*fl; + + System.err.println("\nDepth Topology\t"); + System.err.println("Spikes: \t\t\t"+(dsp<10 ? " " : "")+String.format("%.3f%% \t%d",dsp,sp)); + System.err.println("Peaks: \t\t\t"+(dpe<10 ? " " : "")+String.format("%.3f%% \t%d",dpe,pe)); + System.err.println("Valleys: \t\t\t"+(dva<10 ? " " : "")+String.format("%.3f%% \t%d",dva,va)); + System.err.println("Slopes: \t\t\t"+(dsl<10 ? " " : "")+String.format("%.3f%% \t%d",dsl,sl)); + System.err.println("Flats: \t\t\t"+(dfl<10 ? 
" " : "")+String.format("%.3f%% \t%d",dfl,fl)); + } + + + public static long count(String reads1, String reads2, KCountArray kca, int k, long maxReads, + String output, boolean ordered, boolean overwrite, String histFile, long estUnique) { + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + Thread th=new Thread(cris); + th.start(); + } + + assert(cris!=null) : reads1; + + if(fileIO.FileFormat.hasFastaExtension(reads1)){ + ADD_CARROT=false; + } + + new Thread(cris).start(); + if(verbose){System.err.println("Started cris");} + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + final RTextOutputStream3 ros; + if(output!=null){ + final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS)); + + String out1=output.replaceFirst("#", "1"); + String out2=null; + + if(cris.paired()){ + if(output.contains("#")){ + out2=output.replaceFirst("#", "2"); + }else{ + outstream.println("Writing interleaved."); + } + } + + assert(!out1.equalsIgnoreCase(reads1) && !out1.equalsIgnoreCase(reads1)); + assert(out2==null || (!out2.equalsIgnoreCase(reads1) && !out2.equalsIgnoreCase(reads2))); + + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, OUTPUT_ATTACHMENT ? "attachment" : null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, OUTPUT_ATTACHMENT ? 
"attachment" : null, true, overwrite, ordered); + ros=new RTextOutputStream3(ff1, ff2, buff, null, true); + + ros.start(); + outstream.println("Started output threads."); + }else{ + ros=null; + } + + long bases=calcCoverage(cris, kca, k, maxReads, ros, histFile, overwrite, estUnique); + + ReadWrite.closeStreams(cris, ros); + if(verbose){System.err.println("Closed stream");} + return bases; + } + + + public static long count(String[] list1, String[] list2, KCountArray kca, int k, long maxReads, + String output, boolean ordered, boolean overwrite, String histFile, long estUnique) { + + RTextOutputStream3 ros=null; + String[] out1=null, out2=null; + + + final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS)); + if(output!=null){ + if(!new File(output).exists()){ + out1=output.split(","); + }else{ + out1=new String[] {output}; + } + out2=new String[out1.length]; + for(int i=0; i1){ + if(ros!=null){ + ReadWrite.closeStream(ros); + } + + FileFormat ff1=FileFormat.testOutput(out1[x], FileFormat.FASTQ, OUTPUT_ATTACHMENT ? "attachment" : null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2[x], FileFormat.FASTQ, OUTPUT_ATTACHMENT ? "attachment" : null, true, overwrite, ordered); + ros=new RTextOutputStream3(ff1, ff2, buff, null, true); + + ros.start(); + outstream.println("Started output threads."); + }else{ + ros.resetNextListID(); + } + } + + String reads1=list1[x]; + String reads2=(list2==null || list2.length<=x ? 
null : list2[x]); + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + if(verbose){System.err.println("Started cris");} + Thread th=new Thread(cris); + th.start(); + if(ff1.fasta()){ADD_CARROT=false;} + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + bases+=calcCoverage(cris, kca, k, maxReads, ros, histFile, overwrite, estUnique); + + ReadWrite.closeStream(cris); + if(verbose){System.err.println("Closed stream");} + + } + + //Wait until threads finish! + + ReadWrite.closeStream(ros); + + return bases; + } + + + + public static long calcCoverage(ConcurrentReadStreamInterface cris, KCountArray kca, int k, long maxReads, RTextOutputStream3 ros, + String histFile, boolean overwrite, long estUnique) { + Timer tdetect=new Timer(); + tdetect.start(); + + long totalBases=0; + long totalReads=0; + +// assert(false) : THREADS; + ProcessThread[] pta=new ProcessThread[THREADS]; + for(int i=0; i1){ + histogram_total[1]+=histogram_total[0]; + histogram_total[0]=0; + } + +// outstream.println(); + tdetect.stop(); + outstream.println("Table read time: \t\t"+tdetect+" \t"+String.format("%.2f", totalBases*1000000.0/(tdetect.elapsed))+" kb/sec"); + outstream.println("Total reads: \t\t"+totalReads); + outstream.println("Total bases: \t\t"+totalBases); +// outstream.println(); + if(histogram_total!=null){ + TextStreamWriter tswh=null; + StringBuilder sb=new StringBuilder(100); + if(USE_HISTOGRAM){ + tswh=new TextStreamWriter(histFile, overwrite, false, false); + tswh.start(); + tswh.print("#Depth\tRaw_Count\tUnique_Kmers\n"); + } + int lim=(int)(HIST_LEN_PRINT-1); + long remaining=Tools.sum(histogram_total); + long sumRaw1=0; + long sumRaw2=0; + long sum1=0; + long sum2=0; + long 
sumsquare=0; + for(int i=0; i0 || y>0){ + sb.append(i).append('\t'); + sb.append(x).append('\t'); + sb.append(y).append('\n'); + } + tswh.print(sb.toString()); + sb.setLength(0); + } + if(sumRaw1>=remaining){break;} //Stop once there is no more coverage, even if PRINT_ZERO_COVERAGE is not set. + } + for(int i=lim; i0 || sum2>0){ + sb.append(lim).append('\t'); + sb.append(sumRaw2).append('\t'); + sb.append(sum2).append('\n'); + } + tswh.print(sb.toString()); + tswh.poison(); + tswh.waitForFinish(); + outstream.println("Wrote histogram to "+histFile); + } + + long histCount=Tools.sum(histogram_total); //Total number of kmers counted + long halfCount=(histCount+1)/2; + double histCountU=0; //Unique kmers counted + long temp1=0; + double temp2=0; + int median_all=-1; + int median_unique=-1; + for(int i=0; i=halfCount && median_all<0){median_all=i;} +// histSum+=(x*(double)i); + histCountU+=(x/(double)Tools.max(1, i)); + } + double halfCount2=(histCountU)/2; + for(int i=0; i=halfCount2 && median_unique<0){ + median_unique=i; + break; + } + } + if(median_all<0){median_all=0;} + double avg_all=sumsquare/(double)histCount; + double avg_unique=histCount/histCountU; + double stdev_unique=Tools.standardDeviationHistogramKmer(histogram_total); + double stdev_all=Tools.standardDeviationHistogram(histogram_total); + outstream.println("Total kmers counted: \t"+(sumRaw1+sumRaw2)); + + double uniqueC=((sum1+sum2)*100.0/(sumRaw1+sumRaw2)); + double uniqueE=((estUnique)*100.0/(sumRaw1+sumRaw2)); + double uniqueM=Tools.max(uniqueC, uniqueE); + outstream.println("Total unique kmer count: \t"+(sum1+sum2)); + if(CANONICAL){outstream.println("Includes forward kmers only.");} + outstream.println("The unique kmer estimate can be more accurate than the unique count, if the tables are very full."); + outstream.println("The most accurate value is the greater of the two."); + outstream.println(); + + outstream.println("Percent unique: \t"+(uniqueM<10 ? 
" " : "")+String.format("%.2f%%", uniqueM)); + + outstream.println("Depth average: \t"+String.format("%.2f\t(unique kmers)", avg_unique)); + outstream.println("Depth median: \t"+String.format("%d\t(unique kmers)", median_unique)); + outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(unique kmers)", stdev_unique)); + + outstream.println("\nDepth average: \t"+String.format("%.2f\t(all kmers)", avg_all)); + outstream.println("Depth median: \t"+String.format("%d\t(all kmers)", median_all)); + outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(all kmers)", stdev_all)); + } + + return totalBases; + } + + + + /** + * Locates and fixes spikes in a coverage profile (potentially) caused by false positives in a bloom filter. + * Theory: If a high-count kmer is adjacent on both sides to low-count kmers, it may be a false positive. + * It could either be reduced to the max of the two flanking points or examined in more detail. + * @param array An array of kmer counts for adjacent kmers in a read. 
+ */ + private static void fixSpikes(int[] array){ + + for(int i=1; i1 && b>a && b>c){ + //peak + if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + //spike + array[i]=(int)Tools.max(a, c); + } + } + } + } + private static void fixSpikes(int[] array, long[] kmers, KCountArray kca, int k){ + if(array.length<3){return;} + if(array[1]-array[0]>1){ + array[0]=kca.readPrecise(kmers[0], k, CANONICAL); + } + if(array[array.length-1]-array[array.length-2]>1){ + array[array.length-1]=kca.readPrecise(kmers[array.length-1], k, CANONICAL); + } + + for(int i=1; i1){ + long a=Tools.max(1, array[i-1]); + long c=Tools.max(1, array[i+1]); + long key=kmers[i]; + + if(b>a && b>c){ + //peak + if(b<6 || b>a+1 || b>c+1){ + array[i]=kca.readPreciseMin(key, k, CANONICAL); + } + // if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + // //spike + // int b1=(int)((a+c)/2); + // int b2=kca.readLeft(key, k, CANONICAL); + // int b3=kca.readRight(key, k, CANONICAL); + // array[i]=Tools.min(b, b1, b2, b3); + // } + // else + // { + //// array[i]=kca.readPreciseMin(key, k, CANONICAL); + // } + } + // else + // if(Tools.max(ada, adc)>=Tools.max(2, Tools.min((int)a, b, (int)c)/4)){ + // array[i]=kca.readPrecise(key, k, CANONICAL); + // } + // else + // if(b>a+1 || b>c+1){ + // //steep + // array[i]=kca.readPrecise(key, k, CANONICAL); + // } + } + } + } + + + private static void analyzeSpikes(int[] array, int width){ + if(array.length<3){return;} + int peakcount=0, valleycount=0, spikecount=0, flatcount=0, slopecount=0; + for(int i=1; ia && b>c){ + peakcount++; + if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + spikecount++; + } + }else if(b0){peaks.addAndGet(peakcount);} + if(valleycount>0){valleys.addAndGet(valleycount);} + if(spikecount>0){spikes.addAndGet(spikecount);} + if(flatcount>0){flats.addAndGet(flatcount);} + if(slopecount>0){slopes.addAndGet(slopecount);} + } + + + /** + * @param r + * @param kca + * @return + */ + public static int[] generateCoverage(Read r, KCountArray kca, int k) { + 
if(k>31){return generateCoverageLong(r, kca, k);} + if(kca.gap>0){throw new RuntimeException();} + if(r==null || r.bases==null || r.bases.length=k){ +// int count=kca.readPrecise(kmer, k, CANONICAL); + int count=kca.read(kmer, k, CANONICAL); + out[i-k+1]=count; + if(kmers!=null){kmers[i-k+1]=kmer;} + } + } + } + }else{ + out=new int[(r.bases.length-k+1+(kmersamplerate-1))/kmersamplerate]; + Arrays.fill(out, -1); + for(int i=0; i=k && i%kmersamplerate==0){ +// int count=kca.readPrecise(kmer, k, CANONICAL); + int count=kca.read(kmer, k, CANONICAL); + out[(i-k+1)/kmersamplerate]=count; + if(kmers!=null){kmers[(i-k+1)/kmersamplerate]=kmer;} + } + } + } + } + if(FIX_SPIKES){fixSpikes(out, kmers, kca, k);} +// fixSpikes(out, 1); + + analyzeSpikes(out, 1); + return out; + } + + + + /** + * @param r + * @param kca + * @return + */ + public static int[] generateCoverageLong(Read r, KCountArray kca, int k) { +// assert(false) : "todo"; +// assert(k>31); + if(kca.gap>0){throw new RuntimeException();} + if(r==null || r.bases==null || r.bases.lengthk){ + long x2=AminoAcid.baseToNumber[bases[i-k]]; + kmer=kmer^(x2<=k){ + int count=kca.read(kmer); + out[i-k+1]=count; + } + } + } + }else{ + out=new int[(r.bases.length-k+1+(kmersamplerate-1))/kmersamplerate]; + Arrays.fill(out, -1); + for(int i=0; ik){ + long x2=AminoAcid.baseToNumber[bases[i-k]]; + kmer=kmer^(x2<=k && i%kmersamplerate==0){ + int count=kca.read(kmer); + out[(i-k+1)/kmersamplerate]=count; + } + } + } + } + fixSpikes(out); + + analyzeSpikes(out, 1); + return out; + } + + + private static class ProcessThread extends Thread{ + + ProcessThread(ConcurrentReadStreamInterface cris_, KCountArray kca_, int k_, RTextOutputStream3 ros_){ + cris=cris_; + kca=kca_; + k=k_; + ros=ros_; + } + + public void run(){ + countInThread(); + } + + void countInThread() { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + + while(reads!=null && reads.size()>0){ + for(int rnum=0; rnum=0){hist[x]++;} + } + } + return cov; + } + } + + private CharSequence toFastaString(Read r){ + if(r.bases==null || r.bases.length0 || MIN_AVERAGE>0){r.setDiscarded(true);} + if(USE_HEADER){ + return (ADD_CARROT ? ">" : "")+r.id+";0;0 0 0 0 0\n"+r.bases==null ? "" : new String(r.bases); + }else{ + return (ADD_CARROT ? ">" : "")+r.id+"\n"+(r.bases==null ? "" : new String(r.bases))+"\n0\n0 0 0 0 0"; + } + }else{ + totalBases+=r.bases.length; + + int[] cov=generateCoverage(r, kca, k); + + if(hist!=null){ + for(int i=0; i');} + sb.append(r.id).append(';'); + + int min=cov[0], max=cov[0], sum=0; + for(int i=0; i');} + sb.append(r.id).append('\n'); + sb.append(new String(r.bases)).append('\n'); + + int min=cov[0], max=cov[0], sum=0; + for(int i=0; i0 || MIN_AVERAGE>0){r.setDiscarded(true);} + sb.append("\n0\n0 0 0 0 0"); + return sb; + }else{ + totalBases+=r.bases.length; + + int[] cov=generateCoverage(r, kca, k); + + if(hist!=null){ + for(int i=0; i=0) : i+", "+cov[i]+", "+HIST_LEN; + hist[x]++; + } + } + sb.append('\n'); + + int min=cov[0], max=cov[0], sum=0; + for(int i=0; i0 ? null : args[0]); + String in2=(in1!=null && args.length>1 ? 
args[1] : null); + if(in2!=null && "null".equalsIgnoreCase(in2)){in2=null;} + + { + if(in1!=null && !in1.contains(",")){ + File f=new File(in1); + if(!f.exists() || !f.isFile()){ + in1=null; +// throw new RuntimeException(reads1+" does not exist."); + } + } + if(in2!=null && !in2.contains(",")){ + File f=new File(in2); + if(!f.exists() || !f.isFile()){ + in2=null; +// throw new RuntimeException(reads2+" does not exist."); + }else if(in1.equalsIgnoreCase(in2)){ + throw new RuntimeException("Both input files are the same."); + } + } + } + + FASTQ.PARSE_CUSTOM=false; + KmerCount7MTA.minQuality=5; + KmerCount7MTA.minProb=0.5f; + + int k=31; + int cbits=32; + int cbits1=-1; + int gap=0; + int hashes=3; +// int matrixbits=-1; + long cells=-1; + long maxReads=-1; + int buildpasses=1; + long tablereads=-1; //How many reads to process when building the hashtable + int buildStepsize=4; + String outKeep=null; + String outToss=null; + String outLow=null, outMid=null, outHigh=null, outUnc=null; + int prehashes=-1; + long precells=-1; + String histFile=null; + String histFileOut=null; + int threads=-1; + ReadWrite.ZIPLEVEL=2; + + int minq=KmerCount7MTA.minQuality; + KmerCount7MTA.CANONICAL=true; + + + int targetDepthF=TARGET_DEPTH_F; + int targetDepth1=TARGET_DEPTH_1; + int maxDepth=MAX_DEPTH; + int minDepth=MIN_DEPTH; + int minKmersOverMinDepth=MIN_KMERS_OVER_MIN_DEPTH; + float depthPercentile=DEPTH_PERCENTILE; + + int passes=2; + boolean tossErrorReadsF=TOSS_ERROR_READS_F; + boolean tossErrorReads1=TOSS_ERROR_READS_1; + boolean discardBadOnlyF=DISCARD_BAD_ONLY_F; + boolean discardBadOnly1=DISCARD_BAD_ONLY_1; + boolean fixSpikes=FIX_SPIKES; + float highPercentile=HIGH_PERCENTILE; + float lowPercentile=LOW_PERCENTILE; + int errorDetectRatio=ERROR_DETECT_RATIO; + int hthresh=HTHRESH; + int lthresh=LTHRESH; + boolean countup=COUNTUP; + boolean rbb=REQUIRE_BOTH_BAD; + byte qin=-1, qout=-1; + + + boolean auto=true; + + FastaReadInputStream.TARGET_READ_LEN=Integer.MAX_VALUE; + 
FASTQ.PARSE_CUSTOM=false; + + List extra=null; + + long memory=Runtime.getRuntime().maxMemory(); + long tmemory=Runtime.getRuntime().totalMemory(); +// assert(false) : memory+", "+tmemory; + + for(int i=(in1==null ? 0 : 1); i=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("trimright")){ + TRIM_RIGHT=Tools.parseBoolean(b); + }else if(a.equals("trimleft")){ + TRIM_LEFT=Tools.parseBoolean(b); + }else if(a.equals("trimq") || a.equals("trimquality")){ + TRIM_QUALITY=Byte.parseByte(b); + }else if(a.equals("minlength") || a.equals("minlen") || a.equals("ml")){ + minlength=Integer.parseInt(b); + }else if(a.equals("q102matrix") || a.equals("q102m")){ + CalcTrueQuality.q102matrix=b; + }else if(a.equals("qbpmatrix") || a.equals("bqpm")){ + CalcTrueQuality.qbpmatrix=b; + }else if(a.equals("adjustquality") || a.equals("adjq")){ + TrimRead.ADJUST_QUALITY=Tools.parseBoolean(b); + }else if(a.equals("bits") ||a.equals("cbits") || a.equals("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.equals("bits1") ||a.equals("cbits1") || a.equals("cellbits1")){ + cbits1=Integer.parseInt(b); + }else if(a.equals("histlen") ||a.equals("histogramlen")){ + HIST_LEN_PRINT=Tools.min(Integer.MAX_VALUE, Long.parseLong(b)+1); + }else if(a.equals("gap")){ + gap=Integer.parseInt(b); + }else if(a.equals("matrixbits")){ + int matrixbits=Integer.parseInt(b); + assert(matrixbits<63); + cells=1L<=1 && passes<=4) : "Passes should be in range 1~4."; + }else if(a.equals("1pass") || a.equals("1p")){ + passes=1; + }else if(a.equals("2pass") || a.equals("2p")){ + passes=2; + }else if(a.equals("buildpasses")){ + buildpasses=Integer.parseInt(b); + }else if(a.equals("printcoverage")){ + assert(false) : "This is not the program you are looking for. Try KmerCoverage. 
Move along."; + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("threads") || a.equals("t")){ + threads=Integer.parseInt(b); + }else if(a.equals("rn") || a.equals("rename") || a.equals("renamereads")){ + renameReads=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("tablereads") || a.equals("buildreads")){ + tablereads=Long.parseLong(b); + }else if(a.equals("out") || a.equals("outk") || a.equals("outkeep") || a.equals("outgood")){ + outKeep=b; +// outstream.println("k:"+b); + }else if(a.equals("outt") || a.equals("outtoss") || a.equals("outoss") || a.equals("outbad")){ + outToss=b; +// outstream.println("t:"+b); + }else if(a.equals("outl") || a.equals("outlow")){ + outLow=b; + }else if(a.equals("outm") || a.equals("outmid") || a.equals("outmiddle")){ + outMid=b; + }else if(a.equals("outh") || a.equals("outhigh")){ + outHigh=b; + }else if(a.equals("outu") || a.equals("outuncorrected")){ + outUnc=b; + }else if(a.equals("lbd") || a.equals("lowbindepth") || a.equals("lowerlimit")){ + LOW_BIN_DEPTH=Integer.parseInt(b); + }else if(a.equals("hbd") || a.equals("highbindepth") || a.equals("upperlimit")){ + HIGH_BIN_DEPTH=Integer.parseInt(b); + }else if(a.equals("hist") || a.equals("histin") || a.equals("inhist")){ + histFile=b; + }else if(a.equals("histout") || a.equals("outhist") || a.equals("hist2")){ + histFileOut=b; + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else if(a.equals("ordered") || a.equals("ord")){ + ordered=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("auto") || a.equals("automatic")){ + auto=Tools.parseBoolean(b); + }else if(a.equals("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + 
if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.startsWith("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else 
if(a.equals("canonical")){ + CANONICAL=KmerCount7MTA.CANONICAL=Tools.parseBoolean(b); + }else if(a.equals("fixspikes") || a.equals("fs")){ + fixSpikes=Tools.parseBoolean(b); + }else if(a.equals("printzerocoverage") || a.equals("pzc")){ + PRINT_ZERO_COVERAGE=Tools.parseBoolean(b); + }else if(a.equals("removeduplicatekmers") || a.equals("rdk")){ + KmerCount7MTA.KEEP_DUPLICATE_KMERS=!Tools.parseBoolean(b); + }else if(a.equals("target") || a.equals("targetdepth") || a.equals("tgt")){ + targetDepthF=Integer.parseInt(b); + }else if(a.equals("target1") || a.equals("targetdepth1") || a.equals("tgt1")){ + targetDepth1=Integer.parseInt(b); + }else if(a.equals("max") || a.equals("maxdepth")){ + maxDepth=Integer.parseInt(b); + }else if(a.equals("min") || a.equals("mindepth")){ + minDepth=Integer.parseInt(b); + }else if(a.equals("minkmers") || a.equals("minkmersovermindepth") || a.equals("mingoodkmersperread") || a.equals("mgkpr")){ + minKmersOverMinDepth=Tools.max(1, Integer.parseInt(b)); + }else if(a.equals("percentile") || a.equals("depthpercentile") || a.equals("dp")){ + depthPercentile=Float.parseFloat(b); + if(depthPercentile>1 && depthPercentile<=100){depthPercentile/=100;} + assert(depthPercentile>=0 && depthPercentile<=1) : "Depth percentile must be between 0 and 100."; + }else if(a.equals("highdepthpercentile") || a.equals("highpercentile") || a.equals("hdp")){ + highPercentile=Float.parseFloat(b); + if(highPercentile>1 && highPercentile<=100){highPercentile/=100;} + assert(highPercentile>=0 && highPercentile<=1) : "Depth percentile must be between 0 and 100."; + }else if(a.equals("lowdepthpercentile") || a.equals("lowpercentile") || a.equals("ldp")){ + lowPercentile=Float.parseFloat(b); + if(lowPercentile>1 && lowPercentile<=100){lowPercentile/=100;} + assert(lowPercentile>=0 && highPercentile<=1) : "Depth percentile must be between 0 and 100."; + }else if(a.equals("targetbadpercentilelow") || a.equals("tbpl")){ + double d=Double.parseDouble(b); + if(d>1 && 
d<=100){d/=100;} + assert(d>=0) : "TARGET_BAD_PERCENT_LOW must be at least 0."; + TARGET_BAD_PERCENT_LOW=d; + TARGET_BAD_PERCENT_HIGH=Tools.max(TARGET_BAD_PERCENT_HIGH, TARGET_BAD_PERCENT_LOW); + }else if(a.equals("targetbadpercentilehigh") || a.equals("tbph")){ + double d=Double.parseDouble(b); + if(d>1 && d<=100){d/=100;} + assert(d>=0 && lowPercentile<=1) : "TARGET_BAD_PERCENT_HIGH must be at least 0."; + TARGET_BAD_PERCENT_HIGH=d; + TARGET_BAD_PERCENT_LOW=Tools.min(TARGET_BAD_PERCENT_HIGH, TARGET_BAD_PERCENT_LOW); + }else if(a.equals("errordetectratio") || a.equals("edr")){ + errorDetectRatio=Integer.parseInt(b); + }else if(a.equals("errorcorrectratio") || a.equals("ecr")){ + ERROR_CORRECT_RATIO=Integer.parseInt(b); + }else if(a.equals("highthresh") || a.equals("hthresh") || a.equals("ht")){ + hthresh=Integer.parseInt(b); + }else if(a.equals("lowthresh") || a.equals("lthresh") || a.equals("lt")){ + lthresh=Integer.parseInt(b); + }else if(a.equals("echighthresh") || a.equals("echthresh") || a.equals("echt")){ + EC_HTHRESH=Integer.parseInt(b); + }else if(a.equals("eclowthresh") || a.equals("eclthresh") || a.equals("eclt")){ + EC_LTHRESH=Integer.parseInt(b); + }else if(a.equals("markerrors") || a.equals("markonly") || a.equals("meo")){ + MARK_ERRORS_ONLY=Tools.parseBoolean(b); + }else if(a.equals("markuncorrectableerrors") || a.equals("markuncorrectable") || a.equals("mue")){ + MARK_UNCORRECTABLE_ERRORS=Tools.parseBoolean(b); + }else if(a.equals("tam") || a.equals("trimaftermarking")){ + TRIM_AFTER_MARKING=Tools.parseBoolean(b); + }else if(a.equals("markwith1") || a.equals("markwithone") || a.equals("mw1")){ + MARK_WITH_1=Tools.parseBoolean(b); + TrimRead.PROB1=10; + }else if(a.equals("aec") || a.equals("aecc") || a.equals("aggressiveerrorcorrection")){ + boolean x=Tools.parseBoolean(b); + if(x){ + USE_ECC1=USE_ECCF=true; + EC_HTHRESH=Tools.min(EC_HTHRESH, 16); + EC_LTHRESH=Tools.max(EC_LTHRESH, 3); + ERROR_CORRECT_RATIO=Tools.min(ERROR_CORRECT_RATIO, 100); + 
MAX_ERRORS_TO_CORRECT=Tools.max(MAX_ERRORS_TO_CORRECT, 7); + SUFFIX_LEN=Tools.min(SUFFIX_LEN, 3); + PREFIX_LEN=Tools.min(PREFIX_LEN, 2); + } + }else if(a.equals("cec") || a.equals("cecc") || a.equals("conservativeerrorcorrection")){ + boolean x=Tools.parseBoolean(b); + if(x){ + USE_ECC1=USE_ECCF=true; + EC_HTHRESH=Tools.max(EC_HTHRESH, 30); + EC_LTHRESH=Tools.min(EC_LTHRESH, 1); + ERROR_CORRECT_RATIO=Tools.max(ERROR_CORRECT_RATIO, 170); + MAX_ERRORS_TO_CORRECT=Tools.min(MAX_ERRORS_TO_CORRECT, 2); + MAX_QUAL_TO_CORRECT=Tools.min(MAX_QUAL_TO_CORRECT, 25); + SUFFIX_LEN=Tools.max(SUFFIX_LEN, 4); + PREFIX_LEN=Tools.max(PREFIX_LEN, 4); + } + }else if(a.equals("tossbadreads") || a.equals("tosserrorreads") || a.equals("tbr") || a.equals("ter")){ + tossErrorReads1=tossErrorReadsF=Tools.parseBoolean(b); + }else if(a.equals("tossbadreads2") || a.equals("tosserrorreads2") || a.equals("tbr2") || a.equals("ter2") || + a.equals("tossbadreadsf") || a.equals("tosserrorreadsf") || a.equals("tbrf") || a.equals("terf")){ + tossErrorReadsF=Tools.parseBoolean(b); + }else if(a.equals("tossbadreads1") || a.equals("tosserrorreads1") || a.equals("tbr1") || a.equals("ter1")){ + tossErrorReads1=Tools.parseBoolean(b); + }else if(a.equals("abrc") || a.equals("addbadreadscountup")){ + ADD_BAD_READS_COUNTUP=Tools.parseBoolean(b); + }else if(a.equals("discardbadonly") || a.equals("dbo")){ + discardBadOnly1=discardBadOnlyF=Tools.parseBoolean(b); + }else if(a.equals("discardbadonly1") || a.equals("dbo1")){ + discardBadOnly1=Tools.parseBoolean(b); + }else if(a.equals("discardbadonlyf") || a.equals("dbof") || a.equals("discardbadonly2") || a.equals("dbo2")){ + discardBadOnlyF=Tools.parseBoolean(b); + }else if(a.equals("requirebothbad") || a.equals("rbb")){ + rbb=Tools.parseBoolean(b); + }else if(a.equals("saverarereads") || a.equals("srr")){ + SAVE_RARE_READS=Tools.parseBoolean(b); + }else if(a.equals("ecc")){ + USE_ECC1=USE_ECCF=Tools.parseBoolean(b); + }else if(a.equals("ecc1")){ + 
USE_ECC1=Tools.parseBoolean(b); + }else if(a.equals("ecc2") || a.equals("eccf")){ + USE_ECCF=Tools.parseBoolean(b); + }else if(a.equals("ecclimit")){ + MAX_ERRORS_TO_CORRECT=Integer.parseInt(b); + }else if(a.equals("eccmaxqual")){ + MAX_QUAL_TO_CORRECT=Integer.parseInt(b); + }else if(a.equals("cfl")){ + CORRECT_FROM_LEFT=Tools.parseBoolean(b); + }else if(a.equals("cfr")){ + CORRECT_FROM_RIGHT=Tools.parseBoolean(b); + }else if(a.equals("sl") || a.equals("suflen") || a.equals("suffixlen")){ + SUFFIX_LEN=Integer.parseInt(b); + }else if(a.equals("pl") || a.equals("prelen") || a.equals("prefixlen")){ + PREFIX_LEN=Integer.parseInt(b); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usetmpdir")){ + USE_TMPDIR=Tools.parseBoolean(b); + }else if(a.equals("tmpdir")){ + TMPDIR=b; + if(b!=null){ + b=b.trim(); + if(b.length()==0){b=null;} + else{b=(b.replace('\\', '/')+"/").replaceAll("//", "/");} + } + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("extra")){ + if(b!=null && !b.equalsIgnoreCase("null")){ + if(new File(b).exists()){ + extra=new ArrayList(); + extra.add(b); + }else{ + extra=Arrays.asList(b.split(",")); + } + } + }else if(i>2 || (!args[i].equals(in1) && !args[i].equals(in2))){ +// assert(false) : "\n"+i+"\n"+args[i]+"\n"+reads1; + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(TrimRead.ADJUST_QUALITY){CalcTrueQuality.initializeMatrices();} + + assert(passes<2 
|| (outLow==null && outMid==null && outHigh==null && outUnc==null)) : + "\noutLow, outMid, outHigh, and outUnc don't work with multiple passes. Set passes=1 or eliminate those output streams."; + + assert(in1!=null && !in1.equalsIgnoreCase("stdin") && !in1.toLowerCase().startsWith("stdin.")) : + "\nThis program does not allow input from standard in,\nbecause it needs to read the input multiple times.\nOnly real files are permitted."; + + if(MARK_ERRORS_ONLY){ + MAX_ERRORS_TO_CORRECT=Tools.max(MAX_ERRORS_TO_CORRECT, 9999); + if(!USE_ECC1 && !USE_ECCF){USE_ECC1=true;} + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + final boolean paired=(FASTQ.FORCE_INTERLEAVED || in2!=null); + if(in2!=null){ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + if(DETERMINISTIC){ordered=true;} + + assert(Tools.canWrite(outKeep, overwrite)) : outKeep+" already exists, and overwrite="+overwrite; + assert(Tools.canWrite(outToss, overwrite)) : outToss+" already exists, and overwrite="+overwrite; + assert(Tools.canWrite(histFile, overwrite)) : histFile+" already exists, and overwrite="+overwrite; + assert(Tools.canWrite(histFileOut, overwrite)) : histFileOut+" already exists, and overwrite="+overwrite; + assert(outKeep==null || outKeep.startsWith("stdout") || !(outKeep.equals(outToss) || outKeep.equals(histFile) || outKeep.equals(histFileOut))) : + "Duplicate output file: "+outKeep; + assert(outToss==null || outToss.startsWith("stdout") || 
!(outToss.equals(outKeep) || outToss.equals(histFile) || outToss.equals(histFileOut))) : + "Duplicate output file: "+outToss; + assert(histFile==null || histFile.startsWith("stdout") || !(histFile.equals(outToss) || histFile.equals(outKeep) || histFile.equals(histFileOut))) : + "Duplicate output file: "+histFile; + assert(histFileOut==null || histFileOut.startsWith("stdout") || !(histFileOut.equals(outToss) || histFileOut.equals(histFile) || histFileOut.equals(outKeep))) : + "Duplicate output file: "+histFileOut; + + if(cbits>16 && passes>1){cbits=16;} + + maxDepth=Tools.max(maxDepth, targetDepthF); + assert(targetDepthF>0); + + assert(FastaReadInputStream.settingsOK()); + if(k>31){CANONICAL=KmerCount7MTA.CANONICAL=false;} + assert(CANONICAL==KmerCount7MTA.CANONICAL); + +// if(output!=null && reads1.contains(",")){ +// throw new RuntimeException("\nLists of input files can only be used with histogram output, not full output.\n" + +// "Please set output=null or move the extra input files to 'extra=file1,file2,...fileN'"); +// } + if(extra!=null){ + for(String s : extra){ + File f=new File(s); + if(!f.exists() || !f.isFile()){throw new RuntimeException(s+" does not exist.");} + assert(!s.equalsIgnoreCase(in1) && (in2==null || !s.equalsIgnoreCase(in2))) : "\nInput file "+s+" should not be included as an extra file.\n"; + } + } + +// outstream.println("ForceInterleaved = "+FASTQ.FORCE_INTERLEAVED); + +// assert(false) : reads1+", "+reads2+", "+output; +// if(FASTQ.FORCE_INTERLEAVED && in2==null){ +// outstream.println() +// } + + if(threads<=0){ + if(auto){THREADS=Shared.THREADS;} + else{THREADS=8;} + }else{ + THREADS=threads; + } +// KmerCount7MTA.THREADS=Tools.min(THREADS,6); + KmerCount7MTA.THREADS=THREADS; + +// System.err.println("THREADS="+THREADS+", KmerCount7MTA.THREADS="+KmerCount7MTA.THREADS); + + long bases=0; + qhist_total=new long[128]; + Timer t=new Timer(); + t.start(); + + if(passes>1){ + String lastTemp=null; + + if(passes>2){ +// 
System.out.println(">>>A"); + + ERROR_DETECT_RATIO+=50; + EC_HTHRESH=EC_HTHRESH*2+20; + + for(int pass=1; pass>>B"); + final String tempOutPrefix=getTempPrefix(in1, outKeep, pass); + final String tempOut=getTempOut(outKeep, tempOutPrefix); + final String tempOutToss=(outToss==null ? null : getTempOut(outToss, tempOutPrefix)); + // assert(false) : tempOut+"\n"+tempOutToss; + + outstream.println("\n *********** Pass "+pass+" ********** \n"); + + int tgt=(targetDepth1<1 ? targetDepthF*4 : targetDepth1*2); + int max=(tgt+tgt/4); + + int tgtBadLow=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_LOW*1.5)); + int tgtBadHigh=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_HIGH*1.5)); + + CORRECT_ERRORS_THIS_PASS=USE_ECC1; + TRIM_LEFT_THIS_PASS=(pass==1 && TRIM_LEFT); + TRIM_RIGHT_THIS_PASS=(pass==1 && TRIM_RIGHT); + bases+=runPass(auto, memory, (cbits1<1 ? cbits : cbits1), cells, precells, buildpasses, hashes, prehashes, k, + maxReads, tablereads, minq, buildStepsize, + (pass==1 ? in1 : lastTemp), (pass==1 ? in2 : null), tempOut, tempOutToss, null, null, null, null, + (pass==1 ? histFile : null), (pass==1 ? extra : null), + tgt, tgtBadLow, tgtBadHigh, max, Tools.min(minDepth, 2), Tools.min(minKmersOverMinDepth, 5), + Tools.min(0.8f, Tools.max(0.4f, depthPercentile)*1.2f), false, rbb, true, + highPercentile, 0, (errorDetectRatio>100 ? 100+(errorDetectRatio-100)/2 : errorDetectRatio), hthresh, lthresh, false, false, false); + lastTemp=tempOut; + FASTQ.TEST_INTERLEAVED=true; + FASTQ.FORCE_INTERLEAVED=paired; + } + FASTQ.TEST_INTERLEAVED=true; + FASTQ.FORCE_INTERLEAVED=paired; +// System.out.println(">>>C"); + + ERROR_DETECT_RATIO-=50; + EC_HTHRESH=(EC_HTHRESH-20)/2; + + for(int pass=passes-1; pass>>D"); + final String tempOutPrefix=getTempPrefix(in1, outKeep, pass); + final String tempOut=getTempOut(outKeep, tempOutPrefix); + final String tempOutToss=(outToss==null ? 
null : getTempOut(outToss, tempOutPrefix)); + // assert(false) : tempOut+"\n"+tempOutToss; + + outstream.println("\n *********** Pass "+pass+" ********** \n"); + + int tgt=(targetDepth1<1 ? targetDepthF*2 : targetDepth1); + int max=(tgt+tgt/4); + + int tgtBadLow=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_LOW)); + int tgtBadHigh=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_HIGH)); + + CORRECT_ERRORS_THIS_PASS=USE_ECC1; + TRIM_LEFT_THIS_PASS=(pass==1 && TRIM_LEFT); + TRIM_RIGHT_THIS_PASS=(pass==1 && TRIM_RIGHT); + bases+=runPass(auto, memory, (cbits1<1 ? cbits : cbits1), cells, precells, buildpasses, hashes, prehashes, k, + maxReads, tablereads, minq, buildStepsize, + (pass==1 ? in1 : lastTemp), (pass==1 ? in2 : null), tempOut, tempOutToss, null, null, null, null, + (pass==1 ? histFile : null), (pass==1 ? extra : null), + tgt, tgtBadLow, tgtBadHigh, max, Tools.min(minDepth, 3), minKmersOverMinDepth, + Tools.min(0.8f, Tools.max(0.4f, depthPercentile)*1.2f), tossErrorReads1, rbb, discardBadOnly1, + highPercentile, lowPercentile, errorDetectRatio, hthresh, lthresh, false, false, false); + lastTemp=tempOut; + } + }else{ +// System.out.println(">>>E"); + for(int pass=1; pass>>F"); + final String tempOutPrefix=getTempPrefix(in1, outKeep, pass); + final String tempOut=getTempOut(outKeep, tempOutPrefix); + final String tempOutToss=(outToss==null ? null : getTempOut(outToss, tempOutPrefix)); + // assert(false) : tempOut+"\n"+tempOutToss; + + outstream.println("\n *********** Pass "+pass+" ********** \n"); + + int tgt=(targetDepth1<1 ? targetDepthF*4 : targetDepth1); + int max=(tgt+tgt/4); + + int tgtBadLow=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_LOW)); + int tgtBadHigh=(int)Math.ceil(Tools.min(tgt, targetDepthF*TARGET_BAD_PERCENT_HIGH)); + + CORRECT_ERRORS_THIS_PASS=USE_ECC1; + TRIM_LEFT_THIS_PASS=(pass==1 && TRIM_LEFT); + TRIM_RIGHT_THIS_PASS=(pass==1 && TRIM_RIGHT); + bases+=runPass(auto, memory, (cbits1<1 ? 
cbits : cbits1), cells, precells, buildpasses, hashes, prehashes, k, + maxReads, tablereads, minq, buildStepsize, + (pass==1 ? in1 : lastTemp), (pass==1 ? in2 : null), tempOut, tempOutToss, null, null, null, null, + (pass==1 ? histFile : null), (pass==1 ? extra : null), + tgt, tgtBadLow, tgtBadHigh, max, Tools.min(minDepth, 3), minKmersOverMinDepth, + Tools.min(0.8f, Tools.max(0.4f, depthPercentile)*1.2f), tossErrorReads1, rbb, discardBadOnly1, + highPercentile, lowPercentile, errorDetectRatio, hthresh, lthresh, false, false, false); + lastTemp=tempOut; + FASTQ.TEST_INTERLEAVED=true; + FASTQ.FORCE_INTERLEAVED=paired; + } +// System.out.println(">>>G"); + } + + outstream.println("\n *********** Pass "+(passes)+" ********** \n"); + + CORRECT_ERRORS_THIS_PASS=USE_ECCF; + TRIM_LEFT_THIS_PASS=false; + TRIM_RIGHT_THIS_PASS=false; + bases+=runPass(auto, memory, cbits, cells, precells, buildpasses, hashes, prehashes, k, + maxReads, tablereads, minq, buildStepsize, + lastTemp, null, outKeep, outToss, outLow, outMid, outHigh, outUnc, null, null, + targetDepthF, targetDepthF, targetDepthF, maxDepth, minDepth, minKmersOverMinDepth, depthPercentile, tossErrorReadsF, rbb, discardBadOnlyF, + highPercentile, lowPercentile, errorDetectRatio, hthresh, lthresh, fixSpikes, countup, renameReads); + }else{ + CORRECT_ERRORS_THIS_PASS=(USE_ECC1 || USE_ECCF); + TRIM_LEFT_THIS_PASS=(TRIM_LEFT); + TRIM_RIGHT_THIS_PASS=(TRIM_RIGHT); + bases+=runPass(auto, memory, cbits, cells, precells, buildpasses, hashes, prehashes, k, + maxReads, tablereads, minq, buildStepsize, + in1, in2, outKeep, outToss, outLow, outMid, outHigh, outUnc, histFile, extra, + targetDepthF, targetDepthF, targetDepthF, maxDepth, minDepth, minKmersOverMinDepth, depthPercentile, tossErrorReadsF, rbb, discardBadOnlyF, + highPercentile, lowPercentile, errorDetectRatio, hthresh, lthresh, fixSpikes, countup, renameReads); + } + + if(outKeep!=null && histFileOut!=null){ + outstream.println("\n *********** Output Histogram 
Generation ********** \n"); + FASTQ.TEST_INTERLEAVED=true; + FASTQ.FORCE_INTERLEAVED=paired; + CORRECT_ERRORS_THIS_PASS=false; + TRIM_LEFT_THIS_PASS=false; + TRIM_RIGHT_THIS_PASS=false; + bases+=runPass(auto, memory, cbits, cells, precells, buildpasses, hashes, prehashes, k, + maxReads, tablereads, minq, buildStepsize, + outKeep, null, null, null, null, null, null, null, histFileOut, extra, + 99999999, 99999999, 99999999, 99999999, 0, 0, .5f, false, rbb, false, + 1, 0, 100, 10, 3, fixSpikes, false, false); + } + + if(REMOVE_TEMP_FILES && temp_file_set!=null){ + outstream.println("\nRemoving temp files."); + for(String s : temp_file_set){ + File f=new File(s); + if(f.exists()){ +// System.out.println("Deleting "+s); + boolean success=false; + for(int i=0; i<100 && f.exists() && !success; i++){ + success=f.delete(); + f=new File(s); + } + if(f.exists() && !success){ +// System.err.println(f.canExecute()); +// System.err.println(f.canRead()); +// System.err.println(f.canWrite()); +// System.err.println(f.lastModified()); +// try { +// java.nio.file.Files.delete(f.toPath()); +// } catch (IOException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } + System.err.println("Some temp files (prefixed TEMPFILE_BBNORM) could not be removed may need to be deleted manually."); + f.deleteOnExit(); + } + } + } + } + + t.stop(); + + + outstream.println("\nTotal time: \t\t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec"); + + if(errorState){throw new RuntimeException("KmerNormalize terminated in an error state; the output may be corrupt.");} + } + + private static String getTempPrefix(String inFname, String outFname, int pass){ + String tempOut=null, tempOutPrefix=null; + for(int i=0; i<2000 && tempOut==null; i++){ + tempOutPrefix=(useTmpdir() ? 
Shared.TMPDIR : "")+"TEMPFILE_BBNORM_P"+pass+"_"+getSalt(inFname, i)+"_"; + tempOut=getTempOut(outFname, tempOutPrefix); + if(new File(tempOut).exists()){ + tempOut=null; + tempOutPrefix=null; + } + } + if(tempOutPrefix==null){ + throw new RuntimeException("Can't generate a random temp file name. Try deleting old temp files."); + } + return tempOutPrefix; + } + + private static String getTempOut(String outFname, final String tempOutPrefix){ + assert(tempOutPrefix!=null); + String tempOut=null; + if(outFname==null || useTmpdir()){ + tempOut=tempOutPrefix+".fq.gz"; + }else{ + outFname=outFname.replace('\\', '/'); + int idx=outFname.lastIndexOf('/'); + if(idx<0){ + tempOut=tempOutPrefix+outFname; + }else{ + tempOut=outFname.substring(0, idx+1)+tempOutPrefix+outFname.substring(idx+1); + } + } + if(temp_file_set==null){temp_file_set=new HashSet();} + if(temp_file_set.contains(tempOut) || new File(tempOut).exists()){ + return getTempOut(outFname, tempOutPrefix+"_"+(100000*Math.random())); + } + temp_file_set.add(tempOut); + return tempOut; + } + + public static String getSalt(String fname, int attempt){ + return Long.toHexString(System.nanoTime()+attempt)+Long.toHexString(Long.rotateLeft(fname.hashCode(), 31)^System.currentTimeMillis()); + } + + private static boolean inMemorySort(ArrayList reads, String sorted, boolean reverse){ + try{ + Collections.sort(reads, ReadErrorComparator.comparator); + if(reverse){Collections.reverse(reads);} + TextStreamWriter tsw=new TextStreamWriter(sorted, overwrite, false, true); + tsw.start(); +// assert(false) : "\nreads: "+reads.size()+"\n"+tsw+"\n"; + for(Read r : reads){ + tsw.println(r); + if(r.mate!=null){tsw.println(r.mate);} + } + tsw.poison(); + tsw.waitForFinish(); + }catch(Throwable t){ + System.err.println("ERROR: "+t); + return false; + } + return true; + } + + private static long runPass(boolean auto, long memory, int cbits, long cells, long precells, int buildpasses, int hashes, int prehashes, int k, + long maxReads, long 
tablereads, int minq, int buildStepsize, + String in1, String in2, String outKeep, String outToss, String outLow, String outMid, String outHigh, String outUnc, String histFile, List extra, + int targetDepth, int targetDepthBadLow, int targetDepthBadHigh, int maxDepth, int minDepth, + int minKmersOverMinDepth, float depthPercentile, boolean tossErrorReads, boolean rbb, boolean discardBadOnly, + float highPercentile, float lowPercentile, int errorDetectRatio, int hthresh, int lthresh, boolean fixSpikes, boolean countup, + boolean rename){ + assert(in1!=null); + TARGET_DEPTH=targetDepth; + TARGET_DEPTH_BAD_LOW=targetDepthBadLow; + TARGET_DEPTH_BAD_HIGH=targetDepthBadHigh; + MAX_DEPTH=maxDepth; + MIN_DEPTH=minDepth; + MIN_KMERS_OVER_MIN_DEPTH=minKmersOverMinDepth; + DEPTH_PERCENTILE=depthPercentile; + RENAME_THIS_PASS=rename; + + COUNTUP=countup; + if(COUNTUP){ + TARGET_DEPTH=(int)Math.round(TARGET_DEPTH*0.95); + } + TOSS_ERROR_READS=tossErrorReads; +// REQUIRE_BOTH_BAD=(rbb); + REQUIRE_BOTH_BAD=(rbb || COUNTUP); + DISCARD_BAD_ONLY=discardBadOnly; + HIGH_PERCENTILE=highPercentile; + LOW_PERCENTILE=(COUNTUP ? LOW_PERCENTILE_COUNTUP : lowPercentile); +// assert(!COUNTUP) : COUNTUP+", "+LOW_PERCENTILE_COUNTUP+", "+lowPercentile+", "+LOW_PERCENTILE; + ERROR_DETECT_RATIO=errorDetectRatio; + HTHRESH=hthresh; + LTHRESH=lthresh; + FIX_SPIKES=fixSpikes; + + { + if(histFile==null){ +// HIST_LEN=Tools.min(20000, HIST_LEN); +// HIST_LEN_PRINT=Tools.min(20000, HIST_LEN_PRINT); + }else{ + USE_HISTOGRAM=true; + } + + final int maxCount=(int)(cbits>16 ? 
Integer.MAX_VALUE : (1L<0); + HIST_LEN_PRINT=Tools.max(1, Tools.min(HIST_LEN_PRINT, maxCount)); + assert(HIST_LEN_PRINT<=Integer.MAX_VALUE) : HIST_LEN_PRINT+", "+Integer.MAX_VALUE; + HIST_LEN=(int)Tools.min(maxCount, Tools.max(HIST_LEN_PRINT, HIST_LEN)); + THREAD_HIST_LEN=Tools.min(THREAD_HIST_LEN, HIST_LEN); + + histogram_total=new AtomicLongArray(HIST_LEN); + } + + if(auto && cells==-1){ + final long usable=(long)Tools.max(((memory-96000000)*.73), memory*0.45); + long mem=usable-(USE_HISTOGRAM ? (HIST_LEN*8*(1)) : 0); + if(buildpasses>1){mem/=2;} + + FILTERBYTES=(COUNTUP ? mem/2 : mem); + cells=(FILTERBYTES*8)/cbits; +// +// long tablebytes=((1L<0 && prehashes>0 ? Tools.toKMG(precells) : "?")); + outstream.println("prefilter hashes: \t"+(precells>0 && prehashes>0 ? ""+prehashes : "?")); + } +// outstream.println("base min quality: \t"+KmerCount7MTA.minQuality); + outstream.println("base min quality: \t"+minq); + outstream.println("kmer min prob: \t"+KmerCount7MTA.minProb); + + outstream.println(); + outstream.println("target depth: \t"+TARGET_DEPTH); + outstream.println("min depth: \t"+MIN_DEPTH); + outstream.println("max depth: \t"+MAX_DEPTH); + outstream.println("min good kmers: \t"+MIN_KMERS_OVER_MIN_DEPTH); + outstream.println("depth percentile: \t"+String.format("%.1f", 100*DEPTH_PERCENTILE)); + outstream.println("ignore dupe kmers:\t"+!KmerCount7MTA.KEEP_DUPLICATE_KMERS); + outstream.println("fix spikes: \t"+FIX_SPIKES); + if(USE_HISTOGRAM && HIST_LEN>0){ + outstream.println("histogram length: \t"+(USE_HISTOGRAM ? HIST_LEN : 0)); + } + if(histFile!=null){ + outstream.println("print zero cov: \t"+PRINT_ZERO_COVERAGE); + } + + outstream.println(); + } + + if(!prefilter && k<32 && cells>(1L<<(2*k))){cells=(1L<<(2*k));} + assert(cells>0); + +// KmerCount7MTA.THREADS=Tools.max(THREADS/2, KmerCount7MTA.THREADS); //Seems like 4 is actually optimal... 
+ + FastaReadInputStream.MIN_READ_LEN=k; + + Timer t=new Timer(); + Timer ht=new Timer(); + t.start(); + ht.start(); + KCountArray kca; + KCountArray prefilterArray=null; +// outstream.println(); + if(prefilter){ + prefilterArray=KmerCount7MTA.makeKca(in1, in2, extra, k, 2, 0, precells, prehashes, minq, true, tablereads, 1, buildStepsize, 1, 1, null); + outstream.println("Made prefilter: \t"+prefilterArray.toShortString(prehashes)); + double uf=prefilterArray.usedFraction(); + if(uf>0.6){ + outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" : + uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy for kmers of depth under 3. Ideal load is under 60% used." + + "\nFor better accuracy, run on a node with more memory; quality-trim or error-correct reads; " + + "or increase the values of the minprob flag to reduce spurious kmers."); + } + } + kca=KmerCount7MTA.makeKca(in1, in2, extra, k, cbits, 0, cells, hashes, minq, true, tablereads, buildpasses, buildStepsize, 2, 2, prefilterArray); + ht.stop(); + + outstream.println("Made hash table: \t"+kca.toShortString(hashes)); + double uf=kca.usedFraction(); + if(uf>0.6){ + outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" : + uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy. Ideal load is under 60% used." + + "\nFor better accuracy, use the 'prefilter' flag; run on a node with more memory; quality-trim or error-correct reads; " + + "or increase the values of the minprob flag to reduce spurious kmers. 
In practice you should still get good normalization results " + + "even with loads over 90%, but the histogram and statistics will be off."); + } + + long estUnique; + outstream.println(); + if(prefilterArray!=null){ + int lim1=prefilterArray.maxValue, lim2=prefilterArray.maxValue+1; + double a=prefilterArray.estimateUniqueKmers(prehashes); + double b=kca.estimateUniqueKmers(hashes, lim2); + a=a-b; + if(CANONICAL){ +// a=(a*KCountArray.canonMask)/(KCountArray.canonMask+1); +// b=(b*KCountArray.canonMask)/(KCountArray.canonMask+1); + }else{ + a/=2; + b/=2; + } + estUnique=((long)((a+b))); + outstream.println("Estimated kmers of depth 1-"+lim1+": \t"+(long)a); + outstream.println("Estimated kmers of depth "+lim2+"+ : \t"+(long)b); + }else{ +// double est=kca.cells*(1-Math.pow(1-Math.sqrt(kca.usedFraction()), 1.0/hashes)); +// double est=kca.cells*(1-Math.pow(1-kca.usedFraction(), 1.0/hashes)); + double est=kca.estimateUniqueKmers(hashes); +// outstream.println("Used cells: "+kca.cellsUsed(1)); + if(CANONICAL){ +// est=(est*KCountArray.canonMask)/(KCountArray.canonMask+1); + }else{ + est/=2; + } + estUnique=((long)((est))); + + } + outstream.println("Estimated unique kmers: \t"+estUnique);//+", or "+estUnique+" counting forward kmers only."); +// outstream.println("(Includes forward and reverse kmers)"); + outstream.println(); + outstream.println("Table creation time:\t\t"+ht);//+" \t"+String.format("%.2f", totalBases*1000000.0/(ht.elapsed))+" kb/sec"); + + ListNum.setDeterministicRandom(DETERMINISTIC); + + long bases=0; + if(COUNTUP){ + COUNTUP=false; + + int td0=TARGET_DEPTH, md0=MIN_DEPTH, mxd0=MAX_DEPTH, mkomd0=MIN_KMERS_OVER_MIN_DEPTH; + TARGET_DEPTH=TARGET_DEPTH*4; + MIN_DEPTH=MIN_DEPTH/2; + MAX_DEPTH=MAX_DEPTH*4; + MIN_KMERS_OVER_MIN_DEPTH=MIN_KMERS_OVER_MIN_DEPTH/2; + + int rnd=(int)(100+Math.random()*1000000); + final String tempOutPrefix1=getTempPrefix(in1, outKeep, rnd); +// final String tempOutPrefix2=getTempPrefix(in1, outKeep, rnd+1); +// final String 
tempOutPrefix3=getTempPrefix(in1, outKeep, rnd+3); + final String tempOut1=getTempOut(outKeep, tempOutPrefix1); +// final String tempOut2=getTempOut(outKeep, tempOutPrefix2); +// final String tempOut3=getTempOut(outKeep, tempOutPrefix3); + ArrayList storage=new ArrayList(); + + if(in1!=null && in1.contains(",") && !new File(in1).exists()){ + String[] list1=in1.split(","); + String[] list2=(in2==null ? null : in2.split(",")); + bases+=count(list1, list2, kca, k, maxReads, null, null, null, null, null, null, false, overwrite, null, estUnique, storage); + }else{ + bases+=count(in1, in2, kca, k, maxReads, null, null, null, null, null, null, false, overwrite, null, estUnique, storage); + } + inMemorySort(storage, tempOut1, false); + storage=null; + in1=tempOut1; + in2=null; + + TARGET_DEPTH=td0; + MIN_DEPTH=md0; + MAX_DEPTH=mxd0; + MIN_KMERS_OVER_MIN_DEPTH=mkomd0; + + COUNTUP=true; + + + if(in1!=null && in1.contains(",") && !new File(in1).exists()){ + String[] list1=in1.split(","); + String[] list2=(in2==null ? null : in2.split(",")); + bases+=count(list1, list2, kca, k, maxReads, outKeep, outToss, outLow, outMid, outHigh, outUnc, ordered, overwrite, histFile, estUnique, null); + }else{ + bases+=count(in1, in2, kca, k, maxReads, outKeep, outToss, outLow, outMid, outHigh, outUnc, ordered, overwrite, histFile, estUnique, null); + } + + }else{ + + + if(in1!=null && in1.contains(",") && !new File(in1).exists()){ + String[] list1=in1.split(","); + String[] list2=(in2==null ? 
null : in2.split(",")); + bases+=count(list1, list2, kca, k, maxReads, outKeep, outToss, outLow, outMid, outHigh, outUnc, ordered, overwrite, histFile, estUnique, null); + }else{ + bases+=count(in1, in2, kca, k, maxReads, outKeep, outToss, outLow, outMid, outHigh, outUnc, ordered, overwrite, histFile, estUnique, null); + } + } + + if(ANALYZE_TOPOLOGY){printTopology();} + + t.stop(); +// outstream.println("\nTotal time: \t\t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec"); + return bases; + } + + + public static void printTopology(){ + long total=peaks.get()+spikes.get()+flats.get()+valleys.get()+slopes.get(); + double mult=100.0/total; + + long sp=spikes.get(); + long pe=peaks.get(); + long va=valleys.get(); + long sl=slopes.get(); + long fl=flats.get(); + double dsp=mult*sp; + double dpe=mult*pe; + double dva=mult*va; + double dsl=mult*sl; + double dfl=mult*fl; + + System.err.println("\nDepth Topology:\t"); + System.err.println("Spikes: \t\t\t"+(dsp<10 ? " " : "")+String.format("%.3f%% \t%d",dsp,sp)); + System.err.println("Peaks: \t\t\t"+(dpe<10 ? " " : "")+String.format("%.3f%% \t%d",dpe,pe)); + System.err.println("Valleys: \t\t\t"+(dva<10 ? " " : "")+String.format("%.3f%% \t%d",dva,va)); + System.err.println("Slopes: \t\t\t"+(dsl<10 ? " " : "")+String.format("%.3f%% \t%d",dsl,sl)); + System.err.println("Flats: \t\t\t"+(dfl<10 ? 
" " : "")+String.format("%.3f%% \t%d",dfl,fl)); + } + + + public static long count(String in1, String in2, KCountArray kca, int k, long maxReads, + String outKeep, String outToss, String outLow, String outMid, String outHigh, String outUnc, + boolean ordered, boolean overwrite, String histFile, long estUnique, ArrayList storage) { + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + if(verbose){System.err.println("Started cris");} + Thread th=new Thread(cris); + th.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + RTextOutputStream3 rosKeep=null; + if(outKeep!=null){ + final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS)); + + String out1=outKeep.replaceFirst("#", "1"); + String out2=null; + + if(cris.paired()){ + if(outKeep.contains("#")){ + out2=outKeep.replaceFirst("#", "2"); + }else{ + outstream.println("Writing interleaved."); + } + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, ordered); + rosKeep=new RTextOutputStream3(ff1, ff2, buff, null, true); + rosKeep.start(); + outstream.println("Started output threads."); + } + + RTextOutputStream3 rosToss=null; + if(outToss!=null){ + final int buff=(!ordered ? 
8 : Tools.max(16, 2*THREADS)); + + String out1=outToss.replaceFirst("#", "1"); + String out2=null; + + if(cris.paired()){ + if(outToss.contains("#")){ + out2=outToss.replaceFirst("#", "2"); + }else{ + outstream.println("Writing interleaved."); + } + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, ordered); + rosToss=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosToss.start(); + outstream.println("Started output threads."); + } + + RTextOutputStream3 rosLow=null; + if(outLow!=null){ + final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS)); + + String out1=outLow.replaceFirst("#", "1"); + String out2=null; + + if(cris.paired()){ + if(outLow.contains("#")){ + out2=outLow.replaceFirst("#", "2"); + }else{ + outstream.println("Writing interleaved."); + } + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, ordered); + rosLow=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosLow.start(); + outstream.println("Started output threads."); + } + + RTextOutputStream3 rosMid=null; + if(outMid!=null){ + final int buff=(!ordered ? 
8 : Tools.max(16, 2*THREADS)); + + String out1=outMid.replaceFirst("#", "1"); + String out2=null; + + if(cris.paired()){ + if(outMid.contains("#")){ + out2=outMid.replaceFirst("#", "2"); + }else{ + outstream.println("Writing interleaved."); + } + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, ordered); + rosMid=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosMid.start(); + outstream.println("Started output threads."); + } + + RTextOutputStream3 rosHigh=null; + if(outHigh!=null){ + final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS)); + + String out1=outHigh.replaceFirst("#", "1"); + String out2=null; + + if(cris.paired()){ + if(outHigh.contains("#")){ + out2=outHigh.replaceFirst("#", "2"); + }else{ + outstream.println("Writing interleaved."); + } + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, ordered); + rosHigh=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosHigh.start(); + outstream.println("Started output threads."); + } + + RTextOutputStream3 rosUnc=null; + if(outUnc!=null){ + final int buff=(!ordered ? 
8 : Tools.max(16, 2*THREADS)); + + String out1=outUnc.replaceFirst("#", "1"); + String out2=null; + + if(cris.paired()){ + if(outUnc.contains("#")){ + out2=outUnc.replaceFirst("#", "2"); + }else{ + outstream.println("Writing interleaved."); + } + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, ordered); + rosUnc=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosUnc.start(); + outstream.println("Started output threads."); + } + + long bases=downsample(cris, kca, k, maxReads, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc, histFile, overwrite, estUnique, storage); + + errorState|=ReadWrite.closeStreams(cris, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc); + if(verbose){System.err.println("Closed streams");} + + return bases; + } + + + public static long count(String[] list1, String[] list2, KCountArray kca, int k, long maxReads, + String outKeep, String outToss, String outLow, String outMid, String outHigh, String outUnc, + boolean ordered, boolean overwrite, String histFile, long estUnique, ArrayList storage) { + + RTextOutputStream3 rosKeep=null, rosToss=null, rosLow=null, rosMid=null, rosHigh=null, rosUnc=null; + String[] outKeep1=null, outKeep2=null; + String[] outToss1=null, outToss2=null; + String[] outLow1=null, outLow2=null; + String[] outMid1=null, outMid2=null; + String[] outHigh1=null, outHigh2=null; + String[] outUnc1=null, outUnc2=null; + + + final int buff=(!ordered ? 
8 : Tools.max(16, 2*THREADS)); + if(outKeep!=null){ + if(!new File(outKeep).exists()){ + outKeep1=outKeep.split(","); + }else{ + outKeep1=new String[] {outKeep}; + } + outKeep2=new String[outKeep1.length]; + for(int i=0; i1){ + if(rosKeep!=null){ + rosKeep.close(); + rosKeep.join(); + } + + FileFormat ff1=FileFormat.testOutput(outKeep1[x], FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(outKeep2[x], FileFormat.FASTQ, null, true, overwrite, ordered); + rosKeep=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosKeep.start(); + outstream.println("Started output threads."); + }else{ + rosKeep.resetNextListID(); + } + } + + if(outToss1!=null){ + if(x==0 || outToss1.length>1){ + if(rosToss!=null){ + rosToss.close(); + rosToss.join(); + } + + FileFormat ff1=FileFormat.testOutput(outToss1[x], FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(outToss2[x], FileFormat.FASTQ, null, true, overwrite, ordered); + rosToss=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosToss.start(); + outstream.println("Started output threads."); + }else{ + rosToss.resetNextListID(); + } + } + + if(outLow1!=null){ + if(x==0 || outLow1.length>1){ + if(rosLow!=null){ + rosLow.close(); + rosLow.join(); + } + + FileFormat ff1=FileFormat.testOutput(outLow1[x], FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(outLow2[x], FileFormat.FASTQ, null, true, overwrite, ordered); + rosLow=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosLow.start(); + outstream.println("Started output threads."); + }else{ + rosLow.resetNextListID(); + } + } + + if(outMid1!=null){ + if(x==0 || outMid1.length>1){ + if(rosMid!=null){ + rosMid.close(); + rosMid.join(); + } + + FileFormat ff1=FileFormat.testOutput(outMid1[x], FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(outMid2[x], FileFormat.FASTQ, null, true, overwrite, ordered); + 
rosMid=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosMid.start(); + outstream.println("Started output threads."); + }else{ + rosMid.resetNextListID(); + } + } + + if(outHigh1!=null){ + if(x==0 || outHigh1.length>1){ + if(rosHigh!=null){ + rosHigh.close(); + rosHigh.join(); + } + + FileFormat ff1=FileFormat.testOutput(outHigh1[x], FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(outHigh2[x], FileFormat.FASTQ, null, true, overwrite, ordered); + rosHigh=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosHigh.start(); + outstream.println("Started output threads."); + }else{ + rosHigh.resetNextListID(); + } + } + + if(outUnc1!=null){ + if(x==0 || outUnc1.length>1){ + if(rosUnc!=null){ + rosUnc.close(); + rosUnc.join(); + } + + FileFormat ff1=FileFormat.testOutput(outUnc1[x], FileFormat.FASTQ, null, true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(outUnc2[x], FileFormat.FASTQ, null, true, overwrite, ordered); + rosUnc=new RTextOutputStream3(ff1, ff2, buff, null, true); + + rosUnc.start(); + outstream.println("Started output threads."); + }else{ + rosUnc.resetNextListID(); + } + } + + String in1=list1[x]; + String in2=(list2==null || list2.length<=x ? 
null : list2[x]); + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + if(verbose){System.err.println("Started cris");} + Thread th=new Thread(cris); + th.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + bases+=downsample(cris, kca, k, maxReads, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc, histFile, overwrite, estUnique, storage); + + errorState|=ReadWrite.closeStream(cris); + if(verbose){System.err.println("Closed stream");} + + } + + errorState|=ReadWrite.closeStreams(null, rosKeep, rosToss, rosLow, rosMid, rosHigh, rosUnc); + + return bases; + } + + + + public static long downsample(ConcurrentReadStreamInterface cris, KCountArray kca, int k, long maxReads, + RTextOutputStream3 rosKeep, RTextOutputStream3 rosToss, RTextOutputStream3 rosLow, RTextOutputStream3 rosMid, RTextOutputStream3 rosHigh, RTextOutputStream3 rosUnc, + String histFile, boolean overwrite, long estUnique, ArrayList storage) { + Timer tdetect=new Timer(); + tdetect.start(); + + long totalBases=0; + long totalReads=0; + + long readsKept=0; + long readsTossed=0; + long readsLowBin=0; + long readsMidBin=0; + long readsHighBin=0; + long readsUncorrected=0; + long basesKept=0; + long basesTossed=0; + long basesLowBin=0; + long basesMidBin=0; + long basesHighBin=0; + long basesUncorrected=0; + + + long errorReads=0; + long errorPairs=0; + long errorType1=0; + long errorType2=0; + long errorType3=0; + + long errorsDetected=0; + long errorsMarked=0; + long errorsCorrected=0; + long basesTrimmed=0; + + KCountArray kcaup=null; + if(COUNTUP){ + final int bits; + if(TARGET_DEPTH<=15){ + bits=4; + }else if(TARGET_DEPTH<=255){ + bits=8; + }else{ + bits=16; + } + + long cells=(FILTERBYTES*8)/bits; + int 
kbits=2*k; + kcaup=KCountArray.makeNew(1L<1){ + histogram_total.addAndGet(1, histogram_total.get(0)); + histogram_total.set(0, 0); + } + +// outstream.println(); + tdetect.stop(); + outstream.println("Table read time: \t\t"+tdetect+" \t"+String.format("%.2f", totalBases*1000000.0/(tdetect.elapsed))+" kb/sec"); + + { + String pad=""; + String s=""+totalReads; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Total reads in: \t\t"+totalReads+pad+String.format("\t%.3f%% Kept", (readsKept*100.0/totalReads))); + s=""+totalBases; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Total bases in: \t\t"+totalBases+pad+String.format("\t%.3f%% Kept", (basesKept*100.0/totalBases))); + + if(rosLow!=null){ + s=""+readsLowBin; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Low bin reads: \t\t"+readsLowBin+pad+String.format("\t%.3f%%", (readsLowBin*100.0/totalReads))); + s=""+basesLowBin; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Low bin bases: \t\t"+basesLowBin+pad+String.format("\t%.3f%%", (basesLowBin*100.0/totalBases))); + } + if(rosMid!=null){ + s=""+readsMidBin; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Mid bin reads: \t\t"+readsMidBin+pad+String.format("\t%.3f%%", (readsMidBin*100.0/totalReads))); + s=""+basesMidBin; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Mid bin bases: \t\t"+basesMidBin+pad+String.format("\t%.3f%%", (basesMidBin*100.0/totalBases))); + } + if(rosHigh!=null){ + s=""+readsHighBin; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("High bin reads: \t\t"+readsHighBin+pad+String.format("\t%.3f%%", (readsHighBin*100.0/totalReads))); + s=""+basesHighBin; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("High bin bases: \t\t"+basesHighBin+pad+String.format("\t%.3f%%", (basesHighBin*100.0/totalBases))); + } + + s=""+errorReads; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Error 
reads in: \t\t"+errorReads+pad+String.format("\t%.3f%%", (errorReads*100.0/totalReads))); + if(cris.paired()){ + s=""+errorPairs; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Error pairs in: \t\t"+errorPairs+pad+String.format("\t%.3f%%", (errorPairs*200.0/totalReads))); + } + s=""+errorType1; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Error type 1: \t\t"+errorType1+pad+String.format("\t%.3f%%", (errorType1*100.0/totalReads))); + s=""+errorType2; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Error type 2: \t\t"+errorType2+pad+String.format("\t%.3f%%", (errorType2*100.0/totalReads))); + s=""+errorType3; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Error type 3: \t\t"+errorType3+pad+String.format("\t%.3f%%", (errorType3*100.0/totalReads))); + + + if(TRIM_LEFT_THIS_PASS || TRIM_RIGHT_THIS_PASS){ + outstream.println("\nDuring Trimming:"); + s=""+(errorsDetected+errorsCorrected+errorsMarked); + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Bases Trimmed: \t\t"+(basesTrimmed)); + } + + if(CORRECT_ERRORS_THIS_PASS){ + outstream.println("\nDuring Error Correction:"); + s=""+(errorsDetected+errorsCorrected+errorsMarked); + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Errors Suspected:\t\t"+(errorsDetected+errorsCorrected+errorsMarked)); + s=""+errorsCorrected; + pad=""; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Errors Corrected:\t\t"+errorsCorrected); + s=""+errorsMarked; + pad=""; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Errors Marked: \t\t"+errorsMarked+"\n"); + } + } + +// outstream.println(); + if(histogram_total!=null){ + TextStreamWriter tswh=null; + StringBuilder sb=new StringBuilder(100); + if(USE_HISTOGRAM && histFile!=null){ + tswh=new TextStreamWriter(histFile, overwrite, false, false); + tswh.start(); + tswh.print("#Depth\tRaw_Count\tUnique_Kmers\n"); + } + int 
lim=(int)(HIST_LEN_PRINT-1); + long remaining=Tools.sum(histogram_total); + long sumRaw1=0; + long sumRaw2=0; + long sum1=0; + long sum2=0; + long sumsquare=0; + for(int i=0; i0*/ || y>0){ + sb.append(i).append('\t'); + sb.append(x).append('\t'); + sb.append(y).append('\n'); + } + tswh.print(sb.toString()); + sb.setLength(0); + } + if(sumRaw1>=remaining){break;} //Stop once there is no more coverage, even if PRINT_ZERO_COVERAGE is not set. + } + for(int i=lim; i0 || sum2>0){ + sb.append(lim).append('\t'); + sb.append(sumRaw2).append('\t'); + sb.append(sum2).append('\n'); + } + tswh.print(sb.toString()); + tswh.poison(); + tswh.waitForFinish(); + outstream.println("Wrote histogram to "+histFile); + } + + long histCount=Tools.sum(histogram_total); //Total number of kmers counted + long halfCount=(histCount+1)/2; + double histCountU=0; //Unique kmers counted + long temp1=0; + double temp2=0; + int median_all=-1; + int median_unique=-1; + for(int i=0; i=halfCount && median_all<0){median_all=i;} +// histSum+=(x*(double)i); + histCountU+=(x/(double)Tools.max(1, i)); + } + double halfCount2=(histCountU)/2; + for(int i=0; i=halfCount2 && median_unique<0){ + median_unique=i; + break; + } + } + if(median_all<0){median_all=0;} + double avg_all=sumsquare/(double)histCount; + double avg_unique=histCount/histCountU; + double stdev_unique=Tools.standardDeviationHistogramKmer(histogram_total); + double stdev_all=Tools.standardDeviationHistogram(histogram_total); + outstream.println("Total kmers counted: \t"+(sumRaw1+sumRaw2)); + + double uniqueC=((sum1+sum2)*100.0/(sumRaw1+sumRaw2)); + double uniqueE=((estUnique)*100.0/(sumRaw1+sumRaw2)); + double uniqueM=Tools.max(uniqueC, uniqueE); + outstream.println("Total unique kmer count: \t"+(sum1+sum2)); + if(CANONICAL){outstream.println("Includes forward kmers only.");} + outstream.println("The unique kmer estimate can be more accurate than the unique count, if the tables are very full."); + outstream.println("The most accurate value is 
the greater of the two."); + outstream.println(); + + outstream.println("Percent unique: \t"+(uniqueM<10 ? " " : "")+String.format("%.2f%%", uniqueM)); + + outstream.println("Depth average: \t"+String.format("%.2f\t(unique kmers)", avg_unique)); + outstream.println("Depth median: \t"+String.format("%d\t(unique kmers)", median_unique)); + outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(unique kmers)", stdev_unique)); + + outstream.println("\nDepth average: \t"+String.format("%.2f\t(all kmers)", avg_all)); + outstream.println("Depth median: \t"+String.format("%d\t(all kmers)", median_all)); + outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(all kmers)", stdev_all)); + + double avgReadLen=totalBases*1.0/totalReads; + double readDepth=median_all*(avgReadLen/(avgReadLen-k+1)); + + outstream.println("\nApprox. read depth median: \t"+String.format("%.2f", (readDepth))); + } + + return totalBases; + } + + + + /** + * Locates and fixes spikes in a coverage profile (potentially) caused by false positives in a bloom filter. + * Theory: If a high-count kmer is adjacent on both sides to low-count kmers, it may be a false positive. + * It could either be reduced to the max of the two flanking points or examined in more detail. + * @param cov An array of kmer counts for adjacent kmers in a read. 
+ */ + private static void fixSpikes(int[] cov){ + + for(int i=1; i1 && b>a && b>c){ + //peak + if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + //spike + cov[i]=(int)Tools.max(a, c); + } + } + } + } + + private static void fixSpikes(int[] cov, long[] kmers, KCountArray kca, final int k){ + assert(k<32); + if(cov.length<3){return;} + if(cov[1]-cov[0]>1){ + cov[0]=kca.readPrecise(kmers[0], k, true); + } + if(cov[cov.length-1]-cov[cov.length-2]>1){ + cov[cov.length-1]=kca.readPrecise(kmers[cov.length-1], k, true); + } + + for(int i=1; i1){ + long a=Tools.max(1, cov[i-1]); + long c=Tools.max(1, cov[i+1]); + long key=kmers[i]; + + if(b>a && b>c){ + //peak + if(b<6 || b>a+1 || b>c+1){ + cov[i]=kca.readPreciseMin(key, k, true); + } + // if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + // //spike + // int b1=(int)((a+c)/2); + // int b2=kca.readLeft(key, k, CANONICAL); + // int b3=kca.readRight(key, k, CANONICAL); + // array[i]=Tools.min(b, b1, b2, b3); + // } + // else + // { + //// array[i]=kca.readPreciseMin(key, k, CANONICAL); + // } + } + // else + // if(Tools.max(ada, adc)>=Tools.max(2, Tools.min((int)a, b, (int)c)/4)){ + // array[i]=kca.readPrecise(key, k, CANONICAL); + // } + // else + // if(b>a+1 || b>c+1){ + // //steep + // array[i]=kca.readPrecise(key, k, CANONICAL); + // } + } + } + } + + + private static int correctErrors(Read r, int[] cov, long[] kmers, KCountArray kca, final int k, + final int low, final int high, final int mult, int maxToCorrect, int maxQual, boolean kmersAlreadyValid, boolean coverageAlreadyValid, long[] qhist, final boolean markOnly){ + assert(k<32); + assert(maxToCorrect>0) : "Don't do error correction with a maximum of 0 errors; it's a waste of time."; + if(maxToCorrect<1){return 0;} + + if(!kmersAlreadyValid){kmers=r.toKmers(k, 0, kmers, false);} + if(kmers==null || kmers.length<3){return -99;} + + if(!coverageAlreadyValid){cov=generateCoverage(kca, k, cov, kmers, true);} + + int disc=countDiscontinuities(cov, low, high, mult); + + 
if(disc==0){return 0;} + + byte[] copy=r.bases.clone(); + + byte[] suffix=new byte[SUFFIX_LEN]; + + int cfl=0, cfr=0; + + if(CORRECT_FROM_LEFT){ + cfl=correctErrorsFromLeft(r, cov, kmers, kca, k, low, high, mult, maxToCorrect, maxQual, suffix, qhist, markOnly); + if(cfl<0){ + //Failed correction. + r.bases=copy; + return cfl; + } + maxToCorrect-=cfl; + } + + if(CORRECT_FROM_RIGHT && maxToCorrect>0){ + {//Optional block - allows saving of errors corrected from left even if correctErrorsFromRight fails. + if(cfl>0){ + for(int i=0; i0) : "Don't do error correction with a maximum of 0 errors; it's a waste of time."; + if(maxToCorrect<1){return 0;} + + if(!kmersAlreadyValid){kmers=r.toKmers(k, 0, kmers, false);} + if(kmers==null || kmers.length<3){return 0;} + + if(!coverageAlreadyValid){cov=generateCoverage(kca, k, cov, kmers, true);} + + int disc=countDiscontinuities(cov, low, high, mult); + + if(disc==0){return 0;} + + int cfl=0, cfr=0; + + if(CORRECT_FROM_LEFT){ + cfl=markErrorsFromLeft(r, cov, k, low, high, mult, maxToCorrect, qhist); + maxToCorrect-=cfl; + } + + if(CORRECT_FROM_RIGHT){ + cfr=markErrorsFromRight(r, cov, k, low, high, mult, maxToCorrect, qhist); + } + + int marked=cfl+cfr; + final byte[] quals=r.quality; + if(marked>0){ + int found=0; + if(quals!=null){ + for(int i=0; i=0); + } + } + assert(marked==0) : marked; + } + + return marked; + } + + /** Returns number of discontinuities detected. This is not the same as the number of errors, + * but the presence of discontiniuities indicates the presence of errors. 
+ * @param cov + * @param low + * @param high + * @param mult + * @return + */ + private static int countDiscontinuities(final int[] cov, final int low, final int high, final int mult){ + + int found=0; + + for(int i=2; i=high && (b<=low || a>=b*mult)){//error + found++; + } + } + + for(int i=cov.length-3; i>=0; i--){ + int a=Tools.min(cov[i+2], cov[i+1]); + int b=cov[i]; + if(a>=high && (b<=low || a>=b*mult)){//error + found++; + } + } + + return found; + } + + + private static void regenerateKmersAndCoverage(final Read r, final long[] kmers, final int[] cov, final KCountArray kca, final int k, boolean makeCanonical){ + assert(r!=null && kmers!=null && cov!=null && kca!=null && kca.gap==0); + final byte[] bases=r.bases; + if(bases==null || bases.length31){ + r.toKmers(k, 0, kmers, false); + generateCoverage(kca, k, cov, kmers, true); + return; + } + + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + int len=0; + long kmer=0; + final int arraylen=bases.length-k+1; + assert(kmers.length==arraylen && cov.length==arraylen); + + for(int i=0, j=1-k; i=k){ + long y=kmer; + if(makeCanonical){ + y=KCountArray.makeCanonical2(y, k); + } + + if(kmers[j]!=y){ + kmers[j]=y; + cov[j]=kca.read(y, k, !makeCanonical); + } + } + } + } + } + + + private static int correctErrorsFromLeft(final Read r, final int[] cov, final long[] kmers, final KCountArray kca, final int k, + final int low, final int high, final int mult, final int maxToCorrect, int maxQual, final byte[] suffix, final long[] qhist, boolean markOnly){ + + int found=0; + int corrected=0; + int uncorrected=0; + final byte[] quals=r.quality; + + for(int i=PREFIX_LEN; i=high && (b<=low || a>=b*mult)){//error + found++; + final int loc=i+k-1; + final byte q=(quals==null ? 
10 : quals[loc]); + if(qhist!=null){qhist[q]++;} + + if(markOnly){ + corrected++; + if(quals==null){r.bases[loc]='N';} +// else if(q>0){quals[loc]=(byte)Tools.max(1, q/2);} + else if(q>0){quals[loc]=(byte)Tools.max(1, q/2-3);} + }else{ + if(found>maxToCorrect || q>maxQual){return 0-found;} + boolean success=correctErrorFromLeft(r, cov, kmers, kca, k, low, Tools.max(high, a/2), 2*a, mult, i, suffix); + if(success){ + corrected++; + // r.toKmers(k, 0, kmers, false); + // generateCoverage(kca, k, cov, kmers, true); + regenerateKmersAndCoverage(r, kmers, cov, kca, k, false); + }else{ + uncorrected++; + break; + } + } + } + } + +// assert(false) : Arrays.toString(cov)+"\nlow="+low+", high="+high+", mult="+mult+", found="+found+", corrected="+corrected+", uncorrected="+uncorrected; + + + return (uncorrected>0 ? 0-found : corrected); + } + + + private static int correctErrorsFromRight(final Read r, final int[] cov, final long[] kmers, final KCountArray kca, final int k, + final int low, final int high, final int mult, final int maxToCorrect, int maxQual, final byte[] suffix, final long[] qhist, final boolean markOnly){ + + int found=0; + int corrected=0; + int uncorrected=0; + final byte[] quals=r.quality; + + final int start=(markOnly ? Tools.min(cov.length-PREFIX_LEN-1, k-1) : cov.length-PREFIX_LEN-1); + for(int i=start; i>=0; i--){ +// int a=Tools.min(cov[i+2], cov[i+1]); + int a=Tools.min(cov, i+1, i+PREFIX_LEN); + int b=cov[i]; + if(a>=high && (b<=low || a>=b*mult)){//error + found++; + final byte q=(quals==null ? 
10 : quals[i]); + if(qhist!=null){qhist[q]++;} + + if(markOnly){ + corrected++; + if(quals==null){r.bases[i]='N';} +// else if(q>0){quals[i]=(byte)Tools.max(1, q/2);} + else if(q>0){quals[i]=(byte)Tools.max(1, q/2-3);} + }else{ + if(found>maxToCorrect || q>maxQual){return 0-found;} + boolean success=correctErrorFromRight(r, cov, kmers, kca, k, low, Tools.max(high, a/2), 2*a, mult, i, suffix); + if(success){ + corrected++; + // r.toKmers(k, 0, kmers, false); + // generateCoverage(kca, k, cov, kmers, true); + regenerateKmersAndCoverage(r, kmers, cov, kca, k, false); + }else{ + uncorrected++; + break; + } + } + } + } + +// assert(false) : Arrays.toString(cov)+"\nlow="+low+", high="+high+", mult="+mult+", found="+found+", corrected="+corrected+", uncorrected="+uncorrected; + + + return (uncorrected>0 ? 0-found : corrected); + } + + + private static int markErrorsFromLeft(final Read r, final int[] cov, final int k, + final int low, final int high, final int mult, final int maxToCorrect, final long[] qhist){ + + int found=0; + final byte[] quals=r.quality, bases=r.bases; + + for(int i=PREFIX_LEN; i=high && (b<=low || a>=b*mult)){//error + final int loc=i+k-1; + final byte q=(quals==null ? 10 : quals[loc]); + + if(q>0){ + found++; + if(qhist!=null){qhist[q]++;} + if(quals==null){bases[loc]='N';} + else{quals[loc]=(byte)-q;} + } + } + } + return found; + } + + + private static int markErrorsFromRight(final Read r, final int[] cov, final int k, + final int low, final int high, final int mult, final int maxToCorrect, final long[] qhist){ + + int found=0; + final byte[] quals=r.quality, bases=r.bases; + + final int start=cov.length-PREFIX_LEN-1; + for(int i=start; i>=0; i--){ + int a=Tools.min(cov, i+1, i+PREFIX_LEN); + int b=cov[i]; + if(a>=high && (b<=low || a>=b*mult)){//error + final byte q=(quals==null ? 
10 : quals[i]); + + if(q>0){ + found++; + if(qhist!=null){qhist[q]++;} + if(quals==null){bases[i]='N';} + else{quals[i]=(byte)-q;} + } + } + } + + return found; + } + + private static boolean correctErrorFromLeft(final Read r, final int[] cov, final long[] kmers, final KCountArray kca, final int k, + final int low, final int targetLowerBound, final int targetUpperBound, final int mult, final int loc, final byte[] suffix){ + + for(int i=0, j=loc+k-1; i0){ + assert(kmer==-1L) : new String(suffix)+"\t"+kmer; + if(kmers[loc-1]!=-1L){ + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + kmer=((kmers[loc-1]<<2)&mask); + } + } +// int leftCov=Tools.min(cov[loc-1], cov[loc-2]); + +// assert(false) : "kmer = "+AminoAcid.kmerToString(kmers[0], k); +// assert(false) : "suffix = "+new String(suffix); + +// assert(false) : new String(suffix)+"\t"+kmer; + + suffix[0]='A'; + final int a=testRightSuffix(kca, k, kmer, suffix); + suffix[0]='C'; + final int c=testRightSuffix(kca, k, kmer, suffix); + suffix[0]='G'; + final int g=testRightSuffix(kca, k, kmer, suffix); + suffix[0]='T'; + final int t=testRightSuffix(kca, k, kmer, suffix); + + final int max=Tools.max(a, c, g, t); + byte best='N'; + +// assert(false) : "rid="+r.numericID+"\n"+Arrays.toString(cov)+"\n" + +// new String(r.bases)+"\n" + +// "loc="+loc+", "+new String(suffix)+"\n" + +// "low="+low+", high="+high+", mult="+mult+", a="+a+", c="+c+", g="+g+", t="+t+", max="+max; + + if(max>=targetLowerBound && max<=targetUpperBound){ + //Found correct answer! 
+ final int max2; + if(a==max){ + max2=Tools.max(c, g, t); + best='A'; + }else if(c==max){ + max2=Tools.max(a, g, t); + best='C'; + }else if(g==max){ + max2=Tools.max(a, c, t); + best='G'; + }else if(t==max){ + max2=Tools.max(a, c, g); + best='T'; + }else{ + max2=max; + assert(false); + } + +// assert(false) : max+", "+max2+", "+low+", "+(char)best; + if(max2<=low || max2*mult<=max){ + final int bnum=loc+k-1; + r.bases[bnum]=best; + if(!defined && r.quality!=null){ + assert(r.quality[bnum]==0) : r; + r.quality[bnum]=FIXED_N_QUAL; + } + return true; + } + } + +// assert(false) : max+", "+targetLowerBound+", "+targetUpperBound+", "+low+", "+(char)best; + + return false; + } + + private static boolean correctErrorFromRight(final Read r, final int[] cov, final long[] kmers, final KCountArray kca, final int k, + final int low, final int targetLowerBound, final int targetUpperBound, final int mult, final int loc, final byte[] suffix){ + + for(int i=0, j=loc; i=0){ + suffix[i]=r.bases[j]; + }else{ + suffix[i]='N'; + } + } +// if(r.numericID!=3500){return false;} + + long kmer=kmers[loc]; + final boolean defined=(AminoAcid.isFullyDefined(suffix[0])); + + //This block added to allow correction of no-calls + if(!defined && loc+1>2)&mask); + } + } +// int rightCov=Tools.min(cov[loc+1], cov[loc+2]); + +// assert(false) : "kmer = "+AminoAcid.kmerToString(kmers[0], k); +// assert(false) : "suffix = "+new String(suffix); + + suffix[0]='A'; + final int a=testLeftSuffix(kca, k, kmer, suffix); + suffix[0]='C'; + final int c=testLeftSuffix(kca, k, kmer, suffix); + suffix[0]='G'; + final int g=testLeftSuffix(kca, k, kmer, suffix); + suffix[0]='T'; + final int t=testLeftSuffix(kca, k, kmer, suffix); + + final int max=Tools.max(a, c, g, t); + byte best='N'; + +// assert(false) : "\nrid="+r.numericID+"\n"+Arrays.toString(cov)+"\n" + +// new String(r.bases)+"\n"+ +// "kmer-2 = "+AminoAcid.kmerToString(kmers[loc-2], k)+"\n"+ +// "kmer-1 = "+AminoAcid.kmerToString(kmers[loc-1], k)+"\n"+ +// 
"kmer = "+AminoAcid.kmerToString(kmer, k)+"\n"+ +// "kmer+1 = "+AminoAcid.kmerToString(kmers[loc+1], k)+"\n"+ +// "kmer+2 = "+AminoAcid.kmerToString(kmers[loc+2], k)+"\n"+ +// "count=("+kca.read(kmers[loc-2], k, true)+", "+kca.read(kmers[loc-1], k, true)+", "+ +// kca.read(kmer, k, true)+", "+kca.read(kmers[loc+1], k, true)+", "+kca.read(kmers[loc+2], k, true)+")\n"+ +// "loc="+loc+", suffix="+new String(suffix)+"\n" + +// "low="+low+", high="+high+", mult="+mult+", a="+a+", c="+c+", g="+g+", t="+t+", max="+max; + + if(max>=targetLowerBound && max<=targetUpperBound){ + //Found correct answer! + final int max2; + if(a==max){ + max2=Tools.max(c, g, t); + best='A'; + }else if(c==max){ + max2=Tools.max(a, g, t); + best='C'; + }else if(g==max){ + max2=Tools.max(a, c, t); + best='G'; + }else if(t==max){ + max2=Tools.max(a, c, g); + best='T'; + }else{ + max2=max; + assert(false); + } + + if(max2<=low || max2*mult<=max){ + r.bases[loc]=best; + if(!defined && r.quality!=null){ + assert(r.quality[loc]==0) : r; + r.quality[loc]=FIXED_N_QUAL; + } + return true; + } + } + return false; + } + + private static int testRightSuffix(final KCountArray kca, final int k, final long kmer0, final byte[] suffix){ + assert(k<=31); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + long kmer=kmer0>>2; + int min=Integer.MAX_VALUE; + +// System.out.println("Processing suffix "+new String(suffix)); +// System.out.println("kmer = "+AminoAcid.kmerToString(kmer0, k)); +// System.out.println("cov = "+kca.read(kmer0, k, true)); + + + for(int i=0; i0; i++){ + byte b=suffix[i]; + if(b=='N'){ + //TODO: Find best next letter + return 0; + } + assert(b!='N'); + int x=AminoAcid.baseToNumber[b]; + assert(x>=0); + + + kmer=((kmer<<2)|x)&mask; + int cov=kca.read(kmer, k, true); + min=Tools.min(min, cov); + +// System.out.println("kmer = "+AminoAcid.kmerToString(kmer, k)); +// System.out.println("cov = "+cov); + } +// System.out.println("returning "+min); + + assert(mina && b>c){ + peakcount++; 
+ if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + spikecount++; + } + }else if(b0){peaks.addAndGet(peakcount);} + if(valleycount>0){valleys.addAndGet(valleycount);} + if(spikecount>0){spikes.addAndGet(spikecount);} + if(flatcount>0){flats.addAndGet(flatcount);} + if(slopecount>0){slopes.addAndGet(slopecount);} + } + + + /** + * kmer array must be valid at this point + * @param r + * @param kca + * @return + */ + public static int[] generateCoverage(Read r, KCountArray kca, final int k, int[] out, long[] kmers){ + if(kca.gap>0){throw new RuntimeException("Gapped reads: TODO");} + + assert(kmers!=null); + if(kmers==null){return null;} //Read is too short + + out=generateCoverage(kca, k, out, kmers, k<=31); + + if(ANALYZE_TOPOLOGY){analyzeSpikes(out, 1);} + return out; + } + + /** + * kmer array must be valid at this point + * @param r + * @param kca + * @return + */ + public static int[] generateCoverage(KCountArray kca, int k, int[] out, long[] kmers, boolean makeCanonical){ + if(kca.gap>0){throw new RuntimeException("Gapped reads: TODO");} + if(kmers==null){return null;} + + if(out==null || out.length!=kmers.length){out=new int[kmers.length];} + Arrays.fill(out, -1); + + for(int i=0; i0){ +// targetDepth=(int)((TARGET_DEPTH_BAD_LOW*(long)lowcount+TARGET_DEPTH_BAD_HIGH*(totalcount-(long)lowcount))/totalcount); + + double fractionGood=(totalcount-lowcount)/(float)totalcount; + targetDepth=(int)(TARGET_DEPTH_BAD_LOW+(TARGET_DEPTH_BAD_HIGH-TARGET_DEPTH_BAD_LOW)*(fractionGood*fractionGood)); + assert(TARGET_DEPTH_BAD_LOW<=TARGET_DEPTH_BAD_HIGH); + assert(TARGET_DEPTH>=99999999 || (targetDepth>0 && targetDepth<=TARGET_DEPTH)) : + targetDepth+", "+TARGET_DEPTH+", "+TARGET_DEPTH_BAD_LOW+", "+TARGET_DEPTH_BAD_HIGH+", "+lowcount+", "+totalcount; + assert(TARGET_DEPTH>=99999999 || (targetDepth>=TARGET_DEPTH_BAD_LOW && targetDepth<=TARGET_DEPTH_BAD_HIGH)) : + targetDepth+", "+TARGET_DEPTH+", "+TARGET_DEPTH_BAD_LOW+", "+TARGET_DEPTH_BAD_HIGH+", "+lowcount+", "+totalcount; + 
maxDepth=targetDepth; + } + + + final int lowerdepthAL=(depthAL1>=0 ? (depthAL2>=0 ? Tools.min(depthAL1, depthAL2) : depthAL1) : depthAL2); //The lower of depth1 and depth2 + final int lowertruedepth=(r2==null ? truedepth1 : Tools.min(truedepth1, truedepth2)); + long coin=0; + if(lowerdepthAL>maxDepth && (error1 || error2 || !DISCARD_BAD_ONLY)){ + if(r.rand<0){ + coin=randy.nextLong(lowerdepthAL)+1; + }else{ + coin=((long)(r.rand*lowerdepthAL))+1; + } + } + + totalReads+=readcount; + totalBases+=basecount; + + boolean toss=(lowerdepthAL<0 || coin>targetDepth || (r!=null && r.bases.length=HTHRESH){ + //do nothing + }else if(!REQUIRE_BOTH_BAD || r2==null || (error1 && error2)){ + toss=true; + } + } + + if(TOSS_BY_LOW_TRUEDEPTH && !SAVE_RARE_READS && lowertruedepthHIGH_BIN_DEPTH) && (depth2<0 || depth2>=HIGH_BIN_DEPTH)){ + }else if((depthAL1HIGH_BIN_DEPTH) && (depthAL2=HIGH_BIN_DEPTH)){ + readsHighBin+=readcount; + basesHighBin+=basecount; + if(highList!=null){highList.add(r);} + }else{ + assert((depthAL1>=LOW_BIN_DEPTH && depthAL1<=HIGH_BIN_DEPTH) || (depthAL2>=LOW_BIN_DEPTH && depthAL2<=HIGH_BIN_DEPTH)) : + depthAL1+", "+depthAL2+", "+LOW_BIN_DEPTH+", "+HIGH_BIN_DEPTH; + readsMidBin+=readcount; + basesMidBin+=basecount; + if(midList!=null){midList.add(r);} + } + + if(uncorrectable1 || uncorrectable2){ + readsUncorrected+=readcount; + basesUncorrected+=basecount; + if(uncList!=null){uncList.add(r);} + } + } + + + if(storage!=null){ + synchronized(storage){ + storage.addAll(keepList); + if(ADD_BAD_READS_COUNTUP){storage.addAll(tossList);} + } + } + + + if(rosk!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. +// System.err.println("Adding list "+ln.id+" of length "+reads.size()); + rosk.add(keepList, ln.id); + keepList.clear(); + } + if(rost!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. 
+// System.err.println("Adding list "+ln.id+" of length "+reads.size()); + rost.add(tossList, ln.id); + tossList.clear(); + } + + if(rosl!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. +// System.err.println("Adding list "+ln.id+" of length "+reads.size()); + rosl.add(lowList, ln.id); + lowList.clear(); + } + if(rosm!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. +// System.err.println("Adding list "+ln.id+" of length "+reads.size()); + rosm.add(midList, ln.id); + midList.clear(); + } + if(rosh!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. +// System.err.println("Adding list "+ln.id+" of length "+reads.size()); + rosh.add(highList, ln.id); + highList.clear(); + } + if(rosu!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. +// System.err.println("Adding list "+ln.id+" of length "+reads.size()); + rosu.add(uncList, ln.id); + uncList.clear(); + } + + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + } + + + void normalizeInThreadByCountup() { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + final ArrayList keepList=(rosk==null ? null : new ArrayList(Shared.READ_BUFFER_LENGTH)); + final ArrayList tossList=(rost==null ? 
null : new ArrayList(Shared.READ_BUFFER_LENGTH)); + + int[] cov=null, covSorted=null, covup=null; + long[] kmers1=null, kmers2=null, kmers3=null; + + while(reads!=null && reads.size()>0){ + for(int rnum=0; rnum=k){ + if(verbose){outstream.println();} + kmers1=r.toKmers(k, kca.gap, kmers1, true); + k1valid=true; + } + } + if(r2!=null && r2.bases!=null){ + readcount++; + basecount+=r2.bases.length; + if(r2.bases.length>=k){ + if(verbose){outstream.println();} + kmers2=r2.toKmers(k, kca.gap, kmers2, true); + k2valid=true; + } + } + + final int mergelen=(k1valid ? kmers1.length : 0)+(k2valid ? kmers2.length : 0); + int valid=0, unique=0, desired=0, needed=0, badlyneeded=0; + if(mergelen>0){ + if(kmers3==null || kmers3.length!=mergelen){kmers3=new long[mergelen];} + int j=0; + if(k1valid){ + for(int i=0; i0 && kmer==kmers3[i-1]){ + cov[i]=-1; + covup[i]=-1; + valid++; + }else{ + cov[i]=kca.read(kmer); + covup[i]=kcaup.read(kmer); + valid++; + unique++; + if(cov[i]>=MIN_DEPTH){ + desired++; + if(covup[i]-1){ + if((x>=HTHRESH && prev<=LTHRESH) || x>=prev*ERROR_DETECT_RATIO){ + errors=covSorted.length-i; + break; + }else{nonerrors++;} + } + prev=x; + } + } + + int t1=Tools.max(8, (unique+5)/6); + int t2=Tools.max(2, (unique+23)/24); + + boolean toss=!((needed>=t1 || badlyneeded>=t2) && (desired>=MIN_KMERS_OVER_MIN_DEPTH || unique8 && (needed<2*t1 && badlyneeded<2*t2)){toss=true;} + if(TOSS_ERROR_READS && errors>unique/2 && (needed<3*t1 && badlyneeded<4*t2)){toss=true;} +// assert(false) : "\n"+TOSS_ERROR_READS+", "+unique+", "+desired+", "+needed+", "+badlyneeded+", "+errors+", "+t1+", "+t2; +// System.out.println("valid="+valid+", unique="+unique+", desired="+desired+", needed="+needed+", toss="+toss); + if(KEEP_ALL){toss=false;} + + totalReads+=readcount; + totalBases+=basecount; + + if(toss){ +// System.out.println("valid="+valid+", unique="+unique+", desired="+desired+", needed="+needed+", toss="+toss); + if(tossList!=null){tossList.add(r);} + readsTossed+=readcount; + 
basesTossed+=basecount; + }else{ +// System.out.println("valid="+valid+", unique="+unique+", desired="+desired+", needed="+needed+", toss="+toss); +// System.out.println("valid="+valid+", unique="+unique+", desired="+desired+", needed="+needed+", toss="+toss+"\n"+Arrays.toString(cov) +// +"\n"+Arrays.toString(covup)+"\n"+Arrays.toString(kmers3)); + for(int i=0; i=MIN_DEPTH){ + long kmer=kmers3[i]; + kcaup.increment(kmer); + } + } + if(keepList!=null){keepList.add(r);} + readsKept+=readcount; + basesKept+=basecount; + } + + if(mergelen>0){ +// Arrays.sort(cov); +// incrementHistogramSorted(cov); + incrementHistogramSorted(covSorted); + } + } + + if(storage!=null){ + synchronized(storage){ + storage.addAll(keepList); + } + } + + if(rosk!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. +// System.err.println("Adding list "+ln.id+" of length "+reads.size()); + rosk.add(keepList, ln.id); + keepList.clear(); + } + if(rost!=null){ //Important to send all lists to output, even empty ones, to keep list IDs straight. +// System.err.println("Adding list "+ln.id+" of length "+reads.size()); + rost.add(tossList, ln.id); + tossList.clear(); + } + + assert(rosl==null) : "Low fraction out not supported by countup."; + assert(rosm==null) : "Mid fraction out not supported by countup."; + assert(rosh==null) : "High fraction out not supported by countup."; + assert(rosu==null) : "TODO - Uncorrectable fraction out not supported by countup."; + + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + } + + private final int[] getSortedCoverageAndIncrementHistogram(Read r, int[] cov, long[] kmers, + boolean kmersAlreadyValid, boolean kmersAlreadyCanonical, boolean coverageAlreadyValid){ + assert(r!=null && r.bases!=null && r.bases.length>=k) : r; + + if(!coverageAlreadyValid){ + if(!kmersAlreadyValid){kmers=r.toKmers(k, kca.gap, kmers, false);} + cov=generateCoverage(kca, k, cov, kmers, (!kmersAlreadyCanonical && k<32)); + } + + sortCoverageAndIncrementHistogram(cov); + return cov; + } + + private void sortCoverageAndIncrementHistogram(int[] cov){ + if(cov==null || cov.length==0){return;} + Arrays.sort(cov); + Tools.reverseInPlace(cov); + incrementHistogramSorted(cov); + } + + /** Handles coverage sorted in either direction */ + private final void incrementHistogramSorted(int[] cov){ + if(hist==null || cov==null || cov.length==0){return;} + +// outstream.println(Arrays.toString(cov)); + + int last=cov[0]; + long sum=0; +// long sum2=0; + int i=0; + while(i0){ +// outstream.println("Incrementing "+last+" by "+sum); +// sum2+=sum; + if(last0){ +// outstream.println("Incrementing "+last+" by "+sum); +// sum2+=sum; + if(last storage; + + private long totalBases=0; + private long totalReads=0; +// private final java.util.Random randy=new java.util.Random(); + private ThreadLocalRandom randy; + + public long readsKept=0; + public long readsTossed=0; + public long readsLowBin=0; + public long readsMidBin=0; + public long readsHighBin=0; + public long readsUncorrected=0; + public long basesKept=0; + public long basesTossed=0; + public long basesLowBin=0; + public long basesMidBin=0; + public long basesHighBin=0; + public long basesUncorrected=0; + + public long errorReads=0; + public long errorPairs=0; + public long errorType1=0; + public long errorType2=0; + public long errorType3=0; + + public long 
errorsDetected=0; + public long errorsCorrected=0; + public long errorsMarked=0; + public long basesTrimmed=0; + } + + public static PrintStream outstream=Data.sysout; + + public static int THREAD_HIST_LEN=1<<12; + public static int HIST_LEN=1<<20; + public static long HIST_LEN_PRINT=HIST_LEN; + public static boolean USE_HISTOGRAM=false; + public static boolean PRINT_ZERO_COVERAGE=false; + public static AtomicLongArray histogram_total; + public static long[] qhist_total; + + private static int THREADS=8; + private static boolean verbose=false; + private static boolean errorState=false; + + private static boolean EA=false; + static{assert(EA=true);} + + /** High-depth reads will be downsampled to this level in the current pass */ + private static int TARGET_DEPTH=40; + /** Error-containing reads will be downsampled to at least this level in the current pass */ + private static int TARGET_DEPTH_BAD_LOW=40; + /** Error-containing reads will be downsampled to at most this level in the current pass */ + private static int TARGET_DEPTH_BAD_HIGH=40; + /** High-depth reads will be downsampled to this level in the final pass */ + private static int TARGET_DEPTH_F=40; + /** High-depth reads will be downsampled to this level in the first pass */ + private static int TARGET_DEPTH_1=-1; + /** Reads under this depth will not be downsampled */ + private static int MAX_DEPTH=-1; + /** Reads under this depth will be discarded, and kmers under this depth will be ignored */ + private static int MIN_DEPTH=6; + /** Reads without this many kmers of at least min depth will be discarded */ + private static int MIN_KMERS_OVER_MIN_DEPTH=15; + /** Position in sorted kmer depths array to use as proxy for overall read depth */ + private static float DEPTH_PERCENTILE=0.54f; + + /** Throw out reads with depth at absolute depth percentile below mindepth */ + public static boolean TOSS_BY_LOW_TRUEDEPTH=true; + /** Throw out reads containing errors in the current pass */ + public static boolean 
TOSS_ERROR_READS=false; + /** Throw out reads containing errors in the final pass */ + public static boolean TOSS_ERROR_READS_F=false; + /** Throw out reads containing errors in the first pass */ + public static boolean TOSS_ERROR_READS_1=false; + /** Only downsample error reads on current pass (keep all error-free reads) */ + public static boolean DISCARD_BAD_ONLY=false; + /** Only downsample error reads on first pass (keep all error-free reads) */ + public static boolean DISCARD_BAD_ONLY_F=false; + /** Only downsample error reads on final pass (keep all error-free reads) */ + public static boolean DISCARD_BAD_ONLY_1=false; + /** Require both reads in a pair to be bad before tossing the read */ + public static boolean REQUIRE_BOTH_BAD=false; + /** Don't toss error reads with depth below max */ + public static boolean SAVE_RARE_READS=false; + /** Position in sorted kmer depths array to use as proxy for high depth kmer */ + public static float HIGH_PERCENTILE=0.90f; + /** Position in sorted kmer depths array to use as proxy for low depth kmer */ + public static float LOW_PERCENTILE=0.25f; + /** Position in sorted kmer depths array to use as proxy for low depth kmer, during countup presort pass */ + public static float LOW_PERCENTILE_COUNTUP=0.20f; + /** Set to true to keep error reads during countup presort pass */ + public static boolean ADD_BAD_READS_COUNTUP=false; + + /** Reads with a high/low ratio of at least this are considered error reads. */ + public static int ERROR_DETECT_RATIO=125; + /** Threshold for high kmer in detection. A high kmer at this or above is considered possibly non-error. */ + public static int HTHRESH=12; + /** Threshold for low kmer in detection. Kmers at this and below are always considered errors. */ + public static int LTHRESH=3; + + /** Reads with a high/low ratio of at least this are considered error reads. */ + public static int ERROR_CORRECT_RATIO=140; + /** Threshold for high kmer in correction. 
A high kmer at this or above considered possibly non-error. */ + public static int EC_HTHRESH=22; + /** Threshold for low kmer in correction. Kmers at this and below are considered errors if an adjacent kmer is at or above the high thresh. */ + public static int EC_LTHRESH=2; + + public static double TARGET_BAD_PERCENT_LOW=0.85; + public static double TARGET_BAD_PERCENT_HIGH=1.5; + + private static long FILTERBYTES=-1; + + private static int SUFFIX_LEN=3; + private static int PREFIX_LEN=3; + + private static boolean TRIM_LEFT_THIS_PASS=false; + private static boolean TRIM_RIGHT_THIS_PASS=false; + private static boolean RENAME_THIS_PASS=false; + + private static boolean CORRECT_ERRORS_THIS_PASS=false; + private static boolean MARK_ERRORS_ONLY=false; + private static boolean TRIM_AFTER_MARKING=false; + private static boolean TRIM_EVEN_IF_NO_ERRORS_DETECTED=true; + private static boolean MARK_WITH_1=false; + private static boolean MARK_UNCORRECTABLE_ERRORS=false; + private static boolean USE_ECC1=false; + private static boolean USE_ECCF=false; + private static boolean CORRECT_FROM_LEFT=true; + private static boolean CORRECT_FROM_RIGHT=true; + + + private static int LOW_BIN_DEPTH=10; + private static int HIGH_BIN_DEPTH=80; + + /** ECC_LIMIT */ + private static int MAX_ERRORS_TO_CORRECT=3; + private static int MAX_QUAL_TO_CORRECT=127; + + + public static boolean IGNORE_DUPLICATE_KMERS_COUNTUP=true; + + public static boolean CANONICAL=true; + public static boolean ZERO_BIN=false; + public static boolean FIX_SPIKES=false; + public static boolean KEEP_ALL=false; + public static boolean ordered=false; + public static boolean overwrite=true; + public static boolean prefilter=false; + public static boolean renameReads=false; + public static boolean DETERMINISTIC=true; + public static boolean COUNTUP=false; + public static boolean ANALYZE_TOPOLOGY=false; + /** Quality-trim left side of reads before further processing. 
*/ + public static boolean TRIM_LEFT=false; + /** Quality-trim right side of reads before further processing. */ + public static boolean TRIM_RIGHT=false; + public static int minlength=1; + /** Trim until 2 consecutive bases are encountered with at least this quality. */ + public static byte TRIM_QUALITY=5; + + public static boolean REMOVE_TEMP_FILES=true; + public static boolean USE_TMPDIR=true; + public static String TMPDIR=Shared.TMPDIR; + public static boolean useTmpdir(){return USE_TMPDIR && TMPDIR!=null;} + + private static HashSet temp_file_set=null; + + public static AtomicLong peaks=new AtomicLong(); + public static AtomicLong spikes=new AtomicLong(); + public static AtomicLong flats=new AtomicLong(); + public static AtomicLong valleys=new AtomicLong(); + public static AtomicLong slopes=new AtomicLong(); + + public static final byte FIXED_N_QUAL=20; + +} diff --git a/current/jgi/KmerSample.java b/current/jgi/KmerSample.java new file mode 100755 index 0000000..427e73f --- /dev/null +++ b/current/jgi/KmerSample.java @@ -0,0 +1,123 @@ +package jgi; + +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Oct 10, 2012 + * + */ +public class KmerSample { + + + public static int[] makeKmerSet(int K, String filename){ + + //Number of bits in a kmer + int kbits=2*K; + + //Number of possible kmers + long kmerSpace=(1L<'){ + //The line is name of a new contig/scaffold, so reset the kmer + kmer=0; + len=0; + }else{ + //Otherwise, generate kmers + + for(int i=0; i=K){ + //If the kmer is long enough, then add it to the array + + //The index in the array is the upper bits of the kmer. Each location in the array is 32 bits. + int index=(int)(kmer/32); + + //The bit within the word of the array is the lower 5 bits of the kmer + int bit=(int)(kmer%32); + + //A bitmask to set the correct bit in the array to 1. + int x=(1<1 ? 
split[1] : null; + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("genome") || a.equals("build")){ + Data.setGenome(Integer.parseInt(b)); + }else if(a.equals("maxdepth")){ + MAX_DEPTH=Integer.parseInt(b); + } + } +// assert(false) : "MAX_DEPTH="+MAX_DEPTH; + assert(Data.GENOME_BUILD>-1); + + calc(args[0], args[1]); + t.stop(); + System.out.println("Time: \t"+t); + } + + public static void calc(String fname1, String fname2){ + RTextInputStream rtis=new RTextInputStream(fname1, (fname2==null || fname2.equals("null") ? null : fname2), -1); + ConcurrentReadInputStream cris=new ConcurrentReadInputStream(rtis, -1); + + new Thread(cris).start(); + System.err.println("Started cris"); + boolean paired=cris.paired(); + System.err.println("Paired: "+paired); + + ArrayList pcov=new ArrayList(8); + pcov.add(new CoverageArray2(0,1000)); + ArrayList cov=new ArrayList(8); + cov.add(new CoverageArray2(0,1000)); + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + +// System.out.println("Processing read "+r.numericID); + + if(r!=null){ + if(r.sites!=null){ + + for(int x=0; x0); + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + System.err.println("Processed "+readsProcessed+" reads."); + System.err.println("Processed "+sitesProcessed+" sites."); + } + + int max=MAX_DEPTH; + long[] hist=new long[max+1]; + long[] phist=new long[max+1]; + double[] histF=new double[max+1]; + double[] phistF=new double[max+1]; + long[] histC=new long[max+1]; + long[] phistC=new long[max+1]; + double[] histCF=new double[max+1]; + double[] phistCF=new double[max+1]; + + for(int chrom=1; chrom<=Data.numChroms; chrom++){ + ChromosomeArray cha=Data.getChromosome(chrom); + if(pcov.size()>chrom){ + CoverageArray ca=pcov.get(chrom); + for(int i=0; i<=cha.maxIndex; i++){ + int x=ca.get(i); + byte b=cha.get(i); + if(b!='N'){ + phist[Tools.min(max, x)]++; + } + } + } + } + + for(int chrom=1; chrom<=Data.numChroms; chrom++){ + ChromosomeArray cha=Data.getChromosome(chrom); + if(cov.size()>chrom){ + CoverageArray ca=cov.get(chrom); + for(int i=0; i<=cha.maxIndex; i++){ + int x=ca.get(i); + byte b=cha.get(i); + if(b!='N'){ + hist[Tools.min(max, x)]++; + } + } + } + } + + phistC[max]=phist[max]; + histC[max]=hist[max]; + for(int i=max; i>0; i--){ + phistC[i-1]=phistC[i]+phist[i-1]; + histC[i-1]=histC[i]+hist[i-1]; + } + for(int i=0; i<=max; i++){ + phistCF[i]=phistC[i]*100d/phistC[0]; + phistF[i]=phist[i]*100d/phistC[0]; + histCF[i]=histC[i]*100d/histC[0]; + histF[i]=hist[i]*100d/histC[0]; + } + + System.out.println("\nTotal Coverage:"); + for(int i=0; i<=max; i++){ + System.out.println(i+"\t"+hist[i]+String.format("\t%.3f%%", histF[i])+"\t"+histC[i]+String.format("\t%.3f%%", histCF[i])); + } + + + System.out.println("\nPerfect Coverage:"); + for(int i=0; i<=max; i++){ + System.out.println(i+"\t"+phist[i]+String.format("\t%.3f%%", phistF[i])+"\t"+phistC[i]+String.format("\t%.3f%%", phistCF[i])); + } + + } + + 
private static boolean checkPerfection(int start, int stop, byte[] bases, ChromosomeArray cha, boolean rcomp, float f) { + + int noref=0; + if(rcomp){ + for(int i=0; i=f*bases.length; + } + + public static long readsProcessed=0; + public static long sitesProcessed=0; + public static boolean PROCESS_ALL_SITES=false; + public static int MAX_DEPTH=100; + +} diff --git a/current/jgi/MakeLengthHistogram.java b/current/jgi/MakeLengthHistogram.java new file mode 100755 index 0000000..3ac1d07 --- /dev/null +++ b/current/jgi/MakeLengthHistogram.java @@ -0,0 +1,194 @@ +package jgi; + +import java.io.File; +import java.util.ArrayList; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import dna.Data; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import fileIO.TextStreamWriter; + +import align2.ListNum; +import align2.Tools; + +/** + * @author Brian Bushnell + * @date Jul 16, 2012 + * + */ +public class MakeLengthHistogram { + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + + String in1=null, in2=null; + String out=null; + + Data.GENOME_BUILD=-1; + /* Parse arguments */ + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in") || a.equals("in1")){ + in1=b; + }else if(a.equals("in2")){ + in2=b; + }else if(a.equals("out")){ + out=b; + }else if(a.equals("max") || a.equals("maxlength")){ + MAX_LENGTH=Integer.parseInt(b); + }else if(a.startsWith("mult") || a.startsWith("div") || a.startsWith("bin")){ + MULT=Integer.parseInt(b); + }else if(i==0 && !arg.contains("=")){ + in1=arg; + }else if(i==1 && !arg.contains("=")){ + in2=arg; + }else if(i==3 && !arg.contains("=")){ + out=arg; + }else{ + throw new RuntimeException("Unknown argument: "+arg); + } + } + + MAX_LENGTH/=MULT; + + calc(in1, in2, out); + t.stop(); + System.err.println("Time: \t"+t); + } + + public static void calc(String in1, String in2, String out){ + if(fileIO.FileFormat.hasFastaExtension(in1)){ + FastaReadInputStream.SPLIT_READS=false; + FastaReadInputStream.MIN_READ_LEN=1; + }else{ + FASTQ.PARSE_CUSTOM=false; + } + long maxReads=-1; + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); +// if(verbose){System.err.println("Started cris");} + Thread th=new Thread(cris); + th.start(); + } + boolean paired=cris.paired(); +// if(verbose){System.err.println("Paired: "+paired);} + + + final int max=MAX_LENGTH; + long[] hist=new long[max+1]; + long[] bhist=new long[max+1]; + + int maxFound=0; + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + +// System.out.println("Processing read "+r.numericID); + + if(r!=null && r.bases!=null){ + readsProcessed++; + int x=r.bases.length; + int y=Tools.min(max, (x+MULT/2)/MULT); + hist[y]++; + bhist[y]+=x; + maxFound=Tools.max(maxFound, x); + } + + if(r.mate!=null){ + readsProcessed++; + Read r2=r.mate; + int x=r2.bases.length; + int y=Tools.min(max, (x+MULT/2)/MULT); + hist[y]++; + bhist[y]+=x; + maxFound=Tools.max(maxFound, x); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + System.err.println("Processed "+readsProcessed+" reads."); + } + + double[] histF=new double[max+1]; + long[] histC=new long[max+1]; + double[] histCF=new double[max+1]; + + double[] bhistF=new double[max+1]; + long[] bhistC=new long[max+1]; + double[] bhistCF=new double[max+1]; + + + histC[max]=hist[max]; + bhistC[max]=bhist[max]; + for(int i=max; i>0; i--){ + histC[i-1]=histC[i]+hist[i-1]; + bhistC[i-1]=bhistC[i]+bhist[i-1]; + } + for(int i=0; i<=max; i++){ + histCF[i]=histC[i]*100d/histC[0]; + histF[i]=hist[i]*100d/histC[0]; + bhistCF[i]=bhistC[i]*100d/bhistC[0]; + bhistF[i]=bhist[i]*100d/bhistC[0]; + } + + TextStreamWriter tsw=new TextStreamWriter(out==null ? 
"stdout" : out, true, false, false); + tsw.start(); + tsw.println("Reads: \t"+readsProcessed); + tsw.println("Bases: \t"+bhistC[0]); + tsw.println("Avg Length: \t"+String.format("%.1f",(bhistC[0]*1d/readsProcessed))); + tsw.println("Read Length Histogram:\n"); + tsw.println("Length\treads\t%reads\tcum reads\tcum %reads\tbases\t%bases\tcum bases\tcum %bases"); + for(int i=0; i<=max; i++){ + tsw.println((i*MULT)+"\t"+hist[i]+String.format("\t%.3f%%", histF[i])+"\t"+histC[i]+String.format("\t%.3f%%", histCF[i])+ + "\t"+bhist[i]+String.format("\t%.3f%%", bhistF[i])+"\t"+bhistC[i]+String.format("\t%.3f%%", bhistCF[i])); + if(i*MULT>=maxFound){break;} + } + tsw.poisonAndWait(); + } + + public static long readsProcessed=0; + public static int MAX_LENGTH=4000; + public static int MULT=10; + +} diff --git a/current/jgi/MateReadsMT.java b/current/jgi/MateReadsMT.java new file mode 100755 index 0000000..629322f --- /dev/null +++ b/current/jgi/MateReadsMT.java @@ -0,0 +1,1707 @@ +package jgi; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.List; + +import kmer.KCountArray; +import kmer.KmerCount7MT; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; +import stream.ReadStreamWriter; + +import dna.AminoAcid; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.FileFormat; +import fileIO.TextFile; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import align2.TrimRead; + +/** + * @author Brian Bushnell + * @date Aug 14, 2012 + * + */ +public class MateReadsMT { + + + public static void main(String[] args){ + MateReadsMT mr=new MateReadsMT(args); + mr.process(); + } + + public MateReadsMT(String[] args){ + System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + System.err.println("BBMerge 
version "+version); + + Timer ttotal=new Timer(); + ttotal.start(); + + in1_primary=(args[0].indexOf('=')>0 ? null : args[0]); + in2_primary=(in1_primary!=null && args.length>1 && args[1].indexOf('=')<0 ? args[1] : null); + if(in2_primary!=null && "null".equalsIgnoreCase(in2_primary)){in2_primary=null;} + + { + if(in1_primary!=null && !in1_primary.contains(",") && !in1_primary.startsWith("stdin.") && !in1_primary.equals("stdin")){ + File f=new File(in1_primary); + if(!f.exists() || !f.isFile()){ + in1_primary=null; +// throw new RuntimeException(in1+" does not exist."); + } + } + if(in2_primary!=null && !in2_primary.contains(",")){ + File f=new File(in2_primary); + if(!f.exists() || !f.isFile()){ + in2_primary=null; +// throw new RuntimeException(in2+" does not exist."); + }else if(in1_primary.equalsIgnoreCase(in2_primary)){ + throw new RuntimeException("Both input files are the same."); + } + } + } + + FASTQ.PARSE_CUSTOM=false; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + +// assert(false) : FASTQ.TEST_INTERLEAVED; + + int threads_=-1; + + boolean trimRight_=false; + boolean trimLeft_=false; + boolean setPigz=false; + byte trimq_=4; + int minReadLength_=0; + + for(int i=0; i1 ? 
split[1] : "true"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("in") || a.equals("in1")){ + in1_primary=b; + }else if(a.equals("in2")){ + in2_primary=b; + }else if(a.equals("k") || a.equals("kmer")){ + k_G=Integer.parseInt(b); + }else if(a.equals("minoverlappingbases") || a.equals("minoverlapbases")){ + MIN_OVERLAPPING_BASES=Integer.parseInt(b); + }else if(a.equals("minoverlap") || a.equals("minoverlappingkmers") || a.equals("minoverlapkmers")){ + MIN_OVERLAPPING_KMERS=Integer.parseInt(b); + }else if(a.equals("minoverlappingbases0") || a.equals("minoverlapbases0")){ + MIN_OVERLAPPING_BASES_0=Integer.parseInt(b); + }else if(a.equals("minoverlap0") || a.equals("minoverlappingkmers0") || a.equals("minoverlapkmers0")){ + MIN_OVERLAPPING_KMERS_0=Integer.parseInt(b); + }else if(a.equals("minoverlapinsert")){ + MIN_OVERLAP_INSERT=Integer.parseInt(b); + }else if(a.equals("badlimit")){ + DEFAULT_BADLIMIT=Integer.parseInt(b); + }else if(a.startsWith("matrixbits")){ + int matrixbits=Integer.parseInt(b); + assert(matrixbits<63); + totalcells_G=1L<0 && hashes<25); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits_G=Integer.parseInt(b); + int cmax=(1<=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("trimright") || a.equals("qtrimright")){ + trimRight_=Tools.parseBoolean(b); + }else if(a.equals("trimleft") || a.equals("qtrimleft")){ + trimLeft_=Tools.parseBoolean(b); + }else if(a.equals("trimq") || a.equals("trimquality")){ + trimq_=Byte.parseByte(b); + }else if(a.equals("q102matrix") || a.equals("q102m")){ + CalcTrueQuality.q102matrix=b; + }else if(a.equals("qbpmatrix") || a.equals("bqpm")){ + CalcTrueQuality.qbpmatrix=b; + }else if(a.equals("adjustquality") || a.equals("adjq")){ + TrimRead.ADJUST_QUALITY=Tools.parseBoolean(b); + }else 
if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){ + minReadLength_=Integer.parseInt(b); + }else if(a.equals("mi") || a.equals("minins") || a.equals("mininsert")){ + minInsert=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + setPigz=true; + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(TrimRead.ADJUST_QUALITY){CalcTrueQuality.initializeMatrices();} + + 
if(in2_primary==null && in1_primary!=null && in1_primary.contains("#") && !new File(in1_primary).exists()){ + in2_primary=in1_primary.replaceFirst("#", "2"); + in1_primary=in1_primary.replaceFirst("#", "1"); + } + + if(!setPigz && gap_G==null){ + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + } + + trimRight=trimRight_; + trimLeft=trimLeft_; + trimq=trimq_; + minReadLength=minReadLength_; + qtrim=trimLeft_||trimRight_; + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + if(in2_primary!=null){ + assert(!in1_primary.equalsIgnoreCase(in2_primary)); + FASTQ.TEST_INTERLEAVED=false; + FASTQ.FORCE_INTERLEAVED=false; + }else{ + FASTQ.TEST_INTERLEAVED=true; + FASTQ.FORCE_INTERLEAVED=true; + } + +// assert(false) : MATE_BY_OVERLAP; + + if(FILL_MIDDLE_INTERMEDIATE){ + if(!WRITE_INTERMEDIATE_JOINED){System.err.println("WRITE_INTERMEDIATE_JOINED forced to true.");} + WRITE_INTERMEDIATE_JOINED=true; + } + if(WRITE_INTERMEDIATE_JOINED){ + if(!join_G){System.err.println("Final output forced to be joined reads.");} + join_G=true; + //Ultimately I could re-read the initial files, so this is not truly needed. 
+ } + } + + void process(){ + Timer ttotal=new Timer(); + ttotal.start(); +// assert(!FASTQ.PARSE_CUSTOM); + + final int hwthr=Shared.THREADS; + if(THREADS<1){THREADS=hwthr;} + System.err.println("Detected "+Runtime.getRuntime().availableProcessors()+" hardware threads; using "+THREADS+" for main process."); + long memory=(Runtime.getRuntime().maxMemory()); + System.err.println("Detected "+(memory/(1L<<20))+" MB available memory."); +// System.err.println("PARSE_CUSTOM="+FASTQ.PARSE_CUSTOM); + + if(gap_G!=null){ + for(int[] g : gap_G){ + maxtables=Tools.max(maxtables, g.length); + for(int g2 : g){assert(g2>0) : "TODO: Ungapped kmers do not currently work. Please use gap lengths of >0.";} + } + } + if(maxtables<1 && !USE_MAPPING && !MATE_BY_OVERLAP){ + throw new RuntimeException("No gap sizes have been specified, so there is no work to do."); + } + + if(passes_G>1){totalcells_G*=2;} + + if(auto && maxtables>0 && totalcells_G<0){ + final long usable=(long)Tools.max(((memory-(256000000))*.7), memory*0.4); + long mem=usable; + totalcells_G=(mem*8)/cbits_G; + +// long tablebytes=((1L<(1L<<(2*k_G))){cells=(1L<<(2*k_G));} + middleTable=KmerCount7MT.makeKca(in1, in2, extra_G, MIDDLE_TABLE_K, cbits_G, 0, cells, hashes+1, MIN_QUALITY, true, tableReads_G, 1, 4, 2, 2, null); + middleTable.shutdown(); + System.err.println("MiddleTable: \tgap = "+middleTable.gap+" \tmem = "+middleTable.mem()+" \tused = "+String.format("%.3f%%",middleTable.usedFraction()*100)); + } + + final int cmax=(1<MAX_HITS_FOR_BAD && MIN_HITS_FOR_GOOD<=cmax && MAX_HITS_FOR_BAD>0); + + FastaReadInputStream.SPLIT_READS=false; + + + if(ecc){ + System.err.println("\nDoing error correction."); + String x=tempfile.replaceFirst("#", "_ecc_#"); + ErrorCorrectMT.main(new String[] {in1, in2, "cbits="+/*cbits*/2, "auto", "reads="+tableReads_G, "tablereads="+tableReads_G, + "passes=1", "hashes="+hashes, "k="+/*k*/29, "overwrite="+overwrite, "out="+x, "forceinterleaved="+FASTQ.FORCE_INTERLEAVED, + 
"testinterleaved=false", "dontoutputbadpairs="+ecctossbad, "dontoutputbadreads="+ecctossbad, "threads="+THREADS, "parsecustom="+FASTQ.PARSE_CUSTOM}); + in1=x.replaceFirst("#", "1"); + in2=x.replaceFirst("#", "2"); + System.err.println("Finished error correction.\n"); + } + + final int phases=(gap_G==null ? 1 : gap_G.length); + + KmerCount7MT.PREJOIN=false; + + String a1=in1, a2=in2; + + int oldzip=ReadWrite.ZIPLEVEL; + for(int phase=0; phase0){System.err.println("Too Short: \t"+tooShortCountTotal+String.format((tooShortCountTotal<10000 ? " " : " ")+"\t%.3f%%", tooShortCountTotal*div));} + System.err.println("Avg Insert: \t\t"+String.format("%.1f", (insertSumCorrectTotal+insertSumIncorrectTotal)*1d/(correctCountTotal+incorrectCountTotal))); + if(FASTQ.PARSE_CUSTOM){ + System.err.println("Avg Insert Correct: \t\t"+String.format("%.1f", (insertSumCorrectTotal)*1d/(correctCountTotal))); + System.err.println("Avg Insert Incorrect:\t\t"+String.format("%.1f", (insertSumIncorrectTotal)*1d/(incorrectCountTotal))); + } + + System.err.println("\nPhase "+(phases)+" statistics."); + System.err.println("Insert range: \t"+insertMinTotal+" - "+insertMaxTotal); + System.err.println("90th percentile: \t"+Tools.percentile(histTotal, .9)); + System.err.println("50th percentile: \t"+Tools.percentile(histTotal, .5)); + System.err.println("10th percentile: \t"+Tools.percentile(histTotal, .1)); + } + + public static void runPhase(int[] gap, String in1, String in2, List extra, String outinsert, String outgood, String outbad, + int cbits, int k, long totalcells, int multihash, int passes, boolean join, long maxReads, long tableReads, boolean perfectonly, KCountArray middleTable){ + + assert(((USE_MAPPING || MATE_BY_OVERLAP) && MIN_VOTES<2) || + (MIN_VOTES>0 && MIN_VOTES<=gap.length)) : "minVotes is set too high. 
Should be at most the number of (overlapping) gaps."; + + Timer thash=new Timer(), talign=new Timer(); + + assert(totalcells>1); + if(middleTable!=null){totalcells=totalcells-middleTable.cells;} + long cells=totalcells/(gap==null || gap.length==0 ? 1 : gap.length); + if(k<32 && cells>1L<<(2*k)){cells=1L<<(2*k);} + + RTextOutputStream3 rosgood=null; + RTextOutputStream3 rosbad=null; + RTextOutputStream3 rosinsert=null; + + if(outgood!=null){ + final String out1, out2; + +// assert(outgood.contains("#") || sam || fq) : outgood; + if(outgood.contains("#")){ + out1=outgood.replaceFirst("#", "1"); + out2=outgood.replaceFirst("#", "2"); + }else{ + out1=outgood; + out2=null; + if(!join){System.err.println("Writing joinable reads interleaved.");} + else{System.err.println("Writing joinable reads joined.");} + } + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + + final FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, true); + final FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, true); + assert(!ff1.samOrBam()) : "Sam files need reference info for the header."; + + final int buff=Tools.max(16, 2*THREADS); + rosgood=new RTextOutputStream3(ff1, ff2, null, null, buff, null, false); + rosgood.start(); + } + + if(outbad!=null){ + final String out1, out2; + +// assert(outbad.contains("#") || sam || fq) : outbad; + if(outbad.contains("#")){ + out1=outbad.replaceFirst("#", "1"); + out2=outbad.replaceFirst("#", "2"); + }else{ + out1=outbad; + out2=null; + System.err.println("Writing unjoinable reads interleaved."); + } + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + + final FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, true); + final FileFormat 
ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, true); + assert(!ff1.samOrBam()) : "Sam files need reference info for the header."; + + final int buff=Tools.max(16, 2*THREADS); + rosbad=new RTextOutputStream3(ff1, ff2, null, null, buff, null, false); + rosbad.start(); + } + + if(outinsert!=null){ + final int buff=Tools.max(16, 2*THREADS); + boolean sam=false, bam=false; + boolean fq=false; + boolean info=true; + + String out1=outinsert.replaceFirst("#", "1"); + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + + ReadStreamWriter.HEADER=header(); + rosinsert=new RTextOutputStream3(out1, null, buff, true, sam, bam, fq, false, info, overwrite, false); + rosinsert.start(); + } + + + if(rosgood!=null || rosbad!=null || rosinsert!=null){ + System.err.println("Started output threads."); + } + + thash.start(); + + KCountArray[] kca=new KCountArray[gap==null ? 0 : gap.length]; + for(int i=0; i0){ + thash.stop(); + System.err.println("Hash time: "+thash); + } + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + boolean paired=cris.paired(); + assert(paired); + if(verbose){System.err.println("Paired: "+paired);} + + talign.start(); + + + MateThread[] pta=new MateThread[THREADS]; + for(int i=0; i0){ + + final int overlap=Tools.min(insert, a.bases.length+b.bases.length-insert); + int matches=0; + int mismatches=0; + boolean ok=true; + if(overlap>=4){ + mismatches=countMismatches(a, b, insert, bestMismatches+10); + matches=overlap-mismatches; + ok=(mismatches<3 || mismatches*2=MIN_OVERLAPPING_KMERS && bad<=bestBad){ + if(badbestGood || (good==bestGood 
&& sum>bestSum)){ + ambig=(bestBad==0); + bestScore=score; + bestInsert=insert; + bestGood=good; + bestBad=bad; + bestSum=sum; + if(ambig){break;} + }else if(good==bestGood && sum==bestSum){ + assert(bad==bestBad && sum==bestSum) : bad+"~"+bestBad+", "+good+"~"+bestGood+", "+sum+"~"+bestSum; + ambig=true; + } + } + } + } + + rvector[0]=bestScore; + rvector[1]=bestGood; + rvector[2]=bestBad; + rvector[3]=bestSum; + rvector[4]=(ambig ? 1 : 0); + + return bestInsert; + } + + + public static int mateRead(Read a, Read b, int k1, int k2, long mask1, long mask2, KCountArray kca[], int[] rvector) { + +// verbose=a.numericID>145; + + if(USE_MAPPING){ + rvector[0]=100; + rvector[1]=20; + rvector[2]=0; + rvector[3]=20; //What is this? + rvector[4]=0; + rvector[5]=Tools.max(1, MIN_VOTES); + return a.insertSizeMapped(ignoreMappingStrand); + } + if(rvector==null){rvector=new int[6];} + + if(a.obj==null){a.obj=hash(a, k1, mask1, 2*k2);} + if(b.obj==null){b.obj=hash(b, k2, mask2, 0);} + long[] half1=(long[])a.obj; + long[] half2=(long[])b.obj; + if(half1==null || half2==null){return -1;} + + int bestInsert=-1; + int bestScore=-1; //This serves as the threshold for the minimum score to report. + int bestGood=-1; + int bestBad=DEFAULT_BADLIMIT; + + int minGap=kca[0].gap; + int maxGap=kca[0].gap; + + final int[] pivot=new int[kca.length]; + final int[] minInsert=new int[kca.length]; + final int[] maxInsert=new int[kca.length]; + + for(int i=0; i=minInsert[g] && insert<=maxInsert[g]){ + if(verbose){System.err.println("Testing gap "+kca[g].gap);} + int x=scoreIP(half1, half2, insert, pivot[g], kca[g], rvector, bestBad); + final int good0=rvector[1], bad0=rvector[2]; + if(verbose){System.err.println("score="+score+", rvector="+Arrays.toString(rvector));} + if((good0>MIN_OVERLAPPING_KMERS) && (bad0>bestBad || good0+bad0>=ACCEL_FACTOR)){ +// score=votes==0 ? x : Tools.min(score, x); +// if(verbose){System.err.println("new score="+score);} +// good=votes==0 ? 
good0 : Tools.min(good0, good); +// bad=votes==0 ? bad0 : Tools.max(bad0, bad); + + score=votes==0 ? x : Tools.max(score, x); + if(verbose){System.err.println("new score="+score);} + good=votes==0 ? good0 : Tools.max(good0, good); + bad=votes==0 ? bad0 : Tools.min(bad0, bad); + // sum=votes==0 ? rvector[3] : Tools.max(rvector[3], sum); + votes++; + if(bad>bestBad || score<=0){break;} + } + } + + } + if(score>0/* && votes>=MIN_VOTES*/){ + + final int overlap=Tools.min(insert, a.bases.length+b.bases.length-insert); + int matches=0; + int mismatches=0; + boolean ok=true; + if(overlap>=minOverlap){ + mismatches=countMismatches(a, b, insert, bestMismatches+10); + matches=overlap-mismatches; + matchScore=matches-mismatches*4; + ok=(mismatches<3 || mismatches*2=MIN_OVERLAPPING_KMERS && bad<=bestBad){ + + + ambig=true; + boolean setBest=false; + boolean quit=(bad==0 && bestBad==0); + + if(badbestVotes){ + setBest=true; + }else if(votes==bestVotes && matches>=bestMatches && mismatches<=bestMismatches && (matches>bestMatches || mismatches=bestMismatches){ambig=true;} + setBest=true; + }else if(matchScore>=bestMatchScore && (good>bestGood /*|| (good==bestGood && sum>bestSum)*/)){ + setBest=true; + } + + if(setBest){ + bestScore=score; + bestInsert=insert; + bestGood=good; + bestBad=bad; + bestVotes=votes; + if(overlap>=minOverlap){ + bestMismatches=mismatches; + bestMatches=matches; + bestMatchScore=matchScore; + } + if(votes0 && bad331>0 && good332>0 && bad332>0){assert(false);} +// } + + rvector[0]=bestScore; + rvector[1]=bestGood; + rvector[2]=bestBad; +// rvector[3]=bestSum; + rvector[4]=(ambig ? 1 : 0); + rvector[5]=bestVotes; + + return bestInsert; + } + + + public static int mateByOverlap(Read a, Read b, int[] rvector, final int minOverlap0, final int minOverlap) { + if(USE_MAPPING){ + rvector[0]=100; + rvector[1]=20; + rvector[2]=0; + rvector[3]=20; //What is this? 
+ rvector[4]=0; + rvector[5]=Tools.max(1, MIN_VOTES); + return a.insertSizeMapped(ignoreMappingStrand); + } + if(rvector==null){rvector=new int[6];} + final byte[] abases=a.bases, bbases=b.bases, aqual=a.quality, bqual=b.quality; + + int bestOverlap=-1; +// int bestScore=-1; //This serves as the threshold for the minimum score to report. + int bestGood=-1; + int bestBad=DEFAULT_BADLIMIT_FOR_BASE_MATCHING; + final int margin=2; + + boolean ambig=false; + final int maxOverlap=abases.length+bbases.length-Tools.max(minOverlap, MIN_OVERLAP_INSERT); +// assert(false) : minOverlap+", "+maxOverlap; +// System.err.print("\nm"); + + for(int overlap=Tools.max(minOverlap0, 0); overlap=0 && j<=abases.length && i>=0 && i<=bbases.length) : "\njstart="+jstart+", j="+j+", istart="+istart+", i="+i+" \n"+ + "overlap="+overlap+", a.length="+a.bases.length+", b.length="+b.bases.length+", bad="+bad+", badlim="+badlim+", good="+good+", tested="+tested; + byte ca=abases[j], cb=bbases[i]; + if(ca=='N' || cb=='N' || (aqual!=null && aqual[j]minOverlap){//Candidate + if(bad<=bestBad){ + +// System.err.print("b"); + if(badbestGood)){//Current winner + if(bad>bestBad-margin){ambig=true;} + bestOverlap=overlap; + bestBad=bad; + bestGood=good; +// assert(abases.length+bbases.length-bestOverlap<299) : +// ((abases.length+bbases.length-bestOverlap)+", "+ambig+", "+overlap+", "+good+", "+bad+", "+tested+", "+bestGood+", "+bestBad+", "+a.insertSize()); + }else if(bad==bestBad){ + ambig=true; + } + +// System.err.print("c"); + if(ambig && bestBad299){ +// System.err.println((abases.length+bbases.length-bestOverlap)+", "+ambig+", "+rvector[0]+", "+bestGood+", "+bestBad+", "+a.insertSize()); +// } + + } +// System.err.println("i"); + + rvector[0]=((bestBad==0 ? 8 : 4)*bestGood-6*bestBad); + rvector[1]=bestGood; + rvector[2]=bestBad; +// rvector[3]=bestSum; + rvector[4]=(ambig ? 
1 : 0); + rvector[5]=0; + +// if(abases.length+bbases.length-bestOverlap>299){ +// System.err.println((abases.length+bbases.length-bestOverlap)+", "+ambig+", "+rvector[0]+", "+bestGood+", "+bestBad+", "+a.insertSize()); +// } + +// assert(bestOverlap>-1); + return (bestOverlap<0 ? -1 : abases.length+bbases.length-bestOverlap); + } + + + public static int countMismatches(Read a, Read b, int insert, int maxMismatches){ + final int lengthSum=a.bases.length+b.bases.length; + if(insert>=lengthSum){return 0;} + final int overlap=Tools.min(insert, lengthSum-insert); + + int mismatches=0; + + + int start1=(insert>a.bases.length ? a.bases.length-overlap : 0); + int start2=(insert>=b.bases.length ? 0 : b.bases.length-overlap); +// System.err.println(insert+", "+overlap+", "+start1+", "+start2); + + while(start1<0 || start2<0){start1++; start2++;} + for(int i=start1, j=start2; imaxMismatches){break;} + } + } + } + return mismatches; + } + + public static int scoreIP(long[] half1, long[] half2, int insert, int pivot, KCountArray kca, int[] rvector, final int badlimit){ + int start1, start2; + if(insert<=pivot){ //Short mode; start from base 0 of read a + start1=0; + start2=pivot-insert; + }else{ //Long mode; start from base 0 of read b + start1=insert-pivot; + start2=0; + } + if(verbose){ + System.err.println("ScoreIP. 
Insert: "+insert+", gap="+kca.gap+", badlimit="+badlimit); + } + return score(half1, half2, start1, start2, kca, rvector, badlimit); + } + + public static int score(long[] half1, long[] half2, int start1, int start2, KCountArray kca, int[] rvector, final int badlimit){ + int good=0; + int bad=0; + int sum=0; + final int len=Tools.min(half1.length-start1, half2.length-start2); +// final int incr=Tools.min(len/8, 8); //Accelerates scoring by a factor of 8 for a preview + final int incr=Tools.min(len/ACCEL_FACTOR, ACCEL_FACTOR); + + if(incr>1){ + for(int i=start1, j=start2; i=MIN_HITS_FOR_GOOD){good++;} + else if(x<=MAX_HITS_FOR_BAD){ + bad++; + if(bad>badlimit){break;} + } +// if(verbose){System.err.print("("+Long.toHexString(half1[i])+","+Long.toHexString(half2[j])+","+Long.toHexString(key)+","+x+")");} + } + } + if(verbose){ + System.err.println("\n(incr="+incr+") Good: "+good+" \tBad: "+bad); + } + if(bad>good || bad>badlimit){ + rvector[0]=incr*((bad==0 ? 8 : 4)*good-6*bad); + rvector[1]=incr*good; + rvector[2]=incr*bad; + rvector[3]=incr*sum; + return rvector[0]; + }else{ + good=0; + bad=0; + sum=0; + } + } + + for(int i=start1, j=start2; i=MIN_HITS_FOR_GOOD){good++;} + else if(x<=MAX_HITS_FOR_BAD){ + bad++; + if(bad>badlimit){break;} + } +// if(verbose){System.err.print("("+Long.toHexString(half1[i])+","+Long.toHexString(half2[j])+","+Long.toHexString(key)+","+x+")");} + } + } + if(verbose){ + System.err.println("\nGood: "+good+" \tBad: "+bad+" \tSum: "+sum); + } + rvector[0]=((bad==0 ? 8 : 4)*good-6*bad); + rvector[1]=good; + rvector[2]=bad; + rvector[3]=sum; + return rvector[0]; + } + + public static void toHex(long[] array){ + for(int i=0; i=k){ + half[j]=(kmer<=1 && k1+k2<20) : k1+", "+k2+", "+(k1+k2); + assert(USE_MAPPING || MATE_BY_OVERLAP || kca[0].gap>=0); + final int kbits1=2*k1; + final int kbits2=2*k2; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(r.mate!=null || WRITE_INTERMEDIATE_JOINED); + } + + + while(reads!=null && reads.size()>0){ + + ArrayList listg=(rosgood==null ? null : new ArrayList()); + ArrayList listb=(rosbad==null ? null : new ArrayList()); + ArrayList listi=(rosinsert==null ? null : new ArrayList()); + + for(Read r1 : reads){ + final Read r2=r1.mate; + + TrimRead tr1=null, tr2=null; + + boolean remove=false; + if(qtrim){ + if(untrim){ + if(r1!=null){ + tr1=TrimRead.trim(r1, trimLeft, trimRight, trimq, 1); + int x=(tr1==null ? 0 : tr1.leftTrimmed+tr1.rightTrimmed); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + if(r2!=null){ + tr2=TrimRead.trim(r2, trimLeft, trimRight, trimq, 1); + int x=(tr2==null ? 0 : tr2.leftTrimmed+tr2.rightTrimmed); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + }else{ + if(r1!=null){ + int x=TrimRead.trimFast(r1, trimLeft, trimRight, trimq, 1); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + if(r2!=null){ + int x=TrimRead.trimFast(r2, trimLeft, trimRight, trimq, 1); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + } + } + + if(minReadLength>0 && !remove){ + int rlen=(r1==null || r1.bases==null ? 0 : r1.bases.length); + int rlen2=(r2==null || r2.bases==null ? 
0 : r2.bases.length); + if(rlen0 && bestBad<1){sb.append('P');} //Perfect + else{sb.append('I');}//Imperfect + + if(bestInsert>0){ + sb.append("\t"+bestGood+"\t"+bestBad+"\t"+bestScore+"\t"+bestVotes); + } + r1.obj=sb; + listi.add(r1); + } + } + + // if(bestInsert!=trueSize && bestInsert>0 && !ambig){ + // System.err.println("\nIncorrect answer for read "+r.numericID+"; true insert = "+trueSize+", called at "+bestInsert); + //// verbose=true; + // for(int i=0; i<300; i++){ + // int x=testRead(r, r.mate, k1, k2, mask1, mask2, kca, rvector, i); + // if((x>0 && rvector[2]<=bestBad) || i==trueSize || i==bestInsert){ + // verbose=true; + // testRead(r, r.mate, k1, k2, mask1, mask2, kca, rvector, i); + // verbose=false; + // } + // } + //// verbose=false; + // } + + // assert(r.numericID<200); + // assert(false); + if(r2!=null){r2.reverseComplement();} + } + } + + if(rosgood!=null){rosgood.add(listg, ln.id);} + if(rosbad!=null){rosbad.add(listb, ln.id);} + if(rosinsert!=null){rosinsert.add(listi, ln.id);} + + // System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + // System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + // System.err.println("reads: "+(reads==null ? 
"null" : reads.size())); + } + cris.returnList(ln, ln.list.isEmpty()); + } + + int[] hist=new int[1000]; + + long readsProcessed=0; + long matedCount=0; + long correctCount=0; + long ambiguousCount=0; + long tooShortCount=0; + long incorrectCount=0; + long noSolutionCount=0; + long insertSumCorrect=0; + long insertSumIncorrect=0; + int insertMax=0; + int insertMin=999999999; + + long basesTrimmedT=0; + long readsTrimmedT=0; + + private final ConcurrentReadStreamInterface cris; + private final RTextOutputStream3 rosgood; + private final RTextOutputStream3 rosbad; + private final RTextOutputStream3 rosinsert; + private final int k; + private final KCountArray[] kca; + private final boolean joinReads; + private final boolean joinperfectonly; + private final KCountArray middleTable; + } + + private String in1_primary; + private String in2_primary; + + private String outgood_G=null; + private String outbad_G=null; + private String outinsert_G=null; + private String outhist=null; + private String outhist2=null; + private String outhist3=null; + + private long maxReads_G=-1; + private long tableReads_G=-1; + private int k_G=K_DEFAULT; + private int[][] gap_G=null; + private int cbits_G=2; + private long totalcells_G=-1; + private int hashes=2; + private int passes_G=1; + private String tempfile="mateReadsTemp#.txt.gz"; + private boolean join_G=true; + private int maxtables=0; + private boolean ecc=false; + private boolean ecctossbad=false; + private boolean auto=true; + + private byte qin=-1; + private byte qout=-1; + + private List extra_G=null; + + static boolean errorState=false; + + static boolean trimRight=false; + static boolean trimLeft=false; + static boolean untrim=false; + static byte trimq=6; + static int minReadLength=0; + static int minInsert=0; + static boolean qtrim=false; + static boolean TRIM_ON_OVERLAP_FAILURE=true; + + + static int[] histTotal=new int[1000]; + static int bin=1; + + static long readsProcessedTotal=0; + static long matedCountTotal=0; + 
static long correctCountTotal=0; + static long ambiguousCountTotal=0; + static long tooShortCountTotal=0; + static long incorrectCountTotal=0; + static long noSolutionCountTotal=0; + static long insertSumCorrectTotal=0; + static long insertSumIncorrectTotal=0; + static long basesTrimmedTotal=0; + static long readsTrimmedTotal=0; + static int insertMinTotal=999999999; + static int insertMaxTotal=0; + + public static int MIN_OVERLAPPING_KMERS=10; + public static int MIN_OVERLAPPING_KMERS_0=4; + public static int MIN_OVERLAPPING_BASES=12; + public static int MIN_OVERLAPPING_BASES_0=8; + public static int MIN_OVERLAP_INSERT=16; + public static int ACCEL_DIV=10; //Acceleration is actually proportional to inverse of this number. + public static int ACCEL_FACTOR=ACCEL_DIV; //Max distance between samples + public static int DEFAULT_BADLIMIT=25; + public static int DEFAULT_BADLIMIT_FOR_BASE_MATCHING=3; + public static int DEFAULT_MISMATCHLIMIT=6; + public static int MIN_HITS_FOR_GOOD=3; + public static int MAX_HITS_FOR_BAD=1; + public static int MIN_VOTES=1; + public static int K_DEFAULT=29; + public static int MIDDLE_TABLE_K=31; + public static byte MIN_QUALITY=8; + public static byte MIN_QUALITY_FOR_OVERLAP=7; + /** Skip alignment and calculate insert from mapping info */ + public static boolean USE_MAPPING=false; + public static boolean MATE_BY_OVERLAP=true; + public static boolean SKIP_MATED_READS=false; + public static boolean OUTPUT_FAILED=true; + public static boolean MIX_BAD_AND_GOOD=false; + public static boolean WRITE_INTERMEDIATE_JOINED=false; + public static boolean FILL_MIDDLE_INTERMEDIATE=false; + public static boolean FILL_MIDDLE_FINAL=false; + public static boolean overwrite=true; + public static boolean verbose=false; + public static boolean ignoreMappingStrand=false; + + public static int THREADS=-1; + public static float version=1.4f; + +} diff --git a/current/jgi/MatrixMult.java b/current/jgi/MatrixMult.java new file mode 100755 index 0000000..0236ef7 
--- /dev/null +++ b/current/jgi/MatrixMult.java @@ -0,0 +1,224 @@ +package jgi; + +import java.util.Arrays; +import java.util.Random; + +/** + * @author Brian Bushnell + * @date Nov 2, 2012 + * + */ +public final class MatrixMult { + + public static void main(String[] args){ + + Timer t=new Timer(); + + //Grab arguments + int N=Integer.parseInt(args[0]); + int threads=(args.length>1 ? Integer.parseInt(args[1]) : 1); + int iters=(args.length>2 ? Integer.parseInt(args[2]) : 1); + + //Initialize arrays + int[][] A=new int[N][N]; + int[][] B=new int[N][N]; + int[][] C=new int[N][N]; + + Random randy=new Random(0); + int max=100; + + //Fill random arrays + for(int i=0; i=A.length); + threads=(A.length+chunk-1)/chunk; + + if(!printed){System.out.println("Using "+threads+" threads, chunksize "+chunk);} + printed=true; + + //Create workers + MultThread[] workers=new MultThread[threads]; + for(int i=0; iy ? x : y;} + + private static class Timer { + + public Timer(){} + + public long start(){ + time1=time2=System.nanoTime(); + elapsed=0; + return time1; + } + + public long stop(){ + time2=System.nanoTime(); + elapsed=time2-time1; + return time2; + } + + public String toString(){ + return String.format("%.3f seconds.", elapsed/1000000000d); + } + + public long time1; + public long time2; + /** in nanos */ + public long elapsed; + + } + + + + /* ~~~~~~~~~~~~~~~~~~~~~~ Fields ~~~~~~~~~~~~~~~~~~~~~~ */ + + private static boolean printed=false; + private static boolean verbose=false; + +} diff --git a/current/jgi/MatrixMultFloat.java b/current/jgi/MatrixMultFloat.java new file mode 100755 index 0000000..88b2787 --- /dev/null +++ b/current/jgi/MatrixMultFloat.java @@ -0,0 +1,224 @@ +package jgi; + +import java.util.Arrays; +import java.util.Random; + +/** + * @author Brian Bushnell + * @date Nov 2, 2012 + * + */ +public final class MatrixMultFloat { + + public static void main(String[] args){ + + Timer t=new Timer(); + + //Grab arguments + int N=Integer.parseInt(args[0]); + int 
threads=(args.length>1 ? Integer.parseInt(args[1]) : 1); + int iters=(args.length>2 ? Integer.parseInt(args[2]) : 1); + + //Initialize arrays + float[][] A=new float[N][N]; + float[][] B=new float[N][N]; + float[][] C=new float[N][N]; + + Random randy=new Random(0); + int max=100; + + //Fill random arrays + for(int i=0; i=A.length); + threads=(A.length+chunk-1)/chunk; + + if(!printed){System.out.println("Using "+threads+" threads, chunksize "+chunk);} + printed=true; + + //Create workers + MultThread[] workers=new MultThread[threads]; + for(int i=0; iy ? x : y;} + + private static class Timer { + + public Timer(){} + + public long start(){ + time1=time2=System.nanoTime(); + elapsed=0; + return time1; + } + + public long stop(){ + time2=System.nanoTime(); + elapsed=time2-time1; + return time2; + } + + public String toString(){ + return String.format("%.3f seconds.", elapsed/1000000000d); + } + + public long time1; + public long time2; + /** in nanos */ + public long elapsed; + + } + + + + /* ~~~~~~~~~~~~~~~~~~~~~~ Fields ~~~~~~~~~~~~~~~~~~~~~~ */ + + private static boolean printed=false; + private static boolean verbose=false; + +} diff --git a/current/jgi/MergeReadHeaders.java b/current/jgi/MergeReadHeaders.java new file mode 100755 index 0000000..eb9bdaf --- /dev/null +++ b/current/jgi/MergeReadHeaders.java @@ -0,0 +1,419 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import dna.Data; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ByteFile1; +import fileIO.ByteFile2; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Feb 7, 2014 
+ * + */ +public class MergeReadHeaders { + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + MergeReadHeaders mgr=new MergeReadHeaders(args); + mgr.process(t); + } + + public MergeReadHeaders(String[] args){ + if(args==null || args.length==0){ + printOptions(); + System.exit(0); + } + + for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} + outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + boolean setInterleaved=false; //Whether it was explicitly set. + + FastaReadInputStream.SPLIT_READS=false; + stream.FastaReadInputStream.MIN_READ_LEN=1; + Shared.READ_BUFFER_LENGTH=Tools.min(200, Shared.READ_BUFFER_LENGTH); + Shared.READ_BUFFER_NUM_BUFFERS=Tools.min(8, Shared.READ_BUFFER_NUM_BUFFERS); + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + + for(int i=0; i1 ? split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null") || a.equals(in2)){ + // do nothing + }else if(a.equals("passes")){ + assert(false) : "'passes' is disabled."; +// passes=Integer.parseInt(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + ByteFile2.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + ConcurrentGenericReadInputStream.verbose=verbose; +// align2.FastaReadInputStream2.verbose=verbose; + stream.FastqReadInputStream.verbose=verbose; + ReadWrite.verbose=verbose; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + 
ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("t") || a.equals("threads")){ + Shared.THREADS=Tools.max(Integer.parseInt(b), 1); + }else if(a.equals("build") || a.equals("genome")){ + Data.setGenome(Integer.parseInt(b)); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("header")){ + headerFile=b; + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){ + out1=b; + }else if(a.equals("out2") || a.equals("output2")){ + out2=b; + }else if(a.equals("extin")){ + extin=b; + }else if(a.equals("extout")){ + extout=b; + }else if(a.equals("trd") || a.equals("trc") || a.equals("trimreaddescription")){ + Shared.TRIM_READ_COMMENTS=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("fastareadlen") || a.equals("fastareadlength")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.equals("fastaminread") || a.equals("fastaminlen") || a.equals("fastaminlength")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + 
FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + setInterleaved=true; + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + setInterleaved=true; + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + setInterleaved=true; + } + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else 
if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + in1=arg; + if(arg.indexOf('#')>-1 && !new File(arg).exists()){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else if(out1==null && i==1 && !arg.contains("=")){ + out1=arg; + }else{ + System.err.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + + if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ + in2=in1.replace("#", "2"); + in1=in1.replace("#", "1"); + } + if(out1!=null && out2==null && out1.indexOf('#')>-1){ + out2=out1.replace("#", "2"); + out1=out1.replace("#", "1"); + } + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + assert(FastaReadInputStream.settingsOK()); + + if(in1==null || headerFile==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file and a header file are required."); + } + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.THREADS>2){ +// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;} + ByteFile.FORCE_MODE_BF2=true; + } + + if(out1==null){ + if(out2!=null){ + printOptions(); + throw new RuntimeException("Error - cannot define out2 without defining out1."); + } + //out1="stdout"; + System.err.println("Warning: output destination not set; producing no output. To print to standard out, set 'out=stdout.fq'"); + } + + if(!setInterleaved){ + assert(in1!=null && out1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n"; + if(in2!=null){ //If there are 2 input streams. + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else{ //There is one input stream. 
+ if(out2!=null){ + FASTQ.FORCE_INTERLEAVED=true; + FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + } + } + + if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} + if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;} + + if(!Tools.testOutputFiles(overwrite, false, out1, out2)){ + throw new RuntimeException("\n\nOVERWRITE="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n"); + } + + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, false); + ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, false); + + ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); + ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); + + ffheader=FileFormat.testInput(headerFile, FileFormat.TEXT, null, true, true); + } + + /** TODO */ + public static void printOptions(){ + System.err.println("Usage information unavailable"); + } + + void process(Timer t){ + + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, colorspace, false, ffin1, ffin2, null, null); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Input is "+(paired ? 
"paired" : "unpaired"));} + + RTextOutputStream3 ros=null; + if(out1!=null){ + final int buff=4; + + if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){ + outstream.println("Writing interleaved."); + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name."; + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "out1 and out2 have same name."; + + ros=new RTextOutputStream3(ffout1, ffout2, null, null, buff, null, false); + ros.start(); + } + + long readsProcessed=0; + long basesProcessed=0; + + TextFile tf=new TextFile(ffheader); + + { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + +// System.err.println("Fetched "+reads); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); + } + + while(reads!=null && reads.size()>0){ + + for(int idx=0; idx{ + + public Orf(String name_, int start_, int stop_, byte strand_){ + name=name_; + start=start_; + stop=stop_; + strand=strand_; + assert(stop>start || (start==0 && stop==0)); + } + + public String toString(){ + return name+"\t"+start+"\t"+stop+"\t"+strand; + } + + public int length(){return stop-start+1;} + + public double avgCoverage(){ + int len=length(); + return len<=0 ? 0 : baseDepth/(double)len; + } + + public double fractionCovered(){ + int len=length(); + return len<=0 ? 
0 : baseCoverage/(double)len; + } + + public int[] readCoverageArray(CoverageArray ca){ + + final int len=length(); + if(len<1 || ca==null){return null;} + final int[] array=new int[len]; + + baseCoverage=0; + baseDepth=0; + minDepth=Integer.MAX_VALUE; + maxDepth=0; + medianDepth=0; + stdevDepth=0; + + for(int i=start, j=0; i<=stop; i++, j++){ + int cov=ca.get(i); + array[j]=cov; + if(cov>1){ + baseCoverage++; + baseDepth+=cov; + minDepth=Tools.min(minDepth, cov); + maxDepth=Tools.max(maxDepth, cov); + } + } + if(baseDepth>0){ + Arrays.sort(array); + medianDepth=array[array.length/2]; + stdevDepth=Tools.standardDeviation(array); + } + return array; + } + + @Override + public int compareTo(Orf o) { + int x=name.compareTo(o.name); + if(x!=0){return x;} + x=o.start-start; + if(x!=0){return x;} + x=o.stop-stop; + if(x!=0){return x;} + return o.strand-strand; + } + + @Override + public boolean equals(Object o){return equals((Orf)o);} + public boolean equals(Orf o){return compareTo(o)==0;} + + @Override + public int hashCode(){return Integer.rotateLeft(name.hashCode(),16)^(start<<8)^(stop)^strand;} + + /** Name of ORF (not necessarily the name of its scaffold) */ + public String name; + public int start; + public int stop; + public byte strand; + + /** Number of bases with nonzero coverage */ + public long baseCoverage; + /** Number of reads mapped to this orf */ + public long readDepth=0; + /** Number of bases mapped to this orf */ + public long baseDepth=0; + /** Lowest base depth */ + public long minDepth=0; + /** Highest base depth */ + public long maxDepth=0; + /** Median base depth */ + public long medianDepth=0; + /** Standard deviation of depth */ + public double stdevDepth=0; + + +} diff --git a/current/jgi/RQCFilter.java b/current/jgi/RQCFilter.java new file mode 100755 index 0000000..e92ee10 --- /dev/null +++ b/current/jgi/RQCFilter.java @@ -0,0 +1,983 @@ +package jgi; + +import java.io.File; +import java.text.SimpleDateFormat; +import java.util.ArrayList; 
+import java.util.Date; +import java.util.TimeZone; + +import dna.Data; + +import stream.FASTQ; +import stream.Read; + +import align2.BBMap; +import align2.Shared; +import align2.Tools; +import align2.TrimRead; +import fileIO.ByteFile1; +import fileIO.ReadWrite; +import fileIO.TextStreamWriter; + +/** + * Wrapper for BBDukF to implement Rolling QC's filter stage. + * @author Brian Bushnell + * @date Nov 26, 2013 + * + */ +public class RQCFilter { + + + /*--------------------------------------------------------------*/ + /*---------------- Initialization Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * Program entrance from command line. + * @param args Command line arguments + */ + public static void main(String[] args){ + //Create a filter instance + RQCFilter filter=new RQCFilter(args); + + ///...and execute it. + filter.process(); + } + + /** + * Constructor. + * @param args Command line arguments + */ + RQCFilter(String[] args){ + + //Optional default parameters to match current pipeline +// arglist.add("k=22"); +// arglist.add("maxbadkmers=2"); + + //Symbols to insert in output filename to denote operations performed; may be overriden from command line + String symbols_=null;//"filtered" + + //Parse argument list + for(int i=0; i1 ? 
split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + +// System.out.println("Processing '"+arg+"' a='"+a+"', b='"+b+"'"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null") || a.equals(in2)){ + // do nothing + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + primaryArgList.add(arg); + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){ + out1=b; + }else if(a.equals("out2") || a.equals("output2")){ + out2=b; + }else if(a.equals("qfin") || a.equals("qfin1")){ + qfin1=b; + }else if(a.equals("qfout") || a.equals("qfout1")){ + qfout1=b; + }else if(a.equals("qfin2")){ + qfin2=b; + }else if(a.equals("qfout2")){ + qfout2=b; + }else if(a.equals("ref")){ + if(b!=null){ + if(!b.contains(",") || new File(b).exists()){ + refs.add(b); + }else{ + String[] split2=b.split(","); + for(String s2 : split2){ + refs.add(s2); + } + } + } + }else if(a.equals("artifactdb")){ + mainArtifactFile=b; + }else if(a.equals("rnadb")){ + artifactFileRna=b; + }else if(a.equals("dnadb")){ + artifactFileDna=b; + }else if(a.equals("phixref")){ + phixRef=b; + }else if(a.equals("fragadapter")){ + fragAdapter=b; + }else if(a.equals("lfpelinker")){ + lfpeLinker=b; + }else if(a.equals("cliplinker") || a.equals("jointseq")){ + clipLinker=b; + }else if(a.equals("clrslinker")){ + clrsLinker=b; + }else if(a.equals("trimfragadapter")){ + fragAdapterFlag=Tools.parseBoolean(b); + }else if(a.equals("removehuman")){ + humanFlag=Tools.parseBoolean(b); + }else if(a.equals("useindex")){ + humanRefIndexedFlag=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + 
overwrite=Tools.parseBoolean(b); + }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){ + minLen=Integer.parseInt(b); + }else if(a.equals("mlf") || a.equals("minlenfrac") || a.equals("minlenfraction") || a.equals("minlengthfraction")){ + minLenFraction=Float.parseFloat(b); + }else if(a.equals("libtype") || a.equals("library")){ + libType=toLibType(b); + }else if(a.equals("path") || a.equals("outdir")){ + outDir=b; + }else if(a.equals("symbols")){ + symbols_=b; + }else if(a.equals("overallstats") || a.equals("stats")){ + rqcStatsName=b; + }else if(a.equals("scafstats")){ + scaffoldStatsName=b; + }else if(a.equals("kmerstats")){ + kmerStatsName=b; + }else if(a.equals("log")){ + logName=b; + }else if(a.equals("filelist")){ + fileListName=b; + }else if(a.equals("compress")){ + compress=Tools.parseBoolean(b); + }else if(a.equals("rna")){ + rnaFlag=Tools.parseBoolean(b); + }else if(a.equals("phix")){ + phixFlag=Tools.parseBoolean(b); + }else if(a.equals("jointseq")){ + jointSeq=b; + }else if(a.equals("ktrim")){ + ktrim=b; + }else if(a.equals("mink")){ + mink=Integer.parseInt(b); + }else if(a.equals("maq")){ + maq=Byte.parseByte(b); + }else if(a.equals("trimq")){ + trimq=Byte.parseByte(b); + }else if(a.equals("qtrim")){ + if(b==null){qtrim="rl";} + else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){qtrim="l";} + else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){qtrim="r";} + else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){qtrim="lr";} + else if(Character.isDigit(b.charAt(0))){ + trimq=Byte.parseByte(b); + qtrim=(trimq>=0 ? "lr" : "f"); + }else{qtrim=""+Tools.parseBoolean(b);} + }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){ + if(b!=null && (b.charAt(0)=='.' 
|| Character.isDigit(b.charAt(0)))){ + TrimRead.optimalMode=true; + TrimRead.optimalBias=Float.parseFloat(b); + assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("maxns")){ + maxNs=Integer.parseInt(b); + }else if(a.equals("usetmpdir")){ + writeTempToTmpdir=Tools.parseBoolean(b); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + System.err.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + in1=arg; + if(arg.indexOf('#')>-1 && !new File(arg).exists()){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else{ + //Uncaptured arguments are passed to BBDuk + primaryArgList.add(arg); + } + } + +// assert(false) : rnaFlag+"\n"+primaryArgList+"\n"+libType+"\n"+outDir; + + if(writeTempToTmpdir){tmpDir=Shared.TMPDIR;} + else{tmpDir=null;} + + //Set 
final field 'symbols' + symbols=(symbols_==null ? abbreviation() : symbols_); + + //Pass overwrite flag to BBDuk + primaryArgList.add("ow="+overwrite); + + if(outDir!=null){ + outDir=outDir.trim().replace('\\', '/'); + if(outDir.length()>0 && !outDir.endsWith("/")){outDir=outDir+"/";} + }else{outDir="";} + + {//Prepend output directory to output files + if(logName!=null){logName=outDir+logName+".tmp";} //Add '.tmp' to log file + if(reproduceName!=null){reproduceName=outDir+reproduceName;} + if(fileListName!=null){fileListName=outDir+fileListName;} + } + + {//Create unique output file names for second pass + if(rqcStatsName!=null){ + rqcStatsName_kt=outDir+"ktrim_"+rqcStatsName; + rqcStatsName=outDir+rqcStatsName; + } + if(kmerStatsName!=null){ + kmerStatsName_kt=outDir+"ktrim_"+kmerStatsName; + kmerStatsName=outDir+kmerStatsName; + } + if(scaffoldStatsName!=null){ + scaffoldStatsName_kt=outDir+"ktrim_"+scaffoldStatsName; + scaffoldStatsName=outDir+scaffoldStatsName; + } + } + + //Create output filename from input filename if no output filename is specified + if(out1==null && in1!=null){ + File f=new File(in1); + String name=f.getName(); + String raw=ReadWrite.rawName(name); + int x=raw.lastIndexOf('.'); + if(x>-1){ + out1=raw.substring(0, x)+"."+symbols+raw.substring(x)+(compress ? ".gz" : ""); + }else{ + out1=raw+"."+symbols+".fastq"+(compress ? ".gz" : ""); + } + } + + tempSalt=KmerNormalize.getSalt(out1, 1); + trimPrefix="TEMP_TRIM_"+tempSalt+"_"; + humanPrefix="TEMP_HUMAN_"+tempSalt+"_"; + filterPrefix="TEMP_FILTER_"+tempSalt+"_"; + } + + + /*--------------------------------------------------------------*/ + /*---------------- Processing Methods ----------------*/ + /*--------------------------------------------------------------*/ + + + /** + * Primary method to fully execute the program. 
+ */ + public void process(){ + + //Create output directory + if(outDir!=null && outDir.length()>0){ + File f=new File(outDir); + if(!f.exists()){ + f.mkdirs(); + } + } + + //Create log file + if(logName!=null){ + boolean b=Tools.canWrite(logName, overwrite); + assert(b) : "Can't write to "+logName; + log("start", false); + } + + //Create file list file + if(fileListName!=null){ + boolean b=Tools.canWrite(fileListName, overwrite); + assert(b) : "Can't write to "+fileListName; + + StringBuilder sb=new StringBuilder(); + if(out1!=null){sb.append("filtered_fastq="+out1).append('\n');} + if(qfout1!=null){sb.append("filtered_qual="+qfout1).append('\n');} + if(out2!=null){sb.append("filtered_fastq_2="+out2).append('\n');} + if(qfout2!=null){sb.append("filtered_qual_2="+qfout2).append('\n');} + + if(sb.length()>0){ + ReadWrite.writeString(sb, fileListName, false); + } + } + + final boolean doFilter; + final boolean doTrim; + final boolean doHuman=humanFlag; + + //Determine execution path + if(libType==FRAG || ((libType==LFPE && lfpeLinker==null) || (libType==CLIP && clipLinker==null) || (libType==CLRS && clrsLinker==null))){ + doTrim=fragAdapterFlag; + doFilter=true; + }else if(libType==LFPE){ + doTrim=true; + doFilter=true; + }else if(libType==CLIP){ + doTrim=true; + doFilter=true; + }else if(libType==CLRS){ + doTrim=true; + doFilter=true; + }else{ + throw new RuntimeException("Unknown library type."); + } + + { + int step=0; + final int numSteps=(doFilter ? 1 : 0)+(doTrim ? 1 : 0)+(doHuman ? 
1 : 0); + String inPrefix=null, outPrefix=null; + if(doTrim){ + step++; + inPrefix=outPrefix; + outPrefix=(step1){ + delete(inPrefix, out1, out2, qfout1, qfout2); + } + } + + if(doHuman){ + step++; + inPrefix=outPrefix; + outPrefix=(step1){ + delete(inPrefix, out1, out2, qfout1, qfout2); + } + } + } + + //Write combined stats file (number of reads/bases present/removed in each stage) + if(rqcStatsName!=null){ + final TextStreamWriter tsw=new TextStreamWriter(rqcStatsName, overwrite, false, false); + tsw.start(); + tsw.println(BBDukF.rqcString()); + tsw.poisonAndWait(); + } + +// {//Set files to permission 777 +// setPermissions((out1==null ? null : outDir+out1),(out2==null ? null : outDir+out2)); +// setPermissions((qfout1==null ? null : outDir+qfout1),(qfout2==null ? null : outDir+qfout2)); +// setPermissions(reproduceName,fileListName); +// setPermissions(rqcStatsName,kmerStatsName,scaffoldStatsName); +// setPermissions(rqcStatsName_kt,kmerStatsName_kt,scaffoldStatsName_kt); +// setPermissions(outDir); +// } + + //Finish writing log file + if(logName!=null){ + log("complete", true); + if(logName.endsWith(".tmp")){ //Remove .tmp extension + String old=logName; + logName=logName.substring(0, logName.length()-4); + new File(old).renameTo(new File(logName)); + } + } + +// //Set log file permission +// setPermissions(logName); + + } + + /** + * Runs BBMap to perform: + * Human contaminant removal. 
+ * + * @param in1 Primary input reads file (required) + * @param in2 Secondary input reads file + * @param out1 Primary output reads file (required) + * @param out2 Secondary output reads file + * @param qfin1 Primary input qual file + * @param qfin2 Secondary input qual file + * @param qfout1 Primary output qual file + * @param qfout2 Secondary output qual file + * @param inPrefix Append this prefix to input filenames + */ + private void dehumanize(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2, String inPrefix, String outPrefix){ + + log("dehumanize start", true); + + ArrayList argList=new ArrayList(); + + final String inPre=(inPrefix==null ? "" : (tmpDir==null ? outDir : tmpDir)+inPrefix); + final String outPre=(outPrefix==null ? outDir : (tmpDir==null ? outDir : tmpDir)+outPrefix); + + { +// argList.add("kfilter="+47); + argList.add("minratio=.75"); + argList.add("maxindel=20"); + argList.add("bw=20"); + argList.add("bwr=0.18"); + argList.add("minhits=2"); + if(humanRefIndexedFlag){ + argList.add("path="+humanPath); + }else{ + argList.add("ref="+humanRef); + argList.add("nodisk"); + } + argList.add("quickmatch"); + argList.add("overwrite="+overwrite); + + //Pass along uncaptured arguments + for(String s : primaryArgList){argList.add(s);} + + //Set read I/O files + if(in1!=null){argList.add("in1="+inPre+in1);} + if(in2!=null){argList.add("in2="+inPre+in2);} + if(out1!=null){argList.add("outu1="+outPre+out1);} + if(out2!=null){argList.add("outu2="+outPre+out2);} + if(qfin1!=null){argList.add("qfin1="+inPre+qfin1);} + if(qfin2!=null){argList.add("qfin2="+inPre+qfin2);} + if(qfout1!=null){argList.add("qfoutu1="+outPre+qfout1);} + if(qfout2!=null){argList.add("qfoutu2="+outPre+qfout2);} + + } + + String[] args=argList.toArray(new String[0]); + + {//Run BBMap + try { + BBMap.main(args); + } catch (Exception e) { + e.printStackTrace(); + log("failed", true); + System.exit(1); + } + } + + 
if(reproduceName!=null){ + writeReproduceFile(reproduceName, "bbmap.sh", args, true, overwrite); + } + + //Optionally append files to file list here + + log("dehumanize finish", true); + } + + /** + * Runs BBDuk to perform: + * Quality filtering, quality trimming, n removal, short read removal, artifact removal (via kmer filtering), phiX removal. + * + * @param in1 Primary input reads file (required) + * @param in2 Secondary input reads file + * @param out1 Primary output reads file (required) + * @param out2 Secondary output reads file + * @param qfin1 Primary input qual file + * @param qfin2 Secondary input qual file + * @param qfout1 Primary output qual file + * @param qfout2 Secondary output qual file + * @param inPrefix Append this prefix to input filenames + */ + private void filter(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2, String inPrefix, String outPrefix){ + + log("filter start", true); + + ArrayList argList=new ArrayList(); + + final String inPre=(inPrefix==null ? "" : (tmpDir==null ? outDir : tmpDir)+inPrefix); + final String outPre=(outPrefix==null ? outDir : (tmpDir==null ? 
outDir : tmpDir)+outPrefix); + + {//Fill list with BBDuk arguments + if(maq>-1){argList.add("maq="+maq);} + if(qtrim!=null){ + argList.add("trimq="+trimq); + argList.add("qtrim="+qtrim); + } + argList.add("overwrite="+overwrite); + if(maxNs>=0){argList.add("maxns="+maxNs);} + if(minLen>0){argList.add("minlen="+minLen);} + if(minLenFraction>0){argList.add("minlenfraction="+minLenFraction);} + + //Pass along uncaptured arguments + for(String s : primaryArgList){argList.add(s);} + + //Set read I/O files + if(in1!=null){argList.add("in1="+inPre+in1);} + if(in2!=null){argList.add("in2="+inPre+in2);} + if(out1!=null){argList.add("out1="+outPre+out1);} + if(out2!=null){argList.add("out2="+outPre+out2);} + if(qfin1!=null){argList.add("qfin1="+inPre+qfin1);} + if(qfin2!=null){argList.add("qfin2="+inPre+qfin2);} + if(qfout1!=null){argList.add("qfout1="+outPre+qfout1);} + if(qfout2!=null){argList.add("qfout2="+outPre+qfout2);} + +// if(rqcStatsName!=null){al.add("rqc="+rqcStatsName);} //Old style for 2 log files + if(rqcStatsName!=null){argList.add("rqc=hashmap");} + if(kmerStatsName!=null){argList.add("outduk="+kmerStatsName);} + if(scaffoldStatsName!=null){argList.add("stats="+scaffoldStatsName);} + } + + {//Add BBDuk references + refs.add(mainArtifactFile); + refs.add(rnaFlag ? artifactFileRna : artifactFileDna); + + if(phixFlag){refs.add(phixRef);} + + if(libType==FRAG){ + + }else if(libType==LFPE){ + + }else if(libType==CLIP){ + + }else if(libType==CLRS){ + + }else{ + throw new RuntimeException("Unknown library type."); + } + + StringBuilder refstring=new StringBuilder(); + for(String ref : refs){ + if(ref!=null){ + refstring.append(refstring.length()==0 ? 
"ref=" : ","); + refstring.append(ref); + } + } + + if(refstring!=null && refstring.length()>0){ + argList.add(refstring.toString()); + } + } + + String[] dukargs=argList.toArray(new String[0]); + + if(reproduceName!=null){ + writeReproduceFile(reproduceName, "bbduk.sh", dukargs, true, overwrite); + } + + {//Run BBDuk + BBDukF duk=new BBDukF(dukargs); + try { + duk.process(); + } catch (Exception e) { + e.printStackTrace(); + log("failed", true); + System.exit(1); + } + } + + //Optionally append files to file list here + + log("filter finish", true); + } + + + /** + * Runs BBDuk to perform: + * Kmer trimming, short read removal. + * + * @param in1 Primary input reads file (required) + * @param in2 Secondary input reads file + * @param out1 Primary output reads file (required) + * @param out2 Secondary output reads file + * @param qfin1 Primary input qual file + * @param qfin2 Secondary input qual file + * @param qfout1 Primary output qual file + * @param qfout2 Secondary output qual file + * @param outPrefix Append this prefix to output filenames + */ + private void trim(String in1, String in2, String out1, String out2, String qfin1, String qfin2, String qfout1, String qfout2, String inPrefix, String outPrefix){ + + log("ktrim start", true); + + ArrayList argList=new ArrayList(); + + final String inPre=(inPrefix==null ? "" : (tmpDir==null ? outDir : tmpDir)+inPrefix); + final String outPre=(outPrefix==null ? outDir : (tmpDir==null ? outDir : tmpDir)+outPrefix); + + {//Fill list with BBDuk arguments + argList.add("mink="+mink); + argList.add("ktrim="+(ktrim==null ? 
"f" : ktrim)); + argList.add("overwrite="+overwrite); + if(minLen>0){argList.add("minlen="+minLen);} + if(minLenFraction>0){argList.add("minlenfraction="+minLenFraction);} + + //Pass along uncaptured arguments + for(String s : primaryArgList){argList.add(s);} + + //Set read I/O files + if(in1!=null){argList.add("in1="+inPre+in1);} + if(in2!=null){argList.add("in2="+inPre+in2);} + if(out1!=null){argList.add("out1="+outPre+out1);} + if(out2!=null){argList.add("out2="+outPre+out2);} + if(qfin1!=null){argList.add("qfin1="+inPre+qfin1);} + if(qfin2!=null){argList.add("qfin2="+inPre+qfin2);} + if(qfout1!=null){argList.add("qfout1="+outPre+qfout1);} + if(qfout2!=null){argList.add("qfout2="+outPre+qfout2);} + +// if(rqcStatsName!=null){al.add("rqc="+rqcStatsName_kt);} //Old style for 2 log files + if(rqcStatsName!=null){argList.add("rqc=hashmap");} + if(kmerStatsName!=null){argList.add("outduk="+kmerStatsName_kt);} + if(scaffoldStatsName!=null){argList.add("stats="+scaffoldStatsName_kt);} + } + + {//Add BBDuk references + ArrayList refs=new ArrayList(); + + if(libType==FRAG){ + refs.add(fragAdapter); + }else if(libType==LFPE){ + refs.add(lfpeLinker); + }else if(libType==CLIP){ +// refs.add(clipLinker); + if(clipLinker!=null){ + argList.add("literal="+clipLinker); + {//Special processing for literal strings of approx 4bp + String[] split=clipLinker.split(","); + int min=split[0].length(); + for(String s : split){min=Tools.min(min, s.length());} + argList.add("k="+min); + argList.add("mink=-1"); + argList.add("mm=f"); + argList.add("hdist=0"); + argList.add("edist=0"); + argList.add("ktrimexclusive=t"); + } + }else{ + throw new RuntimeException("Null clip linker."); + } + }else if(libType==CLRS){ + refs.add(clrsLinker); + }else{ + throw new RuntimeException("Unknown library type."); + } + + StringBuilder refstring=new StringBuilder(); + for(String ref : refs){ + if(ref!=null){ + refstring.append(refstring.length()==0 ? 
"ref=" : ","); + refstring.append(ref); + } + } + + if(refstring!=null && refstring.length()>0){ + argList.add(refstring.toString()); + } + } + + String[] dukargs=argList.toArray(new String[0]); + + if(reproduceName!=null){ + writeReproduceFile(reproduceName, "bbduk.sh", dukargs, false, overwrite); + } + + {//run BBDuk + BBDukF duk=new BBDukF(dukargs); + try { + duk.process(); + } catch (Exception e) { + e.printStackTrace(); + log("failed", true); + System.exit(1); + } + } + + //Optionally append files to file list here + + log("ktrim finish", true); + } + + /*--------------------------------------------------------------*/ + /*---------------- Helper Methods ----------------*/ + /*--------------------------------------------------------------*/ + + + /** + * Log a message in the log file + * @param message Message to log + * @param append True to append, false to overwrite + */ + private void log(String message, boolean append){ + if(logName!=null){ + ReadWrite.writeString(message+", "+timeString()+"\n", logName, append); + } + } + + + /** + * Delete all non-null filenames. + * @param prefix Append this prefix to filenames before attempting to delete them + * @param names Filenames to delete + */ + private void delete(String prefix, String...names){ + log("delete temp files start", true); + if(names!=null){ + final String pre=(prefix==null ? "" : (tmpDir==null ? outDir : tmpDir)+prefix); + for(String s : names){ + if(s!=null){ + s=pre+s; + if(verbose){System.err.println("Trying to delete "+s);} + File f=new File(s); + if(f.exists()){ + f.delete(); + writeReproduceFile(reproduceName, "rm", new String[] {s}, true, overwrite); + } + } + } + } + log("delete temp files finish", true); + } + + /** + * @return String of symbols indicating which processes were applied to the input reads + */ + private String abbreviation(){ + StringBuilder sb=new StringBuilder(); + + if(mainArtifactFile!=null || (rnaFlag ? 
artifactFileRna!=null : artifactFileDna!=null)){sb.append("a");} + + if(maxNs>=0){sb.append("n");} +// if(qtrim!=null && !qtrim.equalsIgnoreCase("f") && !qtrim.equalsIgnoreCase("false")){sb.append("q");} + if(maq>0){sb.append("q");} + + if(rnaFlag){sb.append("r");} + else{sb.append("d");} + + if(libType==CLIP){sb.append("c");} + else if(libType==LFPE){sb.append("l");} + else if(libType==CLRS){sb.append("s");} + + if(phixFlag){sb.append("p");} + + return sb.toString(); + } + + /*--------------------------------------------------------------*/ + /*---------------- Static Methods ----------------*/ + /*--------------------------------------------------------------*/ + + /** + * TODO: Some machines are set to UTC rather than PST + * @return Timestamp in RQC's format + */ + public static String timeString(){ + SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss"); +// sdf.setTimeZone(TimeZone.getTimeZone("PST")); + sdf.setTimeZone(TimeZone.getDefault()); + return sdf.format(new Date()); + } + + /** + * Set permissions on these files to 777 + * @param names List of filenames + */ + private static void setPermissions(String...names){ + if(names==null){return;} + for(String name : names){ + if(name!=null && name.trim().length()>0 && new File(name).exists()){ + ReadWrite.setPermissions(name, true, true, true, false); + } + } + } + + /** + * Write a string to the file containing steps needed to regenerate the output + * @param fname Filename to write, including path + * @param command Command to add to file + * @param args Arguments to the command + * @param append Append to existing file rather than overwriting + * @param overwrite Permission to overwrite + */ + private static void writeReproduceFile(String fname, String command, String[] args, boolean append, boolean overwrite){ + StringBuilder sb=new StringBuilder(); + if(!append){ + boolean b=Tools.canWrite(fname, overwrite); + assert(b) : "Can't write to "+fname; + sb.append("#!/bin/bash\n"); + } + 
sb.append(command); + if(args!=null){ + for(String s : args){ + sb.append(' ').append(s); + } + } + sb.append('\n'); + ReadWrite.writeString(sb, fname, append); + } + + /** + * @param s String representation of library type + * @return Numeric code for library type + */ + private static int toLibType(String s){ + if(s==null){return FRAG;} + s=s.trim().toLowerCase(); + if(s.equals("lfpe")){return LFPE;} + if(s.equals("clip")){return CLIP;} + if(s.equals("clrs")){return CLRS;} + if(s.equals("frag") || s.equals("fragment")){return FRAG;} + throw new RuntimeException("Unknown library type "+s); + } + + /*--------------------------------------------------------------*/ + /*---------------- Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Symbols to insert in output filename to denote operations performed */ + private final String symbols; + + /** Type of library; controls processing methods and references to use */ + private int libType=FRAG; + /** True for rna artifacts, false for dna artifacts */ + private boolean rnaFlag=false; + /** True if phix should be filtered out */ + private boolean phixFlag=false; + /** Unused */ + private String jointSeq=null; + /** Toss reads shorter than this */ + private int minLen=25; + /** Toss reads shorter than this fraction of initital length, after trimming */ + private float minLenFraction=0.333f; + /** Trim bases at this quality or below */ + private byte trimq=6; + /** Throw away reads below this average quality before trimming. Default: 6 */ + private byte maq=5; + /** Quality-trimming mode */ + private String qtrim="f";//"rl"; + /** Kmer-trimming mode */ + private String ktrim="r"; + /** Shortest kmer to use for trimming */ + private int mink=8; + /** Throw away reads containing more than this many Ns. 
Default: 0 (toss reads with any Ns) */ + private int maxNs=0; + + /** Trim fragment adapters from right side of reads */ + private boolean fragAdapterFlag=false; + + /** Remove reads mapping to human with high identity */ + private boolean humanFlag=false; + /** Use indexed version of human reference, rather than regenerating from fasta */ + private boolean humanRefIndexedFlag=true; + + private boolean verbose=false; + private boolean overwrite=true; + private boolean compress=true; + + private boolean writeTempToTmpdir=false; + + /** Arguments to pass to BBDuk */ + private ArrayList primaryArgList=new ArrayList(); + /** References to pass to BBDuk for artifact removal */ + private ArrayList refs=new ArrayList(); + + /*--------------------------------------------------------------*/ + /*---------------- Read Data Files ----------------*/ + /*--------------------------------------------------------------*/ + + private final String tempSalt; + + private final String trimPrefix; + private final String humanPrefix; + private final String filterPrefix; + + /** Directory in which to write all files */ + private String outDir=""; + + /** Directory in which to write all temp files */ + private String tmpDir=Shared.TMPDIR; + + /** Primary input reads file (required) */ + private String in1=null; + /** Secondary input reads file */ + private String in2=null; + /** Primary output reads file (required) */ + private String out1=null; + /** Secondary output reads file */ + private String out2=null; + /** Primary input qual file */ + private String qfin1=null; + /** Secondary input qual file */ + private String qfin2=null; + /** Primary output qual file */ + private String qfout1=null; + /** Secondary output qual file */ + private String qfout2=null; + + /*--------------------------------------------------------------*/ + /*---------------- Log Files ----------------*/ + /*--------------------------------------------------------------*/ + + private String logName="status.log"; + 
private String reproduceName="reproduce.sh"; + private String fileListName="file-list.txt"; + + private String rqcStatsName="filterStats.txt"; + private String kmerStatsName="kmerStats.txt"; + private String scaffoldStatsName="scaffoldStats.txt"; + + /** ktrim phase rqc stats file */ + private String rqcStatsName_kt; + /** ktrim phase stats file */ + private String kmerStatsName_kt; + /** ktrim phase scaffold stats file */ + private String scaffoldStatsName_kt; + + /*--------------------------------------------------------------*/ + /*---------------- Reference Files ----------------*/ + /*--------------------------------------------------------------*/ + + private String mainArtifactFile = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/Illumina.artifacts.2013.12.no_DNA_RNA_spikeins.fa"; + private String artifactFileRna = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/RNA_spikeins.artifacts.2012.10.NoPolyA.fa"; + private String artifactFileDna = "/global/dna/shared/rqc/ref_databases/qaqc/databases/illumina.artifacts/DNA_spikeins.artifacts.2012.10.fa"; + private String phixRef = "/global/dna/shared/rqc/ref_databases/qaqc/databases/phix174_ill.ref.fa"; + private String lfpeLinker = "/global/dna/shared/rqc/ref_databases/qaqc/databases/lfpe.linker.fa"; + private String clrsLinker = "/global/dna/shared/rqc/ref_databases/qaqc/databases/crelox.fa"; + private String clipLinker = clipLinkerDefault; //A literal string; "CATG" is supposed to be the normal linker. 
+ + private String allArtifactsLatest = "/global/projectb/sandbox/rqc/qcdb/illumina.artifacts/Illumina.artifacts.fa"; + private String fragAdapter = "/global/projectb/sandbox/gaag/bbtools/data/adapters.fa"; + private String humanPath = "/global/projectb/sandbox/gaag/bbtools/hg19/"; + private String humanRef = "/global/projectb/sandbox/gaag/bbtools/hg19/hg19.fa.gz"; + + /*--------------------------------------------------------------*/ + /*---------------- Static Fields ----------------*/ + /*--------------------------------------------------------------*/ + + /** Library type codes */ + private static final int FRAG=0, LFPE=1, CLIP=2, CLRS=3; + private static final String clipLinkerDefault = "CATG"; + +} diff --git a/current/jgi/RandomGenome.java b/current/jgi/RandomGenome.java new file mode 100755 index 0000000..26a7166 --- /dev/null +++ b/current/jgi/RandomGenome.java @@ -0,0 +1,54 @@ +package jgi; + +import java.util.Random; + +import dna.AminoAcid; +import fileIO.ReadWrite; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Jan 3, 2013 + * + */ +public class RandomGenome { + + public static void main(String[] args){ + ReadWrite.ZIPLEVEL=2; + Random randy=new Random(); + int chroms=Integer.parseInt(args[0]); + int len=Integer.parseInt(args[1]); + + String fname=args[2]; + TextStreamWriter tsw=new TextStreamWriter(fname, false, false, true); + tsw.start(); + + for(int chrom=1; chrom<=chroms; chrom++){ + tsw.println(">"+chrom); + StringBuilder sb=new StringBuilder(101); + for(int i=0, j=0; i0){ + sb.append('\n'); + tsw.print(sb); + } + } + tsw.poison(); + tsw.waitForFinish(); + + } + +} diff --git a/current/jgi/ReadKmerDepthDistribution.java b/current/jgi/ReadKmerDepthDistribution.java new file mode 100755 index 0000000..1bfb6af --- /dev/null +++ b/current/jgi/ReadKmerDepthDistribution.java @@ -0,0 +1,1096 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.lang.Thread.State; +import java.util.ArrayList; 
+import java.util.Arrays; +import java.util.List; +import java.util.concurrent.ThreadLocalRandom; +import java.util.concurrent.atomic.AtomicLong; +import java.util.concurrent.atomic.AtomicLongArray; + +import kmer.KCountArray; +import kmer.KmerCount7MTA; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import dna.AminoAcid; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.FileFormat; +import fileIO.TextStreamWriter; + + + +/** + * This class is designed to visualize the distribution of kmer depths across individual reads. + * @author Brian Bushnell + * @date May 15, 2013 + * + */ +public class ReadKmerDepthDistribution { + + public static void main(String[] args){ + for(String s : args){if(s.contains("=standardout") || s.contains("=stdout")){outstream=System.err;}} + outstream.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + + if(args.length<1){throw new RuntimeException("No parameters.");} + + String reads1=(args[0].indexOf("=")>0 ? null : args[0]); + String reads2=(reads1!=null && args.length>1 ? 
args[1] : null); + if(reads2!=null && "null".equalsIgnoreCase(reads2)){reads2=null;} + + { + if(reads1!=null && !reads1.contains(",")){ + File f=new File(reads1); + if(!f.exists() || !f.isFile()){throw new RuntimeException(reads1+" does not exist.");} + } + if(reads2!=null && !reads2.contains(",")){ + File f=new File(reads2); + if(!f.exists() || !f.isFile()){throw new RuntimeException(reads2+" does not exist.");} + if(reads1.equalsIgnoreCase(reads2)){ + throw new RuntimeException("Both input files are the same."); + } + } + } + + FASTQ.PARSE_CUSTOM=false; + KmerCount7MTA.minQuality=4; + KmerCount7MTA.minProb=0.4f; + + int k=31; + int cbits=32; + int gap=0; + int hashes=3; +// int matrixbits=-1; + long cells=-1; + long maxReads=-1; + int buildpasses=1; + long tablereads=-1; //How many reads to process when building the hashtable + int buildStepsize=4; + String outKeep=null; + int prehashes=-1; + long precells=-1; + String histFile=null; + int threads=-1; + ReadWrite.ZIPLEVEL=2; + + int minq=KmerCount7MTA.minQuality; + KmerCount7MTA.CANONICAL=true; + + boolean auto=true; + boolean deterministic=true; + + FastaReadInputStream.TARGET_READ_LEN=Integer.MAX_VALUE; + FASTQ.PARSE_CUSTOM=false; + + List extra=null; + + long memory=Runtime.getRuntime().maxMemory(); + long tmemory=Runtime.getRuntime().totalMemory(); +// assert(false) : memory+", "+tmemory; + + for(int i=(reads1==null ? 0 : 1); i16 ? 
Integer.MAX_VALUE : (1L<0); + HIST_LEN_PRINT=Tools.max(1, Tools.min(HIST_LEN_PRINT, maxCount)); + assert(HIST_LEN_PRINT<=Integer.MAX_VALUE) : HIST_LEN_PRINT+", "+Integer.MAX_VALUE; + HIST_LEN=(int)Tools.min(maxCount, Tools.max(HIST_LEN_PRINT, HIST_LEN)); + THREAD_HIST_LEN=Tools.min(THREAD_HIST_LEN, HIST_LEN); + + histogram_total=new AtomicLongArray(HIST_LEN); + } + + if(extra!=null){ + for(String s : extra){ + File f=new File(s); + if(!f.exists() || !f.isFile()){throw new RuntimeException(s+" does not exist.");} + assert(!s.equalsIgnoreCase(reads1) && (reads2==null || !s.equalsIgnoreCase(reads2))) : "\nInput file "+s+" should not be included as an extra file.\n"; + } + } + +// outstream.println("ForceInterleaved = "+FASTQ.FORCE_INTERLEAVED); + +// assert(false) : reads1+", "+reads2+", "+output; +// if(FASTQ.FORCE_INTERLEAVED && in2==null){ +// outstream.println() +// } + + if(threads<=0){ + if(auto){THREADS=Data.LOGICAL_PROCESSORS;} + else{THREADS=8;} + }else{ + THREADS=threads; + } +// KmerCount7MTA.THREADS=Tools.min(THREADS,6); + KmerCount7MTA.THREADS=THREADS; + +// System.err.println("THREADS="+THREADS+", KmerCount7MTA.THREADS="+KmerCount7MTA.THREADS); + + if(auto && cells==-1){ + final long usable=(long)Tools.max(((memory-96000000)*.73), memory*0.45); + long mem=usable-(USE_HISTOGRAM ? (HIST_LEN*8*(1)) : 0); + if(buildpasses>1){mem/=2;} + cells=(mem*8)/cbits; +// +// long tablebytes=((1L<0 && prehashes>0 ? Tools.toKMG(precells) : "?")); + outstream.println("prefilter hashes: \t"+(precells>0 && prehashes>0 ? 
""+prehashes : "?")); + } + outstream.println("base min quality: \t"+KmerCount7MTA.minQuality); + outstream.println("kmer min prob: \t"+KmerCount7MTA.minProb); + + outstream.println(); + outstream.println("target depth: \t"+TARGET_DEPTH); + outstream.println("min depth: \t"+MIN_DEPTH); + outstream.println("max depth: \t"+MAX_DEPTH); + outstream.println("min good kmers: \t"+MIN_KMERS_OVER_MIN_DEPTH); + outstream.println("depth percentile: \t"+String.format("%.1f", 100*DEPTH_PERCENTILE)); + outstream.println("remove duplicates:\t"+!KmerCount7MTA.KEEP_DUPLICATE_KMERS); + outstream.println("fix spikes: \t"+FIX_SPIKES); + if(USE_HISTOGRAM && HIST_LEN>0){ + outstream.println("histogram length: \t"+(USE_HISTOGRAM ? HIST_LEN : 0)); + } + if(histFile!=null){ + outstream.println("print zero cov: \t"+PRINT_ZERO_COVERAGE); + } + + outstream.println(); + } + + if(!prefilter && k<32 && cells>(1L<<(2*k))){cells=(1L<<(2*k));} + assert(cells>0); + +// KmerCount7MTA.THREADS=Tools.max(THREADS/2, KmerCount7MTA.THREADS); //Seems like 4 is actually optimal... + + FastaReadInputStream.MIN_READ_LEN=k; + + Timer t=new Timer(); + Timer ht=new Timer(); + t.start(); + ht.start(); + KCountArray kca; + KCountArray prefilterArray=null; +// outstream.println(); + if(prefilter){ + prefilterArray=KmerCount7MTA.makeKca(reads1, reads2, extra, k, 2, gap, precells, prehashes, minq, true, tablereads, 1, buildStepsize, 1, 1, null); + outstream.println("Made prefilter: \t"+prefilterArray.toShortString(prehashes)); + double uf=prefilterArray.usedFraction(); + if(uf>0.6){ + outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" : + uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy for kmers of depth under 3. Ideal load is under 60% used." 
+ + "\nFor better accuracy, run on a node with more memory; quality-trim or error-correct reads; " + + "or increase the values of the minprob flag to reduce spurious kmers."); + } + } + kca=KmerCount7MTA.makeKca(reads1, reads2, extra, k, cbits, gap, cells, hashes, minq, true, tablereads, buildpasses, buildStepsize, 2, 2, prefilterArray); + ht.stop(); + + outstream.println("Made hash table: \t"+kca.toShortString(hashes)); + double uf=kca.usedFraction(); + if(uf>0.6){ + outstream.println("Warning: This table is "+(uf>0.995 ? "totally" : uf>0.99 ? "crazy" : uf>0.95 ? "incredibly" : uf>0.9 ? "extremely" : uf>0.8 ? "very" : + uf>0.7 ? "fairly" : "somewhat")+" full, which may reduce accuracy. Ideal load is under 60% used." + + "\nFor better accuracy, use the 'prefilter' flag; run on a node with more memory; quality-trim or error-correct reads; " + + "or increase the values of the minprob flag to reduce spurious kmers. In practice you should still get good normalization results " + + "even with loads over 90%, but the histogram and statistics will be off."); + } + + long estUnique; + outstream.println(); + if(prefilterArray!=null){ + int lim1=prefilterArray.maxValue, lim2=prefilterArray.maxValue+1; + double a=prefilterArray.estimateUniqueKmers(prehashes); + double b=kca.estimateUniqueKmers(hashes, lim2); + a=a-b; + if(CANONICAL){ +// a=(a*KCountArray.canonMask)/(KCountArray.canonMask+1); +// b=(b*KCountArray.canonMask)/(KCountArray.canonMask+1); + }else{ + a/=2; + b/=2; + } + estUnique=((long)((a+b))); + outstream.println("Estimated kmers of depth 1-"+lim1+": \t"+(long)a); + outstream.println("Estimated kmers of depth "+lim2+"+ : \t"+(long)b); + }else{ +// double est=kca.cells*(1-Math.pow(1-Math.sqrt(kca.usedFraction()), 1.0/hashes)); +// double est=kca.cells*(1-Math.pow(1-kca.usedFraction(), 1.0/hashes)); + double est=kca.estimateUniqueKmers(hashes); +// outstream.println("Used cells: "+kca.cellsUsed(1)); + if(CANONICAL){ +// 
est=(est*KCountArray.canonMask)/(KCountArray.canonMask+1); + }else{ + est/=2; + } + estUnique=((long)((est))); + + } + outstream.println("Estimated unique kmers: \t"+estUnique);//+", or "+estUnique+" counting forward kmers only."); +// outstream.println("(Includes forward and reverse kmers)"); + outstream.println(); + outstream.println("Table creation time:\t\t"+ht);//+" \t"+String.format("%.2f", totalBases*1000000.0/(ht.elapsed))+" kb/sec"); + + long bases=0; + + ListNum.setDeterministicRandom(deterministic); + + if(reads1!=null && reads1.contains(",") && !new File(reads1).exists()){ + throw new RuntimeException("This class is not designed to deal with lists of input files."); + }else{ + bases=count(reads1, reads2, kca, k, maxReads, outKeep, overwrite, histFile, estUnique); + } + printTopology(); + + t.stop(); + outstream.println("\nTotal time: \t\t"+t+" \t"+String.format("%.2f", bases*1000000.0/(t.elapsed))+" kb/sec"); + + } + + + public static void printTopology(){ + long total=peaks.get()+spikes.get()+flats.get()+valleys.get()+slopes.get(); + double mult=100.0/total; + + long sp=spikes.get(); + long pe=peaks.get(); + long va=valleys.get(); + long sl=slopes.get(); + long fl=flats.get(); + double dsp=mult*sp; + double dpe=mult*pe; + double dva=mult*va; + double dsl=mult*sl; + double dfl=mult*fl; + + System.err.println("\nDepth Topology:\t"); + System.err.println("Spikes: \t\t\t"+(dsp<10 ? " " : "")+String.format("%.3f%% \t%d",dsp,sp)); + System.err.println("Peaks: \t\t\t"+(dpe<10 ? " " : "")+String.format("%.3f%% \t%d",dpe,pe)); + System.err.println("Valleys: \t\t\t"+(dva<10 ? " " : "")+String.format("%.3f%% \t%d",dva,va)); + System.err.println("Slopes: \t\t\t"+(dsl<10 ? " " : "")+String.format("%.3f%% \t%d",dsl,sl)); + System.err.println("Flats: \t\t\t"+(dfl<10 ? 
" " : "")+String.format("%.3f%% \t%d",dfl,fl)); + } + + + public static long count(String in1, String in2, KCountArray kca, int k, long maxReads, + String outKeep, boolean overwrite, String histFile, long estUnique) { + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + if(verbose){System.err.println("Started cris");} + Thread th=new Thread(cris); + th.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + RTextOutputStream3 rosKeep=null; + if(outKeep!=null){ + final int buff=(!ordered ? 8 : Tools.max(16, 2*THREADS)); + + String out1=outKeep.replaceFirst("#", "1"); + String out2=null; + + if(cris.paired()){ + if(outKeep.contains("#")){ + out2=outKeep.replaceFirst("#", "2"); + }else{ + outstream.println("Writing interleaved."); + } + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)); + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))); + +// assert(false) : out1+", "+out2; + + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, "attachment", true, overwrite, ordered); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, "attachment", true, overwrite, ordered); + rosKeep=new RTextOutputStream3(ff1, ff2, buff, null, true); + } + + if(rosKeep!=null){ + rosKeep.start(); + outstream.println("Started output threads."); + } + + long bases=downsample(cris, kca, k, maxReads, rosKeep, histFile, overwrite, estUnique); + + ReadWrite.closeStreams(cris, rosKeep); + if(verbose){System.err.println("Closed streams");} + + return bases; + } + + + + public static long downsample(ConcurrentReadStreamInterface cris, KCountArray kca, int k, long maxReads, RTextOutputStream3 rosKeep, + String histFile, boolean overwrite, 
long estUnique) { + Timer tdetect=new Timer(); + tdetect.start(); + + long totalBases=0; + long totalReads=0; + long basesKept=0; + long readsKept=0; + long basesTossed=0; + long readsTossed=0; + +// assert(false) : THREADS; + ProcessThread[] pta=new ProcessThread[THREADS]; + for(int i=0; i1){ + histogram_total.addAndGet(1, histogram_total.get(0)); + histogram_total.set(0, 0); + } + +// outstream.println(); + tdetect.stop(); + outstream.println("Table read time: \t\t"+tdetect+" \t"+String.format("%.2f", totalBases*1000000.0/(tdetect.elapsed))+" kb/sec"); + + { + String pad=""; + String s=""+totalReads; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Total reads in: \t\t"+totalReads+pad+String.format("\t(%.3f%% Kept)", (readsKept*100.0/totalReads))); + s=""+totalBases; + while(pad.length()+s.length()<9){pad+=" ";} + outstream.println("Total bases in: \t\t"+totalBases+pad+String.format("\t(%.3f%% Kept)", (basesKept*100.0/totalBases))); + } +// outstream.println(); + if(histogram_total!=null){ + TextStreamWriter tswh=null; + StringBuilder sb=new StringBuilder(100); + if(USE_HISTOGRAM){ + tswh=new TextStreamWriter(histFile, overwrite, false, false); + tswh.start(); + tswh.print("#Depth\tRaw_Count\tUnique_Kmers\n"); + } + int lim=(int)(HIST_LEN_PRINT-1); + long remaining=Tools.sum(histogram_total); + long sumRaw1=0; + long sumRaw2=0; + long sum1=0; + long sum2=0; + long sumsquare=0; + for(int i=0; i0*/ || y>0){ + sb.append(i).append('\t'); + sb.append(x).append('\t'); + sb.append(y).append('\n'); + } + tswh.print(sb.toString()); + sb.setLength(0); + } + if(sumRaw1>=remaining){break;} //Stop once there is no more coverage, even if PRINT_ZERO_COVERAGE is not set. 
+ } + for(int i=lim; i0 || sum2>0){ + sb.append(lim).append('\t'); + sb.append(sumRaw2).append('\t'); + sb.append(sum2).append('\n'); + } + tswh.print(sb.toString()); + tswh.poison(); + tswh.waitForFinish(); + outstream.println("Wrote histogram to "+histFile); + } + + long histCount=Tools.sum(histogram_total); //Total number of kmers counted + long halfCount=(histCount+1)/2; + double histCountU=0; //Unique kmers counted + long temp1=0; + double temp2=0; + int median_all=-1; + int median_unique=-1; + for(int i=0; i=halfCount && median_all<0){median_all=i;} +// histSum+=(x*(double)i); + histCountU+=(x/(double)Tools.max(1, i)); + } + double halfCount2=(histCountU)/2; + for(int i=0; i=halfCount2 && median_unique<0){ + median_unique=i; + break; + } + } + if(median_all<0){median_all=0;} + double avg_all=sumsquare/(double)histCount; + double avg_unique=histCount/histCountU; + double stdev_unique=Tools.standardDeviationHistogramKmer(histogram_total); + double stdev_all=Tools.standardDeviationHistogram(histogram_total); + outstream.println("Total kmers counted: \t"+(sumRaw1+sumRaw2)); + + double uniqueC=((sum1+sum2)*100.0/(sumRaw1+sumRaw2)); + double uniqueE=((estUnique)*100.0/(sumRaw1+sumRaw2)); + double uniqueM=Tools.max(uniqueC, uniqueE); + outstream.println("Total unique kmer count: \t"+(sum1+sum2)); + if(CANONICAL){outstream.println("Includes forward kmers only.");} + outstream.println("The unique kmer estimate can be more accurate than the unique count, if the tables are very full."); + outstream.println("The most accurate value is the greater of the two."); + outstream.println(); + + outstream.println("Percent unique: \t"+(uniqueM<10 ? 
" " : "")+String.format("%.2f%%", uniqueM)); + + outstream.println("Depth average: \t"+String.format("%.2f\t(unique kmers)", avg_unique)); + outstream.println("Depth median: \t"+String.format("%d\t(unique kmers)", median_unique)); + outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(unique kmers)", stdev_unique)); + + outstream.println("\nDepth average: \t"+String.format("%.2f\t(all kmers)", avg_all)); + outstream.println("Depth median: \t"+String.format("%d\t(all kmers)", median_all)); + outstream.println("Depth standard deviation: \t"+String.format("%.2f\t(all kmers)", stdev_all)); + } + + return totalBases; + } + + + + /** + * Locates and fixes spikes in a coverage profile (potentially) caused by false positives in a bloom filter. + * Theory: If a high-count kmer is adjacent on both sides to low-count kmers, it may be a false positive. + * It could either be reduced to the max of the two flanking points or examined in more detail. + * @param array An array of kmer counts for adjacent kmers in a read. 
+ */ + private static void fixSpikes(int[] array){ + + for(int i=1; i1 && b>a && b>c){ + //peak + if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + //spike + array[i]=(int)Tools.max(a, c); + } + } + } + } + private static void fixSpikes(int[] array, long[] kmers, KCountArray kca, int k){ + if(array.length<3){return;} + if(array[1]-array[0]>1){ + array[0]=kca.readPrecise(kmers[0], k, CANONICAL); + } + if(array[array.length-1]-array[array.length-2]>1){ + array[array.length-1]=kca.readPrecise(kmers[array.length-1], k, CANONICAL); + } + + for(int i=1; i1){ + long a=Tools.max(1, array[i-1]); + long c=Tools.max(1, array[i+1]); + long key=kmers[i]; + + if(b>a && b>c){ + //peak + if(b<6 || b>a+1 || b>c+1){ + array[i]=kca.readPreciseMin(key, k, CANONICAL); + } + // if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + // //spike + // int b1=(int)((a+c)/2); + // int b2=kca.readLeft(key, k, CANONICAL); + // int b3=kca.readRight(key, k, CANONICAL); + // array[i]=Tools.min(b, b1, b2, b3); + // } + // else + // { + //// array[i]=kca.readPreciseMin(key, k, CANONICAL); + // } + } + // else + // if(Tools.max(ada, adc)>=Tools.max(2, Tools.min((int)a, b, (int)c)/4)){ + // array[i]=kca.readPrecise(key, k, CANONICAL); + // } + // else + // if(b>a+1 || b>c+1){ + // //steep + // array[i]=kca.readPrecise(key, k, CANONICAL); + // } + } + } + } + + + private static void analyzeSpikes(int[] array, int width){ + if(array.length<3){return;} + int peakcount=0, valleycount=0, spikecount=0, flatcount=0, slopecount=0; + for(int i=1; ia && b>c){ + peakcount++; + if((b>=2*a || b>a+2) && (b>=2*c || b>c+2)){ + spikecount++; + } + }else if(b0){peaks.addAndGet(peakcount);} + if(valleycount>0){valleys.addAndGet(valleycount);} + if(spikecount>0){spikes.addAndGet(spikecount);} + if(flatcount>0){flats.addAndGet(flatcount);} + if(slopecount>0){slopes.addAndGet(slopecount);} + } + + + /** + * @param r + * @param kca + * @return + */ + public static int[] generateCoverage(Read r, KCountArray kca, int k, int[] out, long[] 
kmers) { + if(k>31){return generateCoverageLong(r, kca, k, out);} + if(kca.gap>0){throw new RuntimeException("Gapped reads: TODO");} + if(r==null || r.bases==null || r.bases.length=k){ + // int count=kca.readPrecise(kmer, k, CANONICAL); + int count=kca.read(kmer, k, CANONICAL); + out[i-k+1]=count; + if(kmers!=null){kmers[i-k+1]=kmer;} + } + } + } + + if(FIX_SPIKES){fixSpikes(out, kmers, kca, k);} +// fixSpikes(out, 1); + + analyzeSpikes(out, 1); + return out; + } + + + + /** + * @param r + * @param kca + * @return + */ + public static int[] generateCoverageLong(Read r, KCountArray kca, int k, int[] out) { + assert(k>31); + if(kca.gap>0){throw new RuntimeException();} + if(r==null || r.bases==null || r.bases.lengthk){ + long x2=AminoAcid.baseToNumber[bases[i-k]]; + kmer=kmer^(x2<=k){ + int count=kca.read(kmer); + out[i-k+1]=count; + } + } + } + + fixSpikes(out); + + analyzeSpikes(out, 1); + return out; + } + + + private static class ProcessThread extends Thread{ + + ProcessThread(ConcurrentReadStreamInterface cris_, KCountArray kca_, int k_, RTextOutputStream3 rosk_){ + cris=cris_; + kca=kca_; + k=k_; + rosk=rosk_; + } + + public void run(){ + countInThread(); + } + + void countInThread() { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + final ArrayList keep=new ArrayList(Shared.READ_BUFFER_LENGTH); + + int[] cov1=null; + long[] kmers1=null; + + while(reads!=null && reads.size()>0){ + for(int rnum=0; rnum=k){ + if(verbose){outstream.println();} + if(FIX_SPIKES && k<32){ + final int arraylen=r.bases.length-k+1; + if(kmers1==null || kmers1.length!=arraylen){kmers1=new long[arraylen];} + kmers=kmers1; + } + cov=getSortedCoverageAndIncrementHistogram(r, cov1, kmers1); + if(cov!=null){; + int i=cov.length-1; + while(i>=0 && cov[i]=MIN_KMERS_OVER_MIN_DEPTH){depth=cov[(int)(i*(1-DEPTH_PERCENTILE))];} + cov1=cov; + min=cov[cov.length-1]; + max=cov[(int)(cov.length*0.05f)]; + } + } + } + + + totalReads+=readcount; + totalBases+=basecount; + if(max>TARGET_DEPTH && max>2*min){ + readsKept+=readcount; + basesKept+=basecount; + StringBuilder sb=new StringBuilder(); + sb.append(cov[0]); + for(int i=1; i=k) : r; + cov=generateCoverage(r, kca, k, cov, kmers); + if(cov!=null){ + Arrays.sort(cov); + Tools.reverseInPlace(cov); + incrementHistogramSorted(cov); + } + return cov; + } + + private final void incrementHistogramSorted(int[] cov){ + if(hist==null || cov==null || cov.length==0){return;} + +// outstream.println(Arrays.toString(cov)); + + int last=cov[0]; + long sum=0; +// long sum2=0; + for(int x : cov){ +// outstream.println("Processing "+x); + if(x<0){break;} + int y=Tools.min(x, HIST_LEN-1); + if(y==last){sum++;} + else if(sum>0){ +// outstream.println("Incrementing "+last+" by "+sum); +// sum2+=sum; + if(last0){ +// outstream.println("Incrementing "+last+" by "+sum); +// sum2+=sum; + if(last1 ? 
split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null") || a.equals(in2)){ + // do nothing + }else if(a.equals("passes")){ + assert(false) : "'passes' is disabled."; +// passes=Integer.parseInt(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + ByteFile2.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + ConcurrentGenericReadInputStream.verbose=verbose; +// align2.FastaReadInputStream2.verbose=verbose; + stream.FastqReadInputStream.verbose=verbose; + ReadWrite.verbose=verbose; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("t") || a.equals("threads")){ + Shared.THREADS=Tools.max(Integer.parseInt(b), 1); + }else if(a.equals("build") || a.equals("genome")){ + Data.setGenome(Integer.parseInt(b)); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || 
a.equals("input1")){ + in1=b; + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){ + out1=b; + }else if(a.equals("out2") || a.equals("output2")){ + out2=b; + }else if(a.equals("extin")){ + extin=b; + }else if(a.equals("extout")){ + extout=b; + }else if(a.equals("trd") || a.equals("trc") || a.equals("trimreaddescription")){ + Shared.TRIM_READ_COMMENTS=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("fastareadlen") || a.equals("fastareadlength")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.equals("fastaminread") || a.equals("fastaminlen") || a.equals("fastaminlength")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else 
if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + setInterleaved=true; + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + setInterleaved=true; + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + setInterleaved=true; + } + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + in1=arg; + if(arg.indexOf('#')>-1 && !new File(arg).exists()){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else if(out1==null && i==1 && !arg.contains("=")){ + out1=arg; + }else{ + System.err.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + + if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ + in2=in1.replace("#", "2"); + in1=in1.replace("#", "1"); + } + if(out1!=null && out2==null && out1.indexOf('#')>-1){ + out2=out1.replace("#", "2"); + out1=out1.replace("#", "1"); + } + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + assert(FastaReadInputStream.settingsOK()); + + if(in1==null){ + 
printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.THREADS>2){ +// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;} + ByteFile.FORCE_MODE_BF2=true; + } + + if(out1==null){ + if(out2!=null){ + printOptions(); + throw new RuntimeException("Error - cannot define out2 without defining out1."); + } + //out1="stdout"; + System.err.println("Warning: output destination not set; producing no output. To print to standard out, set 'out=stdout.fq'"); + } + + if(!setInterleaved){ + assert(in1!=null && out1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n"; + if(in2!=null){ //If there are 2 input streams. + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else{ //There is one input stream. + if(out2!=null){ + FASTQ.FORCE_INTERLEAVED=true; + FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + } + } + + if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} + if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;} + + if(!Tools.testOutputFiles(overwrite, false, out1, out2)){ + throw new RuntimeException("\n\nOVERWRITE="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n"); + } + + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + + ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, false); + ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, false); + + ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); + ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); + + /* Check 
for output file collisions */ + Tools.testOutputFiles(overwrite, false, out1, out2); + + k1=k1_; + assert(k1>=-1 && k1<=15) : "k1 must lie between 1 and 15, inclusive (0 to disable)"; + k2=k2_; + assert(k2>=-1 && k2<=6) : "k2 must lie between 1 and 6, inclusive (0 to disable)"; + + arraylen1=(k1>0 ? maxCanonicalKmer(k1)+1 : 0); + arraylen2=(k2>0 ? maxCanonicalKmer(k2)+1 : 0); + } + + /*--------------------------------------------------------------*/ + + /** TODO */ + public static void printOptions(){ + System.err.println("Usage information unavailable"); + } + + void process(Timer t){ + + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, colorspace, false, ffin1, ffin2, null, null); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Input is "+(paired ? "paired" : "unpaired"));} + + RTextOutputStream3 ros=null; + if(out1!=null){ + final int buff=4; + + if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){ + outstream.println("Writing interleaved."); + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name."; + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "out1 and out2 have same name."; + + ros=new RTextOutputStream3(ffout1, ffout2, null, null, buff, null, false); + ros.start(); + } + + long readsProcessed=0; + long basesProcessed=0; + + { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + +// System.err.println("Fetched "+reads); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); + } + + while(reads!=null && reads.size()>0){ + + for(int idx=0; idxclusterList.size()){ + synchronized(clusterList){ + clusterList.ensureCapacity(2*x); + for(int i=clusterList.size(); i alct=new ArrayList(THREADS); + + for(int i=0; i alct=new ArrayList(THREADS); + for(int i=0; i>>2)|(x2<bestScore1){ + bestCluster1=c; + bestScore1=score1; + bestScore1_2=score2; + } + if(bestCluster2==null || score2>bestScore2){ + bestCluster2=c; + bestScore2=score2; + bestScore2_1=score1; + } + } + + if(r2==null){ + rt1.cluster1=bestCluster1.id; + }else if(bestCluster1==bestCluster2){ + rt1.cluster1=rt2.cluster1=bestCluster1.id; + }else{ + assert(r1!=null && r2!=null && bestCluster1!=bestCluster2); + + float a=bestScore1+bestScore1_2; + float b=bestScore2+bestScore2_1; + + if(ambigMode==AMBIG_MODE_BEST){ + if(a>=b){ + rt1.cluster1=rt2.cluster1=bestCluster1.id; + }else{ + rt1.cluster1=rt2.cluster1=bestCluster2.id; + } + }else if(ambigMode==AMBIG_MODE_BOTH){ + assert(false) : "TODO"; + }else if(ambigMode==AMBIG_MODE_TOSS){ + rt1.cluster1=rt2.cluster1=-1; + }else if(ambigMode==AMBIG_MODE_RAND){ + if(a<0 || b<0){ + float c=0-(Tools.min(a, b))*1.5f; + a=a+c; + b=a+c; + } + float coin=randy.nextFloat()*(a+b); + if(coin<=a){ + rt1.cluster1=rt2.cluster1=bestCluster1.id; + }else{ + rt1.cluster1=rt2.cluster1=bestCluster2.id; + } + } + } + + } + + final int id; + final int clusterMode; + final int ambigMode; + final ConcurrentReadStreamInterface cris; + + final ThreadLocalRandom randy; + + long readsInT; + long basesInT; + + } + + /*--------------------------------------------------------------*/ + + private class ReadTag{ + + public ReadTag(Read r_){ + r=r_; + strand=r.strand(); + + int gcCount_=0; + for(byte b : r.bases){ + if(b=='G' || b=='C'){ + gcCount_++; + } + } + gcCount=gcCount_; + + 
processHeader(r.id); + } + + private void processHeader(String s){ + assert(false) : "TODO"; + gc=-1; + depth=-1; + cluster0=-1; + } + + Read r1(){ + return strand==0 ? r : r.mate; + } + + Read r2(){ + return strand==1 ? r : r.mate; + } + + ReadTag tag1(){ + return (ReadTag)r1().obj; + } + + ReadTag tag2(){ + Read r2=r2(); + return r2==null ? null : (ReadTag)r2.obj; + } + +// private int[] toKmers(final int k){ +// return ReclusterByKmer.toKmers(r.bases, null, k); +// } + + int[] kmerArray1(){ + if(kmerArray1==null){kmerArray1=ReclusterByKmer.toKmers(r.bases, null, k1);} + return kmerArray1; + } + + int[] kmerArray2(){ + if(kmerArray2==null){kmerArray2=ReclusterByKmer.toKmerCounts(r.bases, null, k2);} + return kmerArray2; + } + + float[] kmerFreq2(){ + if(kmerFreq2==null){ + int[] counts=kmerArray2(); + if(counts!=null){ + long sum=Tools.sum(counts); + kmerFreq2=new float[counts.length]; + float extra=(0.05f/counts.length); + float mult=0.95f/sum; + for(int i=0; i0){ + long kmerCount=0; + for(int i=0; i0){ + long kmerCount=0; + for(int i=0; i0){ + int[] kmers=rt.kmerArray1(); + int kmer=-1, run=0; + for(int i=0; i0){kmerArray1.addAndGet(kmer, run);} + kmer=x; + run=1; + } + } + if(run>0){kmerArray1.addAndGet(kmer, run);} + } + + if(k2>0){ + int[] kmers=rt.kmerArray2(); + for(int kmer=0; kmer0){kmerArray2.addAndGet(kmer, x);} + } + } + } + + /** + * @param r1 + * @return + */ + public float score(Read r) { + if(r==null){return 0;} + return r.mate==null ? 
scoreSingle(r) : scorePaired(r); + } + + /** + * @param r1 + * @return + */ + public float scoreSingle(Read r) { + if(r==null){return 0;} + ReadTag rt=(ReadTag)r.obj; + + assert(false) : "TODO"; + float depthScore=scoreDepthSingle(rt); + float gcScore=scoreGcSingle(rt); + float kmerScore=scoreKmer1(rt); + assert(false); + float depthWeight=.2f; + float gcWeight=.2f; + float kmerWeight=.6f; + + return depthWeight*depthScore+gcWeight*gcScore+kmerWeight*kmerScore; + } + + /** + * @param rt + * @return + */ + private float scoreKmer1(ReadTag rt) { + int[] kmers=rt.kmerArray1(); + + float score=0; + if(scoreMode1==SCORE_MODE_AND){ + float f=andCount(kmers, kmerArray1); + assert(false); + }else if(scoreMode1==SCORE_MODE_MULT){ + float f=innerProduct(kmers, kmerProbArray1); + assert(false); + }else{ + throw new RuntimeException(""+scoreMode1); + } + + return score; + } + + /** + * @param rt + * @return + */ + private float scoreKmer2(ReadTag rt) { + int[] kmers=rt.kmerArray2(); + float[] probs=rt.kmerFreq2(); + + float score=0; + if(scoreMode2==SCORE_MODE_AND){ + float f=andCount(kmers, kmerArray2); + assert(false); + }else if(scoreMode2==SCORE_MODE_MULT){ + float f=innerProduct(kmers, kmerProbArray2); + assert(false); + }if(scoreMode2==SCORE_MODE_DIF){ + float f=absDif(probs, kmerProbArray2); + assert(false); + }else if(scoreMode2==SCORE_MODE_RMS){ + float f=rmsDif(probs, kmerProbArray2); + assert(false); + }else if(scoreMode2==SCORE_MODE_KS){ + float f=ksFunction(probs, kmerProbArray2); + assert(false); + }else{ + throw new RuntimeException(""+scoreMode2); + } + + return score; + } + + /** + * @param rt + * @return + */ + private float scoreGcSingle(ReadTag rt) { + assert(false) : "TODO"; + // TODO Auto-generated method stub + return 0; + } + + /** + * @param rt + * @return + */ + private float scoreDepthSingle(ReadTag rt) { + assert(false) : "TODO"; + // TODO Auto-generated method stub + return 0; + } + + /** + * @param r1 + * @return + */ + public float 
scorePaired(Read r) { + assert(false) : "TODO"; + if(r==null){return 0;} + ReadTag rt=(ReadTag)r.obj; + +// ReadTag rt1=rt.r + + return 0; + } + + public final int id; + + public float gc; + public int depth1, depth2; + + final AtomicLongArray kmerArray1; + final float[] kmerProbArray1; + + final AtomicLongArray kmerArray2; + final float[] kmerProbArray2; + + final AtomicLong depthsum1=new AtomicLong(0); + final AtomicLong depthsum2=new AtomicLong(0); + + final AtomicLong readCount=new AtomicLong(0); + final AtomicLong baseCount=new AtomicLong(0); +// final AtomicLong kmerCount=new AtomicLong(0); + final AtomicLong gcCount=new AtomicLong(0); + } + + /*--------------------------------------------------------------*/ + + /*--------------------------------------------------------------*/ + + public boolean errorState=false; + + private final ArrayList clusterList=new ArrayList(256); + + /** 'big' kmer */ + public final int k1; + /** 'small' kmer */ + public final int k2; + + public final int arraylen1; + public final int arraylen2; + + private String in1=null; + private String in2=null; + + private String out1=null; + private String out2=null; + + private String extin=null; + private String extout=null; + + private boolean overwrite=false; + private boolean colorspace=false; + + private long maxReads=-1; + + private byte qin=-1; + private byte qout=-1; + + private int scoreMode1=SCORE_MODE_MULT; + private int scoreMode2=SCORE_MODE_RMS; + private int ambigMode=AMBIG_MODE_RAND; + + private final FileFormat ffin1; + private final FileFormat ffin2; + + private final FileFormat ffout1; + private final FileFormat ffout2; + + private PrintStream outstream=System.err; + + private int THREADS=Shared.THREADS; + + /*--------------------------------------------------------------*/ + + public static boolean verbose=false; + + public static final int CLUSTER_MODE_CREATE=0; + public static final int CLUSTER_MODE_RECLUSTER=1; + public static final int CLUSTER_MODE_REFINE=2; + + 
public static final int SCORE_MODE_DIF=0; + public static final int SCORE_MODE_RMS=1; + public static final int SCORE_MODE_AND=2; + public static final int SCORE_MODE_MULT=3; + public static final int SCORE_MODE_KS=4; + + public static final int AMBIG_MODE_BEST=0; + public static final int AMBIG_MODE_BOTH=1; + public static final int AMBIG_MODE_TOSS=2; + public static final int AMBIG_MODE_RAND=3; + +} diff --git a/current/jgi/RedirectTest.java b/current/jgi/RedirectTest.java new file mode 100755 index 0000000..69f98ec --- /dev/null +++ b/current/jgi/RedirectTest.java @@ -0,0 +1,78 @@ +package jgi; + +import java.io.IOException; +import java.io.InputStream; +import java.io.OutputStream; + +import dna.Data; + +import fileIO.PipeThread; +import fileIO.ReadWrite; + +/** + * @author Brian Bushnell + * @date Jan 22, 2013 + * + */ +public class RedirectTest { + + public static void main(String[] args) throws IOException{ + + String fin=args[0]; +// String fout=args[1]; + + System.out.println("fin="+fin); + + InputStream in=null; + final OutputStream os=System.out; + InputStream es=null; + Process p=null; + + System.out.println("Samtools="+Data.SAMTOOLS()); + System.out.println("Gzip="+Data.GZIP()); + System.out.println("Pigz="+Data.PIGZ()); + System.out.println("Gunzip="+Data.GUNZIP()); + + if(Data.WINDOWS){ + System.out.println("WINDOWS"); + in=ReadWrite.getInputStream(fin, false, false); + }else{ + System.out.println("LINUX"); + p=Runtime.getRuntime().exec("gunzip -c -d "+fin); + in=p.getInputStream(); + es=p.getErrorStream(); + assert(es!=null); + PipeThread et=new PipeThread(es, System.err); + et.start(); + System.out.println(p); + } + + final byte[] buf=new byte[4096]; + for(int len=in.read(buf); len>0; len=in.read(buf)){ + os.write(buf, 0, len); + } + + in.close(); + if(es!=null){es.close();} + ReadWrite.close(os); + + } + + public static void main_0(String[] args) throws IOException{ + + String fin=args[0]; + String fout=args[1]; + + InputStream 
in=ReadWrite.getInputStream(fin, false, false); + + OutputStream os=System.out; + + byte[] buf=new byte[4096]; + + for(int len=in.read(buf); len>0; len=in.read(buf)){ + os.write(buf, 0, len); + } + + } + +} diff --git a/current/jgi/ReformatFasta.java b/current/jgi/ReformatFasta.java new file mode 100755 index 0000000..38d6e29 --- /dev/null +++ b/current/jgi/ReformatFasta.java @@ -0,0 +1,107 @@ +package jgi; + +import align2.Tools; +import dna.Timer; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Nov 1, 2012 + * + */ +public class ReformatFasta { + + public static void main(String[] args){ + Timer t=new Timer(); + t.start(); + + String in=args[0]; + String out=args[1]; + int minLen=0; + int maxLen=Integer.MAX_VALUE; + boolean rename=false; + if(args.length>2){ + minLen=Integer.parseInt(args[2]); + } + if(args.length>3){ + maxLen=Integer.parseInt(args[3]); + } + if(args.length>4){ + rename=Tools.parseBoolean(args[4]); + } + + if(in.equalsIgnoreCase(out)){throw new RuntimeException("in == out");} + + TextFile tf=new TextFile(in, false, false); + TextStreamWriter tsw=new TextStreamWriter(out, true, false, false); + tsw.start(); + + long kept=0; + long dropped=0; + + String header=null; + StringBuilder sb=new StringBuilder(100); + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)=='>'){ + if(header!=null){ + if(sb.length()>=minLen){ + if(rename){ + tsw.println(">"+(kept+1)); + }else{ + tsw.println(header); + } +// tsw.println(sb); + printAsLines(sb, tsw, maxLen); + kept++; + }else{ + dropped++; + } + } + header=s; + sb=new StringBuilder(100); + }else{ + sb.append(s); + } + } + if(header!=null){ + if(sb.length()>=minLen){ + if(rename){ + tsw.println(">"+(kept+1)); + }else{ + tsw.println(header); + } +// tsw.println(sb); + printAsLines(sb, tsw, maxLen); + kept++; + }else{ + dropped++; + } + } + tsw.poison(); + t.stop(); + System.out.println("Time: \t"+t); + System.out.println("Kept "+kept+" 
reads."); + System.out.println("Dropped "+dropped+" reads."); + + } + + private static void printAsLines(final CharSequence sb, final TextStreamWriter tsw, final int max){ + + final int len=sb.length(); + + if(len<=max){ + tsw.println(sb); + }else{ + for(int i=0; i1 ? split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null") || a.equals(in2)){ + // do nothing + }else if(a.equals("passes")){ + assert(false) : "'passes' is disabled."; +// passes=Integer.parseInt(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + ByteFile1.verbose=verbose; + ByteFile2.verbose=verbose; + stream.FastaReadInputStream.verbose=verbose; + ConcurrentGenericReadInputStream.verbose=verbose; +// align2.FastaReadInputStream2.verbose=verbose; + stream.FastqReadInputStream.verbose=verbose; + ReadWrite.verbose=verbose; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("t") || a.equals("threads")){ + Shared.THREADS=Tools.max(Integer.parseInt(b), 1); + }else if(a.equals("sample") || a.equals("samplereads") || a.equals("samplereadstarget") || a.equals("srt")){ + sampleReadsTarget=Long.parseLong(b); + 
sampleReadsExact=(sampleReadsTarget>0); + }else if(a.equals("samplebases") || a.equals("samplebasestarget") || a.equals("sbt")){ + sampleBasesTarget=Long.parseLong(b); + sampleBasesExact=(sampleBasesTarget>0); + }else if(a.equals("build") || a.equals("genome")){ + Data.setGenome(Integer.parseInt(b)); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("out") || a.equals("output") || a.equals("out1") || a.equals("output1")){ + out1=b; + }else if(a.equals("out2") || a.equals("output2")){ + out2=b; + }else if(a.equals("qfin") || a.equals("qfin1")){ + qfin1=b; + }else if(a.equals("qfout") || a.equals("qfout1")){ + qfout1=b; + }else if(a.equals("qfin2")){ + qfin2=b; + }else if(a.equals("qfout2")){ + qfout2=b; + }else if(a.equals("extin")){ + extin=b; + }else if(a.equals("extout")){ + extout=b; + }else if(a.equals("trd") || a.equals("trc") || a.equals("trimreaddescription")){ + Shared.TRIM_READ_COMMENTS=Tools.parseBoolean(b); + }else if(a.equals("parsecustom")){ + parsecustom=Tools.parseBoolean(b); + }else if(a.equals("testsize")){ + testsize=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("fastareadlen") || a.equals("fastareadlength")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.equals("fastaminread") || a.equals("fastaminlen") || a.equals("fastaminlength")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + 
FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ascii") || a.equals("quality") || a.equals("qual")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qin=qout=x; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY=true;} + else{x=(byte)Integer.parseInt(b);} + qin=x; + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + byte x; + if(b.equalsIgnoreCase("sanger")){x=33;} + else if(b.equalsIgnoreCase("illumina")){x=64;} + else if(b.equalsIgnoreCase("auto")){x=-1;FASTQ.DETECT_QUALITY_OUT=true;} + else{x=(byte)Integer.parseInt(b);} + qout=x; + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("tuc") || a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("tossbrokenreads") || a.equals("tbr")){ + boolean x=Tools.parseBoolean(b); + Read.NULLIFY_BROKEN_QUALITY=x; + ConcurrentGenericReadInputStream.REMOVE_DISCARDED_READS=x; + }else if(a.equals("verifyinterleaved") || a.equals("verifyinterleaving") || a.equals("vint")){ + verifyinterleaving=Tools.parseBoolean(b); + }else if(a.equals("testinterleaved")){ + FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set TEST_INTERLEAVED to "+FASTQ.TEST_INTERLEAVED); + setInterleaved=true; + }else if(a.equals("forceinterleaved")){ + FASTQ.FORCE_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set FORCE_INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + setInterleaved=true; + }else if(a.equals("interleaved") || a.equals("int")){ + 
if("auto".equalsIgnoreCase(b)){FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true);} + else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + setInterleaved=true; + } + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("rcompmate") || a.equals("rcm")){ + RCOMPMATE=Tools.parseBoolean(b); + outstream.println("Set RCOMPMATE to "+RCOMPMATE); + }else if(a.equals("samplerate")){ + samplerate=Float.parseFloat(b); + assert(samplerate<=1f && samplerate>=0f) : "samplerate="+samplerate+"; should be between 0 and 1"; + }else if(a.equals("sampleseed")){ + sampleseed=Long.parseLong(b); + }else if(a.startsWith("minscaf") || a.startsWith("mincontig")){ + int x=Integer.parseInt(b); + stream.FastaReadInputStream.MIN_READ_LEN=(x>0 ? x : Integer.MAX_VALUE); + }else if(a.equals("samversion") || a.equals("samv") || a.equals("sam")){ + SamLine.VERSION=Float.parseFloat(b); + }else if(a.equals("mdtag") || a.equals("md")){ + SamLine.MAKE_MD_TAG=Tools.parseBoolean(b); + }else if(a.equals("xstag") || a.equals("xs")){ + SamLine.MAKE_XS_TAG=true; + if(b!=null){ + b=b.toLowerCase(); + if(b.startsWith("fr-")){b=b.substring(3);} + if(b.equals("ss") || b.equals("secondstrand")){ + SamLine.XS_SECONDSTRAND=true; + }else if(b.equals("fs") || b.equals("firststrand")){ + SamLine.XS_SECONDSTRAND=false; + }else if(b.equals("us") || b.equals("unstranded")){ + SamLine.XS_SECONDSTRAND=false; + }else{ + SamLine.MAKE_XS_TAG=Tools.parseBoolean(b); + } + } + setxs=true; + }else if(a.equals("intronlen") || a.equals("intronlength")){ + SamLine.INTRON_LIMIT=Integer.parseInt(b); + setintron=true; + }else if(a.equals("idtag")){ + SamLine.MAKE_IDENTITY_TAG=Tools.parseBoolean(b); + }else if(a.equals("xmtag") || a.equals("xm")){ + SamLine.MAKE_XM_TAG=Tools.parseBoolean(b); + }else if(a.equals("stoptag")){ + SamLine.MAKE_STOP_TAG=Tools.parseBoolean(b); + }else 
if(a.equals("ftl") || a.equals("forcetrimleft")){ + forceTrimLeft=Integer.parseInt(b); + }else if(a.equals("ftr") || a.equals("forcetrimright")){ + forceTrimRight=Integer.parseInt(b); +// }else if(a.equals("ftlb") || a.equals("forcetrimleftby")){ +// forceTrimLeftBy=Integer.parseInt(b); +// }else if(a.equals("ftrb") || a.equals("forcetrimrightby")){ +// forceTrimRightBy=Integer.parseInt(b); + }else if(a.equals("trim") || a.equals("qtrim")){ + if(b==null){trimRight_=trimLeft_=true;} + else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){trimLeft_=true;trimRight_=false;} + else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){trimLeft_=false;trimRight_=true;} + else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){trimLeft_=trimRight_=true;} + else{trimRight_=trimLeft_=Tools.parseBoolean(b);} + }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){ + if(b!=null && (b.charAt(0)=='.' || Character.isDigit(b.charAt(0)))){ + TrimRead.optimalMode=true; + TrimRead.optimalBias=Float.parseFloat(b); + assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("trimright") || a.equals("qtrimright")){ + trimRight_=Tools.parseBoolean(b); + }else if(a.equals("trimleft") || a.equals("qtrimleft")){ + trimLeft_=Tools.parseBoolean(b); + }else if(a.equals("trimq") || a.equals("trimquality")){ + trimq=Byte.parseByte(b); + }else if(a.equals("q102matrix") || a.equals("q102m")){ + CalcTrueQuality.q102matrix=b; + }else if(a.equals("qbpmatrix") || a.equals("bqpm")){ + CalcTrueQuality.qbpmatrix=b; + }else if(a.equals("adjustquality") || a.equals("adjq")){ + TrimRead.ADJUST_QUALITY=Tools.parseBoolean(b); + }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength")){ + minReadLength=Integer.parseInt(b); + }else if(a.equals("mlf") || a.equals("minlenfrac") || a.equals("minlenfraction") || a.equals("minlengthfraction")){ + 
minLenFraction=Float.parseFloat(b); + }else if(a.equals("minavgquality") || a.equals("maq")){ + minAvgQuality=Byte.parseByte(b); + }else if(in1==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + in1=arg; + if(arg.indexOf('#')>-1 && !new File(arg).exists()){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else if(out1==null && i==1 && !arg.contains("=")){ + out1=arg; + }else{ + System.err.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(TrimRead.ADJUST_QUALITY){CalcTrueQuality.initializeMatrices();} + + if(setxs && !setintron){SamLine.INTRON_LIMIT=10;} + qtrim=trimLeft_||trimRight_; + + if(verifyinterleaving){ + setInterleaved=true; +// if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true; + } + if(fixinterleaving){ + setInterleaved=true; + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true; //? 
Not sure which way to go + throw new RuntimeException("fixinterleaving: TODO"); + } + + if(in1!=null && in2==null && in1.indexOf('#')>-1 && !new File(in1).exists()){ + in2=in1.replace("#", "2"); + in1=in1.replace("#", "1"); + } + if(out1!=null && out2==null && out1.indexOf('#')>-1){ + out2=out1.replace("#", "2"); + out1=out1.replace("#", "1"); + } + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + assert(FastaReadInputStream.settingsOK()); +// if(maxReads!=-1){ReadWrite.USE_GUNZIP=ReadWrite.USE_UNPIGZ=false;} + + if(in1==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + if(!ByteFile.FORCE_MODE_BF1 && !ByteFile.FORCE_MODE_BF2 && Shared.THREADS>2){ +// if(ReadWrite.isCompressed(in1)){ByteFile.FORCE_MODE_BF2=true;} + ByteFile.FORCE_MODE_BF2=true; + } + + if(out1==null){ + if(out2!=null){ + printOptions(); + throw new RuntimeException("Error - cannot define out2 without defining out1."); + } + out1="stdout"; + } + + if(!setInterleaved){ + assert(in1!=null && out1!=null) : "\nin1="+in1+"\nin2="+in2+"\nout1="+out1+"\nout2="+out2+"\n"; + if(in2!=null){ //If there are 2 input streams. + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + }else{ //There is one input stream. 
+ if(out2!=null){ + FASTQ.FORCE_INTERLEAVED=true; + FASTQ.TEST_INTERLEAVED=false; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + } + } + + if(out1!=null && out1.equalsIgnoreCase("null")){out1=null;} + if(out2!=null && out2.equalsIgnoreCase("null")){out2=null;} + + if(!Tools.testOutputFiles(overwrite, false, out1, out2)){ + throw new RuntimeException("\n\nOVERWRITE="+overwrite+"; Can't write to output files "+out1+", "+out2+"\n"); + } + + FASTQ.PARSE_CUSTOM=parsecustom; + + + if(qin!=-1 && qout!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY=false; + }else if(qin!=-1){ + FASTQ.ASCII_OFFSET=qin; + FASTQ.DETECT_QUALITY=false; + }else if(qout!=-1){ + FASTQ.ASCII_OFFSET_OUT=qout; + FASTQ.DETECT_QUALITY_OUT=false; + } + +// assert(false) : "qin="+qin+", qout="+qout+", FASTQ.ASCII_OFFSET="+FASTQ.ASCII_OFFSET+"\n"+ +// ", FASTQ.ASCII_OFFSET_OUT="+FASTQ.ASCII_OFFSET_OUT+", FASTQ.DETECT_QUALITY="+FASTQ.DETECT_QUALITY; + + ffout1=FileFormat.testOutput(out1, FileFormat.FASTQ, extout, true, overwrite, false); + ffout2=FileFormat.testOutput(out2, FileFormat.FASTQ, extout, true, overwrite, false); +// extsOut=(out1==null ? null : FileFormat.testFormat(out1, false)); +// outsam=(extsOut!=null && (extsOut[0]==FileFormat.SAM || extsOut[0]==FileFormat.BAM)); +// outbread=(extsOut!=null && extsOut[0]==FileFormat.BREAD); + + ffin1=FileFormat.testInput(in1, FileFormat.FASTQ, extin, true, true); + ffin2=FileFormat.testInput(in2, FileFormat.FASTQ, extin, true, true); + +// System.err.println("\n"+ReadWrite.USE_PIGZ+", "+ReadWrite.USE_UNPIGZ+", "+Data.PIGZ()+", "+Data.UNPIGZ()+", "+ffin1+"\n"); +// assert(false) : ReadWrite.USE_PIGZ+", "+ReadWrite.USE_UNPIGZ+", "+Data.PIGZ()+", "+Data.UNPIGZ()+", "+ffin1; + +// extsIn=(in1==null ? 
null : FileFormat.testFormat(in1, false)); +// insam=(extsIn!=null && (extsIn[0]==FileFormat.SAM || extsIn[0]==FileFormat.BAM)); + + if(ffin1!=null && ffout1!=null && ffin1.samOrBam()){ + if(ffout1.samOrBam()){ + useSharedHeader=true; + SamLine.CONVERT_CIGAR_TO_MATCH=true; + }else if(ffout1.bread()){ + SamLine.CONVERT_CIGAR_TO_MATCH=true; + } + } + } + + void process(Timer t){ + + long readsRemaining=0; + long basesRemaining=0; + + if(sampleReadsExact || sampleBasesExact){ + long[] counts=countReads(in1, in2, maxReads); + readsRemaining=counts[0]; + basesRemaining=counts[2]; + setSampleSeed(sampleseed); + } + + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { +// FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); +// FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, colorspace, useSharedHeader, ffin1, ffin2, qfin1, qfin2); + cris.setSampleRate(samplerate, sampleseed); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Input is "+(paired ? 
"paired" : "unpaired"));} + + RTextOutputStream3 ros=null; + if(out1!=null){ + final int buff=4; + +// if(!fq && !fa && !bread && !sam){ +// outstream.println("Unspecified output format; defaulting to uncompressed fastq."); +// fq=true; +// } + + if(cris.paired() && out2==null && (in1==null || !in1.contains(".sam"))){ + outstream.println("Writing interleaved."); + } + + assert(!out1.equalsIgnoreCase(in1) && !out1.equalsIgnoreCase(in1)) : "Input file and output file have same name."; + assert(out2==null || (!out2.equalsIgnoreCase(in1) && !out2.equalsIgnoreCase(in2))) : "out1 and out2 have same name."; + +// System.err.println("Calling RTextOutputStream3 with out1="+out1+", out2="+out2+", qfout1="+qfout1+", qfout2="+qfout2); + ros=new RTextOutputStream3(ffout1, ffout2, qfout1, qfout2, buff, null, useSharedHeader); + ros.start(); + } + + long readsProcessed=0; + long basesProcessed=0; + + long basesTrimmedT=0; + long readsTrimmedT=0; + + long lowqBasesT=0; + long lowqReadsT=0; + + long readShortDiscardsT=0; + long baseShortDiscardsT=0; + +// for(int pass=1; pass<=passes; pass++){ +//// outstream.println("pass="+pass); +// if(pass>1){ +// cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, colorspace, useSharedHeader, true, in1, in2); +// cris.setSampleRate(samplerate, sampleseed); +// cristhread=new Thread(cris); +// cristhread.start(); +// } + { + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + +// System.err.println("Fetched "+reads); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert((ffin1==null || ffin1.samOrBam()) || (r.mate!=null)==cris.paired()); + } + + while(reads!=null && reads.size()>0){ + + for(int idx=0; idx0){ + final int a=(r1==null ? 50 : r1.avgQuality()); + final int b=(r2==null ? 50 : r2.avgQuality()); + remove=(remove || a0 || forceTrimRight>0)){ + if(r1!=null){ + int x=TrimRead.trimToPosition(r1, forceTrimLeft>0 ? forceTrimLeft : 0, forceTrimRight>0 ? 
forceTrimRight : r1.bases.length, 1); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + if(r2!=null){ + int x=TrimRead.trimToPosition(r2, forceTrimLeft>0 ? forceTrimLeft : 0, forceTrimRight>0 ? forceTrimRight : r2.bases.length, 1); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + } + + if(qtrim && !remove){ + if(r1!=null){ + int x=TrimRead.trimFast(r1, trimLeft_, trimRight_, trimq, 1); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + if(r2!=null){ + int x=TrimRead.trimFast(r2, trimLeft_, trimRight_, trimq, 1); + basesTrimmedT+=x; + readsTrimmedT+=(x>0 ? 1 : 0); + } + } + + if((minlen1>0 || minlen2>0) && !remove){ + int rlen=(r1==null || r1.bases==null ? 0 : r1.bases.length); + int rlen2=(r2==null || r2.bases==null ? 0 : r2.bases.length); + if(rlen reads2=new ArrayList(); + if(sampleReadsExact){ + for(Read r : reads){ + if(r!=null){ + assert(readsRemaining>0) : readsRemaining; + double prob=sampleReadsTarget/(double)(readsRemaining); +// System.err.println("sampleReadsTarget="+sampleReadsTarget+", readsRemaining="+readsRemaining+", prob="+prob); + if(randy.nextDouble()0) : basesRemaining; + int bases=r.bases.length+(r.mate==null ? 
0 : r.mate.bases.length); + double prob=sampleBasesTarget/(double)(basesRemaining); + if(randy.nextDouble()0 || forceTrimRight>0){ + outstream.println("QTrimmed: \t"+readsTrimmedT+" reads ("+String.format("%.2f",readsTrimmedT*100.0/readsProcessed)+"%) \t"+ + basesTrimmedT+" bases ("+String.format("%.2f",basesTrimmedT*100.0/basesProcessed)+"%)"); + }else if(minReadLength>0){ + outstream.println("Short Read Discards: \t"+readShortDiscardsT+" reads ("+String.format("%.2f",readShortDiscardsT*100.0/readsProcessed)+"%) \t"+ + baseShortDiscardsT+" bases ("+String.format("%.2f",baseShortDiscardsT*100.0/basesProcessed)+"%)"); + } + if(minAvgQuality>0){ + outstream.println("Low quality discards: \t"+lowqReadsT+" reads ("+String.format("%.2f",lowqReadsT*100.0/readsProcessed)+"%) \t"+ + lowqBasesT+" bases ("+String.format("%.2f",lowqBasesT*100.0/basesProcessed)+"%)"); + } +// if(qtrim || minAvgQuality>0){ +// outstream.println("Result: \t"+readsOut+" reads ("+String.format("%.2f",readsOut*100.0/readsProcessed)+"%) \t"+ +// basesOut+" bases ("+String.format("%.2f",basesOut*100.0/basesProcessed)+"%)"); +// } + + outstream.println("Time: \t"+t); + outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000)); + outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + if(testsize){ + long bytesProcessed=(new File(in1).length()+(in2==null ? 0 : new File(in2).length())+ + (qfin1==null ? 0 : new File(qfin1).length())+(qfin2==null ? 0 : new File(qfin2).length()));//*passes + double xpnano=bytesProcessed/(double)(t.elapsed); + String xpstring=(bytesProcessed<100000 ? ""+bytesProcessed : bytesProcessed<100000000 ? 
(bytesProcessed/1000)+"k" : (bytesProcessed/1000000)+"m"); + while(xpstring.length()<8){xpstring=" "+xpstring;} + outstream.println("Bytes Processed: "+xpstring+" \t"+String.format("%.2fm bytes/sec", xpnano*1000)); + } + + if(verifyinterleaving){ + outstream.println("Names appear to be correctly paired."); + } + + if(errorState){ + throw new RuntimeException("ReformatReads terminated in an error state; the output may be corrupt."); + } + } + + /*--------------------------------------------------------------*/ + + private long[] countReads(String fname1, String fname2, long maxReads){ + { + String x=fname1.toLowerCase(); + if((x.equals("stdin") || x.startsWith("stdin.")) && !new File(fname1).exists()){ + throw new RuntimeException("Can't precount reads from standard in, only from a file."); + } + } + + final ConcurrentReadStreamInterface cris; + final Thread cristhread; + { + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, colorspace, false, ffin1, ffin2, null, null); + if(verbose){System.err.println("Counting Reads");} + cristhread=new Thread(cris); + cristhread.start(); + } + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + long count=0, count2=0, bases=0; + + while(reads!=null && reads.size()>0){ + count+=reads.size(); + for(Read r : reads){ + bases+=r.bases.length; + count2++; + if(r.mate!=null){ + bases+=r.mate.bases.length; + count2++; + } + } + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + errorState|=ReadWrite.closeStream(cris); + return new long[] {count, count2, bases}; + } + + private void printOptions(){ + outstream.println("Syntax:\n"); + outstream.println("java -ea -Xmx512m -cp jgi.ReformatReads in= in2= out= out2="); + outstream.println("\nin2 and out2 are optional. 
\nIf input is paired and there is only one output file, it will be written interleaved.\n"); + outstream.println("Other parameters and their defaults:\n"); + outstream.println("overwrite=false \tOverwrites files that already exist"); + outstream.println("ziplevel=5 \tSet compression level, 1 (low) to 9 (max)"); + outstream.println("interleaved=false\tDetermines whether input file is considered interleaved"); + outstream.println("fastawrap=100 \tLength of lines in fasta output"); + outstream.println("qin=auto \tASCII offset for input quality. May be set to 33 (Sanger), 64 (Illumina), or auto"); + outstream.println("qout=auto \tASCII offset for output quality. May be set to 33 (Sanger), 64 (Illumina), or auto (meaning same as input)"); + } + + + public void setSampleSeed(long seed){ + if(seed>-1){ + randy=new java.util.Random(seed); + }else{ + randy=new java.util.Random(); + } + } + + + /*--------------------------------------------------------------*/ + + public boolean errorState=false; + + private String in1=null; + private String in2=null; + + private String qfin1=null; + private String qfin2=null; + + private String out1=null; + private String out2=null; + + private String qfout1=null; + private String qfout2=null; + + private String extin=null; + private String extout=null; + + private boolean parsecustom=false; + private boolean testsize=false; + private boolean overwrite=false; + private boolean RCOMPMATE=false; + private boolean verifyinterleaving=false; + private boolean fixinterleaving=false; + private boolean colorspace=false; + + private long maxReads=-1; +// private int passes=1; + private float samplerate=1f; + private long sampleseed=-1; + + private boolean trimRight_=false; + private boolean trimLeft_=false; + private byte trimq=4; + private byte minAvgQuality=0; + private int minReadLength=0; + private float minLenFraction=0; + + private int forceTrimLeft=-1; + private int forceTrimRight=-1; + + private byte qin=-1; + private byte qout=-1; + + 
private final FileFormat ffin1; + private final FileFormat ffin2; + + private final FileFormat ffout1; + private final FileFormat ffout2; + private boolean useSharedHeader; + private final boolean qtrim; + + + /*--------------------------------------------------------------*/ + + private PrintStream outstream=System.err; + private boolean sampleReadsExact=false; + private long sampleReadsTarget=0; + private boolean sampleBasesExact=false; + private long sampleBasesTarget=0; + public static boolean verbose=false; + + private java.util.Random randy; + +} diff --git a/current/jgi/RenameReads.java b/current/jgi/RenameReads.java new file mode 100755 index 0000000..ca9646a --- /dev/null +++ b/current/jgi/RenameReads.java @@ -0,0 +1,67 @@ +package jgi; + +import java.util.ArrayList; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.Read; + +import fileIO.FileFormat; +import fileIO.ReadWrite; +import fileIO.TextStreamWriter; + +import align2.ListNum; + +/** + * @author Brian Bushnell + * @date Aug 23, 2013 + * + */ +public class RenameReads { + + public static void main(String[] args){ + + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + + long maxReads=-1; + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(args[0], FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, null); + Thread th=new Thread(cris); + th.start(); + } + + TextStreamWriter tsw=new TextStreamWriter(args[2], false, false, true); + tsw.start(); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + long x=0; + while(reads!=null && reads.size()>0){ + + for(Read r : reads){ + r.id=args[1]+"_"+x; + if(r.mate!=null){ + r.id=r.id+" /1"; + r.mate.id=args[1]+"_"+x+" /2"; + } + tsw.println(r); + if(r.mate!=null){tsw.println(r.mate);} + x++; + } + + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + ReadWrite.closeStream(cris); + + tsw.poisonAndWait(); + + } + +} diff --git a/current/jgi/SamPileup.java b/current/jgi/SamPileup.java new file mode 100755 index 0000000..c8c6950 --- /dev/null +++ b/current/jgi/SamPileup.java @@ -0,0 +1,613 @@ +package jgi; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; +import java.util.HashMap; + +import stream.SamLine; + +import align2.Tools; + +import dna.CoverageArray; +import dna.CoverageArray2; +import dna.CoverageArray3; +import dna.Data; +import dna.Gene; +import dna.Scaffold; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Jan 4, 2013 + * + */ +public class SamPileup { + + public static void main(String[] args){ + SamPileup sp=new SamPileup(args); + + Timer t=new Timer(); + t.start(); + + sp.process(); + + t.stop(); + Data.sysout.println("Time: \t"+t); + + } + + public SamPileup(String[] args){ + for(String s : args){ + if(s.contains("=stdout")){Data.sysout=System.err;} +// if(s.equals("in=stdin") || s.startsWith("in=stdin.")){SYSIN=true;} + } + System.err.println("Executing "+(this.getClass().getName())+" "+Arrays.toString(args)+"\n"); + + boolean bs=false, setbs=false; + boolean outset=false; + + for(int i=0; i1 ? 
split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} +// System.err.println("Processing "+args[i]); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("ref") || a.equals("reference") || a.equals("fasta")){ + reference=b; + }else if(a.equals("in") || a.equals("in1")){ + in=b; + }else if(a.equals("out") || a.equals("outfile")){ + if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none")){ +// System.err.println("No output file."); + out=null; + }else{ + out=b; + } + outset=true; + }else if(a.equals("outsam") || a.equals("samout")){ + outsam=(b==null || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("outorf") || a.equals("orfout")){ + outorf=(b==null || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("orffasta") || a.equals("fastaorf")){ + orffasta=(b==null || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("basecov")){ + basecov=(b==null || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("bincov")){ + bincov=(b==null || b.equalsIgnoreCase("none")) ? null : b; + }else if(a.equals("delta")){ + deltaOnly=Tools.parseBoolean(b); + }else if(a.equals("hist") || a.equals("histogram")){ + histogram=(b==null || b.equalsIgnoreCase("none")) ? 
null : b; + }else if(a.equals("reads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("scafs") || a.equals("scaffolds")){ + initialScaffolds=Tools.max(128, (int)(Tools.min(Long.parseLong(b),2000000000))); + }else if(a.equals("binsize")){ + binsize=Integer.parseInt(b); + }else if(a.equals("32bit")){ + bits32=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("bitset") || a.equals("usebitset")){ + bs=Tools.parseBoolean(b); + setbs=true; + }else if(a.startsWith("nonzero") || a.equals("nzo")){ + NONZERO_ONLY=Tools.parseBoolean(b); + System.err.println("Set NONZERO_ONLY to "+NONZERO_ONLY); + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=Tools.parseBoolean(b); + System.err.println("Set OVERWRITE to "+OVERWRITE); + }else if(a.equalsIgnoreCase("twocolumn")){ + TWOCOLUMN=Tools.parseBoolean(b); + System.err.println("Set TWOCOLUMN to "+TWOCOLUMN); + }else if(a.equals("secondary") || a.equals("usesecondary")){ + USE_SECONDARY=Tools.parseBoolean(b); + System.err.println("Set USE_SECONDARY_ALIGNMENTS to "+USE_SECONDARY); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(i>1){ + throw new RuntimeException("Unknown parameter: "+args[i]); + } + } + + if(setbs){ + USE_BITSETS=bs; + USE_COVERAGE_ARRAYS=!bs; + System.err.println("Set USE_BITSETS to "+USE_BITSETS); + }else{ + 
if(histogram==null && basecov==null && bincov==null && outorf==null){//No need for coverage array! + USE_COVERAGE_ARRAYS=false; + if(TWOCOLUMN){//No need for bitset, either! + USE_BITSETS=false; + }else{ + USE_BITSETS=true; + } + System.err.println("Set USE_COVERAGE_ARRAYS to "+USE_COVERAGE_ARRAYS); + System.err.println("Set USE_BITSETS to "+USE_BITSETS); + } + } +// assert(false) : USE_COVERAGE_ARRAYS; + + if(maxReads<0){maxReads=Long.MAX_VALUE;} + { + final String a=(args.length>0 ? args[0] : null); + final String b=(args.length>1 ? args[1] : null); + if(in==null && a!=null && a.indexOf('=')<0 && (a.startsWith("stdin") || new File(a).exists())){in=a;} + if(out==null && b!=null && b.indexOf('=')<0){out=b;} + if(in==null){in="stdin";} + if(out==null && !outset){ +// out="stdout"; +// System.err.println("Warning: output destination not set; producing no output. To print to standard out, set 'out=stdout'"); + Data.sysout=System.err; + } + } + assert(in!=null); + assert(out!=null || outset); + } + + + public static void processOrfsFasta(String fname_in, String fname_out, HashMap map, long readBases, long refBases){ + TextFile tf=new TextFile(fname_in, false, false); + assert(!fname_in.equalsIgnoreCase(fname_out)); + TextStreamWriter tsw=new TextStreamWriter(fname_out, OVERWRITE, false, true); + tsw.start(); + +// tsw.println("#refBases="+refBases); + tsw.print("#readBases="+readBases+"\n"); + tsw.print("#name\tlength\tdepthSum\tavgDepth\tavgDepth/readBases\tminDepth\tmaxDepth\tmedianDepth\tstdDevDepth\tfractionCovered\n"); + + String line; + final StringBuilder sb=new StringBuilder(500); + while((line=tf.nextLine())!=null){ + if(line.length()>1 && line.charAt(0)=='>'){ + + String[] split=line.split(" # "); //' # ' used as delimiters + + String orfname=split[0].substring(1).trim(); //In case there are spaces around the ' # ' delimiters + String scafname=orfname; + if(scafname.contains("_")){//PRODIGAL pads _1 to the name of the first orf of a scaffold, and etc + int 
last=scafname.lastIndexOf('_'); + boolean numeric=false; + for(int i=last+1; i=0 && orf.stop=scaf.length){ + Data.sysout.println("orf goes out of scaffold bounds.\n"+orf+"\n"+scaf); + if(ABORT_ON_ERROR){ + tsw.poison(); + throw new RuntimeException("Aborting."); + } + } + + if(scaf!=null){ + CoverageArray ca=(CoverageArray)scaf.obj; + orf.readCoverageArray(ca); + } + + sb.append(orf.name).append('\t'); + sb.append(orf.length()).append('\t'); + sb.append(orf.baseDepth).append('\t'); + sb.append(String.format("%.4f", orf.avgCoverage())).append('\t'); + sb.append(orf.avgCoverage()/readBases); + + sb.append('\t'); + sb.append(orf.minDepth).append('\t'); + sb.append(orf.maxDepth).append('\t'); + sb.append(orf.medianDepth).append('\t'); + sb.append(String.format("%.4f",orf.stdevDepth)).append('\t'); + sb.append(String.format("%.4f",orf.fractionCovered())); + + sb.append('\n'); + tsw.print(sb.toString()); + sb.setLength(0); + } + } + + tsw.poison(); + tsw.waitForFinish(); + } + + + public void process(){ + long refBases=0; + long readBases=0; + ArrayList list=new ArrayList(initialScaffolds); + HashMap table=new HashMap(initialScaffolds); + TextFile tf=new TextFile(in, false, false); + String line=null; + + String program=null; + String version=null; + + boolean bbmap=false; + float bbversion=-1; + + final TextStreamWriter tsw=(outsam==null ? 
null : new TextStreamWriter(outsam, OVERWRITE, false, true)); + if(outsam!=null){tsw.start();} + + for(line=tf.nextLine(); line!=null && line.startsWith("@"); line=tf.nextLine()){ + if(tsw!=null){tsw.println(line);} + + final String[] split=line.split("\t"); + final String a=split[0]; + + if(a.equals("@SQ")){ + Scaffold sc=new Scaffold(split); + if(COUNT_GC){sc.basecount=new long[6];} + assert(!table.containsKey(sc.name)) : "\nDuplicate scaffold name!\n"+sc+"\n\n"+table.get(sc.name); + table.put(sc.name, sc); + list.add(sc); + refBases+=sc.length; +// sc.obj=new CoverageArray2(table.size(), sc.length+1); +// Data.sysout.println("Made scaffold "+sc.name+" of length "+sc.length); + }else if(a.equals("@PG")){ + for(String s : split){ + if(s.startsWith("PN:")){ + String s2=s.substring(3); + if(s2.equalsIgnoreCase("bbmap") || s2.startsWith("BBMap")){bbmap=true;} + if(program==null){program=Data.forceIntern(s.substring(3));} + }else if(s.startsWith("VN:")){ + if(bbmap && bbversion<0){bbversion=Float.parseFloat(s.substring(3));} + if(version==null){version=Data.forceIntern(s.substring(3));} + } + } + }else if(a.equals("@RG")){ + //Do nothing + }else if(a.equals("@HD")){ + //Do nothing + }else if(a.equals("@CO")){ + //Do nothing + }else{ +// assert(false) : line; + } + } + +// if(bbmap && bbversion<=17){ +// for(Scaffold sc : list){ +// sc.length-=1000; +// assert(sc.length>0) : "Error when reducing scaffold length: "+bbversion+", "+sc.length; +// } +// } +// assert(false) : bbmap+", "+bbversion+", "+program+", "+version; + + if(reference!=null){ + TextFile tf2=new TextFile(reference, false, false); + Scaffold sc=null; + int len=0; + final long[] acgtn=new long[6]; + for(String s=tf2.nextLine(); s!=null; s=tf2.nextLine()){ + if(s.startsWith(">")){ + if(sc!=null){ + sc.length=len; + sc.gc=(float)((acgtn[1]+acgtn[2])*1d/Data.max(1, acgtn[0]+acgtn[1]+acgtn[2]+acgtn[3])); + sc=null; + len=0; + Arrays.fill(acgtn, 0); + } + + String name=s.substring(1); + sc=table.get(name); + 
if(ADD_FROM_REF && sc==null){ + sc=new Scaffold(name, 0); + System.err.println("Warning - SAM header did not include "+name); + table.put(name, sc); + } + }else{ + len+=s.length(); + for(int i=0; i1000){sc.length-=1000;} + if(!table.containsKey(sc.name)){ + if(COUNT_GC){sc.basecount=new long[6];} + table.put(sc.name, sc); + list.add(sc); + refBases+=sc.length; + } + } + }else{ + + SamLine sl=new SamLine(line); + if(sl.mapped() && (USE_SECONDARY || sl.primary())){ +// readBases+=sl.seq.length(); + readBases+=sl.seq.length; + final Scaffold scaf=table.get(new String(sl.rname())); + assert(scaf!=null) : "Can't find "+new String(sl.rname()); + final int a=Tools.max(sl.start(), 0); + final int b=Tools.min(sl.stop2(), scaf.length-1); + scaf.basehits+=(b-a+1); + + if(USE_COVERAGE_ARRAYS){ + if(scaf.obj==null){ + scaf.obj=(bits32 ? new CoverageArray3(table.size(), scaf.length+1) : new CoverageArray2(table.size(), scaf.length+1)); + } + CoverageArray ca=(CoverageArray)scaf.obj; + ca.incrementRange(a, b, 1); + }else if(USE_BITSETS){ + if(scaf.obj==null){ + scaf.obj=new BitSet(scaf.length+1); + } + BitSet bs=(BitSet)scaf.obj; + bs.set(a, b+1); + } +// assert(false) : a+", "+b+", "+scaf.length; + if(sl.seq!=null && scaf.basecount!=null){ + final long[] counts=scaf.basecount; +// final String seq=sl.seq; +// for(int i=0; i0){covered++;} + } + } + }else if(USE_BITSETS){ +// sum+=scaf.basehits; + BitSet bs=(BitSet)scaf.obj; + covered=(bs==null ? 
0 : bs.cardinality()); + } + // pw.print(scaf.name); + if(tsw2!=null && (sum>0 || !NONZERO_ONLY)){ + if(TWOCOLUMN){ + tsw2.print(String.format("%s\t%.4f\n", scaf.name, sum/(double)scaf.length)); + }else if(COUNT_GC){ + long[] bc=scaf.basecount; + double gc=(bc[1]+bc[2])*1d/Data.max(1, bc[0]+bc[1]+bc[2]+bc[3]); + tsw2.print(String.format("%s\t%.4f\t%d\t%.4f\t%.4f\t%.4f\n", scaf.name, sum/(double)scaf.length, scaf.length, scaf.gc, covered*100d/scaf.length, gc)); + }else{ + tsw2.print(String.format("%s\t%.4f\t%d\t%.4f\t%.4f\n", scaf.name, sum/(double)scaf.length, scaf.length, scaf.gc, covered*100d/scaf.length)); + } + } + } + + if(tsw2!=null){tsw2.poison();} + + if(histogram!=null && hist!=null){writeHist(histogram, hist);} + } + + if(basecov!=null){writeCoveragePerBase(basecov, list, deltaOnly);} + if(bincov!=null){writeCoveragePerBaseBinned(bincov, list, binsize);} + + if(orffasta!=null){ + processOrfsFasta(orffasta, outorf, table, readBases, refBases); + } + + if(tsw2!=null){tsw2.waitForFinish();} + if(tsw!=null){tsw.waitForFinish();} + } + + public static void writeHist(String fname, long[] counts){ + TextStreamWriter tsw=new TextStreamWriter(fname, OVERWRITE, false, false); + tsw.start(); + tsw.print("#Coverage\tnumBases\n"); + int max=0; + for(max=counts.length-1; max>0 && counts[max]==0; max--){} + for(int i=0; i<=max; i++){ + long x=counts[i]; + tsw.print(i+"\t"+x+"\n"); + } + tsw.poison(); + tsw.waitForFinish(); + } + + public static void writeCoveragePerBase(String fname, ArrayList list, boolean deltaOnly){ + TextStreamWriter tsw=new TextStreamWriter(fname, OVERWRITE, false, true); + tsw.start(); + tsw.print("#RefName\tPos\tCoverage\n"); + + for(Scaffold scaf : list){ + int last=-1; + CoverageArray ca=(CoverageArray)scaf.obj; + for(int i=0; i list, int binsize){ + TextStreamWriter tsw=new TextStreamWriter(fname, OVERWRITE, false, false); + tsw.start(); + tsw.print("#RefName\tCov\tPos\tRunningPos\n"); + + long running=0; + final float invbin=1f/binsize; + 
for(Scaffold scaf : list){ + if(scaf.length>=binsize){ + CoverageArray ca=(CoverageArray)scaf.obj; + int lastPos=-1, nextPos=binsize-1; + long sum=0; + for(int i=0; i=nextPos){ +// float bin=(i-lastPos); +// tsw.print(String.format("%s\t%.1f\t%d\t%d\n", scaf.name, sum/bin, (i+1), running)); + tsw.print(String.format("%s\t%.2f\t%d\t%d\n", scaf.name, sum*invbin, (i+1), running)); + lastPos=i; + running+=binsize; + nextPos+=binsize; + sum=0; + } + } + } + } + + tsw.poison(); + tsw.waitForFinish(); + } + + + public static boolean COUNT_GC=true; + public static boolean verbose=false; + public static boolean OVERWRITE=false; + public static boolean TWOCOLUMN=false; + public static boolean ADD_FROM_REF=false; + public static boolean NONZERO_ONLY=false; + public static boolean USE_COVERAGE_ARRAYS=true; + public static boolean USE_BITSETS=false; + public static boolean deltaOnly=true; + /** Process secondary alignments */ + public static boolean USE_SECONDARY=true; + public static boolean ABORT_ON_ERROR=true; + + public long maxReads=-1; + public int initialScaffolds=4096; + public String in=null; + public String out=null; + public String outsam=null; + public String outorf=null; + public String reference=null; + public String histogram=null; + public String basecov=null; + public String bincov=null; + public String orffasta=null; + public boolean bits32=false; + public int binsize=1000; + + private static final byte[] charToNum=makeCharToNum(); + private static byte[] makeCharToNum() { + byte[] r=new byte[256]; + Arrays.fill(r, (byte)4); + r['a']=r['A']=0; + r['c']=r['C']=1; + r['g']=r['G']=2; + r['t']=r['T']=3; + r['\n']=r['\r']=r['>']=r['@']=r['+']=5; + return r; + } + + + + +} diff --git a/current/jgi/SamToEst.java b/current/jgi/SamToEst.java new file mode 100755 index 0000000..c23f4c6 --- /dev/null +++ b/current/jgi/SamToEst.java @@ -0,0 +1,470 @@ +package jgi; + + +import java.io.File; +import java.util.Arrays; +import java.util.HashMap; +import java.util.HashSet; + 
+import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; +import stream.SamLine; + +import dna.Data; +import dna.Scaffold; +import fileIO.ByteFile; +import fileIO.ReadWrite; +import fileIO.TextFile; +import fileIO.TextStreamWriter; +import align2.LongList; +import align2.Shared; +import align2.Tools; + +/** + * + * Processes a sam file of mapped ESTs. + * These ESTs may have been broken into smaller pieces for mapping, + * and if so, are reassembled. + * + * Produces a mapping statistics file. + * + * @author Brian Bushnell + * @date Sep 27, 2013 + * + */ +public class SamToEst { + + public static void main(String[] args){ + + ByteFile.FORCE_MODE_BF2=Shared.THREADS>2; + FastaReadInputStream.SPLIT_READS=false; + stream.FastaReadInputStream.MIN_READ_LEN=1; + ReadWrite.USE_UNPIGZ=true; + + String est=null, stats=null, ref=null, sam=null; + float fractionForAllCaptured=0.98f; + + for(int i=0; i1 ? split[1] : null; + while(a.startsWith("-")){a=a.substring(1);} //In case people use hyphens + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("sam")){ + sam=b; + }else if(a.equals("out") || a.equals("output") || a.equals("stats")){ + stats=b; + }else if(a.equals("ref")){ + ref=b; + }else if(a.equals("est")){ + est=b; + }else if(a.equals("fraction")){ + fractionForAllCaptured=Float.parseFloat(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(a.equals("tuc") || 
a.equals("touppercase")){ + Read.TO_UPPER_CASE=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(sam==null && i==0 && !arg.contains("=") && (arg.toLowerCase().startsWith("stdin") || new File(arg).exists())){ + sam=arg; + }else if(stats==null && i==1 && !arg.contains("=")){ + stats=arg; + }else{ + System.err.println("Unknown parameter "+args[i]); + assert(false) : "Unknown parameter "+args[i]; + // throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(stats==null){stats="stdout";} + SamToEst ste=new SamToEst(sam, stats, ref, est, fractionForAllCaptured); + ste.process(); + } + + public SamToEst(String in_, String stats_, String ref_, String est_, float fractionForAll_){ + in=in_; + stats=stats_; + ref=ref_; + estFile=est_; + fractionForAll=fractionForAll_; + } + + public void process(){ +// HashMap table=new HashMap(initialSize); + HashMap table=new HashMap(initialSize); + TextFile tf=new TextFile(in, true, false); + String line=null; + + String program=null; + String version=null; + + boolean bbmap=false; + float bbversion=-1; + + for(line=tf.nextLine(); line!=null && line.startsWith("@"); line=tf.nextLine()){ + final String[] split=line.split("\t"); + final String a=split[0]; + + if(a.equals("@SQ")){ + Scaffold sc=new Scaffold(split); +// assert(!table.containsKey(sc.name)) : "\nDuplicate scaffold name!\n"+sc+"\n\n"+table.get(sc.name); +// table.put(sc.name, sc); + refBases+=sc.length; + refCount++; + }else if(a.equals("@PG")){ + for(String s : split){ + if(s.startsWith("PN:")){ + String s2=s.substring(3); + if(s2.equalsIgnoreCase("bbmap") || s2.startsWith("BBMap")){bbmap=true;} + if(program==null){program=Data.forceIntern(s.substring(3));} + }else if(s.startsWith("VN:")){ + if(bbmap && bbversion<0){bbversion=Float.parseFloat(s.substring(3));} + if(version==null){version=Data.forceIntern(s.substring(3));} + } + } + }else if(a.equals("@RG")){ + //Do nothing + }else 
if(a.equals("@HD")){ + //Do nothing + }else if(a.equals("@CO")){ + //Do nothing + }else{ +// assert(false) : line; + } + } + + EST current=null; + boolean err=false; + for(; line!=null; line=tf.nextLine()){ + + if(line.length()==0){ + + }else if(line.charAt(0)=='@'){ + if(!err){ + System.err.println("Unexpected header line: "+line); + System.err.println("This should not cause problems, and is probably due to concatenated sam files.\n" + + "Supressing future unexpected header warnings."); + err=true; + } + + if(line.startsWith("@SQ")){ + String[] split=line.split("\t"); + Scaffold sc=new Scaffold(split); +// if(!table.containsKey(sc.name)){ +// table.put(sc.name, sc); +// refBases+=sc.length; +// refCount++; +// } + } + }else{ + + SamLine sl=new SamLine(line); + if(USE_SECONDARY || sl.primary()){ + + if(sl.mapped() && sl.cigar!=null){ + String cigar=sl.cigar; + if(cigar.contains("D") || cigar.contains("N")){ + int len=0; + for(int i=0; i0){ + int partlen=name.length()-x-1; + if(partlen>0 && partlen<6){ + int p2=0; + for(int i=x+1; i9){ + p2=-1; + break; + } + } + if(p2>-1){ + part=p2; + name=name.substring(0, x); + }else{ +// assert(false) : x+"\t"+p2+"\t"+name; + } + }else{ +// assert(false) : x+"\t"+name; + } + }else{ +// assert(false) : x+"\t"+name; + } + if(current==null || !current.name.equals(name)){ +// assert(part==1) : "Sam file must be in input order. 
Run BBMap with the 'ordered' flag.\n"+part+"\n"+sl.qname; + if(current!=null){addEst(current);} + current=new EST(name); + } + current.add(sl); + } + } + } + if(current!=null){addEst(current);} + tf.close(); + + if(stats!=null){ + final TextStreamWriter tsw=new TextStreamWriter(stats, overwrite, false, false); + tsw.start(); + +// numRef: 786 +// numEst: 30985 +// EST-good: 30312 ( 97.83%) +// EST-best: 30312 ( 97.83%) +// EST-miss: 379 ( 1.22%) +// EST-zero: 294 ( 0.95%) + +// tsw.println("EST-good:\t"+good+"\t"++""); +// tsw.println("EST-best:\t"+best+"\t"++""); +// tsw.println("EST-miss:\t"+miss+"\t"++""); +// tsw.println("EST-zero:\t"+zero+"\t"++""); + + boolean oldStyle=false; + + if(oldStyle){ + tsw.println("ref:\t"+ref); + tsw.println("est:\t"+estFile); + tsw.println("sam:\t"+in); + + tsw.println("numRef:\t"+refCount+"\t"+refBases); + tsw.println("numEst:\t"+estCount+"\t"+estBases); + tsw.println("type\t#ests\t%ests\t#bases\t%bases"); + }else{ + + tsw.println("ref_file="+ref); + tsw.println("est_file="+estFile); + tsw.println("sam_file="+in); + + tsw.println("n_ref_scaffolds="+refCount); + tsw.println("n_ref_bases="+refBases); + tsw.println("n_est="+estCount); + tsw.println("n_est_bases="+estBases); + tsw.println("type\tn_est\tpct_est\tn_bases\tpct_bases"); + } + + double multE=100.0/estCount; + double multB=100.0/estBases; + + double allBasesPct=multE*allBasesMapped; + double mostBasesPct=multE*mostBasesMapped; + double someBasesPct=multE*someBasesMapped; + double noBasesPct=multE*noBasesMapped; + double multiScaffoldPct=multE*multiScaffold; + + double allBasesPctB=multB*allBasesMappedB; + double mostBasesPctB=multB*mostBasesMappedB; + double someBasesPctB=multB*someBasesMappedB; + double noBasesPctB=multB*noBasesMappedB; + double multiScaffoldPctB=multB*multiScaffoldB; + + int min=0, max=0, median=0; + long sum=0, count=0; + for(int i=minIntron; i0){ + if(min==0){min=i;} + max=i; + sum+=(i*x); + count+=x; + } + } + if(count>0){ //If there are any introns + 
long half=(count+1)/2; //50th percentile of number of introns + assert(half<=count); + long count2=0; //Current sum of length + for(int i=0; count20){ + count2+=x; + median=i; + } + } + } + + tsw.println("all:\t"+allBasesMapped+"\t"+String.format("%.4f%%",allBasesPct)+"\t"+allBasesMappedB+"\t"+String.format("%.4f%%",allBasesPctB)); + tsw.println("most:\t"+mostBasesMapped+"\t"+String.format("%.4f%%",mostBasesPct)+"\t"+mostBasesMappedB+"\t"+String.format("%.4f%%",mostBasesPctB)); + tsw.println("some:\t"+someBasesMapped+"\t"+String.format("%.4f%%",someBasesPct)+"\t"+someBasesMappedB+"\t"+String.format("%.4f%%",someBasesPctB)); + tsw.println("zero:\t"+noBasesMapped+"\t"+String.format("%.4f%%",noBasesPct)+"\t"+noBasesMappedB+"\t"+String.format("%.4f%%",noBasesPctB)); + tsw.println("multi:\t"+multiScaffold+"\t"+String.format("%.4f%%",multiScaffoldPct)+"\t"+multiScaffoldB+"\t"+String.format("%.4f%%",multiScaffoldPctB)); +// tsw.println("numIntrons:\t"+count); +// tsw.println("minIntron:\t"+min); +// tsw.println("maxIntron:\t"+max); +// tsw.println("medIntron:\t"+median); +// tsw.println("avgIntron:\t"+(long)(sum/(double)(Tools.max(count,1)))); + tsw.println("introns\tmin\tmax\tmedian\taverage"); + tsw.println(count+"\t"+min+"\t"+max+"\t"+median+"\t"+String.format("%.1f", (sum/(double)(Tools.max(count,1))))); + + tsw.poisonAndWait(); + } + } + + private void addEst(EST est){ +// Data.sysout.println("\n"+est); + estCount++; + partCount+=est.parts; + estBases+=est.length; + estBasesMapped+=est.mappedLength; + partCountMapped+=est.mappedParts; + + for(int i=0; i1){ + multiScaffold++; + multiScaffoldB+=est.length; + } + + if(est.mappedParts==est.parts){ +// Data.sysout.print("A"); + allPartsMapped++; + }else if(est.mappedParts>=Tools.max(1, est.parts/2)){ +// Data.sysout.print("B"); + mostPartsMapped++; + }else if(est.mappedParts>0){ +// Data.sysout.print("C"); + somePartsMapped++; + }else{ +// Data.sysout.print("D"); + noPartsMapped++; + } + + int match=est.match(); + 
if(match>=(est.length*fractionForAll)){ +// Data.sysout.print("E"); + allBasesMapped++; + allBasesMappedB+=est.length; + }else if(match>=est.length/2){ +// Data.sysout.print("F"); + mostBasesMapped++; + mostBasesMappedB+=est.length; + }else if(match>0){ +// Data.sysout.print("G"); + someBasesMapped++; + someBasesMappedB+=est.length; + }else{ +// Data.sysout.print("H"); + noBasesMapped++; + noBasesMappedB+=est.length; + } + } + + public final float fractionForAll; + public final String in, stats, ref, estFile; + + public long refBases=0; + public long estBases=0; + public long estBasesMapped=0; + + public long refCount=0; + public long estCount=0; + public long partCount=0; + public long partCountMapped=0; + + public long good=0, best=0, miss=0, zero=0; + public long multiScaffold=0, multiScaffoldB=0; + public long allPartsMapped=0, mostPartsMapped=0, somePartsMapped=0, noPartsMapped=0; + public long allBasesMapped=0, mostBasesMapped=0, someBasesMapped=0, noBasesMapped=0; + public long allBasesMappedB=0, mostBasesMappedB=0, someBasesMappedB=0, noBasesMappedB=0; + public long[] msdicnOverall=new long[6]; + public LongList introns=new LongList(1); + + public int initialSize=4096; + public boolean ADD_FROM_REF=true; + public boolean USE_SECONDARY=false; + public static int minIntron=10; + public static boolean overwrite=true; +// public HashMap //Only needed if sam file is unordered. 
+ + public static class EST{ + + public EST(String name_){ + name=name_; + } + + public void add(SamLine sl){ + parts++; +// length+=sl.seq.length(); + length+=sl.seq.length; + if(sl.mapped()){ +// mappedLength+=sl.seq.length(); + mappedLength+=sl.seq.length; + mappedParts++; + if(sl.cigar!=null){ + String matchTag=sl.matchTag(); + + int[] temp; + if(matchTag==null){ + temp=SamLine.cigarToMsdic(sl.cigar); + }else{ + temp=Read.matchToMsdicn(matchTag.getBytes()); + } + for(int i=0; i scafnames=new HashSet(4); + + int[] msdicn=new int[6]; + + } + +} diff --git a/current/jgi/SplitPairsAndSingles.java b/current/jgi/SplitPairsAndSingles.java new file mode 100755 index 0000000..107a9f3 --- /dev/null +++ b/current/jgi/SplitPairsAndSingles.java @@ -0,0 +1,650 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Arrays; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.RTextOutputStream3; +import stream.Read; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import align2.TrimRead; +import dna.Timer; +import fileIO.ByteFile; +import fileIO.ReadWrite; +import fileIO.FileFormat; + +/** + * @author Brian Bushnell + * @date Sep 4, 2013 + * + */ +public final class SplitPairsAndSingles { + + + + public static void main(String[] args){ + + if(args==null || args.length==0 || (args.length==1 && + (args[0].equalsIgnoreCase("-h") || args[0].equals("-help") || args[0].equals("--help") || args[0].equals("-?") || args[0].equals("?")))){ + printOptions(); + System.exit(0); + } + SplitPairsAndSingles dd=new SplitPairsAndSingles(args); + dd.process(); + } + + private static void printOptions(){ + outstream.println("Syntax:\n"); + outstream.println("\njava -ea -Xmx100m -cp jgi.SplitPairsAndSingles in= out= outs= minlen=20"); + outstream.println("\nOptional flags:"); + 
outstream.println("in= \tThe 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in."); + outstream.println("in2= \tUse this if 2nd read of pairs are in a different file."); + outstream.println("out= \tThe 'out=' flag is needed if the output file is not the second parameter. 'out=stdout' will pipe to standard out."); + outstream.println("out2= \tUse this to write 2nd read of pairs to a different file."); + outstream.println("outsingle= \t(outs) Write singleton reads here."); + outstream.println(""); + outstream.println("overwrite=t \t(ow) Set to false to force the program to abort rather than overwrite an existing file."); + outstream.println("showspeed=t \t(ss) Set to 'f' to suppress display of processing speed."); + outstream.println("interleaved=auto \t(int) If true, forces fastq input to be paired and interleaved."); + outstream.println("qtrim=f \tTrim read ends to remove bases with quality below minq."); + outstream.println(" \tValues: t (trim both ends), f (neither end), r (right end only), l (left end only)."); + outstream.println("trimq=4 \tTrim quality threshold."); + outstream.println("minlen=2 \t(ml) Reads shorter than this after trimming will be discarded."); + outstream.println("ziplevel=20 \t(zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster."); + outstream.println("fixpairs=f \t(fp, fint) Fixes corrupted interleaved files by examining paired read names."); + + } + + public SplitPairsAndSingles(String[] args){ + for(String s : args){if(s.contains("standardout") || s.contains("stdout")){outstream=System.err;}} + System.err.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + ReadWrite.ZIPLEVEL=2; + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + ReadWrite.MAX_ZIP_THREADS=8; + ReadWrite.ZIP_THREAD_DIVISOR=2; + FastaReadInputStream.SPLIT_READS=false; + ByteFile.FORCE_MODE_BF2=Shared.THREADS>2; + boolean setOut=false, setOuts=false, 
trimRight_=false, trimLeft_=false, setInterleaved=false, fixPairs_=false; + + { + boolean b=false; + assert(b=true); + EA=b; + } + + for(int i=0; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + while(a.charAt(0)=='-' && (a.indexOf('.')<0 || i>1 || !new File(a).exists())){a=a.substring(1);} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("in") || a.equals("in1")){ + in1=b; + }else if(a.equals("in2")){ + in2=b; + }else if(a.equals("out") || a.equals("out1") || a.equals("outp") || a.equals("outp1") || a.equals("outpair") || a.equals("outpair1")){ + out1=b; + setOut=true; + }else if(a.equals("out2") || a.equals("outp2") || a.equals("outpair2")){ + out2=b; + }else if(a.equals("outs") || a.equals("outsingle") || a.equals("outb") || a.equals("outbad")){ + outsingle=b; + setOut=true; + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("bf1")){ + ByteFile.FORCE_MODE_BF1=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF2=!ByteFile.FORCE_MODE_BF1; + }else if(a.equals("bf2")){ + ByteFile.FORCE_MODE_BF2=Tools.parseBoolean(b); + ByteFile.FORCE_MODE_BF1=!ByteFile.FORCE_MODE_BF2; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else if(a.equals("interleaved") || a.equals("int")){ + if("auto".equalsIgnoreCase(b)){ + 
FASTQ.FORCE_INTERLEAVED=!(FASTQ.TEST_INTERLEAVED=true); + }else{ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=Tools.parseBoolean(b); + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + setInterleaved=true; + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("showspeed") || a.equals("ss")){ + showSpeed=Tools.parseBoolean(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else if(a.equals("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("trim") || a.equals("qtrim")){ + if(b==null){trimRight_=trimLeft_=true;} + else if(b.equalsIgnoreCase("left") || b.equalsIgnoreCase("l")){trimLeft_=true;trimRight_=false;} + else if(b.equalsIgnoreCase("right") || b.equalsIgnoreCase("r")){trimLeft_=false;trimRight_=true;} + else if(b.equalsIgnoreCase("both") || b.equalsIgnoreCase("rl") || b.equalsIgnoreCase("lr")){trimLeft_=trimRight_=true;} + else{trimRight_=trimLeft_=Tools.parseBoolean(b);} + }else if(a.equals("optitrim") || a.equals("otf") || a.equals("otm")){ + if(b!=null && (b.charAt(0)=='.' 
|| Character.isDigit(b.charAt(0)))){ + TrimRead.optimalMode=true; + TrimRead.optimalBias=Float.parseFloat(b); + assert(TrimRead.optimalBias>=0 && TrimRead.optimalBias<1); + }else{ + TrimRead.optimalMode=Tools.parseBoolean(b); + } + }else if(a.equals("trimright")){ + trimRight_=Tools.parseBoolean(b); + }else if(a.equals("trimleft")){ + trimLeft_=Tools.parseBoolean(b); + }else if(a.equals("trimq") || a.equals("trimquality") || a.equals("minq")){ + trimq=Byte.parseByte(b); + }else if(a.equals("fixpairs") || a.equals("fp") || a.equals("fint")){ + fixPairs_=Tools.parseBoolean(b); + }else if(a.equals("ml") || a.equals("minlen") || a.equals("minlength") || a.equals("minreadlength")){ + minReadLength=Integer.parseInt(b); + assert(minReadLength>=0) : "minReadLength must be at least 0"; + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + if(b.equalsIgnoreCase("auto")){ + FASTQ.DETECT_QUALITY=true; + }else{ + byte ascii_offset=Byte.parseByte(b); + FASTQ.ASCII_OFFSET=ascii_offset; + System.err.println("Set fastq input ASCII offset to "+FASTQ.ASCII_OFFSET); + FASTQ.DETECT_QUALITY=false; + } + }else if(a.equals("asciiout") || a.equals("qualityout") || a.equals("qualout") || a.equals("qout")){ + if(b.equalsIgnoreCase("auto")){ + FASTQ.DETECT_QUALITY_OUT=true; + }else{ + byte ascii_offset=Byte.parseByte(b); + FASTQ.ASCII_OFFSET_OUT=ascii_offset; + System.err.println("Set fastq output ASCII offset to "+FASTQ.ASCII_OFFSET_OUT); + FASTQ.DETECT_QUALITY_OUT=false; + } + }else if(a.equals("qauto")){ + FASTQ.DETECT_QUALITY=FASTQ.DETECT_QUALITY_OUT=true; + }else if(i==0 && in1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + in1=args[i]; + }else if(i==1 && out1==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + out1=args[i]; + setOut=true; + }else if(i==2 && outsingle==null && arg.indexOf('=')<0 && arg.lastIndexOf('.')>0){ + outsingle=args[i]; + setOuts=true; + }else{ + throw new RuntimeException("Unknown parameter 
"+args[i]); + } + } + + fixPairs=fixPairs_; + trimRight=trimRight_; + trimLeft=trimLeft_; + + assert(FastaReadInputStream.settingsOK()); + + if(in1==null){ + printOptions(); + throw new RuntimeException("Error - at least one input file is required."); + } + + if(in1!=null && in1.contains("#") && !new File(in1).exists()){ + int pound=in1.lastIndexOf('#'); + String a=in1.substring(0, pound); + String b=in1.substring(pound+1); + in1=a+1+b; + in2=a+2+b; + } + if(in2!=null && (in2.contains("=") || in2.equalsIgnoreCase("null"))){in2=null;} + + if(fixPairs){ + if(in2!=null){ + System.err.println("ERROR: 'FixPairs' mode only works with a single interleaved input file, not paired input files."); + System.err.println("Aborting."); + System.exit(1); + } + setInterleaved=true; + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + outstream.println("Paired input disabled; running in FixPairs mode"); + } + + if(!setInterleaved && in2==null){ + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=true; + outstream.println("Set INTERLEAVED to "+FASTQ.FORCE_INTERLEAVED); + } + if(in2!=null){ + if(FASTQ.FORCE_INTERLEAVED){System.err.println("Reset INTERLEAVED to false because paired input files were specified.");} + FASTQ.FORCE_INTERLEAVED=FASTQ.TEST_INTERLEAVED=false; + } + + if(out1!=null && out1.contains("#")){ + int pound=out1.lastIndexOf('#'); + String a=out1.substring(0, pound); + String b=out1.substring(pound+1); + out1=a+1+b; + out2=a+2+b; + } + + if(!setOut){ + out1="stdout.fq"; + outstream=System.err; + out2=null; + }else if("stdout".equalsIgnoreCase(out1) || "standarddout".equalsIgnoreCase(out1)){ + out1="stdout.fq"; + outstream=System.err; + out2=null; + } + if(out1!=null && !Tools.canWrite(out1, overwrite)){throw new RuntimeException("Output file "+out1+" already exists, and overwrite="+overwrite);} + + assert(!in1.equalsIgnoreCase(out1)); + assert(!in1.equalsIgnoreCase(outsingle)); + assert(!in1.equalsIgnoreCase(in2)); + assert(out1==null || 
!out1.equalsIgnoreCase(out2)); + assert(out1==null || !out1.equalsIgnoreCase(outsingle)); + } + + public void process(){ + + Timer t=new Timer(); + t.start(); + + process2(); + + t.stop(); + + outstream.println("\nInput: \t"+readsIn+" reads \t\t"+basesIn+" bases."); + + if(trimLeft || trimRight){ + outstream.println("Trimmed: \t"+readsTrimmed+" reads ("+String.format("%.2f",readsTrimmed*100.0/readsIn)+"%) \t"+ + basesTrimmed+" bases ("+String.format("%.2f",basesTrimmed*100.0/basesIn)+"%)"); + } + outstream.println("Result: \t"+readsOut+" reads ("+String.format("%.2f",readsOut*100.0/readsIn)+"%) \t"+ + basesOut+" bases ("+String.format("%.2f",basesOut*100.0/basesIn)+"%)"); + outstream.println("Pairs: \t"+pairsOut+" reads ("+String.format("%.2f",pairsOut*100.0/readsIn)+"%) \t"+ + pairBasesOut+" bases ("+String.format("%.2f",pairBasesOut*100.0/basesIn)+"%)"); + outstream.println("Singletons: \t"+singlesOut+" reads ("+String.format("%.2f",singlesOut*100.0/readsIn)+"%) \t"+ + singleBasesOut+" bases ("+String.format("%.2f",singleBasesOut*100.0/basesIn)+"%)"); + + double rpnano=readsIn/(double)(t.elapsed); + double bpnano=basesIn/(double)(t.elapsed); + + String rpstring=(readsIn<100000 ? ""+readsIn : readsIn<100000000 ? (readsIn/1000)+"k" : (readsIn/1000000)+"m"); + String bpstring=(basesIn<100000 ? ""+basesIn : basesIn<100000000 ? 
(basesIn/1000)+"k" : (basesIn/1000000)+"m"); + + while(rpstring.length()<8){rpstring=" "+rpstring;} + while(bpstring.length()<8){bpstring=" "+bpstring;} + + if(showSpeed){ + outstream.println("\nTime: \t\t\t"+t); + outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000)); + outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + } + + if(errorState){ + throw new RuntimeException("BBDuk terminated in an error state; the output may be corrupt."); + } + } + + private void process2(){ + final Thread cristhread; + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + if(verbose){System.err.println("Started cris");} + cristhread=new Thread(cris); + cristhread.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + final RTextOutputStream3 ros, rosb; + final int buff=4; + if(out1!=null){ + FileFormat ff1=FileFormat.testOutput(out1, FileFormat.FASTQ, null, true, overwrite, false); + FileFormat ff2=FileFormat.testOutput(out2, FileFormat.FASTQ, null, true, overwrite, false); + ros=new RTextOutputStream3(ff1, ff2, buff, null, true); + ros.start(); + }else{ros=null;} + if(outsingle!=null){ + FileFormat ff1=FileFormat.testOutput(outsingle, FileFormat.FASTQ, null, true, overwrite, false); + rosb=new RTextOutputStream3(ff1, null, buff, null, true); + rosb.start(); + }else{rosb=null;} + if(ros!=null || rosb!=null){ + outstream.println("Started output stream."); + } + +// assert(false) : out1+", "+out2+", "+outsingle; + if(fixPairs){ + process3_fixPairs(cris, ros, rosb); + }else{ + process3(cris, ros, rosb); + } + + + ReadWrite.closeStreams(cris, ros, rosb); + } +// +// private void process3_old(final 
ConcurrentReadStreamInterface cris, final RTextOutputStream3 ros, final RTextOutputStream3 rosb){ +// +// ListNum ln=cris.nextList(); +// ArrayList reads0=(ln!=null ? ln.list : null); +// ArrayList single=(rosb==null ? null : new ArrayList(Shared.READ_BUFFER_LENGTH)); +// +// while(reads0!=null && reads0.size()>0){ +// ArrayList reads=(ArrayList) reads0.clone(); +// int removed=0; +// for(int i=0; i0 ? 1 : 0); +// } +// if(r2!=null){ +// int x=TrimRead.trimFast(r2, trimLeft, trimRight, trimq, 1); +// basesTrimmed+=x; +// readsTrimmed+=(x>0 ? 1 : 0); +// } +// } +// +// final int rlen1=(r1==null ? -1 : r1.bases==null ? 0 : r1.bases.length); +// final int rlen2=(r2==null ? -1 : r2.bases==null ? 0 : r2.bases.length); +// +// if(verbose){System.err.println("rlen1="+rlen1+", rlen2="+rlen2);} +// +// if(rlen1=minReadLength){ +// single.add(r1); +// singlesOut++; +// singleBasesOut+=rlen1; +// } +// if(rlen2>=minReadLength){ +// single.add(r2); +// singlesOut++; +// singleBasesOut+=rlen2; +// } +// }else{ +// if(r1!=null){ +// pairsOut++; +// pairBasesOut+=rlen2; +// } +// if(r2!=null){ +// pairsOut++; +// pairBasesOut+=rlen2; +// } +// } +// } +// } +// +// if(rosb!=null){ +// if(verbose){System.err.println("Adding "+single.size()+" to single out.");} +// rosb.add(new ArrayList(single), ln.id); +// single.clear(); +// } +// +// if(ros!=null){ +// if(removed>0){Tools.condenseStrict(reads);} +// ArrayList x=new ArrayList(reads.size()); +// x.addAll(reads); +// if(verbose){System.err.println("Adding "+x.size()+" to pair out.");} +// ros.add(x, ln.id); +// } +// +// cris.returnList(ln, ln.list.isEmpty()); +// ln=cris.nextList(); +// reads0=(ln!=null ? 
ln.list : null); +// } +// cris.returnList(ln, ln.list.isEmpty()); +// +// readsOut+=singlesOut+pairsOut; +// basesOut+=singleBasesOut+pairBasesOut; +// } + + private void process3(final ConcurrentReadStreamInterface cris, final RTextOutputStream3 ros, final RTextOutputStream3 rosb){ + + ListNum ln=cris.nextList(); + ArrayList reads=ln.list; + + final ArrayList pairs=(ros==null ? null : new ArrayList(Shared.READ_BUFFER_LENGTH)); + final ArrayList singles=(rosb==null ? null : new ArrayList(Shared.READ_BUFFER_LENGTH)); + + while(reads!=null && reads.size()>0){ + for(int i=0; i(singles), ln.id); + singles.clear(); + } + + if(ros!=null){ + if(verbose){System.err.println("Adding "+pairs.size()+" to pair out.");} + ros.add(new ArrayList(pairs), ln.id); + pairs.clear(); + } + } + cris.returnList(ln, ln.list.isEmpty()); + + readsOut+=singlesOut+pairsOut; + basesOut+=singleBasesOut+pairBasesOut; + } + + private void process3_fixPairs(final ConcurrentReadStreamInterface cris, final RTextOutputStream3 ros, final RTextOutputStream3 rosb){ + + ListNum ln=cris.nextList(); + ArrayList reads=ln.list; + + final ArrayList pairs=(ros==null ? null : new ArrayList(Shared.READ_BUFFER_LENGTH)); + final ArrayList singles=(rosb==null ? null : new ArrayList(Shared.READ_BUFFER_LENGTH)); + + Read current=null, prev=null; + + while(reads!=null && reads.size()>0){ + for(int i=0; i(singles), ln.id); + singles.clear(); + } + + if(ros!=null){ + if(verbose){System.err.println("Adding "+pairs.size()+" to pair out.");} + ros.add(new ArrayList(pairs), ln.id); + pairs.clear(); + } + } + cris.returnList(ln, ln.list.isEmpty()); + + readsOut+=singlesOut+pairsOut; + basesOut+=singleBasesOut+pairBasesOut; + } + + + private int processPair(Read r1, Read r2, ArrayList pairs, ArrayList singles){ + int removed=0; + readsIn++; + basesIn+=(r1.bases==null ? 0 : r1.bases.length); + if(r2!=null){ + readsIn++; + basesIn+=(r2.bases==null ? 
0 : r2.bases.length); + } + + if(trimLeft || trimRight){ + if(r1!=null){ + int x=TrimRead.trimFast(r1, trimLeft, trimRight, trimq, 1); + basesTrimmed+=x; + readsTrimmed+=(x>0 ? 1 : 0); + } + if(r2!=null){ + int x=TrimRead.trimFast(r2, trimLeft, trimRight, trimq, 1); + basesTrimmed+=x; + readsTrimmed+=(x>0 ? 1 : 0); + } + } + final int rlen1=(r1==null ? -1 : r1.bases==null ? 0 : r1.bases.length); + final int rlen2=(r2==null ? -1 : r2.bases==null ? 0 : r2.bases.length); + if(verbose){System.err.println("rlen="+rlen1+", rlen2="+rlen2);} + + if(rlen1>=minReadLength && rlen2>=minReadLength){ + if(verbose){System.err.println("Sending to pair out:\t"+r1.id+"\t"+r2.id);} + r1.mate=r2; + r2.mate=r1; + r1.setPairnum(0); + r2.setPairnum(1); + if(pairs!=null){pairs.add(r1);} + pairsOut+=2; + pairBasesOut+=(rlen1+rlen2); + }else if(rlen1>=minReadLength){ + if(verbose){System.err.println("Sending r1 to single out:\t"+r1.id+"\t"+(r2==null ? "*" : r2.id));} + r1.mate=null; + r1.setPairnum(0); + if(singles!=null){singles.add(r1);} + singlesOut++; + singleBasesOut+=rlen1; + if(r2!=null){removed++;} + }else if(rlen2>=minReadLength){ + if(verbose){System.err.println("Sending r2 to single out:\t"+(r1==null ? "*" : r1.id)+"\t"+r2.id);} + r2.mate=null; + r2.setPairnum(0); + if(singles!=null){singles.add(r2);} + singlesOut++; + singleBasesOut+=rlen2; + if(r1!=null){removed++;} + }else{ + if(verbose){System.err.println("Removed both reads:\t"+(r1==null ? "*" : r1.id)+"\t"+(r2==null ? 
"*" : r2.id));} + if(r1!=null){removed++;} + if(r2!=null){removed++;} + } + return removed; + } + + + private String in1=null, in2=null; + private String out1=null, out2=null; + private String outsingle=null; + private long maxReads=-1; + public boolean errorState=false; + + long readsIn=0; + long basesIn=0; + long readsOut=0; + long basesOut=0; + long pairsOut=0; + long pairBasesOut=0; + long singlesOut=0; + long singleBasesOut=0; + long readsTrimmed=0; + long basesTrimmed=0; + + private byte trimq=4; + private int minReadLength=20; + private final boolean trimLeft, trimRight; + + private final boolean EA; + private final boolean fixPairs; + + private static PrintStream outstream=System.err; + public static boolean overwrite=false; + public static boolean showSpeed=true; + public static boolean verbose=false; + +} diff --git a/current/jgi/SplitSam4Way.java b/current/jgi/SplitSam4Way.java new file mode 100755 index 0000000..9398431 --- /dev/null +++ b/current/jgi/SplitSam4Way.java @@ -0,0 +1,153 @@ +package jgi; + +import java.io.File; +import java.io.PrintStream; +import java.util.Arrays; + +import stream.SamLine; + +import dna.Gene; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Jul 23, 2013 + * + */ +public class SplitSam4Way { + + public static void main(String[] args){ + new SplitSam4Way(args); + } + + private void printOptions(){ + outstream.println("Syntax:\n"); + outstream.println("java -ea -Xmx128m -cp jgi.SplitSam4Way "); + outstream.println("If you do not want one of the output files, use the word 'null'.\n"); + } + + public SplitSam4Way(String[] args){ + if(args==null || args.length!=5){ + printOptions(); + System.exit(0); + } + + for(String s : args){if(s.startsWith("out=standardout") || s.startsWith("out=stdout")){outstream=System.err;}} + outstream.println("Executing "+getClass().getName()+" "+Arrays.toString(args)+"\n"); + + Timer t=new Timer(); + 
t.start(); + long reads=0, bases=0; + long preads=0, mreads=0, creads=0, ureads=0; + + String fin=args[0]; + String fplus=args[1]; + String fminus=args[2]; + String fchimeric=args[3]; + String funmapped=args[4]; + + TextFile tf=new TextFile(fin, true, false); + TextStreamWriter plus=("null".equalsIgnoreCase(fplus) ? null : new TextStreamWriter(fplus, true, false, true, FileFormat.SAM)); + TextStreamWriter minus=("null".equalsIgnoreCase(fminus) ? null : new TextStreamWriter(fminus, true, false, true, FileFormat.SAM)); + TextStreamWriter chimeric=("null".equalsIgnoreCase(fchimeric) ? null : new TextStreamWriter(fchimeric, true, false, true, FileFormat.SAM)); + TextStreamWriter unmapped=("null".equalsIgnoreCase(funmapped) ? null : new TextStreamWriter(funmapped, true, false, true, FileFormat.SAM)); + + if(plus!=null){plus.start();} + if(minus!=null){minus.start();} + if(chimeric!=null){chimeric.start();} + if(unmapped!=null){unmapped.start();} + + for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ + if(line.charAt(0)=='@'){ + if(plus!=null){plus.println(line);} + if(minus!=null){minus.println(line);} + if(chimeric!=null){chimeric.println(line);} + if(unmapped!=null){unmapped.println(line);} + }else{ + SamLine sl=new SamLine(line); + reads++; +// bases+=sl.seq.length(); + bases+=sl.seq.length; + + if(!sl.mapped() || !sl.nextMapped() || !sl.hasMate() || !sl.primary()){ + if(unmapped!=null){unmapped.println(line);} + ureads++; +// System.out.println("unmapped: "+sl.mapped()+", "+sl.nextMapped()+", "+sl.hasMate()+", "+!sl.primary()); + }else if(!sl.pairedOnSameChrom() || sl.strand()==sl.nextStrand()){ + if(chimeric!=null){chimeric.println(line);} + creads++; +// System.out.println("chimeric: "+sl.pairedOnSameChrom()+", "+(sl.strand()==sl.nextStrand())+", "+sl.strand()+", "+sl.nextStrand()+", "+new String(sl.rname())+", "+new String(sl.rnext())); + }else if((sl.firstFragment() ? 
sl.strand() : sl.nextStrand())==Gene.PLUS){ + if(plus!=null){plus.println(line);} + preads++; + }else if((sl.firstFragment() ? sl.strand() : sl.nextStrand())==Gene.MINUS){ + if(minus!=null){minus.println(line);} + mreads++; + }else{ + throw new RuntimeException("Unhandled case: "+sl.firstFragment()+", "+sl.lastFragment()+", "+sl.strand()+", "+sl.nextStrand()+"\n"+sl+"\n"); + } + } + } + + +// String line2=tf.nextLine(); +// SamLine sl1=new SamLine(line); +// SamLine sl2=(line2==null ? null : new SamLine(line2)); +// if(sl2==null){ +// +// }else{ +// if(sl1.mapped()){ +// if(sl2.mapped()){ +// if(sl1.strand()==sl2.strand()){ +// chimeric.println(line); +// chimeric.println(line2); +// }else if(sl1.strand()==Gene.PLUS){ +// plus.println(line); +// plus.println(line2); +// }else{ +// minus.println(line); +// minus.println(line2); +// } +// }else{ +// +// } +// }else{ +// +// } +// } +// } +// } + + if(plus!=null){plus.poisonAndWait();} + if(minus!=null){minus.poisonAndWait();} + if(chimeric!=null){chimeric.poisonAndWait();} + if(unmapped!=null){unmapped.poisonAndWait();} + t.stop(); + + + double rpnano=reads/(double)(t.elapsed); + double bpnano=bases/(double)(t.elapsed); + + String rpstring=(reads<100000 ? ""+reads : reads<100000000 ? (reads/1000)+"k" : (reads/1000000)+"m"); + String bpstring=(bases<100000 ? ""+bases : bases<100000000 ? 
(bases/1000)+"k" : (bases/1000000)+"m"); + + while(rpstring.length()<8){rpstring=" "+rpstring;} + while(bpstring.length()<8){bpstring=" "+bpstring;} + + outstream.println("Time: \t"+t); + outstream.println("Reads Processed: "+rpstring+" \t"+String.format("%.2fk reads/sec", rpnano*1000000)); + outstream.println("Bases Processed: "+bpstring+" \t"+String.format("%.2fm bases/sec", bpnano*1000)); + outstream.println("Plus Reads: "+preads); + outstream.println("Minus Reads: "+mreads); + outstream.println("Chimeric Reads: "+creads); + outstream.println("Unmapped Reads: "+ureads); + + + } + + private PrintStream outstream=System.err; + +} diff --git a/current/jgi/TestLargeKmer.java b/current/jgi/TestLargeKmer.java new file mode 100755 index 0000000..fc27439 --- /dev/null +++ b/current/jgi/TestLargeKmer.java @@ -0,0 +1,195 @@ +package jgi; + +import java.util.ArrayList; +import java.util.Arrays; + +import kmer.KCountArray2; +import kmer.KmerCount3; + +import stream.ConcurrentGenericReadInputStream; +import stream.FastqReadInputStream; +import stream.Read; + +import align2.ListNum; +import dna.AminoAcid; +import dna.Timer; +import fileIO.ReadWrite; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class TestLargeKmer { + + public static void main(String args[]){ + Timer t=new Timer(); + t.start(); + + String fname1=args[0]; + String fname2=(args.length>4 || args[1].contains(".") ? args[1] : null); + int k=Integer.parseInt(args[args.length-3]); + int cbits=Integer.parseInt(args[args.length-2]); + int k2=Integer.parseInt(args[args.length-1]); + + KCountArray2 counts=KmerCount3.countFastq(fname1, fname2, k, cbits); + long[] counts2=countK2(fname1, fname2, k, counts, k2); + + t.stop(); + System.out.println("Finished counting; time = "+t+"\n"); + + for(int i=0; i=1 && k<20); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + FastqReadInputStream fris1=new FastqReadInputStream(fname1, false, false); + FastqReadInputStream fris2=(fname2==null ? 
null : new FastqReadInputStream(fname2, false, false)); + ConcurrentGenericReadInputStream cris=new ConcurrentGenericReadInputStream(fris1, fris2, KmerCount3.maxReads); + + new Thread(cris).start(); + System.err.println("Started cris"); + boolean paired=cris.paired(); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + + final long[] upperBound=new long[BOUND_LEN]; //Lowest upper bound provable of kmer count + final int[] ring=new int[k2-k+1]; + final int[] subcount=new int[BOUND_LEN]; + final int maxValue=subcount.length-1; + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + len=0; + kmer=0; + Arrays.fill(subcount, 0); + + byte[] bases=r.bases; + byte[] quals=r.quality; + for(int i=0; i=k){ + value=counts1.read(kmer); + } + } + value=min(value, maxValue); + + ring[ringpos]=value; + subcount[value]++; + + if(i>=ring.length){ + subcount[old]--; + } + + if(len>=k2){ + int sub=0; + while(sub=k){ + value=counts1.read(kmer); + } + } + value=min(value, maxValue); + + ring[ringpos]=value; + subcount[value]++; + + if(i>=ring.length){ + subcount[old]--; + } + + if(len>=k2){ + int sub=0; + while(suby ? 
x : y;} + + public static final int BOUND_LEN=256; + +} diff --git a/current/kmer/AbstractKmerTable.java b/current/kmer/AbstractKmerTable.java new file mode 100755 index 0000000..013c2ed --- /dev/null +++ b/current/kmer/AbstractKmerTable.java @@ -0,0 +1,82 @@ +package kmer; + +import dna.AminoAcid; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Oct 23, 2013 + * + */ +public abstract class AbstractKmerTable { + + /** Returns count */ + abstract int increment(long kmer); + + /** Returns number of entries created */ + abstract int incrementAndReturnNumCreated(final long kmer); + + abstract int set(long kmer, int value); + + /** Returns number of kmers added */ + public abstract int setIfNotPresent(long kmer, int value); + + abstract Object get(long kmer); + + public abstract int getCount(long kmer); + + public abstract boolean contains(long kmer); + +// abstract boolean insert(KmerLink n); + + public abstract void rebalance(); + + abstract void resize(); + + public abstract long size(); + public abstract int arrayLength(); + abstract boolean canResize(); + public abstract boolean canRebalance(); + + public abstract boolean dumpKmersAsText(TextStreamWriter tsw, int k); + + static final StringBuilder toText(long kmer, int count, int k){ + StringBuilder sb=new StringBuilder(k+10); + return toText(kmer, count, k, sb); + } + +// static final StringBuilder toText(long kmer, int count, int k, StringBuilder sb){ +// for(int i=0; i>=2; +// } +// sb.reverse(); +// sb.append('\t'); +// sb.append(count); +// return sb; +// } + + static final StringBuilder toText(long kmer, int count, int k, StringBuilder sb){ + for(int i=k-1; i>=0; i--){ + int x=(int)((kmer>>(2*i))&3); + sb.append((char)AminoAcid.numberToBase[x]); + } + sb.append('\t'); + sb.append(count); + return sb; + } + + static void appendKmerText(long kmer, int count, int k, StringBuilder sb){ + sb.setLength(0); + toText(kmer, count, k, sb); + sb.append('\n'); + } + + + /** For buffered 
tables. */ + long flush(){ + throw new RuntimeException("Not supported."); + } + +} diff --git a/current/kmer/HashArray.java b/current/kmer/HashArray.java new file mode 100755 index 0000000..61214b1 --- /dev/null +++ b/current/kmer/HashArray.java @@ -0,0 +1,296 @@ +package kmer; + +import java.util.ArrayList; +import java.util.Arrays; + + +import fileIO.TextStreamWriter; + +import align2.Tools; + +/** + * Stores kmers in a long[] and counts in an int[], with a victim cache. + * @author Brian Bushnell + * @date Oct 25, 2013 + * + */ +public final class HashArray extends AbstractKmerTable { + + public HashArray(int initialSize, boolean autoResize_){ + if(initialSize>1){ + initialSize=(int)Tools.min(maxPrime, Primes.primeAtLeast(initialSize)); + }else{ + initialSize=1; + } + prime=initialSize; + sizeLimit=(long)(sizeLimit=(long)(maxLoadFactor*prime)); + array=new long[prime+extra]; + counts=new int[prime+extra]; + victims=new HashForest(Tools.max(10, initialSize/8), autoResize_); + Arrays.fill(array, -1); + autoResize=autoResize_; + } + + + + int increment(final long kmer){ + int cell=(int)(kmer%prime); + + for(final int max=cell+extra; cellsizeLimit){resize();} + return 1; + } + } + return victims.increment(kmer); + } + + int incrementAndReturnNumCreated(final long kmer){ + int cell=(int)(kmer%prime); + + for(final int max=cell+extra; cellsizeLimit){resize();} + return 1; + } + } + return victims.incrementAndReturnNumCreated(kmer); + } + + + int set(long kmer, int value){ + int cell=(int)(kmer%prime); + + for(final int max=cell+extra; cellsizeLimit){resize();} + return 1; + } + } + return victims.set(kmer, value); + } + + public int setIfNotPresent(long kmer, int value){ + int cell=(int)(kmer%prime); + + for(final int max=cell+extra; cellsizeLimit){resize();} + return 1; + } + } +// System.err.println("size="+size+", prime="+prime+", limit="+sizeLimit); + return victims.setIfNotPresent(kmer, value); + } + + final Object get(long kmer){ + throw new 
RuntimeException("Unimplemented"); + } + + public final int getCount(long kmer){ + int cell=(int)(kmer%prime); + + for(final int max=cell+extra; cell list=new ArrayList(1000); +// for(int i=0; i "+size+", "+victims.size; + + sizeLimit=(long)(maxLoadFactor*prime); + } + + @Override + public boolean dumpKmersAsText(TextStreamWriter tsw, int k){ + tsw.print("HashArray:\n"); + for(int i=0; i=buflen){ + return incrementBuffer(way); + } + return 0; + } + + private int incrementBuffer(final int way){ + final int size=sizes[way]; + if(size<1){return 0;} + sizes[way]=0; + final long[] buffer=buffers[way]; + int added=0; + final AbstractKmerTable table=tables[way]; + synchronized(table){ + for(int i=0; i1){ + initialSize=(int)Tools.min(maxPrime, Primes.primeAtLeast(initialSize)); + }else{ + initialSize=1; + } + prime=initialSize; + sizeLimit=(long) (initialSize*resizeMult); + array=new KmerNode[prime]; + autoResize=autoResize_; + } + + int increment(long kmer){ + final int cell=(int)(kmer%prime); + KmerNode n=array[cell], prev=null; + while(n!=null && n.pivot!=kmer){ + prev=n; + n=(kmersizeLimit){resize();} + }else{ + n.count++; + if(n.count<0){n.count=Integer.MAX_VALUE;} + } + return n.count; + } + + int incrementAndReturnNumCreated(long kmer){ + final int cell=(int)(kmer%prime); + KmerNode n=array[cell], prev=null; + while(n!=null && n.pivot!=kmer){ + prev=n; + n=(kmersizeLimit){resize();} + return 1; + }else{ + n.count++; + if(n.count<0){n.count=Integer.MAX_VALUE;} + return 0; + } + } + + int set(long kmer, int value){ + int x=1, cell=(int)(kmer%prime); + final KmerNode n=array[cell]; + if(n==null){ + array[cell]=new KmerNode(kmer, value); + }else{ + x=n.set(kmer, value); + } + size+=x; + if(autoResize && size>sizeLimit){resize();} + return x; + } + + public int setIfNotPresent(long kmer, int value){ + int x=1, cell=(int)(kmer%prime); + final KmerNode n=array[cell]; + if(n==null){ + array[cell]=new KmerNode(kmer, value); + }else{ + x=n.setIfNotPresent(kmer, value); + } + 
size+=x; + if(autoResize && size>sizeLimit){resize();} + return x; + } + + final KmerNode get(long kmer){ +// int cell=(int)(kmer%prime); +// KmerNode n=array[cell]; +// return n==null ? null : n.get(kmer); + + int cell=(int)(kmer%prime); + KmerNode n=array[cell]; + while(n!=null && n.pivot!=kmer){ + n=(kmer list=new ArrayList(1000); + for(int i=0; i0){ + StringBuilder sb=new StringBuilder(); + tsw.print(node.dumpKmersAsText(sb, k)); + } + } + return true; + } + + /* (non-Javadoc) + * @see jgi.AbstractKmerTable#size() + */ + @Override + public long size() {return size;} + + /* (non-Javadoc) + * @see jgi.AbstractKmerTable#arrayLength() + */ + @Override + public int arrayLength() {return array.length;} + + /* (non-Javadoc) + * @see jgi.AbstractKmerTable#canResize() + */ + @Override + boolean canResize() {return true;} + + /* (non-Javadoc) + * @see jgi.AbstractKmerTable#canRebalance() + */ + @Override + public boolean canRebalance() {return true;} + + KmerNode[] array; + int prime; + long size=0; + long sizeLimit; + final boolean autoResize; + + final static int maxPrime=(int)Primes.primeAtMost(Integer.MAX_VALUE); + final static float resizeMult=2.5f; //Resize by a minimum of this much + final static float minLoadFactor=0.75f; //Resize by enough to get the load above this factor + final static float maxLoadFactor=2.5f; //Resize by enough to get the load under this factor + final static float minLoadMult=1/minLoadFactor; + final static float maxLoadMult=1/maxLoadFactor; + + + +} diff --git a/current/kmer/KCountArray.java b/current/kmer/KCountArray.java new file mode 100755 index 0000000..c3e8317 --- /dev/null +++ b/current/kmer/KCountArray.java @@ -0,0 +1,445 @@ +package kmer; + +import java.util.concurrent.atomic.AtomicIntegerArray; + +import dna.AminoAcid; +import dna.Data; +import align2.Shared; +import align2.Tools; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + */ +public abstract class KCountArray { + + public static KCountArray makeNew(long cells_, 
int cbits_, int gap_){ + return makeNew(cells_+1, cells_, cbits_, gap_, 1); + } + + public static KCountArray makeNew(long keys_, long cells_, int cbits_, int gap_, int hashes_){ + return makeNew(keys_, cells_, cbits_, gap_, hashes_, null); + } + +// public static KCountArray makeNew(long keys_, long cells_, int cbits_, int gap_, int hashes_, boolean prefilter_){ +// if(!prefilter_){ +// return makeNew(keys_, cells_, cbits_, gap_, hashes_, 0, 0); +// }else{ +// long totalbits=cells_*cbits_; +// long prebits=totalbits/4; +// long postCells=(totalbits-prebits+cbits_-1)/cbits_; +// int prehashes=(hashes_+1)/2; +// return makeNew(keys_, postCells, cbits_, gap_, hashes_, prebits, prehashes); +// } +// } + + public static KCountArray makeNew(long keys_, long cells_, int cbits_, int gap_, int hashes_, KCountArray prefilter){ +// assert(false) : keys_+", "+cells_+", "+cbits_+", "+gap_; + assert(keys_>=cells_) : keys_+", "+cells_; +// assert(cells_>1) : cells_; + KCountArray kca; + if(keys_<=cells_){ + kca=new KCountArray3(cells_, cbits_, gap_); +// return new KCountArray4(cells_, cbits_, gap_, 2); + }else{ +// kca=new KCountArray4(cells_, cbits_, gap_, hashes_); //Single-threaded; most accurate +// kca=new KCountArray4MT(cells_, cbits_, gap_, hashes_); //Fast +// kca=new KCountArray5MT(cells_, cbits_, gap_, hashes_); //Less efficient than 4MT +// kca=new KCountArray6MT(cells_, cbits_, gap_, hashes_); //Fastest but substantial drop in accuracy + +// if(prefilter==null){ +// kca=new KCountArray7MT(cells_, cbits_, gap_, hashes_); //Like 4MT but uses primes +// }else{ +// kca=new KCountArray8MT(cells_, cbits_, gap_, hashes_, prefilter); //Like 7MT but uses prefilter +// } + kca=new KCountArray7MTA(cells_, cbits_, gap_, hashes_, prefilter); //Like 4MT but uses primes + +// if(prefilter==null){ +// kca=new KCountArray9MT(cells_, cbits_, gap_, hashes_); //Like 7MT but uses canonical kmers +// }else{ +// kca=new KCountArray10MT(cells_, cbits_, gap_, hashes_, prefilter); //Like 8MT 
but uses canonical kmers +// } + } + kca.initialize(); + + return kca; + } + + protected KCountArray(long cells_, int cbits_){ + this(cells_, cbits_, 0); + } + + protected KCountArray(final long cells_, int cbits_, int gap_){ + gap=gap_; + assert(cbits_<=32); + assert(Integer.bitCount(cbits_)==1); + assert(Long.bitCount(cells_)==1) || this.getClass()==KCountArray7MT.class; + + numArrays=64; + arrayBits=31-Integer.numberOfLeadingZeros(numArrays); + arrayMask=numArrays-1; + + while(cbits_*cells_<32*numArrays){ + assert(false) : cells_+", "+cbits_+", "+numArrays+", "+(cbits_*cells_)+"<"+(32*numArrays); + cbits_*=2; + } //Increases bits per cell so that at minimum each array is size 1 + + assert(cbits_<=32); + + cells=cells_; + cellBits=cbits_; + valueMask=(cellBits==32 ? Integer.MAX_VALUE : ~((-1)<=b){return b;} + int c=readRight(key, k, makeCanonical); + if(c>=b){return b;} + return (int)(((long)a+(long)c)/2); +// return max(a, c); +// int mid=Tools.min(a, b, c); +// System.out.println("a="+a+", b="+b+", c="+c+" -> "+mid); +// return mid; + } + + public final int readPreciseMin(long key, int k, boolean makeCanonical){ + assert(k<=32); + int b=read(makeCanonical ? 
makeCanonical2(key, k) : key); + if(b<1){return b;} + int a=readLeft(key, k, makeCanonical); + if(a<1){return a;} + int c=readRight(key, k, makeCanonical); + return Tools.min(a, b, c); + } + + /** + * @param key Kmer to evaluate + * @return Sum of counts of all 4 possible left-adjacent kmers + */ + public int readLeft(long key, int k, boolean makeCanonical){throw new RuntimeException("Unsupported.");} + /** + * @param key Kmer to evaluate + * @return Sum of counts of all 4 possible right-adjacent kmers + */ + public int readRight(long key, int k, boolean makeCanonical){throw new RuntimeException("Unsupported.");} + /** + * @param key Kmer to evaluate + * @return Array of counts of all 4 possible left-adjacent kmers + */ + public int[] readAllLeft(final long key, final int k, boolean makeCanonical, int[] rvec){throw new RuntimeException("Unsupported.");} + /** + * @param key Kmer to evaluate + * @return Array of counts of all 4 possible right-adjacent kmers + */ + public int[] readAllRight(final long key, final int k, boolean makeCanonical, int[] rvec){throw new RuntimeException("Unsupported.");} + + public void increment(long[] keys){ + synchronized(this){ + for(long key : keys){ + increment(key); + } + } + } + + /** Returns incremented value. Optional method. */ + public abstract int incrementAndReturn(long key, int incr); + + /** Returns decremented value. Optional method. */ + public int decrementAndReturn(long key, int incr){ + throw new RuntimeException("This class "+getClass().getName()+" does not support decrementAndReturn."); + } + + /** Returns unincremented value */ + public abstract int incrementAndReturnUnincremented(long key, int incr); + + public abstract long[] transformToFrequency(); + public final long[] transformToFrequency(int[][] matrix){ + long[] freq=new long[100000]; + int maxFreq=freq.length-1; + + if(cellBits!=32){ + assert(cellBits>0); + for(int[] array : matrix){ + for(int i=0; i0 ? 
"gap = "+gap+" \t " : "")+"mem = "+mem()+" \tcells = "+toKMG(cells)+" \tused = "+String.format("%.3f%%",usedFraction()*100); + } + + public final String toShortString(int hashes){ + return (gap>0 ? "gap = "+gap+" \t " : "")+("hashes = "+hashes+" \t ")+ + "mem = "+mem()+" \tcells = "+toKMG(cells)+" \tused = "+String.format("%.3f%%",usedFraction()*100); + } + + public final String toString(){ + return description().toString(); + } + + public abstract String toContentsString(); + + public abstract double usedFraction(); + + public abstract double usedFraction(int mindepth); + + public abstract long cellsUsed(int mindepth); + + public final double estimateUniqueKmers(int hashes){ + double f=usedFraction(); + double f2=(1-Math.pow(1-f, 1.0/hashes)); + double n=(-cells)*Math.log(1-f2); + return n; + } + + public final double estimateUniqueKmers(int hashes, int mindepth){ +// assert(false) : this.getClass().getName(); + double f=usedFraction(mindepth); + double f2=(1-Math.pow(1-f, 1.0/hashes)); + double n=(-cells)*Math.log(1-f2); + return n; + } + + public final String mem(){ + long mem=(cells*cellBits)/8; + if(mem<(1<<20)){ + return (String.format("%.2f KB", mem*1d/(1<<10))); + }else if(mem<(1<<30)){ + return (String.format("%.2f MB", mem*1d/(1<<20))); + }else{ + return (String.format("%.2f GB", mem*1d/(1<<30))); + } + } + + public static String toKMG(long x){ + double div=1; + String ext=""; + if(x>10000000000L){ + div=1000000000L; + ext="B"; + }else if(x>10000000){ + div=1000000; + ext="M"; + }else if(x>100000){ + div=1000; + ext="K"; + } + return String.format("%.2f", x/div)+ext; + } + +// long hash(long x, int y){throw new RuntimeException("Not supported.");} + abstract long hash(long x, int y); + + public static final int min(int x, int y){return xy ? x : y;} + public static final long min(long x, long y){return xy ? x : y;} + + /** Any necessary initialization. */ + public void initialize(){} + + /** Any necessary shutdown steps. 
*/ + public void shutdown(){} + + public final long cells; + public final int cellBits; + /** Originally this was different than valueMask in the case that valueMask was negative, but now they are the same. */ + public final int maxValue; + public final int gap; //Set this for convenience on gapped tables to make sure you're using the right table. + + protected final int cellsPerWord; + protected final int indexShift; + protected final int cellMask; + protected final int valueMask; + + protected static int minArrays=calcMinArrays(); + protected final int arrayBits; + protected final int numArrays; + protected final int arrayMask; + +// protected static final int arrayBits=6; +// protected static final int numArrays=1<3 && k<=32); +// short a=(short)(key&canonMask); +// short b=AminoAcid.rcompBinaryTable[(int)((key>>(2*(k-4)))&canonMask)]; +//// System.out.println("x="+Long.toBinaryString(key)+"\na="+Integer.toBinaryString(a)+"\nb="+Integer.toBinaryString(b)+"\n"+(a>=b)); +//// assert(a>=b || isCanonical(AminoAcid.reverseComplementBinaryFast(key, k), k)); +// return a>=b; +// } + +// public static final boolean isCanonical(long key, int k){ +// assert(k>3 && k<=32); +// short a=(short)(key&canonMask); +// short b=AminoAcid.rcompBinaryTable[(int)((key>>(2*(k-4)))&canonMask)]; +//// System.out.println("x="+Long.toBinaryString(key)+"\na="+Integer.toBinaryString(a)+"\nb="+Integer.toBinaryString(b)+"\n"+(a>=b)); +//// assert(a>=b || isCanonical(AminoAcid.reverseComplementBinaryFast(key, k), k)); +// return a>=b; +// } + + public static final boolean isCanonical(long key, int k){ + assert(k>3 && k<=32); + long b=AminoAcid.reverseComplementBinaryFast(key, k); + return key>=b; + } + + /** Assumes that the key is not canonical */ + public static final long makeCanonical(final long key, final int k){ + assert(k>3 && k<=32); +// assert(!isCanonical(key, k)); + final long r=AminoAcid.reverseComplementBinaryFast(key, k); + assert(r>=key); +// assert(isCanonical(r, k)); +// 
assert(AminoAcid.reverseComplementBinaryFast(r, k)==key); + return r; + } + + + public static final long makeCanonical2(final long key, final int k){ + assert(k>3 && k<=32); + if(isCanonical(key, k)){return key;} + long r=AminoAcid.reverseComplementBinaryFast(key, k); +// assert(isCanonical(r, k)) : k+"\n"+Long.toBinaryString(key)+"\n"+Long.toBinaryString(r)+"\n"+Long.toBinaryString(AminoAcid.reverseComplementBinaryFast(r, k)); +// assert(AminoAcid.reverseComplementBinaryFast(r, k)==key) : k+"\n"+Long.toBinaryString(key)+"\n"+Long.toBinaryString(r)+"\n"+Long.toBinaryString(AminoAcid.reverseComplementBinaryFast(r, k)); + return r; + } + +// private static final short[] canonMask={0, 3, 15, 63, 255, 1023, 4095, 16383}; + private static final long canonK=4; + static final long canonMask=(1<<(canonK*2))-1; //e.g. 255 for k=4 + +} diff --git a/current/kmer/KCountArray2.java b/current/kmer/KCountArray2.java new file mode 100755 index 0000000..b840c80 --- /dev/null +++ b/current/kmer/KCountArray2.java @@ -0,0 +1,227 @@ +package kmer; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + */ +public class KCountArray2 { + + public static void main(String[] args){ + KCountArray2 kca=new KCountArray2(1024, 16); + } + + public KCountArray2(long cells_, int bits_){ + this(cells_, bits_, 0); + } + + public KCountArray2(long cells_, int bits_, int gap_){ + gap=gap_; + assert(bits_<=32); + assert(Integer.bitCount(bits_)==1); + assert(Long.bitCount(cells_)==1); + + while(bits_*cells_<32*numArrays){ + assert(false); + bits_*=2; + } //Increases bits per cell so that at minimum each array is size 1 + + assert(bits_!=32); + + cells=cells_; + cellBits=bits_; + valueMask=~((-1)<>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed/(double)cells;} + + public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + long count=0; + for(int[] 
array : matrix){ + if(array!=null){ + for(int word : array){ + while(word>0){ + int x=word&valueMask; + if(x>=mindepth){count++;} + word>>>=cellBits; + } + } + } + } + return count; + } + + public String mem(){ + long mem=(cells*cellBits)/8; + if(mem<(1<<20)){ + return (String.format("%.2f KB", mem*1d/(1<<10))); + }else if(mem<(1<<30)){ + return (String.format("%.2f MB", mem*1d/(1<<20))); + }else{ + return (String.format("%.2f GB", mem*1d/(1<<30))); + } + } + + public static final int min(int x, int y){return xy ? x : y;} + public static final long min(long x, long y){return xy ? x : y;} + + private long cellsUsed; + + public final long cells; + public final int cellBits; + public final int maxValue; + public final int gap; //Set this for convenience on gapped tables to make sure you're using the right table. + + private final int cellsPerWord; + private final int indexShift; + private final int valueMask; + private final int[][] matrix; + + private static final int arrayBits=2; + private static final int numArrays=1<>>=arrayBits; + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + word=(value<>>=arrayBits; + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + int value=((word>>>cellShift)&valueMask); + if(value==0 && incr>0){cellsUsed++;} + else if(incr<0 && value+incr==0){cellsUsed--;} + value=min(value+incr, maxValue); + word=(value<>>=arrayBits; + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + final int value=((word>>>cellShift)&valueMask); + final int value2=min(value+incr, maxValue); + word=(value2<>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed/(double)cells;} + + public double usedFraction(int mindepth){return 
cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + long count=0; + for(int[] array : matrix){ + if(array!=null){ + for(int word : array){ + while(word>0){ + int x=word&valueMask; + if(x>=mindepth){count++;} + word>>>=cellBits; + } + } + } + } + return count; + } + + @Override + long hash(long x, int y) { + assert(false) : "Unsupported."; + return x; + } + + private long cellsUsed; + private final int[][] matrix; + +} diff --git a/current/kmer/KCountArray4.java b/current/kmer/KCountArray4.java new file mode 100755 index 0000000..31b7f5d --- /dev/null +++ b/current/kmer/KCountArray4.java @@ -0,0 +1,367 @@ +package kmer; + +import java.util.Random; + +import dna.Timer; + + +/** + * + * Uses hashing rather than direct-mapping to support longer kmers. + * + * @author Brian Bushnell + * @date Aug 17, 2012 + * + */ +public class KCountArray4 extends KCountArray { + + public static void main(String[] args){ + long cells=Long.parseLong(args[0]); + int bits=Integer.parseInt(args[1]); + int gap=Integer.parseInt(args[2]); + int hashes=Integer.parseInt(args[3]); + + verbose=false; + + KCountArray4 kca=new KCountArray4(cells, bits, gap, hashes); + + System.out.println(kca.read(0)); + kca.increment(0); + System.out.println(kca.read(0)); + kca.increment(0); + System.out.println(kca.read(0)); + System.out.println(); + + System.out.println(kca.read(1)); + kca.increment(1); + System.out.println(kca.read(1)); + kca.increment(1); + System.out.println(kca.read(1)); + System.out.println(); + + System.out.println(kca.read(100)); + kca.increment(100); + System.out.println(kca.read(100)); + kca.increment(100); + System.out.println(kca.read(100)); + kca.increment(100); + System.out.println(kca.read(100)); + System.out.println(); + + + System.out.println(kca.read(150)); + kca.increment(150); + System.out.println(kca.read(150)); + System.out.println(); + + } + + public KCountArray4(long cells_, int bits_, int gap_, int hashes_){ + super(cells_, bits_, gap_); + long 
words=cells/cellsPerWord; + assert(words/numArrays<=Integer.MAX_VALUE); + int wordsPerArray=(int)(words/numArrays); + hashes=hashes_; +// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes); +// assert(false); + matrix=new int[numArrays][wordsPerArray]; + assert(hashes>0 && hashes<=hashMasks.length); + } + + public int read(final long rawKey){ + if(verbose){System.err.println("Reading raw key "+rawKey);} + long key2=hash(rawKey, 0); + int min=readHashed(key2); + for(int i=1; i0; i++){ + if(verbose){System.err.println("Reading. i="+i+", key2="+key2);} + key2=Long.rotateRight(key2, hashBits); + key2=hash(key2, i); + if(verbose){System.err.println("Rot/hash. i="+i+", key2="+key2);} + min=min(min, readHashed(key2)); + } + return min; + } + + private int readHashed(long key){ + if(verbose){System.err.print("Reading hashed key "+key);} + key=((key&Long.MAX_VALUE)%(cells-1)); +// System.out.println("key="+key); + int arrayNum=(int)(key&arrayMask); +// System.out.println("array="+arrayNum); + key>>>=arrayBits; +// System.out.println("key2="+key); + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); +// assert(false) : indexShift; +// System.out.println("index="+index); + int word=array[index]; +// System.out.println("word="+Integer.toHexString(word)); + assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask))); +// int cellShift=(int)(cellBits*(key&cellMask)); + int cellShift=(int)(cellBits*key); + if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));} +// System.out.println("cellShift="+cellShift); + return (int)((word>>>cellShift)&valueMask); + } + + public void write(final long key, int value){ + throw new RuntimeException("Not allowed for this class."); + } + + public int incrementAndReturn(final long rawKey, int incr){ +// verbose=(rawKey==32662670693L); + 
if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");} +// verbose=true; + assert(incr>0); + + long key2=rawKey; + if(hashes==1){ + key2=hash(key2, 0); + int x=incrementHashedIfAtMost(key2, incr, maxValue-1); + assert(x>=incr) : "original=?, new should be >="+(incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey; + return x; + } + + final int min=read(rawKey); + if(min>=maxValue){return maxValue;} + + assert(key2==rawKey); + for(int i=0; i0); + + long key2=rawKey; + if(hashes==1){ + key2=hash(key2, 0); + int x=incrementHashedIfAtMost(key2, incr, maxValue-1); + assert(x>=incr) : "original=?, new should be >="+(incr)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey; + return x; + } + + final int min=read(rawKey); + if(min>=maxValue){return maxValue;} + + assert(key2==rawKey); + for(int i=0; i>>=arrayBits; + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + int value=((word>>>cellShift)&valueMask); + if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+value+", limit="+lim);} + if(value>lim){return value;} + if(value==0 && incr>0){cellsUsed++;} + value=min(value+incr, maxValue); + word=(value<0); + int arrayNum=(int)(key&arrayMask); + key>>>=arrayBits; + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + int value=((word>>>cellShift)&valueMask); + if(value==0 && incr>0){cellsUsed++;} + value=min(value+incr, maxValue); + word=(value<>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed/(double)cells;} + + public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + long count=0; + for(int[] array : matrix){ + if(array!=null){ + for(int word : array){ + while(word>0){ + int 
x=word&valueMask; + if(x>=mindepth){count++;} + word>>>=cellBits; + } + } + } + } + return count; + } + + + final long hash(long key, int row){ + int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); +// int cell=(int)(hashCellMask&(key)); + + if(row==0){//Doublehash only first time + key=key^hashMasks[(row+4)%hashMasks.length][cell]; + cell=(int)(hashCellMask&(key>>4)); +// cell=(int)(hashCellMask&(key>>hashBits)); +// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); + } + + return key^hashMasks[row][cell]; + } + + /** + * @param i + * @param j + * @return + */ + private static long[][] makeMasks(int rows, int cols) { + + long seed; + synchronized(KCountArray4.class){ + seed=counter; + counter++; + } + + Timer t=new Timer(); + t.start(); + long[][] r=new long[rows][cols]; + Random randy=new Random(seed); + for(int i=0; i200000000L){System.out.println("Mask-creation time: "+t);} + return r; + } + + + /** + * @param cols + * @param randy + * @return + */ + private static void fillMasks(long[] r, Random randy) { +// for(int i=0; i16){ + x&=(~(1L<16){ + x&=(~(1L<<(randy.nextInt(32)+32))); + } + +// System.out.print("."); +// y=(((int)(x&mask))^i); + y=(((int)(x&mask))); + z=(int)((x>>hashBits)&mask); + if(count1[y]>0 || count2[z]>0){ + x=0; + } + } +// System.out.println(Long.toBinaryString(x)); + r[i]=(x&Long.MAX_VALUE); + count1[y]++; + count2[z]++; + } + + } + + public long cellsUsed(){return cellsUsed;} + + private long cellsUsed; + private final int[][] matrix; + private final int hashes; + + + private static final int hashBits=6; + private static final int hashArrayLength=1<=Integer.MAX_VALUE){x=Integer.MAX_VALUE-3;} + assert(x<=Integer.MAX_VALUE); + wordsPerArray=(int)(x); + cellsPerArray=cells/numArrays; + cellMod=cellsPerArray-1; + hashes=hashes_; +// System.out.println("cells="+cells+", words="+words+", wordsPerArray="+wordsPerArray+", numArrays="+numArrays+", hashes="+hashes); +// assert(false); + matrix=new int[numArrays][]; + assert(hashes>0 && 
hashes<=hashMasks.length); + } + + public int read(final long rawKey){ + assert(finished); + if(verbose){System.err.println("Reading raw key "+rawKey);} + long key2=hash(rawKey, 0); + int min=readHashed(key2); + for(int i=1; i0; i++){ + if(verbose){System.err.println("Reading. i="+i+", key2="+key2);} + key2=Long.rotateRight(key2, hashBits); + key2=hash(key2, i); + if(verbose){System.err.println("Rot/hash. i="+i+", key2="+key2);} + min=min(min, readHashed(key2)); + } + return min; + } + + private int readHashed(long key){ + if(verbose){System.err.print("Reading hashed key "+key);} +// System.out.println("key="+key); + int arrayNum=(int)(key&arrayMask); + key=(key>>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); +// System.out.println("array="+arrayNum); +// System.out.println("key2="+key); + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); +// assert(false) : indexShift; +// System.out.println("index="+index); + int word=array[index]; +// System.out.println("word="+Integer.toHexString(word)); + assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask))); +// int cellShift=(int)(cellBits*(key&cellMask)); + int cellShift=(int)(cellBits*key); + if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));} +// System.out.println("cellShift="+cellShift); + return (int)((word>>>cellShift)&valueMask); + } + + public void write(final long key, int value){ + throw new RuntimeException("Not allowed for this class."); + } + +// @Override +// /** This should increase speed by doing the first hash outside the critical section, but it does not seem to help. 
*/ +// public void increment(long[] keys){ +// for(int i=0; i>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed/(double)cells;} + + public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + long count=0; + for(int[] array : matrix){ + if(array!=null){ + for(int word : array){ + while(word>0){ + int x=word&valueMask; + if(x>=mindepth){count++;} + word>>>=cellBits; + } + } + } + } + return count; + } + + + final long hash(long key, int row){ + int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); +// int cell=(int)(hashCellMask&(key)); + + if(row==0){//Doublehash only first time + key=key^hashMasks[(row+4)%hashMasks.length][cell]; + cell=(int)(hashCellMask&(key>>4)); +// cell=(int)(hashCellMask&(key>>hashBits)); +// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); + } + + return key^hashMasks[row][cell]; + } + + /** + * @param i + * @param j + * @return + */ + private static long[][] makeMasks(int rows, int cols) { + + long seed; + synchronized(KCountArray4MT.class){ + seed=counter; + counter++; + } + + Timer t=new Timer(); + t.start(); + long[][] r=new long[rows][cols]; + Random randy=new Random(seed); + for(int i=0; i200000000L){System.out.println("Mask-creation time: "+t);} + return r; + } + + + /** + * @param cols + * @param randy + * @return + */ + private static void fillMasks(long[] r, Random randy) { +// for(int i=0; i16){ + x&=(~(1L<16){ + x&=(~(1L<<(randy.nextInt(32)+32))); + } + +// System.out.print("."); +// y=(((int)(x&mask))^i); + y=(((int)(x&mask))); + z=(int)((x>>hashBits)&mask); + if(count1[y]>0 || count2[z]>0){ + x=0; + } + } +// System.out.println(Long.toBinaryString(x)); + r[i]=(x&Long.MAX_VALUE); + count1[y]++; + count2[z]++; + } + + } + + + public void initialize(){ + for(int i=0; i0){ + writers[i].add(array); + } + } + + //Add poison + for(WriteThread wt : writers){ + wt.add(poison); + 
} + + //Wait for termination + for(WriteThread wt : writers){ +// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive()); + while(wt.isAlive()){ +// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive()); + try { + wt.join(10000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");} + } + cellsUsed+=wt.cellsUsedPersonal; +// System.out.println("cellsUsed="+cellsUsed); + } + + assert(!finished); + finished=true; + } + } + + private class WriteThread extends Thread{ + + public WriteThread(int tnum){ + num=tnum; + } + + @Override + public void run(){ + assert(matrix[num]==null); + array=new int[wordsPerArray]; //Makes NUMA systems use local memory. + + matrix[num]=array; + + long[] keys=null; + while(!shutdown){ + + if(verbose){System.err.println(" - Reading keys for wt"+num+".");} + while(keys==null){ + try { + keys=writeQueue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(keys==poison){ +// assert(false); + shutdown=true; + }else{ + for(long key : keys){ + incrementHashedLocal(key); + } + } +// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length); + if(verbose){System.err.println(" -- Read keys for wt"+num+". 
(success)");} + keys=null; + if(verbose){System.err.println("shutdown="+shutdown);} + } + +// System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."); +// assert(false) : ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."; + + array=null; + } + + private void add(long[] keys){ +// assert(isAlive()); + assert(!shutdown); + if(shutdown){return;} +// assert(keys!=poison); + if(verbose){System.err.println(" + Adding keys to wt"+num+".");} + boolean success=false; + while(!success){ + try { + writeQueue.put(keys); + success=true; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");} + } + + private int incrementHashedLocal(long key){ + assert((key&arrayMask)==num); + key=(key>>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + int value=((word>>>cellShift)&valueMask); + if(value==0){cellsUsedPersonal++;} + value=min(value+1, maxValue); + word=(value< writeQueue=new ArrayBlockingQueue(16); + public boolean shutdown=false; + + } + + + public long cellsUsed(){return cellsUsed;} + + private boolean finished=false; + + private long cellsUsed; + private final int[][] matrix; + private final WriteThread[] writers=new WriteThread[numArrays]; + private final int hashes; + private final int wordsPerArray; + private final long cellsPerArray; + private final long cellMod; + private final long[][] hashMasks=makeMasks(8, hashArrayLength); + + private final long[][] buffers=new long[numArrays][1000]; + private final int[] bufferlen=new int[numArrays]; + + private static final int hashBits=6; + private static final int hashArrayLength=1<0; i++){ + if(verbose){System.err.println("Reading. 
i="+i+", key2="+key2);} + key2=Long.rotateRight(key2, hashBits); + key2=hash(key2, i); + if(verbose){System.err.println("Rot/hash. i="+i+", key2="+key2);} + min=min(min, readHashed(key2)); + } + return min; + } + + private int readHashed(long key){ + assert(finished); + if(verbose){System.err.print("Reading hashed key "+key);} +// System.out.println("key="+key); + int arrayNum=(int)(key&arrayMask); + key=(key>>>arrayBits)%(cellMod); +// System.out.println("array="+arrayNum); +// System.out.println("key2="+key); + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); +// assert(false) : indexShift; +// System.out.println("index="+index); + int word=array[index]; +// System.out.println("word="+Integer.toHexString(word)); + assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask))); +// int cellShift=(int)(cellBits*(key&cellMask)); + int cellShift=(int)(cellBits*key); + if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));} +// System.out.println("cellShift="+cellShift); + return (int)((word>>>cellShift)&valueMask); + } + + public void write(final long key, int value){ + throw new RuntimeException("Not allowed for this class."); + } + + public void increment(final long rawKey){ + if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");} + + buffer[bufferlen]=hash(rawKey, 0); + bufferlen++; + + if(bufferlen>=buffer.length){ + + if(verbose){System.err.println("Moving array.");} + + for(int w=0; w>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed/(double)cells;} + + public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + long count=0; + for(int[] array : matrix){ + if(array!=null){ + for(int word : array){ + while(word>0){ + int x=word&valueMask; + if(x>=mindepth){count++;} + 
word>>>=cellBits; + } + } + } + } + return count; + } + + + final long hash(long key, int row){ + int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); +// int cell=(int)(hashCellMask&(key)); + + if(row==0){//Doublehash only first time + key=key^hashMasks[(row+4)%hashMasks.length][cell]; + cell=(int)(hashCellMask&(key>>4)); +// cell=(int)(hashCellMask&(key>>hashBits)); +// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); + } + + return key^hashMasks[row][cell]; + } + + /** + * @param i + * @param j + * @return + */ + private static long[][] makeMasks(int rows, int cols) { + + long seed; + synchronized(KCountArray5MT.class){ + seed=counter; + counter++; + } + + Timer t=new Timer(); + t.start(); + long[][] r=new long[rows][cols]; + Random randy=new Random(seed); + for(int i=0; i200000000L){System.out.println("Mask-creation time: "+t);} + return r; + } + + + /** + * @param cols + * @param randy + * @return + */ + private static void fillMasks(long[] r, Random randy) { +// for(int i=0; i16){ + x&=(~(1L<16){ + x&=(~(1L<<(randy.nextInt(32)+32))); + } + +// System.out.print("."); +// y=(((int)(x&mask))^i); + y=(((int)(x&mask))); + z=(int)((x>>hashBits)&mask); + if(count1[y]>0 || count2[z]>0){ + x=0; + } + } +// System.out.println(Long.toBinaryString(x)); + r[i]=(x&Long.MAX_VALUE); + count1[y]++; + count2[z]++; + } + + } + + + public void initialize(){ + for(int i=0; i0){ + for(int i=0; i=0) : "i="+0+", original=?, new should be >=0, new="+readHashed(key2)+", max="+maxValue+", key="+rawKey; + if(verbose){System.err.println("postIncr value="+readHashed(key2));} + +// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~4 "; + } + + for(int i=1; i=0, new="+readHashed(key2)+", max="+maxValue+", key="+rawKey; + if(verbose){System.err.println("postIncr value="+readHashed(key2));} + +// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~4 "; + } + +// assert(false) : " ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~5 "; + } +// int z=read(rawKey); +// 
assert(hashes!=1 || !b || z==maxValue || z==y+1) : "b="+b+", y="+y+", z="+z+", rawKey="+rawKey+", num="+num; + } + } +// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length); + if(verbose){System.err.println(" -- Read keys for wt"+num+". (success)");} + keys=null; + if(verbose){System.err.println("shutdown="+shutdown);} + } + +// System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."); +// assert(false) : ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."; + + array=null; + } + + private void add(long[] keys){ +// assert(isAlive()); + +// assert(!shutdown); +// if(shutdown){return;} + + if(verbose){System.err.println(" + Adding keys to wt"+num+".");} + boolean success=false; + while(!success){ + try { + writeQueue.put(keys); + success=true; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(verbose){System.err.println(" ++ Added keys to wt"+num+". 
(success)");} + } + + private int incrementHashedLocal(final long key_){ + if(verbose){System.err.println("\n*** wt"+num+" incrementing hashed key "+key_+" ***");} + assert((key_&arrayMask)==num); + long key=(key_>>>arrayBits)%(cellMod); + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + int value=((word>>>cellShift)&valueMask); + if(value==0){cellsUsedPersonal++;} + value=min(value+1, maxValue); + word=(value< writeQueue=new ArrayBlockingQueue(8); + public boolean shutdown=false; + + } + + + public long cellsUsed(){return cellsUsed;} + + private boolean finished=false; + + private long cellsUsed; + private final int[][] matrix; + private final WriteThread[] writers=new WriteThread[numArrays]; + private final int hashes; + private final int wordsPerArray; + private final long cellsPerArray; + private final long cellMod; + private final long[][] hashMasks=makeMasks(8, hashArrayLength); + + private long[] buffer=new long[2000]; + private int bufferlen=0; + + private static final int hashBits=6; + private static final int hashArrayLength=1<0; i++){ + if(verbose){System.err.println("Reading. i="+i+", key2="+key2);} + key2=Long.rotateRight(key2, hashBits); + key2=hash(key2, i); + if(verbose){System.err.println("Rot/hash. 
i="+i+", key2="+key2);} + min=min(min, readHashed(key2, arrayNum)); + } + return min; + } + + private int readHashed(long key, int arrayNum){ + if(verbose){System.err.print("Reading hashed key "+key);} +// System.out.println("key="+key); +// int arrayNum=(int)(key&arrayMask); + key=(key&Long.MAX_VALUE)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); +// System.out.println("array="+arrayNum); +// System.out.println("key2="+key); + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); +// assert(false) : indexShift; +// System.out.println("index="+index); + int word=array[index]; +// System.out.println("word="+Integer.toHexString(word)); + assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask))); +// int cellShift=(int)(cellBits*(key&cellMask)); + int cellShift=(int)(cellBits*key); + if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));} +// System.out.println("cellShift="+cellShift); + return (int)((word>>>cellShift)&valueMask); + } + + public void write(final long key, int value){ + throw new RuntimeException("Not allowed for this class."); + } + + public void increment(final long rawKey){ + if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");} + + long key1=hash(rawKey, 3); + + if(verbose){System.err.println("key2="+key1+", value="+read(rawKey));} + + int bnum=(int)(key1&arrayMask); + long[] array=buffers[bnum]; + int loc=bufferlen[bnum]; + +// key2=Long.rotateRight(key2, hashBits); +// array[loc]=key2; + + array[loc]=rawKey; + bufferlen[bnum]++; + if(verbose){System.err.println("bufferlen["+bnum+"] = "+bufferlen[bnum]);} + if(bufferlen[bnum]>=array.length){ + + if(verbose){System.err.println("Moving array.");} + bufferlen[bnum]=0; + buffers[bnum]=new long[array.length]; + + writers[bnum].add(array); + if(verbose){System.err.println("Moved.");} + } + } + + public int incrementAndReturn(long key, int incr){ + throw new 
RuntimeException("Operation not supported."); + } + + /** Returns unincremented value */ + public int incrementAndReturnUnincremented(long key, int incr){ + throw new RuntimeException("Operation not supported."); + } + + public long[] transformToFrequency(){ + return transformToFrequency(matrix); + } + + public String toContentsString(){ + StringBuilder sb=new StringBuilder(); + sb.append("["); + String comma=""; + for(int[] array : matrix){ + for(int i=0; i>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed/(double)cells;} + + public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + long count=0; + for(int[] array : matrix){ + if(array!=null){ + for(int word : array){ + while(word>0){ + int x=word&valueMask; + if(x>=mindepth){count++;} + word>>>=cellBits; + } + } + } + } + return count; + } + + + final long hash(long key, int row){ + int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); +// int cell=(int)(hashCellMask&(key)); + + if(row==0){//Doublehash only first time + key=key^hashMasks[(row+4)%hashMasks.length][cell]; + cell=(int)(hashCellMask&(key>>4)); +// cell=(int)(hashCellMask&(key>>hashBits)); +// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); + } + + return key^hashMasks[row][cell]; + } + + /** + * @param i + * @param j + * @return + */ + private static long[][] makeMasks(int rows, int cols) { + + long seed; + synchronized(KCountArray6MT.class){ + seed=counter; + counter++; + } + + Timer t=new Timer(); + t.start(); + long[][] r=new long[rows][cols]; + Random randy=new Random(seed); + for(int i=0; i200000000L){System.out.println("Mask-creation time: "+t);} + return r; + } + + + /** + * @param cols + * @param randy + * @return + */ + private static void fillMasks(long[] r, Random randy) { +// for(int i=0; i16){ + x&=(~(1L<16){ + x&=(~(1L<<(randy.nextInt(32)+32))); + } + +// 
System.out.print("."); +// y=(((int)(x&mask))^i); + y=(((int)(x&mask))); + z=(int)((x>>hashBits)&mask); + if(count1[y]>0 || count2[z]>0){ + x=0; + } + } +// System.out.println(Long.toBinaryString(x)); + r[i]=(x&Long.MAX_VALUE); + count1[y]++; + count2[z]++; + } + + } + + + public void initialize(){ + for(int i=0; i0){ + writers[i].add(array); + } + } + + //Add poison + for(WriteThread wt : writers){ + wt.add(poison); + } + + //Wait for termination + for(WriteThread wt : writers){ +// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive()); + while(wt.isAlive()){ +// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive()); + try { + wt.join(10000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");} + } + cellsUsed+=wt.cellsUsedPersonal; +// System.out.println("cellsUsed="+cellsUsed); + } + + assert(!finished); + finished=true; + } + } + + private class WriteThread extends Thread{ + + public WriteThread(int tnum){ + num=tnum; + } + + @Override + public void run(){ + assert(matrix[num]==null); + array=new int[wordsPerArray]; //Makes NUMA systems use local memory. + + matrix[num]=array; + + long[] keys=null; + while(!shutdown){ + + if(verbose){System.err.println(" - Reading keys for wt"+num+".");} + while(keys==null){ + try { + keys=writeQueue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(keys==poison){ +// assert(false); + shutdown=true; + }else{ + for(long key : keys){ + incrementRawLocal(key); + } + } +// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length); + if(verbose){System.err.println(" -- Read keys for wt"+num+". 
(success)");} + keys=null; + if(verbose){System.err.println("shutdown="+shutdown);} + } + +// System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."); +// assert(false) : ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."; + + array=null; + } + + private void add(long[] keys){ +// assert(isAlive()); + assert(!shutdown); + if(shutdown){return;} +// assert(keys!=poison); + if(verbose){System.err.println(" + Adding keys to wt"+num+".");} + boolean success=false; + while(!success){ + try { + writeQueue.put(keys); + success=true; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");} + } + + private int incrementRawLocal(long rawKey){ +// verbose=(rawKey==32662670693L); + if(verbose){System.err.println("\n*** Incrementing raw key "+rawKey+" ***");} +// verbose=true; + assert(1>0); + + long key2=rawKey; + if(hashes==1){ + key2=hash(key2, 0); +// int x=incrementHashedIfAtMost(key2, 1, maxValue-1); + int x=incrementHashedLocal(key2); + assert(x>=1) : "original=?, new should be >="+(1)+", new="+read(rawKey)+", max="+maxValue+", key="+rawKey; + return x; + } + + int min=0; +// final int min=read(rawKey); +// if(min>=maxValue){return maxValue;} + + assert(key2==rawKey); + for(int i=0; i0; i++){ + if(verbose){System.err.println("Reading. i="+i+", key2="+key2);} + key2=Long.rotateRight(key2, hashBits); + key2=hash(key2, i); + if(verbose){System.err.println("Rot/hash. 
i="+i+", key2="+key2);} + min=min(min, readHashed(key2)); + } + return min; + } + + private int readHashed(long key){ + if(verbose){System.err.print("Reading hashed key "+key);} +// System.out.println("key="+key); + int arrayNum=(int)(key&arrayMask); + key=(key>>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); +// System.out.println("array="+arrayNum); +// System.out.println("key2="+key); + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); +// assert(false) : indexShift; +// System.out.println("index="+index); + int word=array[index]; +// System.out.println("word="+Integer.toHexString(word)); + assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask))); +// int cellShift=(int)(cellBits*(key&cellMask)); + int cellShift=(int)(cellBits*key); + if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));} +// System.out.println("cellShift="+cellShift); + return (int)((word>>>cellShift)&valueMask); + } + + public void write(final long key, int value){ + throw new RuntimeException("Not allowed for this class."); + } + + @Override + /** This should increase speed by doing the first hash outside the critical section, but it does not seem to help. 
*/ + public void increment(long[] keys){ + for(int i=0; i>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed/(double)cells;} + + public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + long count=0; +// System.out.println("A"); + for(int[] array : matrix){ +// System.out.println("B"); + if(array!=null){ +// System.out.println("C"); + for(int word : array){ +// System.out.println("D: "+Integer.toHexString(word)); + while(word>0){ + int x=word&valueMask; +// System.out.println("E: "+x+", "+mindepth); + if(x>=mindepth){count++;} + word>>>=cellBits; + } + } + } + } + return count; + } + + + final long hash(long key, int row){ + int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); +// int cell=(int)(hashCellMask&(key)); + + if(row==0){//Doublehash only first time + key=key^hashMasks[(row+4)%hashMasks.length][cell]; + cell=(int)(hashCellMask&(key>>5)); +// cell=(int)(hashCellMask&(key>>hashBits)); +// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); + } + + return key^hashMasks[row][cell]; + } + + /** + * @param i + * @param j + * @return + */ + private static long[][] makeMasks(int rows, int cols) { + + long seed; + synchronized(KCountArray7MT.class){ + seed=counter; + counter++; + } + + Timer t=new Timer(); + t.start(); + long[][] r=new long[rows][cols]; + Random randy=new Random(seed); + for(int i=0; i200000000L){System.out.println("Mask-creation time: "+t);} + return r; + } + + + /** + * @param cols + * @param randy + * @return + */ + private static void fillMasks(long[] r, Random randy) { +// for(int i=0; i16){ + x&=(~(1L<16){ + x&=(~(1L<<(randy.nextInt(32)+32))); + } + +// System.out.print("."); +// y=(((int)(x&mask))^i); + y=(((int)(x&mask))); + z=(int)((x>>hashBits)&mask); + if(count1[y]>0 || count2[z]>0){ + x=0; + } + } +// System.out.println(Long.toBinaryString(x)); + r[i]=(x&Long.MAX_VALUE); + 
count1[y]++; + count2[z]++; + } + + } + + + public void initialize(){ + for(int i=0; i0){ + writers[i].add(array); + } + } + + //Add poison + for(WriteThread wt : writers){ + wt.add(poison); + } + + //Wait for termination + for(WriteThread wt : writers){ +// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive()); + while(wt.isAlive()){ +// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive()); + try { + wt.join(10000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");} + } + cellsUsed+=wt.cellsUsedPersonal; +// System.out.println("cellsUsed="+cellsUsed); +// System.err.println("wt.cellsUsedPersonal="+wt.cellsUsedPersonal); + } + + assert(!finished); + finished=true; + } + } + + private class WriteThread extends Thread{ + + public WriteThread(int tnum){ + num=tnum; + } + + @Override + public void run(){ + assert(matrix[num]==null); + array=new int[wordsPerArray]; //Makes NUMA systems use local memory. + + matrix[num]=array; + + long[] keys=null; + while(!shutdown){ + + if(verbose){System.err.println(" - Reading keys for wt"+num+".");} + while(keys==null){ + try { + keys=writeQueue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(keys==poison){ +// assert(false); + shutdown=true; + }else{ + for(long key : keys){ + incrementHashedLocal(key); + } + } +// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length); + if(verbose){System.err.println(" -- Read keys for wt"+num+". 
(success)");} + keys=null; + if(verbose){System.err.println("shutdown="+shutdown);} + } + +// System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."); +// assert(false) : ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."; + + array=null; + } + + private void add(long[] keys){ +// assert(isAlive()); + assert(!shutdown); + if(shutdown){return;} +// assert(keys!=poison); + if(verbose){System.err.println(" + Adding keys to wt"+num+".");} + boolean success=false; + while(!success){ + try { + writeQueue.put(keys); + success=true; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");} + } + + private int incrementHashedLocal(long key){ + assert((key&arrayMask)==num); + key=(key>>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + int value=((word>>>cellShift)&valueMask); + if(value==0){cellsUsedPersonal++;} + value=min(value+1, maxValue); + word=(value< writeQueue=new ArrayBlockingQueue(16); + public boolean shutdown=false; + + } + + + public long cellsUsed(){return cellsUsed;} + + private boolean finished=false; + + private long cellsUsed; + private final int[][] matrix; + private final WriteThread[] writers=new WriteThread[numArrays]; + private final int hashes; + private final int wordsPerArray; + private final long cellsPerArray; + private final long cellMod; + private final long[][] hashMasks=makeMasks(8, hashArrayLength); + + private final long[][] buffers=new long[numArrays][500]; + private final int[] bufferlen=new int[numArrays]; + + private static final int hashBits=6; + private static final int hashArrayLength=1<0; i++){ + if(verbose){System.err.println("Reading. 
i="+i+", key2="+key2);} + key2=Long.rotateRight(key2, hashBits); + key2=hash(key2, i); + if(verbose){System.err.println("Rot/hash. i="+i+", key2="+key2);} + min=min(min, readHashed(key2)); + } + return min; + } + + @Override + public final int readLeft(final long key, final int k, boolean makeCanonical){ + assert(k<=32); + final long key2=key>>>2; + final int shift=2*(k-1); + final long akey=key2|(0L<=32 ? -1L : ~((-1L)<<(2*k))); + final long key2=(key<<2)&mask; + final long akey=key2|0L; + final long ckey=key2|1L; + final long gkey=key2|2L; + final long tkey=key2|3L; + final int a=read(makeCanonical ? makeCanonical2(akey, k) : akey); + final int c=read(makeCanonical ? makeCanonical2(ckey, k) : ckey); + final int g=read(makeCanonical ? makeCanonical2(gkey, k) : gkey); + final int t=read(makeCanonical ? makeCanonical2(tkey, k) : tkey); + return a+c+g+t; + } + + @Override + public final int[] readAllLeft(final long key, final int k, boolean makeCanonical, int[] rvec){ + assert(k<=32); + if(rvec==null){rvec=new int[4];} + final long key2=key>>>2; + final int shift=2*(k-1); + final long akey=key2|(0L<=32 ? -1L : ~((-1L)<<(2*k))); + final long key2=(key<<2)&mask; + final long akey=key2|0L; + final long ckey=key2|1L; + final long gkey=key2|2L; + final long tkey=key2|3L; + rvec[0]=read(makeCanonical ? makeCanonical2(akey, k) : akey); + rvec[1]=read(makeCanonical ? makeCanonical2(ckey, k) : ckey); + rvec[2]=read(makeCanonical ? makeCanonical2(gkey, k) : gkey); + rvec[3]=read(makeCanonical ? 
makeCanonical2(tkey, k) : tkey); + return rvec; + } + + private final int readHashed(long key){ + if(verbose){System.err.print("Reading hashed key "+key);} +// System.out.println("key="+key); + int arrayNum=(int)(key&arrayMask); + key=(key>>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); +// System.out.println("array="+arrayNum); +// System.out.println("key2="+key); + AtomicIntegerArray array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); +// assert(false) : indexShift; +// System.out.println("index="+index); + int word=array.get(index); +// System.out.println("word="+Integer.toHexString(word)); + assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask))); +// int cellShift=(int)(cellBits*(key&cellMask)); + int cellShift=(int)(cellBits*key); + if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));} +// System.out.println("cellShift="+cellShift); + return (int)((word>>>cellShift)&valueMask); + } + + @Override + public final void write(final long key, int value){ + throw new RuntimeException("Not allowed for this class."); + } + + @Override + public final void increment(long[] keys){ +// assert(false) : "This method is not really needed."; + for(int i=0; i>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed()/(double)cells;} + + public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + return cellsUsedMT(mindepth); + } + + public long cellsUsedMT(int mindepth){ +// assert(false) : matrix.length; + ArrayList list=new ArrayList(matrix.length); + for(AtomicIntegerArray aia : matrix){ + CountUsedThread ctt=new CountUsedThread(aia, mindepth); + ctt.start(); + list.add(ctt); + } + long x=0; + for(CountUsedThread ctt : list){ + while(ctt.getState()!=State.TERMINATED){ + try { + ctt.join(); + } catch 
(InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + x+=ctt.count; + } + return x; + } + + private class CountUsedThread extends Thread{ + public CountUsedThread(AtomicIntegerArray a_, int mindepth_){ + array=a_; + mindepth=mindepth_; + } + public void run(){ + long temp=0; + if(array!=null){ +// System.out.println("C"); +// assert(false) : Integer.toBinaryString(valueMask); + if(cellBits==32){ + for(int i=0, max=array.length(); i=mindepth){temp++;} + } + } + }else{ + for(int i=0, max=array.length(); i=mindepth){temp++;} + word=word>>>cellBits; + } + } + } + } + count=temp; + } + private final AtomicIntegerArray array; + private final int mindepth; + public long count; + } + + + final long hash(long key, int row){ + int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); +// int cell=(int)(hashCellMask&(key)); + + if(row==0){//Doublehash only first time + key=key^hashMasks[(row+4)%hashMasks.length][cell]; + cell=(int)(hashCellMask&(key>>5)); +// cell=(int)(hashCellMask&(key>>hashBits)); +// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); + } + + return key^hashMasks[row][cell]; + } + + /** + * @param i + * @param j + * @return + */ + private static long[][] makeMasks(int rows, int cols) { + + long seed; + synchronized(KCountArray7MTA.class){ + seed=counter; + counter++; + } + + Timer t=new Timer(); + t.start(); + long[][] r=new long[rows][cols]; + Random randy=new Random(seed); + for(int i=0; i200000000L){System.out.println("Mask-creation time: "+t);} + return r; + } + + + /** + * @param cols + * @param randy + * @return + */ + private static void fillMasks(long[] r, Random randy) { +// for(int i=0; i16){ + x&=(~(1L<16){ + x&=(~(1L<<(randy.nextInt(32)+32))); + } + +// System.out.print("."); +// y=(((int)(x&mask))^i); + y=(((int)(x&mask))); + z=(int)((x>>hashBits)&mask); + if(count1[y]>0 || count2[z]>0){ + x=0; + } + } +// System.out.println(Long.toBinaryString(x)); + r[i]=(x&Long.MAX_VALUE); + count1[y]++; + 
count2[z]++; + } + + } + + + public void initialize(){} + + public void shutdown(){ + if(finished){return;} + synchronized(this){ + if(finished){return;} + + cellsUsed=-1; +// for(int i=0; i>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); + int index=(int)(key>>>indexShift); + int cellShift=(int)(cellBits*key); + int value, word, word2; + do{ + word=array.get(index); + value=((word>>>cellShift)&valueMask); + value=min(value+1, maxValue); + word2=(value<>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); + int index=(int)(key>>>indexShift); + int cellShift=(int)(cellBits*key); + int value, word, word2; + do{ + word=array.get(index); + value=((word>>>cellShift)&valueMask); + value=max(value-1, 0); + word2=(value<0; i++){ + if(verbose){System.err.println("Reading. i="+i+", key2="+key2);} + key2=Long.rotateRight(key2, hashBits); + key2=hash(key2, i); + if(verbose){System.err.println("Rot/hash. i="+i+", key2="+key2);} + min=min(min, readHashed(key2)); + } + return min; + } + + private int readHashed(long key){ + if(verbose){System.err.print("Reading hashed key "+key);} +// System.out.println("key="+key); + int arrayNum=(int)(key&arrayMask); + key=(key>>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); +// System.out.println("array="+arrayNum); +// System.out.println("key2="+key); + int[] array=matrix[arrayNum]; + int index=(int)(key>>>indexShift); +// assert(false) : indexShift; +// System.out.println("index="+index); + int word=array[index]; +// System.out.println("word="+Integer.toHexString(word)); + assert(word>>>(cellBits*key) == word>>>(cellBits*(key&cellMask))); +// int cellShift=(int)(cellBits*(key&cellMask)); + int cellShift=(int)(cellBits*key); + if(verbose){System.err.println(", array="+arrayNum+", index="+index+", cellShift="+(cellShift%32)+", value="+((int)((word>>>cellShift)&valueMask)));} +// System.out.println("cellShift="+cellShift); + return (int)((word>>>cellShift)&valueMask); + } + + public void write(final 
long key, int value){ + throw new RuntimeException("Not allowed for this class."); + } + + @Override + /** This should increase speed by doing the first hash outside the critical section, but it does not seem to help. */ + public void increment(long[] keys){ + if(prefilter==null){ + for(int i=0; i>>=cellBits; + comma=", "; + } + } + } + sb.append("]"); + return sb.toString(); + } + + public double usedFraction(){return cellsUsed/(double)cells;} + + public double usedFraction(int mindepth){return cellsUsed(mindepth)/(double)cells;} + + public long cellsUsed(int mindepth){ + long count=0; +// System.out.println("A: "+cellBits+", "+Integer.toBinaryString(valueMask)); + for(int[] array : matrix){ +// System.out.println("B"); + if(array!=null){ +// System.out.println("C"); + for(int word : array){ +// System.out.println("D: "+Integer.toBinaryString(word)); + while(word>0){ + int x=word&valueMask; +// System.out.println("E: "+x+", "+mindepth); + if(x>=mindepth){count++;} + word>>>=cellBits; + } + } + } + } + return count; + } + + final long hash(long key, int row){ + int cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); +// int cell=(int)(hashCellMask&(key)); + + if(row==0){//Doublehash only first time + key=key^hashMasks[(row+4)%hashMasks.length][cell]; + cell=(int)(hashCellMask&(key>>5)); +// cell=(int)(hashCellMask&(key>>hashBits)); +// cell=(int)((Long.MAX_VALUE&key)%(hashArrayLength-1)); + } + + return key^hashMasks[row][cell]; + } + + /** + * @param i + * @param j + * @return + */ + private static long[][] makeMasks(int rows, int cols) { + + long seed; + synchronized(KCountArray8MT.class){ + seed=counter; + counter++; + } + + Timer t=new Timer(); + t.start(); + long[][] r=new long[rows][cols]; + Random randy=new Random(seed); + for(int i=0; i200000000L){System.out.println("Mask-creation time: "+t);} + return r; + } + + + /** + * @param cols + * @param randy + * @return + */ + private static void fillMasks(long[] r, Random randy) { +// for(int i=0; i16){ + 
x&=(~(1L<16){ + x&=(~(1L<<(randy.nextInt(32)+32))); + } + +// System.out.print("."); +// y=(((int)(x&mask))^i); + y=(((int)(x&mask))); + z=(int)((x>>hashBits)&mask); + if(count1[y]>0 || count2[z]>0){ + x=0; + } + } +// System.out.println(Long.toBinaryString(x)); + r[i]=(x&Long.MAX_VALUE); + count1[y]++; + count2[z]++; + } + + } + + + public void initialize(){ + for(int i=0; i0){ + writers[i].add(array); + } + } + + //Add poison + for(WriteThread wt : writers){ + wt.add(poison); + } + + //Wait for termination + for(WriteThread wt : writers){ +// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive()); + while(wt.isAlive()){ +// System.out.println("wt"+wt.num+" is alive: "+wt.isAlive()); + try { + wt.join(10000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(wt.isAlive()){System.err.println(wt.getClass().getCanonicalName()+" is taking a long time to die.");} + } + cellsUsed+=wt.cellsUsedPersonal; +// System.out.println("cellsUsed="+cellsUsed); + } + + assert(!finished); + finished=true; + } + } + + private class WriteThread extends Thread{ + + public WriteThread(int tnum){ + num=tnum; + } + + @Override + public void run(){ + assert(matrix[num]==null); + array=new int[wordsPerArray]; //Makes NUMA systems use local memory. + + matrix[num]=array; + + long[] keys=null; + while(!shutdown){ + + if(verbose){System.err.println(" - Reading keys for wt"+num+".");} + while(keys==null){ + try { + keys=writeQueue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(keys==poison){ +// assert(false); + shutdown=true; + }else{ + for(long key : keys){ + incrementHashedLocal(key); + } + } +// System.out.println(" -- Read keys for wt"+num+". poison="+(keys==poison)+", len="+keys.length); + if(verbose){System.err.println(" -- Read keys for wt"+num+". 
(success)");} + keys=null; + if(verbose){System.err.println("shutdown="+shutdown);} + } + +// System.out.println(">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."); +// assert(false) : ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> I died: "+shutdown+", "+(keys==null)+"."; + + array=null; + } + + private void add(long[] keys){ +// assert(isAlive()); + assert(!shutdown); + if(shutdown){return;} +// assert(keys!=poison); + if(verbose){System.err.println(" + Adding keys to wt"+num+".");} + boolean success=false; + while(!success){ + try { + writeQueue.put(keys); + success=true; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(verbose){System.err.println(" ++ Added keys to wt"+num+". (success)");} + } + + private int incrementHashedLocal(long key){ + assert((key&arrayMask)==num); + key=(key>>>arrayBits)%(cellMod); +// key=(key>>>(arrayBits+1))%(cellMod); + int index=(int)(key>>>indexShift); + int word=array[index]; + int cellShift=(int)(cellBits*key); + int value=((word>>>cellShift)&valueMask); + if(value==0){cellsUsedPersonal++;} + value=min(value+1, maxValue); + word=(value< writeQueue=new ArrayBlockingQueue(16); + public boolean shutdown=false; + + } + + + public long cellsUsed(){return cellsUsed;} + + private boolean finished=false; + + private long cellsUsed; + private final int[][] matrix; + private final WriteThread[] writers=new WriteThread[numArrays]; + private final int hashes; + private final int wordsPerArray; + private final long cellsPerArray; + private final long cellMod; + private final long[][] hashMasks=makeMasks(8, hashArrayLength); + + private final long[][] buffers=new long[numArrays][500]; + private final int[] bufferlen=new int[numArrays]; + + public final KCountArray prefilter; + + private static final int hashBits=6; + private static final int hashArrayLength=1<3 || args[1].contains(".") ? 
args[1] : null); + int k=Integer.parseInt(args[args.length-2]); + int cbits=Integer.parseInt(args[args.length-1]); + + KCountArray2 count=null; + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.TARGET_READ_LEN=300000000; + FastaReadInputStream.MIN_READ_LEN=k; + } + count=countFastq(fname1, fname2, k, cbits); + + + t.stop(); + System.out.println("Finished counting; time = "+t); + + long[] freq=count.transformToFrequency(); + +// System.out.println(count+"\n"); +// System.out.println(Arrays.toString(freq)+"\n"); + + long sum=sum(freq); + System.out.println("Kmer fraction:"); + int lim1=8, lim2=16; + for(int i=0; i=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + } + + public static KCountArray2 countFasta(String fname, int k, int cbits){ + assert(k>=1 && k<20); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final long cells=mask+1; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + final KCountArray2 count=new KCountArray2(cells, cbits); + + TextFile tf=new TextFile(fname, false, false); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='>'){ + for(int i=0; i=k){ + count.increment(kmer, 1); + } + } + } + } + } + return count; + } + + public static KCountArray2 countFastq(String reads1, String reads2, int k, int cbits){ + assert(k>=1 && k<20); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final long cells=mask+1; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + final KCountArray2 count=new KCountArray2(cells, cbits); + + final ConcurrentReadStreamInterface cris; + { + FileFormat 
ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + Thread th=new Thread(cris); + th.start(); + } + + assert(cris!=null) : reads1; + System.err.println("Started cris"); + boolean paired=cris.paired(); + System.err.println("Paired: "+paired); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + len=0; + kmer=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + for(int i=0; i=k){ +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + count.increment(kmer, 1); +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + + if(r.mate!=null){ + len=0; + kmer=0; + bases=r.mate.bases; + quals=r.mate.quality; + for(int i=0; i=k){ + count.increment(kmer, 1); + } + } + } + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + System.err.println("Processed "+readsProcessed+" reads."); + } + + return count; + } + + public static long[] transformToFrequency(int[] count){ + long[] freq=new long[2000]; + int max=freq.length-1; + for(int i=0; iy ? x : y;} + + public static boolean verbose=false; + public static byte minQuality=5; + public static long readsProcessed=0; + public static long maxReads=10000000000L; + +} diff --git a/current/kmer/KmerCount4.java b/current/kmer/KmerCount4.java new file mode 100755 index 0000000..7e0e9a0 --- /dev/null +++ b/current/kmer/KmerCount4.java @@ -0,0 +1,438 @@ +package kmer; + +import java.util.ArrayList; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import align2.ListNum; +import dna.AminoAcid; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class KmerCount4 { + +public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + + String fname1=args[0]; + String fname2=(args.length>3 || args[1].contains(".") ? args[1] : null); + int k=14; + int cbits=16; + int gap=0; + + for(int i=(fname2==null ? 1 : 2); i1 ? 
split[1] : "true"); + + if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + KCountArray2 count=null; + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.TARGET_READ_LEN=300000000; + FastaReadInputStream.MIN_READ_LEN=k; + }else{ + FASTQ.PARSE_CUSTOM=false; +// assert(false) : FASTQ.PARSE_CUSTOM; + } + + if(gap==0){ + count=countFastq(fname1, fname2, k, cbits, true); + }else{ + count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null); + } + + + t.stop(); + System.out.println("Finished counting; time = "+t); + + printStatistics(count); + + } + + public static void printStatistics(KCountArray2 count){ + long[] freq=count.transformToFrequency(); + +// System.out.println(count+"\n"); +// System.out.println(Arrays.toString(freq)+"\n"); + + long sum=sum(freq); + System.out.println("Kmer fraction:"); + int lim1=8, lim2=16; + for(int i=0; i=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2))); + System.out.println(); + System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + } + + public static KCountArray2 countFasta(String fname, int k, int cbits){ + assert(k>=1 && k<20); + final int kbits=2*k; + 
final long mask=~((-1L)<<(kbits)); + final long cells=mask+1; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + final KCountArray2 count=new KCountArray2(cells, cbits); + + TextFile tf=new TextFile(fname, false, false); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='>'){ + for(int i=0; i=k){ + keysCounted++; + count.increment(kmer, 1); + } + } + } + } + } + return count; + } + + public static KCountArray2 countFastq(String reads1, String reads2, int k, int cbits, boolean rcomp){ + return countFastq(reads1, reads2, k, cbits, rcomp, null); + } + + public static KCountArray2 countFastq(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray2 count){ + assert(k>=1 && k<20); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + if(count==null){ + final long cells=1L< ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + addRead(r, count, k, mask, rcomp); + if(r.mate!=null){ + addRead(r.mate, count, k, mask, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + cris.close(); + if(verbose){System.err.println("Closed stream");} + if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} + + + return count; + } + + public static KCountArray2 countFastqSplit(String reads1, String reads2, int k1, int k2, int gap, int cbits, boolean rcomp, KCountArray2 count){ + assert(k1+k2>=1 && k1+k2<20); + assert(gap>=0); + final int kbits1=2*k1; + final int kbits2=2*k2; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + + if(count==null){ + final long cells=1L<<(kbits1+kbits2); + if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+ + ", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));} + count=new KCountArray2(cells, cbits, gap); + } + assert(count.gap==gap); + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + Thread th=new Thread(cris); + th.start(); + } + + assert(cris!=null) : reads1; + System.err.println("Started cris"); + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); + if(r.mate!=null){ + addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + cris.close(); + if(verbose){System.err.println("Closed stream");} + if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} + + + return count; + } + + public static void addRead(final Read r, final KCountArray2 count, final int k, final long mask, boolean rcomp){ + int len=0; + long kmer=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + for(int i=0; i=k){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + count.increment(kmer, 1); +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addRead(r, count, k, mask, false); + } + } + + public static void addReadSplit(final Read r, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", 
"+mask1+", "+mask2+", "+gap; + + for(int i=0, j=i+k1+gap; j=k1){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1< "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); + } + } + + public static void addReadSplit(final byte[] bases, final KCountArray2 count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] quals=null; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + for(int i=0, j=i+k1+gap; j=k1){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1< "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + AminoAcid.reverseComplementBasesInPlace(bases); + addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false); + } + } + + public static long[] transformToFrequency(int[] count){ + long[] freq=new long[2000]; + int max=freq.length-1; + for(int i=0; iy ? 
x : y;} + + public static boolean verbose=false; + public static byte minQuality=9; + public static long readsProcessed=0; + public static long maxReads=-1; + + public static long keysCounted=0; + +} diff --git a/current/kmer/KmerCount5.java b/current/kmer/KmerCount5.java new file mode 100755 index 0000000..893e866 --- /dev/null +++ b/current/kmer/KmerCount5.java @@ -0,0 +1,542 @@ +package kmer; + +import java.util.ArrayList; +import java.util.BitSet; + +import jgi.ErrorCorrect; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import align2.ListNum; +import dna.AminoAcid; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class KmerCount5 { + +public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + + String fname1=args[0]; + String fname2=(args.length>1 ? args[1] : null); + int k=14; + int cbits=16; + int gap=0; + + for(int i=2; i1 ? 
split[1] : "true"); + + if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + KCountArray count=null; + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.TARGET_READ_LEN=300000000; + FastaReadInputStream.MIN_READ_LEN=k; + }else{ + FASTQ.PARSE_CUSTOM=false; +// assert(false) : FASTQ.PARSE_CUSTOM; + } + + if(gap==0){ + count=countFastq(fname1, fname2, k, cbits, true); + }else{ + count=countFastqSplit(fname1, fname2, (k+1)/2, k/2, gap, cbits, true, null); + } + + + t.stop(); + System.out.println("Finished counting; time = "+t); + + printStatistics(count); + + } + + public static void printStatistics(KCountArray count){ + long[] freq=count.transformToFrequency(); + +// System.out.println(count+"\n"); +// System.out.println(Arrays.toString(freq)+"\n"); + + long sum=sum(freq); + System.out.println("Kmer fraction:"); + int lim1=8, lim2=16; + for(int i=0; i=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2))); + System.out.println(); + System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + } + + public static KCountArray countFasta(String fname, int k, int cbits, int gap){ + assert(k>=1 && k<20); + final int 
kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final long cells=mask+1; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + final KCountArray count=KCountArray.makeNew(cells, cbits, gap); + + TextFile tf=new TextFile(fname, false, false); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='>'){ + for(int i=0; i=k){ + keysCounted++; + count.increment(kmer); + } + } + } + } + } + return count; + } + + public static KCountArray countFastq(String reads1, String reads2, int k, int cbits, boolean rcomp){ + return countFastq(reads1, reads2, k, cbits, rcomp, null); + } + + public static KCountArray countFastq(String reads1, String reads2, int k, int cbits, boolean rcomp, KCountArray count){ + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + if(count==null){ + final long cells=1L< ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + addRead(r, count, k, mask, rcomp); + if(r.mate!=null){ + addRead(r.mate, count, k, mask, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + cris.close(); + if(verbose){System.err.println("Closed stream");} + if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} + + + return count; + } + + + + + public static KCountArray countFastq(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp, + KCountArray count, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){ + + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + +// System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh); +// System.out.println("\ntrusted=\n"+trusted); +// System.out.println("\ncount=\n"+count); + + if(count==null){ + final long cells=1L< ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + Read r2=r.mate; + { + if(trusted!=null){ + BitSet bs=(conservative ? 
ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.bases.length)); +// System.out.println(new String(r.bases)); + for(int i=bs.nextClearBit(0); i=1 && (count!=null || k<20)); + assert(gap>=0); + final int kbits1=2*k1; + final int kbits2=2*k2; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + + if(count==null){ + final long cells=1L<<(kbits1+kbits2); + if(verbose){System.err.println("k1="+k1+", k2="+k2+", kbits1="+kbits1+", kbits2="+kbits2+", cells="+cells+ + ", mask1="+Long.toHexString(mask1)+", mask2="+Long.toHexString(mask2));} + count=KCountArray.makeNew(cells, cbits, gap); + } + assert(count.gap==gap); + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(reads1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(reads2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + Thread th=new Thread(cris); + th.start(); + } + + assert(cris!=null) : reads1; + System.err.println("Started cris"); + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); + if(r.mate!=null){ + addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + cris.close(); + if(verbose){System.err.println("Closed stream");} + if(verbose){System.err.println("Processed "+readsProcessed+" reads.");} + + + return count; + } + + public static void addRead(final Read r, final KCountArray count, final int k, final long mask, boolean rcomp){ + int len=0; + long kmer=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + for(int i=0; i=k){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + count.increment(kmer); +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addRead(r, count, k, mask, false); + } + } + + public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + for(int i=0, j=i+k1+gap; j=k1){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1< "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); + } + } + + public static void 
addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] quals=null; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + for(int i=0, j=i+k1+gap; j=k1){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1< "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + AminoAcid.reverseComplementBasesInPlace(bases); + addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false); + } + } + + public static long[] transformToFrequency(int[] count){ + long[] freq=new long[2000]; + int max=freq.length-1; + for(int i=0; iy ? 
x : y;} + + public static boolean verbose=false; + public static byte minQuality=9; + public static long readsProcessed=0; + public static long maxReads=-1; + + public static long keysCounted=0; + +} diff --git a/current/kmer/KmerCount6.java b/current/kmer/KmerCount6.java new file mode 100755 index 0000000..3f093a1 --- /dev/null +++ b/current/kmer/KmerCount6.java @@ -0,0 +1,519 @@ +package kmer; + +import java.util.ArrayList; +import java.util.BitSet; + +import jgi.ErrorCorrect; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import align2.ListNum; +import dna.AminoAcid; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class KmerCount6 { + +public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + + String fname1=args[0]; + String fname2=(args.length>1 ? args[1] : null); + int k=14; + int cbits=16; + int gap=0; + + for(int i=2; i1 ? 
split[1] : "true"); + + if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + KCountArray count=null; + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.TARGET_READ_LEN=300000000; + FastaReadInputStream.MIN_READ_LEN=k; + }else{ + FASTQ.PARSE_CUSTOM=false; +// assert(false) : FASTQ.PARSE_CUSTOM; + } + + count=countFastq(fname1, fname2, k, cbits, gap, true, null); + + + t.stop(); + System.out.println("Finished counting; time = "+t); + + printStatistics(count); + + } + + public static void printStatistics(KCountArray count){ + long[] freq=count.transformToFrequency(); + +// System.out.println(count+"\n"); +// System.out.println(Arrays.toString(freq)+"\n"); + + long sum=sum(freq); + System.out.println("Kmer fraction:"); + int lim1=8, lim2=16; + for(int i=0; i=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2))); + System.out.println(); + System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + } + + public static KCountArray countFasta(String fname, int k, int cbits, int gap){ + assert(k>=1 && k<20); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final long cells=mask+1; + 
if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + final KCountArray count=KCountArray.makeNew(cells, cbits, gap); + + TextFile tf=new TextFile(fname, false, false); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='>'){ + for(int i=0; i=k){ + keysCounted++; + count.increment(kmer); + } + } + } + } + } + return count; + } + + public static KCountArray countFastq(String reads1, String reads2, int k, int cbits, int gap, boolean rcomp, KCountArray count){ + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + if(count==null){ + final long cells=1L<=1 && (count!=null || k<20)); + + assert(count!=null); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + + if(count.gap==0){ + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + addRead(r, count, k, mask, rcomp); + if(r.mate!=null){ + addRead(r.mate, count, k, mask, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + }else{ + final int k1=(k+1)/2; + final int k2=k/2; + final int kbits1=2*k1; + final int kbits2=2*k2; + final int gap=count.gap; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); + if(r.mate!=null){ + addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + } + + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + } + + + + + public static KCountArray countFastq(final String reads1, final String reads2, final int k, final int cbits, final boolean rcomp, + KCountArray count, final KCountArray trusted, final long maxReads, final int thresh, final int detectStepsize, final boolean conservative){ + + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + +// System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh); +// System.out.println("\ntrusted=\n"+trusted); +// System.out.println("\ncount=\n"+count); + + if(count==null){ + final long cells=1L<=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + Read r2=r.mate; + { + if(trusted!=null){ + BitSet bs=(conservative ? 
ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.bases.length)); +// System.out.println(new String(r.bases)); + for(int i=bs.nextClearBit(0); i=k){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + count.increment(kmer); +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addRead(r, count, k, mask, false); + } + } + + public static void addReadSplit(final Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + for(int i=0, j=i+k1+gap; j=k1){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1< "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); + } + } + + public static void addReadSplit(final byte[] bases, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] quals=null; + + assert(kmer1>=kmer2); + +// 
assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + for(int i=0, j=i+k1+gap; j=k1){ + keysCounted++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1< "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + AminoAcid.reverseComplementBasesInPlace(bases); + addReadSplit(bases, count, k1, k2, mask1, mask2, gap, false); + } + } + + public static long[] transformToFrequency(int[] count){ + long[] freq=new long[2000]; + int max=freq.length-1; + for(int i=0; i buffer=new ThreadLocal(); + + public static final int min(int x, int y){return xy ? x : y;} + + public static boolean verbose=false; + public static byte minQuality=9; + public static long readsProcessed=0; + public static long maxReads=-1; + public static int bufferlen=1000; + + public static long keysCounted=0; + +} diff --git a/current/kmer/KmerCount6MT.java b/current/kmer/KmerCount6MT.java new file mode 100755 index 0000000..778b6c3 --- /dev/null +++ b/current/kmer/KmerCount6MT.java @@ -0,0 +1,790 @@ +package kmer; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; + +import jgi.ErrorCorrect; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import align2.ListNum; +import align2.Tools; +import dna.AminoAcid; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class KmerCount6MT { + +public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + + String fname1=args[0]; + String fname2=(args.length>1 ? 
args[1] : null); + int k=14; + int cbits=16; + int gap=0; + int matrixbits=-1; + int hashes=1; + + for(int i=2; i1 ? split[1] : "true"); + + if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else if(a.startsWith("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.startsWith("matrixbits")){ + matrixbits=Integer.parseInt(b); + }else if(a.startsWith("hashes")){ + hashes=Integer.parseInt(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + int kbits=2*k; + if(matrixbits<0){ + matrixbits=kbits; + } + matrixbits=Tools.min(kbits, matrixbits); + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.TARGET_READ_LEN=300000000; + FastaReadInputStream.MIN_READ_LEN=k; + }else{ + FASTQ.PARSE_CUSTOM=false; +// assert(false) : FASTQ.PARSE_CUSTOM; + } + + KCountArray count=KCountArray.makeNew(1L<=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2))); + System.out.println(); + System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, int k, int cbits){ + return makeKca(fname1, fname2, extraFiles, k, cbits, 0, Tools.min(2*k, 35), 1, minQuality, true, 
maxReads, 1, 1, 1, 2); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, + int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads){ + return makeKca(fname1, fname2, extraFiles, k, cbits, gap, matrixbits, hashes, minqual, rcomp, maxreads, 1, 1, 1, 2); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, + int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2){ + final int kbits=2*k; +// verbose=true; + if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", gap="+gap+", matrixbits="+matrixbits+", cbits="+cbits);} + + boolean oldsplit=FastaReadInputStream.SPLIT_READS; + long oldmax=maxReads; + byte oldq=minQuality; + maxReads=maxreads; + minQuality=(byte)minqual; + + // System.out.println("kbits="+(kbits)+" -> "+(1L< "+(1L<=1 && k<20); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final long cells=mask+1; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + final KCountArray count=KCountArray.makeNew(cells, cbits, gap); + + TextFile tf=new TextFile(fname, false, false); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='>'){ + for(int i=0; i=k){ + keysCounted++; + count.increment(kmer); + } + } + } + } + } + return count; + } + + public static KCountArray countFastq(String reads1, String reads2, int k, int cbits, int gap, boolean rcomp, KCountArray count){ + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); +// System.err.println("countFastq... 
making a new cris"); + if(count==null){ + final long cells=1L<=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + +// System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh); +// System.out.println("\ntrusted=\n"+trusted); +// System.out.println("\ncount=\n"+count); + + if(count==null){ + final long cells=1L<0){ + if(bufflen=1 && count!=null); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + + if(count.gap==0){ + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessedLocal++; + + addRead(r, count, k, mask, rcomp); + if(r.mate!=null){ + addRead(r.mate, count, k, mask, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + }else{ + final int k1=(k+1)/2; + final int k2=k/2; + final int kbits1=2*k1; + final int kbits2=2*k2; + final int gap=count.gap; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessedLocal++; + + addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); + if(r.mate!=null){ + addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + } + + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + } + + + + + private void countFastq(final ConcurrentReadStreamInterface cris, final int k, final boolean rcomp, + final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){ + if(count.gap>0){countFastqSplit(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);} + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + Read r2=r.mate; + { + if(trusted!=null){ + BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.bases.length)); +// System.out.println(new String(r.bases)); + for(int i=bs.nextClearBit(0); i0); + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + + final int k1=(k+1)/2; + final int k2=k/2; + final int kbits1=2*k1; + final int kbits2=2*k2; + final int gap=count.gap; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + Read r2=r.mate; + { + if(trusted!=null){ + BitSet bs=(conservative ? 
ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.bases.length)); +// System.out.println(new String(r.bases)); + for(int i=bs.nextClearBit(0); i0){ + r.mate.reverseComplement(); + r=r.joinRead(); + } + + int len=0; + long kmer=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + for(int i=0; i=k){ + keysCountedLocal++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + +// System.out.println("Arrays.toString(buffer)); + buffer[bufflen]=kmer; + bufflen++; + if(bufflen>=buffer.length){ +// assert(false) : "Submitting "+Arrays.toString(buffer); + count.increment(buffer); + bufflen=0; + if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];} + } +// count.increment(kmer); + +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addRead(r, count, k, mask, false); + } + } + + private void addReadSplit(Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + + if(PREJOIN && r.mate!=null && r.insert()>0){ + if(verbose){System.err.println("Prejoining "+r.numericID+" at "+r.insert());} + r.mate.reverseComplement(); + r=r.joinRead(); + } + + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + if(verbose){System.err.println("Hashing read "+r.numericID+"; loop limits "+(k1+gap)+"-"+(bases.length));} + for(int i=0, j=i+k1+gap; j=k1){ + + keysCountedLocal++; +// System.out.print("Incrementing 
"+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1<=buffer.length){ + count.increment(buffer); + bufflen=0; + if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];} + } +// count.increment(kmer); + + +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); + } + } + + private final ConcurrentReadStreamInterface cris; + private final int k; + private final boolean rcomp; + private final KCountArray count; + private final KCountArray trusted; + private final int thresh; + private final int detectStepsize; + private final boolean conservative; + private long keysCountedLocal=0; + private long readsProcessedLocal=0; + private long[] buffer; + private int bufflen=0; + private final boolean MAKE_NEW_ARRAY; + } + + public static final int min(int x, int y){return xy ? 
x : y;} + + public static boolean verbose=false; + public static byte minQuality=9; + public static long readsProcessed=0; + public static long maxReads=-1; + public static int BUFFERLEN=500; + + public static long keysCounted=0; + + public static int THREADS=4; + public static boolean PREJOIN=false; + +} diff --git a/current/kmer/KmerCount7MT.java b/current/kmer/KmerCount7MT.java new file mode 100755 index 0000000..b94f7ba --- /dev/null +++ b/current/kmer/KmerCount7MT.java @@ -0,0 +1,1090 @@ +package kmer; + +import java.lang.Thread.State; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; + +import jgi.ErrorCorrect; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import align2.ListNum; +import align2.Tools; +import dna.AminoAcid; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class KmerCount7MT { + +public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + + String fname1=args[0]; + String fname2=(args.length>1 ? args[1] : null); + int k=14; + int cbits=16; + int gap=0; + int matrixbits=-1; + int hashes=1; + + for(int i=2; i1 ? 
split[1] : "true"); + + if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else if(a.startsWith("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.startsWith("matrixbits")){ + matrixbits=Integer.parseInt(b); + }else if(a.startsWith("hashes")){ + hashes=Integer.parseInt(b); + }else if(a.equals("canonical")){ + CANONICAL=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + int kbits=Tools.min(2*k, 62); + if(matrixbits<0){ + matrixbits=kbits; + } + matrixbits=Tools.min(kbits, matrixbits); + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.TARGET_READ_LEN=300000000; + FastaReadInputStream.MIN_READ_LEN=k; + }else{ + FASTQ.PARSE_CUSTOM=false; +// assert(false) : FASTQ.PARSE_CUSTOM; + } + + KCountArray count=KCountArray.makeNew(1L<=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2))); + System.out.println(); + System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, int k, int cbits){ + return makeKca(fname1, fname2, extraFiles, k, cbits, 0, Tools.min(2*k, 35), 1, minQuality, true, maxReads, 1, 1, 1, 2); + } + + public 
static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, + int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads){ + assert(matrixbits<63); + return makeKca(fname1, fname2, extraFiles, k, cbits, gap, matrixbits, hashes, minqual, rcomp, maxreads, 1, 1, 1, 2); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, + int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2){ + assert(matrixbits<63); + return makeKca(fname1, fname2, extraFiles, + k, cbits, gap, 1L< extraFiles, + int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2){ + return makeKca(fname1, fname2, extraFiles, + k, cbits, gap, cells, hashes, minqual, rcomp, maxreads, passes, stepsize, thresh1, thresh2, null); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, + int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2, + KCountArray prefilter){ + final int kbits=Tools.min(2*k, 62); +// verbose=true; + if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", gap="+gap+", cells="+Tools.toKMG(cells)+", cbits="+cbits);} + + boolean oldsplit=FastaReadInputStream.SPLIT_READS; + long oldmax=maxReads; + byte oldq=minQuality; + maxReads=maxreads; + minQuality=(byte)minqual; + // System.out.println("kbits="+(kbits)+" -> "+(1L< "+(1L< extra2=null; + if(fname1!=null && fname1.contains(",")){ + String[] s=fname1.split(","); + if(extra2==null){extra2=new ArrayList();} + for(int i=1; i();} + for(int i=1; i1); + KCountArray trusted=null; + for(int i=1; i2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3) + int step=(stepsize==1 ? 
1 : stepsize+i%2); + // if(!conservative){step=(step+3)/4;} + if(!conservative){step=Tools.min(3, (step+3)/4);} + + try { + countFastq(fname1, fname2, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(extraFiles!=null){ + maxReads=-1; + for(String s : extraFiles){ + try { + countFastq(s, null, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + kca.shutdown(); + + System.out.println("Trusted: \t"+kca.toShortString()); + trusted=kca; + kca=KCountArray.makeNew(1L<=1 && k<20); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final long cells=mask+1; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + final KCountArray count=KCountArray.makeNew(cells, cbits, gap); + + TextFile tf=new TextFile(fname, false, false); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='>'){ + for(int i=0; i=k && (!CANONICAL || KCountArray.isCanonical(kmer, k))){ + keysCounted++; + count.increment(kmer); + } + } + } + } + } + return count; + } + + public static KCountArray countFastq(String reads1, String reads2, int k, int cbits, int gap, boolean rcomp, KCountArray count) throws Exception{ + assert(k>=1 && (count!=null || k<20)); + final int kbits=Tools.min(2*k, 62); + final long mask=~((-1L)<<(kbits)); +// System.err.println("countFastq... 
making a new cris"); + if(count==null){ + final long cells=1L<=1 && (count!=null || k<20)); + final int kbits=Tools.min(2*k, 62); + final long mask=~((-1L)<<(kbits)); + +// System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh); +// System.out.println("\ntrusted=\n"+trusted); +// System.out.println("\ncount=\n"+count); + +// verbose=true; + + if(count==null){ + final long cells=1L<0){ + if(bufflen=1 && count!=null); + +// System.out.println("Waiting for list"); + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); +// System.out.println("Got list: "+(ln==null ? "null" : ln.id)+", "+(ln==null || ln.list==null ? "null" : ln.list.size())); + + + if(count.gap==0){ + final int kbits=Tools.min(2*k, 62); + final long mask=~((-1L)<<(kbits)); + + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + if(readsamplerate<2 || r.numericID%readsamplerate==0){ + readsProcessedLocal++; + addRead(r, count, k, mask, rcomp); + if(r.mate!=null){ + addRead(r.mate, count, k, mask, rcomp); + } + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + }else{ + final int k1=(k+1)/2; + final int k2=k/2; + final int kbits1=2*k1; + final int kbits2=2*k2; + final int gap=count.gap; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + if(readsamplerate<2 || r.numericID%readsamplerate==0){ + readsProcessedLocal++; + addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); + if(r.mate!=null){ + addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); + } + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + } + + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln==null ? true : ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + } + + + + + private void countFastq(final ConcurrentReadStreamInterface cris, final int k, final boolean rcomp, + final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){ + if(count.gap>0){countFastqSplit(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);} + assert(k>=1 && (count!=null || k<20)); + final int kbits=Tools.min(2*k, 62); + final long mask=~((-1L)<<(kbits)); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + Read r2=r.mate; + { + if(trusted!=null){ + BitSet bs=(conservative ? 
ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.bases.length)); +// System.out.println(new String(r.bases)); + if(bs!=null){ + for(int i=bs.nextClearBit(0); i0); + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + + final int k1=(k+1)/2; + final int k2=k/2; + final int kbits1=2*k1; + final int kbits2=2*k2; + final int gap=count.gap; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + Read r2=r.mate; + { + if(trusted!=null){ + BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.bases.length)); +// System.out.println(new String(r.bases)); + for(int i=bs.nextClearBit(0); i31){ + addReadLong(r, count, k, mask, rcomp); + return; + } + if(PREJOIN && r.mate!=null && r.insert()>0){ + r.mate.reverseComplement(); + r=r.joinRead(); + } + + int len=0; + long kmer=0; + float prob=1; + byte[] bases=r.bases; + byte[] quals=r.quality; + + if(bases==null || bases.lengthk){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || q=k && (!CANONICAL || KCountArray.isCanonical(kmer, k))){ + keysCountedLocal++; + // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + buffer[bufflen]=kmer; + bufflen++; + if(bufflen>=buffer.length){ + // assert(false) : "Submitting "+Arrays.toString(buffer); + count.increment(buffer); + bufflen=0; + if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];} + } + } + } + } + }else{ + for(int i=0; ik){ + byte 
oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || q=k && i%kmersamplerate==0 && (!CANONICAL || KCountArray.isCanonical(kmer, k))){ + keysCountedLocal++; + // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + buffer[bufflen]=kmer; + bufflen++; + if(bufflen>=buffer.length){ + // assert(false) : "Submitting "+Arrays.toString(buffer); + count.increment(buffer); + bufflen=0; + if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];} + } + } + } + } + } + + if(rcomp){ + r.reverseComplement(); + addRead(r, count, k, mask, false); + } + } + + + + private void addReadLong(Read r, final KCountArray count, final int k, final long mask, boolean rcomp){ + + if(PREJOIN && r.mate!=null && r.insert()>0){ + r.mate.reverseComplement(); + r=r.joinRead(); + } + + int tailshift=k%32; + int tailshiftbits=tailshift*2; + + int len=0; + long kmer=0; + float prob=1; + byte[] bases=r.bases; + byte[] quals=r.quality; + + if(kmersamplerate<2){ + for(int i=0; ik){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || qk){ + long x2=AminoAcid.baseToNumber[bases[i-k]]; + kmer=kmer^(x2<=k){ + keysCountedLocal++; + // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + // System.out.println("Arrays.toString(buffer)); + buffer[bufflen]=kmer; + bufflen++; + if(bufflen>=buffer.length){ + // assert(false) : "Submitting "+Arrays.toString(buffer); + count.increment(buffer); + bufflen=0; + if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];} + } + } + } + } + }else{ + for(int i=0; ik){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || qk){ + long x2=AminoAcid.baseToNumber[bases[i-k]]; + kmer=kmer^(x2<=k && i%kmersamplerate==0){ + keysCountedLocal++; + // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + // System.out.println("Arrays.toString(buffer)); + 
buffer[bufflen]=kmer; + bufflen++; + if(bufflen>=buffer.length){ + // assert(false) : "Submitting "+Arrays.toString(buffer); + count.increment(buffer); + bufflen=0; + if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];} + } + } + } + } + } + + + if(rcomp){ + r.reverseComplement(); + addReadLong(r, count, k, mask, false); + } + } + + private void addReadSplit(Read r, final KCountArray count, final int k1, final int k2, final long mask1, final long mask2, final int gap, boolean rcomp){ + + if(PREJOIN && r.mate!=null && r.insert()>0){ + if(verbose){System.err.println("Prejoining "+r.numericID+" at "+r.insert());} + r.mate.reverseComplement(); + r=r.joinRead(); + } + + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + float prob=1; + byte[] bases=r.bases; + byte[] quals=r.quality; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + if(verbose){System.err.println("Hashing read "+r.numericID+"; loop limits "+(k1+gap)+"-"+(bases.length));} + for(int i=0, j=i+k1+gap; jk){ + byte oldq1=quals[i-k1]; + byte oldq2=quals[j-k2]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq1]*align2.QualityTools.PROB_CORRECT_INVERSE[oldq2]; + } + } + + if(x1<0 || x2<0 || q1=k1){ + + keysCountedLocal++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1<=buffer.length){ + count.increment(buffer); + bufflen=0; + if(MAKE_NEW_ARRAY){buffer=new long[BUFFERLEN];} + } +// count.increment(kmer); + + +// System.out.println(" -> "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); + } + } + + private final ConcurrentReadStreamInterface cris; + private final int k; + private 
final boolean rcomp; + private final KCountArray count; + private final KCountArray trusted; + private final int thresh; + private final int detectStepsize; + private final boolean conservative; + private long keysCountedLocal=0; + private long readsProcessedLocal=0; + private long[] buffer; + private int bufflen=0; + private final boolean MAKE_NEW_ARRAY; + } + + public static final int min(int x, int y){return xy ? x : y;} + + public static boolean verbose=false; + public static byte minQuality=7; + public static long readsProcessed=0; + public static long maxReads=-1; + public static int kmersamplerate=1; + public static int readsamplerate=1; + public static int BUFFERLEN=500; + + public static float minProb=0.5f; + + public static long keysCounted=0; + + public static int THREADS=4; + public static boolean PREJOIN=false; + public static boolean CANONICAL=false; + +} diff --git a/current/kmer/KmerCount7MTA.java b/current/kmer/KmerCount7MTA.java new file mode 100755 index 0000000..e9d7b93 --- /dev/null +++ b/current/kmer/KmerCount7MTA.java @@ -0,0 +1,1027 @@ +package kmer; + +import java.lang.Thread.State; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.BitSet; + +import jgi.ErrorCorrect; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; +import dna.AminoAcid; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Jul 5, 2012 + * + */ +public class KmerCount7MTA { + +public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + + String fname1=args[0]; + String fname2=(args.length>1 ? args[1] : null); + int k=14; + int cbits=16; + int gap=0; + int matrixbits=-1; + int hashes=1; + + for(int i=2; i1 ? 
split[1] : "true"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("k") || a.equals("kmer")){ + k=Integer.parseInt(b); + }else if(a.startsWith("cbits") || a.startsWith("cellbits")){ + cbits=Integer.parseInt(b); + }else if(a.startsWith("gap")){ + gap=Integer.parseInt(b); + }else if(a.startsWith("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.startsWith("matrixbits")){ + matrixbits=Integer.parseInt(b); + }else if(a.startsWith("hashes")){ + hashes=Integer.parseInt(b); + }else if(a.equals("canonical")){ + CANONICAL=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + int kbits=Tools.min(2*k, 62); + if(matrixbits<0){ + matrixbits=kbits; + } + matrixbits=Tools.min(kbits, matrixbits); + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.TARGET_READ_LEN=300000000; + FastaReadInputStream.MIN_READ_LEN=k; + }else{ + FASTQ.PARSE_CUSTOM=false; +// assert(false) : FASTQ.PARSE_CUSTOM; + } + + KCountArray count=KCountArray.makeNew(1L<=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("Avg Sites/Key: \t \t"+String.format("%.3f ",(keysCounted*1d/sum2))); + System.out.println(); + System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, int k, int cbits){ + 
return makeKca(fname1, fname2, extraFiles, k, cbits, 0, Tools.min(2*k, 35), 1, minQuality, true, maxReads, 1, 1, 1, 2); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, + int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads){ + assert(matrixbits<63); + return makeKca(fname1, fname2, extraFiles, k, cbits, gap, matrixbits, hashes, minqual, rcomp, maxreads, 1, 1, 1, 2); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, + int k, int cbits, int gap, int matrixbits, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2){ + assert(matrixbits<63); + return makeKca(fname1, fname2, extraFiles, + k, cbits, gap, 1L< extraFiles, + int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2){ + return makeKca(fname1, fname2, extraFiles, + k, cbits, gap, cells, hashes, minqual, rcomp, maxreads, passes, stepsize, thresh1, thresh2, null); + } + + public static KCountArray makeKca(String fname1, String fname2, Iterable extraFiles, + int k, int cbits, int gap, long cells, int hashes, int minqual, boolean rcomp, long maxreads, int passes, int stepsize, int thresh1, int thresh2, + KCountArray prefilter){ + final int kbits=Tools.min(2*k, 62); +// verbose=true; + if(verbose){System.err.println("Making kca from ("+fname1+", "+fname2+")\nk="+k+", gap="+gap+", cells="+Tools.toKMG(cells)+", cbits="+cbits);} + + boolean oldsplit=FastaReadInputStream.SPLIT_READS; + long oldmax=maxReads; + byte oldq=minQuality; + maxReads=maxreads; + minQuality=(byte)minqual; + // System.out.println("kbits="+(kbits)+" -> "+(1L< "+(1L< extra2=null; + if(fname1!=null && fname1.contains(",")){ + String[] s=fname1.split(","); + if(extra2==null){extra2=new ArrayList();} + for(int i=1; i();} + for(int i=1; i1); + KCountArray trusted=null; + for(int i=1; 
i2;// /*or, alternately, (trusted==null || trusted.capacity()>0.3) + int step=(stepsize==1 ? 1 : stepsize+i%2); + // if(!conservative){step=(step+3)/4;} + if(!conservative){step=Tools.min(3, (step+3)/4);} + + try { + countFastq(fname1, fname2, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(extraFiles!=null){ + maxReads=-1; + for(String s : extraFiles){ + try { + countFastq(s, null, k, cbits, true, kca, trusted, maxreads, thresh1, step, conservative); + } catch (Exception e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + kca.shutdown(); + + System.out.println("Trusted: \t"+kca.toShortString()); + trusted=kca; + kca=KCountArray.makeNew(1L<=1 && k<20); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + final long cells=mask+1; + if(verbose){System.err.println("k="+k+", kbits="+kbits+", cells="+cells+", mask="+Long.toHexString(mask));} + final KCountArray count=KCountArray.makeNew(cells, cbits, gap); + + TextFile tf=new TextFile(fname, false, false); + + long kmer=0; //current kmer + int len=0; //distance since last contig start or ambiguous base + + String s=null; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='>'){ + for(int i=0; i=k && (!CANONICAL || KCountArray.isCanonical(kmer, k))){ + keysCounted++; + count.increment(kmer); + } + } + } + } + } + return count; + } + + public static KCountArray countFastq(String reads1, String reads2, int k, int cbits, int gap, boolean rcomp, KCountArray count) throws Exception{ + assert(k>=1 && (count!=null || k<20)); + final int kbits=Tools.min(2*k, 62); + final long mask=~((-1L)<<(kbits)); +// System.err.println("countFastq... 
making a new cris"); + if(count==null){ + final long cells=1L<=1 && (count!=null || k<20)); + final int kbits=Tools.min(2*k, 62); + final long mask=~((-1L)<<(kbits)); + +// System.out.println("k="+k+", kbits="+kbits+", mask="+Long.toHexString(mask)+", thresh="+thresh); +// System.out.println("\ntrusted=\n"+trusted); +// System.out.println("\ncount=\n"+count); + +// verbose=true; + + if(count==null){ + final long cells=1L<=1 && count!=null); + +// System.out.println("Waiting for list"); + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); +// System.out.println("Got list: "+(ln==null ? "null" : ln.id)+", "+(ln==null || ln.list==null ? "null" : ln.list.size())); + + long[] array=null; + if(count.gap==0){ + final int kbits=Tools.min(2*k, 62); + final long mask=~((-1L)<<(kbits)); + + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + if(readsamplerate<2 || r.numericID%readsamplerate==0){ + readsProcessedLocal++; +// addRead(r, count, k, mask, rcomp); +// if(r.mate!=null){ +// addRead(r.mate, count, k, mask, rcomp); +// } + array=addRead_Advanced(r, count, k, mask, array); + } +// System.out.println(r); +// System.out.println("kmers hashed: "+keysCountedLocal); + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + }else{ + final int k1=(k+1)/2; + final int k2=k/2; + final int kbits1=2*k1; + final int kbits2=2*k2; + final int gap=count.gap; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + if(readsamplerate<2 || r.numericID%readsamplerate==0){ + readsProcessedLocal++; + addReadSplit(r, count, k1, k2, mask1, mask2, gap, rcomp); + if(r.mate!=null){ + addReadSplit(r.mate, count, k1, k2, mask1, mask2, gap, rcomp); + } + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + } + + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln==null ? true : ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + } + + + + + private final void countFastq(final ConcurrentReadStreamInterface cris, final int k, final boolean rcomp, + final KCountArray count, final KCountArray trusted, final int thresh, final int detectStepsize, final boolean conservative){ + if(count.gap>0){countFastqSplit(cris, k, rcomp, count, trusted, thresh, detectStepsize, conservative);} + assert(k>=1 && (count!=null || k<20)); + final int kbits=Tools.min(2*k, 62); + final long mask=~((-1L)<<(kbits)); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + long[] array=null; + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + Read r2=r.mate; + { + if(trusted!=null){ + BitSet bs=(conservative ? 
ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.bases.length)); +// System.out.println(new String(r.bases)); + if(bs!=null){ + for(int i=bs.nextClearBit(0); i0); + assert(k<32 && k>=1 && (count!=null || k<20)); + final int kbits=2*k; + final long mask=~((-1L)<<(kbits)); + + + final int k1=(k+1)/2; + final int k2=k/2; + final int kbits1=2*k1; + final int kbits2=2*k2; + final int gap=count.gap; + final long mask1=~((-1L)<<(kbits1)); + final long mask2=~((-1L)<<(kbits2)); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + Read r2=r.mate; + { + if(trusted!=null){ + BitSet bs=(conservative ? ErrorCorrect.detectErrorsBulk(r, trusted, k, thresh, detectStepsize) : + ErrorCorrect.detectTrusted(r, trusted, k, thresh, detectStepsize)); +// System.out.println("\n"+toString(bs, r.bases.length)); +// System.out.println(new String(r.bases)); + for(int i=bs.nextClearBit(0); i0){ + r1.mate.reverseComplement(); + r1=r1.joinRead(); + } + Read r2=r1.mate; + int len1=r1.bases==null ? 0 : Tools.max(0, r1.bases.length-k+1); + int len2=(r2==null || r2.bases==null) ? 
0 : Tools.max(0, r2.bases.length-k+1); + int len=len1+len2; + if(len<1){return array;} + if(array==null || array.length!=len){array=new long[len];} + Arrays.fill(array, -1); + fillKmerArray(r1, k, mask, array, 0, len1); + if(r2!=null){fillKmerArray(r2, k, mask, array, len1, len);} + if(KEEP_DUPLICATE_KMERS){ + for(long kmer : array){ + if(kmer!=-1){ + keysCountedLocal++; + count.increment(kmer); + } + } + }else{ + Arrays.sort(array); + long prev=-1; + for(int i=0; i31){ + fillKmerArrayLong(r, k, array, start, stop); + return; + } + assert(count.gap==0); + assert(k<32); + assert(!PREJOIN || r.mate==null); + assert(CANONICAL); + assert(array!=null); + + final byte[] bases=r.bases; + final byte[] quals=r.quality; + + if(bases==null || bases.lengthk){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || q=k && (kmersamplerate<2 || i%kmersamplerate==0)){ + // System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + +// assert(array[idx]==-1 || array[idx]==AminoAcid.reverseComplementBinaryFast(kmer, k)) : +// "\npass="+pass+", start="+start+", stop="+stop+", i="+i+", idx="+idx+"\n"+ +// "array[idx]="+array[idx]+", kmer="+kmer+", rcomp="+AminoAcid.reverseComplementBinaryFast(kmer, k)+ +// "\n"+Arrays.toString(array); //TODO: Remove slow assertion +// System.out.println("\npass="+pass+", i="+i+", idx="+idx+"\nComparing "+kmer+"\tto \t"+array[idx]); + array[idx]=Tools.max(array[idx], kmer); +// System.out.println("Kept "+array[idx]+"\trcomp\t"+AminoAcid.reverseComplementBinaryFast(kmer, k)); + } + } + if(pass==0){idx++;}else{idx--;} + } +// System.out.println(Arrays.toString(array)); + r.reverseComplement(); + } + } + + private final void fillKmerArrayLong(Read r, final int k, final long[] array, final int start, final int stop){ + assert(k>31); + assert(count.gap==0); + assert(!PREJOIN || r.mate==null); + assert(CANONICAL); + assert(array!=null); + + final byte[] bases=r.bases; + final byte[] 
quals=r.quality; + + final int tailshift=k%32; + final int tailshiftbits=tailshift*2; + final long mask=Long.MAX_VALUE; + + if(bases==null || bases.lengthk){ + byte oldq=quals[i-k]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq]; + } + } + + if(x<0 || q=k){ + if(len>k){ + long x2=AminoAcid.baseToNumber[bases[i-k]]; + kmer=kmer^(x2<0){ + if(verbose){System.err.println("Prejoining "+r.numericID+" at "+r.insert());} + r.mate.reverseComplement(); + r=r.joinRead(); + } + + int len=0; + int shift=k2*2; + long kmer1=0; + long kmer2=0; + float prob=1; + byte[] bases=r.bases; + byte[] quals=r.quality; + + assert(kmer1>=kmer2); + +// assert(false) : k1+", "+k2+", "+mask1+", "+mask2+", "+gap; + + if(verbose){System.err.println("Hashing read "+r.numericID+"; loop limits "+(k1+gap)+"-"+(bases.length));} + for(int i=0, j=i+k1+gap; jk){ + byte oldq1=quals[i-k1]; + byte oldq2=quals[j-k2]; + prob=prob*align2.QualityTools.PROB_CORRECT_INVERSE[oldq1]*align2.QualityTools.PROB_CORRECT_INVERSE[oldq2]; + } + } + + if(x1<0 || x2<0 || q1=k1){ + + keysCountedLocal++; +// System.out.print("Incrementing "+Long.toHexString(kmer)+": "+count.read(kmer)); + + long key=(kmer1< "+count.read(kmer)); +// System.out.print("Incrementing array for "+Long.toHexString(kmer)+": "+array[(int)kmer]); +// array[(int)kmer]++; +// System.out.println(" -> "+array[(int)kmer]+"\n"); +// assert(array[(int)kmer]==count.read(kmer) || array[(int)kmer]>3); + } + } + } + if(rcomp){ + r.reverseComplement(); + addReadSplit(r, count, k1, k2, mask1, mask2, gap, false); + } + } + + private final ConcurrentReadStreamInterface cris; + private final int k; + private final boolean rcomp; + private final KCountArray count; + private final KCountArray trusted; + private final int thresh; + private final int detectStepsize; + private final boolean conservative; + private long keysCountedLocal=0; + private long readsProcessedLocal=0; + } + + public static final int min(int x, int y){return xy ? 
x : y;} + + public static byte minQuality=6; + public static long readsProcessed=0; + public static long maxReads=-1; + public static int kmersamplerate=1; + public static int readsamplerate=1; + public static int BUFFERLEN=500; + + public static float minProb=0.5f; + + public static long keysCounted=0; + + public static int THREADS=Shared.THREADS; + public static boolean verbose=false; + public static boolean PREJOIN=false; + public static boolean CANONICAL=false; + public static boolean KEEP_DUPLICATE_KMERS=false; + +} diff --git a/current/kmer/KmerLink.java b/current/kmer/KmerLink.java new file mode 100755 index 0000000..f4cf43f --- /dev/null +++ b/current/kmer/KmerLink.java @@ -0,0 +1,91 @@ +package kmer; + +import java.util.ArrayList; + +/** + * @author Brian Bushnell + * @date Oct 22, 2013 + * + */ +public class KmerLink { + + public KmerLink(long pivot_){ + pivot=pivot_; + } + + public KmerLink(long pivot_, int value_){ + pivot=pivot_; + count=value_; + } + + int increment(long kmer){ + if(pivot<0){pivot=kmer; return (count=1);} //Allows initializing empty nodes to -1 + if(kmer==pivot){ + if(count list){ + if(next!=null){next.traversePrefix(list);} + list.add(this); + } + + void traverseInfix(ArrayList list){ + list.add(this); + if(next!=null){next.traverseInfix(list);} + } + + KmerLink rebalance(ArrayList list){ + throw new RuntimeException("Unsupported."); + } + + private static KmerLink rebalance(ArrayList list, int a, int b){ + throw new RuntimeException("Unsupported."); + } + + long pivot; + int count; + KmerLink next; + +} diff --git a/current/kmer/KmerNode.java b/current/kmer/KmerNode.java new file mode 100755 index 0000000..a345a66 --- /dev/null +++ b/current/kmer/KmerNode.java @@ -0,0 +1,189 @@ +package kmer; + +import java.util.ArrayList; + +/** + * @author Brian Bushnell + * @date Oct 22, 2013 + * + */ +public class KmerNode { + + public KmerNode(long pivot_){ + pivot=pivot_; + } + + public KmerNode(long pivot_, int value_){ + pivot=pivot_; + 
count=value_; + } + + int increment(long kmer){ + if(pivot<0){pivot=kmer; return (count=1);} //Allows initializing empty nodes to -1 + if(kmerpivot){ + if(right==null){right=new KmerNode(kmer, 1); return 1;} + return right.increment(kmer); + }else{ + if(countpivot){ + if(right==null){right=new KmerNode(kmer, value); return 1;} + return right.set(kmer, value); + }else{ + count=value; + } + return 0; + } + + /** Returns number of nodes added */ + int setIfNotPresent(long kmer, int value){ + if(pivot<0){pivot=kmer; count=value; return 1;} //Allows initializing empty nodes to -1 + if(kmerpivot){ + if(right==null){right=new KmerNode(kmer, value); return 1;} + return right.setIfNotPresent(kmer, value); + } + return 0; + } + + KmerNode get(long kmer){ + if(kmerpivot){ + return right==null ? null : right.get(kmer); + }else{ + return this; + } + } + + KmerNode getNodeOrParent(long kmer){ + if(pivot==kmer || pivot<0){return this;} + if(kmerpivot){ + if(right==null){right=n; return true;} + return right.insert(n); + }else{ + return false; + } + } + + int getCount(long kmer){ +// KmerNode node=get(kmer); +// return node==null ? 0 : node.count; + +// if(kmerpivot){ +// return right==null ? 0 : right.getCount(kmer); +// }else{ +// return count; +// } + +// if(kmer==pivot){ +// return count; +// }else if(kmerpivot); +// return right==null ? 
0 : right.getCount(kmer); +// } + + KmerNode n=this; + while(n!=null && n.pivot!=kmer){ + n=(kmer list){ + if(left!=null){left.traversePrefix(list);} + list.add(this); + if(right!=null){right.traversePrefix(list);} + } + + void traverseInfix(ArrayList list){ + list.add(this); + if(left!=null){left.traverseInfix(list);} + if(right!=null){right.traverseInfix(list);} + } + + KmerNode rebalance(ArrayList list){ + assert(list.isEmpty()); + traversePrefix(list); + KmerNode n=this; + if(list.size()>2){ + n=rebalance(list, 0, list.size()-1); + } + list.clear(); + return n; + } + + public StringBuilder dumpKmersAsText(StringBuilder sb, int k){ + if(count<1){return sb;} + if(sb==null){sb=new StringBuilder(32);} + sb.append(AbstractKmerTable.toText(pivot, count, k).append('\n')); + if(left!=null){left.dumpKmersAsText(sb, k);} + if(right!=null){left.dumpKmersAsText(sb, k);} + return sb; + } + + private static KmerNode rebalance(ArrayList list, int a, int b){ + final int size=b-a+1; + final int middle=a+size/2; + final KmerNode n=list.get(middle); + if(size<4){ + if(size==1){ + n.left=n.right=null; + }else if(size==2){ + KmerNode n1=list.get(a); + n.left=n1; + n.right=null; + n1.left=n1.right=null; + }else{ + assert(size==3); + KmerNode n1=list.get(a), n2=list.get(b); + n.left=n1; + n.right=n2; + n1.left=n1.right=null; + n2.left=n2.right=null; + } + }else{ + n.left=rebalance(list, a, middle-1); + n.right=rebalance(list, middle+1, b); + } + return n; + } + + long pivot; + int count; + KmerNode left, right; + +} diff --git a/current/kmer/KmerTable.java b/current/kmer/KmerTable.java new file mode 100755 index 0000000..1e5670b --- /dev/null +++ b/current/kmer/KmerTable.java @@ -0,0 +1,238 @@ +package kmer; + +import java.util.ArrayList; + + +import fileIO.TextStreamWriter; + +import align2.Tools; + +/** + * @author Brian Bushnell + * @date Oct 23, 2013 + * + */ +public final class KmerTable extends AbstractKmerTable { + + public KmerTable(int initialSize, boolean autoResize_){ + 
if(initialSize>1){ + initialSize=(int)Tools.min(maxPrime, Primes.primeAtLeast(initialSize)); + }else{ + initialSize=1; + } + prime=initialSize; + sizeLimit=(long) (initialSize*resizeMult); + array=new KmerLink[prime]; + autoResize=autoResize_; + } + + + + int increment(long kmer){ + final int cell=(int)(kmer%prime); + KmerLink n=array[cell], prev=null; + while(n!=null && n.pivot!=kmer){ + prev=n; + n=n.next; + } + if(n==null){ + n=new KmerLink(kmer, 1); + size++; + if(prev==null){ + array[cell]=n; + }else{ + prev.next=n; + } + if(autoResize && size>sizeLimit){resize();} + }else{ + n.count++; + if(n.count<0){n.count=Integer.MAX_VALUE;} + } + return n.count; + } + + int incrementAndReturnNumCreated(long kmer){ + final int cell=(int)(kmer%prime); + KmerLink n=array[cell], prev=null; + while(n!=null && n.pivot!=kmer){ + prev=n; + n=n.next; + } + if(n==null){ + n=new KmerLink(kmer, 1); + size++; + if(prev==null){ + array[cell]=n; + }else{ + prev.next=n; + } + if(autoResize && size>sizeLimit){resize();} + return 1; + }else{ + n.count++; + if(n.count<0){n.count=Integer.MAX_VALUE;} + return 0; + } + } + + + int set(long kmer, int value){ + int x=1, cell=(int)(kmer%prime); + final KmerLink n=array[cell]; + if(n==null){ + array[cell]=new KmerLink(kmer, value); + }else{ + x=n.set(kmer, value); + } + size+=x; + if(autoResize && size>sizeLimit){resize();} + return x; + } + + public int setIfNotPresent(long kmer, int value){ + int x=1, cell=(int)(kmer%prime); + final KmerLink n=array[cell]; + if(n==null){ + array[cell]=new KmerLink(kmer, value); + }else{ + x=n.setIfNotPresent(kmer, value); + } + size+=x; + if(autoResize && size>sizeLimit){resize();} + return x; + } + + KmerLink get(long kmer){ +// int cell=(int)(kmer%prime); +// KmerLink n=array[cell]; +// return n==null ? 
null : n.get(kmer); + + int cell=(int)(kmer%prime); + KmerLink n=array[cell]; + while(n!=null && n.pivot!=kmer){n=n.next;} + return n; + } + + public int getCount(long kmer){ +// KmerLink node=get(kmer); +// return node==null ? 0 : node.count; + + int cell=(int)(kmer%prime); + KmerLink n=array[cell]; + while(n!=null && n.pivot!=kmer){n=n.next;} + return n==null ? 0 : n.count; + } + +// int getCount(LongM kmer){ +// KmerLink node=get(kmer.value()); +// return node==null ? 0 : node.count; +// } + + public boolean contains(long kmer){ + KmerLink node=get(kmer); + return node!=null; + } + + boolean insert(KmerLink n){ + n.next=null; + int cell=(int)(n.pivot%prime); + if(array[cell]==null){ + array[cell]=n; + return true; + } + return array[cell].insert(n); + } + + public void rebalance(){ + ArrayList list=new ArrayList(1000); + for(int i=0; i=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Unique: \t \t"+sum2); + System.out.println("CollisionsA:\t \t"+collisionsA); + System.out.println("CollisionsB:\t \t"+collisionsB); + + double modifier=(collisionsB)/(double)(32*collisionsA+8*collisionsB); + + System.out.println("Estimate: \t \t"+(sum2+collisionsA+collisionsB-(long)(collisionsA*modifier))); + System.out.println(); + System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + + } + + public static KCountArray2 countFasta(String fname, int indexbits, int cbits, int k){ + assert(indexbits>=1 && indexbits<40); + collisionsA=0; + collisionsB=0; + final long cells=1L<=k){ + if(len>k){kmer=kmer^rotMasks[x2];} + long hashcode=kmer&0x7fffffffffffffffL; + long code1=hashcode%(cells-3); + long 
code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); + int value=count.increment2(code1, 1); + long temp=count.read(code2); + if(temp>0){ + if(value==0){collisionsA++;} + else{collisionsB++;} + } + } + } + } + } + } + return count; + } + + public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){ + assert(indexbits>=1 && indexbits<40); + collisionsA=0; + collisionsB=0; + final long cells=1L< ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + len=0; + kmer=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + + for(int i=0; i=k){ + if(len>k){kmer=kmer^rotMasks[x2];} + long hashcode=kmer&0x7fffffffffffffffL; + long code1=hashcode%(cells-3); + long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); + int value=count.increment2(code1, 1); + long temp=count.read(code2); + if(temp>0){ + if(value==0){collisionsA++;} + else{collisionsB++;} + } + } + } + } + + + if(r.mate!=null){ + len=0; + kmer=0; + bases=r.mate.bases; + quals=r.mate.quality; + for(int i=0; i=k){ + if(len>k){kmer=kmer^rotMasks[x2];} + long hashcode=kmer&0x7fffffffffffffffL; + long code1=hashcode%(cells-3); + long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); + int value=count.increment2(code1, 1); + long temp=count.read(code2); + if(temp>0){ + if(value==0){collisionsA++;} + else{collisionsB++;} + } + } + } + } + } + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + System.err.println("Processed "+readsProcessed+" reads."); + } + + return count; + } + + public static final long[] makeRotMasks(int rotDist){ + long[] masks=new long[4]; + for(long i=0; i<4; i++){ + masks[(int)i]=Long.rotateLeft(i, rotDist); + } + return masks; + } + + public static long[] transformToFrequency(int[] count){ + long[] freq=new long[2000]; + int max=freq.length-1; + for(int i=0; iy ? x : y;} + + public static boolean verbose=true; + public static byte minQuality=-5; + public static long readsProcessed=0; + public static long maxReads=1000000L; + public static final int ROTATE_DIST=2; + + public static long collisionsA=0; + public static long collisionsB=0; + +} diff --git a/current/kmer/LargeKmerCount2.java b/current/kmer/LargeKmerCount2.java new file mode 100755 index 0000000..9b294a9 --- /dev/null +++ b/current/kmer/LargeKmerCount2.java @@ -0,0 +1,398 @@ +package kmer; + +import java.util.ArrayList; +import java.util.Random; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + +import align2.ListNum; +import dna.AminoAcid; +import dna.Timer; +import fileIO.FileFormat; +import fileIO.ReadWrite; +import fileIO.TextFile; + +/** + * @author Brian Bushnell + * @date Jul 6, 2012 + * + */ +public class LargeKmerCount2 { + +public static void main(String[] args){ + + Timer t=new Timer(); + t.start(); + + String fname1=args[0]; + String fname2=(args.length>4 || args[1].contains(".") ? 
args[1] : null); + int indexbits=Integer.parseInt(args[args.length-3]); + int cbits=Integer.parseInt(args[args.length-2]); + int k=Integer.parseInt(args[args.length-1]); + + KCountArray2 count=null; + + if(fileIO.FileFormat.hasFastaExtension(fname1)){ + FastaReadInputStream.MIN_READ_LEN=k; + }else{ + FASTQ.PARSE_CUSTOM=false; + } + count=countFastq(fname1, fname2, indexbits, cbits, k); + + FastaReadInputStream.TARGET_READ_LEN=999999999; + + t.stop(); + System.out.println("Finished counting; time = "+t); + + long[] freq=count.transformToFrequency(); + +// System.out.println(count+"\n"); +// System.out.println(Arrays.toString(freq)+"\n"); + + long sum=sum(freq); + System.out.println("Kmer fraction:"); + int lim1=8, lim2=16; + for(int i=0; i=freq.length){prefix=lim1+"+";} + while(prefix.length()<8){prefix=prefix+" ";} + System.out.println(prefix+"\t"+String.format("%.3f%% ",(100l*x/(double)sum))+"\t"+x); + lim1*=2; + lim2=min(lim2*2, freq.length); + } + + long estKmers=load+min(actualCollisions, (long)expectedCollisions); + + long sum2=sum-freq[0]; + long x=freq[1]; + System.out.println(); + System.out.println("Keys Counted: \t \t"+keysCounted); + System.out.println("Unique: \t \t"+sum2); + System.out.println("probCollisions:\t \t"+(long)probNewKeyCollisions); + System.out.println("EstimateP: \t \t"+(sum2+(long)probNewKeyCollisions)); + System.out.println("expectedColl: \t \t"+(long)expectedCollisions); + System.out.println("actualColl: \t \t"+(long)actualCollisions); + System.out.println("estimateKmers: \t \t"+estKmers); + System.out.println(); + System.out.println("Singleton: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + x=sum2-x; + System.out.println("Useful: \t"+String.format("%.3f%% ",(100l*x/(double)sum2))+"\t"+x); + + } + + public static KCountArray2 countFasta(String fname, int indexbits, int cbits, int k){ + assert(indexbits>=1 && indexbits<40); + final long cells=1L<=k){ + keysCounted++; + if(len>k){kmer=kmer^rotMasks[x2];} + long 
hashcode=kmer&0x7fffffffffffffffL; + long code1=hashcode%(cells-3); +// long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); + int value=count.increment2(code1, 1); + + if(value==0){load++;} + else{ + double prob=(load*invCells); + double estKmers=load+probNewKeyCollisions; + double prob2=estKmers*invKmerSpace; + probNewKeyCollisions+=(prob*(1-prob2)); +// probCollisions+=(load*invCells); + } + } + } + } + } + } + return count; + } + + public static KCountArray2 countFastq(String reads1, String reads2, int indexbits, int cbits, int k){ + assert(indexbits>=1 && indexbits<40); + final long cells=1L< ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + + len=0; + kmer=0; + byte[] bases=r.bases; + byte[] quals=r.quality; + + for(int i=0; i=k){ + keysCounted++; + if(len>k){kmer=kmer^rotMasks[x2];} + long hashcode=kmer&0x7fffffffffffffffL; +// hashcode=randy.nextLong()&~((-1L)<<(2*k)); + long code1=hashcode%(cells-3); +// long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); + int value=count.increment2(code1, 1); + + double probCollision=load*invCells; +// expectedCollisions+=probCollision; + expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace); + if(value==0){load++;} + else{ + actualCollisions++; + double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions)); + double estKeys=load+probNewKeyCollisions; + double probOldKey=estKeys*invKmerSpace; + probNewKeyCollisions+=probNewKey*(1-probOldKey); + +// double estKmers=load+min(actualCollisions, expectedCollisions); +// double probOldKmer=estKmers*invKmerSpace; +// probNewKeyCollisions+=(prob*(1-prob2)); + } + +//// probCollisions+=(load*invCells); +// if(value==0){load++;} +// else{ 
+//// long load2=keysCounted-load; +// double prob=Math.sqrt(load*invCells); +// double estKmers=load+probNewKeyCollisions; +// double prob2=estKmers*invKmerSpace; +//// probCollisions+=(prob*(1-prob2)); +//// probCollisions+=Math.sqrt(prob*(1-prob2)); +// probNewKeyCollisions+=Math.sqrt(prob*(1-prob2)); +//// probCollisions+=min(prob, 1-prob2); +//// probCollisions+=(load*invCells); +// } + } + } + } + + + if(r.mate!=null){ + len=0; + kmer=0; + bases=r.mate.bases; + quals=r.mate.quality; + for(int i=0; i=k){ + keysCounted++; + if(len>k){kmer=kmer^rotMasks[x2];} + long hashcode=kmer&0x7fffffffffffffffL; +// hashcode=randy.nextLong()&~((-1L)<<(2*k)); + long code1=hashcode%(cells-3); +// long code2=((~hashcode)&0x7fffffffffffffffL)%(cells-5); + int value=count.increment2(code1, 1); + + double probCollision=load*invCells; +// expectedCollisions+=probCollision; + expectedCollisions+=probCollision*(1-(load+min(expectedCollisions, actualCollisions))*invKmerSpace); + if(value==0){load++;} + else{ + actualCollisions++; + double probNewKey=(load*invCells)*expectedCollisions/(min(expectedCollisions, actualCollisions)); + double estKeys=load+probNewKeyCollisions; + double probOldKey=estKeys*invKmerSpace; + probNewKeyCollisions+=probNewKey*(1-probOldKey); + +// double estKmers=load+min(actualCollisions, expectedCollisions); +// double probOldKmer=estKmers*invKmerSpace; +// probNewKeyCollisions+=(prob*(1-prob2)); + } + +//// probCollisions+=(load*invCells); +// if(value==0){load++;} +// else{ +//// long load2=keysCounted-load; +// double prob=Math.sqrt(load*invCells); +// double estKmers=load+probNewKeyCollisions; +// double prob2=estKmers*invKmerSpace; +//// probCollisions+=(prob*(1-prob2)); +//// probCollisions+=Math.sqrt(prob*(1-prob2)); +// probNewKeyCollisions+=Math.sqrt(prob*(1-prob2)); +//// probCollisions+=min(prob, 1-prob2); +//// probCollisions+=(load*invCells); +// } + } + } + } + } + + } + //System.err.println("returning list"); + cris.returnList(ln, 
ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + System.err.println("Processed "+readsProcessed+" reads."); + } + + return count; + } + + public static final long[] makeRotMasks(int rotDist){ + long[] masks=new long[4]; + for(long i=0; i<4; i++){ + masks[(int)i]=Long.rotateLeft(i, rotDist); + } + return masks; + } + + public static long[] transformToFrequency(int[] count){ + long[] freq=new long[2000]; + int max=freq.length-1; + for(int i=0; iy ? x : y;} + public static final long min(long x, long y){return xy ? x : y;} + public static final double min(double x, double y){return xy ? x : y;} + + public static boolean verbose=true; + public static byte minQuality=-5; + public static long readsProcessed=0; + public static long maxReads=10000000L; + public static final int ROTATE_DIST=2; + + /** Non-empty cells in hash table */ + public static long load; + /** Number of expected collisions */ + public static double expectedCollisions; + /** Number of actual collisions (possibly by same value) */ + public static long actualCollisions; + /** Number of probable collisions caused by new keys */ + public static double probNewKeyCollisions; + /** Inverse of hash table size */ + public static double invCells; + /** Inverse of number of potential kmers */ + public static double invKmerSpace; + /** Inverse of number of potential kmers */ + public static long keysCounted; + + public static final Random randy=new Random(1); + +} diff --git a/current/kmer/Primes.java b/current/kmer/Primes.java new file mode 100755 index 0000000..ad7f777 --- /dev/null +++ b/current/kmer/Primes.java @@ -0,0 +1,163 @@ +package kmer; + +import java.io.File; +import java.util.Arrays; + +import dna.Data; + +import fileIO.TextFile; 
+import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Oct 9, 2012 + * + */ +public class Primes { + + public static void main(String[] args){ + + if(args.length==3){makePrimes(args);} + else{ + + + System.out.println(primeAtLeast(100)); + System.out.println(primeAtLeast(1000)); + System.out.println(primeAtLeast(10000)); + System.out.println(primeAtLeast(100000)); + System.out.println(primeAtLeast(1000000)); + System.out.println(primeAtLeast(10000000)); + System.out.println(primeAtLeast(100000000)); + System.out.println(primeAtLeast(1000000000)); + System.out.println(primeAtLeast(10000000000L)); + System.out.println(primeAtLeast(100000000000L)); + System.out.println(primeAtLeast(1000000000000L)); + System.out.println(primeAtLeast(10000000000000L)); + System.out.println(primeAtLeast(100000000000000L)); + System.out.println(primeAtLeast(1000000000000000L)); + + + System.out.println(primeAtMost(100)); + System.out.println(primeAtMost(1000)); + System.out.println(primeAtMost(10000)); + System.out.println(primeAtMost(100000)); + System.out.println(primeAtMost(1000000)); + System.out.println(primeAtMost(10000000)); + System.out.println(primeAtMost(100000000)); + System.out.println(primeAtMost(1000000000)); + System.out.println(primeAtMost(10000000000L)); + System.out.println(primeAtMost(100000000000L)); + System.out.println(primeAtMost(1000000000000L)); + System.out.println(primeAtMost(10000000000000L)); + System.out.println(primeAtMost(100000000000000L)); + System.out.println(primeAtMost(1000000000000000L)); + + } + + } + + + public static void makePrimes(String[] args){ + + + String in=args[0]; + String out=args[1]; + double mult=Double.parseDouble(args[2]); + assert(mult>=1); + + long next=1; + + if(!new File(in).exists()){throw new RuntimeException("File not found: "+in);} + TextFile tf=new TextFile(in, true, false); + TextStreamWriter tsw=new TextStreamWriter(out, true, false, false); + tsw.start(); + +// int cnt=0; + + for(String 
s=tf.nextLine(); s!=null; s=tf.nextLine()){ +// cnt++; +// if(cnt>10000){break;} + long x=Long.parseLong(s.trim()); + +// System.out.println("cnt="+cnt+", x="+x+", next="+next); + + if(x>=next){ + tsw.print(x+"\n"); + next=(long)(x*mult); + } + } + tsw.poison(); + tf.close(); + + } + + + public static long primeAtLeast(long x){ + int loc=Arrays.binarySearch(primes, x); + if(loc<0){ + loc=-(loc+1); + assert(loc>=primes.length || primes[loc]>x) : x; + } + if(loc>=primes.length){//Out of bounds + long d=(long)Math.pow(x, 0.4); + long a=primeAtLeast(x/d); + long b=primeAtLeast(x/a); + long c=a*b; + assert(c>=x && c<(x*9)/8) : x+", "+a+", "+b+", "+c+", "+d; + return c; + } + while(primes[loc]=x){return p;} +// } +// throw new RuntimeException("No primes big enough for "+x); + } + + + public static long primeAtMost(long x){ + int loc=Arrays.binarySearch(primes, x); + if(loc<0){ + loc=-(loc+1); + assert(loc>=primes.length || primes[loc]>x) : x; + } + assert(loc>=0) : loc+", "+x; + if(loc>=primes.length){//Out of bounds + long d=(long)Math.pow(x, 0.4); + long a=primeAtMost(x/d); + long b=primeAtMost(x/a); + long c=a*b; + assert(c<=x && c>(x*7)/8) : x+", "+a+", "+b+", "+c+", "+d; + return c; + } + assert(loc>=0) : loc+", "+x; + assert(x>=primes[0]) : loc+", "+x+", "+primes[0]; + while(primes[loc]>x){loc--;} + return primes[loc]; + +// for(int i=primes.length-1; i>=0; i--){ +// if(primes[i]<=x){return primes[i];} +// } +// throw new RuntimeException("No primes small enough for "+x); + } + + + /** + * @return + */ + private static long[] fetchPrimes() { + String fname=Data.findPath("?primes.txt.gz"); + + TextFile tf=new TextFile(fname, false, false); + String[] lines=tf.toStringLines(); + long[] array=new long[lines.length]; + for(int i=0; i2000){coverage[i].resize(coverage[i].maxIndex+1);} + ReadWrite.writeObjectInThread(coverage[i], outpattern.replaceFirst("#", ""+i), false); + } + + long totalCoverage=0; + long totalCoverageBase=0; + long totalCoverageN=0; + long 
correctCoverage=0; + long correctCoverageBase=0; + long correctCoverageN=0; + + long onlyCorrectBase=0; + long onlyIncorrectBase=0; + long onlyCorrectN=0; + long onlyIncorrectN=0; + long mostlyCorrectBase=0; + long mostlyIncorrectBase=0; + long mostlyCorrectN=0; + long mostlyIncorrectN=0; + long anyCorrectBase=0; + long anyCorrectN=0; + long noCorrectBase=0; + long noCoverageBase=0; + long noCoverageN=0; + + long baseCount=0; + long nCount=0; + long nCountCovered=0; + + for(int chrom=1; chrom<=Data.numChroms; chrom++){ + ChromosomeArray cha=Data.getChromosome(chrom); + CoverageArray cov=coverage[chrom]; + byte[] cor=correct[chrom]; + for(int i=0; i=mincoverage){ + totalCoverageN+=total; + correctCoverageN+=good; + nCountCovered++; + if(total==good){ + onlyCorrectN++; + mostlyCorrectN++; + }else if(good>bad){ + mostlyCorrectN++; + }else if(good==0){ + onlyIncorrectN++; + mostlyIncorrectN++; + }else if(bad>good){ + mostlyIncorrectN++; + } + if(good>0){anyCorrectN++;} + }else{ + noCoverageN++; + } + }else{ + baseCount++; + if(total>=mincoverage){ + totalCoverageBase+=total; + correctCoverageBase+=good; + if(total==good){ + onlyCorrectBase++; + mostlyCorrectBase++; + }else if(good>bad){ + mostlyCorrectBase++; + }else if(good==0){ + onlyIncorrectBase++; + mostlyIncorrectBase++; + noCorrectBase++; + }else if(bad>good){ + mostlyIncorrectBase++; + } + if(good>0){anyCorrectBase++;} + }else{ + noCoverageBase++; + noCorrectBase++; + } + } + } + Data.unload(chrom, true); + coverage[chrom]=null; + correct[chrom]=null; + } + + long length=nCount+baseCount; + double invlen=1.0/length; + double invbase=1.0/baseCount; + double invn=1.0/nCount; + double invnc=1.0/nCountCovered; //covered N's + + double totalCoverageB=totalCoverage*invlen; + double totalCoverageBaseB=totalCoverageBase*invbase; + double totalCoverageNB=totalCoverageN*invn; + double correctCoverageB=correctCoverage*invlen; + double correctCoverageBaseB=correctCoverageBase*invbase; + double 
correctCoverageNB=correctCoverageN*invn; + + double onlyCorrectBaseB=onlyCorrectBase*invbase*100; + double onlyIncorrectBaseB=onlyIncorrectBase*invbase*100; + double onlyCorrectNB=onlyCorrectN*invnc*100; + double onlyIncorrectNB=onlyIncorrectN*invnc*100; + double mostlyCorrectBaseB=mostlyCorrectBase*invbase*100; + double mostlyIncorrectBaseB=mostlyIncorrectBase*invbase*100; + double mostlyCorrectNB=mostlyCorrectN*invnc*100; + double mostlyIncorrectNB=mostlyIncorrectN*invnc*100; + double anyCorrectBaseB=anyCorrectBase*invbase*100; + double anyCorrectNB=anyCorrectN*invnc*100; + double noCorrectBaseB=noCorrectBase*invbase*100; + double noCoverageBaseB=noCoverageBase*invbase*100; + double noCoverageNB=noCoverageN*invn*100; + + + + double correctSitesB=correctSites*100d/totalSites; + double correctSiteLenB=correctSiteLen*100d/totalSiteLen; + + System.out.println("\nOverall Statistics"); + + if(bs!=null){ + System.out.println("Reads Represented: \t"+bs.cardinality()); + } + System.out.println(String.format("Total Correct Sites: \t"+(correctSitesB<10?" ":"")+"%.3f%% ", correctSitesB)+" \t"+correctSites); + System.out.println(String.format("Total Correct Site Length:\t"+(correctSiteLenB<10?" ":"")+"%.3f%% ", correctSiteLenB)+" \t"+correctSiteLen); + + System.out.println("\nCoverage Statistics"); + + System.out.println(String.format("Avg Coverage: \t"+(totalCoverageB<10?" ":"")+"%.3f", totalCoverageB)+" \t"+totalCoverage); + + System.out.println(String.format("Avg Coverage Base: \t"+(totalCoverageBaseB<10?" ":"")+"%.3f", totalCoverageBaseB)+" \t"+totalCoverageBase); + + System.out.println(String.format("Avg Coverage N: \t"+(totalCoverageNB<10?" ":"")+"%.3f", totalCoverageNB)+" \t"+totalCoverageN); + + System.out.println(String.format("Correct Coverage: \t"+(correctCoverageB<10?" ":"")+"%.3f", correctCoverageB)+" \t"+correctCoverage); + + System.out.println(String.format("Correct Coverage Base: \t"+(correctCoverageBaseB<10?" 
":"")+"%.3f", correctCoverageBaseB)+" \t"+correctCoverageBase); + + System.out.println(String.format("Correct Coverage N: \t"+(correctCoverageNB<10?" ":"")+"%.3f", correctCoverageNB)+" \t"+correctCoverageN); + + System.out.println("\nStatistics over Defined Bases"); + + System.out.println(String.format("onlyCorrect: \t"+(onlyCorrectBaseB<10?" ":"")+"%.3f", onlyCorrectBaseB)+"%"); + System.out.println(String.format("mostlyCorrect: \t"+(mostlyCorrectBaseB<10?" ":"")+"%.3f", mostlyCorrectBaseB)+"%"); + System.out.println(String.format("anyCorrect: \t"+(anyCorrectBaseB<10?" ":"")+"%.3f", anyCorrectBaseB)+"%"); + System.out.println(String.format("noCorrect: \t"+(noCorrectBaseB<10?" ":"")+"%.3f", noCorrectBaseB)+"%"); + System.out.println(String.format("mostlyIncorrect: \t"+(mostlyIncorrectBaseB<10?" ":"")+"%.3f", mostlyIncorrectBaseB)+"%"); + System.out.println(String.format("onlyIncorrect: \t"+(onlyIncorrectBaseB<10?" ":"")+"%.3f", onlyIncorrectBaseB)+"%"); + System.out.println(String.format("noCoverage: \t"+(noCoverageBaseB<10?" ":"")+"%.3f", noCoverageBaseB)+"%"); + + System.out.println("\nStatistics over N (for covered locations)"); + + System.out.println(String.format("onlyCorrect: \t"+(onlyCorrectNB<10?" ":"")+"%.3f", onlyCorrectNB)+"%"); + System.out.println(String.format("mostlyCorrect: \t"+(mostlyCorrectNB<10?" ":"")+"%.3f", mostlyCorrectNB)+"%"); + System.out.println(String.format("anyCorrect: \t"+(anyCorrectNB<10?" ":"")+"%.3f", anyCorrectNB)+"%"); + System.out.println(String.format("mostlyIncorrect: \t"+(mostlyIncorrectNB<10?" ":"")+"%.3f", mostlyIncorrectNB)+"%"); + System.out.println(String.format("onlyIncorrect: \t"+(onlyIncorrectNB<10?" ":"")+"%.3f", onlyIncorrectNB)+"%"); + System.out.println(String.format("noCoverage (over all N): \t"+(noCoverageNB<10?" 
":"")+"%.3f", noCoverageNB)+"%"); + + + } + + + public static void process(final String fname, final int genome, final int mincoverage){ + Data.setGenome(genome); + + BitSet bs=new BitSet(); + + byte[][] coverage=new byte[Data.numChroms+1][]; + byte[][] correct=new byte[Data.numChroms+1][]; + + for(int chrom=1; chrom<=Data.numChroms; chrom++){ + coverage[chrom]=new byte[Data.chromLengths[chrom]]; + correct[chrom]=new byte[Data.chromLengths[chrom]]; + } + + TextFile tf=new TextFile(fname, true, false); + String s=tf.nextLine(); + + long totalSites=0; + long correctSites=0; + long totalSiteLen=0; + long correctSiteLen=0; + + while(s!=null){ + SiteScoreR[] sites=toSites(s); + for(SiteScoreR ssr : sites){ + + if(bs!=null){ + bs.set((int)ssr.numericID); + } + + int len=ssr.stop-ssr.start+1; + totalSites++; + totalSiteLen+=len; + if(ssr.correct){ + correctSites++; + correctSiteLen+=len; + } + + + int chrom=ssr.chrom; + int min=Tools.max(ssr.start, 0); + int max=Tools.min(ssr.stop, Data.chromLengths[chrom]-1); + byte[] array=coverage[chrom]; + for(int i=min; i<=max; i++){ + if(array[i]=mincoverage){ + totalCoverageN+=total; + correctCoverageN+=good; + nCountCovered++; + if(total==good){ + onlyCorrectN++; + mostlyCorrectN++; + }else if(good>bad){ + mostlyCorrectN++; + }else if(good==0){ + onlyIncorrectN++; + mostlyIncorrectN++; + }else if(bad>good){ + mostlyIncorrectN++; + } + if(good>0){anyCorrectN++;} + }else{ + noCoverageN++; + } + }else{ + baseCount++; + if(total>=mincoverage){ + totalCoverageBase+=total; + correctCoverageBase+=good; + if(total==good){ + onlyCorrectBase++; + mostlyCorrectBase++; + }else if(good>bad){ + mostlyCorrectBase++; + }else if(good==0){ + onlyIncorrectBase++; + mostlyIncorrectBase++; + noCorrectBase++; + }else if(bad>good){ + mostlyIncorrectBase++; + } + if(good>0){anyCorrectBase++;} + }else{ + noCoverageBase++; + noCorrectBase++; + } + } + } + Data.unload(chrom, true); + coverage[chrom]=null; + correct[chrom]=null; + } + + long 
length=nCount+baseCount; + double invlen=1.0/length; + double invbase=1.0/baseCount; + double invn=1.0/nCount; + double invnc=1.0/nCountCovered; //covered N's + + double totalCoverageB=totalCoverage*invlen; + double totalCoverageBaseB=totalCoverageBase*invbase; + double totalCoverageNB=totalCoverageN*invn; + double correctCoverageB=correctCoverage*invlen; + double correctCoverageBaseB=correctCoverageBase*invbase; + double correctCoverageNB=correctCoverageN*invn; + + double onlyCorrectBaseB=onlyCorrectBase*invbase*100; + double onlyIncorrectBaseB=onlyIncorrectBase*invbase*100; + double onlyCorrectNB=onlyCorrectN*invnc*100; + double onlyIncorrectNB=onlyIncorrectN*invnc*100; + double mostlyCorrectBaseB=mostlyCorrectBase*invbase*100; + double mostlyIncorrectBaseB=mostlyIncorrectBase*invbase*100; + double mostlyCorrectNB=mostlyCorrectN*invnc*100; + double mostlyIncorrectNB=mostlyIncorrectN*invnc*100; + double anyCorrectBaseB=anyCorrectBase*invbase*100; + double anyCorrectNB=anyCorrectN*invnc*100; + double noCorrectBaseB=noCorrectBase*invbase*100; + double noCoverageBaseB=noCoverageBase*invbase*100; + double noCoverageNB=noCoverageN*invn*100; + + + + double correctSitesB=correctSites*100d/totalSites; + double correctSiteLenB=correctSiteLen*100d/totalSiteLen; + + System.out.println("\nOverall Statistics"); + + if(bs!=null){ + System.out.println("Reads Represented: \t"+bs.cardinality()); + } + System.out.println(String.format("Total Correct Sites: \t"+(correctSitesB<10?" ":"")+"%.3f%% ", correctSitesB)+" \t"+correctSites); + System.out.println(String.format("Total Correct Site Length:\t"+(correctSiteLenB<10?" ":"")+"%.3f%% ", correctSiteLenB)+" \t"+correctSiteLen); + + System.out.println("\nCoverage Statistics"); + + System.out.println(String.format("Avg Coverage: \t"+(totalCoverageB<10?" ":"")+"%.3f", totalCoverageB)+" \t"+totalCoverage); + + System.out.println(String.format("Avg Coverage Base: \t"+(totalCoverageBaseB<10?" 
":"")+"%.3f", totalCoverageBaseB)+" \t"+totalCoverageBase); + + System.out.println(String.format("Avg Coverage N: \t"+(totalCoverageNB<10?" ":"")+"%.3f", totalCoverageNB)+" \t"+totalCoverageN); + + System.out.println(String.format("Correct Coverage: \t"+(correctCoverageB<10?" ":"")+"%.3f", correctCoverageB)+" \t"+correctCoverage); + + System.out.println(String.format("Correct Coverage Base: \t"+(correctCoverageBaseB<10?" ":"")+"%.3f", correctCoverageBaseB)+" \t"+correctCoverageBase); + + System.out.println(String.format("Correct Coverage N: \t"+(correctCoverageNB<10?" ":"")+"%.3f", correctCoverageNB)+" \t"+correctCoverageN); + + System.out.println("\nStatistics over Defined Bases"); + + System.out.println(String.format("onlyCorrect: \t"+(onlyCorrectBaseB<10?" ":"")+"%.3f", onlyCorrectBaseB)+"%"); + System.out.println(String.format("mostlyCorrect: \t"+(mostlyCorrectBaseB<10?" ":"")+"%.3f", mostlyCorrectBaseB)+"%"); + System.out.println(String.format("anyCorrect: \t"+(anyCorrectBaseB<10?" ":"")+"%.3f", anyCorrectBaseB)+"%"); + System.out.println(String.format("noCorrect: \t"+(noCorrectBaseB<10?" ":"")+"%.3f", noCorrectBaseB)+"%"); + System.out.println(String.format("mostlyIncorrect: \t"+(mostlyIncorrectBaseB<10?" ":"")+"%.3f", mostlyIncorrectBaseB)+"%"); + System.out.println(String.format("onlyIncorrect: \t"+(onlyIncorrectBaseB<10?" ":"")+"%.3f", onlyIncorrectBaseB)+"%"); + System.out.println(String.format("noCoverage: \t"+(noCoverageBaseB<10?" ":"")+"%.3f", noCoverageBaseB)+"%"); + + System.out.println("\nStatistics over N (for covered locations)"); + + System.out.println(String.format("onlyCorrect: \t"+(onlyCorrectNB<10?" ":"")+"%.3f", onlyCorrectNB)+"%"); + System.out.println(String.format("mostlyCorrect: \t"+(mostlyCorrectNB<10?" ":"")+"%.3f", mostlyCorrectNB)+"%"); + System.out.println(String.format("anyCorrect: \t"+(anyCorrectNB<10?" ":"")+"%.3f", anyCorrectNB)+"%"); + System.out.println(String.format("mostlyIncorrect: \t"+(mostlyIncorrectNB<10?" 
":"")+"%.3f", mostlyIncorrectNB)+"%"); + System.out.println(String.format("onlyIncorrect: \t"+(onlyIncorrectNB<10?" ":"")+"%.3f", onlyIncorrectNB)+"%"); + System.out.println(String.format("noCoverage (over all N): \t"+(noCoverageNB<10?" ":"")+"%.3f", noCoverageNB)+"%"); + + + } + + + + public static SiteScoreR[] toSites(String s){ + String[] split=s.split("\t"); + SiteScoreR[] scores=new SiteScoreR[split.length]; + for(int i=0; i3){ + mincontig=Integer.parseInt(args[3]); + maxcontig=Integer.parseInt(args[4]); + buffer=Integer.parseInt(args[5]); + System.out.println("Multichrom will be overlayed with blocks of "+buffer+" 'N'"); + } + + +// String pattern=ROOT_GENOME+GENOME_BUILD+"/chr"+chrom+".chromC"; + + File f=new File(Data.ROOT_GENOME+build); + if(!f.exists()){f.mkdirs();} + + for(int i=1; i<=copies; i++){ + ChromosomeArray chb=makeSynthetic(cha, i); + if(buffer>0){ + addN(chb, mincontig, maxcontig, buffer); + } + if(Data.CHROMC){ + ChromosomeArrayCompressed cac=new ChromosomeArrayCompressed(chb); + ReadWrite.write(cac, Data.ROOT_GENOME+build+"/chr"+i+Data.chromExtension(), false); + }else{ + ReadWrite.write(chb, Data.ROOT_GENOME+build+"/chr"+i+Data.chromExtension(), false); + } + } + FastaToChromArrays.writeInfo(build, copies, Data.name, "multiple_"+Data.GENOME_BUILD, false, false); + + } + + private static void addN(ChromosomeArray cha, int minContig, int maxContig, int buffer){ + + final int spread=maxContig-minContig+1; + final Random randy=new Random(cha.chromosome); + final int lim=cha.maxIndex-Tools.max(maxContig, minContig+buffer); + + int contig=0; + int nextContig=minContig+randy.nextInt(spread); + + for(int i=0; i=nextContig){ + contig=0; + int lim2=i+buffer; + while(i=ERROR_PERCENT){ //No error + chb.set(b, c); + a++; + b++; + }else if(x>=INDEL_PERCENT){//sub + byte e=c; + while(e==c){ + e=AminoAcid.numberToBase[randy.nextInt(4)]; + } + chb.set(b, e); + a++; + b++; + }else{//indel + boolean ins=randy.nextBoolean(); + int 
len=Tools.min(randy.nextInt(ERROR_LENGTH), randy.nextInt(ERROR_LENGTH), randy.nextInt(ERROR_LENGTH+1))+1; + if(ins && dif+len>MAX_DIF){ + ins=false; + }else if(!ins && dif-len0); + + if(noderam<1){ + if(threads<9){noderam=144;} + else if(threads<25){noderam=252;}//Changed due to crash at 217 GB on 24-core nodes. + else if(threads<33){noderam=512;} + else if(threads<41){noderam=1024;} + else{noderam=2048;} + System.out.println("Set noderam at "+noderam+"g"); + } + + String slotram; + if(noderam%threads==0){slotram=(noderam/threads)+"G";} + else{slotram=((noderam*990)/threads)+"M";} + + if(noderam>0){ + if(maxram<1){ + maxram=(int)(noderam*(noderam>256 ? 0.83 : 0.85f)); + System.out.println("Set maxram at "+maxram+"g"); + } + } + + if(ram>maxram){ + ram=maxram; + System.out.println("Set ram at "+maxram+"g"); + } + + if("auto".equalsIgnoreCase(targetsize) || (targetsize==null && ref!=null)){ + if(ref==null){throw new RuntimeException("Ref file must be specified for auto targetsize.");} + File f=new File(ref); + if(!f.exists()){throw new RuntimeException("Ref file must exist for auto targetsize.");} + if(f.exists()){ + targetsize=""+new File(ref).length(); + if(ref.endsWith(".gz") || ref.endsWith(".gzip") || ref.endsWith(".zip") || ref.endsWith(".bz2")){ + TextFile tf=new TextFile(ref, false, false); + long x=1; + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){x+=s.length();} + tf.close(); + targetsize=""+x; + } + } + } + + if(ref!=null && refbuild<1){ + if(build==1){refbuild=2;} + else{refbuild=1;} + } + + if(dirty==null){throw new RuntimeException("No dirty file specified.");} + if(clean==null){throw new RuntimeException("No clean file specified.");} + if(targetsize==null){throw new RuntimeException("No targetsize specified.");} + if(template==null){throw new RuntimeException("No template file specified.");} + if(!new File(template).exists()){throw new RuntimeException("Template file "+template+" does not exist; please specify a different template.");} + 
if(build==refbuild){throw new RuntimeException("Build id and ref build id must differ.");} + if(build<1){throw new RuntimeException("No build id.");} + if(ref!=null && refbuild<1){throw new RuntimeException("No ref build id.");} + if(ref==null && refbuild>0 && !(new File(Data.chromFname(1, refbuild))).exists()){throw new RuntimeException("Ref build id specified, but no reference file.");} + + String[] lines; + { + TextFile tf=new TextFile(template, false, false); + lines=tf.toStringLines(); + } + + + StringBuilder sb=new StringBuilder(); + for(int i=0; i0 && s.startsWith("#")){s=s.substring(1);} + } + + if(!s.startsWith("#")){ + if((eccline && !ecc) || (sortline && !sort) || (refline && refbuild<1)){s="#"+s;} + } + + + if(optional){ + optional=true; + s=s.substring(2); + } + + if((s.contains("@MAXRAM") && maxram>31) || (s.contains("@RAM") && ram>31)){ + s=s.replace("-XX:+UseCompressedOops ", ""); + } + + s=s.replace("@CLEAN_ECC_1", cleanecc); + s=s.replace("@CLEAN_BAD_ECC_1", cleanbadecc); + s=s.replace("@CLEAN_ALL_ECC_1", cleanallecc); + s=s.replace("@SORT_IN", sort_in); + s=s.replace("@SORTED_OUT", sorted_out); + s=s.replace("@SORTED", sorted); + + s=s.replace("@SLOTRAM", slotram); + s=s.replace("@BUILDNUM", ""+build); + s=s.replace("@DIRTY_INPUT", dirty); + s=s.replace("@CLEAN_INPUT_1", clean); + s=s.replace("@ORGANISM", name); + s=s.replace("@NUMSLOTS", ""+threads); + s=s.replace("@TARGET_SIZE", targetsize); + s=s.replace("@RAM", "-Xmx"+ram+"g"); + s=s.replace("@MAXRAM", "-Xmx"+maxram+"g"); + s=s.replace("@MAXREADS", ""+maxReads); + s=s.replace("@SCRIPT", (output==null ? 
"run.sh" : output)); + s=s.replace("@EXTRA", extra); + s=s.replace("@RUNTIME", ""+runtime); + s=s.replace("@CLASSPATH", classpath); + + if(s.contains("@REFBUILD")){ + if(refbuild<1){ + s="#"+s; + }else{ + s=s.replace("@REFBUILD", ""+refbuild); + } + } + + + if(s.contains("@REFERENCE")){ + if(ref==null){ + s="#"+s; + }else{ + s=s.replace("@REFERENCE", ref); + } + } + + if(s.contains("@MERGECLEAN")){ + if(mergeclean==null){ + s="#"+s; + }else{ + s=s.replace("@MERGECLEAN", mergeclean); + } + } + + if(s.contains("@MERGEDIRTY")){ + if(mergedirty==null){ + s="#"+s; + }else{ + s=s.replace("@MERGEDIRTY", mergedirty); + } + } + + if(s.contains("@MERGEREF")){ + if(mergeref==null){ + s="#"+s; + }else{ + s=s.replace("@MERGEREF", mergeref); + } + } + + while(s.startsWith("##")){s=s.substring(1);} + + assert(s==null || s.length()<1 || s.startsWith("#") || !s.contains("@")) : s; + + if(s!=null && !s.startsWith("#//")){sb.append(s).append('\n');} + + if(qsub==null && s.contains("export task") && s.contains("qsub")){ + qsub=s; + } + } + + if(output==null){ + System.out.println(sb); + }else{ + ReadWrite.writeString(sb, output, false); + System.out.println("Wrote "+output); + if(qsub!=null){ + while(qsub.startsWith("#")){qsub=qsub.substring(1);} + System.out.println("The script can be executed on Genepool with the following command:\n\n"+qsub.trim()); + } + } + + + } + + +} diff --git a/current/pacbio/MergeFastaContigs.java b/current/pacbio/MergeFastaContigs.java new file mode 100755 index 0000000..3df5283 --- /dev/null +++ b/current/pacbio/MergeFastaContigs.java @@ -0,0 +1,530 @@ +package pacbio; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.Read; + +import align2.ListNum; +import align2.Tools; +import dna.Timer; + +import fileIO.FileFormat; +import fileIO.ReadWrite; +import fileIO.TextFile; +import 
fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Jul 10, 2012 + * + */ +public class MergeFastaContigs { + + + public static void main(String[] args){ + System.out.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName()+" "+Arrays.toString(args))); + + Timer t=new Timer(); + t.start(); + String infile=null; + String outfile=null; + String outindex=null; + int npl=-1; + int npl2=-1; + + for(int i=0; i1 ? split[1] : "true"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("in") && split.length>0){ + infile=b; + }else if(a.equals("out") && split.length>0){ + outfile=b; + }else if(a.equals("index") && split.length>0){ + outindex=b; + }else if(a.equals("npad")){ + npl=N_PAD_LENGTH=Integer.parseInt(b); + }else if(a.equals("npad2")){ + npl2=N_PAD_LENGTH2=Integer.parseInt(b); + }else if(a.equals("maxdataout")){ + maxDataOut=Integer.parseInt(b); + }else if(a.equals("mincontig")){ + MIN_CONTIG_TO_ADD=Integer.parseInt(b); + }else if(a.equals("maxlen")){ + MAX_OUTPUT_LEN=Integer.parseInt(b); + }else if(a.equals("maxchroms")){ + maxChromsOut=Integer.parseInt(b); + }else if(a.equals("maxdata")){ + maxDataOut=Long.parseLong(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + }else if(a.equals("padfront") || a.equals("padstart")){ + PAD_START=Tools.parseBoolean(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + } + } + + if(infile==null){infile=args[0];} + if(outfile==null){outfile=args[1];} + if(outindex==null){outindex=args[2];} + + try { + if(npl<0 && args.length>3){N_PAD_LENGTH=Integer.parseInt(args[3]);} + if(npl2<0 && args.length>4){N_PAD_LENGTH2=Integer.parseInt(args[4]);} + } catch (NumberFormatException e) { + //ignore + } + + 
if(infile.contains(".fq.") || infile.endsWith(".fq") || infile.contains(".fastq.") || infile.endsWith(".fastq")){ + mergeFastq(infile, outfile, outindex); + }else{ + if(new File(infile).exists()){ +// System.err.println("Warning: This will run correctly, but I suggest against putting commas in your filenames."); +// assert false : infile+", "+outfile+", "+outindex; + mergeFasta(new String[] {infile}, outfile, outindex); + }else{ + String[] files=infile.split(","); + for(String s : files){ + if(!new File(s).exists()){throw new RuntimeException("Cannot find file "+s);} + } + mergeFasta(files, outfile, outindex); + } + } + t.stop(); + + System.out.println("MergeFastaContigs output for "+Arrays.toString(args)); + System.out.println("definedBasesIn: \t"+definedBasesIn); + System.out.println("contigsIn: \t"+contigsIn); + System.out.println("definedBasesOut: \t"+definedBasesOut); + System.out.println("basesOut: \t"+dataOut); + System.out.println("contigsOut: \t"+contigsOut); + System.out.println("chromsOut: \t"+chromsOut); + + System.out.println("Time:\t"+t); + + } + + + + /** + * @param infile + * @param outfile + * @param outindex + */ + public static void merge(String infile, String outfile, String outindex) { + StringBuilder temp=new StringBuilder(MIN_CONTIG_TO_ADD); + TextFile tf=new TextFile(infile, false, false); + +// OutputStream cos=ReadWrite.getOutputStream(outfile, false); +// PrintWriter cpw=new PrintWriter(cos); + + long loc=N_PAD_LENGTH; + int chrom=1; + System.out.println(">chr"+chrom); + npad=npad(N_PAD_LENGTH); + printAsLines(npad, 0); + + String s=null; + String label=null; + for(s=tf.nextLine(); chrom'){ + + if(s!=null){contigsIn++;} + + //evict current contig + if(temp.length()>=MIN_CONTIG_TO_ADD){ + + long newloc=loc+temp.length()+N_PAD_LENGTH; + if(newloc>=MAX_OUTPUT_LEN){ + //Evict old chrom + + //Make new chrom + chrom++; + loc=N_PAD_LENGTH; + newloc=loc+temp.length()+N_PAD_LENGTH; + System.out.println("\n>chr"+chrom); + printAsLines(npad, 0); + } 
+ + printAsLines(temp, (int)(loc%lineBreak)); + + definedBasesOut+=temp.length(); + contigsOut++; + + printAsLines(npad, (int)((loc+temp.length())%lineBreak)); + System.err.println(chrom+"\t"+loc+"\t"+label); + loc=newloc; + }else{ +// System.err.println("Ignored "+temp); + } + + if(s==null){break;} + temp.setLength(0); + label=s.substring(1); + }else{ + //append line to current contig + temp.append(s); + definedBasesIn+=s.length(); + } + } + tf.close(); + + chromsOut=chrom; + System.out.println(); + } + + + + /** + * @param infile + * @param outfile + * @param outindex + */ + public static void mergeFasta(String infiles[], String outfile, String outindex) { + + if(new File(outfile).exists()){ + for(String s : infiles){assert(!s.equalsIgnoreCase(outfile));} + } + + //if(verbose){System.err.println("A");} + + StringBuilder temp=new StringBuilder(MIN_CONTIG_TO_ADD); + TextFile tf; + + TextStreamWriter cout=new TextStreamWriter(outfile, overwrite, false, false); + TextStreamWriter iout=new TextStreamWriter(outindex, overwrite, false, false); + + cout.start(); + iout.start(); + //if(verbose){System.err.println("B");} + + long loc=(PAD_START ? 
N_PAD_LENGTH2 : 0); + int chrom=1; + cout.print(">chr"+chrom+"\n"); + npad=npad(N_PAD_LENGTH); + npad2=npad2(N_PAD_LENGTH2); + assert(npad.length()<=npad2.length()); + if(PAD_START){printAsLines(npad2, 0, cout);} + boolean np2=true; +// cout.poison(); +// assert(false) : "\n"+npad+"\n\n\n"+npad2+"\n"; + //if(verbose){System.err.println("C");} + +// assert(false) : PAD_START+", "+np2; + + for(String fname : infiles){ + tf=new TextFile(fname, false, false); + String s=null; + String label=null; + if(verbose){System.err.println("Processing file "+fname);} + for(s=tf.nextLine(); chrom'){ + if(verbose){System.err.println("Contig break");} +// System.err.println("chrom="+chrom+", maxChromsOut="+maxChromsOut); + + if(s!=null){contigsIn++;} + if(verbose){System.err.println("Contigs="+contigsIn);} + + //evict current contig + if(temp.length()>=MIN_CONTIG_TO_ADD){ + if(verbose){System.err.println("Big enough to add");} + + long newloc=loc+temp.length()+N_PAD_LENGTH; + if(newloc>=MAX_OUTPUT_LEN){ + if(verbose){System.err.println("newloc>=MAX_OUTPUT_LEN");} + //Evict old chrom + printAsLines(npad2, (int)(loc%lineBreak), cout); + + //Make new chrom + chrom++; + loc=N_PAD_LENGTH2; + newloc=loc+temp.length()+N_PAD_LENGTH; + cout.print("\n>chr"+chrom+"\n"); + if(PAD_START){printAsLines(npad2, 0, cout);} + np2=true; + } + if(verbose){System.err.println("G");} + + printAsLines(temp, (int)(loc%lineBreak), cout); + + definedBasesOut+=temp.length(); + contigsOut++; + + if(np2){ + if(verbose){System.err.println("np2");} + if(PAD_START){ + if(verbose){System.err.println("PAD_START");} + loc=N_PAD_LENGTH2; + newloc=N_PAD_LENGTH2+temp.length(); + }else{ + if(verbose){System.err.println("~PAD_START");} + loc=0; + newloc=temp.length(); + } + }else{ + if(verbose){System.err.println("PAD_START");} + printAsLines(npad, (int)((loc+temp.length())%lineBreak), cout); + } + if(verbose){System.err.println("H");} + if(label!=null){iout.print(chrom+"\t"+loc+"\t"+label+"\n");} + loc=newloc; + np2=false; 
+ }else{ + // System.err.println("Ignored "+temp); + } + if(verbose){System.err.println("Done with contig");} + + temp.setLength(0); + if(s==null){break;} + label=s.substring(1); + }else{ + np2=false; + //if(verbose){System.err.print("J");} + //append line to current contig + temp.append(s); + definedBasesIn+=s.length(); + if(verbose){System.err.println("Normal line. definedBasesIn="+definedBasesIn);} + } + //if(verbose){System.err.print("K");} + } + tf.close(); + //if(verbose){System.err.print("L");} + } + //if(verbose){System.err.println("M");} + + chromsOut=chrom; + + assert(temp.length()==0) : temp.length(); + printAsLines(npad2, (int)(loc%lineBreak), cout); + //if(verbose){System.err.println("N");} + + + cout.print("\n"); + cout.poisonAndWait(); + iout.poisonAndWait(); + //if(verbose){System.err.println("O");} + } + + + + /** + * @param in1 + * @param outfile + * @param outindex + */ + public static void mergeFastq(String in1, String outfile, String outindex) { + StringBuilder temp=new StringBuilder(MIN_CONTIG_TO_ADD); + + FASTQ.TEST_INTERLEAVED=false; + FASTQ.PARSE_CUSTOM=false; + FASTQ.DETECT_QUALITY=false; + long maxReads=-1; + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, null); +// if(verbose){System.err.println("Started cris");} + Thread th=new Thread(cris); + th.start(); + } + + + TextStreamWriter cout=new TextStreamWriter(outfile, overwrite, false, false); + TextStreamWriter iout=new TextStreamWriter(outindex, overwrite, false, false); + + cout.start(); + iout.start(); + + long loc=N_PAD_LENGTH2; + int chrom=1; + cout.print(">chr"+chrom+"\n"); + npad=npad(N_PAD_LENGTH); + npad2=npad2(N_PAD_LENGTH2); + assert(npad.length()<=npad2.length()); + printAsLines(npad2, 0, cout); + + + String s=null; + String label=null; + + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + + + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + + s=new String(r.bases); + label=r.id; + + temp.append(s); + + if(temp.length()>=MIN_CONTIG_TO_ADD){ + + long newloc=loc+temp.length()+N_PAD_LENGTH; + if(newloc>=MAX_OUTPUT_LEN){ + //Evict old chrom + printAsLines(npad2, (int)(loc%lineBreak), cout); + + //Make new chrom + chrom++; + loc=N_PAD_LENGTH2; + newloc=loc+temp.length()+N_PAD_LENGTH; + cout.print("\n>chr"+chrom+"\n"); + printAsLines(npad2, 0, cout); + } + + printAsLines(temp, (int)(loc%lineBreak), cout); + printAsLines(npad, (int)((loc+temp.length())%lineBreak), cout); + iout.println(chrom+"\t"+loc+"\t"+label); + loc=newloc; + }else{ + // System.err.println("Ignored "+temp); + } + + temp.setLength(0); + if(s==null){break;} + label=s.substring(1); + + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + + + cris.returnList(ln, ln.list.isEmpty()); + + assert(temp.length()==0) : temp.length(); + printAsLines(npad2, (int)(loc%lineBreak), cout); + + + ReadWrite.closeStream(cris); + + cout.print("\n"); + cout.poison(); + iout.poison(); + } + + private static void printAsLines(CharSequence sb, int mod){ + dataOut+=sb.length(); + assert(mod0){ + CharSequence s=sb.subSequence(0, min(lineBreak-mod, sb.length())); + if(s.length()+mod==lineBreak){ + System.out.println(s); + }else{ + System.out.print(s); + } + } + + int loc=lineBreak-mod; + for(; loc0){ + + CharSequence s=sb.subSequence(0, min(lineBreak-mod, sb.length())); + +// System.out.println(mod+", "+s.length()+", "+(s.length()+mod)+", "+lineBreak); + + if(s.length()+mod==lineBreak){ + cout.println(s); + }else{ + cout.print(s); + } + } + + int loc=(mod==0 ? 0 : lineBreak-mod); + for(; loc1 ? 
split[1] : "true"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null")){ + // do nothing + }else if(a.equals("in")){ + if("null".equalsIgnoreCase(b)){ + //do nothing + }else{ + in=b.split(","); + } + }else if(a.equals("out")){ + out=b; + }else if(a.equals("build") || a.equals("genome")){ + genome=Integer.parseInt(b); + }else if(a.startsWith("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + overwrite=Tools.parseBoolean(b); + System.out.println("Set OVERWRITE to "+overwrite); + }else if(a.endsWith("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + System.out.println("Set FASTQ.PARSE_CUSTOM to "+FASTQ.PARSE_CUSTOM); + }else if(a.equals("reads")){ + reads=Long.parseLong(b); + }else if(a.equals("readlen") || a.equals("length") || a.equals("len")){ + readlen=Integer.parseInt(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("sequentialoverlap")){ + sequentialOverlap=Integer.parseInt(b); + }else if(a.equals("sequentialstrandalt")){ + sequentialStrandAlt=Tools.parseBoolean(b); + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else{ + System.err.println("Unknown parameter "+split[i]); + assert(false); + } + } + + assert(FastaReadInputStream.settingsOK()); + if(in!=null){ + File a=new File(out); + for(String s : in){ + File b=new File(s); + if(a.equals(b)){throw new RuntimeException("Input file may not equal output file: "+a.toString());} + } + } + assert(out!=null); + + TextStreamWriter tsw=new TextStreamWriter(out, overwrite, false, false); + tsw.start(); + + long id=0; + + 
if(genome>=0){ + Data.setGenome(genome); + SequentialReadInputStream.UNLOAD=true; +// SequentialReadInputStream.verbose=true; + SequentialReadInputStream ris=new SequentialReadInputStream(reads, readlen, Tools.max(50, readlen/2), sequentialOverlap, sequentialStrandAlt); + ConcurrentReadInputStream cris=new ConcurrentReadInputStream(ris, reads); + new Thread(cris).start(); + id=appendReads(cris, tsw, id); + ReadWrite.closeStream(cris); + } + + if(in!=null){ + for(String s : in){ + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(s, FileFormat.FASTQ, null, true, false); + cris=ConcurrentGenericReadInputStream.getReadInputStream(-1, false, true, ff1, null); + if(verbose){System.err.println("Started cris");} + Thread th=new Thread(cris); + th.start(); + } + id=appendReads(cris, tsw, id); + ReadWrite.closeStream(cris); + } + } + + tsw.poison(); + tsw.waitForFinish(); + } + + public static long appendReads(ConcurrentReadStreamInterface cris, TextStreamWriter tsw, long id){ + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + while(reads!=null && reads.size()>0){ + + for(Read r : reads){ + Read b=r.mate; + Read a=correctRead(r, id); + if(a!=null){ + tsw.println(a); + id++; + } + b=correctRead(b, id); + if(b!=null){ + tsw.println(b); + id++; + } + } + + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + if(verbose){System.err.println("Finished reading");} + cris.returnList(ln, ln.list.isEmpty()); + if(verbose){System.err.println("Returned list");} + return id; + } + + public static Read correctRead(Read r, long id){ + if(r==null){return null;} + r.numericID=id; + r.id=""+id; + if(r.chrom<1){return r;} + + int startN=0; + int stopN=r.bases.length-1; + while(startN0 && r.bases[stopN]=='N'){stopN--;} + if(startN>0 || stopN4){maxDataOut=Long.parseLong(args[4]);} + + if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;} + + TextFile tf=new TextFile(infile, false, false); + + split(tf, outfile, partition); + t.stop(); + System.out.println("Time:\t"+t); + + } + + + + /** + * @param infile + * @param outfile + * @param outindex + */ + public static void split(TextFile tf, String outfile, long partition) { + long currentBases=0; + int pnum=1; + + TextStreamWriter tsw=new TextStreamWriter(outfile.replace("#", ""+pnum), true, false, false); + tsw.start(); + + String s; + for(s=tf.nextLine(); s!=null && dataOut'){ + if(currentBases>=partition){ + System.out.println("Ended partition "+pnum+" at "+currentBases); + currentBases=0; + pnum++; + tsw.poison(); + tsw=new TextStreamWriter(outfile.replace("#", ""+pnum), true, false, false); + tsw.start(); + } + }else{ + int x=s.length(); + currentBases+=x; + dataOut+=x; + } + tsw.println(s); + } + System.out.println("Ended partition "+pnum+" at "+currentBases); + System.out.println("Total: "+dataOut); + System.out.println("Avg: "+(dataOut)/pnum); +// System.out.println("\n"+s+"\n"+dataOut+"\n"+maxDataOut); + + try { + synchronized(tsw){ + tsw.wait(100); + } + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + tsw.poison(); + } + + public static int MIN_CONTIG_TO_ADD=150; //Not currently used + public static long MAX_OUTPUT_LEN=200000000000L; + public static long maxDataOut=Long.MAX_VALUE; + private static long dataOut=0; + +} diff --git 
a/current/pacbio/PartitionReads.java b/current/pacbio/PartitionReads.java new file mode 100755 index 0000000..0d4ed00 --- /dev/null +++ b/current/pacbio/PartitionReads.java @@ -0,0 +1,226 @@ +package pacbio; + +import java.util.ArrayList; +import java.util.Arrays; + +import stream.ConcurrentGenericReadInputStream; +import stream.ConcurrentReadStreamInterface; +import stream.FASTQ; +import stream.FastaReadInputStream; +import stream.Read; + + +import align2.ListNum; +import align2.Tools; +import dna.Data; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.FileFormat; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Nov 15, 2012 + * + */ +public class PartitionReads { + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + + FastaReadInputStream.SPLIT_READS=false; + + Timer t=new Timer(); + t.start(); + + boolean verbose=false; + int ziplevel=-1; + + String in1=null; + String in2=null; + long maxReads=-1; + + String outname1=null; + String outname2=null; + FASTQ.PARSE_CUSTOM=false; + + for(int i=0; i1 ? 
split[1] : "true"; + if("null".equalsIgnoreCase(b)){b=null;} +// System.err.println("Processing "+args[i]); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("path") || a.equals("root") || a.equals("tempdir")){ + Data.setPath(b); + }else if(a.equals("fasta") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + if(b.indexOf('#')>-1){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.startsWith("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.endsWith("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + System.out.println("Set FASTQ.PARSE_CUSTOM to "+FASTQ.PARSE_CUSTOM); + }else if(a.startsWith("partition")){ + partitions=Integer.parseInt(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=Tools.parseBoolean(b); + System.out.println("Set OVERWRITE to "+OVERWRITE); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("out") || a.equals("out1")){ + if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none") || split.length==1){ + System.out.println("No output file."); + outname1=null; + }else{ + outname1=b; + assert(!outname1.equalsIgnoreCase(outname2)); + } + }else if(a.equals("out2")){ + if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none") || split.length==1){ + outname2=null; + }else{ + outname2=b; + 
assert(!outname2.equalsIgnoreCase(outname1)); + } + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte ascii_offset=Byte.parseByte(b); + FASTQ.ASCII_OFFSET=ascii_offset; + System.out.println("Set fastq input ASCII offset to "+FASTQ.ASCII_OFFSET); + FASTQ.DETECT_QUALITY=false; + }else if(a.startsWith("verbose")){ + verbose=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter: "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); + assert(outname1==null || outname1.indexOf('#')>=0 || partitions<2); + assert(outname2==null || outname2.indexOf('#')>=0 || partitions<2); + assert(outname1==null || !outname1.equalsIgnoreCase(outname2)); + + if(in1==null){throw new RuntimeException("Please specify input file.");} + + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); + if(verbose){System.err.println("Started cris");} +// Thread th=new Thread(cris); +// th.start(); + } + + + TextStreamWriter[] tsw1=new TextStreamWriter[partitions]; + TextStreamWriter[] tsw2=new TextStreamWriter[partitions]; + + FileFormat ff=FileFormat.testOutput(outname1, FileFormat.FASTQ, null, true, OVERWRITE, false); + fastq=ff.fastq(); + fasta=ff.fasta(); + bread=ff.bread(); + + for(int i=0; i ln=cris.nextList(); + ArrayList readlist=ln.list; + + final boolean paired=cris.paired(); + + long x=0; + final int div=tsw1.length; + while(!readlist.isEmpty()){ + + //System.err.println("Got a list of size "+readlist.size()); + for(int i=0; i1 ? 
split[1] : null); + + if(a.equals("scorethresh")){ + SCORE_THRESH=Float.parseFloat(b); + }else if(a.equals("interval")){ + INTERVAL=Integer.parseInt(b); + }else if(a.equals("minsitestodiscard")){ + MIN_SITES_TO_DISCARD=Integer.parseInt(b); + }else if(a.equals("minlength")){ + MIN_LENGTH_TO_RETAIN=Integer.parseInt(b); + }else if(a.equals("retainall")){ + RETAIN_ALL=Tools.parseBoolean(b); + if(RETAIN_ALL){MIN_VOTES_TO_RETAIN=0;} + }else if(a.equals("fractiontoretain1")){ + FRACTION_TO_RETAIN1=Float.parseFloat(b); + }else if(a.equals("fractiontoretain2")){ + FRACTION_TO_RETAIN2=Float.parseFloat(b); + }else if(a.equals("centerweight")){ + CENTER_WEIGHT=Float.parseFloat(b); + }else if(a.equals("sitestoretain1")){ + SITES_TO_RETAIN1=Integer.parseInt(b); + }else if(a.equals("sitestoretain2")){ + SITES_TO_RETAIN2=Integer.parseInt(b); + }else if(a.equals("minvotestoretain")){ + MIN_VOTES_TO_RETAIN=Integer.parseInt(b); + }else if(a.equals("mindistfromreadends")){ +// MIN_DIST_FROM_READ_ENDS=Integer.parseInt(b); +// throw new RuntimeException("Deprecated - use minfractionfromreadends instead."); + int x=Integer.parseInt(b); + float f=x/((150-INTERVAL)*.5f); + System.err.println("Warning - mindistfromreadends is deprecated. 
Setting minfractionfromreadends = "+String.format("%.3f",f)); + MIN_FRACTION_FROM_READ_ENDS=f; + }else if(a.equals("minfractionfromreadends")){ + MIN_FRACTION_FROM_READ_ENDS=Float.parseFloat(b); + }else{ + assert(false) : "Unknown parameter "+a; + } + } + + process(infile, outfile); + + System.out.println("Sites In:\t"+sitesIn+" \t"+String.format("%.3f%% correct",correctIn*100d/sitesIn)); + System.out.println("Sites Out:\t"+sitesOut+" \t"+String.format("%.3f%% correct",correctOut*100d/sitesOut)); + t.stop(); + System.out.println("Time: \t"+t); + } + + /** + * @param infile + * @param outfile + */ + public static void process(String infile, String outfile) { + + Buffer buffer=new Buffer(3, infile, outfile); + + int chrom=buffer.chrom; + int start=buffer.min; + int stop=buffer.min+INTERVAL-1; + + assert(buffer.array[0]!=null); + while(buffer.array[0]!=null){ + + processInterval(buffer, chrom, start, stop); + + start+=INTERVAL; + stop+=INTERVAL; + boolean success=buffer.advanceToInterval(start, stop, chrom); + if(!success){ + chrom=buffer.chrom; + start=buffer.min; + stop=start+INTERVAL-1; + } + } + buffer.close(); + } + + private static void processInterval(Buffer buffer, int chrom, int start, int stop){ + + ArrayList plus=new ArrayList(); + ArrayList minus=new ArrayList(); + + for(Ssra ssra : buffer.array){ +// if(Tools.isWithin(start-MIN_DIST_FROM_READ_ENDS, stop+MIN_DIST_FROM_READ_ENDS, ssra.min, ssra.max)){ + if(Tools.isWithin(start, stop, ssra.min, ssra.max)){ + for(SiteScoreR ssr : ssra.array){ + + int x=(int)((((ssr.stop-ssr.start+1)-INTERVAL)/2)*MIN_FRACTION_FROM_READ_ENDS); + if(x<0){x=0;} + + if(ssr.readlen>=MIN_LENGTH_TO_RETAIN){ + if(Tools.isWithin(start, stop, ssr.start+x, ssr.stop-x)){ + ssr.normalizedScore=normalizedScore(ssr, Tools.min(start-ssr.start, ssr.stop-stop)); + if(ssr.strand==Gene.PLUS){ + plus.add(ssr); + }else{ + minus.add(ssr); + } + } + } + + } + } + } + markRetain(plus); + markRetain(minus); + + } + +// private static final int 
markRetain_old(ArrayList list){ +//// Collections.sort(list, SiteScoreR.NCOMP); +// assert(list.size()<2 || list.get(0).normalizedScore>=list.get(1).normalizedScore) : list.get(0)+"\t"+list.get(1); +// +// int sites=list.size()-MIN_SITES_TO_DISCARD; //Always ignore worst site(s). +// +// int retain=(int)(sites*FRACTION_TO_RETAIN1); +// if(retain>SITES_TO_RETAIN1){ +// int temp=(int)((retain-SITES_TO_RETAIN1)*FRACTION_TO_RETAIN2); +//// System.out.println("sites="+sites+", retain="+retain+", temp="+temp); +// retain=SITES_TO_RETAIN1+temp; +// } +// retain=Tools.min(retain, SITES_TO_RETAIN2); +//// System.out.println("retain2="+retain); +// +//// for(int i=0; i0 ? list.get(0) : null); +// for(int i=0; i0){ +//// SiteScoreR a=list.get(i-1); +//// if(a.score-b.score>a.score*0.03f){break;} +// if(best.score-b.score>best.score*0.034f){break;} +// } +// +// if(i==0){ +// b.retainVotes+=5; +// }else if(i<3){ +// b.retainVotes+=3; +// }else if(i<6){ +// b.retainVotes+=2; +// }else{ +// b.retainVotes++; +// } +// } +// +// return retain; +// } + + private static final int markRetain(ArrayList list){ +// Collections.sort(list, SiteScoreR.NCOMP); +// assert(list.size()<2 || list.get(0).normalizedScore>=list.get(1).normalizedScore) : list.get(0)+"\t"+list.get(1); + + int sites=list.size()-MIN_SITES_TO_DISCARD; //Always ignore worst site(s). + + int retain=(int)(sites*FRACTION_TO_RETAIN1); + if(retain>SITES_TO_RETAIN1){ + int temp=(int)((retain-SITES_TO_RETAIN1)*FRACTION_TO_RETAIN2); +// System.out.println("sites="+sites+", retain="+retain+", temp="+temp); + retain=SITES_TO_RETAIN1+temp; + } + retain=Tools.min(retain, SITES_TO_RETAIN2); + + if(RETAIN_ALL){retain=sites;} + +// System.out.println("retain2="+retain); + +// for(int i=0; i0 ? 
list.get(0) : null); + for(int i=0; i0){ +// SiteScoreR a=list.get(i-1); +// if(a.score-b.score>a.score*0.03f){break;} + if(!RETAIN_ALL && best.score-b.score>best.score*SCORE_THRESH){break;} + } + + if(i==0){ + b.retainVotes+=5; + }else if(i<4){ + b.retainVotes+=3; + }else if(i<8){ + b.retainVotes+=2; + }else{ + b.retainVotes++; + } + } + + return retain; + } + + public static Ssra toSrar(String s){ + String[] split=s.split("\t"); + SiteScoreR[] scores=new SiteScoreR[split.length]; + int min=Integer.MAX_VALUE; + int max=Integer.MIN_VALUE; + int worst=Integer.MAX_VALUE; + int best=Integer.MIN_VALUE; + int chrom=-1; + + for(int i=0; ilim1){modifier=lim1;} +// ssr.normalizedScore=(int)ssr.score*(1+modifier); + + + min=Tools.min(min, ssr.start); + max=Tools.max(max, ssr.stop); + worst=Tools.min(worst, ssr.score); + best=Tools.max(best, ssr.score); + assert(chrom==-1 || chrom==ssr.chrom); + chrom=ssr.chrom; + } + Ssra ssra=new Ssra(scores, chrom, min, max, best, worst); + return ssra; + } + + public static float normalizedScore(SiteScoreR ssr, int endDist){ + final float lim1=0.008f; + final float lim2=-lim1; + + + int dif=ssr.readlen-ssr.reflen(); //Positive for insertions, negative for deletions + float modifier=dif/(float)(ssr.readlen*4); //Prioritize reads with insertions over deletions, to correct for scoring bias + if(modifierlim1){modifier=lim1;} + + int maxEndDist=(ssr.reflen()-INTERVAL)/2; +// float modifier2=(0.03f*endDist)/maxEndDist; + float modifier2=CENTER_WEIGHT*endDist/(float)maxEndDist; //Prioritize reads centered on this interval + + float f=ssr.score*(1+modifier+modifier2); + return f; + } + + /** Finds highest score of ssr's fully covering this site */ + public static int maxScore(Ssra ssra, final int min, final int max){ + assert(Tools.overlap(min, max, ssra.min, ssra.max)); + assert(Tools.isWithin(min, max, ssra.min, ssra.max)); + + int best=-1; + for(SiteScoreR ssr : ssra.array){ + if(ssr.start>min){break;} + if(max>=ssr.stop){ + 
best=Tools.max(best, ssr.score); + if(best>=ssra.best){break;} + } + } + return best; + } + + public static class Ssra{ + + public Ssra(){} + + public Ssra(SiteScoreR[] array_, int chrom_, int min_, int max_, int best_, int worst_){ + array=array_; + chrom=chrom_; + min=min_; + max=max_; + best=best_; + worst=worst_; + } + + /** SiteScoreR array sorted by start loc, ascending */ + SiteScoreR[] array; + /** All contents must have same chromosome / contig */ + int chrom; + /** Minimum location in array */ + int min; + /** Maximum location in array */ + int max; + /** Top score in array */ + int best; + /** Bottom score in array */ + int worst; + + } + + public static class Buffer{ + + public Buffer(int size, String infname_, String outfname_){ + assert(!infname_.equalsIgnoreCase(outfname_)) : infname_+" == "+outfname_; //Not a complete test + array=new Ssra[size]; + infname=infname_; + outfname=outfname_; + tf=new TextFile(infname, true, false); + tsw=new TextStreamWriter(outfname, true, false, true); + tsw.start(); + nextSsra=read(); + fill(); + + } + + public Ssra read(){ + String s=tf.nextLine(); + if(s==null){ + tf.close(); + return null; + } + Ssra ssra=toSrar(s); + sitesIn+=ssra.array.length; + return ssra; + } + + private boolean advance(){ + if(nextSsra==null){return false;} + + Ssra old=add(nextSsra); + nextSsra=read(); + if(old!=null){write(old);} + return true; + } + + /** Starting with an empty array, fill with next chrom */ + private boolean fill(){ + assert(array[0]==null); + if(nextSsra==null){return false;} + int c=nextSsra.chrom; + for(int i=0; i=c); +// if(chrom>c || min>b){return false;} //Went past target + + while(array[0].max0){ +// ssr.normalizedScore/=ssr.weight; +// } + + if(ssr.correct){correctIn++;} + if(ssr.retainVotes>=MIN_VOTES_TO_RETAIN){ + sitesOut++; + if(ssr.correct){correctOut++;} + sb.append(tab); + sb.append(ssr.toText()); + tab="\t"; + } + } + if(sitesOut_0==sitesOut){return;} + sb.append('\n'); + tsw.print(sb); + } + + public 
Ssra add(Ssra s){ + + assert(array[0]==null || array[0].chrom==s.chrom); + + Ssra r=null; + if(array[array.length-1]==null){ + //insert in first null loc + for(int i=0; i1 ? split[1] : "true"; + if("null".equalsIgnoreCase(b)){b=null;} +// System.err.println("Processing "+args[i]); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("path") || a.equals("root") || a.equals("tempdir")){ + Data.setPath(b); + }else if(a.equals("fasta") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + if(b.indexOf('#')>-1){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("query") || a.equals("adapter")){ + query=b; + }else if(a.startsWith("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.endsWith("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + System.out.println("Set FASTQ.PARSE_CUSTOM to "+FASTQ.PARSE_CUSTOM); + }else if(a.equals("split")){ + splitReads=Tools.parseBoolean(b); + }else if(a.equals("plusonly")){ + boolean x=Tools.parseBoolean(b); + if(x){TRY_PLUS=true; TRY_MINUS=false;} + }else if(a.equals("minusonly")){ + boolean x=Tools.parseBoolean(b); + if(x){TRY_PLUS=false; TRY_MINUS=true;} + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.startsWith("mincontig")){ + minContig=Integer.parseInt(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=Tools.parseBoolean(b); + System.out.println("Set OVERWRITE to "+OVERWRITE); + }else 
if(a.equals("threads") || a.equals("t")){ + if(b.equalsIgnoreCase("auto")){THREADS=Data.LOGICAL_PROCESSORS;} + else{THREADS=Integer.parseInt(b);} + System.out.println("Set threads to "+THREADS); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.startsWith("outname") || a.startsWith("outfile") || a.equals("out")){ + if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none") || split.length==1){ + System.out.println("No output file."); + outname1=null; + OUTPUT_READS=false; + }else{ + OUTPUT_READS=true; + if(b.indexOf('#')>-1){ + outname1=b.replace('#', '1'); + outname2=b.replace('#', '2'); + }else{ + outname1=b; + } + } + }else if(a.equals("minratio")){ + MINIMUM_ALIGNMENT_SCORE_RATIO=Float.parseFloat(b); + System.out.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+MINIMUM_ALIGNMENT_SCORE_RATIO); + }else if(a.equals("suspectratio")){ + SUSPECT_RATIO=Float.parseFloat(b); + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte ascii_offset=Byte.parseByte(b); + FASTQ.ASCII_OFFSET=ascii_offset; + System.out.println("Set fastq input ASCII offset to "+FASTQ.ASCII_OFFSET); + FASTQ.DETECT_QUALITY=false; + }else if(a.startsWith("verbose")){ + verbose=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter: "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); + if(in1==null){throw new RuntimeException("Please specify input file.");} + + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); +// if(verbose){System.err.println("Started cris");} +// Thread th=new Thread(cris); +// th.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + 
RTextOutputStream3 ros=null; + if(OUTPUT_READS){ + final int buff=(!OUTPUT_ORDERED_READS ? THREADS : Tools.max(24, 2*THREADS)); + + FileFormat ff1=FileFormat.testOutput(outname1, FileFormat.FASTQ, null, true, OVERWRITE, OUTPUT_ORDERED_READS); + FileFormat ff2=FileFormat.testOutput(outname2, FileFormat.FASTQ, null, true, OVERWRITE, OUTPUT_ORDERED_READS); + ros=new RTextOutputStream3(ff1, ff2, buff, null, true); + } + process(cris, ros, query, splitReads); + } + + public static void process(ConcurrentReadStreamInterface cris, RTextOutputStream3 ros, String query, boolean split){ + + Timer t=new Timer(); + t.start(); + + Thread cristhread=new Thread(cris); + cristhread.start(); + + System.out.println("Started read stream."); + + + if(ros!=null){ + ros.start(); + System.out.println("Started output threads."); + } + ProcessThread[] pts=new ProcessThread[THREADS]; + for(int i=0; i0){System.out.println("Reads Out: \t"+readsOut+" \t("+basesOut+" bases, avg length "+(basesOut/readsOut)+")");} + System.out.println(); + if(truepositive>0 || truenegative>0 || falsepositive>0 || falsenegative>0){ + System.out.println("Adapters Expected: \t"+expected); + System.out.println("True Positive: \t"+truepositive+" \t"+String.format("%.3f%%", truepositive*100f/expected)); + System.out.println("True Negative: \t"+truenegative+" \t"+String.format("%.3f%%", truenegative*100f/unexpected)); + System.out.println("False Positive: \t"+falsepositive+" \t"+String.format("%.3f%%", falsepositive*100f/unexpected)); + System.out.println("False Negative: \t"+falsenegative+" \t"+String.format("%.3f%%", falsenegative*100f/expected)); + } + + } + + private static class ProcessThread extends Thread{ + + /** + * @param cris + * @param ros + * @param mINIMUM_ALIGNMENT_SCORE_RATIO + */ + public ProcessThread(ConcurrentReadStreamInterface cris_, + RTextOutputStream3 ros_, float minRatio_, String query_, boolean split_) { + cris=cris_; + ros=ros_; + minRatio=minRatio_; + query1=query_.getBytes(); + 
query2=AminoAcid.reverseComplementBases(query1); + ALIGN_ROWS=query1.length+1; + ALIGN_COLUMNS=ALIGN_ROWS*3+20; + SPLIT=split_; + + stride=(int)(query1.length*0.95f); + window=(int)(query1.length*2.5f+10); + assert(window ln=cris.nextList(); + ArrayList readlist=ln.list; + + while(!readlist.isEmpty()){ + + //System.err.println("Got a list of size "+readlist.size()); + for(int i=0; i out=SPLIT ? split(readlist) : readlist; + for(Read r : out){ + if(r!=null){ + Read r2=r.mate; + basesOut+=r.bases.length; + readsOut++; + if(r2!=null){ + basesOut+=r2.bases.length; + readsOut++; + } + } + } + ros.add(out, ln.id); + } + + cris.returnList(ln, readlist.isEmpty()); + + //System.err.println("Waiting on a list..."); + ln=cris.nextList(); + readlist=ln.list; + } + + //System.err.println("Returning a list... (final)"); + assert(readlist.isEmpty()); + cris.returnList(ln, readlist.isEmpty()); + } + + /** + * @param readlist + * @return + */ + private ArrayList split(ArrayList in) { + ArrayList out=new ArrayList(in.size()); + for(Read r : in){ + if(r!=null){ +// assert(r.mate==null); + if(!r.hasadapter()){out.add(r);} + else{out.addAll(split(r));} + Read r2=r.mate; + if(r2!=null){ + if(!r2.hasadapter()){out.add(r2);} + else{out.addAll(split(r2));} + } + } + } + return out; + } + + /** + * @param r + * @return + */ + private ArrayList split(Read r) { + ArrayList sections=new ArrayList(); + + int lastX=-1; + for(int i=0; iminContig){ + byte[] b=Arrays.copyOfRange(r.bases, lastX+1, i); + byte[] q=r.quality==null ? null : Arrays.copyOfRange(r.quality, lastX+1, i); + Read r2=new Read(b, 0, -1, -1, r.id+"_"+(lastX+1), q, r.numericID, 0); + sections.add(r2); + } + lastX=i; + } + } + int i=r.bases.length; + if(i-lastX>minContig){ + byte[] b=Arrays.copyOfRange(r.bases, lastX+1, i); + byte[] q=r.quality==null ? 
null : Arrays.copyOfRange(r.quality, lastX+1, i); + Read r2=new Read(b, 0, -1, -1, r.id+"_"+(lastX+1), q, r.numericID, 0); + sections.add(r2); + } + return sections; + } + + /** + * @param r + */ + private int processRead(Read r) { + + int begin=0; + while(begin=r.bases.length){return 0;} + + basesIn+=r.bases.length; + + final byte[] array=npad(r.bases, npad); + + int lim=array.length-npad-stride; + + int plusFound=0; + int minusFound=0; + + int lastSuspect=-1; + int lastConfirmed=-1; + + for(int i=begin; i=minSwScoreSuspect){ + int score=rvec[0]; + int start=rvec[1]; + int stop=rvec[2]; + assert(score>=minSwScoreSuspect); + if((i==0 || start>i) && (j==array.length-1 || stop=minSwScore || + (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect0 && start>=lastConfirmed && start-lastConfirmedwindow){//Look ahead + rvec=msa.fillAndScoreLimited(query2, array, stop, stop+window, minSwScoreSuspect); + if(rvec!=null){ + if(score>=suspectMidpoint && rvec[0]>=minSwScoreSuspect && rvec[1]-stop=minSwScoreSuspect && rvec[0]>=minSwScore && rvec[1]-stop=(minSwScore)){kill=true;} + } + + if(kill){ +// System.out.println("-:"+score+", "+minSwScore+", "+minSwScoreSuspect+"\t"+lastSuspect+", "+start+", "+stop); + minusFound++; + for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} + if(USE_LOCALITY && score>=minSwScore){lastConfirmed=Tools.max(lastConfirmed, stop);} + } + } +// System.out.println("Set lastSuspect="+stop+" on score "+score); + if(USE_LOCALITY){lastSuspect=Tools.max(lastSuspect, stop);} + } + } + + if(TRY_PLUS){ + int[] rvec=msa.fillAndScoreLimited(query1, array, i, j, minSwScoreSuspect); + if(rvec!=null && rvec[0]>=minSwScoreSuspect){ + int score=rvec[0]; + int start=rvec[1]; + int stop=rvec[2]; + if((i==0 || start>i) && (j==array.length-1 || stop=minSwScore || + (score>=suspectMidpoint && lastSuspect>0 && start>=lastSuspect && start-lastSuspect0 && start>=lastConfirmed && start-lastConfirmedwindow){//Look ahead + 
rvec=msa.fillAndScoreLimited(query1, array, stop, stop+window, minSwScoreSuspect); + if(rvec!=null){ + if(score>=suspectMidpoint && rvec[0]>=minSwScoreSuspect && rvec[1]-stop=minSwScoreSuspect && rvec[0]>=minSwScore && rvec[1]-stop=(minSwScore)){kill=true;} + } + + if(kill){ +// System.out.println("+:"+score+", "+minSwScore+", "+minSwScoreSuspect+"\t"+lastSuspect+", "+start+", "+stop); + plusFound++; + for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} + if(USE_LOCALITY && score>=minSwScore){lastConfirmed=Tools.max(lastConfirmed, stop);} + } + } +// System.out.println("Set lastSuspect="+stop+" on score "+score); + if(USE_LOCALITY){lastSuspect=Tools.max(lastSuspect, stop);} + } + } + } + + int found=plusFound+minusFound; + +// if(r.synthetic()){ +// if(/*r.hasadapter() && */(r.numericID&3)==0){ +// if(plusFound>0){truepositive++;}else{falsenegative++;} +// if(plusFound>1){falsepositive+=(plusFound-1);} +// falsepositive+=minusFound; +// expected++; +// }else if(/*r.hasadapter() && */(r.numericID&3)==1){ +// if(minusFound>0){truepositive++;}else{falsenegative++;} +// if(minusFound>1){falsepositive+=(minusFound-1);} +// falsepositive+=plusFound; +// expected++; +// }else{ +// falsepositive=falsepositive+plusFound+minusFound; +// if(plusFound+minusFound==0){truenegative++;} +// unexpected++; +// } +// } + + if(r.synthetic()){ + if(/*r.hasadapter() && */(r.numericID&3)==0){ + if(found>0){truepositive++;}else{falsenegative++;} + if(found>1){falsepositive+=(found-1);} + expected++; + }else if(/*r.hasadapter() && */(r.numericID&3)==1){ + if(found>0){truepositive++;}else{falsenegative++;} + if(found>1){falsepositive+=(found-1);} + expected++; + }else{ + falsepositive+=found; + if(found==0){truenegative++;} + unexpected++; + } + } + + plusAdaptersFound+=plusFound; + minusAdaptersFound+=minusFound; + if(found>0){ + for(int i=npad, j=0; j0); + + return found; + + } + + private byte[] npad(final byte[] array, final int pad){ + final int len=array.length+2*pad; + 
if(padbuffer==null || padbuffer.length!=len){padbuffer=new byte[len];} + byte[] r=padbuffer; + for(int i=0; i list){ + int removed=0; + for(int i=0; i1 ? split[1] : "true"; + if("null".equalsIgnoreCase(b)){b=null;} +// System.err.println("Processing "+args[i]); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("path") || a.equals("root") || a.equals("tempdir")){ + Data.setPath(b); + }else if(a.equals("fasta") || a.equals("in") || a.equals("input") || a.equals("in1") || a.equals("input1")){ + in1=b; + if(b.indexOf('#')>-1){ + in1=b.replace("#", "1"); + in2=b.replace("#", "2"); + } + }else if(a.equals("in2") || a.equals("input2")){ + in2=b; + }else if(a.equals("query") || a.equals("adapter")){ + query=b; + }else if(a.endsWith("parsecustom")){ + FASTQ.PARSE_CUSTOM=Tools.parseBoolean(b); + System.out.println("Set FASTQ.PARSE_CUSTOM to "+FASTQ.PARSE_CUSTOM); + }else if(a.equals("split")){ + splitReads=Tools.parseBoolean(b); + }else if(a.startsWith("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else if(a.equals("fastawrap")){ + FastaReadInputStream.DEFAULT_WRAP=Integer.parseInt(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ziplevel=Integer.parseInt(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=Tools.parseBoolean(b); + System.out.println("Set OVERWRITE to "+OVERWRITE); + }else if(a.equals("threads") || a.equals("t")){ + if(b.equalsIgnoreCase("auto")){THREADS=Data.LOGICAL_PROCESSORS;} + else{THREADS=Integer.parseInt(b);} + System.out.println("Set threads to "+THREADS); + }else if(a.equals("reads") || a.equals("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.startsWith("outname") || 
a.startsWith("outfile") || a.equals("out")){ + if(b==null || b.equalsIgnoreCase("null") || b.equalsIgnoreCase("none") || split.length==1){ + System.out.println("No output file."); + outname1=null; + OUTPUT_READS=false; + }else{ + OUTPUT_READS=true; + if(b.indexOf('#')>-1){ + outname1=b.replace('#', '1'); + outname2=b.replace('#', '2'); + }else{ + outname1=b; + } + } + }else if(a.equals("perfectmode")){ + PERFECTMODE=Tools.parseBoolean(b); + if(ziplevel==-1){ziplevel=2;} + }else if(a.equals("minratio")){ + MINIMUM_ALIGNMENT_SCORE_RATIO=Float.parseFloat(b); + System.out.println("Set MINIMUM_ALIGNMENT_SCORE_RATIO to "+MINIMUM_ALIGNMENT_SCORE_RATIO); + }else if(a.equals("asciiin") || a.equals("qualityin") || a.equals("qualin") || a.equals("qin")){ + byte ascii_offset=Byte.parseByte(b); + FASTQ.ASCII_OFFSET=ascii_offset; + System.out.println("Set fastq input ASCII offset to "+FASTQ.ASCII_OFFSET); + FASTQ.DETECT_QUALITY=false; + }else if(a.startsWith("verbose")){ + verbose=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter: "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); + if(in1==null){throw new RuntimeException("Please specify input file.");} + + + final ConcurrentReadStreamInterface cris; + { + FileFormat ff1=FileFormat.testInput(in1, FileFormat.FASTQ, null, true, true); + FileFormat ff2=FileFormat.testInput(in2, FileFormat.FASTQ, null, true, true); + cris=ConcurrentGenericReadInputStream.getReadInputStream(maxReads, false, true, ff1, ff2); +// if(verbose){System.err.println("Started cris");} +// Thread th=new Thread(cris); +// th.start(); + } + boolean paired=cris.paired(); + if(verbose){System.err.println("Paired: "+paired);} + + RTextOutputStream3 ros=null; + if(OUTPUT_READS){ + final int buff=(!OUTPUT_ORDERED_READS ? 
THREADS : Tools.max(24, 2*THREADS)); + + FileFormat ff1=FileFormat.testOutput(outname1, FileFormat.FASTQ, null, true, OVERWRITE, OUTPUT_ORDERED_READS); + FileFormat ff2=FileFormat.testOutput(outname2, FileFormat.FASTQ, null, true, OVERWRITE, OUTPUT_ORDERED_READS); + ros=new RTextOutputStream3(ff1, ff2, buff, null, true); + } + process(cris, ros, query, splitReads); + } + + public static void process(ConcurrentReadStreamInterface cris, RTextOutputStream3 ros, String query, boolean split){ + + Timer t=new Timer(); + t.start(); + + Thread cristhread=new Thread(cris); + cristhread.start(); + + System.out.println("Started read stream."); + + + if(ros!=null){ + ros.start(); + System.out.println("Started output threads."); + } + ProcessThread[] pts=new ProcessThread[THREADS]; + for(int i=0; i0 || truenegative>0 || falsepositive>0 || falsenegative>0){ + System.out.println("Adapters Expected: \t"+expected); + System.out.println("True Positive: \t"+truepositive+" \t"+String.format("%.3f%%", truepositive*100f/expected)); + System.out.println("True Negative: \t"+truenegative+" \t"+String.format("%.3f%%", truenegative*100f/unexpected)); + System.out.println("False Positive: \t"+falsepositive+" \t"+String.format("%.3f%%", falsepositive*100f/unexpected)); + System.out.println("False Negative: \t"+falsenegative+" \t"+String.format("%.3f%%", falsenegative*100f/expected)); + } + + } + + private static class ProcessThread extends Thread{ + + /** + * @param cris + * @param ros + * @param mINIMUM_ALIGNMENT_SCORE_RATIO + */ + public ProcessThread(ConcurrentReadStreamInterface cris_, + RTextOutputStream3 ros_, float minRatio_, String query_, boolean split_) { + cris=cris_; + ros=ros_; + minRatio=minRatio_; + query1=query_.getBytes(); + query2=AminoAcid.reverseComplementBases(query1); + + stride=(int)(query1.length*0.95f); + window=(int)(query1.length*2.5f+10); + + ALIGN_ROWS=Tools.max(query1.length, rcompDistance)+1; + ALIGN_COLUMNS=Tools.max(window, rcompDistance)+5; + SPLIT=split_; + + 
assert(window ln=cris.nextList(); + ArrayList readlist=ln.list; + + while(!readlist.isEmpty()){ + + //System.err.println("Got a list of size "+readlist.size()); + for(int i=0; i split(ArrayList in) { + ArrayList out=new ArrayList(in.size()); + for(Read r : in){ + if(r!=null){ + assert(r.mate==null); + if(!r.hasadapter()){out.add(r);} + else{out.addAll(split(r));} + } + } + return out; + } + + /** + * @param r + * @return + */ + private ArrayList split(Read r) { + ArrayList sections=new ArrayList(); + + int lastX=-1; + for(int i=0; iminContig){ + byte[] b=Arrays.copyOfRange(r.bases, lastX+1, i); + byte[] q=r.quality==null ? null : Arrays.copyOfRange(r.quality, lastX+1, i); + Read r2=new Read(b, 0, -1, -1, r.id+"_"+(lastX+1), q, r.numericID, 0); + sections.add(r2); + } + lastX=i; + } + } + int i=r.bases.length; + if(i-lastX>minContig){ + byte[] b=Arrays.copyOfRange(r.bases, lastX+1, i); + byte[] q=r.quality==null ? null : Arrays.copyOfRange(r.quality, lastX+1, i); + Read r2=new Read(b, 0, -1, -1, r.id+"_"+(lastX+1), q, r.numericID, 0); + sections.add(r2); + } + return sections; + } + + /** + * @param r + */ + private int processRead(Read r) { + + int begin=0; + while(begin=r.bases.length){return 0;} + + final byte[] array=npad(r.bases, npad); + + int lim=array.length-npad-stride; + + int plusFound=0; + int minusFound=0; + + int lastSuspect=-1; + + for(int i=begin; i=minSwScoreSuspect){ + int score=rvec[0]; + int start=rvec[1]; + int stop=rvec[2]; + assert(score>=minSwScoreSuspect); + if((i==0 || start>i) && (j==array.length-1 || stop=minSwScore || (lastSuspect>0 && start>=lastSuspect && start-lastSuspectwindow){//Look ahead + rvec=msa.fillAndScoreLimited(query2, array, stop, stop+window, minSwScoreSuspect); + if(rvec!=null && rvec[0]>=minSwScoreSuspect && rvec[1]-stoprcompDistance && array.length-stop>rcompDistance+1){ + kill=testRcomp(array, start, stop); +// System.out.print(kill ? 
"#" : "."); + } + + if(kill){ +// System.out.println("-:"+score+", "+minSwScore+", "+minSwScoreSuspect+"\t"+lastSuspect+", "+start+", "+stop); + minusFound++; + for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} + } + } +// System.out.println("Set lastSuspect="+stop+" on score "+score); + lastSuspect=stop; + } + } + + { + int[] rvec=msa.fillAndScoreLimited(query1, array, i, j, minSwScoreSuspect); + if(rvec!=null && rvec[0]>=minSwScoreSuspect){ + int score=rvec[0]; + int start=rvec[1]; + int stop=rvec[2]; + if((i==0 || start>i) && (j==array.length-1 || stop=minSwScore || (lastSuspect>0 && start>=lastSuspect && start-lastSuspectwindow){//Look ahead + rvec=msa.fillAndScoreLimited(query1, array, stop, stop+window, minSwScoreSuspect); + if(rvec!=null && rvec[0]>=minSwScoreSuspect && rvec[1]-stoprcompDistance && array.length-stop>rcompDistance+1){ + kill=testRcomp(array, start, stop); +// System.out.print(kill ? "#" : "."); + } + + if(kill){ +// System.out.println("+:"+score+", "+minSwScore+", "+minSwScoreSuspect+"\t"+lastSuspect+", "+start+", "+stop); + plusFound++; + for(int x=Tools.max(0, start); x<=stop; x++){array[x]='X';} + } + } +// System.out.println("Set lastSuspect="+stop+" on score "+score); + lastSuspect=stop; + } + } + } + + if(r.synthetic()){ + if(/*r.hasadapter() && */(r.numericID&3)==0){ + if(plusFound>0){truepositive++;}else{falsenegative++;} + if(plusFound>1){falsepositive+=(plusFound-1);} + falsepositive+=minusFound; + expected++; + }else if(/*r.hasadapter() && */(r.numericID&3)==1){ + if(minusFound>0){truepositive++;}else{falsenegative++;} + if(minusFound>1){falsepositive+=(minusFound-1);} + falsepositive+=plusFound; + expected++; + }else{ + falsepositive=falsepositive+plusFound+minusFound; + if(plusFound+minusFound==0){truenegative++;} + unexpected++; + } + } + + plusAdaptersFound+=plusFound; + minusAdaptersFound+=minusFound; + int found=plusFound+minusFound; + if(found>0){ + for(int i=npad, j=0; j0); + + return found; + + } + + /** + * 
@param array + * @param start + * @param stop + * @return + */ + private boolean testRcomp(byte[] array, int start, int stop) { + + for(int i=0, j=start-rcompDistance, k=stop+1; i=1000; +// return score>=minSwScoreRcomp; + } + + private byte[] npad(final byte[] array, final int pad){ + final int len=array.length+2*pad; + if(padbuffer==null || padbuffer.length!=len){padbuffer=new byte[len];} + byte[] r=padbuffer; + for(int i=0; i list){ + int removed=0; + for(int i=0; i=0 ? idPairnum : -idPairnum;} + public int pairNum(){return idPairnum>=0 ? 0 : 1;} + public int chrom(){return chromStrand>=0 ? chromStrand : -chromStrand;} + public byte strand(){return chromStrand>=0 ? (byte)0 : (byte)1;}; + public int listLength(){ + int i=1; + SiteR sr=this; + while(sr.next!=null){ + sr=sr.next; + i++; + } + return i; + } + +} diff --git a/current/pacbio/SortSites.java b/current/pacbio/SortSites.java new file mode 100755 index 0000000..450249e --- /dev/null +++ b/current/pacbio/SortSites.java @@ -0,0 +1,298 @@ +package pacbio; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; + +import stream.SiteScoreR; + + +import align2.Tools; +import dna.Data; +import dna.Timer; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Aug 2, 2012 + * + */ +public class SortSites { + + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + Timer t=new Timer(); + t.start(); + + String tempname=null; + + for(int i=2; i1 ? 
split[1] : null; + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("genome") || a.equals("build")){ + Data.setGenome(Integer.parseInt(b)); //Not needed + }else if(a.equals("tempname")){ + tempname=b; + }else if(a.equals("deletefiles") || a.startsWith("deletetemp") || a.equals("delete")){ + DELETE_TEMP=(Tools.parseBoolean(b)); + }else if(a.equals("mode")){ + POSITIONMODE=(b.contains("position") || b.contains("location")); + }else if(a.equals("blocksize")){ + BLOCKSIZE=(Integer.parseInt(b)); + }else if(a.equals("ignoreperfect")){ + IGNORE_PERFECT_SITES=(Tools.parseBoolean(b)); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + if(POSITIONMODE){ + System.out.println("Sorting by position."); + }else{ + System.out.println("Sorting by ID."); + } + + stack(args[0], args[1], tempname); + assert(sitesRead==sitesWritten || (sitesRead>=sitesWritten && IGNORE_PERFECT_SITES)); + t.stop(); + System.out.println("Time: \t"+t); + } + + public static void stack(String fname1, String outname, String tempname){ + + TextFile tf=new TextFile(fname1, false, false); + + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + + SiteScoreR[] array=SiteScoreR.fromTextArray(s); + sitesRead+=array.length; + + for(SiteScoreR ssr : array){ + if(!ssr.perfect || !IGNORE_PERFECT_SITES){ + write(ssr); + } + } + } + tf.close(); + + System.out.println("Finished reading"); + System.out.println("Read "+sitesRead+" sites."); + + finish(outname); + System.out.println("Wrote "+sitesWritten+" sites."); + System.out.println("Wrote "+perfectWritten+" perfect sites."); + System.out.println("Wrote "+semiperfectWritten+" semiperfect sites."); + wmap.clear(); + } + + private static void write(SiteScoreR ssr){ + long key=key(ssr); + TextStreamWriter tsw=wmap.get(key); + if(tsw==null){ + String fname=fname(key, tempname); + tsw=new TextStreamWriter(fname, true, false, false); + 
tsw.start(); + wmap.put(key, tsw); + } + tsw.print(ssr.toText().append('\n')); + } + + protected static final long key(SiteScoreR ssr){ + return (POSITIONMODE ? poskey(ssr.chrom, ssr.start) : idkey(ssr.numericID)); + } + + protected static final long poskey(int chrom, int start){ + long k=((long)chrom<<32)+(Tools.max(start, 0))/BLOCKSIZE; + return k; + } + + protected static final long idkey(long id){ + long k=id/BLOCKSIZE; + return k; + } + + protected static final String fname(long key, String outname){ + if(outname==null){outname=DEFAULT_TEMP_PATTERN;} + assert(outname.contains("#")) : outname; + return outname.replace("#", "b"+Data.GENOME_BUILD+"_"+key); + } + + private static final void finish(String outname){ + TextStreamWriter out=new TextStreamWriter(outname, true, false, false); + out.start(); + ArrayList keys=new ArrayList(wmap.size()); + keys.addAll(wmap.keySet()); + Collections.sort(keys); + for(Long k : keys){ + TextStreamWriter tsw=wmap.get(k); + tsw.poison(); + } + + if(POSITIONMODE){ + finishByPosition(out, keys); + }else{ + finishByID(out, keys); + } + + out.poisonAndWait(); + } + + private static final void finishByPosition(TextStreamWriter out, ArrayList keys){ + + + + int chrom=0; + int loc=INTERVAL; + String tab=""; + StringBuilder sb=new StringBuilder(4000); + + for(Long k : keys){ + TextStreamWriter tsw=wmap.get(k); + String fname=fname(k, tempname); + for(int i=0; i<50 && tsw.isAlive(); i++){ + try { + tsw.join(20000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(tsw.isAlive()){ + System.err.println("Waiting for tsw "+tsw.fname+" to finish..."); + } + } + if(tsw.isAlive()){ + System.err.println(tsw.getClass().getName()+" for "+fname+" refused to die after a long time."); + assert(false); + } + + TextFile tf=new TextFile(fname, false, false); + ArrayList list=new ArrayList(1000); + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){list.add(SiteScoreR.fromText(s));} + tf.close(); 
+ if(DELETE_TEMP){ + new File(fname).delete(); + } + Collections.sort(list, SiteScoreR.PCOMP); + + final int lim=list.size(); + for(int i=0; ichrom || ssr.start>=loc){ + if(sb.length()>0){//Purge to disk + sb.append('\n'); + out.print(sb.toString()); + sb.setLength(0); + } + chrom=ssr.chrom; + loc=ssr.start; + loc=(loc-(loc%INTERVAL))+INTERVAL; + assert(loc>ssr.start); + assert(loc-ssr.start<=INTERVAL); + assert(loc%INTERVAL==0); + tab=""; + } + sb.append(tab); + sb.append(ssr.toText()); + tab="\t"; + } + + } + + + sb.append('\n'); + out.print(sb.toString()); + } + + private static final void finishByID(TextStreamWriter out, ArrayList keys){ + + long id=0; + int pairnum=0; + String tab=""; + StringBuilder sb=new StringBuilder(4000); + + for(Long k : keys){ + TextStreamWriter tsw=wmap.get(k); + String fname=fname(k, tempname); + for(int i=0; i<50 && tsw.isAlive(); i++){ + try { + tsw.join(20000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(tsw.isAlive()){ + System.err.println("Waiting for tsw "+tsw.fname+" to finish..."); + } + } + if(tsw.isAlive()){ + System.err.println(tsw.getClass().getName()+" for "+fname+" refused to die after a long time."); + assert(false); + } + + TextFile tf=new TextFile(fname, false, false); + ArrayList list=new ArrayList(1000); + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){list.add(SiteScoreR.fromText(s));} + tf.close(); + if(DELETE_TEMP){ + new File(fname).delete(); + } + Collections.sort(list, SiteScoreR.IDCOMP); + + final int lim=list.size(); + for(int i=0; iid || ssr.pairnum>pairnum){ + if(sb.length()>0){//Purge to disk + sb.append('\n'); + out.print(sb.toString()); + sb.setLength(0); + } + id=ssr.numericID; + pairnum=ssr.pairnum; + tab=""; + }else{ + assert(ssr.numericID==id && ssr.pairnum==pairnum); + } + sb.append(tab); + sb.append(ssr.toText()); + tab="\t"; + } + + } + + + sb.append('\n'); + out.print(sb.toString()); + } + + private static final HashMap 
wmap=new HashMap(); + + public static int INTERVAL=200; + public static int BLOCKSIZE=8000000; + public static long sitesRead=0; + public static long sitesWritten=0; + public static long perfectWritten=0; + public static long semiperfectWritten=0; + public static boolean DELETE_TEMP=true; + public static final String DEFAULT_TEMP_PATTERN="SortSitesByIDTempFile_#.txt.gz"; + public static String tempname=null; + public static boolean POSITIONMODE=false; //False means sort by ID + public static boolean IGNORE_PERFECT_SITES=false; //Don't process perfect mappings, since they can't yield varlets. + +} diff --git a/current/pacbio/SplitOffPerfectContigs.java b/current/pacbio/SplitOffPerfectContigs.java new file mode 100755 index 0000000..e23ec74 --- /dev/null +++ b/current/pacbio/SplitOffPerfectContigs.java @@ -0,0 +1,393 @@ +package pacbio; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + +import stream.SiteScore; + + +import align2.Tools; +import dna.ChromosomeArray; +import dna.ChromosomeArrayCompressed; +import dna.CoverageArray; +import dna.CoverageArray2; +import dna.Data; +import dna.FastaToChromArrays; +import dna.Range; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +/** + * @author Brian Bushnell + * @date Jul 26, 2012 + * + */ +public class SplitOffPerfectContigs { + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + Timer t=new Timer(); + t.start(); + +// ChromosomeArray c=new ChromosomeArray(1, (byte)1, "ANNNAAAANAAANNA"); +// System.out.println(c.toContigRanges(3)); +// System.out.println(c.toContigRanges(2)); +// System.out.println(c.toContigRanges(1)); +// assert(false); + + + Data.GENOME_BUILD=-1; + String dest=null; + String covfile=null; + String sitesfile=null; + String contigfile=null; + int trigger=50; + int blocklen=100; + int mincoverage=2; + 
int padding=4; + int buildout=-1; + String name=null; + String source=null; + + + for(int i=0; i1 ? split[1] : "true"; + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("genome") || a.equals("build")){ + Data.setGenome(Integer.parseInt(b)); + name=Data.name; + source=Data.genomeSource; + System.out.println("Set Data.GENOME_BUILD to "+Data.GENOME_BUILD); + }else if(a.equals("outgenome") || a.equals("outbuild") || a.equals("genomeout") || a.equals("buildout")){ + buildout=Integer.parseInt(b); + }else if(a.equals("out") || a.equals("outfile")){ + dest=b; + }else if(a.startsWith("cov") || a.startsWith("pcov") || a.startsWith("perfectcov")){ + covfile=b; + }else if(a.startsWith("sites") || a.startsWith("psites") || a.startsWith("perfectsites")){ + sitesfile=b; + }else if(a.equals("padding")){ + padding=Integer.parseInt(b); + }else if(a.equals("trigger")){ + trigger=Integer.parseInt(b); + }else if(a.startsWith("mincov")){ + mincoverage=Integer.parseInt(b); + }else if(a.equals("blocklen")){ + blocklen=Integer.parseInt(b); + }else if(a.equals("contigfile")){ + contigfile=b; + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else if(a.startsWith("breakbad") || a.startsWith("splitbad") || a.startsWith("splitchim")){ + BREAK_BAD_CONTIGS=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter: "+args[i]); + } + } + + assert(Data.GENOME_BUILD>-1); + if(buildout<=0){buildout=Data.GENOME_BUILD;} +// assert(buildout!=Data.GENOME_BUILD); //For testing + + TextStreamWriter tsw=new TextStreamWriter(dest, false, true, false); + tsw.start(); + + //Break into contigs + long contig=1; + + if(contigfile!=null){ + if(new File(contigfile).exists()){ + TextFile tf=new TextFile(contigfile, false, false); + String s=tf.nextLine(); + if(s!=null){contig=Long.parseLong(s);} + tf.close(); + } + } + + ArrayList calist=null; + if(sitesfile!=null){ + 
calist=toCoverage(sitesfile, padding); + System.out.println("Made coverage; list size is "+calist.size()); + } + + if(buildout==Data.GENOME_BUILD){ + String fname=Data.chromFname(1, buildout); + fname=fname.replaceFirst("/genome/", "/index/"); + fname=fname.substring(0, fname.lastIndexOf('/')); + File dir=new File(fname); + if(dir.exists()){ + System.out.println("Deleting old index."); + for(File f2 : dir.listFiles()){ + if(f2.isFile() && !f2.isDirectory() && f2.getName().contains(".int2d")){f2.delete();} + } + } + } + + for(int chrom=1; chrom<=Data.numChroms; chrom++){ + ChromosomeArray cha=Data.getChromosome(chrom); + Data.unload(chrom, true); + CoverageArray ca=null; + if(calist!=null){ + if(calist.size()>chrom){ + ca=calist.get(chrom); + calist.set(chrom, null); + } + }else{ + assert(covfile!=null && covfile.contains("#")); + ca=ReadWrite.read(CoverageArray.class, covfile.replaceFirst("#", ""+chrom)); + if(ca==null){System.out.println("Can't find coverage for chrom "+chrom+" in file "+covfile.replaceFirst("#", ""+chrom));} + } + if(ca!=null){ + contig=writeContigs(cha, ca, contig, trigger, mincoverage, blocklen, tsw, buildout, align2.Tools.max(1, padding)); + }else{ + System.out.println("Can't find coverage for chrom "+chrom); + } + } + + + tsw.poison(); + + if(contigfile!=null){ + ReadWrite.writeString(""+contig, contigfile, false); + } + + FastaToChromArrays.writeInfo(buildout, Data.numChroms, name, source, false, false); + + t.stop(); + + System.out.println(" \tWrote \tKept \tDropped \tSplit"); + System.out.println("Bases \t"+basesWritten+" \t"+basesKept+" \t"+basesDropped+" \t"+basesX); + System.out.println("Contigs \t"+contigsWritten+" \t"+contigsKept+" \t"+contigsDropped+" \t"+contigsX); + System.out.println("Avg Len \t"+(basesWritten/Tools.max(contigsWritten,1))+" \t"+(basesKept/Tools.max(contigsKept,1)) + +" \t"+(basesDropped/Tools.max(contigsDropped, 1))+" \t"+(basesX/Tools.max(contigsX, 1))); + + System.out.println("Time:\t"+t); + } + + public static 
long writeContigs(ChromosomeArray cha, CoverageArray ca, long contig, int trigger, int minAcceptableCoverage, int fastaBlocklen, + TextStreamWriter tsw, int buildout, int tipbuffer){ + + ArrayList list=cha.toContigRanges(trigger); + + int minContig=MIN_CONTIG_TO_ADD; + + if(BREAK_BAD_CONTIGS){ + for(Range r : list){ + if(r.length>=minContig){ +// int uncovered=0; +// for(int i=r.a; i<=r.b; i++){ +// int cov=ca.get(i); +// if(cov=minContig){ + byte c=cha.get(i); + if(c!='N' && c!='X'){basesX++;} + if(i-lastx>10){ + contigsX++; + } + cha.set(i, 'X'); + lastx=i; + } + contiglen=0; + }else{ + contiglen++; + } + } + + //Reverse pass + lastx=Integer.MAX_VALUE; + contiglen=0; + for(int i=r.b; i>=r.a; i--){ + int cov=ca.get(i); + if(cov=minContig){ + byte c=cha.get(i); + if(c!='N' && c!='X'){basesX++;} + if(lastx-i>10){ + contigsX++; + } + cha.set(i, 'X'); + lastx=i; + } + contiglen=0; + }else{ + contiglen++; + } + } + } + } + list=cha.toContigRanges(trigger); + } + + + ArrayList good=new ArrayList(); + ArrayList bad=new ArrayList(); + int badlen=0; + + for(Range r : list){ + if(r.length>=minContig){ + int minCov=Integer.MAX_VALUE; + for(int i=r.a+tipbuffer; i<=r.b-tipbuffer; i++){ + minCov=Tools.min(minCov, ca.get(i)); + } + if(minCov>=minAcceptableCoverage){ + good.add(r); + if(verbose){ + StringBuilder sb0=new StringBuilder(), sb1=new StringBuilder(), sb2=new StringBuilder(); + for(int i=r.a; i<=r.b; i++){ + int cov=ca.get(i); + char b=(char) cha.get(i); + sb0.append(b); + sb1.append(b+"\t"); + sb2.append(cov+"\t"); + } + System.out.println(sb0+"\n"+sb1+"\n"+sb2+"\n"); + } + }else{ + bad.add(r); + badlen+=r.length+N_PAD_LENGTH; + if(verbose){ + StringBuilder sb0=new StringBuilder(), sb1=new StringBuilder(), sb2=new StringBuilder(); + for(int i=r.a; i<=r.b; i++){ + int cov=ca.get(i); + char b=(char) cha.get(i); + sb0.append(b); + sb1.append(b+"\t"); + sb2.append(cov+"\t"); + } + System.err.println(sb0+"\n"+sb1+"\n"+sb2+"\n"); + } + } + }else{ + contigsDropped++; + 
basesDropped+=r.length; + } + } + + for(Range r : good){ + contigsWritten++; + basesWritten+=r.length; + String s=cha.getString(r.a, r.b); + tsw.print(">"+contig+"\n"); + contig++; + writeContig(s, tsw, fastaBlocklen); +// for(int i=r.a; i<=r.b; i++){cha.set(i, 'N');} //Delete "good" contigs from reference. + } + + badlen=badlen+2*N_PAD_LENGTH2-N_PAD_LENGTH+10; + ChromosomeArray cha2=new ChromosomeArray(cha.chromosome, cha.strand, 0, badlen); + cha2.maxIndex=-1; + cha2.minIndex=0; + for(int i=0; i toCoverage(String sitesfile, int padding){ + ArrayList pcov=new ArrayList(8); + pcov.add(new CoverageArray2(0,1000)); + + long perfect=0; + long semiperfect=0; + long sites=0; + + String[] files=sitesfile.split(","); + for(String f : files){ + TextFile tf=new TextFile(f, false, false); + for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ + String[] split=line.split("\t"); + for(String s : split){ + SiteScore ss=SiteScore.fromText(s); + while(pcov.size()<=ss.chrom){ + pcov.add(new CoverageArray2(pcov.size())); + } + if(ss.perfect || ss.semiperfect){ + CoverageArray ca=pcov.get(ss.chrom); + for(int i=ss.start+padding; i<=ss.stop-padding; i++){ + ca.increment(i); + } + } + if(ss.perfect){perfect++;} + if(ss.semiperfect){semiperfect++;} + sites++; + assert(!ss.perfect || ss.semiperfect) : ss.perfect+", "+ss.semiperfect+"\n"+ss.header()+"\n"+ss.toText()+"\n"+s+"\n"; + } + } + tf.close(); + } + System.out.println("Read "+files.length+" sites file"+(files.length==1 ? "." 
: "s.")); + System.out.println("sites="+sites+" \tsemiperfect="+semiperfect+" \tperfect="+perfect); + return pcov; + } + + + public static long basesWritten=0; + public static long basesKept=0; + public static long basesDropped=0; + public static long basesX=0; + public static long contigsWritten=0; + public static long contigsKept=0; + public static long contigsDropped=0; + public static long contigsX=0; + + public static int N_PAD_LENGTH=MergeFastaContigs.N_PAD_LENGTH; + public static int N_PAD_LENGTH2=MergeFastaContigs.N_PAD_LENGTH2; //for ends + public static int MIN_CONTIG_TO_ADD=50; + public static boolean BREAK_BAD_CONTIGS=false; + + public static boolean verbose=false; + +} diff --git a/current/pacbio/StackSites.java b/current/pacbio/StackSites.java new file mode 100755 index 0000000..8d63c82 --- /dev/null +++ b/current/pacbio/StackSites.java @@ -0,0 +1,312 @@ +package pacbio; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.SiteScore; +import stream.SiteScoreR; + + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.CoverageArray; +import dna.CoverageArray2; +import dna.Data; +import dna.Gene; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.TextStreamWriter; + +import align2.ListNum; +import align2.MultiStateAligner9PacBio; +import align2.Tools; + +/** + * @author Brian Bushnell + * @date Jul 16, 2012 + * + */ +public class StackSites { + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + Timer t=new Timer(); + t.start(); + + for(int i=4; i1 ? 
split[1] : null; + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("genome") || a.equals("build")){ + Data.setGenome(Integer.parseInt(b)); + } + } + + stack(args[0], args[1], args[2], args[3]); + t.stop(); + System.out.println("Time: \t"+t); + } + + public static void stack(String fname1, String fname2, String outname, String pcovoutname){ + assert(pcovoutname.contains("#")); + RTextInputStream rtis=new RTextInputStream(fname1, (fname2==null || fname2.equals("null") ? null : fname2), -1); + ConcurrentReadInputStream cris=new ConcurrentReadInputStream(rtis, -1); + + new Thread(cris).start(); + System.err.println("Started cris"); + boolean paired=cris.paired(); + System.err.println("Paired: "+paired); + + ArrayList pcov=new ArrayList(8); + pcov.add(new CoverageArray2(0,1000)); + + Glob g=new Glob(); + + { + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + +// System.out.println("Processing read "+r.numericID); + + if(r!=null){ + if(r.sites!=null){ +// System.out.println("Adding "+r.list.size()+" sites."); + SiteScore original=r.originalSite; + for(SiteScore ss : r.sites){ + sitesProcessed++; + + //TODO: Process perfect coverage + { + boolean b=false; + if(ss.perfect || ss.semiperfect){ + b=true; + }else{//Check for no-refs + int len=ss.stop-ss.start+1; + if(len==r.bases.length && ss.slowScore>=0.5f*MultiStateAligner9PacBio.POINTS_MATCH2){ + b=checkPerfection(ss.start, ss.stop, r.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f); + } + } + if(b){ + while(pcov.size()<=ss.chrom){ + pcov.add(new CoverageArray2(pcov.size())); + } + CoverageArray ca=pcov.get(ss.chrom); + for(int i=ss.start; 
i<=ss.stop; i++){ + ca.increment(i); + } + } + } + + SiteScoreR ssr=new SiteScoreR(ss, r.bases.length, r.numericID, (byte)r.pairnum()); + + if(original!=null){ + ssr.correct=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, 40, false); + } + + g.add(ssr); + } +// System.out.println(sitesProcessed); + } + } + + if(r.mate!=null){ + Read r2=r.mate; + if(r2.sites!=null){ + + SiteScore original=r2.originalSite; + for(SiteScore ss : r2.sites){ + sitesProcessed++; + + { + boolean b=false; + if(ss.perfect || ss.semiperfect){ + b=true; + }else{//Check for no-refs + int len=ss.stop-ss.start+1; + if(len==r2.bases.length && ss.slowScore>=0.5f*MultiStateAligner9PacBio.POINTS_MATCH2){ + b=checkPerfection(ss.start, ss.stop, r2.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f); + } + } + if(b){ + while(pcov.size()<=ss.chrom){ + pcov.add(new CoverageArray2(pcov.size())); + } + CoverageArray ca=pcov.get(ss.chrom); + for(int i=ss.start; i<=ss.stop; i++){ + ca.increment(i); + } + } + } + + SiteScoreR ssr=new SiteScoreR(ss, r2.bases.length, r2.numericID, (byte)r2.pairnum()); + + if(original!=null){ + ssr.correct=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, 40, false); + } + + g.add(ssr); + } + } + } + +// System.out.println(r.toString()); +// assert(r.list!=null); +// assert(r.list.size()>0); + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.err.println("Returned list"); + ReadWrite.closeStream(cris); + System.err.println("Closed stream"); + System.err.println("Processed "+readsProcessed+" reads."); + System.err.println("Processed "+sitesProcessed+" sites."); + } + + + for(int i=1; i=f*bases.length; + } + + private static void write(ArrayList alsr, TextStreamWriter out){ + if(alsr==null || alsr.size()==0){return;} + + int chrom=0; + int loc=INTERVAL; + StringBuilder sb=new StringBuilder(); + + String tab=""; + + final int lim=alsr.size(); + for(int i=0; ichrom || ssr.start>=loc){ + if(sb.length()>0){//Purge to disk + sb.append('\n'); + out.print(sb.toString()); + sb.setLength(0); + } + chrom=ssr.chrom; + loc=ssr.start; + loc=(loc-(loc%INTERVAL))+INTERVAL; + assert(loc>ssr.start); + assert(loc-ssr.start<=INTERVAL); + assert(loc%INTERVAL==0); + tab=""; + } + sb.append(tab); + sb.append(ssr.toText()); + tab="\t"; + } + + if(sb.length()>0){//Purge to disk + sb.append('\n'); + out.print(sb.toString()); + sb.setLength(0); + } + } + + public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, boolean useChrom){ + if((useChrom && ss.chrom!=trueChrom) || ss.strand!=trueStrand){return false;} + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + + return (Tools.absdif(ss.start, trueStart)<=thresh || Tools.absdif(ss.stop, trueStop)<=thresh); + } + + private static class Glob{ + + public Glob(){ + array=new ArrayList[8]; + for(int i=0; i(); + } + } + + public void add(SiteScoreR ssr){ + if(ssr.chrom>=array.length){ + int newlen=((int)ssr.chrom*2); + assert(newlen>array.length); + ArrayList[] array2=new ArrayList[newlen]; + for(int i=0; i();} + array=array2; + } + array[ssr.chrom].add(ssr); + } + + public ArrayList[] array; + + } + + public 
static final int INTERVAL=200; + public static long readsProcessed=0; + public static long sitesProcessed=0; + +} diff --git a/current/pacbio/StackSites2.java b/current/pacbio/StackSites2.java new file mode 100755 index 0000000..2cfdc80 --- /dev/null +++ b/current/pacbio/StackSites2.java @@ -0,0 +1,501 @@ +package pacbio; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.SiteScore; +import stream.SiteScoreR; + + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.CoverageArray; +import dna.CoverageArray2; +import dna.Data; +import dna.Gene; +import dna.Timer; +import fileIO.ReadWrite; +import fileIO.TextFile; +import fileIO.TextStreamWriter; + +import align2.ListNum; +import align2.MultiStateAligner9PacBio; +import align2.Tools; + +/** + * @author Brian Bushnell + * @date Jul 16, 2012 + * + */ +public class StackSites2 { + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + + Timer t=new Timer(); + t.start(); + + String tempname=null; + Data.GENOME_BUILD=-1; + + for(int i=4; i1 ? 
split[1] : null; + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("genome") || a.equals("build")){ + Data.setGenome(Integer.parseInt(b)); + }else if(a.equals("tempname")){ + tempname=b; + }else if(a.equals("deletefiles") || a.startsWith("deletetemp") || a.equals("delete")){ + DELETE_TEMP=(Tools.parseBoolean(b)); + }else if(a.equals("blocksize")){ + BLOCKSIZE=(Integer.parseInt(b)); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(Data.GENOME_BUILD<0){throw new RuntimeException("Please specify genome build.");} + + stack(args[0], args[1], args[2], args[3], tempname); + t.stop(); + System.out.println("Time: \t"+t); + } + + public static void stack(String fname1, String fname2, String outname, String pcovoutname, String tempname){ + assert(pcovoutname.contains("#")); + final RTextInputStream rtis=new RTextInputStream(fname1, (fname2==null || fname2.equals("null") ? null : fname2), -1); + final ConcurrentReadInputStream cris=new ConcurrentReadInputStream(rtis, -1); + + new Thread(cris).start(); + System.err.println("Started cris"); + final boolean paired=cris.paired(); + System.err.println("Paired: "+paired); + + final ArrayList pcov; + final ArrayList truePcov; + final ArrayList cov; + + { + int len=(Data.GENOME_BUILD<0 ? 8 : Data.numChroms+1); + + pcov=new ArrayList(len); + truePcov=new ArrayList(len); + cov=new ArrayList(len); + + System.out.println("len="+len+"; Data.numChroms="+Data.numChroms); + + pcov.add(null); + truePcov.add(null); + cov.add(null); + + for(int i=1; i ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert(paired==(r.mate!=null)); + } + + while(reads!=null && reads.size()>0){ + //System.err.println("reads.size()="+reads.size()); + for(Read r : reads){ + readsProcessed++; + +// System.out.println("Processing read "+r.numericID); + + if(r!=null){ + if(r.sites!=null){ +// System.out.println("Adding "+r.list.size()+" sites."); + SiteScore original=r.originalSite; + for(SiteScore ss : r.sites){ + sitesProcessed++; + + //TODO: Process perfect coverage + { + boolean b=false; + if(ss.semiperfect){ + b=true; + }else{//Check for no-refs + int len=ss.stop-ss.start+1; + if(len==r.bases.length && ss.slowScore>=0.5f*MultiStateAligner9PacBio.POINTS_MATCH2){ + b=checkPerfection(ss.start, ss.stop, r.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f); + } + } + if(b){ + while(pcov.size()<=ss.chrom){ + pcov.add(new CoverageArray2(pcov.size(), Data.chromLengths[pcov.size()])); + truePcov.add(new CoverageArray2(truePcov.size(), Data.chromLengths[truePcov.size()])); + } + CoverageArray ca=pcov.get(ss.chrom); + CoverageArray tca=truePcov.get(ss.chrom); + for(int i=ss.start+PCOV_TIP_DIST; i<=ss.stop-PCOV_TIP_DIST; i++){ + ca.increment(i); + } + if(ss.perfect){ + for(int i=ss.start; i<=ss.stop; i++){ + tca.increment(i); + } + } + } + { + while(cov.size()<=ss.chrom){ + cov.add(new CoverageArray2(cov.size(), Data.chromLengths[cov.size()])); + } + CoverageArray ca=cov.get(ss.chrom); + for(int i=ss.start; i<=ss.stop; i++){ + ca.increment(i); + } + } + } + + SiteScoreR ssr=new SiteScoreR(ss, r.bases.length, r.numericID, (byte)r.pairnum()); + + if(original!=null){ + ssr.correct=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, 40, false); + } + + g.write(ssr); + } +// System.out.println(sitesProcessed); + } + } + + if(r.mate!=null){ + Read r2=r.mate; + if(r2.sites!=null){ + + SiteScore original=r2.originalSite; + for(SiteScore ss : r2.sites){ + 
sitesProcessed++; + + { + boolean b=false; + if(ss.semiperfect){ + b=true; + }else{//Check for no-refs + int len=ss.stop-ss.start+1; + if(len==r2.bases.length && ss.slowScore>=0.5f*MultiStateAligner9PacBio.POINTS_MATCH2){ + b=checkPerfection(ss.start, ss.stop, r2.bases, Data.getChromosome(ss.chrom), ss.strand==Gene.MINUS, 0.5f); + } + } + if(b){ + while(pcov.size()<=ss.chrom){ + pcov.add(new CoverageArray2(pcov.size(), Data.chromLengths[pcov.size()])); + truePcov.add(new CoverageArray2(truePcov.size(), Data.chromLengths[truePcov.size()])); + } + CoverageArray ca=pcov.get(ss.chrom); + CoverageArray tca=truePcov.get(ss.chrom); + for(int i=ss.start+PCOV_TIP_DIST; i<=ss.stop-PCOV_TIP_DIST; i++){ + ca.increment(i); + } + if(ss.perfect){ + for(int i=ss.start; i<=ss.stop; i++){ + tca.increment(i); + } + } + } + { + while(cov.size()<=ss.chrom){ + cov.add(new CoverageArray2(cov.size(), Data.chromLengths[cov.size()])); + } + CoverageArray ca=cov.get(ss.chrom); + for(int i=ss.start; i<=ss.stop; i++){ + ca.increment(i); + } + } + } + + SiteScoreR ssr=new SiteScoreR(ss, r2.bases.length, r2.numericID, (byte)r2.pairnum()); + + if(original!=null){ + ssr.correct=isCorrectHitLoose(ss, original.chrom, original.strand, original.start, original.stop, 40, false); + } + + g.write(ssr); + } + } + } + +// System.out.println(r.toString()); +// assert(r.list!=null); +// assert(r.list.size()>0); + + } + //System.err.println("returning list"); + cris.returnList(ln, ln.list.isEmpty()); + //System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + System.out.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + System.out.println("Returned list"); + ReadWrite.closeStream(cris); + System.out.println("Closed stream"); + System.out.println("Processed "+readsProcessed+" reads."); + System.out.println("Processed "+sitesProcessed+" sites."); + } + + + for(int i=1; i pcov, ArrayList truePcov, ArrayList cov){ + + + final TextStreamWriter out=new TextStreamWriter(outname, true, false, false); + out.start(); + ArrayList keys=new ArrayList(g.wmap.size()); + keys.addAll(g.wmap.keySet()); + Collections.sort(keys); + for(Long k : keys){ + TextStreamWriter tsw=g.wmap.get(k); + tsw.poison(); + } + + + + int chrom=0; + int loc=INTERVAL; + String tab=""; + StringBuilder sb=new StringBuilder(4000); + + for(Long k : keys){ + TextStreamWriter tsw=g.wmap.get(k); + String fname=Glob.fname(k, g.tempname); + for(int i=0; i<50 && tsw.isAlive(); i++){ + try { + tsw.join(20000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + if(tsw.isAlive()){ + System.err.println("Waiting for tsw "+tsw.fname+" to finish..."); + } + } + if(tsw.isAlive()){ + System.err.println(tsw.getClass().getName()+" for "+fname+" refused to die after a long time."); + assert(false); + } + + TextFile tf=new TextFile(fname, false, false); + ArrayList list=new ArrayList(1000); + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + SiteScoreR ssr=SiteScoreR.fromText(s); + + assert(pcov.size()>=ssr.chrom) : ssr.chrom+", "+pcov.size()+", "+truePcov.size()+", "+cov.size(); + final int c=ssr.chrom; + boolean retain=retainSite(ssr, (pcov.size()>c ? pcov.get(c) : FAKE), (truePcov.size()>c ? truePcov.get(c) : FAKE), (cov.size()>c ? 
cov.get(c) : null)); + if(retain){ + list.add(ssr); + sitesOut++; + } + } + tf.close(); + if(DELETE_TEMP){ + new File(fname).delete(); + } + Collections.sort(list, SiteScoreR.PCOMP); + + final int lim=list.size(); + for(int i=0; ichrom || ssr.start>=loc){ + if(sb.length()>0){//Purge to disk + sb.append('\n'); + out.print(sb.toString()); + sb.setLength(0); + } + chrom=ssr.chrom; + loc=ssr.start; + loc=(loc-(loc%INTERVAL))+INTERVAL; + assert(loc>ssr.start); + assert(loc-ssr.start<=INTERVAL); + assert(loc%INTERVAL==0); + tab=""; + } + sb.append(tab); + sb.append(ssr.toText()); + tab="\t"; + } + + } + + + sb.append('\n'); + out.print(sb.toString()); + out.poisonAndWait(); + } + + private static boolean retainSite(SiteScoreR ssr, CoverageArray pcov, CoverageArray tpcov, CoverageArray cov){ + if(ssr.semiperfect && !ssr.perfect){return true;} //For tip extension + assert(cov!=null && cov!=FAKE) : (cov==FAKE)+", "+ssr.chrom; + + if(!ssr.semiperfect){ //Typical flawed read + assert(!ssr.perfect); + boolean toss=true; + if(pcov==null || tpcov==null){ + toss=false; + }else{ + for(int j=ssr.start-PCOV_TIP_DIST; toss && j<=ssr.stop+PCOV_TIP_DIST; j++){ + toss=(pcov.get(j)>=MIN_PCOV_TO_TOSS && tpcov.get(j)>=MIN_PCOV_TO_TOSS); + } + } + if(toss){ + for(int j=ssr.start; j<=ssr.stop; j++){cov.increment(j, -1);} + return false; + } + } + + boolean alwaysLowCov=true; + boolean alwaysTooPerfect=true; + boolean onlyPerfect=true; + + for(int j=ssr.start; (alwaysLowCov || alwaysTooPerfect || onlyPerfect) && j<=ssr.stop; j++){ + int c=cov.get(j); + int tp=tpcov.get(j); + + alwaysLowCov=alwaysLowCov && c0; + } + + if(alwaysLowCov || (alwaysTooPerfect && !ssr.semiperfect) || onlyPerfect){ + if(!ssr.semiperfect){ + for(int j=ssr.start; j<=ssr.stop; j++){cov.increment(j, -1);} + } + return false; + } + + return true; + } + + private static boolean checkPerfection(int start, int stop, byte[] bases, ChromosomeArray cha, boolean rcomp, float f) { + + int noref=0; + if(rcomp){ + for(int i=0; 
i=f*bases.length; + } + + public static boolean isCorrectHitLoose(SiteScore ss, int trueChrom, byte trueStrand, int trueStart, int trueStop, int thresh, boolean useChrom){ + if((useChrom && ss.chrom!=trueChrom) || ss.strand!=trueStrand){return false;} + + assert(ss.stop>ss.start) : ss.toText()+", "+trueStart+", "+trueStop; + assert(trueStop>trueStart) : ss.toText()+", "+trueStart+", "+trueStop; + + return (Tools.absdif(ss.start, trueStart)<=thresh || Tools.absdif(ss.stop, trueStop)<=thresh); + } + + private static class Glob{ + + public Glob(String tempPattern_){ + tempname=(tempPattern_ == null ? DEFAULT_TEMP_PATTERN : tempPattern_); + } + + public void write(SiteScoreR ssr){ + long key=key(ssr.chrom, ssr.start); + TextStreamWriter tsw=wmap.get(key); + if(tsw==null){ + String fname=fname(key, tempname); + tsw=new TextStreamWriter(fname, true, false, false); + tsw.start(); + wmap.put(key, tsw); + } + tsw.print(ssr.toText().append('\n')); + } + + protected static final long key(int chrom, int start){ + long k=((long)chrom<<32)+(Tools.max(start, 0))/BLOCKSIZE; + return k; + } + + protected static final String fname(long key, String outname){ + if(outname==null){outname=DEFAULT_TEMP_PATTERN;} + assert(outname.contains("#")) : outname; + return outname.replace("#", "b"+Data.GENOME_BUILD+"_"+key); + } + + final HashMap wmap=new HashMap(); + final String tempname; + + } + + /** Sites will be written to files, each containing an index range of this size. + * Larger means fewer files, but more memory used when reading the files (at a later stage). + */ + public static int BLOCKSIZE=8000000; + + /** Sites are grouped into intervals (by start location) and treated as an array of arrays. + * All sites in an interval are printed as one line of text. 
*/ + public static final int INTERVAL=200; + public static long readsProcessed=0; + public static long sitesProcessed=0; + public static long sitesOut=0; + public static boolean DELETE_TEMP=true; + public static final String DEFAULT_TEMP_PATTERN="StackSites2TempFile_#.txt.gz"; + /** Start incrementing coverage this far in from the site tips. */ + public static int PCOV_TIP_DIST=6; + + /** Toss sites from areas with less than this coverage, since they can't be used to call vars */ + public static int MIN_COV_TO_RETAIN=2; + /** Toss sites from areas with less than this coverage, since they can't be used to call vars */ + public static int MIN_PCOV_TO_TOSS=3; + + private static final CoverageArray FAKE=new CoverageArray2(1000); +} diff --git a/current/stream/ByteBuilder.java b/current/stream/ByteBuilder.java new file mode 100755 index 0000000..7e695ed --- /dev/null +++ b/current/stream/ByteBuilder.java @@ -0,0 +1,304 @@ +package stream; + +import java.util.Arrays; + +import align2.Tools; + +/** + * @author Brian Bushnell + * @date Oct 8, 2013 + * + */ +public final class ByteBuilder { + + public static void main(String[] args){ + StringBuilder sb=new StringBuilder(); + } + + public ByteBuilder(){ + array=new byte[32]; + } + + public ByteBuilder(int initial){ + assert(initial>=1); + array=new byte[initial]; + } + + public ByteBuilder(Object o){ + String s=o.toString(); + array=new byte[s.length()+1]; + append(s); + } + + + public ByteBuilder append(float x, int places){return append(String.format("%."+places+"f", x));} + public ByteBuilder append(double x, int places){return append(String.format("%."+places+"f", x));} + + public ByteBuilder append(float x){return append(Float.toString(x));} + public ByteBuilder append(double x){return append(Double.toString(x));} + public ByteBuilder append(boolean x){return append(x ? 
tbool : fbool);} + + + public ByteBuilder append(char x){ + if(length>=array.length){expand();} + array[length]=(byte)x; + length++; + return this; + } + public ByteBuilder append(byte x){ + if(length>=array.length){expand();} + array[length]=x; + length++; + return this; + } + + public ByteBuilder append(int x){ + expand(11); + if(x<0){ + if(x==Integer.MIN_VALUE){ + return append(Integer.toString(Integer.MIN_VALUE)); + }else{ + array[length]='-'; + length++; + x=-x; + } + }else if(x==0){ + array[length]='0'; + length++; + return this; + } + +// final int len=lengthOf(x); +// int pos=length+len-1; +// while(x>9){ +// int y=x%100; +// x=x/100; +// array[pos]=ones100[y]; +// pos--; +// array[pos]=tens100[y]; +// pos--; +// } +// while(x>0){ +// int y=x%10; +// x=x/10; +// array[pos]=numbers[y]; +// pos--; +// } +// length+=len; + +// final int initial=length; +// while(x>9){ +// int y=x%100; +// x=x/100; +// array[length]=tens100[y]; +// length--; +// array[length]=ones100[y]; +// length--; +// } +// while(x>0){ +// int y=x%10; +// x=x/10; +// array[length]=numbers[y]; +// length++; +// } +// +// for(int i=initial, j=length-1; i9){ + int y=x%100; + x=x/100; + numbuffer[pos]=ones100[y]; + pos++; + numbuffer[pos]=tens100[y]; + pos++; + } + while(x>0){ + int y=x%10; + x=x/10; + numbuffer[pos]=ones100[y]; + pos++; + } + + while(pos>0){ + pos--; + array[length]=numbuffer[pos]; + length++; + } + + return this; + } + + public ByteBuilder append(long x){ + if(x>Integer.MIN_VALUE && x<=Integer.MAX_VALUE){return append((int)x);} + expand(20); + if(x<0){ + if(x==Integer.MIN_VALUE){ + return append((long)x); + }else{ + array[length]='-'; + length++; + x=-x; + } + }else if(x==0){ + array[length]='0'; + length++; + return this; + } + +// final int len=lengthOf(x); +// int pos=length+len-1; +// while(x>9){ +// int y=(int)(x%100); +// x=x/100; +// array[pos]=ones100[y]; +// pos--; +// array[pos]=tens100[y]; +// pos--; +// } +// while(x>0){ +// int y=(int)(x%10); +// x=x/10; +// 
array[pos]=numbers[y]; +// pos--; +// } +// length+=len; + + int pos=0; + while(x>9){ + int y=(int)(x%100); + x=x/100; + numbuffer[pos]=ones100[y]; + pos++; + numbuffer[pos]=tens100[y]; + pos++; + } + while(x>0){ + int y=(int)(x%10); + x=x/10; + numbuffer[pos]=ones100[y]; + pos++; + } + + while(pos>0){ + pos--; + array[length]=numbuffer[pos]; + length++; + } + + return this; + } + + public ByteBuilder append(String x){ + if(x==null){return append(nullBytes);} + expand(x.length()); + for(int i=0; i=x; + } + + private final void expand(){ + int x=Tools.min(Integer.MAX_VALUE, array.length*2); + array=Arrays.copyOf(array, x); + } + + private final void expand(int extra){ + long x=array.length; + while(x-length=0 && x<=array.length); + length=x; + } + + public static final byte[] numbers=new byte[] {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9'}; + public static final byte[] nullBytes="null".getBytes(); + public static final byte[] fbool="false".getBytes(); + public static final byte[] tbool="true".getBytes(); + public byte[] array; + public int length=0; + private final byte[] numbuffer=new byte[19]; + + public static final byte[] ones100, tens100; + + static{ + ones100=new byte[100]; + tens100=new byte[100]; + for(int i=0; i<100; i++){ + ones100[i]=(byte)('0'+i%10); + tens100[i]=(byte)('0'+i/10); + } + } + +} diff --git a/current/stream/ConcurrentCollectionReadInputStream.java b/current/stream/ConcurrentCollectionReadInputStream.java new file mode 100755 index 0000000..ae145fd --- /dev/null +++ b/current/stream/ConcurrentCollectionReadInputStream.java @@ -0,0 +1,285 @@ +package stream; + +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import align2.ListNum; +import align2.Shared; + +import dna.Data; + +public class ConcurrentCollectionReadInputStream implements ConcurrentReadStreamInterface { + + public ConcurrentCollectionReadInputStream(List source1, List source2, long maxReadsToGenerate){ + assert(source1!=source2); 
+ producer1=source1; + depot=new ConcurrentDepot(BUF_LEN, NUM_BUFFS); + producer2=source2; + maxReads=maxReadsToGenerate>=0 ? maxReadsToGenerate : Long.MAX_VALUE; + if(maxReads==0){ + System.err.println("Warning - created a read stream for 0 reads."); + assert(false); + } + + } + + public synchronized ListNum nextList() { + ArrayList list=null; + if(verbose){System.err.println("**************** nextList() was called; shutdown="+shutdown+", depot.full="+depot.full.size());} + while(list==null){ + if(shutdown){ + if(verbose){System.err.println("**************** nextList() returning null; shutdown="+shutdown+", depot.full="+depot.full.size());} + return null; + } + try { + list=depot.full.take(); + assert(list!=null); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + if(verbose){System.err.println("**************** nextList() returning list of size "+list.size()+"; shutdown="+shutdown+", depot.full="+depot.full.size());} + ListNum ln=new ListNum(list, listnum); + listnum++; + return ln; + } + + public void returnList(ListNum ln, boolean poison){ + if(ln!=null){ + ln.list.clear(); + }else{ + System.err.println("Warning, null list returned: "); //System.err.println("Warning from class "+getClass().getName()+", null list returned: "); + new Exception().printStackTrace(); + } + if(poison){ + if(verbose){System.err.println("A: Adding empty list to full.");} + depot.full.add(ln==null ? new ArrayList(0) : ln.list); + }else{ + if(ln!=null){depot.empty.add(ln.list);} +// depot.empty.add(ln==null ? 
new ArrayList(0) : ln.list); + } + } + + @Override + public void run() { +// producer.start(); + if(verbose){System.err.println("cris started.");} + threads=new Thread[] {Thread.currentThread()}; + +// readLists(); + readSingles(); + + addPoison(); + + //End thread + + while(!depot.empty.isEmpty() && !shutdown){ +// System.out.println("Ending"); + if(verbose){System.err.println("B: Adding empty lists to full.");} + depot.full.add(depot.empty.poll()); + } +// System.err.println("cris thread terminated. Final depot size: "+depot.full.size()+", "+depot.empty.size()); + } + + private final void addPoison(){ + //System.err.println("Adding poison."); + //Add poison pills + if(verbose){System.err.println("C: Adding poison to full.");} + depot.full.add(new ArrayList()); + for(int i=1; i list=null; + while(list==null){ + try { + list=depot.empty.poll(1000, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // TODO Auto-generated catch block +// System.err.println("Do not be alarmed by the following error message:"); +// e.printStackTrace(); + if(shutdown){ + i=depot.bufferCount; + break; + } + } + } + if(list!=null){ + if(verbose){System.err.println("D: Adding list("+list.size()+") to full.");} + depot.full.add(list); + } + } + //System.err.println("Added poison."); + } + + private final void readSingles(){ + + for(int i=0; !shutdown && i list=null; + while(list==null){ + try { + list=depot.empty.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + if(shutdown){break;} + } + } + if(shutdown || list==null){break;} + + long bases=0; + final long lim=producer1.size(); + while(list.size()(BUF_LEN, NUM_BUFFS); + generated=0; + nextProgress=PROGRESS_INCR; + } + + @Override + public synchronized void close(){ + shutdown(); +// producer1.close(); +// if(producer2!=null){producer2.close();} +// System.out.println("A"); + if(threads!=null && threads[0]!=null && threads[0].isAlive()){ + + while(threads[0].isAlive()){ 
+// System.out.println("B"); + ArrayList list=null; + for(int i=0; i<1000 && list==null && threads[0].isAlive(); i++){ + try { + list=depot.full.poll(200, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + System.err.println("Do not be alarmed by the following error message:"); + e.printStackTrace(); + break; + } + } + + if(list!=null){ + list.clear(); + depot.empty.add(list); + } + + +// System.out.println("isAlive? "+threads[0].isAlive()); + } + + } + + if(threads!=null){ + for(int i=1; i=nextProgress){ + Data.sysout.print('.'); + nextProgress+=PROGRESS_INCR; + } + } + + @Override + public void setSampleRate(float rate, long seed){ + samplerate=rate; + if(rate>=1f){ + randy=null; + }else if(seed>-1){ + randy=new java.util.Random(seed); + }else{ + randy=new java.util.Random(); + } + } + + @Override + public boolean errorState(){return errorState;} + /** TODO */ + private boolean errorState=false; + + private float samplerate=1f; + private java.util.Random randy=null; + + private Thread[] threads; + + public Object[] producers(){return new Object[] {producer1, producer2};} + + public final List producer1; + public final List producer2; + private ConcurrentDepot depot; + + private long maxReads; + private long generated=0; + private long listnum=0; + private long nextProgress=PROGRESS_INCR; + + private final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + private final int NUM_BUFFS=Shared.READ_BUFFER_NUM_BUFFERS; + private final long MAX_DATA=Shared.READ_BUFFER_MAX_DATA; + + public static boolean verbose=false; + + private static final ArrayList poison=new ArrayList(0); + + public static boolean SHOW_PROGRESS=false; + public static long PROGRESS_INCR=1000000; + +} diff --git a/current/stream/ConcurrentDepot.java b/current/stream/ConcurrentDepot.java new file mode 100755 index 0000000..6aa2b33 --- /dev/null +++ b/current/stream/ConcurrentDepot.java @@ -0,0 +1,35 @@ +package stream; + +import java.util.ArrayList; +import 
java.util.concurrent.ArrayBlockingQueue; + +public class ConcurrentDepot { + + + + public ConcurrentDepot(int bufSize, int numBufs){ + bufferSize=bufSize; + bufferCount=numBufs; + + lists=new ArrayList[numBufs]; + empty=new ArrayBlockingQueue>(numBufs+1); + full=new ArrayBlockingQueue>(numBufs+1); + + for(int i=0; i(bufSize); + empty.add(lists[i]); + } + + } + + + public final ArrayBlockingQueue> empty; + public final ArrayBlockingQueue> full; + + public final int bufferSize; + public final int bufferCount; + + + private final ArrayList[] lists; + +} diff --git a/current/stream/ConcurrentGenericReadInputStream.java b/current/stream/ConcurrentGenericReadInputStream.java new file mode 100755 index 0000000..c5b4a24 --- /dev/null +++ b/current/stream/ConcurrentGenericReadInputStream.java @@ -0,0 +1,870 @@ +package stream; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.concurrent.ArrayBlockingQueue; +import java.util.concurrent.TimeUnit; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; + +import dna.Data; +import dna.Timer; + +import fileIO.ReadWrite; +import fileIO.FileFormat; + +public class ConcurrentGenericReadInputStream implements ConcurrentReadStreamInterface { + + public static void main(String[] args){ + String in1=args[0]; + String in2=(args.length<2 || args[1].equalsIgnoreCase("null") || args[1].contains("=") ? null : args[1]); + if(in2!=null){ + assert(!in1.equalsIgnoreCase(in2)); + FASTQ.TEST_INTERLEAVED=false; + }else{ + FASTQ.TEST_INTERLEAVED=true; + FASTQ.FORCE_INTERLEAVED=true; + } + + long maxReads=-1; + for(int i=1; i1 ? 
split[1] : "true"); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("null") || (split.length==1 && i==1)){ + // do nothing + }else if(a.equals("reads") || a.startsWith("maxreads")){ + maxReads=Long.parseLong(b); + }else if(a.equals("ziplevel") || a.equals("zl")){ + ReadWrite.ZIPLEVEL=Integer.parseInt(b); + }else if(a.startsWith("fastareadlen")){ + FastaReadInputStream.TARGET_READ_LEN=Integer.parseInt(b); + FastaReadInputStream.SPLIT_READS=(FastaReadInputStream.TARGET_READ_LEN>0); + }else if(a.startsWith("fastaminread") || a.startsWith("fastaminlen")){ + FastaReadInputStream.MIN_READ_LEN=Integer.parseInt(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + assert(FastaReadInputStream.settingsOK()); + Timer t=new Timer(); + t.start(); + + ConcurrentReadStreamInterface cris=getReadInputStream(maxReads, false, false, true, in1, in2); + System.out.println("Fetched "+cris.getClass().getName()); + { + Object[] p=cris.producers(); +// while(p[0]==null){ +// p=cris.producers(); +// } + System.out.print("Producers: "); + String comma=""; + for(Object o : p){ + System.out.print(comma+(o==null ? "null" : o.getClass().getName())); + comma=", "; + } + System.out.println(); + } + boolean paired=cris.paired(); + System.out.println("paired="+paired); + Thread cristhread=new Thread(cris); + cristhread.start(); + + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + if(reads!=null && !reads.isEmpty()){ + Read r=reads.get(0); + assert((r.mate!=null)==paired); + } + + long readCount=0; + long baseCount=0; + + while(reads!=null && reads.size()>0){ + + for(Read r : reads){ + Read r2=r.mate; + if(r!=null){ + readCount++; + if(r.bases!=null){ + baseCount+=r.bases.length; + } + } + if(r2!=null){ + readCount++; + if(r2.bases!=null){ + baseCount+=r2.bases.length; + } + } + } + cris.returnList(ln, ln.list.isEmpty()); +// System.err.println("fetching list"); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); +// System.out.println("reads: "+(reads==null ? "null" : reads.size())); + } + System.err.println("Finished reading"); + cris.returnList(ln, ln.list.isEmpty()); + + cris.close(); + t.stop(); + + System.out.println("Reads: \t"+readCount); + System.out.println("Bases: \t"+baseCount); + System.out.println("Avg Length: \t"+String.format("%.2f",baseCount*1.0/readCount)); + System.out.println("Time: \t"+t); + } + + public ConcurrentGenericReadInputStream(ReadInputStream source1, ReadInputStream source2, long maxReadsToGenerate){ + assert(source1!=source2); + producer1=source1; + depot=new ConcurrentDepot(BUF_LEN, NUM_BUFFS); + producer2=source2; + assert(source2==null || !FASTQ.FORCE_INTERLEAVED) : "Please do not set 'interleaved=true' with dual input files."; + maxReads=maxReadsToGenerate>=0 ? 
maxReadsToGenerate : Long.MAX_VALUE; + if(maxReads==0){ + System.err.println("Warning - created a read stream for 0 reads."); + assert(false); + } +// if(maxReads>(4);} + if(producer2!=null){p2q=new ArrayBlockingQueue>(4);} + } + + public synchronized ListNum nextList() { + ArrayList list=null; + if(verbose){System.err.println("**************** nextList() was called; shutdown="+shutdown+", depot.full="+depot.full.size());} + while(list==null){ + if(shutdown){ + if(verbose){System.err.println("**************** nextList() returning null; shutdown="+shutdown+", depot.full="+depot.full.size());} + return null; + } + try { + list=depot.full.take(); + assert(list!=null); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + if(verbose){System.err.println("**************** nextList() returning list of size "+list.size()+"; shutdown="+shutdown+", depot.full="+depot.full.size());} + ListNum ln=new ListNum(list, listnum); + listnum++; + return ln; + } + + public void returnList(ListNum ln, boolean poison){ + if(ln!=null){ + ln.list.clear(); + }else{ + System.err.println("Warning, null list returned: "); //System.err.println("Warning from class "+getClass().getName()+", null list returned: "); + new Exception().printStackTrace(); + } + if(poison){ + if(verbose){System.err.println("A: Adding empty list to full.");} + depot.full.add(ln==null ? new ArrayList(0) : ln.list); + }else{ + if(ln!=null){depot.empty.add(ln.list);} +// depot.empty.add(ln==null ? new ArrayList(0) : ln.list); + } + } + + @Override + public void run() { +// producer.start(); + synchronized(running){ + assert(!running[0]) : "This cris was started by multiple threads."; + running[0]=true; + } + + ReadThread rt1=null; + ReadThread rt2=null; + if(producer1.preferLists() || producer1.preferBlocks()){ + rt1=new ReadThread(producer1, p1q); + rt2=(producer2==null ? 
null : new ReadThread(producer2, p2q)); + rt1.start(); + if(rt2!=null){rt2.start();} + } + + threads=(rt1==null ? new Thread[] {Thread.currentThread()} : + rt2==null ? new Thread[] {Thread.currentThread(), rt1} : + new Thread[] {Thread.currentThread(), rt1, rt2}); + + if(producer1.preferLists() || producer1.preferBlocks()){ + readLists(); + //System.err.println("Done reading lists."); + }else if(producer1.preferBlocks()){ + assert(false); +// readBlocks(); + }else{ + readSingles(); + } + + addPoison(); + + //End thread + + if(verbose){System.err.println("cris finished addPoison.");} + while(!depot.empty.isEmpty() && !shutdown){ +// System.out.println("Ending"); + if(verbose){System.err.println("B: Adding empty lists to full.");} + depot.full.add(depot.empty.poll()); + } + if(verbose){System.err.println("cris thread syncing before shutdown.");} + + synchronized(running){//TODO Note: for some reason syncing on 'this' instead of 'running' causes a hang. Something else must be syncing improperly on this. + assert(running[0]); + running[0]=false; + } + if(verbose){System.err.println("cris thread terminated. 
Final depot size: "+depot.full.size()+", "+depot.empty.size());} + } + + private final void addPoison(){ + //System.err.println("Adding poison."); + //Add poison pills + if(verbose){System.err.println("C: Adding poison to full.");} + depot.full.add(new ArrayList()); + for(int i=1; i list=null; + while(list==null){ + try { + list=depot.empty.poll(1000, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // TODO Auto-generated catch block +// System.err.println("Do not be alarmed by the following error message:"); +// e.printStackTrace(); + if(shutdown){ + i=depot.bufferCount; + break; + } + } + } + if(list!=null){ + if(verbose){System.err.println("D: Adding list("+list.size()+") to full.");} + depot.full.add(list); + } + } + if(verbose){System.err.println("Added poison.");} + } + + private final void readSingles(){ + + while(!shutdown && producer1.hasMore() && generated list=null; + while(list==null){ + try { + list=depot.empty.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + if(shutdown){break;} + } + } + if(shutdown || list==null){break;} + + long bases=0; + while(list.size() buffer1=null; + ArrayList buffer2=null; + ArrayList list=null; + int next=0; + +// System.out.println("a"); + if(verbose){System.err.println(getClass().getName()+" entering read lists loop.");} + while(buffer1!=poison && (buffer1!=null || (!shutdown && generated0){ +// if(verbose){System.err.println("G: Adding list("+list.size()+") to full.");} +// depot.full.add(list); +// list=null; +// } + if(verbose){System.err.println("Breaking because buffer1==null: "+(buffer1==null)+" || buffer1==poison: "+(buffer1==poison)+" || shutdown: "+shutdown);} + break; + } + assert(buffer1.size()<=BUF_LEN); //Although this is not really necessary. 
+ +// assert(!set2.contains(buffer1)) : buffer1.hashCode(); +// set2.add(buffer1); +// System.out.println(buffer1.hashCode()); + + if(buffer2!=null){ +// System.out.println("h"); + + if(buffer2!=null && (buffer1==null || buffer2.size()!=buffer1.size())){ + System.err.println("Error: Misaligned read streams."); + errorState=true; + return; + } + assert(buffer2==null || buffer2.size()==buffer1.size()); + } +// System.out.println("i"); + if(buffer1.size()<=(BUF_LEN-list.size()) && (buffer1.size()+generated)=buffer1.size()){ + buffer1=null; + buffer2=null; + next=0; +// System.out.println("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"); + }else{ +// System.out.println("------------------------------------------------"); + } +// System.out.println("m"); + } + if(verbose){System.err.println("Loop end: list.size()="+(list.size()+", depot.bufferSize="+depot.bufferSize+", generated="+generated));} +// System.out.println("n"); + if(verbose){System.err.println(Thread.currentThread().getName());} + } + +// System.out.println("p"); +// System.err.println("Adding list to full depot. 
Shutdown="+shutdown); + if(verbose){System.err.println("F: Adding list("+list.size()+") to full.");} + depot.full.add(list); +// System.err.println("Added."); + +// System.out.println("o"); + if(buffer1==poison){ + if(verbose){System.err.println("Detected poison from buffer1.");} + break; + } + list=null; + if(verbose){System.err.println("Finished loop iteration.\n");} + if(verbose){System.err.println("loop end: buffer1==null "+(buffer1==null)+", buffer1==poison "+(buffer1==poison) + +", shutdown="+shutdown+", generated buffer1, ArrayList buffer2){ + for(int i=0; i buffer1, ArrayList buffer2){ + int removed=0; + if(buffer2==null){ + for(int i=0; i0){ + Tools.condenseStrict(buffer1); + if(buffer2!=null){Tools.condenseStrict(buffer2);} + } + return removed; + } + + private boolean shutdown=false; + + @Override + public void shutdown(){ +// System.err.println("Called shutdown."); + shutdown=true; + if(!shutdown){ + for(Thread t : threads){ + if(t!=null && t.isAlive()){ + t.interrupt(); + } + } + } + } + + @Override + public synchronized void restart(){ + shutdown=false; + p1q.clear(); + if(p2q!=null){p2q.clear();} + producer1.restart(); + if(producer2!=null){producer2.restart();} + depot=new ConcurrentDepot(BUF_LEN, NUM_BUFFS); + generated=0; + nextProgress=PROGRESS_INCR; + } + + @Override + public synchronized void close(){ + if(verbose){System.err.println("Called shutdown for "+producer1+"; "+threads[0].getState());} +// if(verbose){System.err.println(((FastqReadInputStream)producer1).tf.isOpen());} + shutdown(); + errorState|=producer1.close(); + if(producer2!=null){errorState|=producer2.close();} + if(threads!=null && threads[0]!=null && threads[0].isAlive()){ + + while(threads[0].isAlive()){ +// System.out.println("B"); + ArrayList list=null; + for(int i=0; i<1000 && list==null && threads[0].isAlive(); i++){ + try { + list=depot.full.poll(200, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + 
System.err.println("Do not be alarmed by the following error message:"); + e.printStackTrace(); + break; + } + } + + if(list!=null){ + list.clear(); + depot.empty.add(list); + } + } + + } + + if(threads!=null){ + for(int i=1; i1){in2=args[1];} + if(args.length>2){qf1=args[2];} + if(args.length>3){qf2=args[3];} + + final FileFormat ff1=FileFormat.testInput(in1, null, allowSubprocess); + final FileFormat ff2=FileFormat.testInput(in2, null, allowSubprocess); + + if(verbose){ + System.err.println("getReadInputStream("+maxReads+", "+colorspace+", "+keepSamHeader+", "+allowSubprocess+", "+in1+", "+in2+", "+qf1+", "+qf2+")"); + } + + return getReadInputStream(maxReads, colorspace, keepSamHeader, ff1, ff2, qf1, qf2); + } + + public static ConcurrentReadStreamInterface getReadInputStream(long maxReads, boolean colorspace, boolean keepSamHeader, FileFormat ff1, FileFormat ff2){ + return getReadInputStream(maxReads, colorspace, keepSamHeader, ff1, ff2, (String)null, (String)null); + } + + public static ConcurrentReadStreamInterface getReadInputStream(long maxReads, boolean colorspace, boolean keepSamHeader, FileFormat ff1, FileFormat ff2, String qf1, String qf2){ + + if(verbose){ + System.err.println("getReadInputStream("+maxReads+", "+colorspace+", "+keepSamHeader+", "+ff1+", "+ff2+", "+qf1+", "+qf2+")"); + } + + assert(ff1!=null); + assert(ff2==null || ff1.name()==null || !ff1.name().equalsIgnoreCase(ff2.name())); + assert(qf1==null || ff1.name()==null || !ff1.name().equalsIgnoreCase(qf2)); + assert(qf1==null || qf2==null || qf1.equalsIgnoreCase(qf2)); + + final ConcurrentReadStreamInterface cris; + + if(ff1.fastq()){ + + ReadInputStream ris1=new FastqReadInputStream(ff1, colorspace); + ReadInputStream ris2=(ff2==null ? null : new FastqReadInputStream(ff2, colorspace)); + cris=new ConcurrentGenericReadInputStream(ris1, ris2, maxReads); + + }else if(ff1.fasta()){ + + ReadInputStream ris1=(qf1==null ? 
new FastaReadInputStream(ff1, colorspace, (FASTQ.FORCE_INTERLEAVED && ff2==null), ff2==null ? Shared.READ_BUFFER_MAX_DATA : -1) + : new FastaQualReadInputStream(ff1, qf1, colorspace)); + ReadInputStream ris2=(ff2==null ? null : qf2==null ? new FastaReadInputStream(ff2, colorspace, false, -1) : new FastaQualReadInputStream(ff2, qf2, colorspace)); + cris=new ConcurrentGenericReadInputStream(ris1, ris2, maxReads); + + }else if(ff1.scarf()){ + + ReadInputStream ris1=new ScarfReadInputStream(ff1, colorspace); + ReadInputStream ris2=(ff2==null ? null : new ScarfReadInputStream(ff2, colorspace)); + cris=new ConcurrentGenericReadInputStream(ris1, ris2, maxReads); + + }else if(ff1.samOrBam()){ + + ReadInputStream ris1=new SamReadInputStream(ff1, colorspace, keepSamHeader, FASTQ.FORCE_INTERLEAVED); + ReadInputStream ris2=(ff2==null ? null : new SamReadInputStream(ff2, colorspace, false, false)); + cris=new ConcurrentGenericReadInputStream(ris1, ris2, maxReads); + + }else if(ff1.bread()){ + + RTextInputStream rtis=new RTextInputStream(ff1, ff2, maxReads); + cris=new ConcurrentReadInputStream(rtis, maxReads); //TODO: Change to generic + + + }else if(ff1.sequential()){ + SequentialReadInputStream ris=new SequentialReadInputStream(maxReads, 200, 50, 0, false); + cris=new ConcurrentReadInputStream(ris, maxReads); + }else if(ff1.csfasta()){ + + if(ff2!=null){ + cris=new ConcurrentSolidInputStream(ff1, qf1, ff2, qf2, maxReads); + }else{ + cris=new ConcurrentSolidInputStream(ff1, qf1, maxReads, null); + } + }else{ + cris=null; + throw new RuntimeException(""+ff1); + } + + return cris; + } + + + private class ReadThread extends Thread{ + ReadThread(ReadInputStream producer_, ArrayBlockingQueue> pq_){ + producer=producer_; + pq=pq_; + } + + @Override + public void run(){ + readLists(); + } + + private final void readLists(){ + + ArrayList list=null; + + if(verbose){System.err.println(getClass().getName()+" entering read lists loop.");} + while(list!=null || (!shutdown && 
producer.hasMore() && generatedLocal(1)); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + errorState=true; + } + if(verbose){System.err.println(getClass().getName()+" grabbed a list of size "+(list==null ? "null" : list.size()+""));} +// System.out.println("G"); + if(list==null){ +// System.out.println("H"); + if(verbose){System.err.println(getClass().getName()+" broke loop on null list.");} + break; + } + assert(list.size()>0); + assert(list.size()<=BUF_LEN); //Although this is not really necessary. +// System.out.println("I"); + if(list.size()+generatedLocal>maxReads){ +// System.out.println("J"); + if(verbose){System.err.println("Removing extra reads.");} + while(list.size()+generatedLocal>maxReads){list.remove(list.size()-1);} +// System.out.println("K"); + } +// System.out.println("A"); + while(list!=null && !shutdown){ +// System.out.println("B"); + try { + if(verbose){System.err.println("Trying to add list");} + pq.put(list); + generatedLocal+=list.size(); + list=null; + if(verbose){ + System.out.println("Added list; pq.size() = "+pq.size()); + } + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } +// System.out.println("C"); + } +// System.out.println("D"); + if(verbose){System.err.println("looping");} + } + + if(verbose){System.err.println(getClass().getName()+" Finished inner loop iteration.\n");} + } + + + if(verbose){System.err.println(getClass().getName()+" attempting to poison output queue.");} + boolean b=true; + while(b){ + //TODO Note that this could cause a deadlock if there was a premature shutdown, so the consumer died while the queue was full. 
+ try { +// pq.offer(poison, 10000, TimeUnit.SECONDS); + pq.put(poison); + b=false; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + + if(verbose){System.err.println(getClass().getName()+" exited read lists loop: "+(list==null)+", "+shutdown+", "+producer.hasMore()+", "+generatedLocal+", "+maxReads);} + + } + + private final ArrayBlockingQueue> pq; + private final ReadInputStream producer; + private long generatedLocal=0; + } + + private void incrementGenerated(long amt){ + generated+=amt; + if(SHOW_PROGRESS && generated>=nextProgress){ + Data.sysout.print('.'); + nextProgress+=PROGRESS_INCR; + } + } + + @Override + public void setSampleRate(float rate, long seed){ + samplerate=rate; + if(rate>=1f){ + randy=null; + }else if(seed>-1){ + randy=new java.util.Random(seed); + }else{ + randy=new java.util.Random(); + } + } + + @Override + public boolean errorState(){return errorState || + (producer1==null ? false : producer1.errorState()) || (producer2==null ? false : producer2.errorState());} + /** TODO */ + private boolean errorState=false; + + private boolean[] running=new boolean[] {false}; + + private float samplerate=1f; + private java.util.Random randy=null; + + private ArrayBlockingQueue> p1q; + private ArrayBlockingQueue> p2q; + + + public Object[] producers(){return producer2==null ? 
new Object[] {producer1} : new Object[] {producer1, producer2};} + + private Thread[] threads; + + public final ReadInputStream producer1; + public final ReadInputStream producer2; + private ConcurrentDepot depot; + + private long maxReads; + private long generated=0; + private long listnum=0; + private long nextProgress=PROGRESS_INCR; + + private final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + private final int NUM_BUFFS=Shared.READ_BUFFER_NUM_BUFFERS; + private final long MAX_DATA=Shared.READ_BUFFER_MAX_DATA; + + public static boolean verbose=false; + + private static final ArrayList poison=new ArrayList(0); + + public static boolean SHOW_PROGRESS=false; + public static long PROGRESS_INCR=1000000; + public static boolean REMOVE_DISCARDED_READS=false; + +} diff --git a/current/stream/ConcurrentReadInputStream.java b/current/stream/ConcurrentReadInputStream.java new file mode 100755 index 0000000..24aa0f1 --- /dev/null +++ b/current/stream/ConcurrentReadInputStream.java @@ -0,0 +1,305 @@ +package stream; + +import java.util.ArrayList; +import java.util.concurrent.TimeUnit; + +import align2.ListNum; +import align2.Shared; + +public class ConcurrentReadInputStream implements ConcurrentReadStreamInterface { + + public ConcurrentReadInputStream(ReadInputStream source, long maxReadsToGenerate){ + producer=source; + depot=new ConcurrentDepot(BUF_LEN, NUM_BUFFS); + maxReads=maxReadsToGenerate>=0 ? 
maxReadsToGenerate : Long.MAX_VALUE; + if(maxReads==0){ + System.err.println("Warning - created a read stream for 0 reads."); + assert(false); + } +// if(maxReads nextList() { + ArrayList list=null; + while(list==null){ + try { + list=depot.full.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + if(shutdown){return null;} + } + } + ListNum ln=new ListNum(list, listnum); + listnum++; + return ln; + } + + public void returnList(ListNum ln, boolean poison){ + ln.list.clear(); + if(poison){ + depot.full.add(ln.list); + }else{ + depot.empty.add(ln.list); + } + } + + @Override + public void run() { +// producer.start(); + threads=new Thread[] {Thread.currentThread()}; + + if(producer.preferLists()){ + readLists(); + //System.err.println("Done reading lists."); + }else if(producer.preferBlocks()){ + readBlocks(); + }else{ + readSingles(); + } + + addPoison(); + + //End thread + + while(!depot.empty.isEmpty()){ + depot.full.add(depot.empty.poll()); + } +// System.err.println(depot.full.size()+", "+depot.empty.size()); + } + + private final void addPoison(){ + //System.err.println("Adding poison."); + //Add poison pills + depot.full.add(new ArrayList()); + for(int i=1; i list=null; + while(list==null){ + try { + list=depot.empty.poll(1000, TimeUnit.MILLISECONDS); + } catch (InterruptedException e) { + // TODO Auto-generated catch block +// System.err.println("Do not be alarmed by the following error message:"); +// e.printStackTrace(); + if(shutdown){ + i=depot.bufferCount; + break; + } + } + } + if(list!=null){depot.full.add(list);} + } + //System.err.println("Added poison."); + } + + private final void readSingles(){ + + long bases=0; + while(!shutdown && producer.hasMore() && generated list=null; + while(list==null){ + try { + list=depot.empty.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + if(shutdown){break;} + } + } + if(shutdown || list==null){break;} + 
+ for(int i=0; i buffer=null; + ArrayList list=null; + int next=0; + while(buffer!=null || (!shutdown && producer.hasMore() && generated=buffer.size()){ + buffer=producer.nextList(); + next=0; + } + if(buffer==null){break;} + assert(buffer.size()<=BUF_LEN); //Although this is not really necessary. + + if(buffer.size()<=(BUF_LEN-list.size()) && (buffer.size()+generated)1 && (generated%1000000)==0){System.err.println("Generated read #"+generated);} + next++; + } + + if(next>=buffer.size()){ + buffer=null; + next=0; + } + } + } + //System.err.println("Adding list to full depot."); + depot.full.add(list); + //System.err.println("Added."); + list=null; + } + + } + + private final void readBlocks(){ + + Read[] buffer=null; + ArrayList list=null; + int next=0; + while(buffer!=null || (!shutdown && producer.hasMore() && generated=buffer.length){ + buffer=producer.nextBlock(); + next=0; + } + if(buffer==null){break;} + while(next=buffer.length){ + buffer=null; + next=0; + } + } + depot.full.add(list); + list=null; + } + + } + + private boolean shutdown=false; + + @Override + public void shutdown(){ + shutdown=true; + if(threads[0]!=null && threads[0].isAlive()){ + threads[0].interrupt(); + } + } + + @Override + public synchronized void restart(){ + shutdown=false; + producer.restart(); + depot=new ConcurrentDepot(BUF_LEN, NUM_BUFFS); + generated=0; + } + + @Override + public synchronized void close(){ +// System.err.println("Closing cris: "+maxReads+", "+generated); +// if(threads!=null){ +// for(int i=0; i=1f){ + randy=null; + }else if(seed>-1){ + randy=new java.util.Random(seed); + }else{ + randy=new java.util.Random(); + } + } + + @Override + public boolean errorState(){return errorState || (producer!=null && producer.errorState());} + /** TODO */ + private boolean errorState=false; + + private float samplerate=1f; + private java.util.Random randy=null; + + public Object[] producers(){return new Object[] {producer};} + + private Thread[] threads; + + public final 
ReadInputStream producer; + private ConcurrentDepot depot; + private long maxReads; + private long generated=0; + private long listnum=0; + + private final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + private final int NUM_BUFFS=Shared.READ_BUFFER_NUM_BUFFERS; + private final long MAX_DATA=Shared.READ_BUFFER_MAX_DATA; + + +} diff --git a/current/stream/ConcurrentReadListDepot.java b/current/stream/ConcurrentReadListDepot.java new file mode 100755 index 0000000..af84b8a --- /dev/null +++ b/current/stream/ConcurrentReadListDepot.java @@ -0,0 +1,35 @@ +package stream; + +import java.util.ArrayList; +import java.util.concurrent.ArrayBlockingQueue; + +public class ConcurrentReadListDepot { + + + + public ConcurrentReadListDepot(int bufSize, int numBufs){ + bufferSize=bufSize; + bufferCount=numBufs; + + lists=new ArrayList[numBufs]; + empty=new ArrayBlockingQueue>(numBufs+1); + full=new ArrayBlockingQueue>(numBufs+1); + + for(int i=0; i(bufSize); + empty.add(lists[i]); + } + + } + + + public final ArrayBlockingQueue> empty; + public final ArrayBlockingQueue> full; + + public final int bufferSize; + public final int bufferCount; + + + private final ArrayList[] lists; + +} diff --git a/current/stream/ConcurrentReadStreamInterface.java b/current/stream/ConcurrentReadStreamInterface.java new file mode 100755 index 0000000..82bd2dd --- /dev/null +++ b/current/stream/ConcurrentReadStreamInterface.java @@ -0,0 +1,28 @@ +package stream; + +import align2.ListNum; + +public interface ConcurrentReadStreamInterface extends Runnable{ + + public ListNum nextList(); + + public void returnList(ListNum list, boolean poison); + + public void run(); + + public void shutdown(); + + public void restart(); + public void close(); + + /** Returns true for paired-end stream, false for single-end stream */ + public boolean paired(); + + public Object[] producers(); + + /** Return true if this stream has detected an error */ + public boolean errorState(); + + public void setSampleRate(float rate, long 
seed); + +} diff --git a/current/stream/ConcurrentSolidInputStream.java b/current/stream/ConcurrentSolidInputStream.java new file mode 100755 index 0000000..253c38c --- /dev/null +++ b/current/stream/ConcurrentSolidInputStream.java @@ -0,0 +1,457 @@ +package stream; + +import java.util.ArrayList; +import java.util.concurrent.TimeUnit; + +import align2.ListNum; +import align2.Shared; +import align2.Tools; + +import fileIO.FileFormat; + +public class ConcurrentSolidInputStream implements ConcurrentReadStreamInterface { + + public static void main(String[] args){ + ConcurrentSolidInputStream mates=null; + if(args.length>2){ + mates=new ConcurrentSolidInputStream(args[2], args[3], 0, null); + } + + ConcurrentSolidInputStream stream=new ConcurrentSolidInputStream(args[0], args[1], 0, mates); + new Thread(stream).start(); +// ArrayList list=stream.nextList(); +// System.out.println(list.size()); +// for(int i=0; i<30 && i ln=stream.nextList(); + long total=0, bad=0; + while(ln!=null && ln.list!=null && !ln.list.isEmpty()){ + for(int i=0; i100000){break;} + } + + System.out.println("Total = \t"+total+"\nBad = \t"+bad+"\t("+String.format("%.3f", bad*100f/total)+"%)"); + + stream.shutdown(); + try { + Thread.sleep(1500); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + //e.printStackTrace(); + } + for(Thread t : stream.threads){ + System.out.println(t.isAlive()); + } + } + + public ConcurrentSolidInputStream(FileFormat ff1, String qf1, long maxReadsToGenerate, ConcurrentSolidInputStream mateStream_){ + this(ff1.name(), qf1, maxReadsToGenerate, mateStream_); + } + + public ConcurrentSolidInputStream(String fastaName_, String qualName_, long maxReadsToGenerate, + ConcurrentSolidInputStream mateStream_){ + fastaName=fastaName_; + qualName=qualName_; + producerFasta=new FastaStream(fastaName); + producerQual=new QualStream(qualName); + fdepot=new ConcurrentDepot(BUF_LEN, NUMLISTS_RAW); + qdepot=new ConcurrentDepot(BUF_LEN, NUMLISTS_RAW); + rdepot=new 
ConcurrentDepot(BUF_LEN, NUMLISTS_READ); + maxReads=maxReadsToGenerate>0 ? maxReadsToGenerate : Long.MAX_VALUE; + mateStream=mateStream_; + paired=(mateStream!=null); + } + + public ConcurrentSolidInputStream(FileFormat ff1, String qf1, FileFormat ff2, String qf2, long maxReadsToGenerate){ + this(ff1.name(), qf1, ff2==null ? null : ff2.name(), qf2, maxReadsToGenerate); + } + + public ConcurrentSolidInputStream(String fastaName_, String qualName_, String fastaName2_, String qualName2_, + long maxReadsToGenerate){ + fastaName=fastaName_; + qualName=qualName_; + producerFasta=new FastaStream(fastaName); + producerQual=new QualStream(qualName); + fdepot=new ConcurrentDepot(BUF_LEN, NUMLISTS_RAW); + qdepot=new ConcurrentDepot(BUF_LEN, NUMLISTS_RAW); + rdepot=new ConcurrentDepot(BUF_LEN, NUMLISTS_READ); + maxReads=maxReadsToGenerate>0 ? maxReadsToGenerate : Long.MAX_VALUE; + mateStream=(fastaName2_==null ? null : new ConcurrentSolidInputStream(fastaName2_, qualName2_, maxReadsToGenerate, null)); + paired=(mateStream!=null); + } + + /** If running in paired-end mode, attaches mated reads to each other. 
*/ + private ArrayList makeReadList() { + ArrayList rlist=makeReadList2(); + if(mateStream!=null){ + ListNum matesln=mateStream.nextList(); + ArrayList mates=matesln.list; + if(rlist!=null && mates!=null){ + int max=Tools.min(rlist.size(), mates.size()); + for(int i=0; i makeReadList2() { + + ArrayList flist=null; + ArrayList qlist=null; + ArrayList rlist=null; +// System.out.println("Making list"); + while(flist==null){ + try { + flist=fdepot.full.take(); +// System.out.println("Got flist of size "+flist.size()); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + if(shutdown){return null;} + //e.printStackTrace(); + } + } + while(qlist==null){ + try { + qlist=qdepot.full.take(); +// System.out.println("Got qlist of size "+flist.size()); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + if(shutdown){return null;} + //e.printStackTrace(); + } + } + while(rlist==null){ + try { + rlist=rdepot.empty.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + if(shutdown){return null;} + //e.printStackTrace(); + } + } + + assert(flist.size()==qlist.size() || + (maxReads1 && ((generated%1000000)==0) && mateStream==null){System.err.println("Generated read #"+generated);} + } + flist.clear(); + qlist.clear(); + fdepot.empty.add(flist); + qdepot.empty.add(qlist); + + return rlist; + } + + public synchronized ListNum nextList() { + ArrayList rlist=null; + + while(rlist==null){ + try { + //System.err.println((mateStream==null ? 2 : 1)+" Attempting take; depot.size = "+rdepot.full.size()+", "+rdepot.empty.size()); + rlist=rdepot.full.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + //e.printStackTrace(); + } + } + //System.err.println((mateStream==null ? 
2 : 1)+" Took "+rlist.size()); + ListNum ln=new ListNum(rlist, listnum); + listnum++; + return ln; + } + + public void returnList(ListNum ln, boolean poison){ + ln.list.clear(); + if(poison){ + rdepot.full.add(ln.list); + }else{ + rdepot.empty.add(ln.list); + } + } + + @Override + public void run() { + FThread fthread=new FThread(); + QThread qthread=new QThread(); + threads=new Thread[] {new Thread(fthread), new Thread(qthread), Thread.currentThread()}; + threads[0].start(); + threads[1].start(); + + if(mateStream!=null){new Thread(mateStream).start();} + + ArrayList list=makeReadList(); + while(list!=null && !list.isEmpty() && !shutdown){ + //System.err.println((mateStream==null ? 2 : 1)+" Adding list to rdepot size "+rdepot.full.size()+"/"+rdepot.full.remainingCapacity()); + rdepot.full.add(list); + list=makeReadList(); + } + + //System.err.println((mateStream==null ? 2 : 1)+" Exiting main loop."); + if(generated>=maxReads){shutdown();} + //System.err.println((mateStream==null ? 2 : 1)+" Shutdown complete."); + + if(list!=null){ + //System.err.println((mateStream==null ? 2 : 1)+" Attempting to add current list to rdepot.full: "+rdepot.full.size()); + list.clear(); + rdepot.full.add(list); + }else{ + assert(shutdown) : "Null read list encountered for unknown reason."; +// System.err.println("Null read list encountered."); +// shutdown(); + } + + //Add poison pills + //System.err.println((mateStream==null ? 
2 : 1)+" Attempting to add poison list to rdepot.full: "+rdepot.full.size()); + rdepot.full.add(new ArrayList()); + + for(int i=1; i list=null; + while(list==null){ + try { + list=fdepot.empty.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + //e.printStackTrace(); + if(shutdown){return;} + } + } + for(int i=0; i=maxReads){break;} + } +// System.err.println("Shutting down FThread."); + + //Add poison pills + for(int i=1; i list=null; + while(list==null){ + try { + list=fdepot.empty.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + //e.printStackTrace(); + if(shutdown){return;} + } + } + fdepot.full.add(list); + } +// System.err.println("Done shutting down FThread."); + + //End thread + producerFasta.close(); + } + + private long fgenerated=0; + } + + private class QThread implements Runnable{ + + @Override + public void run() { + while(!shutdown && producerQual.hasMore()){ + ArrayList list=null; + while(list==null){ + try { + list=qdepot.empty.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + //e.printStackTrace(); + if(shutdown){return;} + } + } + for(int i=0; i=maxReads){break;} + } +// System.err.println("Shutting down QThread."); + + //Add poison pills + for(int i=1; i list=null; + while(list==null){ + try { + list=qdepot.empty.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + //e.printStackTrace(); + if(shutdown){return;} + } + } + qdepot.full.add(list); + } + producerQual.close(); + //End thread +// System.err.println("Done shutting down QThread."); + } + + private long qgenerated=0; + } + + public void shutdown(){ + synchronized(ShutdownKey){ + if(shutdown){return;} + //System.err.println("Shutting down SCRIS."); + shutdown=true; + ShutdownKey[0]=true; + //System.err.println("A"); + threads[0].interrupt(); + //System.err.println("B"); + threads[1].interrupt(); + //System.err.println("C"); + threads[2].interrupt(); + } + 
if(mateStream!=null){ + mateStream.shutdown(); + } + } + + @Override + public synchronized void restart() { + shutdown=false; + generated=0; + producerFasta=new FastaStream(fastaName); + producerQual=new QualStream(qualName); + fdepot=new ConcurrentDepot(BUF_LEN, NUMLISTS_RAW); + qdepot=new ConcurrentDepot(BUF_LEN, NUMLISTS_RAW); + rdepot=new ConcurrentDepot(BUF_LEN, NUMLISTS_READ); + } + + @Override + public synchronized void close() { + producerFasta.close(); + producerQual.close(); + } + + @Override + public void setSampleRate(float rate, long seed){ + samplerate=rate; + if(rate>=1f){ + randy=null; + }else if(seed>-1){ + randy=new java.util.Random(seed); + }else{ + randy=new java.util.Random(); + } + } + private float samplerate=1f; + private java.util.Random randy=null; + + public Object[] producers(){return new Object[] {producerFasta, producerQual};} + + @Override + public boolean errorState(){return errorState;} + /** TODO */ + private boolean errorState=false; + + private boolean shutdown=false; + + private Thread[] threads; + + public final long maxReads; + private long generated=0; + private long listnum=0; + + public final String fastaName; + public final String qualName; + + public FastaStream producerFasta; + public QualStream producerQual; + private ConcurrentDepot fdepot; + private ConcurrentDepot qdepot; + private ConcurrentDepot rdepot; + + public static int NUMLISTS_RAW=7; + public static int NUMLISTS_READ=25; + + private static final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + + private final ConcurrentSolidInputStream mateStream; + private final boolean paired; + +// private final Object ShutdownKey="ShutdownKey"+hashCode(); + private final boolean[] ShutdownKey=new boolean[] {false}; + + @Override + public boolean paired() { + assert(paired==(mateStream!=null)); + return paired; + } + +} diff --git a/current/stream/FASTQ.java b/current/stream/FASTQ.java new file mode 100755 index 0000000..9e119c1 --- /dev/null +++ b/current/stream/FASTQ.java @@ 
-0,0 +1,860 @@ +package stream; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStream; +import java.io.InputStreamReader; +import java.util.ArrayList; +import java.util.Arrays; + +import align2.Shared; +import align2.Tools; + +import dna.Data; +import dna.Gene; +import fileIO.ByteFile; +import fileIO.ReadWrite; +import fileIO.TextFile; + + +public class FASTQ { + + public static void writeFASTQ(Read[] reads, String fname){ + StringBuilder sb=new StringBuilder(); + for(Read r : reads){ + String[] quad=toFASTQ(r); + for(int i=0; iQUAL_THRESH /*|| (bases[i]=='N' && quals[i]>20)*/)){ + System.err.println("Changed from ASCII-33 to ASCII-64 on input quality "+(quals[i]+ASCII_OFFSET)+" while prescanning."); + qflips++; + ASCII_OFFSET=64; + if(DETECT_QUALITY_OUT){ASCII_OFFSET_OUT=64;} + for(int j=0; j<=i; j++){ + quals[j]=(byte)(quals[j]-31); + } + }else if(ASCII_OFFSET==64 && (quals[i]<-5)){ + System.err.println("Changed from ASCII-64 to ASCII-33 on input quality "+(quals[i]+ASCII_OFFSET)+" while prescanning."); + ASCII_OFFSET=33; + if(DETECT_QUALITY_OUT){ASCII_OFFSET_OUT=33;} + qflips++; + for(int j=0; j<=i; j++){ + quals[j]=(byte)(quals[j]+31); + } + } + } + assert(quals[i]>=-5) : "ASCII encoding for quality (currently ASCII-"+ASCII_OFFSET+") appears to be wrong.\n" + +oct[k]+"\n"+oct[k+3]+"\n"+Arrays.toString(oct[k+3].getBytes()); + assert(qflips<2) : "Failed to auto-detect quality coding; quitting."; + } + } + + return ASCII_OFFSET; + } + + public static boolean testPairNames(Read r1, Read r2){ + if(r1==null || r2==null){return false;} + return testPairNames(r1.id, r2.id); + } + + public static boolean testPairNames(String id1, String id2){ + + if(id1==null || id2==null){return false;} + + final int idxSlash1=id1.lastIndexOf('/'); + final int idxSlash2=id2.lastIndexOf('/'); + final int idxSpace1=id1.indexOf(' '); + final int idxSpace2=id2.indexOf(' '); + // System.out.println("idxSlash1="+idxSlash1+", idxSlash2="+idxSlash2+", 
idxSpace1="+idxSpace1+", idxSpace2="+idxSpace2); + if(idxSlash1==idxSlash2 && idxSlash1>1){ + // System.out.println("A"); + String[] split1=id1.split("/"); + String[] split2=id2.split("/"); + // System.out.println(Arrays.toString(split1)); + // System.out.println(Arrays.toString(split2)); + + if(split1.length>1 && split2.length>1 && split1[0].equals(split2[0])){ + // System.out.println("B"); + if(split1[split1.length-1].contains(" ")){ + split1[split1.length-1]=split1[split1.length-1].split(" ")[0]; + // System.out.println("B1: "+Arrays.toString(split1)); + } + if(split2[split2.length-1].contains(" ")){ + split2[split2.length-1]=split2[split2.length-1].split(" ")[0]; + // System.out.println("B2: "+Arrays.toString(split2)); + } + if(split1[split1.length-1].equals("1") && split2[split2.length-1].equals("2")){ + // System.out.println("B3"); + return true; + } + } + } + + if(idxSpace1==idxSpace2 && idxSpace1>=0){ + // System.out.println("C"); + if(idxSpace1==idxSpace2 && idxSpace1>1){ + // System.out.println("D"); + String[] split1=id1.split(" "); + String[] split2=id2.split(" "); + // System.out.println(Arrays.toString(split1)); + // System.out.println(Arrays.toString(split2)); + + if(split1.length>1 && split2.length>1 && split1[0].equals(split2[0])){ + // System.out.println("E"); + if(split1[1].startsWith("1:") && split2[1].startsWith("2:")){return true;} + } + } + } + return false; + } + + public static String[] toFASTQ(Read r){ + String id=customID(r); + return toFASTQ(r.bases, r.quality, id==null ? ""+r.numericID : id); + } + + public static String customID(Read r){ + if(PARSE_CUSTOM && (r.chrom>-1 && r.stop>-1)){ + if(Data.GENOME_BUILD>=0){ + final int chrom1=r.chrom; + final int start1=r.start; + final int stop1=r.stop; + int idx1=Data.scaffoldIndex(chrom1, (start1+stop1)/2); + byte[] name1=Data.scaffoldNames[chrom1][idx1]; + int a1=Data.scaffoldRelativeLoc(chrom1, start1, idx1); + if(r.mate==null || !ADD_PAIRNUM_TO_CUSTOM_ID){ + return (r.id==null ? 
""+r.numericID : r.id)+"_chr"+r.chrom+"_"+r.strand()+"_"+r.start+"_"+r.stop+"_"+a1+"_"+new String(name1); + }else{ + return (r.id==null ? ""+r.numericID : r.id)+"_chr"+r.chrom+"_"+r.strand()+"_"+r.start+"_"+r.stop+"_"+a1+"_"+new String(name1)+" /"+(r.pairnum()+1); + } + }else{ + if(r.mate==null || !ADD_PAIRNUM_TO_CUSTOM_ID){ + return (r.id==null ? ""+r.numericID : r.id)+"_chr"+r.chrom+"_"+r.strand()+"_"+r.start+"_"+r.stop; + }else{ + return (r.id==null ? ""+r.numericID : r.id)+"_chr"+r.chrom+"_"+r.strand()+"_"+r.start+"_"+r.stop+" /"+(r.pairnum()+1); + } + } + } + return r.id; + } + + private static int fastqLength(Read r){ + int len=6; //newlines, @, + + len+=(r.id==null ? Tools.stringLength(r.numericID) : r.id.length()); + len+=(r.bases==null ? 0 : r.bases.length); + len+=(r.quality==null ? 0 : r.quality.length); + return len; + } + + public static ByteBuilder toFASTQ(Read r, ByteBuilder bb){ + int len=fastqLength(r); + final String id; + final byte[] bases=r.bases, quals=r.quality; + if(PARSE_CUSTOM && (r.chrom>-1 && r.stop>-1)){ + id=customID(r); + if(id!=null){len+=id.length();} + }else{ + id=r.id; + } + if(bb==null){bb=new ByteBuilder(len);} + else{bb.ensureExtra(len);} + + bb.append('@'); + if(id==null){bb.append(r.numericID);} + else{bb.append(id);} + bb.append('\n'); + +// if(bases!=null){for(byte b : bases){sb.append((char)b);}} +// sb.append('\n'); +// sb.append('+'); +// sb.append('\n'); +// if(quals!=null){for(byte b : quals){sb.append((char)(b+ASCII_OFFSET_OUT));}} + + if(bases==null){ + bb.append('\n').append('+').append('\n'); + if(verbose){System.err.println("A:\n"+bb);} + }else{ + bb.append(bases); + bb.append('\n').append('+').append('\n'); + if(verbose){System.err.println("B:\n"+bb);} + if(quals==null){ + final byte q=(byte)(30+ASCII_OFFSET_OUT); + final int blen=bases.length; + bb.ensureExtra(blen); + for(int i=0, j=bb.length; i-1 && r.stop>-1)){ + id=customID(r); + if(id!=null){len+=id.length();} + }else{ + id=r.id; + } + if(sb==null){sb=new 
StringBuilder(len);} + else{sb.ensureCapacity(len);} + + sb.append('@'); + if(id==null){sb.append(r.numericID);} + else{sb.append(id);} + sb.append('\n'); + +// if(bases!=null){for(byte b : bases){sb.append((char)b);}} +// sb.append('\n'); +// sb.append('+'); +// sb.append('\n'); +// if(quals!=null){for(byte b : quals){sb.append((char)(b+ASCII_OFFSET_OUT));}} + + if(bases==null){ + sb.append('\n').append('+').append('\n'); + }else{ + char[] buffer=Shared.getTLCB(bases.length); + for(int i=0; i list=toReadList(tf, maxReadsToReturn, colorspace, numericID, interleaved); + assert(list.size()<=maxReadsToReturn); + return list.toArray(new Read[list.size()]); + } + + public static final String makeId(String s){ + if(s==null || s.length()<1){return null;} + char c=s.charAt(0); + int start=0, stop=s.length(); + if(c=='@' || c=='>'){start=1;} + if(Shared.TRIM_READ_COMMENTS){ + for(int i=start; i'){start=1;} + if(Shared.TRIM_READ_COMMENTS){ + for(int i=start; i toReadList(TextFile tf, int maxReadsToReturn, boolean colorspace, long numericID, boolean interleaved){ + String s=null; + ArrayList list=new ArrayList(Data.min(16384, maxReadsToReturn)); + + String[] quad=new String[4]; + + int cntr=0; + int added=0; + + Read prev=null; + + for(s=tf.nextLine(); s!=null && addedQUAL_THRESH /*|| (bases[i]=='N' && quals[i]>20)*/)){ + if(numericID<1){ + System.err.println("Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31)); + }else{ + System.err.println("Warning! 
Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31)); + System.err.println("Up to "+numericID+" prior reads may have been generated with incorrect qualities."); + System.err.println("If this is a problem you may wish to re-run with the flag 'qin=64'."); + } + ASCII_OFFSET=64; + for(int j=0; j<=i; j++){ + quals[j]=(byte)(quals[j]-31); + } + } + assert(quals[i]>=-5) : "\n"+quad[0]+"\n"+quad[3]; + } +// assert(false) : Arrays.toString(quals); +// assert(false) : new String(quad[0]); + if(PARSE_CUSTOM && quad[0]!=null && quad[0].indexOf('_')>0){ + String[] answer=quad[0].split("_"); + if(answer.length>=5){ + try { + byte trueChrom=Gene.toChromosome(answer[1]); + byte trueStrand=Byte.parseByte(answer[2]); + int trueLoc=Integer.parseInt(answer[3]); + int trueStop=Integer.parseInt(answer[4]); + r=new Read(bases, trueChrom, trueStrand, trueLoc, trueStop, id, quals, colorspace, numericID); + r.setSynthetic(true); + } catch (NumberFormatException e) {} + } + } + if(r==null){ + r=new Read(bases, 0, (byte)0, 0, 0, id, quals, colorspace, numericID); + } + + cntr=0; + + if(interleaved){ + if(prev==null){prev=r;} + else{ + prev.mate=r; + r.mate=prev; + r.setPairnum(1); + list.add(prev); + added++; + numericID++; + prev=null; + } + }else{ + list.add(r); + added++; + numericID++; + } + + + if(added>=maxReadsToReturn){break;} + +// System.out.println(r.chrom+", "+r.strand+", "+r.loc); +// assert(false); + } + } + assert(list.size()<=maxReadsToReturn); + return list; + } + + public static ArrayList toReadList(ByteFile tf, int maxReadsToReturn, boolean colorspace, long numericID, boolean interleaved){ + byte[] s=null; + ArrayList list=new ArrayList(Data.min(16384, maxReadsToReturn)); + + byte[][] quad=new byte[4][]; + + int cntr=0; + int added=0; + + Read prev=null; + + for(s=tf.nextLine(); s!=null && addedQUAL_THRESH /*|| (bases[i]=='N' && quals[i]>20)*/)){ +// if(numericID<1){ +// System.err.println("Changed from ASCII-33 to ASCII-64 on 
input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31)); +// }else{ +// System.err.println("Warning! Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31)); +// System.err.println("Up to "+numericID+" prior reads may have been generated with incorrect qualities."); +// System.err.println("If this is a problem you may wish to re-run with the flag 'qin=64'."); +// errorState=true; +// } +// ASCII_OFFSET=64; +// for(int j=0; j<=i; j++){ +// quals[j]=(byte)(quals[j]-31); +// } +// } +// if(quals[i]<-5){ +// if(!negativeFive){ +// for(int j=0; j=-5); +//// assert(quals[i]>=-5) : "The ASCII quality encoding level is not set correctly. Quality value below -5:" + +//// "\n"+new String(quad[0])+"\n"+new String(quad[1])+"\n"+new String(quad[2])+"\n"+new String(quad[3]); +// } +//// assert(false) : Arrays.toString(quals); +//// assert(false) : PARSE_CUSTOM+"\n"+new String(quad[0]); +// if(PARSE_CUSTOM){ +// if(quad[0]!=null && Tools.indexOf(quad[0], (byte)'_')>0){ +// String temp=new String(quad[0]); +// if(temp.endsWith(" /1") || temp.endsWith(" /2")){temp=temp.substring(0, temp.length()-3);} +// String[] answer=temp.split("_"); +// +// if(answer.length>=5){ +// try { +// byte trueChrom=Gene.toChromosome(answer[1]); +// byte trueStrand=Byte.parseByte(answer[2]); +// int trueLoc=Integer.parseInt(answer[3]); +// int trueStop=Integer.parseInt(answer[4]); +// r=new Read(bases, trueChrom, trueStrand, trueLoc, trueStop, id, quals, colorspace, numericID); +// r.setSynthetic(true); +// } catch (NumberFormatException e) { +// PARSE_CUSTOM=false; +// System.err.println("Turned off PARSE_CUSTOM because could not parse "+new String(quad[0])); +// } +// }else{ +// PARSE_CUSTOM=false; +// System.err.println("Turned off PARSE_CUSTOM because answer="+Arrays.toString(answer)); +// } +// }else{ +// PARSE_CUSTOM=false; +// System.err.println("Turned off PARSE_CUSTOM because quad[0]="+new String(quad[0])+", index="+Tools.indexOf(quad[0], 
(byte)'_')); +// } +// } +// if(r==null){ +// r=new Read(bases, 0, (byte)0, 0, 0, id, quals, colorspace, numericID); +// } + + + Read r=quadToRead(quad, true, false, colorspace, tf, numericID); + cntr=0; + + if(interleaved){ + if(prev==null){prev=r;} + else{ + prev.mate=r; + r.mate=prev; + r.setPairnum(1); + list.add(prev); + added++; + numericID++; + prev=null; + } + }else{ + list.add(r); + added++; + numericID++; + } + + + if(added>=maxReadsToReturn){break;} + +// System.out.println(r.chrom+", "+r.strand+", "+r.loc); +// assert(false); + } + } + assert(list.size()<=maxReadsToReturn); + return list; + } + + public static byte[][] scarfToQuad(final byte[] scarf, byte[][] quad){ + + int a=-1, b=-1; + final byte colon=':'; + for(int i=scarf.length-1; i>=0; i--){ + if(scarf[i]==colon){ + if(b<0){b=i;} + else{ + assert(a<0); + a=i; + break; + } + } + } + if(a<0 || b<0){ + throw new RuntimeException("Misformatted scarf line: "+new String(scarf)); + } + if(quad==null){quad=new byte[4][];} + quad[0]=Arrays.copyOfRange(scarf, 0, a); + quad[1]=Arrays.copyOfRange(scarf, a+1, b); + quad[3]=Arrays.copyOfRange(scarf, b+1, scarf.length); + return quad; + } + + public static Read quadToRead(final byte[][] quad, boolean fastq, boolean scarf, boolean colorspace, ByteFile tf, long numericID){ + + if(verbose){ + System.err.println("\nASCII offset is "+ASCII_OFFSET); + System.err.println("quad:"); + System.err.println(new String(quad[0])); + System.err.println(new String(quad[1])); + System.err.println(new String(quad[2])); + System.err.println(new String(quad[3])); + } + + assert(scarf || quad[0][0]==(byte)'@') : "\nError in "+tf.name()+", line "+tf.lineNum()+"\n"+ + new String(quad[0])+"\n"+new String(quad[1])+"\n"+new String(quad[2])+"\n"+new String(quad[3])+"\n"; + assert(scarf || quad[2][0]==(byte)'+') : "\nError in "+tf.name()+", line "+tf.lineNum()+"\n"+ + new String(quad[0])+"\n"+new String(quad[1])+"\n"+new String(quad[2])+"\n"+new String(quad[3])+"\n"; + + // 
if(quad[0].startsWith("@HW") || quad[0].startsWith("@FC")){ascii_offset=66;} //TODO: clumsy + + final String id=makeId(quad[0]); + + Read r=null; + + byte[] bases=quad[1]; + byte[] quals=quad[3]; + // assert(false) : Arrays.toString(quals); + for(int i=0; iQUAL_THRESH /*|| (bases[i]=='N' && quals[i]>20)*/)){ + if(numericID<1){ + System.err.println("Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31)); + }else{ + System.err.println("Warning! Changed from ASCII-33 to ASCII-64 on input "+((char)quals[i])+": "+quals[i]+" -> "+(quals[i]-31)); + System.err.println("Up to "+numericID+" prior reads may have been generated with incorrect qualities."); + System.err.println("If this is a problem you may wish to re-run with the flag 'qin=64'."); + errorState=true; + } + ASCII_OFFSET=64; + for(int j=0; j<=i; j++){ + quals[j]=(byte)(quals[j]-31); + } + } + if(quals[i]<-5){ + if(!negativeFive){ + for(int j=0; j=-5); + // assert(quals[i]>=-5) : "The ASCII quality encoding level is not set correctly. 
Quality value below -5:" + + // "\n"+new String(quad[0])+"\n"+new String(quad[1])+"\n"+new String(quad[2])+"\n"+new String(quad[3]); + } + // assert(false) : Arrays.toString(quals); + // assert(false) : PARSE_CUSTOM+"\n"+new String(quad[0]); + if(PARSE_CUSTOM){ + if(quad[0]!=null && Tools.indexOf(quad[0], (byte)'_')>0){ + String temp=new String(quad[0]); + if(temp.endsWith(" /1") || temp.endsWith(" /2")){temp=temp.substring(0, temp.length()-3);} + String[] answer=temp.split("_"); + + if(answer.length>=5){ + try { + byte trueChrom=Gene.toChromosome(answer[1]); + byte trueStrand=Byte.parseByte(answer[2]); + int trueLoc=Integer.parseInt(answer[3]); + int trueStop=Integer.parseInt(answer[4]); + r=new Read(bases, trueChrom, trueStrand, trueLoc, trueStop, id, quals, colorspace, numericID); + r.setSynthetic(true); + } catch (NumberFormatException e) { + PARSE_CUSTOM=false; + System.err.println("Turned off PARSE_CUSTOM because could not parse "+new String(quad[0])); + } + }else{ + PARSE_CUSTOM=false; + System.err.println("Turned off PARSE_CUSTOM because answer="+Arrays.toString(answer)); + } + }else{ + PARSE_CUSTOM=false; + System.err.println("Turned off PARSE_CUSTOM because quad[0]="+new String(quad[0])+", index="+Tools.indexOf(quad[0], (byte)'_')); + } + } + if(r==null){ + r=new Read(bases, 0, (byte)0, 0, 0, id, quals, colorspace, numericID); + } + return r; + } + + public static ArrayList toScarfReadList(ByteFile tf, int maxReadsToReturn, boolean colorspace, long numericID, boolean interleaved){ + byte[] s=null; + ArrayList list=new ArrayList(Data.min(16384, maxReadsToReturn)); + + byte[][] quad=new byte[4][]; + + int added=0; + + Read prev=null; + + for(s=tf.nextLine(); s!=null && added=maxReadsToReturn){break;} + } + assert(list.size()<=maxReadsToReturn); + return list; + } + + public static String qualToString(byte[] quals){ + byte[] q2=new byte[quals.length]; + for(int i=0; i3){break;} + } + + } + + public FastaQualReadInputStream(FileFormat ff, String qfname, 
boolean colorspace_){ +// assert(false) : "In progress"; + colorspace=colorspace_; + + tf=new TextFile(ff, false); + qtf=new TextFile(qfname, false, false); + interleaved=false; + + } + + public FastaQualReadInputStream(String fname, String qfname, boolean colorspace_){ +// assert(false) : "In progress"; + colorspace=colorspace_; + + if(!fileIO.FileFormat.hasFastaExtension(fname)){ + System.err.println("Warning: Did not find expected fasta file extension for filename "+fname); + } + + tf=new TextFile(fname, false, false); + qtf=new TextFile(qfname, false, false); + interleaved=false; + + } + + @Override + public void start() { +// if(cris!=null){new Thread(cris).start();} + } + + + @Override + public boolean hasMore() { + if(buffer==null || next>=buffer.length){ + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next=buffer.length){fillBuffer();} + Read[] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 0 : r.length); +// System.err.println(hashCode()+" produced "+r[0].numericID); + return r; + } + + @Override + public synchronized ArrayList nextList() { + return toList(nextBlock()); + } + public final boolean preferArrays(){return true;} + + private synchronized void fillBuffer(){ + if(verbose){System.err.println("Filling buffer. buffer="+(buffer==null ? null : buffer.length));} + assert(buffer==null || next>=buffer.length); + + buffer=null; + next=0; + + buffer=toReads(tf, BUF_LEN, nextReadID, interleaved, headerA); + + if(verbose){System.err.println("Filled buffer. buffer="+(buffer==null ? 
null : buffer.length));} + + nextReadID+=buffer.length; + if(buffer.length list=new ArrayList(Data.min(16384, maxReadsToReturn)); + + int added=0; + + Read prev=null; + + while(added'){ + assert(currentLine.equals(currentQLine)); + headerA[0]=currentLine; + currentLoc=0; + currentSection=0; + currentLine=null; + } + } + + final boolean SPLIT_READS=FastaReadInputStream.SPLIT_READS; + final int TARGET_READ_LEN=FastaReadInputStream.TARGET_READ_LEN; + final int MIN_READ_LEN=FastaReadInputStream.MIN_READ_LEN; + + assert(currentLine==null || currentLine.charAt(0)!='>'); + + StringBuilder sb=new StringBuilder(); + StringBuilder sbq=new StringBuilder(); + Read r=null; + while(r==null){ + if(!SPLIT_READS || (currentLoc==0 && (currentLine.length()<=(TARGET_READ_LEN-sb.length())))){ + sb.append(currentLine); + sbq.append(currentQLine); + currentLoc=currentLine.length(); + }else{ + while(sb.length()=currentLine.length()) : currentLoc+", "+currentLine.length()+", "+ + TARGET_READ_LEN+", "+sb.length()+"\n"+currentLine+"\n"+sb; + currentLine=null; + currentQLine=null; + currentLoc=0; + while(currentLine==null){ + currentLine=tf.nextLine(); + currentQLine=nextQtfLine(qtf); + assert(currentLine==null || currentLine.length()==currentQLine.length()); + assert(currentLine==null || currentLine.charAt(0)!='>' || currentLine.equals(currentQLine)); + if(currentLine==null || currentLine.charAt(0)=='>'){ + if(sb.length()>=MIN_READ_LEN){ + r=makeRead(sb, sbq, numericID); + }else{ + sb.setLength(0); + sbq.setLength(0); + } + headerA[0]=currentLine; + currentLoc=0; + currentSection=0; + currentLine=null; + currentQLine=null; + if(r!=null){return r;} + if(headerA[0]==null){ + if(verbose){System.err.println("Returning null because tf.nextLine()==null: B");} + return null; + } + } + assert(currentLine==null || currentLine.charAt(0)!='>'); + } + assert(currentLine==null || currentLine.charAt(0)!='>'); + } + assert(currentLine==null || currentLine.charAt(0)!='>'); + } + assert(currentLine==null || 
currentLine.charAt(0)!='>'); + if(verbose){System.err.println("Returning null because loop exited (should be unreachable).");} + return null; + } + + private final CharSequence nextQtfLine(TextFile qtf){ + String s=qtf.nextLine(); + if(!NUMERIC_QUAL || s==null || s.length()==0 || s.charAt(0)=='>'){return s;} + s=s.trim(); + final StringBuilder sb=new StringBuilder((s.length()+1)/2); + + int x=0; + for(int i=0; i0); + sb.append((char)(x+FASTQ.ASCII_OFFSET)); + x=0; + }else{ + x=10*x+(c-'0'); + } + } + sb.append((char)(x+FASTQ.ASCII_OFFSET)); + return sb; + } + + private Read makeRead(StringBuilder sb, StringBuilder sbq, long numericID){ +// assert(!sb.equals(sbq)) : sb+"\n"+sbq; + byte[] quals=new byte[sbq.length()]; + byte[] bases=new byte[sb.length()]; +// if(FAKE_QUALITY){ +// quals=new byte[sb.length()]; +// Arrays.fill(quals, (byte)(30)); +// } + for(int i=0; i0 ? headerA[0].substring(1)+"_"+currentSection : new String(headerA[0].substring(1))); + Read r=new Read(bases, (byte)0, (byte)0, 0, 0, hd, quals, colorspace, numericID); + return r; + } + + public boolean close(){ + boolean a=tf.close(); + boolean b=qtf.close(); + return a | b; + } + + @Override + public synchronized void restart() { + generated=0; + consumed=0; + next=0; + nextReadID=0; + buffer=null; + + currentLine=null; + currentLoc=0; + currentSection=0; + finished=false; + + tf.reset(); + qtf.reset(); + } + + @Override + public boolean paired() { + return interleaved; + } + + private Read[] buffer=null; + private int next=0; + + private final TextFile tf; + private final TextFile qtf; + private final boolean interleaved; + + public static final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + + public long generated=0; + public long consumed=0; + private long nextReadID=0; + + public int readlen=-1; + + public final boolean colorspace; + private final String[] headerA=new String[1]; + + public static boolean NUMERIC_QUAL=true; + + public static boolean verbose=false; + public static boolean 
FAKE_QUALITY=false; + + private String currentLine=null; + private CharSequence currentQLine=null; + private int currentLoc=0; + private int currentSection=0; + private boolean finished=false; + +} diff --git a/current/stream/FastaQualReadInputStream2.java b/current/stream/FastaQualReadInputStream2.java new file mode 100755 index 0000000..dfca10a --- /dev/null +++ b/current/stream/FastaQualReadInputStream2.java @@ -0,0 +1,364 @@ +package stream; + +import java.util.ArrayList; + +import align2.Shared; + +import dna.Data; +import fileIO.ByteFile; +import fileIO.FileFormat; + +public class FastaQualReadInputStream2 extends ReadInputStream { + + public static void main(String[] args){ + + FastaQualReadInputStream2 fris=new FastaQualReadInputStream2(args[0], args[1], false, true); + + Read r=fris.next(); + int i=0; + while(r!=null){ + System.out.println(r.toText(false)); + r=fris.next(); + if(i++>3){break;} + } + + } + + public FastaQualReadInputStream2(String fname, String qfname, boolean colorspace_, boolean allowSubprocess_){ +// assert(false) : "In progress"; + colorspace=colorspace_; + + FileFormat ff=FileFormat.testInput(fname, FileFormat.FASTA, null, allowSubprocess_, false); + if(!ff.fasta() && !ff.stdio()){ + System.err.println("Warning: Did not find expected fasta file extension for filename "+fname); + } + + tf=ByteFile.makeByteFile(ff, false); + qtf=ByteFile.makeByteFile(FileFormat.testInput(qfname, FileFormat.QUAL, null, allowSubprocess_, false), false); + interleaved=false; + + } + + @Override + public void start() { +// if(cris!=null){new Thread(cris).start();} + } + + + @Override + public boolean hasMore() { + if(buffer==null || next>=buffer.length){ + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next=buffer.length){fillBuffer();} + Read[] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 
0 : r.length); +// System.err.println(hashCode()+" produced "+r[0].numericID); + return r; + } + + @Override + public synchronized ArrayList nextList() { + return toList(nextBlock()); + } + public final boolean preferArrays(){return true;} + + private synchronized void fillBuffer(){ + if(verbose){System.err.println("Filling buffer. buffer="+(buffer==null ? null : buffer.length));} + assert(buffer==null || next>=buffer.length); + + buffer=null; + next=0; + + buffer=toReads(tf, BUF_LEN, nextReadID, interleaved, headerA); + + if(verbose){System.err.println("Filled buffer. buffer="+(buffer==null ? null : buffer.length));} + + nextReadID+=buffer.length; + if(buffer.length list=new ArrayList(Data.min(16384, maxReadsToReturn)); + + int added=0; + + Read prev=null; + + while(added=currentLine.length) : currentLoc+", "+currentLine.length+", "+ + TARGET_READ_LEN+", "+sb.length()+"\n"+currentLine+"\n"+sb; + currentLine=null; + currentQLine=null; + currentLoc=0; + while(currentLine==null){ + currentLine=tf.nextLine(); + currentQLine=nextQtfLine(qtf); + assert(currentLine==null || currentLine.length==currentQLine.length); + assert(currentLine==null || currentLine[0]!=carrot || currentLine.equals(currentQLine)); + if(currentLine==null || currentLine[0]==carrot){ + if(sb.length()>=MIN_READ_LEN){ + r=makeRead(sb, sbq, numericID); + }else{ + sb.setLength(0); + sbq.setLength(0); + } + headerA[0]=currentLine==null ? 
null : new String(currentLine); + currentLoc=0; + currentSection=0; + currentLine=null; + currentQLine=null; + if(r!=null){return r;} + if(headerA[0]==null){ + if(verbose){System.err.println("Returning null because tf.nextLine()==null: B");} + return null; + } + } + assert(currentLine==null || currentLine[0]!=carrot); + } + assert(currentLine==null || currentLine[0]!=carrot); + } + assert(currentLine==null || currentLine[0]!=carrot); + } + assert(currentLine==null || currentLine[0]!=carrot); + if(verbose){System.err.println("Returning null because loop exited (should be unreachable).");} + return null; + } + + private final byte[] nextQtfLine(ByteFile qtf){ + byte[] s=qtf.nextLine(); + if(!NUMERIC_QUAL || s==null || s.length==0 || s[0]==carrot){return s;} + + int last=s.length-1; + while(last>=0 && Character.isWhitespace(s[last])){ + last--; + } + if(last<0){return new byte[0];} + final int lim=last+1; + int spaces=0; + for(int i=0; i0); + sb[j]=(byte)(x+FASTQ.ASCII_OFFSET); + x=0; + j++; + }else{ + x=10*x+(b-zero); + } + } + sb[j]=(byte)(x+FASTQ.ASCII_OFFSET); + return sb; + } + + private Read makeRead(StringBuilder sb, StringBuilder sbq, long numericID){ +// assert(!sb.equals(sbq)) : sb+"\n"+sbq; + byte[] quals=new byte[sbq.length()]; + byte[] bases=new byte[sb.length()]; +// if(FAKE_QUALITY){ +// quals=new byte[sb.length()]; +// Arrays.fill(quals, (byte)(30)); +// } + for(int i=0; i0 ? 
headerA[0].substring(1)+"_"+currentSection : new String(headerA[0].substring(1))); + Read r=new Read(bases, (byte)0, (byte)0, 0, 0, hd, quals, colorspace, numericID); + return r; + } + + public boolean close(){ + boolean a=tf.close(); + boolean b=qtf.close(); + return a|b; + } + + @Override + public synchronized void restart() { + generated=0; + consumed=0; + next=0; + nextReadID=0; + buffer=null; + + currentLine=null; + currentLoc=0; + currentSection=0; + finished=false; + + tf.reset(); + qtf.reset(); + } + + @Override + public boolean paired() { + return interleaved; + } + + private Read[] buffer=null; + private int next=0; + + private final ByteFile tf; + private final ByteFile qtf; + private final boolean interleaved; + + public static final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + + public long generated=0; + public long consumed=0; + private long nextReadID=0; + + public final boolean colorspace; + private final String[] headerA=new String[1]; + + public static boolean NUMERIC_QUAL=true; + + public static boolean verbose=false; + public static boolean FAKE_QUALITY=false; + + private byte[] currentLine=null; + private byte[] currentQLine=null; +// private CharSequence currentQLine=null; + private int currentLoc=0; + private int currentSection=0; + private boolean finished=false; + private final byte carrot='>', space=' ', zero='0'; + +} diff --git a/current/stream/FastaReadInputStream.java b/current/stream/FastaReadInputStream.java new file mode 100755 index 0000000..0bdd04b --- /dev/null +++ b/current/stream/FastaReadInputStream.java @@ -0,0 +1,490 @@ +package stream; + +import java.io.IOException; +import java.io.InputStream; +import java.util.ArrayList; +import java.util.Arrays; + +import align2.Shared; + +import dna.Gene; +import dna.Timer; + +import fileIO.FileFormat; +import fileIO.ReadWrite; + +/** + * @author Brian Bushnell + * @date Feb 13, 2013 + * + */ +public class FastaReadInputStream extends ReadInputStream { + + public static void main(String[] 
args){ + + int a=20, b=Integer.MAX_VALUE; + if(args.length>1){a=Integer.parseInt(args[1]);} + if(args.length>2){b=Integer.parseInt(args[2]);} + if(args.length>3){MIN_READ_LEN=Integer.parseInt(args[3]);} + if(args.length>4){TARGET_READ_LEN=Integer.parseInt(args[4]);} + if(TARGET_READ_LEN<1){ + TARGET_READ_LEN=Integer.MAX_VALUE; + SPLIT_READS=false; + } + + Timer t=new Timer(); + t.start(); + + FastaReadInputStream fris=new FastaReadInputStream(args[0], false, false, false, Shared.READ_BUFFER_MAX_DATA); + Read r=fris.next(); + int i=0; + + while(r!=null){ + if(i=a){break;} + } + while(r!=null && i++0 ? TARGET_READ_LEN : Integer.MAX_VALUE) : Integer.MAX_VALUE); + MAX_DATA=maxdata>0 ? maxdata : Shared.READ_BUFFER_MAX_DATA; + + ins=open(); + + assert(settingsOK()); + } + + @Override + public Read next() { + if(!hasMore()){ + if(verbose){System.err.println("hasMore() returned false; currentList="+ + (currentList==null ? null : currentList.size())+", nextReadIndex="+nextReadIndex+", consumed="+consumed);} + return null; + } + Read r=currentList.set(nextReadIndex, null); + nextReadIndex++; + consumed++; + return r; + } + + @Override + public ArrayList nextList() { + if(nextReadIndex!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");} + if(currentList==null || nextReadIndex>=currentList.size()){ + boolean b=fillList(); + } + ArrayList list=currentList; + currentList=null; + if(list==null || list.isEmpty()){ + list=null; + }else{ + consumed+=list.size(); + } + return list; + } + + @Override + public boolean hasMore() { + if(currentList==null || nextReadIndex>=currentList.size()){ + if(open){ + fillList(); + }else{ +// assert(generated>0) : "Was the file empty?"; + } + } + return (currentList!=null && nextReadIndex=currentList.size()); + nextReadIndex=0; + currentList=new ArrayList(BUF_LEN); + + if(header==null){header=nextHeader();} + long len=0; + for(int i=0; i0); + + byte[] quals=null; + if(FAKE_QUALITY){ + quals=new 
byte[bases.length]; + Arrays.fill(quals, (byte)(FAKE_QUALITY_LEVEL)); + } +// String hd=((currentSection==1 && !hitmax) ? header : header+"_"+currentSection); + String hd=((currentSection==1 && bases.length0) : "id="+hd+", section="+currentSection+", len="+bases.length+"\n"+new String(bases); + Read r=null; + if(FASTQ.PARSE_CUSTOM){ + if(header!=null && header.indexOf('_')>0){ + String temp=header; + if(temp.endsWith(" /1") || temp.endsWith(" /2")){temp=temp.substring(0, temp.length()-3);} + String[] answer=temp.split("_"); + + if(answer.length>=5){ + try { + byte trueChrom=Gene.toChromosome(answer[1]); + byte trueStrand=Byte.parseByte(answer[2]); + int trueLoc=Integer.parseInt(answer[3]); + int trueStop=Integer.parseInt(answer[4]); + r=new Read(bases, trueChrom, trueStrand, trueLoc, trueStop, hd, quals, colorspace, nextReadID); + r.setSynthetic(true); + } catch (NumberFormatException e) { + FASTQ.PARSE_CUSTOM=false; + System.err.println("Turned off PARSE_CUSTOM because could not parse "+new String(header)); + } + }else{ + FASTQ.PARSE_CUSTOM=false; + System.err.println("Turned off PARSE_CUSTOM because answer="+Arrays.toString(answer)); + } + }else{ + FASTQ.PARSE_CUSTOM=false; + System.err.println("Turned off PARSE_CUSTOM because header="+header+", index="+header.indexOf('_')); + } + } + if(r==null){r=new Read(bases, (byte)0, (byte)0, 0, 0, hd, quals, colorspace, nextReadID);} + nextReadID++; + if(verbose){System.err.println("Made read:\t"+(r.bases.length>1000 ? 
r.id : r.toString()));} + return r; + } + + private String nextHeader(){ + if(verbose){System.err.println("Called nextHeader(); bstart="+bstart);} + assert(bstart>=bstop || buffer[bstart]=='>' || buffer[bstart]<=slashr) : bstart+", "+bstop+", '"+(char)buffer[bstart]+"'"; + while(bstart=bstop || buffer[x]=='>') : bstart+", "+bstop+", '"+(char)buffer[x]+"'"; + while(xslashr){x++;} + if(x>=bstop){ + int fb=fillBuffer(); + if(fb<1){ + if(verbose){System.err.println("Returning null from nextHeader()");} + return null; + } + x=0; + assert(bstart==0 && bstart'); //Note: This assertion will fire if a fasta file starts with a newline. + while(xslashr){x++;} + } + assert(x>=bstop || buffer[x]<=slashr); + + int start=bstart+1, stop=x; + if(Shared.TRIM_READ_COMMENTS){ + for(int i=start; istart ? new String(buffer, start, stop-start) : ""; +// String s=new String(buffer, bstart+1, x-(bstart+1)); + if(verbose){System.err.println("Fetched header: '"+s+"'");} + bstart=x+1; + + return s; + } + + private byte[] nextBases(){ + if(verbose){System.err.println("Called nextBases()");} + assert(open) : "Attempting to read from a closed file. Current header: "+header; + if(bstart>=bstop){ + int bytes=fillBuffer(); + if(bytes<1){return null;} + } + int x=bstart; + int bases=0; + + assert(x>=bstop || buffer[x]!='>'); + + while(xslashr){bases++;} + x++; + } + assert(x==bstop || buffer[x]=='>' || bases==maxLen); + if(x==bstop && bases1000 ? "*LONG*" : new String(r))+"'");} + + bstart=x; + return r; + } + + /** Fills buffer. Ensures that result will extend to the next caret or EOF. Returns number of bytes filled. 
*/ + private final int fillBuffer(){ + assert(open); + if(verbose){System.err.println("fillBuffer() : bstart="+bstart+", bstop="+bstop);} + if(bstart0){ +// assert(bstart>0) : bstart+", "+bstop+", "+new String(buffer); + int extra=bstop-bstart; + for(int i=0; i=Integer.MAX_VALUE-1){ + throw new RuntimeException("Minimum FASTA read length is too long: "+MIN_READ_LEN); + } + if(MIN_READ_LEN<1){ + throw new RuntimeException("Minimum FASTA read length is too short: "+MIN_READ_LEN); + } + if(SPLIT_READS){ + if(TARGET_READ_LEN<1){ + throw new RuntimeException("Target FASTA read length is too short: "+TARGET_READ_LEN); + } + if(MIN_READ_LEN>TARGET_READ_LEN){ + throw new RuntimeException("Minimum FASTA read length is longer than maximum read length: "+MIN_READ_LEN+">"+TARGET_READ_LEN); + } + } + if(MIN_READ_LEN>=Integer.MAX_VALUE-1 || MIN_READ_LEN<1){return false;} + if(SPLIT_READS && (TARGET_READ_LEN<1 || MIN_READ_LEN>TARGET_READ_LEN)){return false;} + return true; + } + + public final String name; + + private ArrayList currentList=null; + private String header=null; + + private boolean open=false; +// private boolean hitmax=false; //Indicates that the current 'read' has more pieces to come + private byte[] buffer=new byte[16384]; + private int bstart=0, bstop=0; + public InputStream ins; + + private long consumed=0; + private long nextReadID=0; + private int nextReadIndex=0; + private int currentSection=0; + + public final boolean allowSubprocess; + public final boolean interleaved; + public final boolean colorspace; + private final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + private final long MAX_DATA; + private final int maxLen, minLen; + + + public static boolean verbose=false; + private final static byte slashr='\r', slashn='\n', carrot='>'; + + public static boolean SPLIT_READS=true; + public static int TARGET_READ_LEN=500; + public static int MIN_READ_LEN=1;//40; + public static int DEFAULT_WRAP=80; + public static boolean FAKE_QUALITY=false; + public static byte 
FAKE_QUALITY_LEVEL=30; + +} diff --git a/current/stream/FastaReadInputStream2.java b/current/stream/FastaReadInputStream2.java new file mode 100755 index 0000000..fc658da --- /dev/null +++ b/current/stream/FastaReadInputStream2.java @@ -0,0 +1,317 @@ +package stream; + +import java.util.ArrayList; +import java.util.Arrays; + +import align2.Shared; + +import dna.Data; +import fileIO.ByteFile; +import fileIO.FileFormat; + +public class FastaReadInputStream2 extends ReadInputStream { + + public static void main(String[] args){ + + FastaReadInputStream2 fris=new FastaReadInputStream2(args[0], false, true); + + Read r=fris.next(); + int i=0; + while(r!=null){ + System.out.println(r.toText(false)); + r=fris.next(); + if(i++>3){break;} + } + + } + + public FastaReadInputStream2(String fname, boolean colorspace_, boolean allowSubprocess_){ +// assert(false) : "In progress"; + colorspace=colorspace_; + + FileFormat ff=FileFormat.testInput(fname, FileFormat.FASTA, null, allowSubprocess_, false); + + if(!ff.fasta() && !ff.stdio()){ + System.err.println("Warning: Did not find expected fasta file extension for filename "+fname); + } + + tf=ByteFile.makeByteFile(ff, false); + interleaved=false; + + } + + @Override + public void start() { +// if(cris!=null){new Thread(cris).start();} + } + + + @Override + public boolean hasMore() { + if(buffer==null || next>=buffer.length){ + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next=buffer.length){fillBuffer();} + Read[] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 0 : r.length); +// System.err.println(hashCode()+" produced "+r[0].numericID); + return r; + } + + @Override + public synchronized ArrayList nextList() { + return toList(nextBlock()); + } + public final boolean preferArrays(){return true;} + + private synchronized void fillBuffer(){ + if(verbose){System.err.println("Filling buffer. buffer="+(buffer==null ? 
null : buffer.length));} + assert(buffer==null || next>=buffer.length); + + buffer=null; + next=0; + + buffer=toReads(tf, BUF_LEN, nextReadID, interleaved, headerA); + + if(verbose){System.err.println("Filled buffer. buffer="+(buffer==null ? null : buffer.length));} + + nextReadID+=buffer.length; + if(buffer.length list=new ArrayList(Data.min(16384, maxReadsToReturn)); + + int added=0; + + Read prev=null; + if(verbose){System.err.println("added="+added+", max="+maxReadsToReturn);} + while(added=currentLine.length) : currentLoc+", "+currentLine.length+", "+ + TARGET_READ_LEN+", "+sb.length()+"\n"+currentLine+"\n"+sb; + currentLine=null; + currentLoc=0; + while(currentLine==null){ + currentLine=tf.nextLine(); + if(currentLine==null || currentLine[0]==carrot){ + if(sb.length()>=MIN_READ_LEN){ + if(verbose){System.err.println("Made read of length "+sb.length());} + r=makeRead(sb, numericID); + }else{ + if(verbose){System.err.println("Read was too short at length "+sb.length()+"\n"+sb);} + sb.setLength(0); + } + if(verbose){System.err.println("headerA was "+headerA[0]);} + headerA[0]=(currentLine==null ? 
null : new String(currentLine)); + currentLoc=0; + currentSection=0; +// assert(false) : "'"+new String(currentLine)+"', "+headerA[0]; + currentLine=null; + if(r!=null){ + if(verbose){System.err.println("Returning read "+r);} + return r; + } + if(headerA[0]==null){ + if(verbose){System.err.println("Returning null because tf.nextLine()==null: B");} + return null; + } + } + assert(currentLine==null || currentLine[0]!=carrot); + } + assert(currentLine==null || currentLine[0]!=carrot); + } + assert(currentLine==null || currentLine[0]!=carrot); + } + assert(currentLine==null || currentLine[0]!=carrot); + if(verbose){System.err.println("Returning null because loop exited (should be unreachable).");} + return null; + } + + private Read makeRead(StringBuilder sb, long numericID){ + byte[] quals=null; + byte[] bases=new byte[sb.length()]; + if(FAKE_QUALITY){ + quals=new byte[sb.length()]; + Arrays.fill(quals, (byte)(30)); + } + for(int i=0; i0 ? headerA[0].substring(1)+"_"+currentSection : new String(headerA[0].substring(1))); +// assert(currentSection==0); + Read r=new Read(bases, (byte)0, (byte)0, 0, 0, hd, quals, colorspace, numericID); + return r; + } + + public boolean close(){ + return tf.close(); + } + + @Override + public synchronized void restart() { + generated=0; + consumed=0; + next=0; + nextReadID=0; + buffer=null; + + currentLine=null; + currentLoc=0; + currentSection=0; + finished=false; + + tf.reset(); + } + + @Override + public boolean paired() { + return interleaved; + } + + private Read[] buffer=null; + private int next=0; + + private final ByteFile tf; + private final boolean interleaved; + + public static final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + + public long generated=0; + public long consumed=0; + private long nextReadID=0; + + public final boolean colorspace; + private final String[] headerA=new String[1]; + + public static boolean SPLIT_READS=true; + public static int TARGET_READ_LEN=500; + public static int MIN_READ_LEN=40; + public static 
int DEFAULT_WRAP=100; + + public static boolean verbose=false; + public static boolean FAKE_QUALITY=false; + + private byte[] currentLine=null; + private int currentLoc=0; + private int currentSection=0; + private boolean finished=false; + private final byte carrot='>'; + +} diff --git a/current/stream/FastaReadInputStream_old.java b/current/stream/FastaReadInputStream_old.java new file mode 100755 index 0000000..b5e8d84 --- /dev/null +++ b/current/stream/FastaReadInputStream_old.java @@ -0,0 +1,303 @@ +package stream; + +import java.util.ArrayList; +import java.util.Arrays; + +import align2.Shared; + +import dna.Data; +import fileIO.TextFile; + +public class FastaReadInputStream_old extends ReadInputStream { + + public static void main(String[] args){ + + FastaReadInputStream_old fris=new FastaReadInputStream_old(args[0], false); + + Read r=fris.next(); + int i=0; + while(r!=null){ + System.out.println(r.toText(false)); + r=fris.next(); + if(i++>3){break;} + } + + } + + public FastaReadInputStream_old(String fname, boolean colorspace_){ +// assert(false) : "In progress"; + colorspace=colorspace_; + + if(!fileIO.FileFormat.hasFastaExtension(fname)){ + System.err.println("Warning: Did not find expected fasta file extension for filename "+fname); + } + + tf=new TextFile(fname, false, false); + interleaved=false; + + } + + @Override + public void start() { +// if(cris!=null){new Thread(cris).start();} + } + + + @Override + public boolean hasMore() { + if(buffer==null || next>=buffer.length){ + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next=buffer.length){fillBuffer();} + Read[] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 
0 : r.length); +// System.err.println(hashCode()+" produced "+r[0].numericID); + return r; + } + + @Override + public synchronized ArrayList nextList() { + return toList(nextBlock()); + } + public final boolean preferArrays(){return true;} + + private synchronized void fillBuffer(){ + if(verbose){System.err.println("Filling buffer. buffer="+(buffer==null ? null : buffer.length));} + assert(buffer==null || next>=buffer.length); + + buffer=null; + next=0; + + buffer=toReads(tf, BUF_LEN, nextReadID, interleaved, headerA); + + if(verbose){System.err.println("Filled buffer. buffer="+(buffer==null ? null : buffer.length));} + + nextReadID+=buffer.length; + if(buffer.length list=new ArrayList(Data.min(16384, maxReadsToReturn)); + + int added=0; + + Read prev=null; + + while(added'){ + headerA[0]=currentLine; + currentLoc=0; + currentSection=0; + currentLine=null; + } + } + + assert(currentLine==null || currentLine.charAt(0)!='>'); + + StringBuilder sb=new StringBuilder(); + Read r=null; + while(r==null){ + if(!SPLIT_READS || (currentLoc==0 && (currentLine.length()<=(TARGET_READ_LEN-sb.length())))){ + sb.append(currentLine); + currentLoc=currentLine.length(); + }else{ + while(sb.length()=currentLine.length()) : currentLoc+", "+currentLine.length()+", "+ + TARGET_READ_LEN+", "+sb.length()+"\n"+currentLine+"\n"+sb; + currentLine=null; + currentLoc=0; + while(currentLine==null){ + currentLine=tf.nextLine(); + if(currentLine==null || currentLine.charAt(0)=='>'){ + if(sb.length()>=MIN_READ_LEN){ + r=makeRead(sb, numericID); + }else{ + sb.setLength(0); + } + headerA[0]=currentLine; + currentLoc=0; + currentSection=0; + currentLine=null; + if(r!=null){return r;} + if(headerA[0]==null){ + if(verbose){System.err.println("Returning null because tf.nextLine()==null: B");} + return null; + } + } + assert(currentLine==null || currentLine.charAt(0)!='>'); + } + assert(currentLine==null || currentLine.charAt(0)!='>'); + } + assert(currentLine==null || currentLine.charAt(0)!='>'); + } + 
assert(currentLine==null || currentLine.charAt(0)!='>'); + if(verbose){System.err.println("Returning null because loop exited (should be unreachable).");} + return null; + } + + private Read makeRead(StringBuilder sb, long numericID){ + byte[] quals=null; + byte[] bases=new byte[sb.length()]; + if(FAKE_QUALITY){ + quals=new byte[sb.length()]; + Arrays.fill(quals, (byte)(30)); + } + for(int i=0; i0 ? headerA[0].substring(1)+"_"+currentSection : new String(headerA[0].substring(1))); + Read r=new Read(bases, (byte)0, (byte)0, 0, 0, hd, quals, colorspace, numericID); + return r; + } + + public boolean close(){ + return tf.close(); + } + + @Override + public synchronized void restart() { + generated=0; + consumed=0; + next=0; + nextReadID=0; + buffer=null; + + currentLine=null; + currentLoc=0; + currentSection=0; + finished=false; + + tf.reset(); + } + + @Override + public boolean paired() { + return interleaved; + } + + private Read[] buffer=null; + private int next=0; + + private final TextFile tf; + private final boolean interleaved; + + public static final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + + public long generated=0; + public long consumed=0; + private long nextReadID=0; + + public final boolean colorspace; + private final String[] headerA=new String[1]; + + public static boolean SPLIT_READS=true; + public static int TARGET_READ_LEN=500; + public static int MIN_READ_LEN=40; + public static int DEFAULT_WRAP=100; + + public static boolean verbose=false; + public static boolean FAKE_QUALITY=false; + + private String currentLine=null; + private int currentLoc=0; + private int currentSection=0; + private boolean finished=false; + private final byte carrot='>'; + +} diff --git a/current/stream/FastaStream.java b/current/stream/FastaStream.java new file mode 100755 index 0000000..966e1d8 --- /dev/null +++ b/current/stream/FastaStream.java @@ -0,0 +1,157 @@ +package stream; + +import java.util.ArrayList; +import java.util.Arrays; + +import align2.Shared; +import 
align2.Tools; + +import dna.Data; +import fileIO.TextFile; + +public class FastaStream { + + public static void main(String[] args){ + FastaStream qs=new FastaStream(args[0]); + for(int i=0; i<9000; i++){ + byte[][] next=qs.next(); + System.out.println(new String(next[0])); + System.out.println(new String(next[1])); + System.out.println(); + } + } + + public FastaStream(String fname){ + + if(!fileIO.FileFormat.hasFastaExtension(fname) && !fname.startsWith("stdin")){ + System.err.println("Warning: Did not find expected fasta file extension for filename "+fname); + } + tf=new TextFile(fname, false, false); + + if(SKIP_INITIAL_READS){ + while(consumed=buffer.length){ +// System.err.println("Attempting to fill buffer."); + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next=buffer.length){fillBuffer();} + byte[][][] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 0 : r.length); + return r; + } + + private synchronized void fillBuffer(){ + //System.err.println("Calling fillBuffer()"); + + assert(buffer==null || next>=buffer.length); + + buffer=null; + next=0; + + buffer=toPairs(tf, BUF_LEN); + if(buffer.length list=new ArrayList(Tools.min(16384, maxToReturn)); + + String[] pair=new String[2]; + + int cntr=0; + int added=0; + final byte dot='.'; + final byte N='N'; + + for(s=tf.nextLine(); s!=null && added")); + + //TODO Note: These assertions are only for SOLiD colorspace. + assert(pair[1].charAt(0)=='T' || pair[1].charAt(0)=='G' || + pair[1].charAt(0)=='.' || Character.isDigit(pair[1].charAt(0))) : pair[1]; + assert(pair[1].charAt(1)=='.' 
|| Character.isDigit(pair[1].charAt(1))) : pair[1]; + + byte[][] fixed=new byte[2][]; + fixed[0]=Arrays.copyOfRange(pair[0].getBytes(), 1, pair[0].length()); + fixed[1]=pair[1].getBytes(); + for(int i=0; i=maxToReturn){break;} + +// System.out.println(r.chrom+", "+r.strand+", "+r.loc); +// assert(false); + } + } + +// for(int i=0; i<12000; i++){tf.nextLine();} + + assert(list.size()<=maxToReturn); + return list.toArray(new byte[0][][]); + } + + public void close(){ + tf.close(); + } + + private byte[][][] buffer; + + private int next=0; + + private final TextFile tf; + + public static final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + + public long generated=0; + public long consumed=0; + + public static boolean SKIP_INITIAL_READS=QualStream.SKIP_INITIAL_READS; + public static final int INITIAL_READS_TO_SKIP=QualStream.INITIAL_READS_TO_SKIP; + +} diff --git a/current/stream/FastqReadInputStream.java b/current/stream/FastqReadInputStream.java new file mode 100755 index 0000000..eafea96 --- /dev/null +++ b/current/stream/FastqReadInputStream.java @@ -0,0 +1,178 @@ +package stream; + +import java.util.ArrayList; + +import align2.Shared; + +import dna.Data; +import fileIO.ByteFile; +import fileIO.FileFormat; + +public class FastqReadInputStream extends ReadInputStream { + + public static void main(String[] args){ + + FASTQ.PARSE_CUSTOM=false; + + FastqReadInputStream fris=new FastqReadInputStream(args[0], false, true); + + Read r=fris.next(); + System.out.println(r.toText(false)); + + } + + public FastqReadInputStream(String fname, boolean colorspace_, boolean allowSubprocess_){ + this(FileFormat.testInput(fname, FileFormat.FASTQ, null, allowSubprocess_, false), colorspace_); + } + + + public FastqReadInputStream(FileFormat ff, boolean colorspace_){ + if(verbose){System.err.println("FastqReadInputStream("+ff+", "+colorspace_+")");} + + colorspace=colorspace_; + + stdin=ff.stdio(); + if(!ff.fastq()){ + System.err.println("Warning: Did not find expected fastq file extension 
for filename "+ff.name()); + } + + if(FASTQ.PARSE_CUSTOM){ + try { + String s[]=ff.name().split("_"); +// maxSnps=toNumber(s[3]); +// maxInss=toNumber(s[4]); +// maxDels=toNumber(s[5]); +// maxSubs=toNumber(s[6]); + +// s=s[8].split("\\."); +// +// s=s[0].split("-"); + + if(s.length!=8 && s.length!=9){ + if(Data.WINDOWS){System.err.println("Note: Filename indicates non-synthetic data, but FASTQ.PARSE_CUSTOM="+FASTQ.PARSE_CUSTOM);} + } + +// minChrom=Gene.toChromosome(s[0]); +// maxChrom=Gene.toChromosome(s[1]); + + } catch (Exception e) { + // TODO Auto-generated catch block + // e.printStackTrace(); + if(Data.WINDOWS){System.err.println("Note: Filename indicates non-synthetic data, but FASTQ.PARSE_CUSTOM="+FASTQ.PARSE_CUSTOM);} + } + } + + tf=ByteFile.makeByteFile(ff, false); + interleaved=((tf.is()==System.in || stdin) ? FASTQ.FORCE_INTERLEAVED : FASTQ.isInterleaved(tf.name())); +// assert(false) : interleaved; + } + + @Override + public void start() { +// if(cris!=null){new Thread(cris).start();} + } + + + @Override + public boolean hasMore() { + if(buffer==null || next>=buffer.size()){ + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next x=nextList(); + if(x==null){return null;} + return x.toArray(new Read[x.size()]); + } + + @Override + public synchronized ArrayList nextList() { + if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");} + if(buffer==null || next>=buffer.size()){fillBuffer();} + ArrayList list=buffer; + buffer=null; + if(list!=null && list.size()==0){list=null;} + consumed+=(list==null ? 
0 : list.size()); +// System.err.println(hashCode()+" produced "+r[0].numericID); + return list; + } + public final boolean preferArrays(){return false;} + + private synchronized void fillBuffer(){ + + assert(buffer==null || next>=buffer.size()); + + buffer=null; + next=0; + + buffer=FASTQ.toReadList(tf, BUF_LEN, colorspace, nextReadID, interleaved); + int bsize=(buffer==null ? 0 : buffer.size()); + nextReadID+=bsize; + if(bsize=buffer.length){fillBuffer();} + Read[] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 0 : r.length); +// System.err.println(hashCode()+" produced "+r[0].numericID); + return r; + } + + @Override + public synchronized ArrayList nextList() { + return toList(nextBlock()); + } + public final boolean preferArrays(){return true;} + + private synchronized void fillBuffer(){ + + assert(buffer==null || next>=buffer.length); + + buffer=null; + next=0; + + buffer=FASTQ.toReads(tf, BUF_LEN, colorspace, nextReadID, interleaved); + nextReadID+=buffer.length; + if(buffer.length=buffer.length){ +// System.err.println("Attempting to fill buffer."); + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next=buffer.length){fillBuffer();} + byte[][][] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 
0 : r.length); + return r; + } + + private synchronized void fillBuffer(){ + //System.err.println("Calling fillBuffer()"); + + assert(buffer==null || next>=buffer.length); + + buffer=null; + next=0; + + buffer=toPairs(tf, BUF_LEN); + if(buffer.length list=new ArrayList(Tools.min(16384, maxToReturn)); + + String[] pair=new String[2]; + + int cntr=0; + int added=0; + + for(s=tf.nextLine(); s!=null && added")); + assert(pair[1].startsWith("-") || Character.isDigit(pair[1].charAt(0))); + +// int spaces=0; +// for(int i=0; i=maxToReturn){break;} + +// System.out.println(r.chrom+", "+r.strand+", "+r.loc); +// assert(false); + } + } + +// for(int i=0; i<12000; i++){tf.nextLine();} + + assert(list.size()<=maxToReturn); + return list.toArray(new byte[0][][]); + } + + /** TODO Speed up by removing the "split" call. */ + public static byte[] toNumbers(String s){ + //System.err.println("Calling toNumbers()"); + String[] split=s.split(" "); + byte[] out=new byte[split.length]; + for(int i=0; i list=rtis.nextList(); + while(list!=null){ + for(Read r : list){ + System.out.println(r.toText(true)); + } + list=rtis.nextList(); + } + } + + public RTextInputStream(FileFormat ff1, FileFormat ff2, long crisReadLimit){ + this(ff1.name(), (ff2==null ? null : ff2.name()), crisReadLimit); + } + + public RTextInputStream(String fname1, String fname2, long crisReadLimit){ + this(new String[] {fname1}, (fname2==null || "null".equalsIgnoreCase(fname2)) ? 
null : new String[] {fname2}, crisReadLimit); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + } + public RTextInputStream(String[] fnames_, long crisReadLimit){this(fnames_, null, crisReadLimit);} + + public RTextInputStream(String[] fnames_, String[] mate_fnames_, long crisReadLimit){ + fnames=fnames_; + textfiles=new TextFile[fnames.length]; + for(int i=0; i list=readList(); +// if(list==null || list.size()==0){return null;} +// return list.toArray(new Read[list.size()]); +// } + + @Override + public synchronized ArrayList nextList(){ +// System.out.println((mateStream==null ? "F5: " : "F3: ")+"Grabbing a list: finished="+finished); + if(finished){return null;} + return readList(); + } + public final boolean preferArrays(){return false;} + + private synchronized ArrayList readList(){ + assert(buffer==null); +// System.out.println((mateStream==null ? "F5: " : "F3: ")+" Entering readList"); + if(finished){return null;} + + ArrayList merged=getListFromFile(textfiles[0]); + + if(textfiles.length>1){ + ArrayList[] temp=new ArrayList[textfiles.length]; + temp[0]=merged; + for(int i=0; i getListFromFile(TextFile tf){ + + int len=READS_PER_LIST; + if(readLimit-readCount list=new ArrayList(len); + + for(int i=0; i0 == r.mapScore>0) : r.toText(false); + if(interleaved){ + s=tf.nextLine(); + assert(s!=null) : "Odd number of reads in interleaved file "+tf.name; + if(s!=null){ + Read r2=Read.fromText(s); + assert(r2.numericID==r.numericID) : "Different numeric IDs for paired reads in interleaved file "+tf.name; + r2.numericID=r.numericID; + r2.mate=r; + r.mate=r2; + } + } + list.add(r); + } + readCount+=list.size(); + + if(list.size() buffer=null; + private int next=0; + + private long readCount; + private final long readLimit; + private final boolean interleaved; + + public static final int READS_PER_LIST=Shared.READ_BUFFER_LENGTH; + + private final RTextInputStream mateStream; + private final ConcurrentReadInputStream cris; + 
public static boolean USE_CRIS=true; //Doubles read speed for zipped paired files + + @Override + /** This is optimistic and may return "true" incorrectly. */ + public boolean hasMore() { + if(buffer!=null && next=buffer.size()){ + buffer=null; + next=0; + if(!finished){ + buffer=nextList(); + } + } + + if(buffer==null || next>=buffer.size()){ + assert(finished); + return null; + } + Read r=buffer.get(next); + buffer.set(next, null); + next++; + return r; + } + + + @Override + public synchronized void restart() { + finished=false; + next=0; + buffer=null; + for(TextFile tf : textfiles){tf.reset();} + if(cris!=null){ + cris.restart(); + new Thread(cris).start(); + }else if(mateStream!=null){mateStream.restart();} + } + + @Override + public synchronized boolean close() { + boolean error=false; + for(TextFile tf : textfiles){error|=tf.close();} + if(cris!=null){ + error|=ReadWrite.closeStream(cris);; + }else if(mateStream!=null){ + mateStream.close(); + error|=mateStream.errorState(); + } + return error; + } + +} diff --git a/current/stream/RTextOutputStream3.java b/current/stream/RTextOutputStream3.java new file mode 100755 index 0000000..b115712 --- /dev/null +++ b/current/stream/RTextOutputStream3.java @@ -0,0 +1,351 @@ +package stream; + +import java.io.File; +import java.lang.Thread.State; +import java.util.ArrayList; +import java.util.HashMap; + +import fileIO.FileFormat; + +public class RTextOutputStream3 { + + public RTextOutputStream3(String fname, String mate_fname, int maxSize, + boolean ordered, boolean sam, boolean bam, boolean fastq, boolean fasta, boolean attachment, boolean overwrite, boolean sitesonly){ + this(fname, mate_fname, null, null, maxSize, ordered, sam, bam, fastq, fasta, attachment, overwrite, sitesonly, false); + } + + public RTextOutputStream3(String fname, String mate_fname, String qfname, String mate_qfname, int maxSize, + boolean ordered, boolean sam, boolean bam, boolean fastq, boolean fasta, boolean attachment, boolean overwrite, 
boolean sitesonly, boolean useSharedHeader){ +// System.err.println("Called RTextOutputStream3 with fname="+fname+", mate_fname="+mate_fname+", qfname="+qfname+", mate_qfname="+mate_qfname); + STANDARD_OUT=("standardout".equalsIgnoreCase(fname) || fname.toLowerCase().startsWith("standardout.") + || "stdout".equalsIgnoreCase(fname) || fname.toLowerCase().startsWith("stdout.")); + + if(verbose){ + System.err.println("RTextOutputStream3("+fname+", "+mate_fname+", "+qfname+", "+mate_qfname+", "+maxSize+", "+ordered+")"); + } + + ORDERED=ordered; + ATTACHMENT=attachment; + SAM=!ATTACHMENT && (sam || bam); + BAM=!ATTACHMENT && bam; + FASTQ=!ATTACHMENT && fastq; + FASTA=!ATTACHMENT && fasta; + SITESONLY=!ATTACHMENT && sitesonly; + + assert(((SAM ? 1 : 0)+(FASTQ ? 1 : 0)+(FASTA ? 1 : 0)+(ATTACHMENT ? 1 : 0)+(SITESONLY ? 1 : 0))<=1) : + SAM+", "+SITESONLY+", "+FASTQ+", "+FASTA+", "+ATTACHMENT; + + if(fname!=null && !fname.equals("/dev/null")){ + File f=new File(fname); + assert(overwrite || !f.exists()) : f.getAbsolutePath()+" already exists; please delete it."; + if(mate_fname!=null){assert(!fname.equals(mate_fname)) : fname+"=="+mate_fname;} + } + + boolean allowSubprocess=true; //TODO - make parameter + + if(BYTE_WRITER){ + readstream1=new ReadStreamByteWriter(fname, qfname, true, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, STANDARD_OUT, useSharedHeader, allowSubprocess); + readstream2=STANDARD_OUT ? null : ((SAM || SITESONLY || mate_fname==null) ? null : + new ReadStreamByteWriter(mate_fname, mate_qfname, false, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, false, useSharedHeader, allowSubprocess)); + }else{ + readstream1=new ReadStreamStringWriter(fname, qfname, true, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, STANDARD_OUT, useSharedHeader, allowSubprocess); + readstream2=STANDARD_OUT ? null : ((SAM || SITESONLY || mate_fname==null) ? 
null : + new ReadStreamStringWriter(mate_fname, mate_qfname, false, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, false, useSharedHeader, allowSubprocess)); + } + + if(readstream2==null && readstream1!=null){ +// System.out.println("RTextOutputStream3 detected interleaved output."); + readstream1.OUTPUT_INTERLEAVED=true; + } + + table=(ORDERED ? new HashMap>(MAX_CAPACITY) : null); + + assert(readstream1==null || readstream1.read1==true); + assert(readstream2==null || (readstream2.read1==false)); +// assert(false) : ATTACHMENT; + } + + public RTextOutputStream3(FileFormat ff1, FileFormat ff2, int maxSize, CharSequence header, boolean useSharedHeader){ + this(ff1, ff2, null, null, maxSize, header, useSharedHeader); + } + + public RTextOutputStream3(FileFormat ff1, FileFormat ff2, String qf1, String qf2, int maxSize, CharSequence header, boolean useSharedHeader){ + + if(verbose){ + System.err.println("RTextOutputStream3("+ff1+", "+ff2+", "+qf1+", "+qf2+", "+maxSize+", "+useSharedHeader+")"); + } + + assert(ff1!=null); + assert(!ff1.text() && !ff1.unknownFormat()) : "Unknown format for "+ff1; + + FASTA=ff1.fasta(); +// boolean bread=(ext==TestFormat.txt); + SAM=(ff1.sam() || ff1.bam()); + BAM=ff1.bam(); + ATTACHMENT=ff1.attachment(); + SITESONLY=ff1.sites(); + FASTQ=ff1.fastq(); + STANDARD_OUT=ff1.stdio(); + + assert(((SAM ? 1 : 0)+(FASTQ ? 1 : 0)+(FASTA ? 1 : 0)+(ATTACHMENT ? 1 : 0)+(SITESONLY ? 1 : 0))<=1) : + SAM+", "+SITESONLY+", "+FASTQ+", "+FASTA+", "+ATTACHMENT; + + ORDERED=ff1.ordered(); + if(ff1.hasName() && ff1.devnull()){ + File f=new File(ff1.name()); + assert(ff1.overwrite() || !f.exists()) : f.getAbsolutePath()+" already exists; please delete it."; + if(ff2!=null){assert(!ff1.name().equals(ff2.name())) : ff1.name()+"=="+ff2.name();} + } + + if(BYTE_WRITER){ + readstream1=new ReadStreamByteWriter(ff1, qf1, true, maxSize, header, useSharedHeader); + readstream2=ff1.stdio() || ff2==null ? 
null : new ReadStreamByteWriter(ff2, qf2, false, maxSize, header, useSharedHeader); + +// readstream1=new ReadStreamByteWriter(ff1, qfname, true, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, STANDARD_OUT, useSharedHeader, ff.allowSubprocess()); +// readstream2=STANDARD_OUT ? null : ((SAM || SITESONLY || mate_fname==null) ? null : +// new ReadStreamByteWriter(mate_fname, mate_qfname, false, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, false, useSharedHeader, ff.allowSubprocess())); + }else{ + readstream1=new ReadStreamStringWriter(ff1, qf1, true, maxSize, header, useSharedHeader); + readstream2=ff1.stdio() || ff2==null ? null : new ReadStreamStringWriter(ff2, qf2, false, maxSize, header, useSharedHeader); + +// readstream1=new ReadStreamStringWriter(fname, qfname, true, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, STANDARD_OUT, useSharedHeader, ff.allowSubprocess()); +// readstream2=STANDARD_OUT ? null : ((SAM || SITESONLY || mate_fname==null) ? null : +// new ReadStreamStringWriter(mate_fname, mate_qfname, false, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, false, useSharedHeader, ff.allowSubprocess())); + } + + if(readstream2==null && readstream1!=null){ +// System.out.println("RTextOutputStream3 detected interleaved output."); + readstream1.OUTPUT_INTERLEAVED=true; + } + + table=(ORDERED ? 
new HashMap>(MAX_CAPACITY) : null); + + assert(readstream1==null || readstream1.read1==true); + assert(readstream2==null || (readstream2.read1==false)); + } + +// public RTextOutputStream3(String fname, String mate_fname, int maxSize, +// boolean ordered, int[] exts, CharSequence header, boolean overwrite, boolean useSharedHeader){ +// this(fname, mate_fname, null, null, maxSize, ordered, exts, header, overwrite, useSharedHeader); +// } +// +// public RTextOutputStream3(String fname, String mate_fname, String qfname, String mate_qfname, int maxSize, +// boolean ordered, int[] exts, CharSequence header, boolean overwrite, boolean useSharedHeader){ +// +// if(verbose){ +// System.err.println("RTextOutputStream3("+fname+", "+mate_fname+", "+qfname+", "+mate_qfname+", "+maxSize+", "+ordered+")"); +// } +// +// if(exts==null){exts=FileFormat.testFormat(fname, false);} +// int ext=exts[0], zip=exts[1], itype=exts[2]; +// FASTA=ff.fasta(); +//// boolean bread=(ext==TestFormat.txt); +// SAM=ff.samOrBam(); +// BAM=ff.bam(); +// ATTACHMENT=ff.attachment(); +// SITESONLY=ff.sites(); +// FASTQ=(ff.fastq() || (ext==FileFormat.UNKNOWN && !SAM && !ATTACHMENT && !SITESONLY)); +// STANDARD_OUT=ff.stdio(); +// +// assert(((SAM ? 1 : 0)+(FASTQ ? 1 : 0)+(FASTA ? 1 : 0)+(ATTACHMENT ? 1 : 0)+(SITESONLY ? 1 : 0))<=1) : +// SAM+", "+SITESONLY+", "+FASTQ+", "+FASTA+", "+ATTACHMENT; +// +// ORDERED=ordered; +// if(fname!=null && !fname.equals("/dev/null")){ +// File f=new File(fname); +// assert(overwrite || !f.exists()) : f.getAbsolutePath()+" already exists; please delete it."; +// if(mate_fname!=null){assert(!fname.equals(mate_fname)) : fname+"=="+mate_fname;} +// } +// +// boolean allowSubprocess=true; //TODO - make parameter +// +// if(BYTE_WRITER){ +// readstream1=new ReadStreamByteWriter(fname, qfname, true, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, STANDARD_OUT, useSharedHeader, allowSubprocess); +// readstream2=STANDARD_OUT ? 
null : ((SAM || SITESONLY || mate_fname==null) ? null : +// new ReadStreamByteWriter(mate_fname, mate_qfname, false, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, false, useSharedHeader, allowSubprocess)); +// }else{ +// readstream1=new ReadStreamStringWriter(fname, qfname, true, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, STANDARD_OUT, useSharedHeader, allowSubprocess); +// readstream2=STANDARD_OUT ? null : ((SAM || SITESONLY || mate_fname==null) ? null : +// new ReadStreamStringWriter(mate_fname, mate_qfname, false, maxSize, SAM, BAM, FASTQ, FASTA, SITESONLY, ATTACHMENT, false, useSharedHeader, allowSubprocess)); +// } +// +// if(readstream2==null && readstream1!=null){ +//// System.out.println("RTextOutputStream3 detected interleaved output."); +// readstream1.OUTPUT_INTERLEAVED=true; +// } +// +// table=(ORDERED ? new HashMap>(MAX_CAPACITY) : null); +// +// assert(readstream1==null || readstream1.read1==true); +// assert(readstream2==null || (readstream2.read1==false)); +// } + + public synchronized void add(ArrayList list, long listnum){ + + if(ORDERED){ + int size=table.size(); +// System.err.print(size+", "); + final boolean flag=(size>=HALF_LIMIT); + if(listnum>nextListID && size>=ADD_LIMIT){ + System.err.println("Output buffer became full; key "+listnum+" waiting on "+nextListID+"."); + while(listnum>nextListID && size>=HALF_LIMIT){ + try { + this.wait(20000); + } catch (InterruptedException e) { + e.printStackTrace(); + } + size=table.size(); + } + System.err.println("Output buffer became clear for key "+listnum+"; next="+nextListID+", size="+size); + } + addOrdered(list, listnum); + assert(listnum!=nextListID); + if(flag && listnum list, long listnum){ +// System.err.println("RTOS got "+listnum+" of size "+(list==null ? "null" : list.size())+ +// " with first read id "+(list==null || list.isEmpty() || list.get(0)==null ? 
"null" : ""+list.get(0).numericID)); + assert(list!=null) : listnum; + assert(listnum>=nextListID) : listnum+", "+nextListID; +// assert(list.isEmpty() || list.get(0)==null || list.get(0).numericID>=nextReadID) : list.get(0).numericID+", "+nextReadID; + assert(!table.containsKey(listnum)); + + table.put(listnum, new ArrayList(list)); + + while(table.containsKey(nextListID)){ +// System.err.println("Writing list "+first.get(0).numericID); + ArrayList value=table.remove(nextListID); + write(value); + nextListID++; + } + if(table.isEmpty()){notifyAll();} + } + + private synchronized void addDisordered(ArrayList list, long listnum){ + assert(list!=null); + assert(table==null); + write(new ArrayList(list)); + } + + private synchronized void write(ArrayList list){ + if(readstream1!=null){ + if(readstream1.getState()==State.TERMINATED){throw new RuntimeException("Writing to a terminated thread.");} + readstream1.addList(list); + } + if(readstream2!=null){ + if(readstream1.getState()==State.TERMINATED){throw new RuntimeException("Writing to a terminated thread.");} + readstream2.addList(list); + } + } + + public synchronized void close(){ + + assert(table==null || table.isEmpty()); //Seems like a race condition. Probably, I should wait at this point until the condition is true before proceeding. 
+ +// readstream1.addList(null); +// if(readstream2!=null){readstream2.addList(null);} + readstream1.poison(); + if(readstream2!=null){readstream2.poison();} + } + + public synchronized void start(){ + if(started){ + System.err.println("Resetting output stream."); + nextListID=0; + throw new RuntimeException(); + }else{ + started=true; + if(readstream1!=null){readstream1.start();} + if(readstream2!=null){readstream2.start();} + } + } + + public void join(){ + while(readstream1!=null && readstream1.getState()!=Thread.State.TERMINATED){ + try { + readstream1.join(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + while(readstream2!=null && readstream2.getState()!=Thread.State.TERMINATED){ + try { + if(readstream2!=null){readstream2.join();} + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + assert(table==null || table.isEmpty()); + finishedSuccessfully=true; + } + + public synchronized void resetNextListID(){ + for(int i=0; i<2000 && !table.isEmpty(); i++){ + try {this.wait(2000);} + catch (InterruptedException e) {e.printStackTrace();} + } + if(!table.isEmpty()){ + System.err.println("WARNING! resetNextListID() waited a long time and the table never cleared. 
Process may have stalled."); + } + while(!table.isEmpty()){ + try {this.wait(2000);} + catch (InterruptedException e) {e.printStackTrace();} + } + nextListID=0; + } + + public final String fname(){ +// if(STANDARD_OUT){return "stdout";} + return readstream1.fname(); + } + + /** Return true if this stream has detected an error */ + public boolean errorState(){ + return errorState || (readstream1!=null && readstream1.errorState()) || (readstream2!=null && readstream2.errorState()); + } + + public boolean finishedSuccessfully(){ + return finishedSuccessfully && (readstream1==null || readstream1.finishedSuccessfully()) && (readstream2==null || readstream2.finishedSuccessfully()); + } + + private boolean errorState=false; + private boolean finishedSuccessfully=false; + + public final boolean ORDERED; + public final boolean SAM; + public final boolean BAM; + public final boolean FASTQ; + public final boolean FASTA; + public final boolean ATTACHMENT; + public final boolean SITESONLY; + + public final boolean STANDARD_OUT; + + public final ReadStreamWriter getRS1(){return readstream1;} + public final ReadStreamWriter getRS2(){return readstream2;} + + private final ReadStreamWriter readstream1; + private final ReadStreamWriter readstream2; + private long nextListID=0; + private boolean started=false; + + /** Number of lists held before the stream blocks */ + private final int MAX_CAPACITY=256; + private final int ADD_LIMIT=MAX_CAPACITY-2; + private final int HALF_LIMIT=ADD_LIMIT/2; + + /** For ordered output */ + private final HashMap> table; + + {if(HALF_LIMIT<1){throw new RuntimeException("Capacity too low.");}} + + public static boolean BYTE_WRITER=true; + public static boolean verbose=false; + +} diff --git a/current/stream/RandomReadInputStream.java b/current/stream/RandomReadInputStream.java new file mode 100755 index 0000000..5e833cb --- /dev/null +++ b/current/stream/RandomReadInputStream.java @@ -0,0 +1,160 @@ +package stream; + +import java.util.ArrayList; + 
+import align2.RandomReads; +import align2.Shared; +import align2.Tools; + +public class RandomReadInputStream extends ReadInputStream { + + public RandomReadInputStream(long number_, int readlen_, + int maxSnps_, int maxInss_, int maxDels_, int maxSubs_, + float snpRate_, float insRate_, float delRate_, float subRate_, + int maxInsertionLen_, int maxDeletionLen_, int maxSubLen_, + int minChrom_, int maxChrom_, boolean colorspace_, boolean paired_, + int minQual_, int midQual_, int maxQual_){ + + number=number_; + readlen=readlen_; + + maxInsertionLen=maxInsertionLen_; + maxSubLen=maxSubLen_; + maxDeletionLen=maxDeletionLen_; + + minChrom=minChrom_; + maxChrom=maxChrom_; + + maxSnps=maxSnps_; + maxInss=maxInss_; + maxDels=maxDels_; + maxSubs=maxSubs_; + + snpRate=snpRate_; + insRate=insRate_; + delRate=delRate_; + subRate=subRate_; + + colorspace=colorspace_; + paired=paired_; + + minQual=(byte) minQual_; + midQual=(byte) midQual_; + maxQual=(byte) maxQual_; + + restart(); + } + + @Override + public void start() {} + + + @Override + public boolean hasMore() { + return number>consumed; + } + + @Override + public Read next() { + if(consumed>=number){return null;} + if(buffer==null || next>=buffer.length){fillBuffer();} + Read r=buffer[next]; + buffer[next]=null; + next++; + consumed++; + return r; + } + + @Override + public synchronized Read[] nextBlock() { + if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");} + if(consumed>=number){return null;} + if(buffer==null || next>=buffer.length){fillBuffer();} + Read[] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 
0 : r.length); + return r; + } + + @Override + public synchronized ArrayList nextList() { + return toList(nextBlock()); + } + public final boolean preferArrays(){return true;} + + private synchronized void fillBuffer(){ + buffer=null; + next=0; + + long toMake=number-generated; + if(toMake<1){return;} + toMake=Tools.min(toMake, BUF_LEN); + + Read[] reads=rr.makeRandomReadsX((int)toMake, readlen, + maxSnps, maxInss, maxDels, maxSubs, + snpRate, insRate, delRate, subRate, + maxInsertionLen, maxDeletionLen, maxSubLen, + minChrom, maxChrom, colorspace, + minQual, midQual, maxQual); + + generated+=reads.length; + assert(generated<=number); + buffer=reads; + } + + public synchronized void restart(){ + next=0; + buffer=null; + consumed=0; + generated=0; + rr=new RandomReads(1, paired); + rr.mateLen=readlen; + } + + @Override + public boolean close() {return false;} + + @Override + public boolean paired() { + return paired; + } + + private Read[] buffer=null; + private int next=0; + + public static final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + + public long generated=0; + public long consumed=0; + + public long number=100000; + public int readlen=50; + + public int maxInsertionLen=6; + public int maxSubLen=6; + public int maxDeletionLen=100; + + public int minChrom=1; + public int maxChrom=22; + + public int maxSnps=2; + public int maxInss=2; + public int maxDels=2; + public int maxSubs=2; + + public float snpRate=0.3f; + public float insRate=0.15f; + public float delRate=0.15f; + public float subRate=0.10f; + + public final boolean colorspace; + public final boolean paired; + + public final byte minQual; + public final byte midQual; + public final byte maxQual; + + private RandomReads rr; + +} diff --git a/current/stream/Read.java b/current/stream/Read.java new file mode 100755 index 0000000..b5b280f --- /dev/null +++ b/current/stream/Read.java @@ -0,0 +1,2522 @@ +package stream; + +import java.util.ArrayList; +import java.util.Arrays; + +import align2.GapTools; +import 
align2.QualityTools; +import align2.Shared; +import align2.Tools; + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.Data; +import dna.Gene; + +public final class Read implements Comparable, Cloneable{ + + public static void main(String[] args){ + byte[] a=args[0].getBytes(); + System.out.println(new String(a)); + byte[] b=toShortMatchString(a); + System.out.println(new String(b)); + byte[] c=toLongMatchString(b); + System.out.println(new String(c)); + byte[] d=toLongMatchString(c); + System.out.println(new String(d)); +// byte[] e=toShortMatchString(b); +// System.out.println(new String(e)); + + } + + public Read(byte[] bases_, byte[] quals_, long id_){ + this(bases_, -1, (byte)0, 0, 0, Long.toString(id_), quals_, false, id_); + } + + public Read(byte[] s_, int chrom_, byte strand_, int start_, int stop_, long id_, byte[] quality_, boolean cs_){ + this(s_, chrom_, strand_, start_, stop_, Long.toString(id_), quality_, cs_, id_); + } + + public Read(byte[][] fasta_, byte[][] qual_, boolean cs_, long numericID_){ + this(fasta_[1], 0, (byte)0, 0, 0, new String(fasta_[0]), qual_[1], cs_, numericID_); + } + + public Read(byte[] s_, int chrom_, byte strand_, int start_, int stop_, String id_, byte[] quality_, boolean cs_, long numericID_){ + this(s_, chrom_, start_, stop_, id_, quality_, numericID_, (strand_|(cs_ ? 
COLORMASK : 0))); + assert(strand_==0 || strand_==1); + assert(start_<=stop_) : chrom_+", "+start_+", "+stop_+", "+numericID_; + } + + public Read(byte[] s_, int chrom_, int start_, int stop_, String id_, byte[] quality_, long numericID_, int flags_){ + assert(quality_==null || quality_[0]<=80 || !FASTQ.DETECT_QUALITY) : "\n"+Arrays.toString(quality_)+ + "\n"+Arrays.toString(s_)+"\n"+numericID_+"\n"+id_+"\n"+FASTQ.ASCII_OFFSET; + + flags=flags_; + byte[] basesOriginal=s_; + byte[] qualityOriginal=quality_; + if(qualityOriginal!=null && qualityOriginal.length==4){ + if(qualityOriginal[0]=='n' && + qualityOriginal[0]=='u' && + qualityOriginal[0]=='l' && + qualityOriginal[0]=='l'){ + qualityOriginal=null; + } + } + + if(NULLIFY_BROKEN_QUALITY && qualityOriginal!=null && qualityOriginal.length!=s_.length){ + qualityOriginal=null; + setDiscarded(true); + } + assert(qualityOriginal==null || absdif(qualityOriginal.length, s_.length)<=2) : + "\nMismatch between length of bases and qualities for read "+numericID_+" (id="+id_+").\n"+ + "# qualities="+qualityOriginal.length+", # bases="+s_.length+"\n\n"+ + FASTQ.qualToString(qualityOriginal)+"\n"+new String(s_)+"\n"; + + assert(basesOriginal.length<2 || basesOriginal[1]=='N' || basesOriginal[1]=='.' || basesOriginal[1]=='-' || colorspace()!=Character.isLetter(basesOriginal[1])) : + "\nAn input file appears to be misformatted. 
The character with ASCII code "+basesOriginal[1]+" appeared where a base was expected.\n" + + colorspace()+", "+Arrays.toString(basesOriginal); + + if(colorspace()){ //Trim tips of reads that have a primer base attached + int x=0, y=basesOriginal.length; + int xq=0, yq=qualityOriginal.length; + if(basesOriginal[0]>3 && basesOriginal[0]!='N' && basesOriginal[0]!='.'){ + assert(basesOriginal[0]=='T' || basesOriginal[0]=='G') : "Just an assumption based on SOLiD - safe to disable."; + assert(basesOriginal[1]<=3 || basesOriginal[1]=='N' || basesOriginal[1]=='.') : "Just an assumption based on SOLiD - safe to disable."; + x+=2; + xq+=1; + } + byte last=basesOriginal[basesOriginal.length-1]; + if(last>3 && last!='N' && last!='.'){ + //Might be 'G' too + assert(last=='T' || last=='G') : "Just an assumption based on SOLiD - safe to disable.\n"+new String(basesOriginal); +// assert(basesOriginal[basesOriginal.length-2]<=3) : "Just an assumption based on SOLiD - safe to disable."; + y-=2; + yq-=1; + } + + if(x!=0 || y!=basesOriginal.length){ + bases=Arrays.copyOfRange(basesOriginal, x, y); + assert(bases.length==y-x); + assert(bases[bases.length-1]==basesOriginal[bases.length-1+x]); + + if(qualityOriginal!=null){ + quality=Arrays.copyOfRange(qualityOriginal, xq, yq); + assert(bases.length==quality.length); +// quality=new byte[yq-xq]; +// for(int i=0; i=0 && b<=3){ + if(q90){bases[i]-=32;} + } + }else if(TO_UPPER_CASE){ + for(int i=0; i90){bases[i]-=32;}} + } + } + + chrom=chrom_; + start=start_; + stop=stop_; + + id=id_; + assert(quality==null || quality.length==bases.length) : "\n"+new String(bases)+"\n"+bases.length+"!="+quality.length+"\n"+ + Arrays.toString(quality)+"\n"+Arrays.toString(bases)+"\n"; + numericID=numericID_; + + mapLength=bases.length; + } + + private static final int absdif(int a, int b){ + return a>b ? 
a-b : b-a; + } + + + + /** Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q*/ + public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch){ + return isDuplicateByBases(r, nmax, mmax, qmax, false, false); + } + + + + /** Returns true if these reads are identical, allowing at most n no-calls and m mismatches of max quality q*/ + public boolean isDuplicateByBases(Read r, int nmax, int mmax, byte qmax, boolean banSameQualityMismatch, boolean allowDifferentLength){ + int n=0, m=0; + assert(r.bases.length==bases.length) : "Merging different-length reads is supported but seems to be not useful."; + if(!allowDifferentLength && r.bases.length!=bases.length){return false;} + int minLen=Tools.min(bases.length, r.bases.length); + for(int i=0; inmax){return false;} + }else if(b1!=b2){ + m++; + if(m>mmax){return false;} + if(quality[i]>qmax && r.quality[i]>qmax){return false;} + if(banSameQualityMismatch && quality[i]==r.quality[i]){return false;} + } + } + return true; + } + + public boolean isDuplicateByMapping(Read r, boolean bothEnds, boolean checkAlignment){ + if(bases.length!=r.bases.length){ + return isDuplicateByMappingDifferentLength(r, bothEnds, checkAlignment); + } + assert(this!=r && mate!=r); + assert(!bothEnds || bases.length==r.bases.length); + if(!mapped() || !r.mapped()){return false;} +// if(chrom==-1 && start==-1){return false;} + if(chrom<1 && start<1){return false;} + +// if(chrom!=r.chrom || strand()!=r.strand() || start!=r.start){return false;} +//// if(mate==null && stop!=r.stop){return false;} //For unpaired reads, require both ends match +// if(stop!=r.stop){return false;} //For unpaired reads, require both ends match +// return true; + + if(chrom!=r.chrom || strand()!=r.strand()){return false;} + if(bothEnds){ + if(start!=r.start || stop!=r.stop){return false;} + }else{ + if(strand()==Gene.PLUS){ + if(start!=r.start){return false;} + }else{ + 
if(stop!=r.stop){return false;} + } + } + if(checkAlignment){ + if(perfect() && r.perfect()){return true;} + if(match!=null && r.match!=null){ + if(match.length!=r.match.length){return false;} + for(int i=0; i=q2){ + quality[i]=(byte) Tools.min(48, q1+1+q2/4); + }else{ + quality[i]=(byte) Tools.min(48, q2+1+q1/4); + } + } + }else if(b1=='N'){ + bases[i]=b2; + quality[i]=q2; + }else if(b2=='N'){ + //do nothing + }else if(mergeVectors){ + if(q1<1 && q2<1){ + //Special case - e.g. Illumina calls bases at 0 quality. + //Possibly best to keep the matching allele if one matches the ref. + //But for now, do nothing. + //This was causing problems changing perfect match strings into imperfect matches. + }else if(q1==q2){ + assert(b1!=b2); + bases[i]='N'; + quality[i]=0; + }else if(q1>q2){ + bases[i]=b1; + quality[i]=(byte)(q1-q2/2); + }else{ + bases[i]=b2; + quality[i]=(byte)(q2-q1/2); + } + assert(quality[i]>=0 && quality[i]<=48); + } + } + } + } + + //TODO: + //Note that the read may need to be realigned after merging, so the match string may be rendered incorrect. + + if(mergeN && match!=null){ + if(r.match==null){match=null;} + else{ + if(match.length!=r.match.length){match=null;} + else{ + boolean ok=true; + for(int i=0; i1); + + assert(r!=this); + assert(r!=this.mate); + assert(r!=r.mate); + assert(this!=this.mate); + assert(r.mate==null || r.mate.mate==r); + assert(this.mate==null || this.mate.mate==this); + assert(r.mate==null || r.numericID==r.mate.numericID); + assert(mate==null || numericID==mate.numericID); + + +// if(numericID==11063941 || r.numericID==11063941 || numericID==8715632){ +// System.err.println("\nAfter:\n"); +// System.err.println(this.toText()+"\n"); +// System.err.println(r.toText()+"\n"); +// System.err.println("***************"); +// } + } + + public Read translateToColorspace(boolean appendT){ + assert(!colorspace()); + byte[] temp=(appendT ? 
AminoAcid.toColorspaceSimulated(bases) : AminoAcid.toColorspace(bases)); +// assert(false) : "\n"+new String(bases)+"\n->\n"+new String(temp); + Read r=new Read(temp, chrom, start, stop-1, id, quality, numericID, (flags|COLORMASK)); + r.sites=sites; + r.originalSite=originalSite; + r.errors=errors; + r.errorsCorrected=errorsCorrected; + r.mapScore=mapScore; + r.obj=obj; + assert(false) : "TODO: Be sure to copy ALL fields, like flags, etc."; + return r; + } + + public String toString(){return toText(false).toString();} + + public StringBuilder toSites(){ + StringBuilder sb; + if(numSites()==0){ + sb=new StringBuilder(2); + sb.append('.'); + }else{ + sb=new StringBuilder(sites.size()*20); + int appended=0; + for(SiteScore ss : sites){ + if(appended>0){sb.append('\t');} + if(ss!=null){ + sb.append(ss.toText()); + appended++; + } + } + if(appended==0){sb.append('.');} + } + return sb; + } + + public ByteBuilder toSitesB(ByteBuilder sb){ + if(numSites()==0){ + if(sb==null){sb=new ByteBuilder(2);} + sb.append('.'); + }else{ + if(sb==null){sb=new ByteBuilder(sites.size()*20);} + int appended=0; + for(SiteScore ss : sites){ + if(appended>0){sb.append('\t');} + if(ss!=null){ + ss.toBytes(sb); + appended++; + } + } + if(appended==0){sb.append('.');} + } + return sb; + } + + public StringBuilder toInfo(){ + if(obj==null){return new StringBuilder();} + if(obj.getClass()==StringBuilder.class){return (StringBuilder)obj;} + return new StringBuilder(obj.toString()); + } + + public ByteBuilder toInfoB(){ + if(obj==null){return new ByteBuilder();} + if(obj.getClass()==ByteBuilder.class){return (ByteBuilder)obj;} + return new ByteBuilder(obj.toString()); + } + + public StringBuilder toFastq(){ + return FASTQ.toFASTQ(this, (StringBuilder)null); + } + + public ByteBuilder toFastq(ByteBuilder bb){ + return FASTQ.toFASTQ(this, bb); + } + + public StringBuilder toFasta(){return toFasta(FastaReadInputStream.DEFAULT_WRAP);} + public ByteBuilder toFasta(ByteBuilder bb){return 
toFasta(FastaReadInputStream.DEFAULT_WRAP, bb);} + + public StringBuilder toFasta(int wrap){ + if(wrap<1){wrap=Integer.MAX_VALUE;} + int len=(id==null ? Tools.stringLength(numericID) : id.length())+(bases==null ? 0 : bases.length+bases.length/wrap)+5; + StringBuilder sb=new StringBuilder(len); + sb.append('>'); + if(id==null){sb.append(numericID);} + else{sb.append(id);} + sb.append('\n'); + if(bases!=null){ +// for(int i=0, j=0; i0){sb.append(buffer, 0, j);} + } + return sb; + } + + public ByteBuilder toFasta(int wrap, ByteBuilder sb){ + if(wrap<1){wrap=Integer.MAX_VALUE;} + int len=(id==null ? Tools.stringLength(numericID) : id.length())+(bases==null ? 0 : bases.length+bases.length/wrap)+5; + if(sb==null){sb=new ByteBuilder(len+1);} + sb.append('>'); + if(id==null){sb.append(numericID);} + else{sb.append(id);} + if(bases!=null){ + int pos=0; + while(pos=0; i--){ + sb.append(flagToNumber(maskArray[i])); + } + sb.append('\t'); + + sb.append(copies); + sb.append('\t'); + + sb.append(errors); + if(errorsCorrected>0){ + sb.append(','); + sb.append(errorsCorrected); + } + sb.append('\t'); + sb.append(mapScore); + sb.append('\t'); + sb.append(mapLength); + sb.append('\t'); + + if(bases==null){sb.append('.');} + else{ + for(int i=0; i=0); + b=(byte) (b+'0'); + } + sb.append((char)b); + } + } + sb.append('\t'); + + int qualSum=0; + int qualMin=99999; + + if(quality==null){ + sb.append('.'); + }else{ + for(int i=0; i0){sb.append('~');} + sb.append(gaps[i]); + } + } + + if(sites!=null && sites.size()>0){ + + assert(absdif(start, stop)<3000 || (gaps==null) == (sites.get(0).gaps==null)) : + "\n"+this.numericID+"\n"+Arrays.toString(gaps)+"\n"+sites.toString()+"\n"; + + for(SiteScore ss : sites){ + sb.append('\t'); + sb.append(ss==null ? 
"null" : ss.toText()); + } + } + + if(originalSite!=null){ + sb.append('\t'); + sb.append('*'); + sb.append(originalSite.toText()); + } + + match=oldmatch; + setShortMatch(oldshortmatch); + + return sb; + } + + public ByteBuilder toText(boolean okToCompressMatch, ByteBuilder bb){ + + final byte[] oldmatch=match; + final boolean oldshortmatch=this.shortmatch(); + if(COMPRESS_MATCH_BEFORE_WRITING && !shortmatch() && okToCompressMatch){ + match=toShortMatchString(match); + setShortMatch(true); + } + + if(bb==null){bb=new ByteBuilder();} + bb.append(id); + bb.append('\t'); + bb.append(numericID); + bb.append('\t'); + bb.append(chrom); + bb.append('\t'); + bb.append(Gene.strandCodes2[strand()]); + bb.append('\t'); + bb.append(start); + bb.append('\t'); + bb.append(stop); + bb.append('\t'); + +// sb.append(colorspace() ? 1 : 0); +// sb.append('\t'); +// +// sb.append(paired() ? 1 : 0); +// sb.append('\t'); + + for(int i=maskArray.length-1; i>=0; i--){ + bb.append(flagToNumber(maskArray[i])); + } + bb.append('\t'); + + bb.append(copies); + bb.append('\t'); + + bb.append(errors); + if(errorsCorrected>0){ + bb.append(','); + bb.append(errorsCorrected); + } + bb.append('\t'); + bb.append(mapScore); + bb.append('\t'); + bb.append(mapLength); + bb.append('\t'); + + if(bases==null){bb.append('.');} + else{ + if(colorspace()){ + for(int i=0; i=0); + b=(byte) (b+'0'); + } + bb.append((char)b); + } + }else{ + bb.append(bases); + } + } + bb.append('\t'); + +// int qualSum=0; +// int qualMin=99999; + + if(quality==null){ + bb.append('.'); + }else{ + bb.ensureExtra(quality.length); + for(int i=0, j=bb.length; i0){bb.append('~');} + bb.append(gaps[i]); + } + } + + if(sites!=null && sites.size()>0){ + + assert(absdif(start, stop)<3000 || (gaps==null) == (sites.get(0).gaps==null)) : + "\n"+this.numericID+"\n"+Arrays.toString(gaps)+"\n"+sites.toString()+"\n"; + + for(SiteScore ss : sites){ + bb.append('\t'); + if(ss==null){ + bb.append((byte[])null); + }else{ + ss.toBytes(bb); + } + 
bb.append(ss==null ? "null" : ss.toText()); + } + } + + if(originalSite!=null){ + bb.append('\t'); + bb.append('*'); + originalSite.toBytes(bb); + } + + match=oldmatch; + setShortMatch(oldshortmatch); + + return bb; + } + + public static Read fromText(String line){ + if(line.length()==1 && line.charAt(0)=='.'){return null;} + + String[] split=line.split("\t"); + + if(split.length<17){ + throw new RuntimeException("Error parsing read from text.\n" + + "This may be caused be attempting to parse the wrong format.\n" + + "Please ensure that the file extension is correct:\n" + + "\tFASTQ should end in .fastq or .fq\n" + + "\tFASTA should end in .fasta or .fa, .fas, .fna, .ffn, .frn, .seq, .fsa\n" + + "\tSAM should end in .sam\n" + + "\tNative format should end in .txt or .bread\n" + + "If a file is compressed, there must be a compression extension after the format extension:\n" + + "\tgzipped files should end in .gz or .gzip\n" + + "\tzipped files should end in .zip and have only 1 file per archive\n" + + "\tbz2 files should end in .bz2"); + } + + final String id=new String(split[0]); + long numericID=Long.parseLong(split[1]); + int chrom=Byte.parseByte(split[2]); +// byte strand=Byte.parseByte(split[3]); + int start=Integer.parseInt(split[4]); + int stop=Integer.parseInt(split[5]); + +// boolean cs=(Integer.parseInt(split[6])==1); +// boolean paired=(Integer.parseInt(split[7])==1); + + int flags=Integer.parseInt(split[6], 2); + boolean cs=((flags&COLORMASK)!=0); + + int copies=Integer.parseInt(split[7]); + + int errors; + int errorsCorrected; + if(split[8].indexOf(',')>=0){ + String[] estring=split[8].split(","); + errors=Integer.parseInt(estring[0]); + errorsCorrected=Integer.parseInt(estring[1]); + }else{ + errors=Integer.parseInt(split[8]); + errorsCorrected=0; + } + + int mapScore=Integer.parseInt(split[9]); + int mapLen=Integer.parseInt(split[10]); + + byte[] basesOriginal=split[11].getBytes(); + byte[] qualityOriginal=(split[12].equals(".") ? 
null : split[12].getBytes()); + + if(cs){ + for(int i=0; i='0' && b<='3'){ + b=(byte) (b-'0'); + } + basesOriginal[i]=b; + } + } + + if(qualityOriginal!=null){ + for(int i=0; i=-1) : b; + qualityOriginal[i]=b; + } + } + + int insert=-1; + if(!split[13].equals(".")){insert=Integer.parseInt(split[13]);} + + byte[] match=null; + if(!split[15].equals(".")){match=split[15].getBytes();} + int[] gaps=null; + if(!split[16].equals(".")){ + + String[] gstring=split[16].split("~"); + gaps=new int[gstring.length]; + for(int i=0; i0){r.sites=new ArrayList(mSites);} + for(int i=firstScore; i0){ + SiteScore ss=sites.get(x); + sites.set(0, ss); + sites.set(x, ss0); + } + setFromSite(sites.get(0)); + return; + } + +// assert(false) : "TODO: Proper strand orientation, and more."; + //TODO: Also, this code appears to sometimes duplicate sitescores(?) +// for(int i=0; imaxdist){return true;} + } +// if(absdif(start, mate.start)>maxdist){return true;} + if(requireCorrectStrands){ + if((strand()==mate.strand())!=sameStrandPairs){return true;} + } + if(!sameStrandPairs){ + if(strand()==Gene.PLUS && mate.strand()==Gene.MINUS){ + if(start>=mate.stop){return true;} + }else if(strand()==Gene.MINUS && mate.strand()==Gene.PLUS){ + if(mate.start>=stop){return true;} + } + } + return false; + } + + public int countMismatches(){ + assert(match!=null); + int x=0; + for(byte b : match){ + if(b=='S'){x++;} + } + return x; + } + + + + /** + * @param match string + * @return Total number of match, sub, del, ins, or clip symbols + */ + public static final int[] matchToMsdicn(byte[] match) { + if(match==null || match.length<1){return null;} + int[] msdicn=new int[6]; + + byte mode='0', c='0'; + int current=0; + for(int i=0; i0 || !Character.isDigit(c)){ + current=Tools.max(current, 1); + if(mode=='m'){ + msdicn[0]+=current; + }else if(mode=='S'){ + msdicn[1]+=current; + }else if(mode=='D'){ + msdicn[2]+=current; + }else if(mode=='I'){ + msdicn[3]+=current; + }else if(mode=='C' || mode=='X' || 
mode=='Y'){ + msdicn[4]+=current; + }else if(mode=='N' || mode=='R'){ + msdicn[5]+=current; + } + } + return msdicn; + } + + + + /** + * Handles short or long mode. + * @param match string + * @return Total number of match, sub, del, ins, or clip symbols + */ + public static final float identity(byte[] match) { +// assert(false) : new String(match); + if(match==null || match.length<1){return 0;} + + int good=0, bad=0, n=0; + + byte mode='0', c='0'; + int current=0; + for(int i=0; i2){sb.append(count);} + else if(count==2){sb.append((char)prev);} + prev=m; + count=1; + } + } + sb.append((char)prev); + if(count>2){sb.append(count);} + else if(count==2){sb.append((char)prev);} + + byte[] r=new byte[sb.length()]; + for(int i=0; i0); + + int count=0; + int current=0; + for(int i=0; i0 ? current-1 : 0); + current=0; + }else{ + assert(Character.isDigit(m)); + current=(current*10)+(m-48); //48 == '0' + } + } + count+=(current>0 ? current-1 : 0); + + + byte[] r=new byte[count]; + current=0; + byte lastLetter='?'; + int j=0; + for(int i=0; i1){ + r[j]=lastLetter; + current--; + j++; + } + current=0; + + r[j]=m; + j++; + lastLetter=m; + }else{ + assert(Character.isDigit(m)); + current=(current*10)+(m-48); //48 == '0' + } + } + while(current>1){ + r[j]=lastLetter; + current--; + j++; + } + + assert(r[r.length-1]>0); + return r; + } + + +// /** Original bases of the read. Do not modify! */ +// public byte[] basesOriginal; +// +// /** Quality code for read bases of the read. Do not modify! */ +// public byte[] qualityOriginal; + + /** Bases of the read, after trimming (for colorspace). */ + public byte[] bases; + + /** Quality of the read, after trimming (for colorspace). */ + public byte[] quality; + + /** Alignment string. E.G. mmmmDDDmmm would have 4 matching bases, then a 3-base deletion, then 3 matching bases. 
*/ + public byte[] match; + + public int[] gaps; + + public String id; + public long numericID; + public int chrom; + public int start; + public int stop; + + public int mapLength; //Length used for mapping, before trimming + + public int copies=1; + + /** Errors detected (remaining) */ + public int errors=0; + + /** Errors corrected. Total initial errors should be errors+errorsCorrected. */ + public int errorsCorrected=0; + + /** Alignment score from BBMap. Assumed to max at approx 100*bases.length */ + public int mapScore=0; + + public ArrayList sites; + public SiteScore originalSite; //Origin site for synthetic reads + public Object obj=null; //For testing only + public Read mate; + + public int flags; + + /** -1 if invalid. TODO: Currently not retained through most processes. */ + private int insert=-1; + + /** A random number for deterministic usage. + * May decrease speed in multithreaded applications. + */ + public double rand=-1; + + public byte strand(){return (byte)(flags&1);} + public boolean mapped(){return (flags&MAPPEDMASK)==MAPPEDMASK;} + public boolean paired(){return (flags&PAIREDMASK)==PAIREDMASK;} + public boolean synthetic(){return (flags&SYNTHMASK)==SYNTHMASK;} + public boolean colorspace(){return (flags&COLORMASK)==COLORMASK;} + public boolean ambiguous(){return (flags&AMBIMASK)==AMBIMASK;} + public boolean perfect(){return (flags&PERFECTMASK)==PERFECTMASK;} +// public boolean semiperfect(){return perfect() ? true : list!=null && list.size()>0 ? list.get(0).semiperfect : false;} //TODO: This is a hack. Add a semiperfect flag. 
+ public boolean rescued(){return (flags&RESCUEDMASK)==RESCUEDMASK;} + public boolean discarded(){return (flags&DISCARDMASK)==DISCARDMASK;} + public boolean invalid(){return (flags&INVALIDMASK)==INVALIDMASK;} + public boolean swapped(){return (flags&SWAPMASK)==SWAPMASK;} + public boolean shortmatch(){return (flags&SHORTMATCHMASK)==SHORTMATCHMASK;} + public boolean insertvalid(){return (flags&INSERTMASK)==INSERTMASK;} + public boolean hasadapter(){return (flags&ADAPTERMASK)==ADAPTERMASK;} + public boolean secondary(){return (flags&SECONDARYMASK)==SECONDARYMASK;} + /** For paired ends: 0 for read1, 1 for read2 */ + public int pairnum(){return (flags&PAIRNUMMASK)>>PAIRNUMSHIFT;} + public boolean valid(){return !invalid();} + + public boolean getFlag(int mask){return (flags&mask)==mask;} + public int flagToNumber(int mask){return (flags&mask)==mask ? 1 : 0;} + + public void setFlag(int mask, boolean b){ + flags=(flags&~mask); + if(b){flags|=mask;} + } + + public void setStrand(int b){ + assert(b==1 || b==0); + flags=(flags&(~1))|b; + } + + /** For paired ends: 0 for read1, 1 for read2 */ + public void setPairnum(int b){ + assert(b==1 || b==0); + flags=(flags&(~PAIRNUMMASK))|(b<=maxScore) : "\n"+ss+"\n"+maxScore+"\n"+this+"\n"+mate+"\n"; + } + } + return perfect(); + } + + private boolean testMatchPerfection(boolean returnIfNoMatch){ + if(match==null){return returnIfNoMatch;} + boolean flag=(match.length==bases.length); + if(shortmatch()){ + flag=(match.length==0 || match[0]=='m'); + for(int i=0; ir2.strand()){return insertSizeMapped_PlusLeft(r2, r1);} + if(r1.strand()==r2.strand() || r1.start>r2.stop){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left. + +// if(!mapped() || !mate.mapped()){return 0;} + if(r1.chrom!=r2.chrom){return 0;} + if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //??? + + int a=(r1.bases==null ? 0 : r1.bases.length); + int b=(r2.bases==null ? 
0 : r2.bases.length); + int mid=r2.start-r1.stop-1; + if(-mid>=a+b){return insertSizeMapped_Unstranded(r1, r2);} //Not properly oriented; plus read is to the right of minus read + return mid+a+b; + } + + public static int insertSizeMapped_Unstranded(Read r1, Read r2){ + if(r2==null){return r1.start==r1.stop ? 0 : r1.stop-r1.start+1;} + + if(r1.start>r2.start){return insertSizeMapped_Unstranded(r2, r1);} //So r1 is always on the left side. + +// if(!mapped() || !mate.mapped()){return 0;} + if(r1.start==r1.stop || r2.start==r2.stop){return 0;} //??? + + if(r1.chrom!=r2.chrom){return 0;} + int a=(r1.bases==null ? 0 : r1.bases.length); + int b=(r2.bases==null ? 0 : r2.bases.length); + if(false && Tools.overlap(r1.start, r1.stop, r2.start, r2.stop)){ + //This does not handle very short inserts + return Tools.max(r1.stop, r2.stop)-Tools.min(r1.start, r2.start)+1; + + }else{ + if(r1.starta && -mid>b){return Tools.min(a, b);} //Strange situation, no way to guess insert size + if(-mid>=a+b){return 0;} //Strange situation, no way to guess insert size + return mid+a+b; + }else{ + assert(r1.start==r2.start); + return Tools.min(a, b); + } + } + } + + public int insertSizeOriginalSite(){ + if(mate==null){ +// System.err.println("A: "+(originalSite==null ? "null" : (originalSite.stop-originalSite.start+1))); + return (originalSite==null ? 
0 : originalSite.stop-originalSite.start+1); + } + + final SiteScore ssa=originalSite, ssb=mate.originalSite; + final int x; + if(ssa==null || ssb==null){ +// System.err.println("B: 0"); + x=0; + }else{ + x=insertSize(ssa, ssb, bases.length, mate.bases.length); + } + + assert(pairnum()>=mate.pairnum() || x==mate.insertSizeOriginalSite()); + return x; + } + + public static int insertSize(SiteScore ssa, SiteScore ssb, int lena, int lenb){ + return insertSize(ssa.chrom, ssb.chrom, ssa.start, ssb.start, ssa.stop, ssb.stop, lena, lenb); + } + + public static int insertSize(int chroma, int chromb, int starta, int startb, int stopa, int stopb, int lena, int lenb){ + + final int x; + + // if(mate==null || ){return bases==null ? 0 : bases.length;} + if(chroma!=chromb){x=0;} + else{ + + if(Tools.overlap(starta, stopa, startb, stopb)){ + x=Tools.max(stopa, stopb)-Tools.min(starta, startb)+1; +// System.err.println("C: "+x); + }else{ + if(starta<=startb){ + int mid=startb-stopa-1; + // assert(false) : mid+", "+a+", "+b; + x=mid+lena+lenb; +// System.err.println("D: "+x); + }else{ + int mid=starta-stopb-1; + // assert(false) : mid+", "+a+", "+b; + x=mid+lena+lenb; +// System.err.println("E: "+x); + } + } + } + return x; + } + + public Read joinRead(){ + if(insert<1 || mate==null || !insertvalid()){return this;} + assert(insert>9 || bases.length<20) : "Perhaps old read format is being used? This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n"; + return joinRead(this, mate, insert); + } + + public Read joinRead(int x){ + if(x<1 || mate==null){return this;} + assert(x>9 || bases.length<20) : "Perhaps old read format is being used? 
This appears to be a quality value, not an insert.\n"+this+"\n\n"+mate+"\n"; + return joinRead(this, mate, x); + } + + public static Read joinRead(Read a, Read b, int insert){ + assert(a!=null && b!=null && insert>0); + final int lengthSum=a.bases.length+b.bases.length; + final int overlap=Tools.min(insert, lengthSum-insert); + + final byte[] bases=new byte[insert]; + final byte[] quals=new byte[insert]; + + int mismatches=0; + + int start, stop; + + if(overlap<=0){//Simple join + for(int i=0; i=a.bases.length && insert>=b.bases.length){ //Overlapped join, proper orientation +// final int lim1=a.bases.length-overlap; +// final int lim2=a.bases.length; +// for(int i=0; i=0 && j>=0; i--, j--){ + byte ca=bases[i], cb=b.bases[j]; + byte qa=quals[i], qb=b.quality[j]; + if(ca==0 || ca=='N'){ + bases[i]=cb; + quals[i]=qb; + }else if(ca==cb){ + quals[i]=(byte)Tools.min((Tools.max(qa, qb)+Tools.min(qa, qb)/4), 50); + }else{ + bases[i]=(ca>=cb ? ca : cb); + quals[i]=(byte)(Tools.max(ca, cb)-Tools.min(ca, cb)); + if(ca!='N' && cb!='N'){mismatches++;} + } + } + + if(a.strand()==0){ + start=a.start; +// stop=start+insert-1; + stop=b.stop; + }else{ + stop=a.stop; +// start=stop-insert+1; + start=b.start; + } + if(start>stop){ + start=Tools.min(a.start, b.start); + stop=Tools.max(a.stop, b.stop); + } + } +// assert(mismatches>=countMismatches(a, b, insert, 999)); +// System.err.println(mismatches); + if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){start=stop=a.chrom=0;} + + Read r=new Read(bases, a.chrom, start, stop, a.id, quals, a.numericID, a.flags); + if(a.chrom==0 || start==stop || (!a.mapped() && !a.synthetic())){r.setMapped(true);} + r.setInsert(insert); + r.setPaired(false); + r.copies=a.copies; + r.mapScore=a.mapScore+b.mapScore; + r.mapLength=a.mapLength+b.mapLength; + if(overlap<=0){ + r.mapScore=a.mapScore+b.mapScore; + r.mapLength=a.mapLength+b.mapLength; + r.errors=a.errors+b.errors; + r.errorsCorrected=a.errorsCorrected+b.errorsCorrected; + //TODO 
r.gaps=? + }else{//Hard to calculate + r.mapScore=(int)((a.mapScore*(long)a.mapLength+b.mapScore*(long)b.mapLength)/insert); + r.mapLength=insert; + r.errors=a.errors; + r.errorsCorrected=a.errorsCorrected; + } + + + assert(r.insertvalid()) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; + assert(r.insert()==r.bases.length) : r.insert()+"\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; +// assert(false) : "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; + + assert(Shared.anomaly || (a.insertSizeMapped(false)>0 == r.insertSizeMapped(false)>0)) : + "\n"+r.insert()+"\n"+r.insertSizeMapped(false)+"\n"+a.insert()+"\n"+a.insertSizeMapped(false)+ + "\n\n"+a.toText(false)+"\n\n"+b.toText(false)+"\n\n"+r.toText(false)+"\n\n"; + + return r; + } + + /** + * @param minlen + * @param maxlen + * @return + */ + public ArrayList split(int minlen, int maxlen) { + int len=bases==null ? 0 : bases.length; + if(len subreads=new ArrayList(parts); + if(len<=maxlen){ + subreads.add(this); + }else{ + float ideal=Tools.max(minlen, len/(float)parts); + int actual=(int)ideal; + assert(false) : "TODO"; //Some assertion goes here, I forget what + for(int i=0; ibases.length){b=bases.length;} +// if(b-a<) + byte[] subbases=Arrays.copyOfRange(bases, a, b); + byte[] subquals=(quality==null ? 
null : Arrays.copyOfRange(quality, a, b+1)); + Read r=new Read(subbases, -1, -1, -1, id+"_"+i, subquals, numericID, flags); + subreads.add(r); + } + } + return subreads; + } + + /** Generate and return an array of canonical kmers for this read */ + public long[] toKmers(final int k, final int gap, long[] kmers, boolean makeCanonical) { + if(gap>0){throw new RuntimeException("Gapped reads: TODO");} + if(k>31){return toLongKmers(k, gap, kmers, makeCanonical);} + if(bases==null || bases.length=k){ + kmers[i-k+1]=kmer; + } + } + } + +// System.out.println(new String(bases)); +// System.out.println(Arrays.toString(kmers)); + + if(makeCanonical){ + this.reverseComplement(); + len=0; + kmer=0; + for(int i=0, j=bases.length-1; i=k){ + assert(kmer==AminoAcid.reverseComplementBinaryFast(kmers[j], k)); + kmers[j]=Tools.max(kmers[j], kmer); + } + } + } + this.reverseComplement(); + +// System.out.println(Arrays.toString(kmers)); + } + + + return kmers; + } + + /** Generate and return an array of canonical kmers for this read */ + public long[] toLongKmers(final int k, final int gap, long[] kmers, boolean makeCanonical) { + if(gap>0){throw new RuntimeException("Gapped reads: TODO");} + assert(k>31) : k; + if(bases==null || bases.length=k){ + long x2=AminoAcid.baseToNumber[bases[i-k]]; + kmer=kmer^(x2<=k){ + long x2=AminoAcid.baseToNumber[bases[i-k]]; + kmer=kmer^(x2< list, byte[] basesP, byte[] basesM, long id){ + if(list==null || list.isEmpty()){return true;} + for(int i=0; i"+p1+", "+sp+"->"+sp1+", "+ss.isSemiPerfect(bases)+ + "\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"; + assert(sp==sp1) : p+"->"+p1+", "+sp+"->"+sp1+", "+ss.isSemiPerfect(bases)+ + "\nnumericID="+id+"\n"+new String(bases)+"\n\n"+Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"; + +// ss.setPerfect(bases, false); + + assert(p==ss.perfect) : + p+"->"+ss.perfect+", "+sp+"->"+ss.semiperfect+", 
"+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n"+new String(bases)+"\n\n"+ + Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"; + assert(sp==ss.semiperfect) : + p+"->"+ss.perfect+", "+sp+"->"+ss.semiperfect+", "+ss.isSemiPerfect(bases)+"\nnumericID="+id+"\n"+new String(bases)+"\n\n"+ + Data.getChromosome(ss.chrom).getString(ss.start, ss.stop)+"\n\n"; + if(ss.perfect){assert(ss.semiperfect);} + } + return true; + } + + + public void setPerfect(boolean b){ + flags=(flags&~PERFECTMASK); + if(b){flags|=PERFECTMASK;} + } + + public void setRescued(boolean b){ + flags=(flags&~RESCUEDMASK); + if(b){flags|=RESCUEDMASK;} + } + + public void setMapped(boolean b){ +// assert(false) : mapped()+"->"+b; + flags=(flags&~MAPPEDMASK); + if(b){flags|=MAPPEDMASK;} + } + + public void setDiscarded(boolean b){ + flags=(flags&~DISCARDMASK); + if(b){flags|=DISCARDMASK;} + } + + public void setInvalid(boolean b){ + flags=(flags&~INVALIDMASK); + if(b){flags|=INVALIDMASK;} + } + + public void setSwapped(boolean b){ + flags=(flags&~SWAPMASK); + if(b){flags|=SWAPMASK;} + } + + public void setShortMatch(boolean b){ + flags=(flags&~SHORTMATCHMASK); + if(b){flags|=SHORTMATCHMASK;} + } + + public void setInsertValid(boolean b){ + flags=(flags&~INSERTMASK); + if(b){flags|=INSERTMASK;} + } + + public void setHasAdapter(boolean b){ + flags=(flags&~ADAPTERMASK); + if(b){flags|=ADAPTERMASK;} + } + + public void setSecondary(boolean b){ + flags=(flags&~SECONDARYMASK); + if(b){flags|=SECONDARYMASK;} + } + + public void setInsert(int x){ + if(x<1){x=-1;} + assert(x==-1 || x>9 || bases.length<20); + insert=x; + setInsertValid(x>0); + if(mate!=null){ + mate.insert=x; + mate.setInsertValid(x>0); + } + } + + private static int[] makeMaskArray(int max) { + int[] r=new int[max+1]; + for(int i=0; i=QUALCACHE.length){ + byte[] r=new byte[len]; + Arrays.fill(r, (byte)30); + return r; + } + if(QUALCACHE[len]==null){ + synchronized(QUALCACHE){ + if(QUALCACHE[len]==null){ + QUALCACHE[len]=new 
byte[len]; + Arrays.fill(QUALCACHE[len], (byte)30); + } + } + } + return QUALCACHE[len]; + } + + + public byte[] getScaffoldName(boolean requireSingleScaffold){ + byte[] name=null; + if(mapped()){ + if(!requireSingleScaffold || Data.isSingleScaffold(chrom, start, stop)){ + int idx=Data.scaffoldIndex(chrom, (start+stop)/2); + name=Data.scaffoldNames[chrom][idx]; +// int scaflen=Data.scaffoldLengths[chrom][idx]; +// a1=Data.scaffoldRelativeLoc(chrom, start, idx); +// b1=a1-start1+stop1; + } + } + return name; + } + + public Read clone(){ + try { + return (Read) super.clone(); + } catch (CloneNotSupportedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + throw new RuntimeException(); + } + + private static final byte[][] QUALCACHE=new byte[1000][]; + + + public static final int STRANDMASK=1; + public static final int MAPPEDMASK=(1<<1); + public static final int PAIREDMASK=(1<<2); + public static final int PERFECTMASK=(1<<3); + public static final int AMBIMASK=(1<<4); + public static final int RESCUEDMASK=(1<<5); + public static final int COLORMASK=(1<<6); + public static final int SYNTHMASK=(1<<7); + public static final int DISCARDMASK=(1<<8); + public static final int INVALIDMASK=(1<<9); + public static final int SWAPMASK=(1<<10); + public static final int SHORTMATCHMASK=(1<<11); + + public static final int PAIRNUMSHIFT=12; + public static final int PAIRNUMMASK=(1< nextList(){ +// throw new RuntimeException("Not supported."); +// } + + public abstract ArrayList nextList(); + + public abstract boolean hasMore(); + + public abstract void restart(); + + /** Returns true if there was an error, false otherwise */ + public abstract boolean close(); + + public abstract boolean paired(); + + protected static final ArrayList toList(Read[] array){ + if(array==null || array.length==0){return null;} + ArrayList list=new ArrayList(array.length); + for(int i=0; i'); + bbq.append(r.id); + bbq.append('\n'); + if(r.bases!=null){toQualityB(r.quality, 
r.bases.length, bbq);} + bbq.append('\n'); + } + Read r2=r.mate; + if(OUTPUT_INTERLEAVED && r2!=null){ + bbq.append('\n'); + bbq.append('>'); + bbq.append(r2.id); + bbq.append('\n'); + if(r2.bases!=null){toQualityB(r2.quality, r2.bases.length, bbq);} + bbq.append('\n'); + } + } + if(bbq.length>=32768){ + myQOutstream.write(bbq.array, 0, bbq.length); + bbq.setLength(0); + } + } + }else{ + for(final Read r1 : job.list){ + if(r1!=null){ + final Read r2=r1.mate; + assert(r2!=null && r2.mate==r1 && r2!=r1) : r1.toText(false); + bbq.append('\n'); + bbq.append('>'); + bbq.append(r2.id); + bbq.append('\n'); + if(r2.bases!=null){toQualityB(r2.quality, r2.bases.length, bbq);} + bbq.append('\n'); + } + if(bbq.length>=32768){ + myQOutstream.write(bbq.array, 0, bbq.length); + bbq.setLength(0); + } + } + } + +// if(bbq.length>0){ +// myQOutstream.write(bbq.array, 0, bbq.length); +// bbq.setLength(0); +// } + } +// assert(false) : OUTPUT_SAM+", "+SITES_ONLY+", "+OUTPUT_FASTQ+", "+OUTPUT_FASTA+", "+OUTPUT_ATTACHMENT+"\n"+job.list.get(0).obj+"\n"+job.list.get(0); + if(OUTPUT_SAM){ + assert(read1); + for(final Read r : job.list){ + Read r2=(r==null ? null : r.mate); + + SamLine sl1=(r==null ? null : new SamLine(r, 0)); + SamLine sl2=(r2==null ? null : new SamLine(r2, 1)); + + if(r!=null){ + + if(verbose && r.numSites()>0){ + final Read clone=r.clone(); + for(SiteScore ss : r.sites){ + + clone.setFromSite(ss); + clone.setSecondary(true); + SamLine sl=new SamLine(clone, 0); + + System.err.println("\n@************************************\n\n"+ss+"\n\n"+clone+"\n\n"+sl+"\n\n+************************************\n"); + + } + } + + assert(!ASSERT_CIGAR || !r.mapped() || sl1.cigar!=null) : r; + sl1.toBytes(bb).append('\n'); + + readsWritten++; + basesWritten+=(r.bases!=null ? r.bases.length : 0); + validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0); + validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? 
r.bases.length : 0); + ArrayList list=r.sites; + if(OUTPUT_SAM_SECONDARY_ALIGNMENTS && list!=null && list.size()>1){ + final Read clone=r.clone(); + for(int i=1; i list=r2.sites; + if(OUTPUT_SAM_SECONDARY_ALIGNMENTS && list!=null && list.size()>1){ + final Read clone=r2.clone(); + for(int i=1; i=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + + } + }else if(SITES_ONLY){ + assert(read1); + for(final Read r : job.list){ + Read r2=(r==null ? null : r.mate); + + if(r!=null && r.sites!=null){ + r.toSitesB(bb).append('\n'); + + readsWritten++; + basesWritten+=(r.bases!=null ? r.bases.length : 0); + validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0); + validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.bases.length : 0); + } + if(r2!=null){ + r2.toSitesB(bb).append('\n'); + + readsWritten++; + basesWritten+=(r2.bases!=null ? r2.bases.length : 0); + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? r2.bases.length : 0); + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + }else if(OUTPUT_FASTQ){ + if(read1){ + for(final Read r : job.list){ + if(r!=null){ + r.toFastq(bb).append('\n'); + readsWritten++; + basesWritten+=(r.bases!=null ? r.bases.length : 0); + validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0); + validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.bases.length : 0); + Read r2=r.mate; + if(OUTPUT_INTERLEAVED && r2!=null){ + r2.toFastq(bb).append('\n'); + readsWritten++; + basesWritten+=(r2.bases!=null ? r2.bases.length : 0); + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? 
r2.bases.length : 0); + } + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + }else{ + for(final Read r1 : job.list){ + if(r1!=null){ + final Read r2=r1.mate; + assert(r2!=null && r2.mate==r1 && r2!=r1) : r1.toText(false); + r2.toFastq(bb).append('\n'); + readsWritten++; + basesWritten+=(r2.bases!=null ? r2.bases.length : 0); + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? r2.bases.length : 0); + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + } + }else if(OUTPUT_FASTA){ + if(read1){ + for(final Read r : job.list){ + if(r!=null){ + r.toFasta(bb).append('\n'); + readsWritten++; + basesWritten+=(r.bases!=null ? r.bases.length : 0); + validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0); + validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.bases.length : 0); + Read r2=r.mate; + if(OUTPUT_INTERLEAVED && r2!=null){ + r2.toFasta(bb).append('\n'); + readsWritten++; + basesWritten+=(r2.bases!=null ? r2.bases.length : 0); + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? r2.bases.length : 0); + } + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + }else{ + for(final Read r1 : job.list){ + if(r1!=null){ + final Read r2=r1.mate; + assert(r2!=null && r2.mate==r1 && r2!=r1) : r1.toText(false); + r2.toFasta(bb).append('\n'); + readsWritten++; + basesWritten+=(r2.bases!=null ? r2.bases.length : 0); + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? 
r2.bases.length : 0); + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + } + }else if(OUTPUT_ATTACHMENT){ + if(read1){ + for(final Read r : job.list){ + if(r!=null){ + if(r.obj==null){bb.append('.').append('\n');} + else{bb.append(r.obj.toString()).append('.');} + readsWritten++; + validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0); + Read r2=r.mate; + if(OUTPUT_INTERLEAVED && r2!=null){ + if(r2.obj==null){bb.append('.').append('\n');} + else{bb.append(r2.obj.toString()).append('.');} + readsWritten++; + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + } + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + }else{ + for(final Read r1 : job.list){ + if(r1!=null){ + final Read r2=r1.mate; + if(r2!=null){ + if(r2.obj==null){bb.append('.').append('\n');} + else{bb.append(r2.obj.toString()).append('.');} + readsWritten++; + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + }else{ + bb.append('.').append('\n'); + } + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + } + }else{ + if(read1){ + for(final Read r : job.list){ + if(r!=null){ + r.toText(true, bb).append('\n'); + readsWritten++; + basesWritten+=(r.bases!=null ? r.bases.length : 0); + validReadsWritten+=(r.valid() && r.mapped() ? 1 : 0); + validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.bases.length : 0); + Read r2=r.mate; + if(OUTPUT_INTERLEAVED && r2!=null){ + r2.toText(true, bb).append('\n'); + readsWritten++; + basesWritten+=(r2.bases!=null ? r2.bases.length : 0); + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? 
r2.bases.length : 0); + } + + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + }else{ + for(final Read r1 : job.list){ + if(r1!=null){ + final Read r2=r1.mate; +// assert(r2!=null && r2.mate==r1 && r2!=r1) : r1.toText(false); + if(r2!=null){ + r2.toText(true, bb).append('\n'); + readsWritten++; + basesWritten+=(r2.bases!=null ? r2.bases.length : 0); + validReadsWritten+=(r2.valid() && r2.mapped() ? 1 : 0); + validBasesWritten+=(r2.valid() && r2.mapped() && r2.bases!=null ? r2.bases.length : 0); + }else{ + //TODO abd.print(".\n"); + } + } + if(bb.length>=32768){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + } + } + } + } + if(job.close){ + if(bb.length>0){ + abd.write(bb.array, 0, bb.length); + bb.setLength(0); + } + assert(job.outstream!=null && job.outstream!=myOutstream); + ReadWrite.finishWriting(null, job.outstream, fname, allowSubprocess); //TODO: This should be job.fname + } + + job=null; + while(job==null){ + try { + job=queue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + + if(myOutstream!=null){ + if(bb.length>0){ + myOutstream.write(bb.array, 0, bb.length); + bb.setLength(0); + } + ReadWrite.finishWriting(null, myOutstream, fname, allowSubprocess); + } + if(myQOutstream!=null){ + if(bbq.length>0){ + myQOutstream.write(bbq.array, 0, bbq.length); + bbq.setLength(0); + } + ReadWrite.finishWriting(null, myQOutstream, qfname, allowSubprocess); + } + finishedSuccessfully=true; + } + + private static final boolean buffered=true; + private static final boolean ASSERT_CIGAR=false; + private static final boolean verbose=false; + +} diff --git a/current/stream/ReadStreamStringWriter.java b/current/stream/ReadStreamStringWriter.java new file mode 100755 index 0000000..6d674a0 --- /dev/null +++ b/current/stream/ReadStreamStringWriter.java @@ -0,0 +1,354 @@ +package stream; + +import java.util.ArrayList; + +import fileIO.FileFormat; +import 
fileIO.ReadWrite; + +public class ReadStreamStringWriter extends ReadStreamWriter { + + public ReadStreamStringWriter(String fname_, boolean read1_, int bufferSize, boolean allowSubprocess_){ + this(fname_, read1_, bufferSize, false, false, false, false, false, false, false, false, allowSubprocess_); + } + +// public ReadStreamStringWriter(String fname_, boolean read1_, int bufferSize, +// boolean outputSamFile, boolean outputBamFile, boolean fastq, boolean fasta, boolean sitesOnly, boolean attachment, boolean stdout){ +// this(fname_, null, read1_, bufferSize, outputSamFile, outputBamFile, fastq, fasta, sitesOnly, attachment, stdout, false); +// } + + public ReadStreamStringWriter(String fname_, boolean read1_, int bufferSize, + boolean outputSamFile, boolean outputBamFile, boolean fastq, boolean fasta, boolean sitesOnly, boolean attachment, boolean stdout, + boolean useSharedHeader, boolean allowSubprocess_){ + super(fname_, null, read1_, bufferSize, + outputSamFile, outputBamFile, fastq, fasta, sitesOnly, attachment, stdout, + useSharedHeader, true, true, allowSubprocess_); + } + + public ReadStreamStringWriter(String fname_, String qfname_, boolean read1_, int bufferSize, + boolean outputSamFile, boolean outputBamFile, boolean fastq, boolean fasta, boolean sitesOnly, boolean attachment, boolean stdout, + boolean useSharedHeader, boolean allowSubprocess_){ + super(fname_, qfname_, read1_, bufferSize, + outputSamFile, outputBamFile, fastq, fasta, sitesOnly, attachment, stdout, + useSharedHeader, true, true, allowSubprocess_); + } + + public ReadStreamStringWriter(FileFormat ff, String qfname_, boolean read1_, int bufferSize, CharSequence header, boolean useSharedHeader){ + super(ff, qfname_, read1_, bufferSize, header, true, true, useSharedHeader); + } + + @Override + public void run() { + + if(!OUTPUT_SAM && !OUTPUT_FASTQ && !OUTPUT_FASTA && !OUTPUT_ATTACHMENT){ + if(OUTPUT_INTERLEAVED){ +// assert(false) : OUTPUT_SAM+", "+OUTPUT_FASTQ+", "+OUTPUT_FASTA+", 
"+OUTPUT_ATTACHMENT+", "+OUTPUT_INTERLEAVED+", "+SITES_ONLY; + myWriter.print("#INTERLEAVED\n"); + } + if(SITES_ONLY){ + myWriter.println("#"+SiteScore.header()); + }else if(!OUTPUT_ATTACHMENT){ + myWriter.println("#"+Read.header()); + } + } + + Job job=null; + while(job==null){ + try { + job=queue.take(); +// job.list=queue.take(); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + + while(job!=null && !job.poison){ +// System.err.println("Processing job "+job); + if(!job.isEmpty()){ + + if(myQWriter!=null){ + if(read1){ + for(final Read r : job.list){ + if(r!=null){ + { + CharSequence cs=(r.bases==null ? "\n" : toQualitySB(r.quality, r.bases.length).append('\n')); + myQWriter.print('>'); + myQWriter.println(r.id); + myQWriter.print(cs); + } + Read r2=r.mate; + if(OUTPUT_INTERLEAVED && r2!=null){ + CharSequence cs=(r2.bases==null ? "\n" : toQualitySB(r2.quality, r2.bases.length).append('\n')); + myQWriter.print('>'); + myQWriter.println(r2.id); + myQWriter.print(cs); + } + } + } + }else{ + for(final Read r1 : job.list){ + if(r1!=null){ + final Read r2=r1.mate; + assert(r2!=null && r2.mate==r1 && r2!=r1) : r1.toText(false); + CharSequence cs=(r2.bases==null ? "\n" : toQualitySB(r2.quality, r2.bases.length).append('\n')); + myQWriter.print('>'); + myQWriter.println(r2.id); + myQWriter.print(cs); + } + } + } + } +// assert(false) : OUTPUT_SAM+", "+SITES_ONLY+", "+OUTPUT_FASTQ+", "+OUTPUT_FASTA+", "+OUTPUT_ATTACHMENT+"\n"+job.list.get(0).obj+"\n"+job.list.get(0); + if(OUTPUT_SAM){ + assert(read1); + for(final Read r : job.list){ + Read r2=(r==null ? null : r.mate); + + SamLine sl1=(r==null ? null : new SamLine(r, 0)); + SamLine sl2=(r2==null ? null : new SamLine(r2, 1)); + + if(r!=null){ + job.writer.print(sl1.toText().append('\n')); + + readsWritten++; + basesWritten+=(r.bases!=null ? r.bases.length : 0); + validReadsWritten+=(r.valid() && r.mapped() ? 
1 : 0); + validBasesWritten+=(r.valid() && r.mapped() && r.bases!=null ? r.bases.length : 0); + ArrayList list=r.sites; + if(OUTPUT_SAM_SECONDARY_ALIGNMENTS && list!=null && list.size()>1){ + final Read clone=r.clone(); + for(int i=1; i list=r2.sites; + if(OUTPUT_SAM_SECONDARY_ALIGNMENTS && list!=null && list.size()>1){ + final Read clone=r2.clone(); + for(int i=1; i0){myOutstream.write(bb.array, 0, bb.length);} + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + }else if(OUTPUT_SAM){ + if(useSharedHeader){ + ArrayList list=SamReadInputStream.getSharedHeader(true); + if(list==null){ + System.err.println("Header was null."); + }else{ + try { + for(byte[] line : list){ + myOutstream.write(line); + myOutstream.write('\n'); + //myWriter.println(new String(line)); + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + }else{ + if(makeWriter){ + myWriter.println(SamLine.header0()); + int a=(MINCHROM==-1 ? 1 : MINCHROM); + int b=(MAXCHROM==-1 ? Data.numChroms : MAXCHROM); + for(int chrom=a; chrom<=b; chrom++){ + // myWriter.print(SamLine.header1(chrom, chrom)); + SamLine.printHeader1(chrom, chrom, myWriter); + } + myWriter.println(SamLine.header2()); + }else{ + ByteBuilder bb=new ByteBuilder(4096); + SamLine.header0B(bb); + bb.append('\n'); + int a=(MINCHROM==-1 ? 1 : MINCHROM); + int b=(MAXCHROM==-1 ? 
Data.numChroms : MAXCHROM); + for(int chrom=a; chrom<=b; chrom++){ + SamLine.printHeader1B(chrom, chrom, bb, myOutstream); + } + SamLine.header2B(bb); + bb.append('\n'); + + + try { + if(bb.length>0){myOutstream.write(bb.array, 0, bb.length);} + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + }else if(!OUTPUT_SAM && !OUTPUT_FASTQ && !OUTPUT_FASTA && !SITES_ONLY && !OUTPUT_ATTACHMENT){ +// myWriter.println("#"+Read.header()); + } + } + + assert(bufferSize>=1); + queue=new ArrayBlockingQueue(bufferSize); + } + + protected ReadStreamWriter(FileFormat ff, String qfname_, boolean read1_, int bufferSize, CharSequence header, boolean makeWriter, boolean buffered, boolean useSharedHeader){ +// assert(false) : useSharedHeader+", "+header; + assert(ff!=null); + assert(ff.write()) : "FileFormat is not in read mode for "+ff.name(); + + assert(!ff.text() && !ff.unknownFormat()) : "Unknown format for "+ff; + OUTPUT_FASTQ=ff.fastq(); + OUTPUT_FASTA=ff.fasta(); +// boolean bread=(ext==TestFormat.txt); + OUTPUT_SAM=ff.samOrBam(); + OUTPUT_BAM=ff.bam(); + OUTPUT_ATTACHMENT=ff.attachment(); + SITES_ONLY=ff.sites(); + OUTPUT_STANDARD_OUT=ff.stdio(); + assert(((OUTPUT_SAM ? 1 : 0)+(OUTPUT_FASTQ ? 1 : 0)+(OUTPUT_FASTA ? 1 : 0)+(OUTPUT_ATTACHMENT ? 1 : 0)+(SITES_ONLY ? 1 : 0))<=1) : + OUTPUT_SAM+", "+SITES_ONLY+", "+OUTPUT_FASTQ+", "+OUTPUT_FASTA+", "+OUTPUT_ATTACHMENT; + + fname=ff.name(); + qfname=qfname_; + read1=read1_; + allowSubprocess=ff.allowSubprocess(); +// assert(fname==null || (fname.contains(".sam") || fname.contains(".bam"))==OUTPUT_SAM) : "Outfile name and sam output mode flag disagree: "+fname; + assert(read1 || !OUTPUT_SAM) : "Attempting to output paired reads to different sam files."; + + if(qfname==null){ + myQOutstream=null; + myQWriter=null; + }else{ + myQOutstream=ReadWrite.getOutputStream(fname, false, buffered, allowSubprocess); + myQWriter=(makeWriter ? 
new PrintWriter(myQOutstream) : null); + } + + if(header==null){header=HEADER;} //new line; test. + + + if(fname==null && !OUTPUT_STANDARD_OUT){ + myOutstream=null; + myWriter=null; + }else{ + if(OUTPUT_STANDARD_OUT){myOutstream=System.out;} + else if(!OUTPUT_BAM || !Data.SAMTOOLS() || !Data.SH()){ + myOutstream=ReadWrite.getOutputStream(fname, false, buffered, allowSubprocess); + }else{ + if(!allowSubprocess){System.err.println("Warning! Spawning a samtools process when allowSubprocess="+allowSubprocess);} + myOutstream=ReadWrite.getOutputStreamFromProcess(fname, "samtools view -S -b -h - ", true); + } + + + + myWriter=(makeWriter ? new PrintWriter(myOutstream) : null); + + if(header!=null){ + if(myWriter!=null){ + myWriter.println(header); + }else{ + byte[] temp=new byte[header.length()]; + for(int i=0; i list=SamReadInputStream.getSharedHeader(true); + if(list==null){ + System.err.println("Header was null."); + }else{ + try { + for(byte[] line : list){ + myOutstream.write(line); + myOutstream.write('\n'); + //myWriter.println(new String(line)); + } + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + }else{ + if(myWriter!=null){ + myWriter.println(SamLine.header0()); + int a=(MINCHROM==-1 ? 1 : MINCHROM); + int b=(MAXCHROM==-1 ? Data.numChroms : MAXCHROM); + for(int chrom=a; chrom<=b; chrom++){ + // myWriter.print(SamLine.header1(chrom, chrom)); + SamLine.printHeader1(chrom, chrom, myWriter); + } + myWriter.println(SamLine.header2()); + }else{ + ByteBuilder bb=new ByteBuilder(4096); + SamLine.header0B(bb); + bb.append('\n'); + int a=(MINCHROM==-1 ? 1 : MINCHROM); + int b=(MAXCHROM==-1 ? 
Data.numChroms : MAXCHROM); + for(int chrom=a; chrom<=b; chrom++){ + SamLine.printHeader1B(chrom, chrom, bb, myOutstream); + } + SamLine.header2B(bb); + bb.append('\n'); + + + try { + if(bb.length>0){myOutstream.write(bb.array, 0, bb.length);} + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + }else if(ff.bread()){ + if(myWriter!=null){ + myWriter.println("#"+Read.header()); + }else{ + try { + myOutstream.write(("#"+Read.header()).getBytes()); + } catch (IOException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + } + } + + assert(bufferSize>=1); + queue=new ArrayBlockingQueue(bufferSize); + } + + @Override + public abstract void run(); + + /** Uses this thread to transform reads to text, and the ReadStreamWriter thread to write text to disk */ + public final synchronized void addListAsText(ArrayList list){ + assert(false) : "TODO"; + addList(list, myWriter, myOutstream, false); + } + + public final synchronized void poison(){ + addJob(new Job(null, null, null, false, true)); + } + + public final synchronized void addList(ArrayList list){ + addList(list, myWriter, myOutstream, false); + } + + public final synchronized void addList(ArrayList l, PrintWriter w, OutputStream o, boolean c){ + boolean poison=(c && w!=null && w==myWriter); + Job j=new Job(l, w, o, c, poison); + addJob(j); + } + + public final synchronized void addJob(Job j){ +// System.err.println("Got job "+(j.list==null ? "null" : j.list.size())); + boolean success=false; + while(!success){ + try { + queue.put(j); + success=true; + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + assert(!queue.contains(j)); //Hopefully it was not added. + } + } + } + + protected static final StringBuilder toQualitySB(byte[] quals, int len){ + if(quals==null){return fakeQualitySB(30, len);} + assert(quals.length==len); + StringBuilder sb=new StringBuilder(NUMERIC_QUAL ? 
len*3+1 : len+1); + if(NUMERIC_QUAL){ + if(quals.length>0){sb.append(quals[0]);} + for(int i=1; i0){sb.append(q);} + for(int i=1; i0){bb.append((int)quals[0]);} + for(int i=1; i0){bb.append(q);} + for(int i=1; i queue; + + protected long readsWritten=0; + protected long basesWritten=0; + protected long validReadsWritten=0; + protected long validBasesWritten=0; + public String fname(){return fname;} + public long readsWritten(){return readsWritten;} + public long basesWritten(){return basesWritten;} + public long validReadsWritten(){return validReadsWritten;} + public long validBasesWritten(){return validBasesWritten;} + + protected static class Job{ + public Job(ArrayList list_, PrintWriter writer_, OutputStream outstream_, boolean closeWhenDone_, + boolean shutdownThread_){ + list=list_; + writer=writer_; + outstream=outstream_; + close=closeWhenDone_; + poison=shutdownThread_; + } + public Job(ArrayList list_, PrintWriter writer_){ + this(list_, writer_, null, false, false); + } + public boolean isEmpty(){return list==null || list.isEmpty();} + public final ArrayList list; + public final PrintWriter writer; + public final OutputStream outstream; + public final boolean close; + public final boolean poison; + } + +} diff --git a/current/stream/SamLine.java b/current/stream/SamLine.java new file mode 100755 index 0000000..e7336be --- /dev/null +++ b/current/stream/SamLine.java @@ -0,0 +1,2190 @@ +package stream; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import align2.Shared; +import align2.Tools; + + +import dna.AminoAcid; +import dna.ChromosomeArray; +import dna.Data; +import dna.Gene; +import dna.ScafLoc; +import fileIO.TextStreamWriter; + + +public class SamLine { + +// 426_647_582 161 chr1 10159 0 26M9H chr3 170711991 0 TCCCTAACCCTAACCCTAACCTAACC IIFIIIIIIIIIIIIIIIIIICH2<> RG:Z:20110708003021394 NH:i:3 
CM:i:2 SM:i:1 CQ:Z:A9?(BB?:=>B67=:7A);.%8'%))/%*%' CS:Z:G12002301002301002301023010200000003 XS:A:+ + +// 1 QNAME String [!-?A-~]f1,255g Query template NAME +// 2 FLAG Int [0,216-1] bitwise FLAG +// 3 RNAME String \*|[!-()+-<>-~][!-~]* Reference sequence NAME +// 4 POS Int [0,229-1] 1-based leftmost mapping POSition +// 5 MAPQ Int [0,28-1] MAPping Quality +// 6 CIGAR String \*|([0-9]+[MIDNSHPX=])+ CIGAR string +// 7 RNEXT String \*|=|[!-()+-<>-~][!-~]* Ref. name of the mate/next fragment +// 8 PNEXT Int [0,229-1] Position of the mate/next fragment +// 9 TLEN Int [-229+1,229-1] observed Template LENgth +// 10 SEQ String \*|[A-Za-z=.]+ fragment SEQuence +// 11 QUAL String [!-~]+ ASCII of Phred-scaled base QUALity+33 + + +// 2. FLAG: bitwise FLAG. Each bit is explained in the following table: +// Bit Description +// 0x1 template having multiple fragments in sequencing +// 0x2 each fragment properly aligned according to the aligner +// 0x4 fragment unmapped +// 0x8 next fragment in the template unmapped +// 0x10 SEQ being reverse complemented +// 0x20 SEQ of the next fragment in the template being reversed +// 0x40 the first fragment in the template +// 0x80 the last fragment in the template +// 0x100 secondary alignment +// 0x200 not passing quality controls +// 0x400 PCR or optical duplicate + + +// FCB062MABXX:1:1101:1177:2115#GGCTACAA 147 chr11 47765857 29 90M = 47765579 -368 CCTCTGTGGCCCGGGTTGGAGTGCAGTGTCATGATCATGGCTCGCTGTAGCTACACCCTTCTGAGCTCAAGCAATCCTCCCACCTCTCCC ############################################################A@@>DGGGDFBGGGGHHHHH9<@## + + +// public static StringBuilder header(int minChrom, int maxChrom, boolean firstLine, boolean lastLine){ +// StringBuilder sb=new StringBuilder(1000); +// if(firstLine){ +// sb.append("@HD\tVN:1.0\tSO:unsorted\n"); +// } +//// assert(false) : minChrom+", "+maxChrom+", "+Data.numChroms; +// for(int i=minChrom; i<=maxChrom && i<=Data.numChroms; i++){ +// // sb.append("\n"+Data.getChromosome(i).getString(0, 
2100)+"\n"); +// +// // sb.append("@SQ\tSN:chr"+i); +// +// for(int j=0; j scaffolds(int minChrom, int maxChrom, boolean sort){ + final ArrayList list=new ArrayList(4000); + final StringBuilder sb=new StringBuilder(1000); + for(int i=minChrom; i<=maxChrom && i<=Data.numChroms; i++){ + final byte[][] inames=Data.scaffoldNames[i]; + for(int j=0; j scaffolds=scaffolds(minChrom, maxChrom, true); + for(int i=0; i scaffolds=scaffolds(minChrom, maxChrom, true); + for(int i=0; i scaffolds=scaffolds(minChrom, maxChrom, true); + for(int i=0; i=32768){ + try { + os.write(bb.array, 0, bb.length); + } catch (IOException e) { + throw new RuntimeException(e); + } + bb.setLength(0); + } + } + return; + } + + if(verbose){System.err.println("Iterating over chroms");} + for(int chrom=minChrom; chrom<=maxChrom && chrom<=Data.numChroms; chrom++){ +// if(verbose){System.err.println("chrom "+chrom);} + final byte[][] inames=Data.scaffoldNames[chrom]; +// if(verbose){System.err.println("inames"+(inames==null ? 
" = null" : ".length = "+inames.length));} + final int numScafs=Data.chromScaffolds[chrom]; +// if(verbose){System.err.println("scaffolds: "+numScafs);} + assert(inames.length==numScafs) : "Mismatch between number of scaffolds and names for chrom "+chrom+": "+inames.length+" != "+numScafs; + for(int scaf=0; scaf=32768){ + try { + os.write(bb.array, 0, bb.length); + } catch (IOException e) { + throw new RuntimeException(e); + } + bb.setLength(0); + } + } + } + } + + public static void printHeader1(int minChrom, int maxChrom, TextStreamWriter tsw){ + if(SORT_SCAFFOLDS){ + ArrayList scaffolds=scaffolds(minChrom, maxChrom, true); + for(int i=0; i list=null; + list=Shared.JVM_ARGS(); + if(list!=null){ + for(String s : list){ + sb.append(' '); + sb.append(s); + } + } + } + sb.append(' '); + sb.append(Shared.BBMAP_CLASS); + if(Shared.COMMAND_LINE!=null){ + for(String s : Shared.COMMAND_LINE){ + sb.append(' '); + sb.append(s); + } + } + } + + return sb; + } + + public static ByteBuilder header2B(ByteBuilder sb){ + sb.append("@PG\tID:BBMap\tPN:BBMap\tVN:"+Shared.BBMAP_VERSION_STRING); + + if(Shared.BBMAP_CLASS!=null){ + sb.append("\tCL:java"); + { + List list=null; + list=Shared.JVM_ARGS(); + if(list!=null){ + for(String s : list){ + sb.append(' '); + sb.append(s); + } + } + } + sb.append(' '); + sb.append(Shared.BBMAP_CLASS); + if(Shared.COMMAND_LINE!=null){ + for(String s : Shared.COMMAND_LINE){ + sb.append(' '); + sb.append(s); + } + } + } + + return sb; + } + + public static final boolean KILL_BAD_PAIRS=false; + public static final boolean REQUIRE_CORRECT_STRANDS_PAIRS=true; + public static final boolean SAME_STRAND_PAIRS=false; + + public SamLine(String s){ + this(s.split("\t")); + } + + /** Prevents references to original string, in case of e.g. very long MD tags. 
*/ + public SamLine toSamLine(String s){ + String[] split=s.split("\t"); + split[0]=new String(split[0]); + split[5]=new String(split[5]); + split[9]=new String(split[9]); + split[10]=new String(split[10]); + for(int i=11; i0){//Prevents an initial length-0 match +// sb.append(count); +// sb.append(lastMode); +// } +// count=0; +// lastMode=mode; +// } +// +// count++; +// } +// sb.append(count); +// sb.append(mode); +// return sb.toString(); +// } + + public static String toCigar13(byte[] match, int readStart, int readStop, int reflen, byte[] bases){ + if(match==null || readStart==readStop){return null;} + StringBuilder sb=new StringBuilder(8); + int count=0; + char mode='='; + char lastMode='='; + + int refloc=readStart; + + int cigarlen=0; //for debugging + int opcount=0; //for debugging + + for(int mpos=0; mpos=reflen)){ + mode='S'; //soft-clip out-of-bounds + if(m!='I'){refloc++;} + if(m=='D'){sfdflag=true;} //Don't add soft-clip count for deletions! + }else if(m=='m' || m=='s' || m=='S' || m=='N' || m=='B'){//Little 's' is for a match classified as a sub to improve the affine score. + mode='M'; + refloc++; + }else if(m=='I' || m=='X' || m=='Y'){ + mode='I'; + }else if(m=='D'){ + mode='D'; + refloc++; + }else if(m=='C'){ + mode='S'; + refloc++; + }else{ + throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii). 
" + + "Match string should be in long format here."); + } + + if(mode!=lastMode){ + if(count>0){//Prevents an initial length-0 match + sb.append(count); +// sb.append(lastMode); + if(lastMode=='D' && count>INTRON_LIMIT){sb.append('N');} + else{sb.append(lastMode);} + if(lastMode!='D'){cigarlen+=count;} + opcount+=count; + } + count=0; + lastMode=mode; + } + + count++; + if(sfdflag){count--;} + } + sb.append(count); + if(mode=='D' && count>INTRON_LIMIT){sb.append('N');} + else{sb.append(mode);} + if(mode!='D'){cigarlen+=count;} + opcount+=count; + + assert(bases==null || cigarlen==bases.length) : "\n(cigarlen = "+cigarlen+") != (bases.length = "+(bases==null ? -1 : bases.length)+")\n" + + "cigar = "+sb+"\nmatch = "+new String(match)+"\nbases = "+new String(bases)+"\n"; + + return sb.toString(); + } + + public static String toCigar14(byte[] match, int readStart, int readStop, int reflen, byte[] bases){ + if(match==null || readStart==readStop){return null;} + StringBuilder sb=new StringBuilder(8); + int count=0; + char mode='='; + char lastMode='='; + + int refloc=readStart; + + int cigarlen=0; //for debugging + int opcount=0; //for debugging + + for(int mpos=0; mpos=reflen)){ + mode='S'; //soft-clip out-of-bounds + if(m!='I'){refloc++;} + if(m=='D'){sfdflag=true;} //Don't add soft-clip count for deletions! + }else if(m=='m' || m=='s'){//Little 's' is for a match classified as a sub to improve the affine score. + mode='='; + refloc++; + }else if(m=='S'){ + mode='X'; + refloc++; + }else if(m=='I' || m=='X' || m=='Y'){ + mode='I'; + }else if(m=='D'){ + mode='D'; + refloc++; + }else if(m=='C'){ + mode='S'; + refloc++; + }else if(m=='N' || m=='B'){ + mode='M'; + refloc++; + }else{ + throw new RuntimeException("Invalid match string character '"+(char)m+"' = "+m+" (ascii). 
" + + "Match string should be in long format here."); + } + + if(mode!=lastMode){ + if(count>0){//Prevents an initial length-0 match + sb.append(count); + if(lastMode=='D' && count>INTRON_LIMIT){sb.append('N');} + else{sb.append(lastMode);} + if(lastMode!='D'){cigarlen+=count;} + opcount+=count; + } + count=0; + lastMode=mode; + } + + count++; + if(sfdflag){count--;} + } + sb.append(count); + if(mode=='D' && count>INTRON_LIMIT){ + sb.append('N'); + }else{ + sb.append(mode); + } + if(mode!='D'){cigarlen+=count;} + opcount+=count; + + assert(bases==null || cigarlen==bases.length) : "\n(cigarlen = "+cigarlen+") != (bases.length = "+(bases==null ? -1 : bases.length)+")\n" + + "cigar = "+sb+"\nmatch = "+new String(match)+"\nbases = "+new String(bases)+"\n"; + + return sb.toString(); + } + + + public static String makeStopTag(int pos, int seqLength, String cigar, boolean perfect){ +// return String.format("YS:i:%d", pos+(cigar==null ? seqLength : -countLeadingClip(cigar)+calcCigarLength(cigar))-1); + return "YS:i:"+(pos+((cigar==null || perfect) ? 
seqLength : -countLeadingClip(cigar)+calcCigarLength(cigar))-1); + } + + public static String makeIdentityTag(byte[] match, boolean perfect){ + if(perfect){return "YI:f:100";} + float f=Read.identity(match); + return String.format("YI:f:%.2f", (100*f)); + } + + + public static String makeMdTag(int chrom, int refstart, byte[] match, byte[] call, boolean colorspace){ + if(match==null || chrom<0 || colorspace){return null;} + StringBuilder md=new StringBuilder(8); + md.append("MD:Z:"); + + if(colorspace){throw new RuntimeException("Colorspace is incompatible with MD tags.");} + + ChromosomeArray cha=Data.getChromosome(chrom); + + byte prevM='?'; + int count=0; + int dels=0; +// inr cpos=0; + for(int mpos=0, rpos=refstart; mpos0){ + md.append(count); +// } + + return md.toString(); + } + + +// public static String makeMdTag(int chrom, int refstart, byte[] match, byte[] call, boolean colorspace){ +// if(match==null || chrom<0 || colorspace){return null;} +// StringBuilder md=new StringBuilder(8); +// md.append("MD:Z:"); +// +// if(colorspace){throw new RuntimeException("Colorspace is incompatible with MD tags.");} +// +// ChromosomeArray cha=Data.getChromosome(chrom); +// +// byte prevM='?'; +// int count=0; +// for(int mpos=0, cpos=0, rpos=refstart; mpos2 && r2!=null){ +// if(qname.endsWith("/1") || qname.endsWith("/2") || qname.endsWith(" 1") || qname.endsWith(" 2")){} +// } + + if(!KEEP_NAMES && qname.length()>2 && r2!=null){ + char c=qname.charAt(qname.length()-2); + int num=(qname.charAt(qname.length()-1))-'1'; + if((num==0 || num==1) && (c==' ' || c=='/')){qname=qname.substring(0, qname.length()-2);} +// if(r.pairnum()==num && (c==' ' || c=='/')){qname=qname.substring(0, qname.length()-2);} + } +// flag=Integer.parseInt(s[1]); + + int idx1=-1, idx2=-1; + int chrom1=-1, chrom2=-1; + int start1=-1, start2=-1, a1=0, a2=0; + int stop1=-1, stop2=-1, b1=0, b2=0; + int scaflen=0; + byte[] name1=bytestar, name2=bytestar; + if(r.mapped()){ + assert(r.chrom>=0); + 
chrom1=r.chrom; + start1=r.start; + stop1=r.stop; + if(Data.isSingleScaffold(chrom1, start1, stop1)){ + assert(Data.scaffoldLocs!=null) : "\n\n"+r+"\n\n"+r.obj+"\n\n"; + idx1=Data.scaffoldIndex(chrom1, (start1+stop1)/2); + name1=Data.scaffoldNames[chrom1][idx1]; + scaflen=Data.scaffoldLengths[chrom1][idx1]; + a1=Data.scaffoldRelativeLoc(chrom1, start1, idx1); + b1=a1-start1+stop1; + }else{ + if(verbose){System.err.println("------------- Found multi-scaffold alignment! -------------");} + r.setMapped(false); + r.setPaired(false); + r.match=null; + if(r2!=null){r2.setPaired(false);} + } + } + if(r2!=null && r2.mapped()){ + chrom2=r2.chrom; + start2=r2.start; + stop2=r2.stop; + if(Data.isSingleScaffold(chrom2, start2, stop2)){ + idx2=Data.scaffoldIndex(chrom2, (start2+stop2)/2); + name2=Data.scaffoldNames[chrom2][idx2]; + a2=Data.scaffoldRelativeLoc(chrom2, start2, idx2); + b2=a2-start2+stop2; + }else{ + if(verbose){System.err.println("------------- Found multi-scaffold alignment for r2! -------------");} + r2.setMapped(false); + r2.setPaired(false); + r2.match=null; + if(r!=null){r.setPaired(false);} + } + } + + flag=0; + if(r2!=null){ + flag|=0x1; + + if(r.mapped() && r.valid() && r.match!=null && + (r2==null || (idx1==idx2 && r.paired() && r2.mapped() && r2.valid() && r2.match!=null))){flag|=0x2;} + if(fragNum==0){flag|=0x40;} + if(fragNum>0){flag|=0x80;} + } + if(!r.mapped()){flag|=0x4;} + if(r2!=null && !r2.mapped()){flag|=0x8;} + if(r.strand()==Gene.MINUS){flag|=0x10;} + if(r2!=null && r2.strand()==Gene.MINUS){flag|=0x20;} + if(r.secondary()){flag|=0x100;} + if(r.discarded()){flag|=0x200;} +// if(){flag|=0x400;} + +// assert(!r.secondary()) : r.mapScore; + +// 2. FLAG: bitwise FLAG. 
Each bit is explained in the following table: +// Bit Description +// 0x1 template having multiple fragments in sequencing +// 0x2 each fragment properly aligned according to the aligner +// 0x4 fragment unmapped +// 0x8 next fragment in the template unmapped +// 0x10 SEQ being reverse complemented +// 0x20 SEQ of the next fragment in the template being reversed +// 0x40 the first fragment in the template +// 0x80 the last fragment in the template +// 0x100 secondary alignment +// 0x200 not passing quality controls +// 0x400 PCR or optical duplicate + + rname=r.mapped() ? name1 : ((r2!=null && r2.mapped()) ? name2 : null); + pos=r.mapped() ? a1+1 : ((r2!=null && r2.mapped()) ? Tools.max(a2+1, 1) : 0); + +// assert(false) : pos+", "+a1+", "+a2; + +// pos=Tools.max(pos, 1); + +// mapq=r.mapped() ? Data.max(1, -20+(int)(r.mapScore*60f/(100*r.mapLength))) : 0;//Scale of 0-40 + mapq=r.mapped() ? Data.max(1, r.mapScore/r.mapLength) : 0;//Scale of 0-100 + + if(verbose){ + System.err.println("Making cigar for "+(r.match==null ? "null" : new String(r.match))); + } + + if(r.bases!=null && r.mapped() && r.match!=null){ + final boolean inbounds=(a1>=0 && b11.3f){ + if(inbounds && perfect && !r.containsNonM()){//r.containsNonM() should be unnecessary... it's there in case of clipping... 
+ cigar=(r.bases.length+"="); +// System.err.println("SETTING cigar14="+cigar); +// +// byte[] match=r.match; +// if(r.shortmatch()){match=Read.toLongMatchString(match);} +// cigar=toCigar13(match, a1, b1, scaflen, r.bases); +// System.err.println("RESETTING cigar14="+cigar+" from toCigar14("+new String(Read.toShortMatchString(match))+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")"); + }else{ + byte[] match=r.match; + if(r.shortmatch()){match=Read.toLongMatchString(match);} + cigar=toCigar14(match, a1, b1, scaflen, r.bases); +// System.err.println("CALLING toCigar14("+Read.toShortMatchString(match)+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")"); + } + }else{ + if(inbounds && (perfect || !r.containsNonNMS())){ + cigar=(r.bases.length+"M"); +// System.err.println("SETTING cigar13="+cigar); +// +// byte[] match=r.match; +// if(r.shortmatch()){match=Read.toLongMatchString(match);} +// cigar=toCigar13(match, a1, b1, scaflen, r.bases); +// System.err.println("RESETTING cigar13="+cigar+" from toCigar13("+new String(Read.toShortMatchString(match))+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")"); + }else{ + byte[] match=r.match; + if(r.shortmatch()){match=Read.toLongMatchString(match);} + cigar=toCigar13(match, a1, b1, scaflen, r.bases); +// System.err.println("CALLING toCigar13("+Read.toShortMatchString(match)+", "+a1+", "+b1+", "+scaflen+", "+r.bases+")"); + } + } + } + + if(verbose){ + System.err.println("cigar="+cigar); + } + +// assert(false); + +// assert(primary() || cigar.equals(stringstar)) : cigar; + + if(r.mapped()){ + int leadingClip=countLeadingClip(cigar); + int clippedIndels=(r.match==null ? 
0 : countLeadingIndels(a1, r.match)); + if(verbose){ + System.err.println("leadingClip="+leadingClip); + System.err.println("clippedDels="+clippedIndels); + } + pos=pos+leadingClip+clippedIndels; + } + + if((!r.mapped() || cigar==null || stringstar.equals(cigar)) && pos<1){ + //This is necessary to prevent unmapped reads from having negative POS, + //and mapped reads without cigar strings from having POS less than 1. + pos=Tools.max(pos, r.mapped() ? 1 : 0); + } + assert(pos>=0) : "Negative coordinate "+pos+" for read:\n\n"+r+"\n\n"+r2+"\n\n"+this+"\n\na1="+a1+", a2="+a2+", clip="+countLeadingClip(cigar); +// if(pos<0){pos=0;cigar=null;rname=bytestar;mapq=0;flag|=0x4;} + +// assert(false) : "\npos="+pos+"\ncigar='"+cigar+"'\nVERSION="+VERSION+"\na1="+a1+", b1="+b1+"\n\n"+r.toString(); + +// rnext=(r2==null ? stringstar : (r.mapped() && !r2.mapped()) ? "chr"+Gene.chromCodes[r.chrom] : "chr"+Gene.chromCodes[r2.chrom]); + rnext=((r2==null || (!r.mapped() && !r2.mapped())) ? bytestar : (r.mapped() && r2.mapped()) ? (idx1==idx2 ? byteequals : name2) : byteequals); + + if(Data.scaffoldPrefixes){ + if(rname!=null && rname!=bytestar){ + int k=Tools.indexOf(rname, (byte)'$'); + rname=Arrays.copyOfRange(rname, k+1, rname.length); + } + if(rnext!=null && rnext!=bytestar){ + int k=Tools.indexOf(rnext, (byte)'$'); + rnext=Arrays.copyOfRange(rnext, k+1, rnext.length); + } + } + + if(r2==null){ + pnext=0; + }else if(r2.mapped()){ + pnext=Tools.max(1, a2+1); + }else if(r.mapped()){ + pnext=Tools.max(1, a1+1); + }else{ + pnext=0; + } + tlen=(r2==null || !r.mapped() || !r2.mapped() || idx1!=idx2) ? 0 : 1+(Data.max(r.stop, r2.stop)-Data.min(r.start, r2.start)); +// if(r2==null || r.stop<=r2.start){ +// //plus sign +// }else if(r2.stop<=r.start){ +// //minus sign +// tlen=-tlen; +// }else{ +// //They overlap... a lot. Physically shorter than read length. 
+// if(r.start<=r2.start){ +// +// }else{ +// tlen=-tlen; +// } +// } + //This version is less technically correct (does not account for very short insert reads) but probably more inline with what is expected + if(r2==null || r.start=0; i--){ +//// q.append((char)(r.quality[i]+33)); +//// } +//// qual=q.toString(); +// qual=new byte[r.quality.length]; +// for(int i=0, j=qual.length-1; i(8); + +// if(!r.secondary()){optional.add(r.ambiguous() ? "XT:A:R" : "XT:A:U");} //Not sure what do do for secondary alignments + if(!r.secondary() && r.ambiguous()){optional.add("XT:A:R");} //Not sure what do do for secondary alignments + + int nm=r.bases.length; + int dels=0; + if(perfect){nm=0;} + else if(r.match!=null){ + int delsCurrent=0; + for(byte b : r.match){ + if(b=='m' || b=='C'){nm--;} + if(b=='D'){delsCurrent++;} + else{ + if(delsCurrent<=INTRON_LIMIT){dels+=delsCurrent;} + delsCurrent=0; + } + } + if(delsCurrent<=INTRON_LIMIT){dels+=delsCurrent;} +// assert(false) : nm+", "+dels+", "+delsCurrent+", "+r.bases.length+", "+r.match.length; + } + + //Samtools puts nm tag in wrong place for deletions. This "if" block is not really necessary except for exact text match to samtools. + if(false && dels>0){ + if(MAKE_SM_TAG){optional.add("SM:i:"+mapq);} +// optional.add("AM:i:"+Data.min(mapq, r2==null ? mapq : (r2.mapped() ? Data.max(1, -20+(int)((r2.mapScore*60f/(100*r2.mapLength)))) : 0))); + optional.add("AM:i:"+Data.min(mapq, r2==null ? mapq : (r2.mapped() ? Data.max(1, r2.mapScore/r2.mapLength) : 0))); + if(r.match!=null){optional.add("NM:i:"+(nm+dels));} + }else{ + if(perfect){optional.add("NM:i:0");} + else if(r.match!=null){optional.add("NM:i:"+(nm+dels));} + if(MAKE_SM_TAG){optional.add("SM:i:"+mapq);} +// optional.add("AM:i:"+Data.min(mapq, r2==null ? mapq : (r2.mapped() ? Data.max(1, -20+(int)((r2.mapScore*60f/(100*r2.mapLength)))) : 0))); + optional.add("AM:i:"+Data.min(mapq, r2==null ? mapq : (r2.mapped() ? 
Data.max(1, r2.mapScore/r2.mapLength) : 0))); + } + + if(MAKE_TOPHAT_TAGS){ + optional.add("AS:i:0"); + if(cigar==null || cigar.indexOf('N')<0){ + optional.add("XN:i:0"); + }else{ + } + optional.add("XM:i:0"); + optional.add("XO:i:0"); + optional.add("XG:i:0"); + if(cigar==null || cigar.indexOf('N')<0){ + optional.add("YT:Z:UU"); + }else{ + } + optional.add("NH:i:1"); + }else if(MAKE_XM_TAG){//XM tag. For bowtie compatibility; unfortunately it is poorly defined. + int x=0; + if(r.discarded() || (!r.ambiguous() && !r.mapped())){ + x=0;//TODO: See if the flag needs to be present in this case. + }else if(r.mapped()){ + x=1; + if(r.numSites()>0 && r.numSites()>0){ + int z=r.topSite().score; + for(int i=1; i=0){optional.add("XM:i:"+x);} + } + + //XS tag + if(MAKE_XS_TAG){ + String xs=makeXSTag(r); + if(xs!=null){ + optional.add(xs); + assert(r2==null || r.pairnum()!=r2.pairnum()); +// assert(r2==null || !r2.mapped() || r.strand()==r2.strand() || makeXSTag(r2)==xs) : +// "XS problem:\n"+r+"\n"+r2+"\n"+xs+"\n"+makeXSTag(r2)+"\n"; + } + } + + if(MAKE_MD_TAG){ + String md=makeMdTag(r.chrom, r.start, r.match, r.bases, r.colorspace()); + if(md!=null){optional.add(md);} + } + + if(r.mapped() && MAKE_NH_TAG){ + if(ReadStreamWriter.OUTPUT_SAM_SECONDARY_ALIGNMENTS && r.numSites()>1){ + optional.add("NH:i:"+r.sites.size()); + }else{ + optional.add("NH:i:1"); + } + } + + if(MAKE_STOP_TAG && (perfect || r.match!=null)){optional.add(makeStopTag(pos, seq.length, cigar, perfect));} + + if(MAKE_IDENTITY_TAG && (perfect || r.match!=null)){optional.add(makeIdentityTag(r.match, perfect));} + + if(MAKE_INSERT_TAG && r2!=null){ + if(r.mapped() ||r.originalSite!=null){ + optional.add("X8:Z:"+r.insertSizeMapped(false)+(r.originalSite==null ? 
"" : ","+r.insertSizeOriginalSite())); +// assert(r.originalSite==null || r2.originalSite==null || r.insertSizeOriginalSite()==r2.insertSizeOriginalSite()); +// assert(!r.mapped() || !r2.mapped() || r.insertSizeMapped()==r2.insertSizeMapped()); + } + } + if(MAKE_CORRECTNESS_TAG){ + final SiteScore ss0=r.originalSite; + if(ss0!=null){ + optional.add("X9:Z:"+(ss0.isCorrect(r.chrom, r.strand(), r.start, r.stop, 0) ? "T" : "F")); + } + } + + if(MAKE_CUSTOM_TAGS){ + int sites=r.numSites() + (r.originalSite==null ? 0 : 1); + if(sites>0){ + StringBuilder sb=new StringBuilder(); + sb.append("X1:Z:"); + if(r.sites!=null){ + for(SiteScore ss : r.sites){ + sb.append('$'); + sb.append(ss.toText()); + } + } + if(r.originalSite!=null){ + sb.append('$'); + sb.append('*'); + sb.append(r.originalSite.toText()); + } + optional.add(sb.toString()); + } + + if(r.match!=null){ + byte[] match=r.match; + if(!r.shortmatch()){ + match=Read.toShortMatchString(match); + } + optional.add("X2:Z:"+new String(match)); + } + + optional.add("X3:i:"+r.mapScore); + optional.add("X4:i:"+r.mapLength); + optional.add("X5:Z:"+r.numericID); + optional.add("X6:i:"+(r.flags|(r.match==null ? 0 : Read.SHORTMATCHMASK))); + if(r.copies>1){optional.add("X7:i:"+r.copies);} + } + + } +// assert(r.pairnum()==1) : "\n"+r.toText(false)+"\n"+this+"\n"+r2; + } + + public SamLine(String[] s){ + assert(!s[0].startsWith("@")) : "Tried to make a SamLine from a header: "+s[0]; + assert(s.length>=11) : "\nNot all required fields are present: "+s.length+"\nline='"+Arrays.toString(s)+"'\n"; + if(s.length<11){ + System.err.println("Invalid SamLine: "+Arrays.toString(s)); + return; + } + qname=s[0]; + flag=Integer.parseInt(s[1]); + rname=s[2].getBytes(); + pos=Integer.parseInt(s[3]); +// try { +// Integer.parseInt(s[4]); +// } catch (NumberFormatException e) { +// System.err.println(Arrays.toString(s)); +// } + mapq=Character.isDigit(s[4].charAt(0)) ? 
Integer.parseInt(s[4]) : 99; //Added for non-compliant mappers that put * here + cigar=s[5]; + rnext=s[6].getBytes(); + pnext=Integer.parseInt(s[7]); + tlen=Character.isDigit(s[8].charAt(0)) ? Integer.parseInt(s[8]) : 0; //Added for non-compliant mappers that put * here +// seq=s[9]; +// qual=s[10]; + seq=(s[9].equals(stringstar) ? null : s[9].getBytes()); + qual=(s[10].equals(stringstar) ? null : s[10].getBytes()); + + if(mapped() && strand()==Gene.MINUS){ + if(seq!=bytestar){AminoAcid.reverseComplementBasesInPlace(seq);} + if(qual!=bytestar){Tools.reverseInPlace(qual);} + } + + if(qual!=null && qual!=bytestar){ + for(int i=0; i11){ + optional=new ArrayList(s.length-11); + for(int i=11; ia) : "Missing field 0: "+new String(s); + qname=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a)); + b++; + a=b; + + while(ba) : "Missing field 1: "+new String(s); + flag=Tools.parseInt(s, a, b); + b++; + a=b; + + while(ba) : "Missing field 2: "+new String(s); + rname=(b==a+1 && s[a]=='*' ? null : Arrays.copyOfRange(s, a, b)); + b++; + a=b; + + while(ba) : "Missing field 3: "+new String(s); + pos=Tools.parseInt(s, a, b); + b++; + a=b; + + while(ba) : "Missing field 4: "+new String(s); + mapq=Tools.parseInt(s, a, b); + b++; + a=b; + + while(ba) : "Missing field 5: "+new String(s); + cigar=(b==a+1 && s[a]=='*' ? null : new String(s, a, b-a)); + b++; + a=b; + + while(ba) : "Missing field 6: "+new String(s); + rnext=(b==a+1 && s[a]=='*' ? null : Arrays.copyOfRange(s, a, b)); + b++; + a=b; + + while(ba) : "Missing field 7: "+new String(s); + pnext=Tools.parseInt(s, a, b); + b++; + a=b; + + while(ba) : "Missing field 8: "+new String(s); + tlen=Tools.parseInt(s, a, b); + b++; + a=b; + + while(ba) : "Missing field 9: "+new String(s); +// seq=new String(s, a, b-a); + seq=(b==a+1 && s[a]=='*' ? null : Arrays.copyOfRange(s, a, b)); + b++; + a=b; + + while(ba) : "Missing field 10: "+new String(s); +// qual=new String(s, a, b-a); + qual=(b==a+1 && s[a]=='*' ? 
null : Arrays.copyOfRange(s, a, b)); + b++; + a=b; + + assert((seq==bytestar)==(Tools.equals(seq, bytestar))); + assert((qual==bytestar)==(Tools.equals(qual, bytestar))); + + if(mapped() && strand()==Gene.MINUS){ + if(seq!=bytestar){AminoAcid.reverseComplementBasesInPlace(seq);} + if(qual!=bytestar){Tools.reverseInPlace(qual);} + } + + if(qual!=null && qual!=bytestar){ + for(int i=0; i(4); + while(ba){ + String x=new String(s, a, b-a); + optional.add(x); + }else{ + //Empty field + } + b++; + a=b; + } + } + } + + + public Read parseName(){ + try { + String[] answer=qname.split("_"); + long id=Long.parseLong(answer[0]); + byte trueChrom=Gene.toChromosome(answer[1]); + byte trueStrand=Byte.parseByte(answer[2]); + int trueLoc=Integer.parseInt(answer[3]); + int trueStop=Integer.parseInt(answer[4]); +// byte[] quals=qual.getBytes(); +// byte[] quals=qual; +// for(int i=0; i=0){ + + } + + int chrom_=-1; + byte strand_=strand(); + int start_=start(); + int stop_=stop(); + assert(start_<=stop_) : start_+", "+stop_; + boolean cs_=colorspace(); + + if(Data.GENOME_BUILD>=0 && rname!=null && (rname.length!=1 || rname[0]!='*')){ + ScafLoc sc=Data.getScafLoc(rname); + assert(sc!=null) : "Can't find scaffold in reference with name "+new String(rname)+"\n"+this; + if(sc!=null){ + chrom_=sc.chrom; + start_+=sc.loc; + stop_+=sc.loc; + } + } + +//// byte[] quals=(qual==null || (qual.length()==1 && qual.charAt(0)=='*')) ? null : qual.getBytes(); +//// byte[] quals=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual.clone(); +// byte[] quals=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual; +// byte[] bases=seq==null ? 
null : seq.clone(); +// if(strand_==Gene.MINUS){//Minus-mapped SAM lines have bases and quals reversed +// AminoAcid.reverseComplementBasesInPlace(bases); +// Tools.reverseInPlace(quals); +// } +// Read r=new Read(bases, chrom_, strand_, start_, stop_, qname, quals, cs_, numericId_); + + final Read r; + { + byte[] seqX=(seq==null || (seq.length==1 && seq[0]=='*')) ? null : seq; + byte[] qualX=(qual==null || (qual.length==1 && qual[0]=='*')) ? null : qual; + String qnameX=(qname==null || qname.equals(stringstar)) ? null : qname; + r=new Read(seqX, chrom_, strand_, start_, stop_, qnameX, qualX, cs_, numericId_); + } + + r.setMapped(mapped()); + r.setSynthetic(synthetic); +// r.setPairnum(pairnum()); //TODO: Enable after fixing assertions that this will break in read input streams. + if(originalSite!=null){ + r.originalSite=originalSite; + } + + r.mapScore=mapq; + r.setSecondary(!primary()); + +// if(mapped()){ +// r.list=new ArrayList(1); +// r.list.add(new SiteScore(r.chrom, r.strand(), r.start, r.stop, 0)); +// } + +// System.out.println(optional); + if(optional!=null){ + for(String s : optional){ + if(s.equals("XT:A:R")){ + r.setAmbiguous(true); + }else if(s.startsWith("X1:Z:")){ +// System.err.println("Found X1 tag!\t"+s); + String[] split=s.split("\\$"); +// assert(false) : Arrays.toString(split); + ArrayList list=new ArrayList(3); + + for(int i=1; i=0){ + boolean success=r.fixMatchB(); +// if(!success){r.match=null;} +// assert(false) : new String(r.match); + } +// assert(false) : new String(r.match); + } +// assert(false) : new String(r.match); +// System.err.println(">\n"+cigar+"\n"+(r.match==null ? "null" : new String(r.match))); + } +// assert(false) : new String(r.match); + +// System.err.println("Resulting read: "+r.toText()); + + return r; + + } + + /** Aproximate length of result of SamLine.toText() */ + public int textLength(){ + int len=11; //11 tabs + len+=(3+9+3+9); + len+=(tlen>999 ? 9 : 3); + + len+=(qname==null ? 
1 : qname.length()); + len+=(rname==null ? 1 : rname.length); + len+=(rnext==null ? 1 : rnext.length); + len+=(cigar==null ? 1 : cigar.length()); + len+=(seq==null ? 1 : seq.length); + len+=(qual==null ? 1 : qual.length); + + if(optional!=null){ + len+=optional.size(); + for(String s : optional){len+=s.length();} + } + return len; + } + + public ByteBuilder toBytes(ByteBuilder bb){ + + final int buflen=Tools.max((rname==null ? 1 : rname.length), (rnext==null ? 1 : rnext.length), (seq==null ? 1 : seq.length), (qual==null ? 1 : qual.length)); + + if(bb==null){bb=new ByteBuilder(textLength()+4);} + if(qname==null){bb.append('*').append('\t');}else{bb.append(qname).append('\t');} + bb.append(flag).append('\t'); + append(bb, rname).append('\t'); + bb.append(pos).append('\t'); + bb.append(mapq).append('\t'); + if(cigar==null){bb.append('*').append('\t');}else{bb.append(cigar).append('\t');} + append(bb, rnext).append('\t'); + bb.append(pnext).append('\t'); + bb.append(tlen).append('\t'); + + if(mapped() && strand()==Gene.MINUS){ + appendReverseComplimented(bb, seq).append('\t'); + appendQualReversed(bb, qual); + }else{ + append(bb, seq).append('\t'); + appendQual(bb, qual); + } + +// assert(seq.getClass()==String.class); +// assert(qual.getClass()==String.class); +// sb.append(seq).append('\t'); +// sb.append(qual); + + if(optional!=null){ + for(String s : optional){ + bb.append('\t').append(s); + } + } + return bb; + } + + public StringBuilder toText(){ + + final int buflen=Tools.max((rname==null ? 1 : rname.length), (rnext==null ? 1 : rnext.length), (seq==null ? 1 : seq.length), (qual==null ? 
1 : qual.length)); + final char[] buffer=Shared.getTLCB(buflen); + + StringBuilder sb=new StringBuilder(textLength()+4); + if(qname==null){sb.append('*').append('\t');}else{sb.append(qname).append('\t');} + sb.append(flag).append('\t'); + append(sb, rname, buffer).append('\t'); + sb.append(pos).append('\t'); + sb.append(mapq).append('\t'); + if(cigar==null){sb.append('*').append('\t');}else{sb.append(cigar).append('\t');} + append(sb, rnext, buffer).append('\t'); + sb.append(pnext).append('\t'); + sb.append(tlen).append('\t'); + + if(mapped() && strand()==Gene.MINUS){ + appendReverseComplimented(sb, seq, buffer).append('\t'); + appendQualReversed(sb, qual, buffer); + }else{ + append(sb, seq, buffer).append('\t'); + appendQual(sb, qual, buffer); + } + +// assert(seq.getClass()==String.class); +// assert(qual.getClass()==String.class); +// sb.append(seq).append('\t'); +// sb.append(qual); + + if(optional!=null){ + for(String s : optional){ + sb.append('\t').append(s); + } + } + return sb; + } + + public String toString(){return toText().toString();} + +// Bit Description +// 0x1 template having multiple fragments in sequencing +// 0x2 each fragment properly aligned according to the aligner +// 0x4 fragment unmapped +// 0x8 next fragment in the template unmapped +// 0x10 SEQ being reverse complemented +// 0x20 SEQ of the next fragment in the template being reversed +// 0x40 the first fragment in the template +// 0x80 the last fragment in the template +// 0x100 secondary alignment +// 0x200 not passing quality controls +// 0x400 PCR or optical duplicate + + public boolean hasMate(){ + return (flag&0x1)==0x1; + } + + public boolean properPair(){ + return (flag&0x2)==0x2; + } + + public boolean mapped(){ + return (flag&0x4)!=0x4; +// 0x4 fragment unmapped +// 0x8 next fragment in the template unmapped + } + + public boolean nextMapped(){ + return (flag&0x8)!=0x8; +// 0x4 fragment unmapped +// 0x8 next fragment in the template unmapped + } + + public byte strand(){ + 
return ((flag&0x10)==0x10 ? (byte)1 : (byte)0); + } + + public byte nextStrand(){ + return ((flag&0x20)==0x20 ? (byte)1 : (byte)0); + } + + public boolean firstFragment(){ + return (flag&0x40)==0x40; + } + + public boolean lastFragment(){ + return (flag&0x80)==0x80; + } + + public int pairnum(){ + return firstFragment() ? 0 : lastFragment() ? 1 : 0; + } + + public boolean primary(){return (flag&0x100)==0;} + public void setPrimary(boolean b){ + if(b){ + flag=flag|0x100; + }else{ + flag=flag&~0x100; + } + } + + public boolean discarded(){ + return (flag&0x200)==0x200; + } + +// /** Assumes rname is an integer. */ +// public int chrom(){ +// if(Data.GENOME_BUILD<0){return -1;} +// HashMap sc +// } + + /** Assumes rname is an integer. */ + public int chrom_old(){ + assert(false); + if(!Character.isDigit(rname[0]) && !Character.isDigit(rname[rname.length-1])){ + if(warning){ + warning=false; + System.err.println("Warning - sam lines need a chrom field."); + } + return -1; + } + assert(Shared.anomaly || '*'==rname[0] || (Character.isDigit(rname[0]) && Character.isDigit(rname[rname.length-1]))) : + "This is no longer correct, considering that sam lines are named by scaffold. 
They need a chrom field.\n"+new String(rname); + if(rname==null || Arrays.equals(rname, bytestar) || !(Character.isDigit(rname[0]) && Character.isDigit(rname[rname.length-1]))){return -1;} + //return Gene.toChromosome(new String(rname)); + //return Integer.parseInt(new String(rname))); + final byte z='0'; + int x=rname[0]-z; + for(int i=1; i=0){return 0;} + int dels=0; + int inss=0; + int cloc=0; + for(int mloc=0; mloc1){sb.append(current);} + }else if(c=='X'){ + sb.append('S'); + if(current>1){sb.append(current);} + }else if(c=='D' || c=='N'){ + sb.append('D'); + if(current>1){sb.append(current);} + }else if(c=='I'){ + sb.append('I'); + if(current>1){sb.append(current);} + }else if(c=='S'){ + sb.append('C'); + if(current>1){sb.append(current);} + }else if(c=='M'){ + sb.append('B'); + if(current>1){sb.append(current);} + } + current=0; + } + } + + byte[] match=new byte[sb.length()]; + for(int i=0; i=a.length); + for(int i=0; i=a.length); + for(int i=0, j=a.length-1; j>=0; i++, j--){buffer[i]=(char)AminoAcid.baseToComplementExtended[a[j]];} + sb.append(buffer, 0, a.length); + } +// for(int i=a.length-1; i>=0; i--){ +// sb.append((char)AminoAcid.baseToComplementEbuffertended[a[i]]); +// } + return sb; + } + + private static StringBuilder appendQual(StringBuilder sb, byte[] a, char[] buffer){ + if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');} + {//This is actually faster + assert(buffer.length>=a.length); + for(int i=0; i=a.length); + for(int i=0, j=a.length-1; j>=0; i++, j--){buffer[i]=(char)(a[j]+33);} + sb.append(buffer, 0, a.length); + } +// for(int i=a.length-1; i>=0; i--){ +// sb.append((char)(a[i]+33)); +// } + return sb; + } + + private static ByteBuilder append(ByteBuilder sb, byte[] a){ + if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');} + return sb.append(a); + } + + private static ByteBuilder appendReverseComplimented(ByteBuilder sb, byte[] a){ + if(a==null || a==bytestar || (a.length==1 && 
a[0]=='*')){return sb.append('*');} + + sb.ensureExtra(a.length); + byte[] buffer=sb.array; + int i=sb.length; + for(int j=a.length-1; j>=0; i++, j--){buffer[i]=AminoAcid.baseToComplementExtended[a[j]];} + sb.length+=a.length; + + return sb; + } + + private static ByteBuilder appendQual(ByteBuilder sb, byte[] a){ + if(a==null || a==bytestar || (a.length==1 && a[0]=='*')){return sb.append('*');} + + sb.ensureExtra(a.length); + byte[] buffer=sb.array; + int i=sb.length; + for(int j=0; j=0; i++, j--){buffer[i]=(byte)(a[j]+33);} + sb.length+=a.length; + + return sb; + } + + /** Assumes a custom name including original location */ + public byte[] originalContig(){ +// assert(PARSE_CUSTOM); + int loc=-1; + int count=0; + for(int i=0; i=0){ +// System.err.println("For read "+r.pairnum()+" mapped to strand "+r.strand()); + boolean plus=(r.strand()==Gene.PLUS); //Assumes secondstrand=false +// System.err.println("plus="+plus); + if(r.pairnum()!=0){plus=!plus;} +// System.err.println("plus="+plus); + if(XS_SECONDSTRAND){plus=!plus;} +// System.err.println("plus="+plus); + return (plus ? 
XSPLUS : XSMINUS); + }else{ + return null; + } + } + + public byte[] rname(){return rname;} + public byte[] rnext(){return rnext;} + + public String qname; + public int flag; + private byte[] rname; + public int pos; + public int mapq; + public String cigar; + private byte[] rnext; + public int pnext; + public int tlen; + public byte[] seq; + public byte[] qual; + public ArrayList optional; + + public Object obj; + + /** Turn this off for RNAseq */ + public static boolean MAKE_MD_TAG=false; + public static boolean MAKE_SM_TAG=false; + public static boolean MAKE_XM_TAG=false; + public static boolean MAKE_XS_TAG=false; + public static boolean MAKE_AS_TAG=false; //TODO: Alignment score from aligner + public static boolean MAKE_NH_TAG=false; + public static boolean MAKE_TOPHAT_TAGS=false; + public static boolean XS_SECONDSTRAND=false; + public static boolean MAKE_IDENTITY_TAG=false; + public static boolean MAKE_STOP_TAG=false; + public static boolean MAKE_CUSTOM_TAGS=false; + public static boolean MAKE_INSERT_TAG=false; + public static boolean MAKE_CORRECTNESS_TAG=false; + public static boolean CONVERT_CIGAR_TO_MATCH=false; + public static boolean SOFT_CLIP=true; + /** OK to use the "setFrom" function which uses the old SamLine instead of translating the read, if a genome is not loaded. Should be false when processing occurs. */ + public static boolean SET_FROM_OK=false; + /** For paired reads, keep original names rather than changing read2's name to match read1 */ + public static boolean KEEP_NAMES=false; + public static float VERSION=1.3f; + /** Tells program when to use 'N' rather than 'D' in cigar strings */ + public static int INTRON_LIMIT=Integer.MAX_VALUE; + + private static boolean warning=System.getProperty("user.dir").contains("/bushnell/"); + + /** SSAHA2 incorrectly calculates the start position of reads with soft-clipped starts, and needs this enabled. 
*/ + public static boolean SUBTRACT_LEADING_SOFT_CLIP=true; + /** Sort header scaffolds in alphabetical order to be more compatible with Tophat */ + public static boolean SORT_SCAFFOLDS=false; + public static boolean verbose=false; + + private static final String stringstar="*"; + private static final byte[] bytestar=new byte[] {(byte)'*'}; + private static final byte[] byteequals=new byte[] {(byte)'='}; + private static final String XSPLUS="XS:A:+", XSMINUS="XS:A:-"; + +} diff --git a/current/stream/SamReadInputStream.java b/current/stream/SamReadInputStream.java new file mode 100755 index 0000000..a04a758 --- /dev/null +++ b/current/stream/SamReadInputStream.java @@ -0,0 +1,224 @@ +package stream; + +import java.util.ArrayList; + +import align2.Shared; +import align2.Tools; + +import fileIO.ByteFile; +import fileIO.FileFormat; + +public class SamReadInputStream extends ReadInputStream { + + public static void main(String[] args){ + + FASTQ.PARSE_CUSTOM=false; + + SamReadInputStream sris=new SamReadInputStream(args[0], false, false, false, true); + + Read r=sris.next(); + System.out.println(r.toText(false)); + System.out.println(); + System.out.println(r.obj.toString()); + System.out.println(); + } + + public SamReadInputStream(String fname, boolean colorspace_, boolean loadHeader_, boolean interleaved_, boolean allowSubprocess_){ + this(FileFormat.testInput(fname, FileFormat.SAM, null, allowSubprocess_, false), colorspace_, loadHeader_, interleaved_); + } + + public SamReadInputStream(FileFormat ff, boolean colorspace_, boolean loadHeader_, boolean interleaved_){ + + colorspace=colorspace_; + loadHeader=loadHeader_; + +// interleaved=((tf.is==System.in || stdin) ? 
FASTQ.FORCE_INTERLEAVED : true); + interleaved=interleaved_; + + stdin=ff.stdio(); + if(!ff.samOrBam()){ + System.err.println("Warning: Did not find expected sam file extension for filename "+ff.name()); + } + + tf=ByteFile.makeByteFile(ff, false); + header=new ArrayList(); + + } + + @Override + public void start() { +// if(cris!=null){new Thread(cris).start();} + } + + + @Override + public boolean hasMore() { + if(buffer==null || next>=buffer.size()){ + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next x=nextList(); + if(x==null){return null;} + return x.toArray(new Read[x.size()]); + } + + @Override + public synchronized ArrayList nextList() { + if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");} + if(buffer==null || next>=buffer.size()){fillBuffer();} + ArrayList list=buffer; + buffer=null; + if(list!=null && list.size()==0){list=null;} + consumed+=(list==null ? 
0 : list.size()); +// System.err.println(hashCode()+" produced "+r[0].numericID); + return list; + } + public final boolean preferArrays(){return false;} + + private synchronized void fillBuffer(){ + + assert(buffer==null || next>=buffer.size()); + + buffer=null; + next=0; + + buffer=toReadList(tf, BUF_LEN, colorspace, nextReadID, FASTQ.PARSE_CUSTOM); + nextReadID+=buffer.size(); + generated+=buffer.size(); + + if(buffer.size() toReadList(ByteFile tf2, int buflen, + boolean colorspace2, long nextReadID2, boolean parseCustom) { + ArrayList list=new ArrayList(buflen); + while(list.size()(); + tf.reset(); + } + + public static synchronized ArrayList getSharedHeader(boolean wait){ + if(!wait || SHARED_HEADER!=null){return SHARED_HEADER;} + System.err.println("Waiting on header to be read from a sam file."); + while(SHARED_HEADER==null){ + try { + SamReadInputStream.class.wait(1000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + return SHARED_HEADER; + } + + public static synchronized void setSharedHeader(ArrayList list){ + SHARED_HEADER=list; + SamReadInputStream.class.notifyAll(); + } + + private static ArrayList SHARED_HEADER; + + @Override + public boolean paired() {return interleaved;} + + private ArrayList buffer=null; + private ArrayList header=null; + private int next=0; + + private final ByteFile tf; + private final boolean interleaved; + private final boolean loadHeader; + + private final int BUF_LEN=Shared.READ_BUFFER_LENGTH; + + public long generated=0; + public long consumed=0; + private long nextReadID=0; + + public final boolean colorspace; + public final boolean stdin; + +} diff --git a/current/stream/ScarfReadInputStream.java b/current/stream/ScarfReadInputStream.java new file mode 100755 index 0000000..0256b3c --- /dev/null +++ b/current/stream/ScarfReadInputStream.java @@ -0,0 +1,149 @@ +package stream; + +import java.util.ArrayList; + +import align2.Shared; + +import fileIO.ByteFile; 
+import fileIO.FileFormat; + +public class ScarfReadInputStream extends ReadInputStream { + + public static void main(String[] args){ + + ScarfReadInputStream fris=new ScarfReadInputStream(args[0], false, true); + + Read r=fris.next(); + System.out.println(r.toText(false)); + + } + + public ScarfReadInputStream(String fname, boolean colorspace_, boolean allowSubprocess_){ + this(FileFormat.testInput(fname, FileFormat.SCARF, null, allowSubprocess_, false), colorspace_); + } + + public ScarfReadInputStream(FileFormat ff, boolean colorspace_){ + if(verbose){System.err.println("ScarfReadInputStream("+ff.name()+")");} + + colorspace=colorspace_; + + stdin=ff.stdio(); + if(!ff.scarf()){ + System.err.println("Warning: Did not find expected scarf file extension for filename "+ff.name()); + } + + tf=ByteFile.makeByteFile(ff, false); + + interleaved=FASTQ.FORCE_INTERLEAVED;//((tf.is()==System.in || stdin) ? FASTQ.FORCE_INTERLEAVED : FASTQ.isInterleaved(tf.name)); +// assert(false) : interleaved; + } + + @Override + public void start() { +// if(cris!=null){new Thread(cris).start();} + } + + + @Override + public boolean hasMore() { + if(buffer==null || next>=buffer.size()){ + if(tf.isOpen()){ + fillBuffer(); + }else{ + assert(generated>0) : "Was the file empty?"; + } + } + return (buffer!=null && next x=nextList(); + if(x==null){return null;} + return x.toArray(new Read[x.size()]); + } + + @Override + public synchronized ArrayList nextList() { + if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");} + if(buffer==null || next>=buffer.size()){fillBuffer();} + ArrayList list=buffer; + buffer=null; + if(list!=null && list.size()==0){list=null;} + consumed+=(list==null ? 
0 : list.size()); +// System.err.println(hashCode()+" produced "+r[0].numericID); + return list; + } + public final boolean preferArrays(){return false;} + + private synchronized void fillBuffer(){ + + assert(buffer==null || next>=buffer.size()); + + buffer=null; + next=0; + + buffer=FASTQ.toScarfReadList(tf, BUF_LEN, colorspace, nextReadID, interleaved); + int bsize=(buffer==null ? 0 : buffer.size()); + nextReadID+=bsize; + if(bsize=maxReads)+", "+(chrom=buffer.length)); + System.out.println(id+", "+maxReads+", "+chrom+", "+maxChrom+", "+position+", "+maxPosition+", "+buffer+", "+next+", "+(buffer==null ? -1 : buffer.length)); + } +// if(buffer==null || next>=buffer.length){ +// if(tf.isOpen()){ +// fillBuffer(); +// }else{ +// assert(generated>0) : "Was the file empty?"; +// } +// } +// return (buffer!=null && next=maxReads){return false;} + if(chrom=buffer.length){return false;} + return true; + } + + @Override + public Read next() { + if(!hasMore()){return null;} + if(buffer==null || next>=buffer.length){fillBuffer();} + Read r=buffer[next]; + buffer[next]=null; + next++; + consumed++; + return r; + } + + @Override + public synchronized Read[] nextBlock() { + if(next!=0){throw new RuntimeException("'next' should not be used when doing blockwise access.");} + if(!hasMore()){return null;} + if(buffer==null || next>=buffer.length){fillBuffer();} + Read[] r=buffer; + buffer=null; + if(r!=null && r.length==0){r=null;} + consumed+=(r==null ? 
0 : r.length); + return r; + } + + @Override + public synchronized ArrayList nextList() { + return toList(nextBlock()); + } + public final boolean preferArrays(){return true;} + + private synchronized void fillBuffer(){ +// System.out.println("fill "+chrom+", "+position); + buffer=null; + if(chrom>maxChrom){return;} + ChromosomeArray cha=Data.getChromosome(chrom); + next=0; + + if(position==0){ + while(position<=maxPosition && !AminoAcid.isFullyDefined((char)cha.get(position))){position++;} + } + + Read[] reads=new Read[BUF_LEN]; + int index=0; + + while(position<=maxPosition && index=minReadlen){ + start=start+firstGood; + stop=stop-(s.length-lastGood-1); + s=Arrays.copyOfRange(s, firstGood, lastGood+1); + assert(s.length==lastGood-firstGood+1); + }else{ + s=null; + } + } + + if(s!=null){ + Read r=new Read(s, chrom, Gene.PLUS, start, stop, id, null, false); + if(alternateStrand && (r.numericID&1)==1){r.reverseComplement();} + r.setSynthetic(true); +// System.out.println("Made read: "+r); +// assert(id!=54406) : "\n"+r.toString()+"\nbases: "+s.length+"\nstart: "+start+"\nstop: "+stop+"\nminlen: "+minReadlen+"\n"; + + reads[index]=r; + index++; + position+=(POSITION_INCREMENT-overlap); + id++; + }else{ + //Move to the next defined position + while(AminoAcid.isFullyDefined((char)cha.get(position))){position++;} + while(position<=maxPosition && !AminoAcid.isFullyDefined((char)cha.get(position))){position++;} + } + } +// System.out.println("got "+index+" from "+chrom+", "+position); + + if(index==0){ + if(UNLOAD && chrom>0){Data.unload(chrom, true);} + chrom++; + position=0; + buffer=null; + fillBuffer(); + return; + } + + generated+=index; + + if(index list=new ArrayList(BUF_LEN); +// +// +// while(position<=maxPosition && list.size(), Cloneable{ + + public SiteScore(int chrom_, byte strand_, int start_, int stop_, int hits_, int quickScore_){ + start=start_; + stop=stop_; + hits=hits_; + quickScore=quickScore_; + score=quickScore_; + chrom=chrom_; + strand=strand_; 
+// assert(chrom_>=0) : this.toText()+"\nchrom_="+chrom_+", strand_="+strand_+", start_="+start_+", stop_="+stop_+", hits_="+hits_+", quickScore_="+quickScore_; + assert(start_<=stop_) : this.toText()+"\nchrom_="+chrom_+", strand_="+strand_+", start_="+start_+", stop_="+stop_+", hits_="+hits_+", quickScore_="+quickScore_; + } + + public SiteScore(int chrom_, byte strand_, int start_, int stop_, int hits_, int quickScore_, boolean rescued_, boolean perfect_){ + start=start_; + stop=stop_; + hits=hits_; + quickScore=quickScore_; + score=quickScore_; + chrom=chrom_; + strand=strand_; + rescued=rescued_; + perfect=perfect_; + semiperfect=perfect; + assert(start_<=stop_) : this.toText(); + } + + @Override + public int compareTo(SiteScore other) { + int x=other.score-score; + if(x!=0){return x;} + + x=other.slowScore-slowScore; + if(x!=0){return x;} + + x=other.pairedScore-pairedScore; + if(x!=0){return x;} + + x=other.quickScore-quickScore; + if(x!=0){return x;} + + x=chrom-other.chrom; + if(x!=0){return x;} + + x=start-other.start; + return x; + } + + public boolean equals(Object other){ + return compareTo((SiteScore)other)==0; + } + + public String toString(){ + return toText().toString(); + } + +// 9+2+1+9+9+1+1+4+4+4+4+gaps + public CharSequence toText(){ + StringBuilder sb=new StringBuilder(53+(gaps==null ? 0 : gaps.length*10)); + sb.append(chrom); + sb.append(','); + sb.append(strand); + sb.append(','); + sb.append(start); + sb.append(','); + sb.append(stop); + sb.append(','); + sb.append((rescued ? 1 : 0)); + sb.append(','); + sb.append((semiperfect ? 1 : 0)); + sb.append((perfect ? 
1 : 0)); + sb.append(','); + sb.append(hits); + sb.append(','); + sb.append(quickScore); + sb.append(','); + sb.append(slowScore); + sb.append(','); + sb.append(pairedScore); + sb.append(','); + sb.append(score); + + if(gaps!=null){ + sb.append(','); + for(int i=0; i0){sb.append('~');} + sb.append(gaps[i]); + } + } + + if(match!=null){ + sb.append(','); + final char[] buffer=Shared.getTLCB(match.length); + for(int i=0; i0){sb.append('~');} + sb.append(gaps[i]); + } + } + + if(match!=null){ + sb.append(','); + sb.append(match); + } + + return sb; +// chrom+","+strand+","+start+","+stop+","+(rescued ? 1 : 0)+","+ +// (perfect ? 1 : 0)+","+quickScore+","+slowScore+","+pairedScore+","+score; + } + + public boolean isSemiPerfect(byte[] bases){ + if(bases.length!=stop-start+1){return false;} + byte[] ref=Data.getChromosome(chrom).array; + + //This block handles cases where the read runs outside the reference + //Of course, padding the reference with 'N' would be better, but... + int readStart=0; + int readStop=bases.length; + final int refStop=start+bases.length; + int maxNoref=bases.length/2; + + if(start<0){ + readStart=0-start; + } + if(refStop>ref.length){ + int dif=(refStop-ref.length); + readStop-=dif; + } + + for(int i=readStart; i=ref.length){return false;} + + for(int i=0; i=ref.length){ + N+=(stop-ref.length+1); + assert(!perfect || !assumePerfectCorrect); + perfect=false; + } + if(N>nlimit){ + perfect=semiperfect=false; + assert(Read.CHECKSITE(this, bases, 0)); //123 + return perfect; + } + + final byte bn=(byte)'N'; + for(; refloc<=max; refloc++, readloc++){ + final byte c=bases[readloc]; + final byte r=ref[refloc]; + assert(Character.isUpperCase(r) && Character.isUpperCase(c)) : + "\nAn input read appears to contain a non-upper-case base. 
Please rerun with the 'touppercase' flag.\n"+ + r+", "+c+"\n"; + if(c!=r || c==bn){ + perfect=false; + if(c==bn){semiperfect=false;} + if(r!=bn || (N=N+1)>nlimit){ + semiperfect=false; + assert(Read.CHECKSITE(this, bases, 0)); //123 + return semiperfect; + } + } + } + + semiperfect=(semiperfect && (N<=nlimit)); + perfect=(perfect && semiperfect && (N==0)); + assert(Read.CHECKSITE(this, bases, 0)); //123 + return perfect; + } + + public final boolean overlaps(SiteScore ss){ + return chrom==ss.chrom && strand==ss.strand && overlap(start, stop, ss.start, ss.stop); + } + public final boolean overlaps(SiteScore ss, boolean ignoreStrand){ + return chrom==ss.chrom && (ignoreStrand || strand==ss.strand) && overlap(start, stop, ss.start, ss.stop); + } + private static boolean overlap(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=b1 && b2>=a1; + } + + public static String header() { + return "chrom,strand,start,stop,rescued,semiperfect+perfect,hits,quickScore,slowScore,pairedScore,score,match"; + } + + public static SiteScore fromText(String s){ +// System.err.println("Trying to make a SS from "+s); + String line[]=s.split(","); + + SiteScore ss; + + assert(line.length>=11 && line.length<=13) : "\n"+line.length+"\n"+s+"\n"+Arrays.toString(line)+"\n"; + int chrom=Byte.parseByte(line[0].charAt(0)=='*' ? 
line[0].substring(1) : line[0]); + byte strand=Byte.parseByte(line[1]); + int start=Integer.parseInt(line[2]); + int stop=Integer.parseInt(line[3]); + boolean rescued=Integer.parseInt(line[4])==1; +// [1, 1, 9397398, 9398220, 0, 00, 20, 8701, 9084, 0, 9084, 9397398~9397471~9398145~9398220] + int p=Integer.parseInt(line[5], 2); +// assert(false) : line[5]+"->"+p; + boolean perfect=(p&1)==1; + boolean semiperfect=(p&2)==2; + int hits=Integer.parseInt(line[6]); + int quickScore=Integer.parseInt(line[7]); + int swscore=Integer.parseInt(line[8]); + int pairedScore=Integer.parseInt(line[9]); + int score=Integer.parseInt(line[10]); + ss=new SiteScore(chrom, strand, start, stop, hits, quickScore, rescued, perfect); + ss.score=score; + ss.slowScore=swscore; + ss.pairedScore=pairedScore; + ss.semiperfect=semiperfect; + + if(line.length>11){ + String[] gstring=line[11].split("~"); + ss.gaps=new int[gstring.length]; + for(int i=0; i12){ + ss.match=line[12].getBytes(); + } + + return ss; + } + + public boolean positionalMatch(SiteScore b, boolean testGaps){ +// return chrom==b.chrom && strand==b.strand && start==b.start && stop==b.stop; + if(chrom!=b.chrom || strand!=b.strand || start!=b.start || stop!=b.stop){ + return false; + } + if(!testGaps || (gaps==null && b.gaps==null)){return true;} + if((gaps==null) != (b.gaps==null)){return false;} + if(gaps.length!=b.gaps.length){return false;} + for(int i=0; i{ + + private PositionComparator(){} + + @Override + public int compare(SiteScore a, SiteScore b) { + if(a.chrom!=b.chrom){return a.chrom-b.chrom;} + if(a.start!=b.start){return a.start-b.start;} + if(a.stop!=b.stop){return a.stop-b.stop;} + if(a.strand!=b.strand){return a.strand-b.strand;} + if(a.score!=b.score){return b.score-a.score;} + if(a.slowScore!=b.slowScore){return b.slowScore-a.slowScore;} + if(a.quickScore!=b.quickScore){return b.quickScore-a.quickScore;} + if(a.perfect!=b.perfect){return a.perfect ? -1 : 1;} + if(a.rescued!=b.rescued){return a.rescued ? 
1 : -1;} + return 0; + } + + public void sort(List list){ + if(list==null || list.size()<2){return;} + Collections.sort(list, this); + } + + public void sort(SiteScore[] list){ + if(list==null || list.length<2){return;} + Arrays.sort(list, this); + } + + } + + public SiteScore copy(){ + SiteScore ss2=this.clone(); + if(gaps!=null){ss2.gaps=ss2.gaps.clone();} + return ss2; + } + + public SiteScore clone(){ + try { + return (SiteScore)super.clone(); + } catch (CloneNotSupportedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + throw new RuntimeException(); + } + + public boolean isInBounds(){ + ChromosomeArray cha=Data.getChromosome(chrom); + return (start>=0 && stop<=cha.maxIndex); + } + + protected boolean matchContainsXY(){ + if(match==null || match.length<1){return false;} + final byte a=match[0], b=match[match.length-1]; + return (a=='X' ||a=='Y' || b=='X' || b=='Y'); + } + + public boolean isCorrect(int chrom_, byte strand_, int start_, int stop_, int thresh){ + if(chrom_!=chrom || strand_!=strand){return false;} + if(thresh<=0){return start_==start && stop_==stop;} + return Tools.absdif(start_, start)<=thresh || Tools.absdif(stop_, stop)<=thresh; + } + + /** TODO: Test + * Attempt to extend match/N symbols where there are X and Y symbols + * */ + public boolean fixXY(byte[] bases, boolean nullifyOnFailure, MSA msa){ + if(verbose){System.err.println("ss.fixXY()");} + + if(!matchContainsXY()){return true;} + + boolean disable=false; + if(disable){ + if(nullifyOnFailure){ + match=null; + } +// else if(clipOnFailure){ +// for(int i=0; i=match.length || mloc>=bases.length){success=false;} + else if(mloc>0){ + mloc--; + int rloc=start+mloc, cloc=mloc; + while(mloc>=0){ + byte m=match[mloc]; + byte c=bases[cloc]; + byte r=ca.get(rloc); + assert(m=='X' || m=='Y') : (char)m+", "+mloc+", "+(char)c+", "+(char)r+"\n"+new String(bases)+"\n"+this.toString(); + if(r=='N' || c=='N'){match[mloc]='N';} + else if(c==r){match[mloc]='m';} + else 
if(mloc<=tip){match[mloc]='S';} + else{ + success=false; + break; + } + mloc--; + rloc--; + cloc--; + } + } + } + + if(success){//Process right side + int mloc=match.length-1; + while(mloc>=0 && (match[mloc]=='X' || match[mloc]=='Y')){mloc--;} + int dif=match.length-1-mloc; + if(mloc<0){success=false;} + else if(dif>0){ + mloc++; + int rloc=stop-dif+1, cloc=bases.length-dif; + if(cloc<0){success=false;} + else{ + final int tip2=match.length-tip; + while(mloc=tip2){match[mloc]='S';} + else{ + success=false; + break; + } + mloc++; + rloc++; + cloc++; + } + } + } + } + + success=success && !matchContainsXY(); + if(!success && nullifyOnFailure){match=null;} + +// assert(false) : "TODO: Alter score to reflect changes"; //TODO + if(match!=null){slowScore=msa.score(match);} + + return success; + } + +// public boolean plus(){return strand()==Gene.PLUS;} +// public boolean minus(){return strand()==Gene.MINUS;} +// +// public final byte strand(){return (byte)(flags&strandMask);} +// public boolean rescued(){return (flags&rescuedMask)!=0;} +// public boolean perfect(){return (flags&perfectMask)!=0;} +// public boolean semiperfect(){return (flags&semiperfectMask)!=0;} +// +// public final int setStrand(int x){ +// assert(x==0 || x==1); +// if(x==0){flags=(flags&~strandMask);} +// else{flags=(flags|strandMask);} +// assert(strand()==x); +// return x; +// } +// public boolean setRescued(boolean b){ +// if(b){flags=(flags|rescuedMask);} +// else{flags=(flags&~rescuedMask);} +// assert(rescued()==b); +// return b; +// } +// public boolean setPerfect(boolean b){ +// if(b){flags=(flags|semiperfectMask);} +// else{flags=(flags&~semiperfectMask);} +// assert(perfect()==b); +// return b; +// } +// public boolean setSemiperfect(boolean b){ +// if(b){flags=(flags|semiperfectMask);} +// else{flags=(flags&~semiperfectMask);} +// assert(semiperfect()==b); +// return b; +// } + + public boolean plus(){return strand==Gene.PLUS;} + public boolean minus(){return strand==Gene.MINUS;} + public 
boolean perfect(){return perfect;} + public boolean semiperfect(){return semiperfect;} + public boolean rescued(){return rescued;} + public byte strand(){return strand;} + + public final byte strand; + public boolean rescued=false; + public boolean perfect=false; + public boolean semiperfect=false; + + public int start; + public int stop; + public int quickScore; + public int score; + public int slowScore; + public int pairedScore; + public int hits; + public final int chrom; + + public long flags; //TODO Use this instead of fields + + public int[] gaps; //Limits of large gaps + public byte[] match; + + + public static final PositionComparator PCOMP=new PositionComparator(); + public static final long strandMask=(1L<<0); + public static final long rescuedMask=(1L<<1); + public static final long perfectMask=(1L<<2); + public static final long semiperfectMask=(1L<<3); + public static boolean verbose=false; + +} diff --git a/current/stream/SiteScoreR.java b/current/stream/SiteScoreR.java new file mode 100755 index 0000000..fe06bf7 --- /dev/null +++ b/current/stream/SiteScoreR.java @@ -0,0 +1,285 @@ +package stream; + + +import java.util.Arrays; +import java.util.Collections; +import java.util.Comparator; +import java.util.List; + + + +/** + * @author Brian Bushnell + * @date Jul 16, 2012 + * + */ +public final class SiteScoreR implements Comparable{ + + public SiteScoreR(SiteScore ss, int readlen_, long numericID_, byte pairnum_){ + this(ss.chrom, ss.strand, ss.start, ss.stop, readlen_, numericID_, pairnum_, ss.score, ss.pairedScore, ss.perfect, ss.semiperfect); + } + + public SiteScoreR(int chrom_, byte strand_, int start_, int stop_, int readlen_, long numericID_, byte pairnum_, int score_, int pscore_, boolean perfect_, boolean semiperfect_){ + chrom=chrom_; + strand=strand_; + start=start_; + stop=stop_; + readlen=readlen_; + numericID=numericID_; + pairnum=pairnum_; + score=score_; + pairedScore=pscore_; + perfect=perfect_; + semiperfect=semiperfect_|perfect_; + 
assert(start_<=stop_) : this.toText(); + } + + @Override + public int compareTo(SiteScoreR other) { + int x=other.score-score; + if(x!=0){return x;} + + x=other.pairedScore-pairedScore; + if(x!=0){return x;} + + x=chrom-other.chrom; + if(x!=0){return x;} + + x=strand-other.strand; + if(x!=0){return x;} + + x=start-other.start; + return x; + } + + public boolean equals(Object other){ + return compareTo((SiteScoreR)other)==0; + } + + public boolean equals(SiteScore other){ + if(other.start!=start){return false;} + if(other.stop!=stop){return false;} + if(other.chrom!=chrom){return false;} + if(other.strand!=strand){return false;} + return true; + } + + public boolean equals(SiteScoreR other){ + return compareTo(other)==0; + } + + public String toString(){ +// StringBuilder sb=new StringBuilder(); +// sb.append('\t'); +// sb.append(start); +// int spaces=10-sb.length(); +// for(int i=0; i=a1; + } + + public static String header() { + return "chrom,strand,start,stop,readlen,numericID,pairnum,semiperfect+perfect,quickScore,slowScore,pairedScore,score"; + } + + public static SiteScoreR fromText(String s){ +// System.err.println("Trying to make a SS from "+s); + String line[]=s.split(","); + + SiteScoreR ss; + + assert(line.length==10 || line.length==11) : "\n"+line.length+"\n"+s+"\n"+Arrays.toString(line)+"\n"; + boolean correct=false; + if(line[0].charAt(0)=='*'){ + correct=true; + line[0]=line[0].substring(1); + } + int chrom=Byte.parseByte(line[0]); + byte strand=Byte.parseByte(line[1]); + int start=Integer.parseInt(line[2]); + int stop=Integer.parseInt(line[3]); + int readlen=Integer.parseInt(line[4]); + long numericID=Long.parseLong(line[5]); + byte pairnum=Byte.parseByte(line[6]); + int p=Integer.parseInt(line[7], 2); + boolean perfect=(p&1)==1; + boolean semiperfect=(p&2)==2; + int pairedScore=Integer.parseInt(line[8]); + int score=Integer.parseInt(line[9]); + ss=new SiteScoreR(chrom, strand, start, stop, readlen, numericID, pairnum, score, pairedScore, perfect, 
semiperfect); + ss.correct=correct; + + return ss; + } + + public static SiteScoreR[] fromTextArray(String s){ + String[] split=s.split("\t"); + SiteScoreR[] out=new SiteScoreR[split.length]; + for(int i=0; i{ + + private PositionComparator(){} + + @Override + public int compare(SiteScoreR a, SiteScoreR b) { + if(a.chrom!=b.chrom){return a.chrom-b.chrom;} + if(a.start!=b.start){return a.start-b.start;} + if(a.stop!=b.stop){return a.stop-b.stop;} + if(a.strand!=b.strand){return a.strand-b.strand;} + if(a.score!=b.score){return b.score-a.score;} + if(a.perfect!=b.perfect){return a.perfect ? -1 : 1;} + return 0; + } + + public void sort(List list){ + if(list==null || list.size()<2){return;} + Collections.sort(list, this); + } + + public void sort(SiteScoreR[] list){ + if(list==null || list.length<2){return;} + Arrays.sort(list, this); + } + + } + + public static class NormalizedComparator implements Comparator{ + + private NormalizedComparator(){} + + @Override + public int compare(SiteScoreR a, SiteScoreR b) { + if((int)a.normalizedScore!=(int)b.normalizedScore){return (int)b.normalizedScore-(int)a.normalizedScore;} + if(a.score!=b.score){return b.score-a.score;} + if(a.pairedScore!=b.pairedScore){return b.pairedScore-a.pairedScore;} + if(a.retainVotes!=b.retainVotes){return b.retainVotes-a.retainVotes;} + if(a.perfect!=b.perfect){return a.perfect ? 
-1 : 1;} + if(a.chrom!=b.chrom){return a.chrom-b.chrom;} + if(a.start!=b.start){return a.start-b.start;} + if(a.stop!=b.stop){return a.stop-b.stop;} + if(a.strand!=b.strand){return a.strand-b.strand;} + return 0; + } + + public void sort(List list){ + if(list==null || list.size()<2){return;} + Collections.sort(list, this); + } + + public void sort(SiteScoreR[] list){ + if(list==null || list.length<2){return;} + Arrays.sort(list, this); + } + + } + + public static class IDComparator implements Comparator{ + + private IDComparator(){} + + @Override + public int compare(SiteScoreR a, SiteScoreR b) { + if(a.numericID!=b.numericID){return a.numericID>b.numericID ? 1 : -1;} + if(a.pairnum!=b.pairnum){return a.pairnum-b.pairnum;} + + if(a.chrom!=b.chrom){return a.chrom-b.chrom;} + if(a.start!=b.start){return a.start-b.start;} + if(a.stop!=b.stop){return a.stop-b.stop;} + if(a.strand!=b.strand){return a.strand-b.strand;} + if(a.score!=b.score){return b.score-a.score;} + if(a.perfect!=b.perfect){return a.perfect ? 
-1 : 1;} + return 0; + } + + public void sort(List list){ + if(list==null || list.size()<2){return;} + Collections.sort(list, this); + } + + public void sort(SiteScoreR[] list){ + if(list==null || list.length<2){return;} + Arrays.sort(list, this); + } + + } + + public static final PositionComparator PCOMP=new PositionComparator(); + public static final NormalizedComparator NCOMP=new NormalizedComparator(); + public static final IDComparator IDCOMP=new IDComparator(); + + public int reflen(){return stop-start+1;} + + public int start; + public int stop; + public int readlen; + public int score; + public int pairedScore; + public final int chrom; + public final byte strand; + public boolean perfect; + public boolean semiperfect; + public final long numericID; + public final byte pairnum; + public float normalizedScore; +// public int weight=0; //Temp variable, for calculating normalized score + public boolean correct=false; + public int retainVotes=0; + +} diff --git a/current/var/ApplyVarsToReference.java b/current/var/ApplyVarsToReference.java new file mode 100755 index 0000000..79dcd84 --- /dev/null +++ b/current/var/ApplyVarsToReference.java @@ -0,0 +1,319 @@ +package var; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; + + +import align2.IndexMaker4; +import align2.Tools; +import dna.ChromosomeArray; +import dna.Data; +import dna.FastaToChromArrays; +import dna.Gene; +import dna.Timer; +import fileIO.ReadWrite; + +/** + * @author Brian Bushnell + * @date Jul 23, 2012 + * + */ +public class ApplyVarsToReference { + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + Timer t=new Timer(); + t.start(); + + String inPattern=args[0]; + + int minChrom=-1; + int maxChrom=-1; + int outgenome=-1; + Data.GENOME_BUILD=-1; + String name=null; + + for(int i=1; i1 ? 
split[1] : null); + + if(a.equals("ingenome")){ + Data.setGenome(Integer.parseInt(b)); + if(minChrom==-1){minChrom=1;} + if(maxChrom==-1){maxChrom=Data.numChroms;} + }else if(a.equals("outgenome")){ + outgenome=Integer.parseInt(b); + }else if(a.equals("minchrom")){ + minChrom=Integer.parseInt(b); + }else if(a.equals("maxchrom")){ + maxChrom=Integer.parseInt(b); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=Integer.parseInt(b); + }else if(a.equals("nblocksize")){ + N_BLOCK_SIZE=Integer.parseInt(b); + }else if(a.equals("nblocktrigger")){ + N_BLOCK_TRIGGER=Integer.parseInt(b); + }else if(a.equals("staynearref")){ + STAY_NEAR_REF=Tools.parseBoolean(b); + }else if(a.equals("overwrite") || a.equals("ow")){ + OVERWRITE=Tools.parseBoolean(b); + }else if(a.startsWith("regen")){ + REGEN_N_BLOCKS=Tools.parseBoolean(b); + }else if(a.startsWith("name=")){ + REGEN_N_BLOCKS=Tools.parseBoolean(b); + }else{ + System.err.println("Unknown argument "+arg); + } + } + + assert(Data.GENOME_BUILD>-1); + assert(outgenome>-1); +// assert(Data.GENOME_BUILD!=outgenome); + if(Data.GENOME_BUILD==outgenome){ + System.out.println("Warning! Overwriting input genome "+outgenome); + } + + String fname=Data.chromFname(minChrom, outgenome); + File f=new File(fname.substring(0, fname.lastIndexOf('/'))); +// assert(false) : f.getAbsolutePath(); + if(!f.exists()){f.mkdirs();} + + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + String outName=Data.chromFname(chrom, outgenome); + assert(OVERWRITE || !new File(outName).exists()) : "Destination "+outName+" already exists."; +// assert(false) : inPattern+", "+outName; + process(inPattern.replaceFirst("#", ""+chrom), outName, chrom); + } + + FastaToChromArrays.writeInfo(outgenome, maxChrom, (name==null ? 
Data.name : name), ""+Data.GENOME_BUILD+"_plus_variations", false, false); + + t.stop(); + + { + String path=IndexMaker4.fname(1, 1, 12, 1, false); + int lastSlash=path.lastIndexOf('/'); + path=path.substring(0, lastSlash); + File dir=new File(path); + if(dir.exists()){ + System.out.println("Deleting old index for "+outgenome); + for(File f2 : dir.listFiles()){ + if(f2.isFile() && (f2.getName().contains(".int2d") || f2.getName().endsWith(".txt"))){ + f2.delete(); + } + } + } + } + +// System.out.println("Vars in: \t"+VARS_IN); +// System.out.println("Vars out:\t"+VARS_OUT); + System.out.println(); + System.out.println("Time: \t"+t); + + } + + /** + * @param replaceFirst + * @param chromFname + * @param chrom + */ + public static void process(String inVarsName, String outChromName, int chrom) { + ArrayList vars=Varlet.fromTextFile(inVarsName); + ChromosomeArray cha=Data.getChromosome(chrom); + ChromosomeArray chb=new ChromosomeArray(chrom, Gene.PLUS); + + //Next location to read in a + int aloc=0; + //Next location to set in b + int bloc=0; + + for(int i=0; i=aloc) : i+"\n"+vars.get(i-1)+"\n"+v+"\n"; //Overlapping variations + + while(v.beginLoc=vars.size()){break;} + v=vars.get(i); + } + + if(STAY_NEAR_REF && Tools.absdif(aloc, bloc)>=REF_LIMIT){ + int dif=v.lengthDif(); + + if(aloc0){ +// System.err.print("i"); + i++; + if(i>=vars.size()){break;} + v=vars.get(i); + dif=v.lengthDif(); + } + }else{//skip deletions + while(dif<0){ +// System.err.print("d"); + i++; + if(i>=vars.size()){break;} + v=vars.get(i); + dif=v.lengthDif(); + } + } + } + + //Advance to variation's beginning + while(aloc0); + aloc+=len; + } + } + + //Finish writing array + while(aloc=trigger){ + while(ns=0; i--){ + if(chb.get(i)!='N'){break;} + } + while(ns1 ? 
split[1] : null); + + if(a.startsWith("mincov")){ + minCoverage=Integer.parseInt(b); + assert(minCoverage>0); + }else if(a.startsWith("consensus")){ + consensusRatio=Float.parseFloat(b); +// assert(consensusRatio>=0.5f && consensusRatio<=1f); + assert(consensusRatio>=0f && consensusRatio<=1f); + consensusRatioNR=1-(1-consensusRatio)*.5f; //Lower multiplier is more accurate +// assert(false) : consensusRatioNR; + }else if(a.equals("genome") || a.equals("build")){ + Data.setGenome(Integer.parseInt(b)); + if(minChrom==-1){minChrom=1;} + if(maxChrom==-1){maxChrom=Data.numChroms;} + }else if(a.equals("verbose")){ + verbose=Tools.parseBoolean(b); + }else if(a.equals("minchrom")){ + minChrom=Integer.parseInt(b); + }else if(a.equals("maxchrom")){ + maxChrom=Integer.parseInt(b); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=Integer.parseInt(b); + }else if(a.startsWith("noref") || a.startsWith("undef")){ + NOREF_CAP=Integer.parseInt(b); + }else{ + System.err.println("Unknown argument "+arg); + } + } + + for(int chrom=minChrom; chrom<=maxChrom; chrom++){ + process(inVarsPattern.replaceFirst("#", ""+chrom), inCovPattern.replaceFirst("#", ""+chrom), outPattern.replaceFirst("#", ""+chrom), chrom, minCoverage); + } + + t.stop(); + + System.out.println(); + System.out.println("Vars in: \t"+(VARS_IN-NOREFS_IN)); + System.out.println("Length Delta in: \t"+VARLEN_IN); + System.out.println("No-refs in: \t"+NOREFS_IN); + System.out.println(); + System.out.println("Vars out: \t"+(VARS_OUT-NOREFS_OUT)); + System.out.println("Length Delta out: \t"+VARLEN_OUT); + System.out.println("No-refs out: \t"+NOREFS_OUT); + System.out.println(); + System.out.println("Time: \t"+t); + + } + + /** Now removes overlapping vars by retaining better quality one. 
*/ + public static void process(final String invars, final String incov, final String outfile, final int chrom, final int mincov){ + TextFile tf=new TextFile(invars, true, false); + CoverageArray ca=ReadWrite.read(CoverageArray.class, incov); + TextStreamWriter tsw=new TextStreamWriter(outfile, true, false, true); + tsw.start(); + + ChromosomeArray cha=Data.getChromosome(chrom); + + Varlet prev=null; + + tsw.println(Varlet.header()); + + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='#'){ + Varlet v=Varlet.fromText(s); + VARS_IN++; + int dif=v.lengthDif(); + VARLEN_IN+=dif; + if(v.varType==Variation.NOREF){NOREFS_IN++;} + + boolean passes=passesFilter(v, ca, cha, mincov); + boolean overlap=(prev==null ? false : v.beginLoc<=prev.endLoc); +// if(passes){System.out.println(v.varTypeMap[v.varType]+" " + +// ((v.ref==null || v.ref.length()<1 ? "." : v.ref)+" "+(v.call==null || v.call.length()<1 ? "." : v.call))+ +// " \tchr"+v.chromosome+" "+v.beginLoc+" \tdepth "+v.numReads+" / "+ca.get(v.beginLoc)+"");} + + if(!overlap){ + if(prev!=null){ + StringBuilder sb=prev.toText().append('\n'); + tsw.print(sb); + VARS_OUT++; + VARLEN_OUT+=prev.lengthDif(); + if(prev.varType==Variation.NOREF){NOREFS_OUT++;} + } + prev=null; + }else{ + if(passes && v.score()>prev.score()){ + prev=null; + }else{ + v=null; + } + } + + if(passes && v!=null){ + prev=v; + } + +// if(passesFilter(v, ca, cha, mincov)){ +// StringBuilder sb=v.toText().append('\n'); +// tsw.print(sb); +// VARS_OUT++; +// VARLEN_OUT+=dif; +// if(v.varType==Variation.NOREF){NOREFS_OUT++;} +// }else{ +// +// } + } + } + + if(prev!=null){ + StringBuilder sb=prev.toText().append('\n'); + tsw.print(sb); + VARS_OUT++; + VARLEN_OUT+=prev.lengthDif(); + if(prev.varType==Variation.NOREF){NOREFS_OUT++;} + } + + tf.close(); + tsw.poison(); + Data.unload(chrom, true); + + } + + + /** + * @param v + * @param ca + * @return + */ + private static boolean passesFilter(Varlet v, CoverageArray ca, ChromosomeArray 
cha, int minCoverageToPass) { + + int dif=v.lengthDif(); + + int midLoc=(v.beginLoc+v.endLoc)/2; + int midCov=ca.get(midLoc); + int maxCov=midCov, minCov=midCov; + + int bound1, bound2; + float ratio; + + if(verbose){System.err.println("\nConsidering varlet "+v);} + + if(v.varType==Variation.NOREF){ + bound1=v.beginLoc; + bound2=v.endLoc; + minCoverageToPass=minCoverageToPass*2+5; + ratio=consensusRatioNR; + }else{ + bound1=v.beginLoc; + bound2=v.endLoc; + ratio=consensusRatio; +// if(dif<0){minCoverageToPass++;} //Helps reduce deletion bias + } + + for(int i=bound1; i<=bound2; i++){ + int cov=ca.get(i); + minCov=Tools.min(minCov, cov); + maxCov=Tools.max(maxCov, cov); + if(verbose){System.err.println("minCov = "+minCov+", maxCov = "+maxCov);} + } +// if(dif<) + + if(minCov=0){ + int a=Tools.max(v.beginLoc-NOREF_CAP, cha.minIndex); + int b=Tools.min(v.endLoc+NOREF_CAP, cha.maxIndex); + if(cha.isFullyUndefined(a, b)){ + if(verbose){System.err.println("Noref cap, mincov="+minCov+"\n"+v);} + return false; + } + } + } + if(verbose){System.err.println("Retaining variation.");} + return true; + } + + + /** TODO */ + public static int THREADS=1; + public static int NOREF_CAP=-1; + public static float consensusRatio=1f; + public static float consensusRatioNR=1f; + public static long VARS_IN=0; + public static long VARLEN_IN=0; + public static long NOREFS_IN=0; + public static long VARS_OUT=0; + public static long VARLEN_OUT=0; + public static long NOREFS_OUT=0; + public static boolean verbose=false; + +} diff --git a/current/var/GenerateVarlets.java b/current/var/GenerateVarlets.java new file mode 100755 index 0000000..4467603 --- /dev/null +++ b/current/var/GenerateVarlets.java @@ -0,0 +1,678 @@ +package var; + +import java.io.IOException; +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.zip.ZipOutputStream; + +import pacbio.CalcCoverageFromSites; +import 
pacbio.SiteR; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.SiteScore; +import stream.SiteScoreR; + + +import dna.Data; +import dna.Gene; +import dna.Timer; + +import fileIO.ReadWrite; +import fileIO.TextFile; + +import align2.ListNum; +import align2.MultiStateAligner9ts; +import align2.Tools; +import align2.TranslateColorspaceRead; + +public class GenerateVarlets { + + + public static void main(String[] args){ + + Data.GENOME_BUILD=-1; + + String reads1=args[0]; + String reads2=args[1].equalsIgnoreCase("null") ? null : args[1]; + String outname=args[2]; +// assert(outname.contains("#")); + + String sitesfile=null; + + int minChrom=1; + int maxChrom=1; + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + + + for(int i=3; i=0); + + if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;} + GenerateVarlets gv=new GenerateVarlets(reads1, reads2, outname, minChrom, maxChrom, MAX_READS, sitesfile); + gv.process(); + } + + public GenerateVarlets(String fname1, String fname2, String outname_, int minChrom, int maxChrom, long maxReads, String sitesfile_){ + this(new RTextInputStream(fname1, fname2, maxReads), outname_, minChrom, maxChrom, maxReads, sitesfile_); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + } + + public GenerateVarlets(RTextInputStream stream_, String outname_, int minChrom, int maxChrom, long maxReads, String sitesfile_){ + sitesfile=sitesfile_; + stream=stream_; + outname=outname_; + assert(outname.contains("#")) : "Output file name must contain the character '#' to be used for chromosome number."; + + outArray=new OutputStream[maxChrom+1]; + printArray=new PrintWriter[maxChrom+1]; + for(int i=minChrom; i> loadSites_old(String fname) { + HashMap> map=new HashMap>(4096); + TextFile tf=new TextFile(fname, false, false); + + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + SiteScoreR[] array=CalcCoverageFromSites.toSites(s); + for(SiteScoreR ssr : 
array){ + long key=ssr.numericID; + if((ssr.pairnum&1)==1){ + key=-key; + assert(key<0); + } + ArrayList list=map.get(key); + if(list==null){ + list=new ArrayList(4); + map.put(key, list); + } + list.add(ssr); + } + } + return map; + } + + + /** + * @param sitesfile2 + * @return + */ + private static final HashMap loadSites(String fname) { + HashMap map=new HashMap(4096); + TextFile tf=new TextFile(fname, false, false); + + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + SiteScoreR[] array=CalcCoverageFromSites.toSites(s); + for(SiteScoreR ssr : array){ + SiteR sr=new SiteR(ssr); + Long key=sr.idPairnum; + + SiteR head=map.get(key); + sr.next=head; + map.put(key, sr); + } + } + return map; + } + + + private void writeList(ArrayList list){ + + assert(list!=null && list.size()>0); + int chrom=list.get(0).chromosome; + + PrintWriter out=printArray[chrom]; + synchronized(out){ + for(Varlet v : list){ + out.println(v.toText()); + } + } + + } + + + private final class ProcessThread extends Thread { + + public ProcessThread(){ + for(int i=1; i(WRITE_BUFFER); + } + } + + @Override + public void run(){ + + final boolean processReads=true; + if(!processReads){System.err.println("Warning: Skipping read processing.");} + + if(cris!=null){ + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(!terminate && reads!=null && reads.size()>0){ + if(processReads){processReads(reads);} + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + }else{ + ArrayList reads=stream.nextList(); + while(!terminate && reads!=null && reads.size()>0){ + if(processReads){processReads(reads);} + reads=stream.nextList(); + } + } + + for(ArrayList list : lists){ + if(list!=null && !list.isEmpty()){ + if(MERGE_EQUAL_VARLETS){ + mergeEqualVarlets(list); + }else{ + Collections.sort(list); + } + writeList(list); + list=null; + } + } + + finished=true; + synchronized(this){this.notifyAll();} + } + + private void processReads(ArrayList reads){ + + if(sitemap==null){ + for(Read r : reads){ + Read r2=r.mate; + assert(r2==null || r.mate.mate==r); + + if(r2==null){ + processRead(r); + }else{ + if(!TOSS_SOLO1 || r.paired()){processRead(r);} + if(!TOSS_SOLO2 || r2.paired()){processRead(r2);} + } + } + }else{ + for(Read r : reads){ + Read r2=r.mate; + assert(r2==null || r.mate.mate==r); + + if(r2==null){ + multiprocessRead(r); + }else{ + if(!TOSS_SOLO1 || r.paired()){multiprocessRead(r);} + if(!TOSS_SOLO2 || r2.paired()){multiprocessRead(r2);} + } + } + } + } + + @Deprecated + private void multiprocessRead_old(Read r){ + long key=r.numericID; + if((r.pairnum()&1)==1){ + key=-key; + assert(key<0); + } + if(true){throw new RuntimeException("Deprecated.");} + ArrayList alssr=null;//sitemap.get(key); + if(alssr==null){return;} + + + for(SiteScoreR ssr : alssr){ + SiteScore ss=find(ssr, r.sites); + assert(ss!=null) : "\nCan't find ssr "+ssr+" in read\n"+r+"\n"; + + r.clearSite(); + r.setFromSite(ss); + r.match=null; + + r.setPaired(ss.pairedScore>0); + r.setPerfect(ss.perfect); + r.setRescued(ss.rescued); + + processRead(r); + } + } + + private void multiprocessRead(Read r){ + long key=r.numericID; + if((r.pairnum()&1)==1){ + key=-key; + assert(key<0); + } + + + SiteR head=sitemap.get(key); + +// assert(head==null) : "\n"+r.pairnum()+", "+key+",\n"+r.list+",\n"+r.mate.list+"\n"+head.toTextRecursive(null)+"\n"; + + while(head!=null){ + SiteScore ss=find(head, r.sites); + 
assert(ss!=null) : "\nCan't find sr "+head+" in read\n"+r+"\n"; + + r.clearSite(); + r.setFromSite(ss); + r.match=null; + + r.setPaired(ss.pairedScore>0); + r.setPerfect(ss.perfect); + r.setRescued(ss.rescued); + + processRead(r); + SiteR old=head; + head=old.next; + old.next=null; //Clears up memory. + } + } + + /** + * @param ssr + * @param list + * @return + */ + private SiteScore find(SiteScoreR ssr, ArrayList list) { + for(SiteScore ss : list){ + if(ssr.equals(ss)){return ss;} + } + return null; + } + + private SiteScore find(SiteR sr, ArrayList list) { + for(SiteScore ss : list){ + if(sr.equals(ss)){return ss;} + } + return null; + } + + + private void processRead(Read r_){ + + boolean flag=false; + if(false && (/*r_.numericID==30719442 || r_.numericID==107055007 || */ r_.numericID==42829556) /*&& r_.bases.length<=35*/){ + System.err.println("Processing read:"); + System.err.println("\n"+r_.toText(false)); + System.err.println("\n"+r_.strand()); + System.err.println("\n"); + System.err.println(new String(r_.bases)); + System.err.println(r_.match==null ? "null" : new String(r_.match)); + System.err.println("\n"); + tcr.verbose=true; + flag=true; + System.err.println("Mapped Length: "+(r_.stop-r_.start+1)); + } + + +// if(r_.chrom<1 && r_.list!=null && r_.list.size()>0){ +// SiteScore ss=r_.list.get(0); //Should not be necessary +// r_.start=ss.start; +// r_.stop=ss.stop; +// r_.chrom=ss.chrom; +// r_.setStrand(ss.strand); +// } + assert((r_.chrom>=1)==r_.mapped()) : r_.toText(false); + if(!r_.mapped()){//Unmapped. + assert(r_.sites==null || r_.sites.isEmpty()) : r_.toText(false); + return; + } + if(r_.invalid()){return;} //Probably trimmed too short to be used. + + if(r_.match!=null){ + if(r_.perfect()){//Hopefully this will be set correctly... 
+ assert(TranslateColorspaceRead.perfectMatch(r_.match)); + return; + }else if(TranslateColorspaceRead.perfectMatch(r_.match)){ + return; + } + } + + final Read r; + + if(r_.colorspace()){ + r=tcr.translateToBasespace(r_); + if(r==null){ +// System.err.println("Decoder broke from read "+r_.toText(false)); + return; + } + }else{ + r=r_; +// r.errors=r.estimateErrors(); + } + r_=null; + + if(flag){ + System.err.println("r.match = "+(r.match==null ? null : new String(r.match))); + System.err.println("Mapped Length: "+(r.stop-r.start+1)); + } +// if(r.match!=null){ +// for(int i=0; i vars=tcr.toVars(r, CONDENSE, CONDENSE_SNPS, SPLIT_SUBS); + + if(vars==null){return;} + +// if(r.numericID==36858949){ +// System.err.println(r.toText(false)); +// System.err.println(r.copies); +// System.err.println(r.mate.toText(false)); +// System.err.println(r.mate.copies); +// System.err.println(); +// +// for(Varlet v : vars){ +// System.err.println(v.toText()); +// System.err.println(v.numReads); +// } +// assert(false); +// } + + for(Varlet v : vars){ + if(v.endDist>=MIN_END_DIST){ + assert(v.numUniqueReads==1); + assert(v.numSemiUniqueReads==1); + assert(v.numPlusReads1+v.numMinusReads1+v.numPlusReads2+v.numMinusReads2==1); + assert(v.numReads>=1); + // assert(!TranslateColorspaceReadPacBio.COUNT_DUPLICATES_WHEN_MAKING_VARLETS || v.numReads==1); + assert(v.numReads==r.copies); + assert(v.readMapLen==r.mapLength); + assert(v.readLen==r.bases.length); + varsMade++; + if(v.varType==Variation.NOREF){norefsMade++;} + else if(v.varType==Variation.SNP){snpMade++;} + else if(v.varType==Variation.DEL){delMade++;} + else if(v.varType==Variation.INS){insMade++;} + else if(v.varType==Variation.DELINS){ + int a=v.lengthRef(); + int b=v.lengthVar(); + if(a==b){subnMade++;} + else if(a>b){subdMade++;} + else{subiMade++;} + } + deltaLen+=v.lengthDif(); + addVar(v); + } + } +// System.out.println(varsMade+", "+norefsMade); + } + + private void addVar(Varlet v){ + ArrayList list=lists[v.chromosome]; 
+ list.add(v); + if(list.size()>=WRITE_BUFFER){ + + if(MERGE_EQUAL_VARLETS){ + mergeEqualVarlets(list); + }else{ + Collections.sort(list); + } + + writeList(list); + lists[v.chromosome]=new ArrayList(WRITE_BUFFER); + } + } + + private void mergeEqualVarlets(ArrayList vars){ + + Collections.sort(vars); + ArrayList list=new ArrayList(8); + for(int i=0; i lists[]=new ArrayList[Gene.chromCodes.length]; + private boolean finished=false; + private boolean terminate=false; + private long varsMade=0; + private long norefsMade=0; + private long snpMade=0; + private long delMade=0; + private long subnMade=0; + private long subdMade=0; + private long subiMade=0; + private long insMade=0; + private long deltaLen=0; + + + } + + public final String outname; + public final String sitesfile; +// private HashMap> sitemap=null; + private HashMap sitemap=null; + private final RTextInputStream stream; + private final ConcurrentReadInputStream cris; + private final OutputStream[] outArray; + private final PrintWriter[] printArray; + + public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads. + + public static int THREADS=5; + public static int WRITE_BUFFER=20000; //Bigger number uses more memory, for less frequent writes. 
+ + public static boolean CONDENSE=true; + public static boolean CONDENSE_SNPS=true; + public static boolean SPLIT_SUBS=false; + + public static boolean TOSS_SOLO1=false; + public static boolean TOSS_SOLO2=false; + + public static boolean MERGE_EQUAL_VARLETS=false; + public static boolean PAC_BIO_MODE=true; + public static int ALIGN_ROWS=2020; + public static int ALIGN_COLUMNS=3000; + + + public static long MAX_READS=-1; + public static final int MIN_END_DIST=4; + +} diff --git a/current/var/GenerateVarlets2.java b/current/var/GenerateVarlets2.java new file mode 100755 index 0000000..76e781d --- /dev/null +++ b/current/var/GenerateVarlets2.java @@ -0,0 +1,678 @@ +package var; + +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; + +import pacbio.CalcCoverageFromSites; +import pacbio.SiteR; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.SiteScore; +import stream.SiteScoreR; + + +import dna.Data; +import dna.Timer; + +import fileIO.ReadWrite; +import fileIO.TextFile; + +import align2.ListNum; +import align2.MultiStateAligner9ts; +import align2.Tools; +import align2.TranslateColorspaceRead; + +/** Splits output files across blocks for low memory usage */ +public class GenerateVarlets2 { + + + public static void main(String[] args){ + + Data.GENOME_BUILD=-1; + + String reads1=args[0]; + String reads2=args[1].equalsIgnoreCase("null") ? null : args[1]; + String outname=args[2]; +// assert(outname.contains("#")); + + String sitesfile=null; + + byte minChrom=1; + byte maxChrom=1; + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + + + for(int i=3; i1 ? 
split[1] : "true"); + if("t".equals(b)){b="true";} + if("f".equals(b)){b="false";} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("condense")){ + CONDENSE=Tools.parseBoolean(b); + }else if(a.equals("condensesnps")){ + CONDENSE_SNPS=Tools.parseBoolean(b); + }else if(a.startsWith("splitsubs")){ + SPLIT_SUBS=Tools.parseBoolean(b); + }else if(a.equals("tosssolo1")){ + TOSS_SOLO1=Tools.parseBoolean(b); + }else if(a.equals("tosssolo2")){ + TOSS_SOLO2=Tools.parseBoolean(b); + }else if(a.startsWith("minchrom")){ + minChrom=Byte.parseByte(b); + }else if(a.startsWith("maxchrom")){ + maxChrom=Byte.parseByte(b); + }else if(a.startsWith("build") || a.startsWith("genomebuild") || a.startsWith("genome")){ + Data.setGenome(Integer.parseInt(b)); + System.out.println("Set GENOME_BUILD to "+Data.GENOME_BUILD); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=(Integer.parseInt(b)); + }else if(a.startsWith("buffer") || a.startsWith("writebuffer")){ + WRITE_BUFFER=(Integer.parseInt(b)); + }else if(a.startsWith("maxreads")){ + MAX_READS=(Long.parseLong(b)); + }else if(a.equals("blocksize")){ + BLOCKSIZE=(Integer.parseInt(b)); + }else if(a.startsWith("sites") || a.startsWith("sitesfile")){ + sitesfile=b; + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else{ + throw new 
RuntimeException("Unknown parameter "+args[i]); + } + } + assert(minChrom<=maxChrom && minChrom>=0); + if(Data.GENOME_BUILD<0){throw new RuntimeException("Please set genome number.");} + + if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;} + GenerateVarlets2 gv=new GenerateVarlets2(reads1, reads2, outname, minChrom, maxChrom, MAX_READS, sitesfile); + gv.process(); + } + + public GenerateVarlets2(String fname1, String fname2, String outname_, byte minChrom, byte maxChrom, long maxReads, String sitesfile_){ + this(new RTextInputStream(fname1, fname2, maxReads), outname_, minChrom, maxChrom, maxReads, sitesfile_); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + } + + public GenerateVarlets2(RTextInputStream stream_, String outname_, byte minChrom, byte maxChrom, long maxReads, String sitesfile_){ + sitesfile=sitesfile_; + stream=stream_; + outname=outname_; + assert(outname==null || outname.contains("#")) : "Output file name must contain the character '#' to be used for key number."; + makeKeyMap(); + + cris=(USE_CRIS ? 
new ConcurrentReadInputStream(stream, maxReads) : null); + if(CONDENSE_SNPS){assert(!SPLIT_SUBS);} + } + + public void finish(){ + + ArrayList keys=new ArrayList(); + keys.addAll(keymap.keySet()); + Collections.sort(keys); + for(long k : keys){ + ArrayList vars=keymap.remove(k); + if(!vars.isEmpty()){writeList(vars);} + } + + if(cris!=null){ReadWrite.closeStream(cris);} + else{stream.close();} + + } + + public void process(){ + + Timer t=new Timer(); + t.start(); + + if(sitesfile!=null){ + sitemap=loadSites(sitesfile); + } + + new Thread(cris).start(); + ProcessThread[] threadHandles=new ProcessThread[THREADS]; + for(int i=0; i loadSites(String fname) { + HashMap map=new HashMap(4096); + TextFile tf=new TextFile(fname, false, false); + + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + SiteScoreR[] array=CalcCoverageFromSites.toSites(s); + for(SiteScoreR ssr : array){ + SiteR sr=new SiteR(ssr); + Long key=sr.idPairnum; + + SiteR head=map.get(key); + sr.next=head; + map.put(key, sr); + } + + } + return map; + } + + + private void writeList(ArrayList list){ + assert(list!=null && list.size()>0); + long key=key(list.get(0).chromosome, list.get(0).beginLoc); + String fname=fname(key, outname); + boolean allowSubprocess=false; + OutputStream os=ReadWrite.getOutputStream(fname, true, true, allowSubprocess); + PrintWriter pw=new PrintWriter(os); + + + for(Varlet v : list){ + pw.println(v.toText()); + } + ReadWrite.finishWriting(pw, os, fname, allowSubprocess); + } + + + private final class ProcessThread extends Thread { + + public ProcessThread(){ + } + + @Override + public void run(){ + + final boolean processReads=true; + if(!processReads){System.err.println("Warning: Skipping read processing.");} + + if(cris!=null){ + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? 
ln.list : null); + + while(!terminate && reads!=null && reads.size()>0){ + if(processReads){processReads(reads);} + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + }else{ + ArrayList reads=stream.nextList(); + while(!terminate && reads!=null && reads.size()>0){ + if(processReads){processReads(reads);} + reads=stream.nextList(); + } + } + + finished=true; + synchronized(this){this.notifyAll();} + } + + private void processReads(ArrayList reads){ + + if(sitemap==null){ + for(Read r : reads){ + Read r2=r.mate; + assert(r2==null || r.mate.mate==r); + + if(r2==null){ + processRead(r); + }else{ + if(!TOSS_SOLO1 || r.paired()){processRead(r);} + if(!TOSS_SOLO2 || r2.paired()){processRead(r2);} + } + } + }else{ + for(Read r : reads){ + Read r2=r.mate; + assert(r2==null || r.mate.mate==r); + + if(r2==null){ + multiprocessRead(r); + }else{ + if(!TOSS_SOLO1 || r.paired()){multiprocessRead(r);} + if(!TOSS_SOLO2 || r2.paired()){multiprocessRead(r2);} + } + } + } + } + + @Deprecated + private void multiprocessRead_old(Read r){ + long key=r.numericID; + if((r.pairnum()&1)==1){ + key=-key; + assert(key<0); + } + if(true){throw new RuntimeException("Deprecated.");} + ArrayList alssr=null;//sitemap.get(key); + if(alssr==null){return;} + + + for(SiteScoreR ssr : alssr){ + SiteScore ss=find(ssr, r.sites); + assert(ss!=null) : "\nCan't find ssr "+ssr+" in read\n"+r+"\n"; + + r.clearSite(); + r.setFromSite(ss); + r.match=null; + + r.setPaired(ss.pairedScore>0); + r.setPerfect(ss.perfect); + r.setRescued(ss.rescued); + + processRead(r); + } + } + + private void multiprocessRead(Read r){ + long key=r.numericID; + if((r.pairnum()&1)==1){ + key=-key; + assert(key<0); + } + + + SiteR head=sitemap.get(key); + if(head!=null){readsProcessed++;} + +// assert(head==null) : "\n"+r.pairnum()+", "+key+",\n"+r.list+",\n"+r.mate.list+"\n"+head.toTextRecursive(null)+"\n"; + + while(head!=null){ + 
SiteScore ss=find(head, r.sites); + assert(ss!=null) : "\nCan't find sr "+head+" in read\n"+r+"\n"; + + r.clearSite(); + r.setFromSite(ss); + r.match=null; + + r.setPaired(ss.pairedScore>0); + r.setPerfect(ss.perfect); + r.setRescued(ss.rescued); + + processRead(r); + SiteR old=head; + head=old.next; + old.next=null; //Clears up memory. + } + } + + /** + * @param ssr + * @param list + * @return + */ + private SiteScore find(SiteScoreR ssr, ArrayList list) { + for(SiteScore ss : list){ + if(ssr.equals(ss)){return ss;} + } + return null; + } + + private SiteScore find(SiteR sr, ArrayList list) { + for(SiteScore ss : list){ + if(sr.equals(ss)){return ss;} + } + return null; + } + + + private void processRead(Read r_){ + sitesProcessed++; + + boolean flag=false; + if(false && (/*r_.numericID==30719442 || r_.numericID==107055007 || */ r_.numericID==42829556) /*&& r_.bases.length<=35*/){ + System.err.println("Processing read:"); + System.err.println("\n"+r_.toText(false)); + System.err.println("\n"+r_.strand()); + System.err.println("\n"); + System.err.println(new String(r_.bases)); + System.err.println(r_.match==null ? "null" : new String(r_.match)); + System.err.println("\n"); + tcr.verbose=true; + flag=true; + System.err.println("Mapped Length: "+(r_.stop-r_.start+1)); + } + + +// if(r_.chrom<1 && r_.list!=null && r_.list.size()>0){ +// SiteScore ss=r_.list.get(0); //Should not be necessary +// r_.start=ss.start; +// r_.stop=ss.stop; +// r_.chrom=ss.chrom; +// r_.setStrand(ss.strand); +// } + assert((r_.chrom>=1)==r_.mapped()) : r_.toText(false); + if(!r_.mapped()){//Unmapped. + assert(r_.sites==null || r_.sites.isEmpty()) : r_.toText(false); + return; + } + if(r_.invalid()){return;} //Probably trimmed too short to be used. + + if(r_.match!=null){ + if(r_.perfect()){//Hopefully this will be set correctly... 
+ assert(TranslateColorspaceRead.perfectMatch(r_.match)); + return; + }else if(TranslateColorspaceRead.perfectMatch(r_.match)){ + return; + } + } + + final Read r; + + if(r_.colorspace()){ + r=tcr.translateToBasespace(r_); + if(r==null){ +// System.err.println("Decoder broke from read "+r_.toText(false)); + return; + } + }else{ + r=r_; +// r.errors=r.estimateErrors(); + } + r_=null; + + if(flag){ + System.err.println("r.match = "+(r.match==null ? null : new String(r.match))); + System.err.println("Mapped Length: "+(r.stop-r.start+1)); + } +// if(r.match!=null){ +// for(int i=0; i vars=tcr.toVars(r, CONDENSE, CONDENSE_SNPS, SPLIT_SUBS); + + if(vars==null){return;} + +// if(r.numericID==36858949){ +// System.err.println(r.toText(false)); +// System.err.println(r.copies); +// System.err.println(r.mate.toText(false)); +// System.err.println(r.mate.copies); +// System.err.println(); +// +// for(Varlet v : vars){ +// System.err.println(v.toText()); +// System.err.println(v.numReads); +// } +// assert(false); +// } + + for(Varlet v : vars){ + if(v.endDist>=MIN_END_DIST){ + assert(v.numUniqueReads==1); + assert(v.numSemiUniqueReads==1); + assert(v.numPlusReads1+v.numMinusReads1+v.numPlusReads2+v.numMinusReads2==1); + assert(v.numReads>=1); + // assert(!TranslateColorspaceReadPacBio.COUNT_DUPLICATES_WHEN_MAKING_VARLETS || v.numReads==1); + assert(v.numReads==r.copies); + assert(v.readMapLen==r.mapLength); + assert(v.readLen==r.bases.length); + varsMade++; + if(v.varType==Variation.NOREF){norefsMade++;} + else if(v.varType==Variation.SNP){snpMade++;} + else if(v.varType==Variation.DEL){delMade++;} + else if(v.varType==Variation.INS){insMade++;} + else if(v.varType==Variation.DELINS){ + int a=v.lengthRef(); + int b=v.lengthVar(); + if(a==b){subnMade++;} + else if(a>b){subdMade++;} + else{subiMade++;} + } + deltaLen+=v.lengthDif(); + addVar(v); + } + } +// System.out.println(varsMade+", "+norefsMade); + } + + /** TODO: Synchronize once per read, not once per varlet */ + 
private void addVar(Varlet v){ + long key=key(v.chromosome, v.beginLoc); + ArrayList list=keymap.get(key); + assert(list!=null) : "\nCan't find "+key+" in "+keymap.keySet()+"\n"; + synchronized(list){ + list.add(v); + if(list.size()>=WRITE_BUFFER){ + + if(MERGE_EQUAL_VARLETS){ + mergeEqualVarlets(list); + }else{ + Collections.sort(list); + } + + writeList(list); + list.clear(); + } + } + } + + private void mergeEqualVarlets(ArrayList vars){ + + Collections.sort(vars); + ArrayList list=new ArrayList(8); + for(int i=0; i>(); + for(int chrom=1; chrom<=Data.numChroms; chrom++){ + long[] keys=keys(chrom); + for(long key : keys){ + keymap.put(key, new ArrayList(WRITE_BUFFER)); + ReadWrite.writeString(header, fname(key, outname), false); + } + } + } + + private HashMap> keymap; + + public final String outname; + public final String sitesfile; +// private HashMap> sitemap=null; + private HashMap sitemap=null; + private final RTextInputStream stream; + private final ConcurrentReadInputStream cris; + + + public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads. + + public static int THREADS=7; + public static int WRITE_BUFFER=200000; //Bigger number uses more memory, for less frequent writes. 
+ + public static boolean CONDENSE=true; + public static boolean CONDENSE_SNPS=true; + public static boolean SPLIT_SUBS=false; + + public static boolean TOSS_SOLO1=false; + public static boolean TOSS_SOLO2=false; + + public static boolean MERGE_EQUAL_VARLETS=false; + public static boolean PAC_BIO_MODE=true; + public static int ALIGN_ROWS=2020; + public static int ALIGN_COLUMNS=3000; + + public static long MAX_READS=-1; + public static final int MIN_END_DIST=4; + public static int BLOCKSIZE=1000000; + +} diff --git a/current/var/GenerateVarlets3.java b/current/var/GenerateVarlets3.java new file mode 100755 index 0000000..48f796c --- /dev/null +++ b/current/var/GenerateVarlets3.java @@ -0,0 +1,895 @@ +package var; + +import java.io.OutputStream; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; + +import pacbio.SiteR; + +import stream.ConcurrentReadInputStream; +import stream.RTextInputStream; +import stream.Read; +import stream.SiteScore; +import stream.SiteScoreR; + +import dna.CoverageArray; +import dna.Data; +import dna.Timer; + +import fileIO.ReadWrite; +import fileIO.TextFile; + +import align2.ListNum; +import align2.MultiStateAligner9PacBio; +import align2.MultiStateAligner9ts; +import align2.Tools; +import align2.TranslateColorspaceRead; + +/** Splits output files across blocks for low memory usage. + * Uses id-sorted site list for even lower memory usage. */ +public class GenerateVarlets3 { + + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + + Data.GENOME_BUILD=-1; + + String reads1=args[0]; + String reads2=args[1].equalsIgnoreCase("null") ? 
null : args[1]; + String outname=args[2]; + String pcovFile=null; + String covFile=null; +// assert(outname.contains("#")); + + String sitesfile=null; + + int minChrom=-1; + int maxChrom=-1; + + int distFromDefined=-1; + ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; + + for(int i=3; i1 ? split[1] : null; + if("null".equalsIgnoreCase(b)){b=null;} + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equals("condense")){ + CONDENSE=Tools.parseBoolean(b); + }else if(a.equals("condensesnps")){ + CONDENSE_SNPS=Tools.parseBoolean(b); + }else if(a.startsWith("splitsubs")){ + SPLIT_SUBS=Tools.parseBoolean(b); + }else if(a.startsWith("illumina")){ + PAC_BIO_MODE=!Tools.parseBoolean(b); + }else if(a.startsWith("pacbio")){ + PAC_BIO_MODE=Tools.parseBoolean(b); + }else if(a.equals("tosssolo1")){ + TOSS_SOLO1=Tools.parseBoolean(b); + }else if(a.equals("tosssolo2")){ + TOSS_SOLO2=Tools.parseBoolean(b); + }else if(a.startsWith("minchrom")){ + minChrom=Integer.parseInt(b); + }else if(a.startsWith("maxchrom")){ + maxChrom=Integer.parseInt(b); + }else if(a.startsWith("build") || a.startsWith("genomebuild") || a.startsWith("genome")){ + Data.setGenome(Integer.parseInt(b)); + System.out.println("Set GENOME_BUILD to "+Data.GENOME_BUILD); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=(Integer.parseInt(b)); + }else if(a.startsWith("buffer") || a.startsWith("writebuffer")){ + WRITE_BUFFER=(Integer.parseInt(b)); + }else if(a.startsWith("maxreads")){ + MAX_READS=(Long.parseLong(b)); + }else if(a.startsWith("minenddist")){ + MIN_END_DIST=Integer.parseInt(b); + }else if(a.startsWith("alignrow")){ + ALIGN_ROWS=Integer.parseInt(b); + }else if(a.startsWith("aligncol")){ + ALIGN_COLUMNS=Integer.parseInt(b); + }else if(a.startsWith("pcovtipdist")){ + PCOV_TIP_DIST=Integer.parseInt(b); + }else if(a.equals("blocksize")){ + BLOCKSIZE=(Integer.parseInt(b)); + }else if(a.equals("norefcap") || 
a.equals("distfromdefined") || a.equals("maxdistfromdefined")){ + distFromDefined=(Integer.parseInt(b)); + }else if(a.startsWith("sites") || a.startsWith("sitesfile")){ + sitesfile=(b==null || b.equalsIgnoreCase("null") ? null : b); + }else if(a.startsWith("pcov") || a.startsWith("perfectcov")){ + pcovFile=(b==null || b.equalsIgnoreCase("null") ? null : b); + }else if(a.equals("cov") || a.startsWith("coverage")){ + covFile=(b==null || b.equalsIgnoreCase("null") ? null : b); + }else if(a.equals("usegzip") || a.equals("gzip")){ + ReadWrite.USE_GZIP=Tools.parseBoolean(b); + }else if(a.equals("usepigz") || a.equals("pigz")){ + if(b!=null && Character.isDigit(b.charAt(0))){ + int zt=Integer.parseInt(b); + if(zt<1){ReadWrite.USE_PIGZ=false;} + else{ + ReadWrite.USE_PIGZ=true; + if(zt>1){ + ReadWrite.MAX_ZIP_THREADS=zt; + ReadWrite.ZIP_THREAD_DIVISOR=1; + } + } + }else{ReadWrite.USE_PIGZ=Tools.parseBoolean(b);} + }else if(a.equals("usegunzip") || a.equals("gunzip")){ + ReadWrite.USE_GUNZIP=Tools.parseBoolean(b); + }else if(a.equals("useunpigz") || a.equals("unpigz")){ + ReadWrite.USE_UNPIGZ=Tools.parseBoolean(b); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + if(Data.GENOME_BUILD<0){throw new RuntimeException("Please set genome number.");} + if(minChrom<0){minChrom=1;} + if(maxChrom<0){maxChrom=Data.numChroms;} + + assert(minChrom<=maxChrom && minChrom>=0); + + if(ReadWrite.ZIPLEVEL<2){ReadWrite.ZIPLEVEL=2;} + GenerateVarlets3 gv=new GenerateVarlets3(reads1, reads2, outname, MAX_READS, sitesfile, pcovFile, distFromDefined); + gv.process(); + } + + public GenerateVarlets3(String fname1, String fname2, String outname_, long maxReads, String sitesfile_, String pcovFile, int distFromDefined_){ + this(new RTextInputStream(fname1, fname2, maxReads), outname_, maxReads, sitesfile_, pcovFile, distFromDefined_); + assert(fname2==null || !fname1.equals(fname2)) : "Error - input files have same name."; + } + + public 
GenerateVarlets3(RTextInputStream stream_, String outname_, long maxReads, String sitesfile_, String pcovFile, int distFromDefined_){ + sitesfile=sitesfile_; + sitesTextFile=new TextFile(sitesfile, false, false); + stream=stream_; + outname=outname_; + assert(outname==null || outname.contains("#")) : "Output file name must contain the character '#' to be used for key number."; + makeKeyMap(); + + cris=(USE_CRIS ? new ConcurrentReadInputStream(stream, maxReads) : null); + if(CONDENSE_SNPS){assert(!SPLIT_SUBS);} + + maxDistFromDefined=distFromDefined_; + + if(maxDistFromDefined>0){ + //Unfortunately, this serializes the chromosome loading. + nearestDefinedBase=new char[Data.numChroms+1][]; + for(int i=1; i<=Data.numChroms; i++){ + nearestDefinedBase[i]=Data.getChromosome(i).nearestDefinedBase(); + } + }else{ + nearestDefinedBase=null; + } + + if(pcovFile!=null){ + assert(pcovFile.contains("#") || Data.numChroms<2); + pcov=new CoverageArray[Data.numChroms+1]; + for(int i=1; i<=Data.numChroms; i++){ + String fname=pcovFile.replaceFirst("#", ""+i); + pcov[i]=ReadWrite.read(CoverageArray.class, fname); + } + }else{ + pcov=null; + } + + } + + public void finish(){ + + ArrayList keys=new ArrayList(); + keys.addAll(keymap.keySet()); + Collections.sort(keys); + for(long k : keys){ + ArrayList vars=keymap.remove(k); + if(!vars.isEmpty()){writeList(vars);} + } + + if(cris!=null){ReadWrite.closeStream(cris);} + else{stream.close();} + + } + + public void process(){ + + Timer t=new Timer(); + t.start(); + + if(sitesfile==null){ + sitemap=null; + } + + new Thread(cris).start(); + ProcessThread[] threadHandles=new ProcessThread[THREADS]; + for(int i=0; i=maxSiteRead && tf.isOpen()){ +// System.out.print(" ... 
"); +// System.out.println("Looking for ") + String s; + for(s=tf.nextLine(); s!=null; s=tf.nextLine()){ +// SiteScoreR[] array=CalcCoverageFromSites.toSites(s); +// SiteR head=new SiteR(array[0]); +// sitemap.put(head.idPairnum, head); +// for(int i=1; i=maxFound); + maxFound=id; + } + if(maxFound>maxID){break;} + } + maxSiteRead=Tools.max(maxFound, maxSiteRead); + if(s==null){ + tf.close(); +// System.out.println(" closing file at maxFound="+maxFound+", maxRead="+maxSiteRead+", lines="+linesLoaded); + maxSiteRead=Long.MAX_VALUE; + } + } +// System.out.println(" maxFound="+maxFound+", maxRead="+maxSiteRead+", lines="+linesLoaded); + if(maxSiteRead<=maxID){assert(!tf.isOpen());} + maxSiteTableSize=Tools.max(maxSiteTableSize, sitemap.size()); + + } + + return maxSiteRead; + } + + public SiteR toImperfectSites(String s, boolean retainSemiperfect){ + SiteR head=null; + SiteR prev=null; + String[] split=s.split("\t"); + + + sitesLoaded+=split.length; + linesLoaded++; + + for(int i=0; i=MIN_PCOV_DEPTH_TO_TOSS; + } + if(toss){retain=false;} +// for(int j=ssr.start-PCOV_TIP_DIST; retain && j<=ssr.stop+PCOV_TIP_DIST; j++){ +// retain=ca.get(j)0){ + SiteR[] a2=new SiteR[array.length]; + for(int i=0; i0){a2[i-1].next=a2[i];} + } + return a2[0]; + } + return null; + } + + private void writeList(ArrayList list){ + assert(list!=null && list.size()>0); + long key=key(list.get(0).chromosome, list.get(0).beginLoc); + String fname=fname(key, outname); + boolean allowSubprocess=false; + OutputStream os=ReadWrite.getOutputStream(fname, true, true, allowSubprocess); + PrintWriter pw=new PrintWriter(os); + + + for(Varlet v : list){ + pw.println(v.toText()); + } + ReadWrite.finishWriting(pw, os, fname, allowSubprocess); + } + + + private final class ProcessThread extends Thread { + + public ProcessThread(){ + } + + private void fixReadSites(ArrayList reads){ + assert(sitemap!=null); + if(reads==null || reads.size()==0){return;} + long max=-2; + for(Read r : reads){ + max=Tools.max(max, 
r.numericID); + } + synchronized(sitemap){ + if(max>=maxSiteRead){ //This batch reaches past the ids loaded so far; read more of the sites file first. + readSites(sitesTextFile, max); + } + for(Read r : reads){ + { + long key=r.numericID; + if((r.pairnum()&1)==1){ + key=-key; + assert(key<0); + } + SiteR head=sitemap.get(key); + + ArrayList old=r.sites; + r.sites=null; + if(head!=null){ + r.sites=new ArrayList(); + sitemap.remove(key); + while(head!=null){ + SiteScore ss=find(head, old); //Note - I can accelerate this by sorting SiteR and r.list by the same metric, e.g. position. + assert(ss!=null) : "\nCan't find sr "+head+" in read\n"+r+"\nlist:\n"+old; + r.sites.add(ss); + head=head.next; + } + } + } + + Read r2=r.mate; + if(r2!=null){ + long key=r2.numericID; + if((r2.pairnum()&1)==1){ + key=-key; + assert(key<0); + } + SiteR head=sitemap.get(key); + + ArrayList old=r2.sites; + r2.sites=null; + if(head!=null){ + r2.sites=new ArrayList(); + sitemap.remove(key); + while(head!=null){ + SiteScore ss=find(head, old); //Note - I can accelerate this by sorting SiteR and r2.list by the same metric, e.g. position. + assert(ss!=null) : "\nCan't find sr "+head+" in read\n"+r2+"\nlist:\n"+old; + r2.sites.add(ss); + head=head.next; //Advance along the linked SiteR chain, as in the r branch above; without this the loop cannot terminate. + } + } + } + + } + } + } + + @Override + public void run(){ + + final boolean processReads=true; + if(!processReads){System.err.println("Warning: Skipping read processing.");} + + if(cris!=null){ + ListNum ln=cris.nextList(); + ArrayList reads=(ln!=null ? ln.list : null); + + while(!terminate && reads!=null && reads.size()>0){ + if(processReads){processReads(reads);} + cris.returnList(ln, ln.list.isEmpty()); + ln=cris.nextList(); + reads=(ln!=null ? 
ln.list : null); + } + cris.returnList(ln, ln.list.isEmpty()); + }else{ + ArrayList reads=stream.nextList(); + while(!terminate && reads!=null && reads.size()>0){ + if(processReads){processReads(reads);} + reads=stream.nextList(); + } + } + + finished=true; + synchronized(this){this.notifyAll();} + } + + private void processReads(ArrayList reads){ + if(sitemap==null){ + for(Read r : reads){ + Read r2=r.mate; + assert(r2==null || r.mate.mate==r); + + if(r2==null){ + processRead(r); + }else{ + if(!TOSS_SOLO1 || r.paired()){processRead(r);} + if(!TOSS_SOLO2 || r2.paired()){processRead(r2);} + } + } + }else{ + fixReadSites(reads); + + for(Read r : reads){ + Read r2=r.mate; + assert(r2==null || r.mate.mate==r); + + if(r2==null){ + multiprocessRead(r); + }else{ + if(!TOSS_SOLO1 || r.paired()){multiprocessRead(r);} + if(!TOSS_SOLO2 || r2.paired()){multiprocessRead(r2);} + } + } + } + } + + private void multiprocessRead(Read r){ + +// assert(head==null) : "\n"+r.pairnum()+", "+key+",\n"+r.list+",\n"+r.mate.list+"\n"+head.toTextRecursive(null)+"\n"; + + if(r.numSites()==0){return;} + + readsProcessed++; + for(SiteScore ss : r.sites){ + r.clearSite(); + r.setFromSite(ss); + r.match=null; + + r.setPaired(ss.pairedScore>0); + r.setPerfect(ss.perfect); + r.setRescued(ss.rescued); + + processRead(r); + } + } + + /** + * @param ssr + * @param list + * @return + */ + private SiteScore find(SiteScoreR ssr, ArrayList list) { + for(SiteScore ss : list){ + if(ssr.equals(ss)){return ss;} + } + return null; + } + + private SiteScore find(SiteR sr, ArrayList list) { + for(SiteScore ss : list){ + if(sr.equals(ss)){return ss;} + } + return null; + } + + + private void processRead(Read r_){ + sitesProcessed++; + + assert(r_.numericID0){ +// SiteScore ss=r_.list.get(0); //Should not be necessary +// r_.start=ss.start; +// r_.stop=ss.stop; +// r_.chrom=ss.chrom; +// r_.setStrand(ss.strand); +// } + assert((r_.chrom>=1)==r_.mapped()) : r_.toText(false); + if(!r_.mapped()){//Unmapped. 
+ assert(r_.sites==null || r_.sites.isEmpty()) : r_.toText(false); + return; + } + if(r_.invalid()){return;} //Probably trimmed too short to be used. + + if(r_.match!=null){ + if(r_.perfect()){//Hopefully this will be set correctly... + assert(TranslateColorspaceRead.perfectMatch(r_.match)); + return; + }else if(TranslateColorspaceRead.perfectMatch(r_.match)){ + return; + } + } + + final Read r; + + if(r_.colorspace()){ + r=tcr.translateToBasespace(r_); + if(r==null){ +// System.err.println("Decoder broke from read "+r_.toText(false)); + return; + } + }else{ + r=r_; +// r.errors=r.estimateErrors(); + } + r_=null; + assert(r.numericID vars=tcr.toVars(r, CONDENSE, CONDENSE_SNPS, SPLIT_SUBS); + + if(vars==null){return;} + +// if(r.numericID==36858949){ +// System.err.println(r.toText(false)); +// System.err.println(r.copies); +// System.err.println(r.mate.toText(false)); +// System.err.println(r.mate.copies); +// System.err.println(); +// +// for(Varlet v : vars){ +// System.err.println(v.toText()); +// System.err.println(v.numReads); +// } +// assert(false); +// } + + char[] nearest=(nearestDefinedBase == null ? null : nearestDefinedBase[r.chrom]); + CoverageArray ca=(pcov==null ? null : pcov[r.chrom]); + + for(Varlet v : vars){ + if(v.endDist>=MIN_END_DIST){ + assert(v.numUniqueReads==1); + assert(v.numSemiUniqueReads==1); + assert(v.numPlusReads1+v.numMinusReads1+v.numPlusReads2+v.numMinusReads2==1); + assert(v.numReads>=1); + // assert(!TranslateColorspaceReadPacBio.COUNT_DUPLICATES_WHEN_MAKING_VARLETS || v.numReads==1); + assert(v.numReads==r.copies); + assert(v.readMapLen==r.mapLength); + assert(v.readLen==r.bases.length); + + boolean retain=true; + if(maxDistFromDefined>=0 && v.varType==Variation.NOREF){ + char dist=(maxDistFromDefined==0 ? 
1 : Tools.min(nearest[v.beginLoc], nearest[v.endLoc])); + if(dist>maxDistFromDefined){retain=false;} + } + + if(retain && v.varType!=Variation.NOREF && ca!=null){ + boolean toss=true; + assert(PCOV_TIP_DIST>0); + for(int j=v.beginLoc-PCOV_TIP_DIST; toss && j<=v.endLoc+PCOV_TIP_DIST; j++){ + toss=ca.get(j)>=MIN_PCOV_DEPTH_TO_TOSS; + } + if(toss){retain=false;} + } + + if(retain){ + varsMade++; + if(v.varType==Variation.NOREF){norefsMade++;} + else if(v.varType==Variation.SNP){snpMade++;} + else if(v.varType==Variation.DEL){delMade++;} + else if(v.varType==Variation.INS){insMade++;} + else if(v.varType==Variation.DELINS){ + int a=v.lengthRef(); + int b=v.lengthVar(); + if(a==b){subnMade++;} + else if(a>b){subdMade++;} + else{subiMade++;} + } + deltaLen+=v.lengthDif(); + addVar(v); + } + + } + } +// System.out.println(varsMade+", "+norefsMade); + } + + /** TODO: Synchronize once per read, not once per varlet */ + private void addVar(Varlet v){ + long key=key(v.chromosome, v.beginLoc); + ArrayList list=keymap.get(key); + assert(list!=null) : "\nCan't find "+key+" in "+keymap.keySet()+"\n"; + synchronized(list){ + list.add(v); + if(list.size()>=WRITE_BUFFER){ + + if(MERGE_EQUAL_VARLETS){ + mergeEqualVarlets(list); + }else{ + Collections.sort(list); + } + + writeList(list); + list.clear(); + } + } + } + + private void mergeEqualVarlets(ArrayList vars){ + + Collections.sort(vars); + ArrayList list=new ArrayList(8); + for(int i=0; i>(); + for(int chrom=1; chrom<=Data.numChroms; chrom++){ + long[] keys=keys(chrom); + for(long key : keys){ + keymap.put(key, new ArrayList(WRITE_BUFFER)); + ReadWrite.writeString(header, fname(key, outname), false); + } + } + } + + private HashMap> keymap; + private final char[][] nearestDefinedBase; + private final int maxDistFromDefined; + + private final CoverageArray[] pcov; + + public final String outname; + public final String sitesfile; + private TextFile sitesTextFile; + private static long maxSiteRead=-1; + private static long 
maxSiteTableSize=-1; + + private static long sitesLoaded=0; + private static long sitesRetained=0; + private static long linesLoaded=0; + private static long linesRetained=0; + + private HashMap sitemap=new HashMap(4096); + private final RTextInputStream stream; + private final ConcurrentReadInputStream cris; + + public static boolean USE_CRIS=true; //Similar speed either way. "true" may be better with many threads. + + public static int THREADS=Data.LOGICAL_PROCESSORS; + public static int WRITE_BUFFER=16000; //Bigger number uses more memory, for less frequent writes. + + public static boolean CONDENSE=true; + public static boolean CONDENSE_SNPS=true; + public static boolean SPLIT_SUBS=false; + + public static boolean TOSS_SOLO1=false; + public static boolean TOSS_SOLO2=false; + + public static boolean MERGE_EQUAL_VARLETS=false; + public static boolean PAC_BIO_MODE=true; + public static int ALIGN_ROWS=2020; + public static int ALIGN_COLUMNS=3000; + + public static long MAX_READS=-1; + public static int MIN_END_DIST=4; + public static int BLOCKSIZE=1000000; + /** Imperfect reads fully covered by perfect reads to this depth or more will be tossed. */ + public static int MIN_PCOV_DEPTH_TO_TOSS=2; + /** Extend perfect coverage depth requirement by this much of the tips of variations and reads before tossing them. + * A higher number means more varlets will be retained. 
*/ + public static int PCOV_TIP_DIST=8; + +} diff --git a/current/var/StackVariations.java b/current/var/StackVariations.java new file mode 100755 index 0000000..430b94d --- /dev/null +++ b/current/var/StackVariations.java @@ -0,0 +1,741 @@ +package var; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; + + +import dna.Data; +import dna.Gene; +import dna.Timer; +import fileIO.TextStreamWriter; + +import align2.Tools; + +public class StackVariations { + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + + Timer t=new Timer(); + t.start(); + + String inPattern=args[0]; + String outPattern=args[1]; + + assert(!inPattern.equalsIgnoreCase(outPattern)); + + int minChrom=-1; + int maxChrom=-1; + + boolean filter=false; + + for(String arg : args){ + final String s=arg.toLowerCase(); + String[] split=s.split("="); + String a=split[0]; + String b=(split.length>1 ? 
split[1] : null); + + if(a.equalsIgnoreCase("filter")){filter=true;} + else if(a.startsWith("filter")){ + if(b.equals("1") || b.startsWith("t")){filter=true;} + else if(b.equals("0") || b.startsWith("f")){filter=false;} + else{throw new RuntimeException("Unknown parameter "+arg);} + }else if(a.equalsIgnoreCase("strict")){ + if(b==null){STRICT=true;} + else if(b.equals("1") || b.startsWith("t")){STRICT=true;} + else if(b.equals("0") || b.startsWith("f")){STRICT=false;} + else{throw new RuntimeException("Unknown parameter "+arg);} + }else if(a.equals("genome") || a.equals("build")){ + Data.setGenome(Integer.parseInt(b)); + if(minChrom==-1){minChrom=1;} + if(maxChrom==-1){maxChrom=Data.numChroms;} + }else if(a.equals("minchrom")){ + minChrom=Integer.parseInt(b); + }else if(a.equals("maxchrom")){ + maxChrom=Integer.parseInt(b); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=Integer.parseInt(b); + }else{ +// System.err.println("************* "+s); + } + } + + assert(minChrom>=0 && maxChrom>=minChrom) : "Please set minchrom and maxchrom."; + +// for(byte i=minChrom; i<=maxChrom; i++){ +// String fname1=inPattern.replace("#", i+""); +// String fname2=outPattern.replace("#", i+""); +// assert(new File(fname1).exists()); +// assert(!new File(fname2).exists()); +// processFile(fname1, fname2, filter); +// } + + runThreaded(inPattern, outPattern, minChrom, maxChrom, filter); + + t.stop(); + System.out.println("Input Vars: \t"+(totalIn_global-totalInNR_global)); + System.out.println("Input No-ref: \t"+totalInNR_global); + System.out.println("Input Delta Length:\t"+deltaLenIn_global); + System.out.println(); + System.out.println("Kept Vars: \t"+(totalKept_global-totalKeptNR_global)); + System.out.println("Kept No-ref: \t"+totalKeptNR_global); + System.out.println("Kept Snp: \t"+snpKept_global); + System.out.println("Kept Del: \t"+delKept_global+"\t\tLength: \t"+delLenKept_global); + System.out.println("Kept Ins: \t"+insKept_global+"\t\tLength: 
\t"+insLenKept_global); + System.out.println("Kept Sub: \t"+subKept_global+"\t\tLength: \t"+subLenKept_global); + System.out.println("Kept Delta Length: \t"+deltaLenKept_global); + System.out.println("Kept Avg Score: \t"+(scoreKept_global/(Tools.max(1, totalKept_global)))); + System.out.println(); + System.out.println("Dropped Vars: \t"+(totalDropped_global-totalDroppedNR_global)); + System.out.println("Dropped No-ref: \t"+totalDroppedNR_global); + System.out.println("Dropped Avg Score: \t"+(scoreDropped_global/Tools.max(1, totalDropped_global))); + System.out.println(); + System.out.println("Time: \t"+t); + } + + public static final void runThreaded(String inPattern, String outPattern, int minChrom, int maxChrom, boolean filter){ + ArrayList svts=new ArrayList(); + for(int i=minChrom; i<=maxChrom; i++){ + String fname1=inPattern.replace("#", i+""); + String fname2=outPattern.replace("#", i+""); + assert(!fname1.equalsIgnoreCase(fname2)); + assert(new File(fname1).exists()); +// assert(!new File(fname2).exists()); + addThread(1); + SVThread svt=new SVThread(fname1, fname2, filter); + svts.add(svt); + new Thread(svt).start(); + } + while(addThread(0)>0){} + for(SVThread svt : svts){ + + snpKept_global+=svt.snpKept; + delKept_global+=svt.delKept; + insKept_global+=svt.insKept; + subKept_global+=svt.subKept; + delLenKept_global+=svt.delLenKept; + insLenKept_global+=svt.insLenKept; + subLenKept_global+=svt.subLenKept; + deltaLenKept_global+=svt.deltaLenKept; + + deltaLenIn_global+=svt.deltaLenIn; + totalIn_global+=svt.totalIn; + totalInNR_global+=svt.totalInNR; + totalKept_global+=svt.totalKept; + totalDropped_global+=svt.totalDropped; + totalKeptNR_global+=svt.totalKeptNR; + totalDroppedNR_global+=svt.totalDroppedNR; + scoreKept_global+=svt.scoreKept; + scoreDropped_global+=svt.scoreDropped; + } + } + + + public static boolean passesFilterSNP(Varlet v){ + + + //Best so far: + + if(STRICT){ + + if(v.endDist<3){return false;} + if(v.tailDist<10){return false;} + + 
//NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required. + if(v.minStrandReads()>=2){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.5f){return false;} +// if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<18){return false;} + if(v.avgReadQuality()<13){return false;} + if(v.maxVarQuality()<26){return false;} + if(v.avgVarQuality()<18){return false;} + if(v.numReads<4){return false;} + if(v.numSemiUniqueReads<4){return false;} + if(v.numUniqueReads<2){return false;} + if(v.paired<3){return false;} + + }else if(v.minStrandReads()>=1){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.2f){return false;} +// if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<19){return false;} + if(v.avgReadQuality()<14){return false;} + if(v.maxVarQuality()<28){return false;} + if(v.avgVarQuality()<19){return false;} + if(v.numReads<3){return false;} + if(v.numSemiUniqueReads<3){return false;} + if(v.numUniqueReads<2){return false;} + if(v.paired<3){return false;} + + }else{ + if(v.endDist<8){return false;} + if(v.tailDist<14){return false;} + + if(v.errors>0){return false;} + if(v.expectedErrors>0.5f){return false;} +// if(v.expectedErrors-v.errors>2f){return false;} + if(v.maxReadQuality()<21){return false;} + if(v.avgReadQuality()<17){return false;} + if(v.maxVarQuality()<30){return false;} + if(v.avgVarQuality()<21){return false;} + if(v.numReads<6){return false;} + if(v.numSemiUniqueReads<5){return false;} + if(v.numUniqueReads<3){return false;} + if(v.paired<5){return false;} + if(v.score()<8100){return false;} + } + +// else{ +// if(v.endDist<8){return false;} +// if(v.tailDist<14){return false;} +// +// if(v.errors>0){return false;} +// if(v.expectedErrors>0.5f){return false;} +//// if(v.expectedErrors-v.errors>2f){return false;} +// if(v.maxReadQuality()<21){return false;} +// if(v.avgReadQuality()<17){return false;} +// if(v.maxVarQuality()<30){return false;} +// 
if(v.avgVarQuality()<21){return false;} +// if(v.numReads<5){return false;} +// if(v.numSemiUniqueReads<4){return false;} +// if(v.numUniqueReads<2){return false;} +// if(v.paired<4){return false;} +// if(v.score()<8100){return false;} +// } + + }else{ + + assert(false) : "disabled"; + + } + + + + return true; + } + + public static boolean passesFilterOther(Varlet v){ + + + + if(v.endDist<3){return false;} + if(v.tailDist<10){return false;} + + //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required. + if(v.minStrandReads()>=2){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.5f){return false;} +// if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<16){return false;} + if(v.avgReadQuality()<12){return false;} + if(v.maxVarQuality()<26){return false;} + if(v.avgVarQuality()<16){return false;} + if(v.numReads<4){return false;} + if(v.numSemiUniqueReads<4){return false;} + if(v.numUniqueReads<2){return false;} + if(v.paired<3){return false;} + + }else if(v.minStrandReads()>=1){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.2f){return false;} +// if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<17){return false;} + if(v.avgReadQuality()<13){return false;} + if(v.maxVarQuality()<28){return false;} + if(v.avgVarQuality()<17){return false;} + if(v.numReads<4){return false;} + if(v.numSemiUniqueReads<4){return false;} + if(v.numUniqueReads<2){return false;} + if(v.paired<3){return false;} + + }else{ + if(v.endDist<8){return false;} + if(v.tailDist<14){return false;} + + if(v.errors>0){return false;} + if(v.expectedErrors>0.5f){return false;} +// if(v.expectedErrors-v.errors>2f){return false;} + if(v.maxReadQuality()<20){return false;} + if(v.avgReadQuality()<16){return false;} + if(v.maxVarQuality()<30){return false;} + if(v.avgVarQuality()<20){return false;} + if(v.numReads<6){return false;} + if(v.numSemiUniqueReads<5){return false;} + 
if(v.numUniqueReads<3){return false;} + if(v.paired<5){return false;} + if(v.score()<6500){return false;} + } + + + + + + return true; + } + + + public static ArrayList mergeAll(ArrayList vars){ + if(vars==null || vars.size()==0){return null;} + ArrayList out=new ArrayList(8+vars.size()/16); + Collections.sort(vars); + + ArrayList temp=new ArrayList(64); + for(int i=0; iMIN_READS_TO_KEEP){ + out.add(result); + }else if(result.numReads==MIN_READS_TO_KEEP){ + if(result.maxVarQuality()>=MIN_QUALITY_AT_MIN_READS && + result.errors<=MAX_ERRORS_AT_MIN_READS && + result.expectedErrors<=MAX_EXPECTED_ERRORS_AT_MIN_READS && + (result.paired>0 || !REQUIRE_PAIRED_AT_MIN_READS)){ + out.add(result); + } + } + temp.clear(); + temp.add(v); + } + } + + + } + + if(!temp.isEmpty()){ + if(temp.size()>=MIN_READS_TO_KEEP){ + Varlet result=mergeEqualVarlets(temp); + out.add(result); + } + temp.clear(); + } + + {//For testing + Collections.sort(out); //Should already be sorted... + for(int i=1; i> plus=new HashMap>(Tools.min(8, vars.size())); + HashMap> minus=new HashMap>(Tools.min(8, vars.size())); + + int numReads=0; + int numSemiUniqueReads=0; + int numUniqueReads=0; + int pairedReads=0; + int plusReads1=0; + int minusReads1=0; + int plusReads2=0; + int minusReads2=0; + + int totalQuality=0; + int totalVarQuality=0; + + int maxReadQuality=0; + int maxVarQuality=0; + + int maxMapScore=0; + int bestLen=0; + int bestMapLen=0; + int minReadStart=Integer.MAX_VALUE; + int maxReadStop=-999999; + + int maxHeadDist=-1; + int maxTailDist=-1; + int maxEndDist=-1; + + Varlet bestVar=null; + + int minErrors=999; + float minExpectedErrors=999f; + + for(Varlet v : vars){ + + numReads+=v.numReads; + numSemiUniqueReads+=v.numSemiUniqueReads; + plusReads1+=v.numPlusReads1; + minusReads1+=v.numMinusReads1; + plusReads2+=v.numPlusReads2; + minusReads2+=v.numMinusReads2; + + if(v.errorsmaxReadQuality)){ + bestVar=v; + } + + totalQuality+=v.avgReadQuality()*v.numReads; + 
maxReadQuality=Tools.max(maxReadQuality, v.maxReadQuality()); + + totalVarQuality+=v.avgVarQuality()*v.numReads; + maxVarQuality=Tools.max(maxVarQuality, v.maxVarQuality()); + + if(bestLen==0 || (v.mapScore>=maxMapScore && v.readLen>=bestLen)){ + bestLen=v.readLen; + bestMapLen=v.readMapLen; + } + + maxHeadDist=Tools.max(maxHeadDist, v.headDist); + maxTailDist=Tools.max(maxTailDist, v.tailDist); + maxEndDist=Tools.max(maxEndDist, v.endDist); + + minErrors=Tools.min(minErrors, v.errors); + minExpectedErrors=Tools.min(minExpectedErrors, v.expectedErrors); + maxMapScore=Tools.max(maxMapScore, v.mapScore); + minReadStart=Tools.min(minReadStart, v.readStart); + maxReadStop=Tools.max(maxReadStop, v.readStop); + assert(minReadStart value=plus.get(v.readStart); + if(value==null){ + numUniqueReads++; + value=new ArrayList(2); + plus.put(v.readStart, value); + } + value.add(v); + }else{ + ArrayList value=minus.get(v.readStop); + if(value==null){ + numUniqueReads++; + value=new ArrayList(2); + minus.put(v.readStop, value); + } + value.add(v); + } + } + +// byte plusReads=(byte) ((plus.isEmpty() ? 0 : 1)+(minus.isEmpty() ? 0 : 1)); + + float avgVarQuality=totalVarQuality/(float)numReads; + float avgReadQuality=totalQuality/(float)numReads; + + int netQuality=(int)Math.ceil((avgVarQuality+maxVarQuality)/2); + int netReadQuality=(int)Math.ceil((avgReadQuality+maxReadQuality)/2); + + Varlet v=new Varlet(bestVar.chromosome, ((plusReads1+plusReads2>0) && (minusReads1+minusReads2>0) ? 
Gene.PLUS : bestVar.strand), + bestVar.beginLoc, bestVar.endLoc, bestVar.matchStart, bestVar.matchStop, bestVar.varType, bestVar.ref, bestVar.call, + netQuality, netReadQuality, maxMapScore, minErrors, minExpectedErrors, pairedReads, bestVar.readID, bestLen, bestMapLen, + minReadStart, maxReadStop, numReads, maxHeadDist, maxTailDist, maxEndDist, bestVar.pairNum()); + + + v.setMaxReadQuality(maxReadQuality); + v.setMaxVarQuality(maxVarQuality); + v.setAvgReadQuality((int)Math.ceil(avgReadQuality)); + v.setAvgVarQuality((int)Math.ceil(avgVarQuality)); + + v.numSemiUniqueReads=(short)numSemiUniqueReads; + v.numUniqueReads=(short)numUniqueReads; + v.numPlusReads1=(short)plusReads1; + v.numMinusReads1=(short)minusReads1; + v.numPlusReads2=(short)plusReads2; + v.numMinusReads2=(short)minusReads2; + assert(plusReads1+minusReads1+plusReads2+minusReads2==numSemiUniqueReads); + + assert(v.numReads>=v.numSemiUniqueReads); + assert(v.numSemiUniqueReads>=v.numUniqueReads); + + //This assertion is only correct if stacking is done from raw, uncombined varlets. 
+ assert(v.numSemiUniqueReads==vars.size()) : "\n"+vars.size()+", "+v.numReads+", "+v.numSemiUniqueReads+", "+v.numUniqueReads + +"\n"+v.toText(); + + assert(v.numUniqueReads<=v.numReads && v.numUniqueReads>0); + assert(v.numUniqueReads==plus.size()+minus.size()) : "numUniqueReads="+numUniqueReads+ + ", v.numUniqueReads="+v.numUniqueReads+", v.numReads="+v.numReads + +", plus.size()="+plus.size()+", minus.size()="+minus.size()+"\n"+vars+"\n"; + + return v; + } + + + private static class SVThread implements Runnable { + + public SVThread(String fname1_, String fname2_, boolean filter_){ + fname1=fname1_; + fname2=fname2_; + filter=filter_; + } + + @Override + public void run() { +// addThread(1); + assert(activeThreads>0); + processFile(fname1, fname2, filter); + addThread(-1); + } + + private final ArrayList processFile(String inName, String outName, boolean filter){ + + ArrayList initial=Varlet.fromTextFile(inName); + + for(Varlet v : initial){ + if(v.varType==Variation.NOREF){totalInNR++;} + totalIn++; + } + + if(verbose){System.err.println("Initial: \t"+initial.size());} + ArrayList merged=mergeAll(initial); + initial=null; + if(verbose){System.err.println("Merged: \t"+merged.size());} + ArrayList out; + if(!filter){ +// System.out.println("Not filtering."); + out=merged; + for(Varlet v : out){ + if(v!=null){ + totalKept++; + scoreKept+=v.score(); + } + } + }else{ +// System.out.println("Filtering."); + out=filterLight(merged); +// System.out.println("Filtered: \t"+out.size()); + } + merged=null; + if(out==null){out=new ArrayList(1);} + out.trimToSize(); + if(verbose){if(verbose){System.err.println("Out: \t"+out.size());}} + + if(outName!=null){ + + TextStreamWriter tsw=new TextStreamWriter(outName, true, false, false); + tsw.start(); + tsw.println(Varlet.textHeader()); + for(Varlet v : out){ + StringBuilder sb=v.toText(); + sb.append('\n'); + tsw.print(sb); + } + tsw.poison(); + + } + return out; + } + + + private final ArrayList filterLight(ArrayList vars){ + 
if(vars==null || vars.size()==0){return null;} + + int dropped=0; + for(int i=0; i2){ + passes=false; + } + + if(passes && STRICT){ + passes=passesFilterLight(v); + } + + if(passes){ + if(v.varType==Variation.NOREF){totalKeptNR++;} + else if(v.varType==Variation.SNP){snpKept++;} + else if(v.varType==Variation.DEL){ + delKept++; +// delLenKept-=v.lengthRef(); + delLenKept+=dif; + } + else if(v.varType==Variation.INS){ + insKept++; +// insLenKept+=v.lengthVar(); + insLenKept+=dif; + } + else if(v.varType==Variation.DELINS){ + subKept++; +// subLenKept+=(v.lengthRef()-v.lengthVar()); + subLenKept+=dif; + } + totalKept++; + scoreKept+=v.score(); + deltaLenKept+=dif; + }else{ + vars.set(i, null); + if(v.varType==Variation.NOREF){totalDroppedNR++;} + dropped++; + scoreDropped+=v.score(); + } + } + totalDropped+=dropped; + if(dropped>0){ + Tools.condenseStrict(vars); + } + return vars; + } + + private static boolean passesFilterLight(Varlet v){ + if(v.endDist<4){return false;} + if(v.tailDist<10){return false;} + + //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required. 
+ if(v.minStrandReads()>=2){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.4f){return false;} + // if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<17){return false;} + if(v.avgReadQuality()<13){return false;} + if(v.maxVarQuality()<26){return false;} + if(v.avgVarQuality()<17){return false;} + if(v.numReads<3){return false;} + if(v.numSemiUniqueReads<3){return false;} + if(v.numUniqueReads<2){return false;} +// if(v.paired<3){return false;} + if(v.score()<8200){return false;} + + }else if(v.minStrandReads()>=1){ + if(v.endDist<7){return false;} + if(v.tailDist<12){return false;} + + if(v.errors>2){return false;} + if(v.expectedErrors>1.1f){return false;} + // if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<18){return false;} + if(v.avgReadQuality()<14){return false;} + if(v.maxVarQuality()<28){return false;} + if(v.avgVarQuality()<18){return false;} + if(v.numReads<4){return false;} + if(v.numSemiUniqueReads<3){return false;} + if(v.numUniqueReads<2){return false;} +// if(v.paired<3){return false;} + if(v.score()<8020){return false;} + }else{ + if(v.endDist<8){return false;} + if(v.tailDist<14){return false;} + + if(v.errors>0){return false;} + if(v.expectedErrors>0.5f){return false;} + // if(v.expectedErrors-v.errors>2f){return false;} + if(v.maxReadQuality()<21){return false;} + if(v.avgReadQuality()<17){return false;} + if(v.maxVarQuality()<30){return false;} + if(v.avgVarQuality()<21){return false;} + if(v.numReads<6){return false;} + if(v.numSemiUniqueReads<5){return false;} + if(v.numUniqueReads<3){return false;} +// if(v.paired<5){return false;} + if(v.score()<7670){return false;} + } + return true; + } + + private long deltaLenKept=0; + private long snpKept=0; + private long delKept=0; + private long insKept=0; + private long subKept=0; + private long delLenKept=0; + private long insLenKept=0; + private long subLenKept=0; + + private long deltaLenIn=0; + private long totalIn=0; + private long 
totalInNR=0; + private long totalKept=0; + private long totalKeptNR=0; + private long totalDropped=0; + private long totalDroppedNR=0; + private long scoreKept=0; + private long scoreDropped=0; + + private final String fname1; + private final String fname2; + private final boolean filter; + } + + private static int addThread(int x){ + synchronized(THREADLOCK){ + while(x>0 && activeThreads>=THREADS){ + try { + THREADLOCK.wait(200); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + activeThreads+=x; + return activeThreads; + } + } + + + public static long deltaLenKept_global=0; + public static long deltaLenIn_global=0; + + public static long snpKept_global=0; + public static long delKept_global=0; + public static long insKept_global=0; + public static long subKept_global=0; + public static long delLenKept_global=0; + public static long insLenKept_global=0; + public static long subLenKept_global=0; + + public static long totalIn_global=0; + public static long totalInNR_global=0; + public static long totalKept_global=0; + public static long totalDropped_global=0; + public static long totalKeptNR_global=0; + public static long totalDroppedNR_global=0; + public static long scoreKept_global=0; + public static long scoreDropped_global=0; + + private static int activeThreads=0; + + private static final String THREADLOCK=new String("THREADLOCK"); + private static int THREADS=3; + public static final int MIN_READS_TO_KEEP=1; + public static final int MIN_QUALITY_AT_MIN_READS=14; + public static final int MAX_ERRORS_AT_MIN_READS=2; + public static final int MAX_EXPECTED_ERRORS_AT_MIN_READS=4; + public static final boolean REQUIRE_PAIRED_AT_MIN_READS=true; + public static boolean STRICT=false; + public static boolean VSTRICT=false; + public static boolean USTRICT=false; + + public static final boolean verbose=false; +} diff --git a/current/var/StackVariations2.java b/current/var/StackVariations2.java new file mode 100755 
index 0000000..e540644 --- /dev/null +++ b/current/var/StackVariations2.java @@ -0,0 +1,835 @@ +package var; + +import java.io.File; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; + + +import dna.Data; +import dna.Gene; +import dna.Timer; +import fileIO.TextStreamWriter; + +import align2.Tools; + +public class StackVariations2 { + + public static void main(String[] args){ + System.err.println("Executing "+(new Object() { }.getClass().getEnclosingClass().getName())+" "+Arrays.toString(args)+"\n"); + + Timer t=new Timer(); + t.start(); + + String inPattern=(args[0].equalsIgnoreCase("null") ? null : args[0]); + String outPattern=args[1]; + + assert(!inPattern.equalsIgnoreCase(outPattern)); + + int minChrom=-1; + int maxChrom=-1; + + boolean filter=false; + Data.GENOME_BUILD=-1; + + for(int i=2; i1 ? split[1] : null); + + if(arg.startsWith("-Xmx") || arg.startsWith("-Xms") || arg.equals("-ea") || arg.equals("-da")){ + //jvm argument; do nothing + }else if(a.equalsIgnoreCase("filter")){filter=true;} + else if(a.startsWith("filter")){ + if(b.equals("1") || b.startsWith("t")){filter=true;} + else if(b.equals("0") || b.startsWith("f")){filter=false;} + else{throw new RuntimeException("Unknown parameter "+args[i]);} + }else if(a.equalsIgnoreCase("strict")){ + if(b==null){STRICT=true;} + else if(b.equals("1") || b.startsWith("t")){STRICT=true;} + else if(b.equals("0") || b.startsWith("f")){STRICT=false;} + else{throw new RuntimeException("Unknown parameter "+args[i]);} + }else if(a.equals("genome") || a.equals("build")){ + Data.setGenome(Integer.parseInt(b)); + if(minChrom==-1){minChrom=1;} + if(maxChrom==-1){maxChrom=Data.numChroms;} + }else if(a.equals("minchrom")){ + minChrom=Integer.parseInt(b); + }else if(a.equals("maxchrom")){ + maxChrom=Integer.parseInt(b); + }else if(a.equals("threads") || a.equals("t")){ + THREADS=Integer.parseInt(b); + }else if(a.equals("minreads")){ + 
MIN_READS_TO_KEEP=Integer.parseInt(b); + }else if(a.equals("blocksize")){ + GenerateVarlets2.BLOCKSIZE=(Integer.parseInt(b)); + }else if(a.equals("deletefiles") || a.startsWith("deletetemp") || a.startsWith("deleteinput") || a.equals("delete")){ + DELETE_INPUT=(Tools.parseBoolean(b)); + }else{ + throw new RuntimeException("Unknown parameter "+args[i]); + } + } + + assert(minChrom>=0 && maxChrom>=minChrom) : "Please set minchrom and maxchrom."; + if(Data.GENOME_BUILD<0){throw new RuntimeException("Please set genome number.");} + THREADS=Tools.max(1, THREADS); + +// for(byte i=minChrom; i<=maxChrom; i++){ +// String fname1=inPattern.replace("#", i+""); +// String fname2=outPattern.replace("#", i+""); +// assert(new File(fname1).exists()); +// assert(!new File(fname2).exists()); +// processFile(fname1, fname2, filter); +// } + + runThreaded(inPattern, outPattern, minChrom, maxChrom, filter); + + t.stop(); + System.out.println("Input Vars: \t"+(totalIn_global-totalInNR_global)); + System.out.println("Input No-ref: \t"+totalInNR_global); + System.out.println("Input Delta Length:\t"+deltaLenIn_global); + System.out.println(); + System.out.println("Kept Vars: \t"+(totalKept_global-totalKeptNR_global)); + System.out.println("Kept No-ref: \t"+totalKeptNR_global); + System.out.println("Kept Snp: \t"+snpKept_global); + System.out.println("Kept Del: \t"+delKept_global+"\t\tLength: \t"+delLenKept_global); + System.out.println("Kept Ins: \t"+insKept_global+"\t\tLength: \t"+insLenKept_global); + System.out.println("Kept Sub: \t"+subKept_global+"\t\tLength: \t"+subLenKept_global); + System.out.println("Kept Delta Length: \t"+deltaLenKept_global); + System.out.println("Kept Avg Score: \t"+(scoreKept_global/(Tools.max(1, totalKept_global)))); + System.out.println(); + System.out.println("Dropped Vars: \t"+(totalDropped_global-totalDroppedNR_global)); + System.out.println("Dropped No-ref: \t"+totalDroppedNR_global); + System.out.println("Dropped Avg Score: 
\t"+(scoreDropped_global/Tools.max(1, totalDropped_global))); + System.out.println(); + System.out.println("Time: \t"+t); + } + + public static final void runThreaded(String inPattern, String outPattern, int minChrom, int maxChrom, boolean filter){ + ArrayList svts=new ArrayList(); + for(int i=minChrom; i<=maxChrom; i++){ + assert(inPattern==null || !inPattern.equalsIgnoreCase(outPattern)); + String fname1=inPattern; + String fname2=outPattern.replace("#", i+""); + addThread(1); + SVThread svt=new SVThread(fname1, fname2, i, filter); + svts.add(svt); + new Thread(svt).start(); + } + while(addThread(0)>0){} + for(SVThread svt : svts){ + + snpKept_global+=svt.snpKept; + delKept_global+=svt.delKept; + insKept_global+=svt.insKept; + subKept_global+=svt.subKept; + delLenKept_global+=svt.delLenKept; + insLenKept_global+=svt.insLenKept; + subLenKept_global+=svt.subLenKept; + deltaLenKept_global+=svt.deltaLenKept; + + deltaLenIn_global+=svt.deltaLenIn; + totalIn_global+=svt.totalIn; + totalInNR_global+=svt.totalInNR; + totalKept_global+=svt.totalKept; + totalDropped_global+=svt.totalDropped; + totalKeptNR_global+=svt.totalKeptNR; + totalDroppedNR_global+=svt.totalDroppedNR; + scoreKept_global+=svt.scoreKept; + scoreDropped_global+=svt.scoreDropped; + } + } + + + public static boolean passesFilterSNP(Varlet v){ + + + //Best so far: + + if(STRICT){ + + if(v.endDist<3){return false;} + if(v.tailDist<10){return false;} + + //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required. 
+ if(v.minStrandReads()>=2){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.5f){return false;} +// if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<18){return false;} + if(v.avgReadQuality()<13){return false;} + if(v.maxVarQuality()<26){return false;} + if(v.avgVarQuality()<18){return false;} + if(v.numReads<4){return false;} + if(v.numSemiUniqueReads<4){return false;} + if(v.numUniqueReads<2){return false;} + if(v.paired<3){return false;} + + }else if(v.minStrandReads()>=1){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.2f){return false;} +// if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<19){return false;} + if(v.avgReadQuality()<14){return false;} + if(v.maxVarQuality()<28){return false;} + if(v.avgVarQuality()<19){return false;} + if(v.numReads<3){return false;} + if(v.numSemiUniqueReads<3){return false;} + if(v.numUniqueReads<2){return false;} + if(v.paired<3){return false;} + + }else{ + if(v.endDist<8){return false;} + if(v.tailDist<14){return false;} + + if(v.errors>0){return false;} + if(v.expectedErrors>0.5f){return false;} +// if(v.expectedErrors-v.errors>2f){return false;} + if(v.maxReadQuality()<21){return false;} + if(v.avgReadQuality()<17){return false;} + if(v.maxVarQuality()<30){return false;} + if(v.avgVarQuality()<21){return false;} + if(v.numReads<6){return false;} + if(v.numSemiUniqueReads<5){return false;} + if(v.numUniqueReads<3){return false;} + if(v.paired<5){return false;} + if(v.score()<8100){return false;} + } + +// else{ +// if(v.endDist<8){return false;} +// if(v.tailDist<14){return false;} +// +// if(v.errors>0){return false;} +// if(v.expectedErrors>0.5f){return false;} +//// if(v.expectedErrors-v.errors>2f){return false;} +// if(v.maxReadQuality()<21){return false;} +// if(v.avgReadQuality()<17){return false;} +// if(v.maxVarQuality()<30){return false;} +// if(v.avgVarQuality()<21){return false;} +// if(v.numReads<5){return false;} +// 
if(v.numSemiUniqueReads<4){return false;} +// if(v.numUniqueReads<2){return false;} +// if(v.paired<4){return false;} +// if(v.score()<8100){return false;} +// } + + }else{ + + assert(false) : "disabled"; + + } + + + + return true; + } + + public static boolean passesFilterOther(Varlet v){ + + + + if(v.endDist<3){return false;} + if(v.tailDist<10){return false;} + + //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required. + if(v.minStrandReads()>=2){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.5f){return false;} +// if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<16){return false;} + if(v.avgReadQuality()<12){return false;} + if(v.maxVarQuality()<26){return false;} + if(v.avgVarQuality()<16){return false;} + if(v.numReads<4){return false;} + if(v.numSemiUniqueReads<4){return false;} + if(v.numUniqueReads<2){return false;} + if(v.paired<3){return false;} + + }else if(v.minStrandReads()>=1){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.2f){return false;} +// if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<17){return false;} + if(v.avgReadQuality()<13){return false;} + if(v.maxVarQuality()<28){return false;} + if(v.avgVarQuality()<17){return false;} + if(v.numReads<4){return false;} + if(v.numSemiUniqueReads<4){return false;} + if(v.numUniqueReads<2){return false;} + if(v.paired<3){return false;} + + }else{ + if(v.endDist<8){return false;} + if(v.tailDist<14){return false;} + + if(v.errors>0){return false;} + if(v.expectedErrors>0.5f){return false;} +// if(v.expectedErrors-v.errors>2f){return false;} + if(v.maxReadQuality()<20){return false;} + if(v.avgReadQuality()<16){return false;} + if(v.maxVarQuality()<30){return false;} + if(v.avgVarQuality()<20){return false;} + if(v.numReads<6){return false;} + if(v.numSemiUniqueReads<5){return false;} + if(v.numUniqueReads<3){return false;} + if(v.paired<5){return false;} + if(v.score()<6500){return 
false;} + } + + + + + + return true; + } + + + public static ArrayList mergeAll(ArrayList vars){ + if(vars==null || vars.size()==0){return null;} + ArrayList out=new ArrayList(8+vars.size()/16); + Collections.sort(vars); + + ArrayList temp=new ArrayList(64); + for(int i=0; iMIN_READS_TO_KEEP){ + out.add(result); + }else if(result.numReads==MIN_READS_TO_KEEP){ + if(result.maxVarQuality()>=MIN_QUALITY_AT_MIN_READS && + result.errors<=MAX_ERRORS_AT_MIN_READS && + result.expectedErrors<=MAX_EXPECTED_ERRORS_AT_MIN_READS && + (result.paired>0 || !REQUIRE_PAIRED_AT_MIN_READS)){ + out.add(result); + } + } + temp.clear(); + temp.add(v); + } + } + + + } + + if(!temp.isEmpty()){ + if(temp.size()>=MIN_READS_TO_KEEP){ + Varlet result=mergeEqualVarlets(temp); + out.add(result); + } + temp.clear(); + } + + {//For testing + Collections.sort(out); //Should already be sorted... + for(int i=1; i> plus=new HashMap>(Tools.min(8, vars.size())); + HashMap> minus=new HashMap>(Tools.min(8, vars.size())); + + int numReads=0; + int numSemiUniqueReads=0; + int numUniqueReads=0; + int pairedReads=0; + int plusReads1=0; + int minusReads1=0; + int plusReads2=0; + int minusReads2=0; + + int totalQuality=0; + int totalVarQuality=0; + + int maxReadQuality=0; + int maxVarQuality=0; + + int maxMapScore=0; + int bestLen=0; + int bestMapLen=0; + int minReadStart=Integer.MAX_VALUE; + int maxReadStop=-999999; + + int maxHeadDist=-1; + int maxTailDist=-1; + int maxEndDist=-1; + + Varlet bestVar=null; + + int minErrors=999; + float minExpectedErrors=999f; + + for(Varlet v : vars){ + + numReads+=v.numReads; + numSemiUniqueReads+=v.numSemiUniqueReads; + plusReads1+=v.numPlusReads1; + minusReads1+=v.numMinusReads1; + plusReads2+=v.numPlusReads2; + minusReads2+=v.numMinusReads2; + + if(v.errorsmaxReadQuality)){ + bestVar=v; + } + + totalQuality+=v.avgReadQuality()*v.numReads; + maxReadQuality=Tools.max(maxReadQuality, v.maxReadQuality()); + + totalVarQuality+=v.avgVarQuality()*v.numReads; + 
maxVarQuality=Tools.max(maxVarQuality, v.maxVarQuality()); + + if(bestLen==0 || (v.mapScore>=maxMapScore && v.readLen>=bestLen)){ + bestLen=v.readLen; + bestMapLen=v.readMapLen; + } + + maxHeadDist=Tools.max(maxHeadDist, v.headDist); + maxTailDist=Tools.max(maxTailDist, v.tailDist); + maxEndDist=Tools.max(maxEndDist, v.endDist); + + minErrors=Tools.min(minErrors, v.errors); + minExpectedErrors=Tools.min(minExpectedErrors, v.expectedErrors); + maxMapScore=Tools.max(maxMapScore, v.mapScore); + minReadStart=Tools.min(minReadStart, v.readStart); + maxReadStop=Tools.max(maxReadStop, v.readStop); + assert(minReadStart value=plus.get(v.readStart); + if(value==null){ + numUniqueReads++; + value=new ArrayList(2); + plus.put(v.readStart, value); + } + value.add(v); + }else{ + ArrayList value=minus.get(v.readStop); + if(value==null){ + numUniqueReads++; + value=new ArrayList(2); + minus.put(v.readStop, value); + } + value.add(v); + } + } + +// byte plusReads=(byte) ((plus.isEmpty() ? 0 : 1)+(minus.isEmpty() ? 0 : 1)); + + float avgVarQuality=totalVarQuality/(float)numReads; + float avgReadQuality=totalQuality/(float)numReads; + + int netQuality=(int)Math.ceil((avgVarQuality+maxVarQuality)/2); + int netReadQuality=(int)Math.ceil((avgReadQuality+maxReadQuality)/2); + + Varlet v=new Varlet(bestVar.chromosome, ((plusReads1+plusReads2>0) && (minusReads1+minusReads2>0) ? 
Gene.PLUS : bestVar.strand), + bestVar.beginLoc, bestVar.endLoc, bestVar.matchStart, bestVar.matchStop, bestVar.varType, bestVar.ref, bestVar.call, + netQuality, netReadQuality, maxMapScore, minErrors, minExpectedErrors, pairedReads, bestVar.readID, bestLen, bestMapLen, + minReadStart, maxReadStop, numReads, maxHeadDist, maxTailDist, maxEndDist, bestVar.pairNum()); + + + v.setMaxReadQuality(maxReadQuality); + v.setMaxVarQuality(maxVarQuality); + v.setAvgReadQuality((int)Math.ceil(avgReadQuality)); + v.setAvgVarQuality((int)Math.ceil(avgVarQuality)); + + v.numSemiUniqueReads=numSemiUniqueReads; + v.numUniqueReads=numUniqueReads; + v.numPlusReads1=plusReads1; + v.numMinusReads1=minusReads1; + v.numPlusReads2=plusReads2; + v.numMinusReads2=minusReads2; + assert(plusReads1+minusReads1+plusReads2+minusReads2==numSemiUniqueReads); + + assert(v.numReads>=v.numSemiUniqueReads); + assert(v.numSemiUniqueReads>=v.numUniqueReads); + + //This assertion is only correct if stacking is done from raw, uncombined varlets. 
+ assert(v.numSemiUniqueReads==vars.size()) : "\n"+vars.size()+", "+v.numReads+", "+v.numSemiUniqueReads+", "+v.numUniqueReads + +"\n"+v.toText(); + + assert(v.numUniqueReads<=v.numReads && v.numUniqueReads>0); + assert(v.numUniqueReads==plus.size()+minus.size()) : "numUniqueReads="+numUniqueReads+ + ", v.numUniqueReads="+v.numUniqueReads+", v.numReads="+v.numReads + +", plus.size()="+plus.size()+", minus.size()="+minus.size()+"\n"+vars+"\n"; + + return v; + } + + + private static class SVThread implements Runnable { + + public SVThread(String fname1_, String fname2_, final int chrom_, boolean filter_){ + fname1=fname1_; + fname2=fname2_; + filter=filter_; + chrom=chrom_; + } + + @Override + public void run() { +// addThread(1); + assert(activeThreads>0); + processFile(fname1, fname2); + addThread(-1); + } + + private final void processFile(final String inName, final String outName){ + + final long[] keys=GenerateVarlets2.keys(chrom); + final TextStreamWriter tsw=(inName==null ? null : new TextStreamWriter(outName, true, false, false)); + if(tsw!=null){ + tsw.start(); + tsw.println(Varlet.textHeader()); + } + + for(final long key : keys){ + String blockname=GenerateVarlets2.fname(key, inName); + + ArrayList initial=Varlet.fromTextFile(blockname); + + for(Varlet v : initial){ + if(v.varType==Variation.NOREF){totalInNR++;} + totalIn++; + + int dif=v.lengthDif(); + deltaLenIn+=dif; + } + + if(verbose){System.err.println("Initial: \t"+initial.size());} + + int merged=mergeAll2(initial, tsw); + + initial=null; + if(verbose){System.err.println("Merged: \t"+merged);} + + } + + if(tsw!=null){ + tsw.poison(); + if(DELETE_INPUT){ + for(int i=0; i<10 && tsw.isAlive(); i++){ + try { + tsw.join(10000); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + if(tsw.isAlive()){ + System.err.println(tsw.getClass().getName()+" for "+outName+" refused to die."); + assert(false); + } + } + } + + if(DELETE_INPUT){ + for(final long key 
: keys){ + String blockname=GenerateVarlets2.fname(key, inName); +// System.out.println("Deleting "+blockname); + new File(blockname).delete(); + } + } + } + + + + + + private final int mergeAll2(ArrayList vars, TextStreamWriter tsw){ + if(vars==null || vars.size()==0){return 0;} + + Collections.sort(vars); + int out=0; + + ArrayList temp=new ArrayList(64); + for(int i=0; i=MIN_READS_TO_KEEP){ + Varlet result=mergeEqualVarlets(temp); + out++; + processMergedVar(result, tsw); + } + temp.clear(); + } + + return out; + } + + + private final boolean processMergedVar(Varlet v, TextStreamWriter tsw){ + + if(v==null){return false;} + if(v.numReads2){ + passes=false; + } + + if(passes && STRICT){ + passes=passesFilterLight(v); + } + + if(passes){ + if(v.varType==Variation.NOREF){totalKeptNR++;} + else if(v.varType==Variation.SNP){snpKept++;} + else if(v.varType==Variation.DEL){ + delKept++; + // delLenKept-=v.lengthRef(); + delLenKept+=dif; + } + else if(v.varType==Variation.INS){ + insKept++; + // insLenKept+=v.lengthVar(); + insLenKept+=dif; + } + else if(v.varType==Variation.DELINS){ + subKept++; + // subLenKept+=(v.lengthRef()-v.lengthVar()); + subLenKept+=dif; + } + totalKept++; + scoreKept+=v.score(); + deltaLenKept+=dif; + }else{ + if(v.varType==Variation.NOREF){totalDroppedNR++;} + dropped++; + scoreDropped+=v.score(); + } + + totalDropped+=dropped; + return passes; + } + + private static boolean passesFilterLight(Varlet v){ + if(v.endDist<4){return false;} + if(v.tailDist<10){return false;} + + //NOTE! Last thing I did was make this more strict by adding 1 to all the num reads/unique reads required. 
+ if(v.minStrandReads()>=2){ + + if(v.errors>2){return false;} + if(v.expectedErrors>1.4f){return false;} + // if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<17){return false;} + if(v.avgReadQuality()<13){return false;} + if(v.maxVarQuality()<26){return false;} + if(v.avgVarQuality()<17){return false;} + if(v.numReads<3){return false;} + if(v.numSemiUniqueReads<3){return false;} + if(v.numUniqueReads<2){return false;} +// if(v.paired<3){return false;} + if(v.score()<8200){return false;} + + }else if(v.minStrandReads()>=1){ + if(v.endDist<7){return false;} + if(v.tailDist<12){return false;} + + if(v.errors>2){return false;} + if(v.expectedErrors>1.1f){return false;} + // if(v.expectedErrors-v.errors>3f){return false;} + if(v.maxReadQuality()<18){return false;} + if(v.avgReadQuality()<14){return false;} + if(v.maxVarQuality()<28){return false;} + if(v.avgVarQuality()<18){return false;} + if(v.numReads<4){return false;} + if(v.numSemiUniqueReads<3){return false;} + if(v.numUniqueReads<2){return false;} +// if(v.paired<3){return false;} + if(v.score()<8020){return false;} + }else{ + if(v.endDist<8){return false;} + if(v.tailDist<14){return false;} + + if(v.errors>0){return false;} + if(v.expectedErrors>0.5f){return false;} + // if(v.expectedErrors-v.errors>2f){return false;} + if(v.maxReadQuality()<21){return false;} + if(v.avgReadQuality()<17){return false;} + if(v.maxVarQuality()<30){return false;} + if(v.avgVarQuality()<21){return false;} + if(v.numReads<6){return false;} + if(v.numSemiUniqueReads<5){return false;} + if(v.numUniqueReads<3){return false;} +// if(v.paired<5){return false;} + if(v.score()<7670){return false;} + } + return true; + } + + private long deltaLenKept=0; + private long snpKept=0; + private long delKept=0; + private long insKept=0; + private long subKept=0; + private long delLenKept=0; + private long insLenKept=0; + private long subLenKept=0; + + private long deltaLenIn=0; + private long totalIn=0; + private long 
totalInNR=0; + + private long totalKept=0; + private long totalKeptNR=0; + private long totalDropped=0; + private long totalDroppedNR=0; + private long scoreKept=0; + private long scoreDropped=0; + + private final String fname1; + private final String fname2; + private final boolean filter; + private final int chrom; + } + + private static int addThread(int x){ + synchronized(THREADLOCK){ + while(x>0 && activeThreads>=THREADS){ + try { + THREADLOCK.wait(200); + } catch (InterruptedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + } + activeThreads+=x; + return activeThreads; + } + } + + + public static long deltaLenKept_global=0; + public static long deltaLenIn_global=0; + + public static long snpKept_global=0; + public static long delKept_global=0; + public static long insKept_global=0; + public static long subKept_global=0; + public static long delLenKept_global=0; + public static long insLenKept_global=0; + public static long subLenKept_global=0; + + public static long totalIn_global=0; + public static long totalInNR_global=0; + public static long totalKept_global=0; + public static long totalDropped_global=0; + public static long totalKeptNR_global=0; + public static long totalDroppedNR_global=0; + public static long scoreKept_global=0; + public static long scoreDropped_global=0; + + private static int activeThreads=0; + + private static final String THREADLOCK=new String("THREADLOCK"); + private static int THREADS=7; + private static boolean DELETE_INPUT=false; + public static int MIN_READS_TO_KEEP=1; + public static final int MIN_QUALITY_AT_MIN_READS=14; + public static final int MAX_ERRORS_AT_MIN_READS=2; + public static final int MAX_EXPECTED_ERRORS_AT_MIN_READS=4; + public static final boolean REQUIRE_PAIRED_AT_MIN_READS=false; + public static boolean STRICT=false; + public static boolean VSTRICT=false; + public static boolean USTRICT=false; + + public static final boolean verbose=false; +} diff --git 
a/current/var/VarLine.java b/current/var/VarLine.java new file mode 100755 index 0000000..81e5b0e --- /dev/null +++ b/current/var/VarLine.java @@ -0,0 +1,249 @@ +package var; + +import java.io.Serializable; + +import dna.Gene; + + +public class VarLine extends Variation implements Serializable, Cloneable{ + + public static final long serialVersionUID = -4089933371294357462L; + +// >locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef + + public VarLine(){} + + public VarLine(String s, float version){ + String[] line=s.split("\t", -1); + + for(int i=0; i6 ? line[6] : "null"), varTypeMap); + Byte b=varTypeMap2.get(line.length>6 ? line[6] : "null"); + assert(b!=null) : "Can't find "+line[6]+" in "+varTypeMap2.keySet()+"\n\nLine: "+s+"\n"; + varType=b; + + +// locus=Integer.parseInt(line[0]); + + b=(Byte)ploidyMap.get(line[1]); + assert(b!=null) : "\n\n"+line[1]+"\n\n"+s+"\n\n"; + ploidy=b; + + + haplotype=(byte)find(line[2], haploMap); + assert(haplotype>=0) : line[2]; + + chromosome=Gene.toChromosome(line[3]); + assert(chromosome>0) : line[3]+" -> "+line[3].substring(3); + + beginLoc=Integer.parseInt(line[4]); + int tempInt=Integer.parseInt(line[5])-1; //Note: 0,1 based + tempInt=max(tempInt, beginLoc); + endLoc=tempInt; + + String temp; + + temp=line.length>7 ? line[7] : null; + if("?".equals(temp)){temp=null;} + ref=temp; + + temp=line.length>8 ? line[8] : null; + if("?".equals(temp)){temp=null;} + call=temp; + + + if(version<2){ + + totalScore=((line.length<=9 || line[9]==null || line[9].length()<1) ? -1 : Integer.parseInt(line[9])); + hapLink=((line.length<=10 || line[10]==null || line[10].length()<1) ? 
-1 : Integer.parseInt(line[10])); + + assert(beginLoc<=endLoc) : s; + + // System.out.println("\n"+this+"\n"+new Variation(this)+"\n"); + }else{ + +// return "#locus\tploidy\tallele\tchromosome\tbegin\tend\tvarType\treference\talleleSeq\t +// varScoreVAF\tvarScoreEAF\tvarQuality\thapLink\txRef + + int varScoreVAF=((line.length<=9 || line[9]==null || line[9].length()<1) ? -1 : Integer.parseInt(line[9])); + int varScoreEAF=((line.length<=10 || line[10]==null || line[10].length()<1) ? -1 : Integer.parseInt(line[10])); + byte VQ=((line.length<=11 || line[11]==null || line[11].length()<1) ? (byte)0 : (byte)find(line[11], VQARRAY)); + + totalScore=varScoreVAF; + hapLink=((line.length<=12 || line[12]==null || line[12].length()<1) ? -1 : Integer.parseInt(line[12])); + + assert(beginLoc<=endLoc) : s; + +// System.out.println("\n"+this+"\n"+new Variation(this)+"\n"); + } + + assert(!((varType==Variation.INS || varType==Variation.DELINS || varType==Variation.SNP) + && call==null)) : "\nversion="+version+"\n"+s+"\n"+line+"\nline.ref="+ref+"\nline.call="+call+"\nref="+ref+"\ncall="+call; + + intern(); + } + + public VarLine clone(){ + VarLine v=null; +// try { +// v=(VarLine) super.clone(); +// } catch (CloneNotSupportedException e) { +// // TODO Auto-generated catch block +// e.printStackTrace(); +// } + v=(VarLine) super.clone(); + return v; + } + + public VarLine[] splitLine(){ + assert(haplotype==3) : this; + VarLine[] r=new VarLine[2]; + r[0]=this.clone(); + r[1]=this.clone(); + assert(this.equals(r[0]) && r[0].equals(this)); + r[0].haplotype=1; + r[1].haplotype=2; + return r; + } + + public VarLine spawnEqualPoint(){ + assert(this.isPoint()); + VarLine v=this.clone(); + v.varType=REFPOINT; + v.call=v.ref=null; + return v; + } + + public static VarLine makeEqualPoint(byte chrom, int loc, byte hap){ + VarLine v=new VarLine(); + v.chromosome=chrom; + v.beginLoc=loc; + v.endLoc=loc; + v.haplotype=hap; + v.varType=REFPOINT; + return v; + } + + public String 
toSuperString(){return super.toString();} + + + public String toString(){ + StringBuilder sb=new StringBuilder(256); + +// sb.append(locus+"\t"); + sb.append(ploidyMap.get(ploidy)+"\t"); + sb.append(haploMap[haplotype]+"\t"); + sb.append("chr"+Gene.chromCodes[chromosome]+"\t"); + sb.append(beginLoc+"\t"); + sb.append(endLoc+"\t"); + + sb.append(varTypeMap[varType]+"\t"); + sb.append((ref==null ? "" : ref)+"\t"); + sb.append((call==null ? "" : call)+"\t"); + sb.append((totalScore==-1 ? "" : totalScore)+"\t"); //TODO: Note the collision with a true -1 + sb.append((hapLink==-1 ? "" : hapLink)+"\t"); //TODO " + + return sb.toString(); + } + + public static String sourceHeader(){ + return "#locus\tploidy\tallele\tchromosome\tbegin\tend\tvarType\treference\talleleSeq\ttotalScore\thapLink\txRef"; +// return "#locus\tploidy\tallele\tchromosome\tbegin\tend\tvarType\treference\talleleSeq\t +// varScoreVAF\tvarScoreEAF\tvarQuality\thapLink\txRef + } + //locus ploidy allele chromosome begin end varType reference alleleSeq varScoreVAF varScoreEAF varQuality hapLink xRef + + public String toSourceString(){ + StringBuilder sb=new StringBuilder(256); + + sb.append(0+"\t"); + sb.append(ploidyMap.get(ploidy)+"\t"); + sb.append(haploMap[haplotype]+"\t"); + sb.append("chr"+Gene.chromCodes[chromosome]+"\t"); + sb.append(beginLoc+"\t"); + + if(varType==INS){ + sb.append(beginLoc+"\t"); + }else{ + sb.append((endLoc+1)+"\t"); + } + + sb.append(varTypeMap[varType]+"\t"); + sb.append((ref==null ? "" : ref)+"\t"); + sb.append((call==null ? "" : call)+"\t"); + sb.append((totalScore==-1 ? "" : totalScore)+"\t"); //TODO: Note the collision with a true -1 + sb.append((hapLink==-1 ? 
"" : hapLink)+"\t"); //TODO " + + return sb.toString(); + } + + + public String toShortString(){ + StringBuilder sb=new StringBuilder(256); + + sb.append(haploMap[haplotype]); + while(sb.length()<3){sb.append(' ');} + sb.append('\t'); + sb.append(locationString()+"\t"); + + sb.append(varTypeMap[varType]+"\t"); + sb.append((ref==null ? "" : ref)+"\t"); + sb.append((call==null ? "" : call)+"\t"); +// sb.append((totalScore==-1 ? "" : totalScore)+"\t"); //TODO: Note the collision with a true -1 +// sb.append((hapLink==-1 ? "" : hapLink+"\t")); //TODO " + + return sb.toString(); + } + + @SuppressWarnings("unused") + private static final int min(int x, int y){return xy ? x : y;} + + + @Override + public int compareTo(Variation other) { + if(other.getClass()==VarLine.class){ + return compareTo((VarLine)other); + } + return super.compareTo(other); + } + + public int compareTo(VarLine other) { + int x=super.compareTo((Variation)other); + if(x!=0){return x;} + return haplotype-other.haplotype; + } + + public boolean equals(Object other){ + if(other.getClass()==VarLine.class){ + return equals((VarLine)other); + } + return super.equals(other); + } + + public boolean equals(VarLine other){ + return compareTo(other)==0; + } + + public boolean equals(Variation other){ + return super.equals(other); + } + + public byte ploidy; + + /** Which copy this is on */ + public byte haplotype; + public int totalScore; + public int hapLink; + + public static final String[] VQARRAY=new String[] {"?", "VQLOW", "VQHIGH"}; + +} diff --git a/current/var/Variation.java b/current/var/Variation.java new file mode 100755 index 0000000..4b9fea3 --- /dev/null +++ b/current/var/Variation.java @@ -0,0 +1,869 @@ +package var; +import java.io.Serializable; +import java.lang.reflect.Array; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; +import java.util.HashSet; +import java.util.HashMap; +import java.util.Set; + + +import dna.Data; +import dna.Gene; +import 
dna.GeneSet; +import dna.Range; +import driver.Search; + + + +public class Variation implements Comparable, Serializable, Cloneable { + +// >locus ploidy haplotype chromosome begin end varType reference alleleSeq totalScore hapLink xRef + + /** + * + */ + private static final long serialVersionUID = -3847258470952802740l; + + public Variation(VarLine line){ +// this(line.chromosome, line.beginLoc, line.endLoc, line.xRef, line.varType, line.ref, line.call); + this(line.chromosome, line.beginLoc, line.endLoc, line.varType, line.ref, line.call); + + assert(!((varType==INS || varType==DELINS || varType==SNP) && call==null)) : "\n"+line+"\n"+this+ + "\nline.ref="+line.ref+"\nline.call="+line.call+"\nref="+ref+"\ncall="+call; + + assert(beginLoc<=endLoc) : line.toString(); + + assert(this.equals(line)) : "\n\n"+this+"\n!=\n"+line; + assert(line.equals(this)) : "\n\n"+this+"\n!=\n"+line; + + +// if(xRef==11429487){ +// System.out.println("\n"+this.toString()); +// } + } + +// public Variation(GeneVarLine line){ +//// this(line.chromosome, line.beginLoc, line.endLoc, line.xRef, line.varType, line.ref, line.call); +// this(line.chromosome, line.beginLoc, line.endLoc, line.xRef, line.xRefArray, line.varType, line.ref, line.call); +// +// assert(beginLoc<=endLoc) : line.toString(); +// +// assert(this.equals(line)) : "\n\n"+this+"\n!=\n"+line.toSuperString()+"\n\n"+line; +// assert(line.equals(this)) : "\n\n"+this+"\n!=\n"+line.toSuperString()+"\n\n"+line; +// +// } + + public Variation(Variation line){ + this(line.chromosome, line.beginLoc, line.endLoc, line.varType, line.ref, line.call); + + assert(beginLoc<=endLoc) : line.toString(); + + assert(this.equals(line)) : "\n\n"+this+"\n!=\n"+line.toSuperString()+"\n\n"+line; + assert(line.equals(this)) : "\n\n"+this+"\n!=\n"+line.toSuperString()+"\n\n"+line; + + } + + public Variation(int chr, int bLoc, int eLoc, byte vType, String rf, String ca){ + chromosome=chr; + beginLoc=bLoc; + endLoc=eLoc; + varType=vType; + + 
setDetails(vType, rf, ca); + + assert(beginLoc<=endLoc) : toString(); + + } + + public Variation(){} + + + public Variation clone(){ + Variation v=null; + try { + v=(Variation) super.clone(); + } catch (CloneNotSupportedException e) { + // TODO Auto-generated catch block + e.printStackTrace(); + } + return v; + } + + + public static final HashSet toVariations(VarLine[] array, boolean retainEqual){ + HashSet set=new HashSet(array.length); + for(VarLine line : array){ + Variation var=new Variation(line); + if(retainEqual || var.varType!=Variation.REF){ + if(!set.contains(var)){ + set.add(var); + } + } + } + return set; + } + + public static final Variation[] toVariationArray(VarLine[][] array, boolean retainEqual){ + HashSet set=toVariations(array[0], retainEqual); + for(int i=1; i set=toVariations(array, retainEqual); + Variation[] vars=set.toArray(new Variation[set.size()]); + Arrays.sort(vars); + return vars; + } + + @SuppressWarnings("unchecked") + public static final > X[] toArray(Class c, Set set){ + + set.getClass().getTypeParameters(); + X[] array=(X[])Array.newInstance(c,set.size()); + + array=set.toArray(array); + int i=0; + for(X x : set){ + array[i]=x; + i++; + } + Arrays.sort(array); + return array; + } + + + public static VarLine[] filterCodingVariances(VarLine[] variances, int chrom, boolean nearby){ + Range[] ranges=(nearby ? 
Data.geneNearbyRangeMatrix(chrom) : Data.geneCodeAndExonRangeMatrix(chrom)); + + ArrayList list=new ArrayList(8+variances.length/8); + + for(VarLine var : variances){ + + if(var.varType!=VarLine.REF && var.varType!=VarLine.NOREF){ + int loc=var.beginLoc; + int rnum=Search.findPointBinary(loc, ranges); + + if(ranges[rnum].intersects(var.beginLoc, var.endLoc)){ + list.add(var); + } + + for(int i=rnum; ivar.endLoc){break;} //Out of range + + if(r.intersects(var.beginLoc, var.endLoc)){ + list.add(var); + break; + } + } + } + } + + return list.toArray(new VarLine[list.size()]); + } + + + + + /** + * Generates an array of non-overlapping Ranges, sorted by index, ascending. + * To each is attached a list of all overlapping Variations from the input array. + * @param va + * @return The array of ranges + */ + public static Range[] makeVarRanges(Variation[] va){ + // System.out.println("va.length="+va.length); + + if(va==null || va.length==0){ + return new Range[0]; + } + + ArrayList ra=new ArrayList(va.length); + for(Variation v : va){ + Range r=new Range(v.beginLoc, v.endLoc); + r.obj1=new ArrayList(); + ((ArrayList)r.obj1).add(v); + ra.add(r); + } + Collections.sort(ra); + ArrayList ra2=new ArrayList(va.length); + Range current=null; + // System.out.println("ra.size="+ra.size()); + for(Range r : ra){ + // System.out.println("\ncurrent="+current+", r="+r); + if(current==null){current=r;} + else if(current.intersects(r)){ + // System.out.println("merged"); + Range temp=current.merge(r); + temp.obj1=current.obj1; + ((ArrayList)temp.obj1).addAll((ArrayList)r.obj1); + current=temp; + }else{ + // System.out.println("added"); + ra2.add(current); + current=r; + } + // System.out.println("current="+current+", r="+r); + } + // System.out.println("\ncurrent="+current); + // System.out.println("ra2.size="+ra2.size()); + assert(current!=null); //Note: this could be null if input was empty, I guess... 
+ assert(ra2.size()==0 || ra2.get(ra2.size()-1)!=current); + ra2.add(current); + return ra2.toArray(new Range[ra2.size()]); + } + + public static final int toRsid(String s){return xRefToId(s);} + public static final int xRefToId(String s){ +// System.out.println(s); + if(s==null || s.length()==0){return -1;} +// assert(s.startsWith("dbsnp:rs")) : s; + + if(s.contains(":")){ + s=s.substring(s.indexOf(':')+1); + } + + int i=0, max=s.length(); +// System.err.println(s); + while(i=max){assert(s.equals(".")) : s; return -1;} + s=s.substring(i); + + return Integer.parseInt(s); + } + + public static final int[] toRsidArray(String s){return xRefToIdArray(s);} + public static final int[] xRefToIdArray(String s){ + if(s==null || s.length()<1){return null;} + String[] array=s.split("[,;]"); + int[] r=new int[array.length]; + for(int i=0; i=25); + return true; + } + + + for(GeneSet gs : sets){ +// if(flag){System.out.println("### "+gs);}//TODO UNDO + for(Gene g : gs.genes){ + + if(!g.untranslated){ + +// if(flag){System.out.println("*** "+g);}//TODO UNDO + +// if(flag){ +// System.out.println("intersectsCodeAndExon: "+g.intersectsCodeAndExon(a, b)); +// System.out.println("intersectsCode: "+g.intersectsCode(a, b)); +// System.out.println("intersectsExon: "+g.intersectsExon(a, b)); +// } + + if(g.intersectsCodeAndExon(a, b)){ + return true; + } + + }else if(includeExonsForUntranslatedGenes){ +// if(flag){System.out.println("*** "+g);}//TODO UNDO +// +// if(flag){ +// System.out.println("intersectsExon: "+g.intersectsExon(a, b)); +// } + + if(g.intersectsExon(a, b)){ + return true; + } + + } + + if(includeSplice){ + int[] array=g.nearestSpliceSite(beginLoc, endLoc); + if(array[0]<=range){return true;} + } + + } + } + return false; + } + + + /** Does this variation lie at least partially within an intron? 
*/ + public boolean intersectsIntron(GeneSet[] sets){ + assert(beginLoc<=endLoc); + int a=beginLoc, b=endLoc; + +// int middle=((beginLoc+endLoc)/2); +// GeneSet[] sets=Data.getNearestGeneSets(chromosome, middle); + + if(sets==null){ + assert(chromosome>=25); + return true; + } + + + for(GeneSet gs : sets){ + for(Gene g : gs.genes){ + if(g.intersectsIntron(a, b)){return true;} + } + } + return false; + } + + public int beginLoc=-2; + public int endLoc=-2; + + public int chromosome=-1; + public byte varType=-1; + + public String ref=null; + public String call=null; + + + public static final HashMap ploidyMap=makePloidyMap(); + public static final String[] haploMap={"0","1","2","all"}; + + public static final String[] varTypeMap={"ref","snp","ins","del","sub", + "no-call-rc","no-call-ri","no-call","no-ref","PAR-called-in-X","null","refpoint"}; + + public static final HashMap varTypeMap2=makeVarTypeMap(); + + private static final HashMap makeVarTypeMap(){ + HashMap r=new HashMap(32); + + for(byte i=0; i makePloidyMap(){ + HashMap hashy=new HashMap(64); + for(int i=0; i<10; i++){ + hashy.put((Byte)(byte)i, i+""); + hashy.put((Integer)i, i+""); + hashy.put(i+"", (Byte)(byte)i); + } + hashy.put((Byte)(byte)-1, "?"); + hashy.put((Integer)(-1), "?"); + hashy.put("?",(Byte)(byte)-1); + return hashy; + } + + private static final int min(int x, int y){return xy ? x : y;} + + @Override + public final int hashCode(){ + long x=chromosome; + x=x<<4; + x^=varType; + x=x<<28; + x^=beginLoc; + x=x<<16; + x^=(endLoc-beginLoc+1); + return new Long(x).hashCode(); //TODO: Slow + } + + @Override + public int compareTo(Variation other) { + if(chromosome!=other.chromosome){return other.chromosome>chromosome ? -1 : 1;} + if(beginLoc!=other.beginLoc){return other.beginLoc>beginLoc ? -1 : 1;} + if(endLoc!=other.endLoc){return other.endLoc>endLoc ? -1 : 1;} + if(varType!=other.varType){return other.varType>varType ? 
-1 : 1;} + if(varType==REF || varType==NOCALL){return 0;} + + if(call==null){ + return other.call==null ? 0 : -1; + } + return other.call==null ? 1 : call.compareTo(other.call); + } + + public boolean equals(Object other){ + return equals((Variation)other); + } + + public boolean equals(Variation other){ + return compareTo(other)==0; + } + + public boolean intersects(int point){ + return point>=beginLoc && point<=endLoc; + } + + public boolean touches(int point){ + return point>=beginLoc-1 && point<=endLoc+1; + } + + /** This is quite clever. */ + public static boolean overlap(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=b1 && b2>=a1; + } + public static boolean touch(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a2<=(b1+1) && b2>=(a1-1); + } + + /** Is (a1, b1) within (a2, b2) ? */ + public static boolean isWithin(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a1>=a2 && b1<=b2; + } + + public static boolean isWithinNotTouching(int a1, int b1, int a2, int b2){ + assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; + return a1>a2 && b10){return overlap(beginLoc, endLoc, v.beginLoc, v.endLoc);} //Normal case + else{ + //TODO: Bad news! Original MAY have been a length 0 no-call in half-open coordinates. 
+ return overlap(beginLoc, endLoc+1, v.beginLoc, v.endLoc); + } + } +// if(v.beginLoc==46397336 || v.beginLoc==46397348){System.out.println("x");} + if(v.beginLoc<=beginLoc){return false;} +// if(v.beginLoc==46397336 || v.beginLoc==46397348){System.out.println("y");} + } +// if(v.beginLoc==46397336 || v.beginLoc==46397348){System.out.println("z");} + + return overlap(beginLoc, endLoc, v.beginLoc, v.endLoc); + } +} diff --git a/current/var/Varlet.java b/current/var/Varlet.java new file mode 100755 index 0000000..007e7b0 --- /dev/null +++ b/current/var/Varlet.java @@ -0,0 +1,410 @@ +package var; + +import java.util.ArrayList; + +import align2.QualityTools; +import align2.Tools; + + +import dna.Gene; +import fileIO.TextFile; + +public class Varlet extends var.Variation { + + + public Varlet(int chrom_, byte strand_, int start_, int stop_, int matchStart_, int matchStop_, byte vType, String rf, String ca, + int varQuality_, int readQuality_, int mapScore_, int errors_, float expectedErrors_, int paired_, long readID_, + int readLen_, int readMapLen_, + int readStart_, int readStop_, int readCopies_, int headDist_, int tailDist_, int endDist_, int pairnum){ + super(chrom_, start_, stop_, vType, rf, ca); + strand=strand_; + + setQvector(varQuality_, readQuality_, varQuality_, readQuality_); + + mapScore=mapScore_; + errors=errors_; + expectedErrors=expectedErrors_; + paired=paired_; + + matchStart=matchStart_; + matchStop=matchStop_; + + readID=readID_; + readLen=readLen_; + readMapLen=readMapLen_; + + readStart=readStart_; + readStop=readStop_; + + numReads=Tools.min(readCopies_, Short.MAX_VALUE); + + + headDist=headDist_; + tailDist=tailDist_; + endDist=endDist_; + + if(pairnum==0){ + if(strand==Gene.PLUS){numPlusReads1=1;} + else{numMinusReads1=1;} + }else{ + if(strand==Gene.PLUS){numPlusReads2=1;} + else{numMinusReads2=1;} + } + + assert(pairnum==0 || pairnum==1) : pairnum+"\n"+this; +// assert(readID_=1) : readCopies_+"\n"+this; +// 
assert(readCopies_0){sb.append("\t"+coverageAtLoc);} + return sb; + } + + public static final ArrayList fromTextFile(String fname){ + TextFile tf=new TextFile(fname, false, false); + ArrayList list=new ArrayList(2000); + + for(String s=tf.nextLine(); s!=null; s=tf.nextLine()){ + if(s.charAt(0)!='#'){ + Varlet v=Varlet.fromText(s); + list.add(v); + } + } + tf.close(); + list.trimToSize(); + return list; + } + + public static final Varlet fromText(String line){ + String[] split=line.split("\t"); + + int chrom=Byte.parseByte(split[0]); + byte strand=Gene.toStrand(split[1]); + int readStart=Integer.parseInt(split[2]); + int readStop=Integer.parseInt(split[3]); + int start=Integer.parseInt(split[4]); + int stop=Integer.parseInt(split[5]); + byte varType=Variation.varTypeMap2.get(split[6]); + + int mapScore=Integer.parseInt(split[7]); + int errors=Integer.parseInt(split[8]); + float expectedErrors=Float.parseFloat(split[9]); + + long readID=Integer.parseInt(split[10]); + int readLen=Integer.parseInt(split[11]); + int readMapLen=Integer.parseInt(split[12]); + int headDist=Integer.parseInt(split[13]); + int tailDist=Integer.parseInt(split[14]); + int endDist=Integer.parseInt(split[15]); + + int avgVarQuality=Integer.parseInt(split[16]); + int maxVarQuality=Integer.parseInt(split[17]); + int avgReadQuality=Integer.parseInt(split[18]); + int maxReadQuality=Integer.parseInt(split[19]); + int numReads=Integer.parseInt(split[20]); + int numSemiUniqueReads=Integer.parseInt(split[21]); + int numUniqueReads=Integer.parseInt(split[22]); + int paired=Integer.parseInt(split[23]); + int numPlusReads1=Integer.parseInt(split[24]); + int numMinusReads1=Integer.parseInt(split[25]); + int numPlusReads2=Integer.parseInt(split[26]); + int numMinusReads2=Integer.parseInt(split[27]); + + String ref=split[28]; + String call=split[29]; + if(ref.length()==1 && ref.charAt(0)=='.'){ref=null;} + if(call.length()==1 && call.charAt(0)=='.'){call=null;} + + + + Varlet v=new Varlet(chrom, strand, start, 
stop, -1, -1, varType, ref, call, avgVarQuality, avgReadQuality, + mapScore, errors, expectedErrors, paired, readID, readLen, readMapLen, readStart, readStop, numReads, + headDist, tailDist, endDist, 1); + + v.setQvector(avgVarQuality, avgReadQuality, maxVarQuality, maxReadQuality); + v.numPlusReads1=numPlusReads1; + v.numMinusReads1=numMinusReads1; + v.numPlusReads2=numPlusReads2; + v.numMinusReads2=numMinusReads2; + v.numSemiUniqueReads=numSemiUniqueReads; + v.numUniqueReads=numUniqueReads; + + return v; + } + + + @Override + public boolean equals(Variation other){ +// assert(other.getClass()!=Varlet.class); + return super.compareTo(other)==0; + } + + //DO NOT enable this! Varlets should use equality based on Variation data only. +// public boolean equals(Varlet other){ +// return compareTo(other)==0; +// } + + @Override + public int compareTo(Variation other) { +// if(other.getClass()==Varlet.class){} //not needed in practice + return(compareTo((Varlet)other)); + } + + public int compareTo(Varlet other) { + +// int a=compareTo2(other); +// int b=other.compareTo2(this); +// assert(a==-b) : "\n"+a+", "+b+"\n"+Varlet.header()+"\n"+this+"\n"+other+"\n"; + + if(chromosome!=other.chromosome){return chromosome-other.chromosome;} + if(beginLoc!=other.beginLoc){return other.beginLoc>beginLoc ? -1 : 1;} + if(endLoc!=other.endLoc){return other.endLoc>endLoc ? 
-1 : 1;} + if(varType!=other.varType){return varType-other.varType;} + if(varType==REF || varType==NOCALL){return 0;} + + if(call==null && other.call!=null){return -1;} + if(call!=null && other.call==null){return 1;} + if(call!=null && other.call!=null){ + int x=call.compareTo(other.call); + if(x!=0){return x;} + } + + if(readStart!=other.readStart){return readStart-other.readStart;} + if(readStop!=other.readStop){return readStop-other.readStop;} + if(strand!=other.strand){return strand-other.strand;} + if(maxVarQuality()!=other.maxVarQuality()){return other.maxVarQuality()>8)&0xFF;}; + public int maxVarQuality(){return (qvector>>16)&0xFF;}; + public int maxReadQuality(){return (qvector>>24)&0xFF;}; + + public void setAvgVarQuality(int value){ + qvector=((qvector&0xFFFFFF00)|(value&0xFF)); + } + public void setAvgReadQuality(int value){ + qvector=((qvector&0xFFFF00FF)|((value&0xFF)<<8)); + } + public void setMaxVarQuality(int value){ + qvector=((qvector&0xFF00FFFF)|((value&0xFF)<<16)); + } + public void setMaxReadQuality(int value){ + qvector=((qvector&0x00FFFFFF)|((value&0xFF)<<24)); + } + public void setQvector(int avq, int arq, int mvq, int mrq){ + qvector=mrq&0xFF; + qvector=(qvector<<8)|(mvq&0xFF); + qvector=(qvector<<8)|(arq&0xFF); + qvector=(qvector<<8)|(avq&0xFF); + } + + + + public int mapScore; + public int errors; + + public float expectedErrors; + + public int matchStart; + public int matchStop; + + public int readStart; + public int readStop; + + public int headDist; + public int tailDist; + public int endDist; + + public byte strand; + public int paired; + + public long readID; + + /** Length of read when used for calling vars; ie, after being trimmed, and after colorspace conversion. 
*/ + public int readLen; + /** Length of read when mapping */ + public int readMapLen; + + public int numReads; + public int numSemiUniqueReads=1; + public int numUniqueReads=1; +// public int coverageAtLoc=0; + +// public byte numStrands=1; + + /** Varlets from read 1 mapped to plus strand */ + public int numPlusReads1=0; + + /** Varlets from read 1 mapped to minus strand */ + public int numMinusReads1=0; + + /** Varlets from read 2 mapped to plus strand */ + public int numPlusReads2=0; + + /** Varlets from read 2 mapped to minus strand */ + public int numMinusReads2=0; + + /** Number of reads1 and reads2 mapped to the plus strand */ + public int numPlusMappedReads(){ + return numPlusReads1+numPlusReads2; + } + + /** Number of reads1 and reads2 from which the original molecule (i.e., read 1) mapped to the plus strand */ + public int numPlusOriginReads(){ + return numPlusReads1+numMinusReads2; + } + + /** Number of reads1 and reads2 mapped to the minus strand */ + public int numMinusMappedReads(){ + return numMinusReads1+numMinusReads2; + } + + /** Number of reads1 and reads2 from which the original molecule (i.e., read 1) mapped to the minus strand */ + public int numMinusOriginReads(){ + return numMinusReads1+numPlusReads2; + } + + public int minStrandReads(){return Tools.min(numPlusMappedReads(), numMinusMappedReads());} + +// public byte numStrands(){return (byte)((numPlusReads>0 ? 1 : 0)+(numMinusReads>0 ? 1 : 0));} + public int minStrandReads4(){return Tools.min(numPlusReads1, numMinusReads1, numPlusReads2, numMinusReads2);} + public int minStrandReads3(){//return second lowest number + + final int a, b, c, d; + if(numPlusReads1<=numMinusReads1){a=numPlusReads1; b=numMinusReads1;} + else{b=numPlusReads1; a=numMinusReads1;} + if(numPlusReads2<=numMinusReads2){c=numPlusReads2; d=numMinusReads2;} + else{d=numPlusReads2; c=numMinusReads2;} + + return Tools.min(b, d, (a>=c ? a : c)); + + } + public int strandReadCount(){ + return (numPlusReads1>0 ? 
1 : 0)+(numMinusReads1>0 ? 1 : 0)+(numPlusReads2>0 ? 1 : 0)+(numMinusReads2>0 ? 1 : 0); + } + + public int pairNum(){ + return (numPlusReads1+numMinusReads1)>0 ? 0 : 1; + } + +} diff --git a/docs/Legal.txt b/docs/Legal.txt new file mode 100755 index 0000000..47b962c --- /dev/null +++ b/docs/Legal.txt @@ -0,0 +1,9 @@ +BBTools Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved. + + + +If you have questions about your rights to use or distribute this software, please contact Technology Transfer and IP Management at TTD@lbl.gov referring to " BB Tools (LBNL Ref 2014-042)." + + + +NOTICE. This software was developed under funding from the U.S. Department of Energy. As such, the U.S. Government has been granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, and perform publicly and display publicly. Beginning five (5) years after the date permission to assert copyright is obtained from the U.S. Department of Energy, and subject to any subsequent five (5) year renewals, the U.S. Government is granted for itself and others acting on its behalf a paid-up, nonexclusive, irrevocable, worldwide license in the Software to reproduce, prepare derivative works, distribute copies to the public, perform publicly and display publicly, and to permit others to do so. diff --git a/docs/changelog_bbduk.txt b/docs/changelog_bbduk.txt new file mode 100755 index 0000000..76cbf50 --- /dev/null +++ b/docs/changelog_bbduk.txt @@ -0,0 +1,77 @@ +BBDuk readme by Brian Bushnell +Last updated February 18, 2014. +Please contact me at bbushnell@lbl.gov if you have any questions or encounter any errors. + +Version 9: +All program message information now defaults to stderr. 
+ +Version 8: +Fixed bug in which the program would exit immediately if the first batch of reads were all discarded. Found by Bryce Foster. + +Version 7: +hdist>0 or edist>0 stopped working because the leading '1' bit was not appended when searching; fixed. +Added assertions requiring a ktrim mode if useShortKmers is enabled. +Added message notifying when maskMiddle is disabled due to useShortKmers or kbig. + +Version 6: +Made .txt extension for files default to specified default format, rather than bread. Bug noted by James Han. +TrimRead.testOptimal() mode added, and made default when quality trimming is performed; old mode can be used with 'otf=f' flag. + +Version 5: +Found and fixed some bugs with mink0). + + +Version 3: +Added "VERSION" field and set it to 3. +Changed default otm to true (output trimmed reads shorter than minlength). +Added comments and reorganized code. +Added "maxns" flag; enables discarding of reads with more Ns (or any non-ACGT symbol) than the limit. +Added mode switch for discarding paired reads. You can send them if BOTH are bad or if EITHER is bad (default is either). +Added support for discarding reads that are bad for different reasons; e.g. read1 has low average quality and read2 is too short. + + +Version 2: +Created BBDukF with a custom data structure, "HashForest". This reduces memory consumption by around 40% (~38B/kmer). Indexing speed is similar; processing speed ranges from the same to around 50% slower. So overall it is generally slower but still very fast. Output should be identical. +Created single-linked KmerTable for comparison. Similar overall to Hashforest. +Created HashArray with kmers in long[], counts in int[], and a HashForest victim cache. Achieves 15B/kmer (tested)! Faster than HashForest in running and loading. +Added kmer trimming (rather than throwing away reads). (suggested by James Han, Shoudan Liang) +Added end-trimming using shorter kmers (suggested by James Han). 
+Added multiple parameters and revised shellscript help. +TODO: Consider changing HashArray kmers to int[]. +Added emulation for kmers larger than 31. If you set k>31, a "match" will mean (1+k-31) consecutive matches of length-31 kmers. This mode will automatically set the max skip to 1, which will use more memory for large genomes (human would require around 60G in this mode, which will fail with the default -Xmx parameter). +TODO: Define -Xmn for shellscripts and test speed/memory effects. 32m should be enough. +Fixed bug in BBDuk/BBDukF in which ktrim mode incorrectly assumed maskmiddle=f. Noted by Shoudan Liang and James Han. +Revised trim quality flag. It is now correctly called "trimq" everywhere. +Added support for calling output streams "outm" and "outu" for outmatch and outunmatch. +Disabled "maskmiddle" when kbig>k (or mink0 or edist>0). Should increase accuracy on low-quality reads. This is enabled by default but can be disabled with the 'forbidn' or 'fn' flag. Note that when enabled, a read's kmer 'NNN...NNN' will match to a reference kmer 'AAA...AAA' (and any N in a read can match an A in the ref), which may not be desirable. + + + +Version 1: +Multithreaded table loading; increased speed by up to 5x. +Added Hamming distance support (suggested by James Han). +Added edit distance support (suggested by James Han). +Doubled speed when most reads match reference, and no hitcount histogram is needed, by adding an early exit to test loop. +Now defaults to ByteFile2 which increases fastq input speed when there are at least 3 CPU cores. +Added maxskip (mxs) and minskip (mns) flags to control reference kmer skipping when making index. +TODO: Track consecutive hits to emulate support for kmers>31. diff --git a/docs/readme.txt b/docs/readme.txt new file mode 100755 index 0000000..9eaee0b --- /dev/null +++ b/docs/readme.txt @@ -0,0 +1,629 @@ +BBMap readme by Brian Bushnell +Last updated February 18, 2014. 
+Please contact me at bbushnell@lbl.gov if you have any questions or encounter any errors. +BBMap is free to use for noncommercial purposes, and investigators are free to publish results derived from the program, as long as the source code is not published or modified. + +This is the official release of BBMAP, version 31.x + + +NOTE: +Don't recompile unless you run into version problems (such as trying to run with java 1.6). But if you must, then - +To recompile, run this: +javac -J-Xmx128m align2/*.java jgi/*.java driver/*.java fileIO/*.java dna/*.java org/apache/tools/bzip2/*.java +(*** NOTE - due to email file size limits, I may have omitted the .java files if you received this via email, in which case you can't recompile. ***) + + +Basic Syntax: + +(using shellscript, on Genepool, which autodetects RAM to set -Xmx parameter) +To index: +bbmap.sh build=1 ref= +To map: +bbmap.sh build=1 in= out= + +(without shellscript) +To index: +java -ea -Xmx31g -cp align2.BBMap build=1 ref= +To map: +java -ea -Xmx31g -cp align2.BBMap build=1 in= out= + +...where "" should indicate the path to the directory containing all the source code directories; e.g. "/global/projectb/sandbox/gaag/bbtools/current" + +Please note, the reference is only needed for building the index the first time; subsequently, just specify the build number which corresponds to that reference. +So for example the first time you map to e.coli you might specify "ref=ecoli_reference.fa build=3"; after that, just specify "build=3". +The index files would then be stored in ./ref/genome/3/ and ./ref/index/3/ +Also, the -Xmx parameter should specify approximately 85% of the physical memory of the target machine; so, 21G for a 24GB node. The process needs approximately 8 bytes per reference base (plus a several hundred MB overhead). + + +Advanced Syntax: + + +Indexing Parameters (required when building the index): +path=<.> Base directory to store index files. Default is the local directory. 
The index will always be placed in a subdirectory "ref". +ref= Use this file to build the index. Needs to be specified only once; subsequently, the build number should be used. +build=<1> Write the index to this location (build=1 would be stored in /ref/genome/1/ and /ref/index/1/). Can be any integer. This parameter defaults to 1, but using additional numbers allows multiple references to be indexed in the same directory. +k=<13> Use length 13 kmers for indexing. Suggested values are 9-15, with lower typically being slower and more accurate. 13 is usually optimal. 14 is better for RNA-SEQ and very large references >4GB; 12 is better for PacBio and cross-species mapping. +midpad=<300> Put this many "N" in between scaffolds when making the index. 300 is fine for metagenomes with millions of contigs; for a finished genome like human with 25 scaffolds, this should be set to 100000+ to prevent cross-scaffold mapping. +startpad=<8000> Put this many "N" at the beginning of a "chrom" file when making index. It's best if this is longer than your longest expected read. +stoppad=<8000> Put this many "N" at the end of a "chrom" file when making index. It's best if this is longer than your longest expected read. +minscaf=<1> Do not include scaffolds shorter than this when generating index. Useful for assemblies with millions of fairly worthless unscaffolded contigs under 100bp. There's no reason to make this shorter than the kmer length. + + +Input Parameters: +path=<.> Base directory to read index files. +build=<1> Use the index at this location (same as when indexing). +in= Use this as the input file for reads. Also accepts fasta. "in=sequential length=200" will break a genome into 200bp pieces and map them to itself. "in=stdin" will accept piped input. The format of piped input can be specified with e.g. "in=stdin.fq.gz" or "in=stdin.fa"; default is uncompressed fastq. 
+in2= Run mapping paired, with reads2 in the file "reads2.fq" + NOTE: As a shorthand, "in=reads#.fq" is equivalent to "in=reads1.fq in2=reads2.fq" +interleaved= Or "int". Set to "true" to run mapping paired, forcing the reads to be considered interleaved from a single input file. By default the reader will try to determine whether a file is interleaved based on the read names; so if you don't want this, set interleaved=false. +qin= Set to 33 or 64 to specify input quality value ASCII offset. +fastareadlen=<500> If fasta is used for input, breaks the fasta file up into reads of about this length. Useful if you want to map one reference against another, since BBMap currently has internal buffers limited to 500bp. I can change this easily if desired. +fastaminread=<1> Ignore fasta reads shorter than this. Useful if, say, you set fastareadlen=500, and get a length 518 read; this will be broken into a 500bp read and an 18bp read. But it's not usually worth mapping the 18bp read, which will often be ambiguous. +fakequality=<-1> Set to a positive number 1-50 to generate fake quality strings for fasta input reads. Less than one turns this function off. +blacklist= Set a list of comma-delimited fasta files. Any read mapped to a scaffold name in these files will be considered "blacklisted" and can be handled differently by using the "outm", "outb", and "outputblacklisted" flags. The blacklist fasta files should also be merged with other fasta files to make a single combined fasta file; this combined file should be specified with the "ref=" flag when indexing. +touppercase= Set true to convert lowercase read bases to upper case. This is required if any reads have lowercase letters (which real reads should never have). + + +Sampling Parameters: +reads=<-1> Process at most N reads, then stop. Useful for benchmarking. A negative number will use all reads. +samplerate=<1.0> Set to a fraction of 1 if you want to randomly sample reads. 
For example, samplerate=0.25 would randomly use a quarter of the reads and ignore the rest. Useful for huge datasets where all you want to know is the % mapped. +sampleseed=<1> Set to the RNG seed for random sampling. If this is set to a negative number, a random seed is used; for positive numbers, the number itself is the seed. Since the default is 1, this is deterministic unless you explicitly change it to a negative number. +idmodulo=<1> Set to a higher number if you want to map only every Nth read (for sampling huge datasets). + + +Mapping Parameters: +fast= The fast flag is a macro. It will set many other parameters so that BBMap will run much faster, at slightly reduced sensitivity for most applications. Not recommended for RNAseq, cross-species alignment, or other situations where long deletions or low identity matches are expected. +minratio=<0.56> Alignment sensitivity as a fraction of a read's max possible mapping score. Lower is slower and more sensitive but gives more false positives. Ranges from 0 (very bad alignment) to 1 (perfect alignment only). Default varies between BBMap versions. +minidentity=<> Or "minid". Use this flag to set minratio more easily. If you set minid=0.9, for example, minratio will be set to a value that will be approximately equivalent to 90% identity alignments. +minapproxhits=<1> Controls minimum number of seed hits to examine a site. Higher is less accurate but faster (on large genomes). 2 is maybe 2.5x as fast and 3 is maybe 5x as fast on a genome of gigabases. Does not speed up genomes under 100MB or so very much. +padding=<4> Sets extra padding for slow-aligning. Higher numbers are more accurate for indels near the tips of reads, but slower. +tipsearch=<100> Controls how far to look for possible deletions near tips of reads by brute force. tipsearch=0 disables this function. Higher is more accurate. +maxindel=<16000> Sets the maximum size of indels allowed during the quick mapping phase. 
Set higher (~100,000) for RNA-SEQ and lower (~20) for large assemblies with mostly very short contigs. Lower is faster. +strictmaxindel= Set to true to disallow mappings with indels longer than maxindel. Alternately, for an integer X, 'strictmaxindel=X' is equivalent to the pair of flags 'strictmaxindel=t maxindel=X'. +pairlen=<32000> Maximum distance between mates allowed for pairing. +requirecorrectstrand= Or "rcs". Requires correct strand orientation when pairing reads. Please set this to false for long mate pair libraries! +samestrandpairs= Or "ssp". Defines correct strand orientation when pairing reads. Default is false, meaning opposite strands, as in Illumina fragment libraries. "ssp=true" mode is not fully tested. +killbadpairs= Or "kbp". When true, if a read pair is mapped with an inappropriate insert size or orientation, the read with the lower mapping quality is marked unmapped. +rcompmate= ***TODO*** Set to true if you wish the mate of paired reads to be reverse-complemented prior to mapping (to allow better pairing of same-strand pair libraries). +kfilter=<-1> If set to a positive number X, all potential mapping locations that do not have X contiguous perfect matches with the read will be ignored. So, reads that map with "kfilter=51" are assured to have at least 51 contiguous bases that match the reference. Useful for mapping to assemblies generated by a De Bruijn graph assembly that used a kmer length of X, so that you know which reads were actually used in the assembly. +threads= Or "t". Set number of threads. Default is # of logical cores. The total number of active threads will be higher than this, because input and output are in separate threads. +perfectmode= Only accept perfect mappings. Everything goes much faster. +semiperfectmode= Only accept perfect or "semiperfect" mappings. Semiperfect means there are no mismatches of defined bases, but up to half of the reference is 'N' (to allow mapping to the edge of a contig). 
+rescue= Controls whether paired reads may be rescued by searching near the mapping location of a mate. Increases accuracy, with usually a minor speed penalty. +expectedsites=<1> For BBMapPacBioSkimmer only, sets the expected number of correct mapping sites in the target reference. Useful if you are mapping reads to other reads with some known coverage depth. +msa=<> Advanced option, not recommended. Set classname of MSA to use. +bandwidth=0 Or "bw". When above zero, restricts alignment band to this width. Runs faster, but with reduced accuracy for reads with many or long indels. +bandwidthratio=0 Or "bwr". When above zero, restricts alignment band to this fraction of a read's length. Runs faster, but with reduced accuracy for reads with many or long indels. +usequality= Or "uq". Set to false to ignore quality values when mapping. This will allow very low quality reads to be attempted to be mapped rather than discarded. +keepbadkeys= Or "kbk". With kbk=false (default), read keys (kmers) have their probability of being incorrect evaluated from quality scores, and keys with a 94%+ chance of being wrong are discarded. This increases both speed and accuracy. + + +Output Parameters: +out= Write output to this file. If out=null, output is suppressed. If you want to output paired reads to paired files, use a "#" symbol, like out=mapped#.sam. Then reads1 will go to mapped1.sam and reads2 will go to mapped2.sam. (NOTE: split output currently disabled for .sam format, but allowed for native .txt format). To print to standard out, use "out=stdout" +outm=<> Write only mapped reads to this file (excluding blacklisted reads, if any). +outu=<> Write only unmapped reads to this file. +outb=<> Write only blacklisted reads to this file. If a pair has one end mapped to a non-blacklisted scaffold, it will NOT go to this file. (see: blacklist) +out2=<> If you set out2, outu2, outm2, or outb2, the second read in each pair will go to this file. 
Not currently allowed for SAM format, but OK for others (such as fasta, fastq, bread). +overwrite= Or "ow". Overwrite output file if it exists, instead of aborting. +ambiguous= Or "ambig". Sets how to handle ambiguous reads. "first" or "best" uses the first encountered best site (fastest). "all" returns all best sites. "random" selects a random site from all of the best sites (does not yet work with paired-ends). "toss" discards all sites and considers the read unmapped (same as discardambiguous=true). Note that for all options (aside from toss) ambiguous reads in SAM format will have the extra field "XT:A:R" while unambiguous reads will have "XT:A:U". +ambiguous2= Or "ambig2". Only for splitter mode. Ambiguous2 strictly refers to any read that maps to more than one reference set, regardless of whether it has multiple mappings within a reference set. This may be set to "best" (aka "first"), in which case the read will be written only to the first reference to which it has a best mapping; "all", in which case a read will be written to outputs for all references to which it maps; "toss", in which case it will be considered unmapped; or "split", in which case it will be written to a special output file with the prefix "AMBIGUOUS_" (one per reference). +outputunmapped= Outputs unmapped reads to primary output stream (otherwise they are dropped). +outputblacklisted= Outputs blacklisted reads to primary output stream (otherwise they are dropped). +cigar= Generate cigar strings (for bread format, this means match strings). cigar=false is faster. "cigar=" is synonymous with "match=". This must be enabled if match/insertion/deletion/substitution statistics are desired, but the program will run faster with cigar strings disabled. +keepnames= Retain original names of paired reads, rather than ensuring both reads have the same name when written in sam format by renaming read2 to the same as read1. If this is set to true then the output may not be sam compliant. 
+mdtag= Generate MD tags for SAM files. Requires that cigar=true. I do not recommend generating MD tags for RNASEQ or other data where long deletions are expected because they will be incredibly long. +xstag= Generate XS (strand) tags for Cufflinks. This should be used with a stranded RNA-seq protocol. +xmtag= Generate XM tag. Indicates number of best alignments. +intronlen=<999999999> Set to a lower number like 10 to change 'D' to 'N' in cigar strings for deletions of at least that length. This is used by Cufflinks; 'N' implies an intron while 'D' implies a deletion, but they are otherwise identical. +stoptag= Allows generation of custom SAM tag YS:i: +idtag= Allows generation of custom SAM tag YI:f: +ordered= Set to true if you want reads to be output in the same order they were input. This takes more memory, and can be slower, due to buffering in multithreaded execution. Not needed for singlethreaded execution. +ziplevel=<2> Sets output compression level, from 1 (fast) to 9 (slow). I/O is multithreaded, and thus faster when writing paired reads to two files rather than one interleaved file. +nodisk= "true" will not write the index to disk, and may load slightly faster. Prevents collisions between multiple bbmap instances writing indexes to the same location at the same time. +usegzip= If gzip is installed, output file compression is done with a gzip subprocess instead of with Java's native deflate method. Can be faster when set to true. The output file must end in a compressed file extension for this to have effect. *Temporarily disabled due to bug in UGE. +usegunzip= If gzip is installed, input file decompression is done with a gzip subprocess instead of with Java's native inflate method. Can be faster when set to true. *Temporarily disabled due to bug in UGE. +samversion=<1.3> SAM specification version. Set to 1.3 for cigar strings with 'M' or 1.4 for cigar strings with '=' and 'X'. 
Default is currently 1.3 because samtools 0.1.18 and earlier is incompatible with sam format version 1.4. +bamscript= (bs for short) Writes a shell script to the specified file with the command line to translate the sam output of BBMap into a sorted bam file, assuming you have samtools in your path. +maxsites=<5> Sets maximum alignments to print per read, if secondary alignments are allowed. Currently secondary alignments may lack cigar strings. +secondary= Print secondary alignments. +quickmatch= Generate cigar strings during the initial alignment (before the best site is known). Currently, this must be enabled to generate cigar strings for secondary alignments. It increases overall speed but may in some very rare cases yield inferior alignments due to less padding. +local= Output local alignments instead of global alignments. The mapping will still be based on the best global alignment, but the mapping score, cigar string, and mapping coordinate will reflect a local alignment (using the same affine matrix as the global alignment). +sortscaffolds= Sort scaffolds alphabetically in SAM headers to allow easier comparisons with Tophat (in cuffdiff, etc). Default is in same order as source fasta. +trimreaddescriptions= (trd) Truncate read names at the first whitespace, assuming that the remainder is a comment or description. + + +Statistics Parameters: +showprogress= Set to true to print out a '.' once per million reads processed. You can also change the interval with e.g. showprogress=20000. +qhist= Output a per-base average quality histogram to the specified file. +mhist= Output a per-base match histogram to the specified file. Requires cigar strings to be enabled. The columns give fraction of bases at each position having each match string operation: match, substitution, deletion, insertion, N, or other. +ihist= Output a per-read-pair insert size histogram to the specified file. +scafstats= Track mapping statistics per scaffold, and output to the specified file. 
+refstats= For BBSplitter, enable or disable tracking of read mapping statistics on a per-reference-set basis, and output to the specified file. +verbosestats=<0> From 0-3; higher numbers will print more information about internal program counters. + + +Trimming Parameters: +qtrim= Options are false, left, right, or both. Allows quality-trimming of read ends before mapping. + false: Disable trimming. + left (l): Trim left (leading) end only. + right (r): Trim right (trailing) end only. This is the end with lower quality on many platforms. + both (lr): Trim both ends. +trimq=<5> Set the quality cutoff. Bases will be trimmed until there are 2 consecutive bases with quality GREATER than this value; default is 5. If the read is from fasta and has no quality scores, Ns will be trimmed instead, as long as this is set to at least 1. +untrim= Untrim the read after mapping, restoring the trimmed bases. The mapping position will be corrected (if necessary) and the restored bases will be classified as soft-clipped in the cigar string. + +Java Parameters: +-Xmx If running from the shellscript, include it with the rest of the arguments and it will be passed to Java to set memory usage, overriding the shellscript's automatic memory detection. -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max allowed is typically 85% of physical memory. + + + +Splitting Parameters: +The splitter is invoked by calling bbsplit.sh (or align2.BBSplitter) instead of bbmap.sh, for the indexing phase. It allows combining multiple references and outputting reads to different files depending on which one they mapped to best. The order in which references are specified is important in cases of ambiguous mappings; when a read has 2 identically-scoring mapping locations from different references, it will be mapped to the first reference. +All parameters are the same as BBMap with the exception of the ones listed below. You can still use "outu=" to capture unmapped reads. 
+ref_= Defines a named set of organisms with a single fasta file or list. For example, ref_a=foo.fa,bar.fa defines the references for named set "a"; any read that maps to foo.fasta or bar.fasta will be considered a member of set a. +out_= Sets the output file name for reads mapping to set . out_a=stuff.sam would capture all the reads mapping to ref_a. +basename= This shorthand for mass-specifying all output files, where the % symbol is a wildcard for the set name. For example, "ref_a=a.fa ref_b=b.fa basename=mapped_%.sam" would expand to "ref_a=a.fa ref_b=b.fa out_a=mapped_a.sam out_b=mapped_b.sam" +ref= When run through the splitter, this is shorthand for writing a bunch of ref_ entries. "ref=a.fa,b.fa" would expand to "ref_a=a.fa ref_b=b.fa". + + +Formats and Extensions +.gz,.gzip,.zip,.bz2 These file extensions are allowed on input and output files and will force reading/writing compressed data. +.fa,.fasta,.txt,.fq,.fastq These file extensions are allowed on input and output files. Having one is REQUIRED. So, reads.fq and reads.fq.zip are valid, but reads.zip is NOT valid. Note that outputting in fasta or fastq will not retain mapping locations. +.sam This is only allowed on output files. +.bam This is allowed on output files if samtools is installed. Beware of memory usage; samtools will run in a subprocess, and it can consume over 1kb per scaffold of the reference genome. + + +Different versions: +BBMap Fastest version. Finds single best mapping location. +BBMapAcc Slower and more accurate. Finds single best mapping location. May currently be broken. +BBMapPacBio Optimized for PacBio's error profile (more indels, fewer substitutions). Finds single best mapping location. PacBio reads should be in fasta format. +BBMapPacBioSkimmer Designed to find ALL mapping locations with alignment score above a certain threshold; also optimized for Pac Bio reads. 
+BBSplit Uses BBMap or BBMapPacBio to map to multiple references simultaneously, and output the reads to the file corresponding to the best-matching reference. Designed to split metagenomes or contaminated datasets prior to assembly. + + + +Notes. +File types are autodetected by parsing the filename. So you can name files, say, out.fq.gz or out.fastq.gz or reads1.fasta.bz2 or data.sam and it will work as long as the extensions are correct. + + +Change Log: + +v31. +TODO: Change pipethreads to redirects (where possible), and hash pipethreads by process, not by filename. +TODO: Improve scoring function by using gembal distribution and/or accounting for read length. +TextStreamWriter was improperly testing for output format 'other'. Noted by Brian Foster. +Fixed bug for read stream 2 in RTextOutputStream3. Found by Brian Foster. +Fixed bug in MateReadsMT creating an unwanted read stream 2. Found by Brian Foster. +TrimRead.testOptimal() mode added, and made default when quality trimming is performed; old mode can be used with 'otf=f' flag. +Fixed a couple cases where output file format was set to "ordered" even though the process was singlethreaded; this had caused an out-of-memory crash noted by Bill A. +Changed shellscripts of MapPacBio classes to remove "interleaved=false" term. +Reduced Shared.READ_BUFFER_LENGTH from 500 to 200 and Shared.READ_BUFFER_MAX_DATA from 1m to 500k, to reduce ram usage of buffers. +Noticed small bug in trimming; somehow a read had a 'T' with quality 0, which triggered assertion error. I disabled the assertion but I'm not sure how it happened. +Fixed bug in which pigz was not used to decompress fasta files. +All program message information now defaults to stderr. + +v30. +Disabled compression/decompression subprocesses when total system threads allowed is less than 3. +Fixed assertion error in calcCorrectness in which SiteScores are not necessarily sorted if AMBIGUOUS_RANDOM=true. Noted by Brian Foster. 
+Fixed bug in toLocalAlignment with respect to considering XY as insertions, not subs. +TODO: XY should be standardized as substitutions. +Added scarf input support. Requested by Alex Copeland. +TODO: Allow sam input with interleaved flag. +TODO: Make pigz a module dependency or script load. +Fixed bug with nodisk mode dropping the name of the first scaffold of every 500MB chunk after the first. Noted by Vasanth Singan. +Overhaul of I/O channel creation. Sequence files are now initialized with a FileFormat object which contains information about the format, permission to overwrite, etc. +Increased limit of number of index threads in Windows in nodisk mode (since disk fragmentation is no longer relevant). +Renamed Read.list to sites; added Read.topSite() and Read.numSites(); replaced many instances of things like "r.sites!=null && !r.sites.isEmpty()" +Refactored to put Read and all read-streaming I/O classes in 'stream' package. +Moved kmer hashing and indexing classes to kmer package. +Moved Variation, subclasses, and related classes to var package. +Moved FastaToChrom and ChromToFasta to dna package. +Moved pacbio error correction classes to pacbio package. +Removed stack, stats, primes, and other packages; prefixed all unused packages with z_. +TODO: Sites failing Data.isSingleScaffold() test should be clipped, not discarded. +RandomReads3 no longer adds /1 and /2 to paired fastq read names by default (can be enabled with 'addpairnum' flag). +Added "inserttag" flag; adds the insert size to sam output. +Fixed insert size histogram anomaly. There was a blip at insert==(read1.length+read2.length) because the algorithm used to calculate insert size was different for reads that overlap and reads that don't overlap. +Skimmer now defaults to cigar=true. +Added maxindel1 and maxindel2 (or maxindelsum) flags. +Removed OUTER_DIST_MULT2 because it caused assertion errors when different from OUTER_DIST_MULT; changed OUTER_DIST_MULT from 15 to 14. 
+Added shellscript for skimmer, bbmapskimmer.sh +TODO: Document above changes to parameters. + + + +v29. +New version since major refactoring. +Added FRACTION_GENOME_TO_EXCLUDE flag (fgte). Setting this lower increases sensitivity at expense of speed. Range is 0-1 and default is around 0.03. +Added setFractionGenometoExclude() to Skimmer index. +LMP librares were not being paired correctly. Now "rcs=f" may be used to ignore orientation when pairing. Noted by Kurt LaButti. +Allocating memory to alignment score matrices caused uncaught out-of-memory error on low-memory machines, resulting in a hang. This is now caught and results in an exit. Noted by Alicia Clum. +GPINT machines are now detected and restricted to 4 threads max. This helps prevent out-of-memory errors with PacBio mode. +Fixed sam output bug in which an unmapped read would get pnext of 0 rather than 1 when its mate mapped off the beginning of a scaffold. Noted by Rob Egan. +Added memory test prior to allocating mapping threads. Thread count will be reduced if there is not enough memory. This is to address the issue noted by James Han, in which the PacBio versions would crash after running out of memory on low-memory nodes. +TODO: Detect and prevent low-memory crashes while loading the index by aborting. +Fixed assertion error caused by strictmaxindel mode (noted by James Han). +Added flag "trd" (trimreaddescriptions) which truncates read names at the first whitespace. +Added "usequality/uq" flag to turn on/off usage of quality information when mapping. Requested by Rob Egan. +Added "keepbadkeys/kbk" flag to prevent discarding of keys due to low quality. Requested by Rob Egan. +Fixed crash with very long reads and very small kmers due to exceeding length of various kmer array buffers. +Avg Initial Sites and etc no longer printed for read 2 data. +TODO: Support for selecting long-mate-pair orientation has been requested by Alex C. 
+Fixed possible bug in read trimming when the entire read was below the quality threshold. +Fixed trim mode bug: "trim=both" was only trimming the right side. "qtrim" is also now an alias for "trim". +Fixed bug in ConcurrentGenericReadInputStream causing an incorrect assertion error for input in paired files and read sampling. Found by Alex Copeland. +TODO: Found some bz2 files that cannot be read. Decompressing them and recompressing them fixes the problem, but it's unclear why other programs can read them. Found by Alex Copeland. +Added insert size histogram: ihist= +Added "machineout" flag for machine-readable output stats. +TODO: reads_B1_100000x150bp_0S_0I_0D_0U_0N_interleaved.fq.gz (ecoli) has 0% rescued for read1 and 0.7% rescued for read 2. After swapping r1 and r2, .664% of r2 is rescued and .001% of r1 is rescued. Why are they not symmetric? +Added 'slow' flag to bbmap for increased accuracy. Still in progress. +Added MultiStateAligner11ts to MSA minIdToMinRatio(). +Changed the way files are tested for permission to write (moved to Tools). +Fixed various places in which version string was parsed as an integer. +Added test for "help" and "version" flags. +Fixed bug in testing for file existence; noted by Bryce Foster. +Fixed issue with scaffold names not being trimmed on whitespace boundaries when 'trd=t'. Noted by Rob Egan. +Added pigz (parallel gzip) support, at suggestion of Rob Egan. +Improved support for subprocesses and pipethreads; they are now automatically killed when not needed, even if the I/O stream is not finished. This allows gunzip/unpigz when a file is being partially read. +Added shellscript test for the hostname 'gpint'; in that case, memory will be capped at 4G per process. +Changed the way cris/ros are shut down. All must now go through ReadWrite.closeStreams() +TODO: Force rtis and tsw to go through that too. +TODO: Add "Job.fname" field. +Made output threads kill processes also. +Modified TrimRead to require minlength parameter. 
+Fixed a bug with gathering statistics in BBMapPacBioSkimmer (found by Matt Scholz). +Fixed a bug in which reads with match string containing X/Y were not eligible to be semiperfect (Found by Brian Foster). +Fixed a bug related to improving the prior fix; I had inverted an == operator (Found by Brian Foster). +Added SiteScore.fixXY(), a fast method to fix reads that go out-of-bounds during alignment. Unfinished; score needs to be altered as a result. +Added "pairsonly" or "po" flag. Enabling it will treat unpaired reads as unmapped, so they will be sent to 'outu' instead of 'outm'. Suggested by James Han and Alex Copeland. +Added shellscript support for java -Xmx flag (Suggested by James Han). +Changed behavior: with 'quickmatch' enabled secondary sites will now get cigar strings (mostly, not all of them). +"fast" flag now enables quickmatch (50% speedup in e.coli with low-identity reads). Very minor effect on accuracy. +Fixed bug with overflowing gref due GREFLIMIT2_CUSHION padding. Found by Alicia Clum. +Fixed bug in which writing the index would use pigz rather than native gzip, allowing reads from scaffolds.txt.gz before the (buffered) writing finished. Rare race condition. Found by Brian Foster. +Fixed stdout.fa.gz writing uncompressed via ReadStreamWriter. +Added "allowSubprocess" flag to all constructors of TextFile and TextStreamWriter, and made TextFile 'tryAllExtensions' flag the last param. +allowSubprocess currently defaults to true for ByteFiles and ReadInput/Output Streams. +TODO: TextFile and TextStreamWriter (and maybe others?) may ignore ReadWrite.killProcess(). +TODO: RTextOutputStream3 - make allowSubprocess a parameter +TODO: Assert that first symbol of reference fasta is '>' to help detect corrupt fastas. +Improved TextStreamWriter, TextFile, and all ReadStream classes usage of ReadWrite's InputStream/OutputStream creation/destruction methods. +All InputStream and OutputStream creation/destruction now has an allowSubprocesses flag. 
+Added verbose output to all ReadWrite methods. +Fixed bug in which realigned SiteScores were not given a new perfect/semiperfect status. Noted by Brian Foster and Will Andreopoulos. + + +v28. +New version because the new I/O system seems to be stable now. +Re-enabled bam input/output (via samtools subprocess). Lowered shellscript memory from 85% to 84% to provide space for samtools. +Added "-l" to "#!/bin/bash" at top. This may make it less likely for the environment to be messed up. Thanks to Alex Boyd for the tip. +Addressed potential bug in start/stop index padding calculation for scaffolds that began or ended with non-ACGT bases. +Made superclass for Index. +Made superclass for BBMap. +Removed around 5000 lines of code as a result of dereplication into superclasses. +Added MultiStateAligner11ts, which uses arrays for affine transform instead of if blocks. Changing insertions gave a ~5% speedup; subs gave an immeasurably small speedup. +Found bug in calculation of insert penalties during mapping. Fixing this bug increases speed but decreases accuracy, so it was modified toward a compromise. + + +v27. +Added command line to sam file header. +Added "msa=" flag. You can specify which msa to use by entering the classname. +Added initial banded mode. Specify "bandwidth=X" or "bandwidthratio=X" to accelerate alignment. +Cleaned up argument parsing a bit. +Improved nodisk mode; now does not use the disk at all for indexing. BBSplitter still uses the disk. +Added "fast" flag, which changes some parameters to make mapping go faster, with slightly lower sensitivity. +Improved error handling; corrupt input files should be more likely to crash with an error message and less likely to hang. Noted by Alex Copeland. +Improved SAM input, particularly coordinates and cigar-string parsing; this should now be correct but requires an indexed reference. Of course this information is irrelevant for mapping so this parsing is turned off by default for bbmap. 
+Increased maximum read speed with ByteFile2, by using 2 threads per file. May be useful in input-speed limited scenarios, as when reading compressed input on a node with many cores. Also accelerates sam input. +TODO: Make ByteFile auto-select a subtype based on compression and number of cores. +TODO: Consider moving THREADS to Shared. +Updated match/cigar flag syntax. +Updated shellscript documentation. +Changed ByteFile2 from array lists to arrays; should reduce overhead. +TODO: Increase speed of sam input. +TODO: Increase speed of output, for all formats. +TODO: Finish ReadStreamWriter.addStringList(), which allows formatting to be done in the host. +In progress: Moving all MapThread fields to abstract class. +MapThread now passes reverse-complemented bases to functions to prevent replication of this array. +Fixed very rare bug when a non-semiperfect site becomes semiperfect after realignment, but subsequently is no longer highest-ranked. +strictmaxindel can now be assigned a number (e.g. strictmaxindel=5). +If a fasta read is broken into pieces, now all pieces will receive the _# suffix in their name. Previously, the first piece was exempt. +TODO: Consider changing SamLine.rname to a String and seq, qual to byte[]. +Changed SamLine.seq, qual to byte[]. Now stored in original read order and only reversed for minus strand during I/O. +Added sortscaffolds flag (requested by Vasanth Singan). +Fixed XS tag bug; in some cases read 2 was getting opposite flag (noted by Vasanth Singan). +Fixed bug when reading sam files without qualities (noted by Brian Foster). +Fixed bug where absent cigar strings were printed as "null" instead of "*" as a result of recent changes to sam I/O (noted by Vasanth Singan). +Found error when a read goes off the beginning of a block. Ref padding seems to be absent, because Ns were replaced by random sequence. Cause is unknown; cannot replicate. +Fixed Block.getHitList(int, int). 
+Changed calcAffineScore() to require base array for information when throwing exceptions. +Changed generated bamscript to unload samtools module before loading samtools/0.1.19. +sam file idflag and stopflag are both now faster, particularly for perfect mappings. But both default to off because they are still slow nonetheless. +Fixed bug in BBIndex in which a site was considered perfect because all bases matched the reference, but some of the bases were N. Canonically, reads with Ns can never be perfect even if the ref has Ns in the same locations. +Fixed above bug again because it was not fully fixed: CHECKSITES was allowing a read to be classified as perfect even if it contained an N. +Increased sam read speed by ~2x; 30MB/s to 66MB/s +Increased sam write speed from ~18MB/s to ~32MB/s on my 4-core computer (during mapping), with mapping at peak 42MB/s with out=null. Standalone (no mapping) sam output seems to run at 51MB/s but it's hard to tell. +Increased fasta write from 118MB/s to 140 MB/s +Increased fastq write from 70MB/s to 100MB/s +Increased fastq read from 120MB/s (I think) to 296MB/s (663 megabytes/sec!) with 2 threads or 166MB/s with 1 thread +Some of these speed increases come from writing byte[] into char[] buffer held in a ThreadLocal, instead of turning them into Strings or appending them byte-by-byte. +All of these speed optimizations caused a few I/O bugs that temporarily affected some users between Oct 1 and Oct 4, 2013. Sorry! +Flipped XS tag from + to - or vice versa. I seem to have misinterpreted the Cufflinks documentation (noted by Vasanth Singan). +Fixed bug in which (as a result of speed optimizations) reads outside scaffold boundaries, in sam 1.3 format, were not getting clipped (Noted by Brian Foster). +Changed default behavior of all shellscripts to run with -Xmx4g if maximum memory cannot be detected (typically, because ulimit=infinity). Was 31. Unfortunately things will break either way. 
+Fixed off-by-1 error in sam TLEN calculation; also simplified it to give sign based on leftmost POS and always give a plus and minus even when POS is equal. +Added sam NH tag (when ambig=all). +Disabled sam XM tag because the bowtie documentation and output do not make any sense. +Changed sam MD and NM tags to account for 'N' symbol in cigar strings. +Made sam SM tag score compatible with mapping score. +Fixed bug in SamLine when cigar=f (null pointer when parsing match string). (Found by Vasanth Singan) +Fixed bug in BBMapThread* when local=true and ambiguous=toss (null pointer to read.list). (Found by Alexander Spunde) +Changed synthetic read naming and parsing (parsecustom flag) to use " /1" and " /2" at the end of paired read names. (Requested by Kurt LaButti) +Increased fastq write to 200MB/s (590 megabytes/s) +Increased fasta write to 212MB/s (624 megabytes/s measured by fastq input) +Increased sam write to 167MB/s (492 megabytes/s measured by fastq input) +Increased bread write to 196MB/s (579 megabytes/s measured by fastq input) +bf2 (multithreaded input) is now enabled by default on systems with >4 cores, or in ReformatReads always. +Fixed RTextOutputStream3.finishedSuccessfully() returning false when output was in 2 files. +Changed output streams to unbuffered. No notable speed increase. +Fixed bug in ByteFile2 in which reads would be recycled when end of file was hit (found by Brian Foster, Bryce Foster, and Kecia Duffy). + + +v26. +Fixed crash from consecutive newlines in ByteFile. +Made SiteScore clonable/copyable. +Removed @RG line from headers. It implies that reads should be annotated with addition fields based on the RG line information. +Changed sam flags (at advice of Joel Martin). Now single-ended reads will never have flags 0x2, 0x40, or 0x80 set. +Added correct insert size average to output stats, in place of old inner distance and mapping length. +Fixed crash when detecting length of SamLines with no cigar string. 
(Found by Shayna Stein) +Added flag "keepnames" which keeps the read names unchanged when writing in sam format. Normally, a trailing "/1", "/2", " 1", or " 2" are stripped off, and if read 2's name differs from read 1's name, read 1's name is used for both. This is to remain spec-compliant with the sam format. However, in some cases (such as grading synthetic reads tagged with the correct mapping location) it is useful to retain the original name of each read. +Added local alignment option, "local". Translates global alignments into a local alignments using the same affine transform (and soft-clips ends). +Changed killbadpairs default to false. Now by default improperly paired reads are allowed. +Merged TranslateColorspaceRead versions into a single class. +Added interleaved input and output for bread format. May be useful for error correction pipeline. +TODO: Mode where reads are mapped to multiple scaffolds, but are mapped at most one time per scaffold. I.e., remove all but top site per scaffold (and forbid self-mapping). +Fixed yet another instance of negative coordinates appearing in an unmapped read, which the new version of samtools can't handle. +Fixed bug in counting ambiguous reads; was improperly including in statistics reads that were ambiguous but had a score lower than minratio. +Fixed rare crash found related to realignment of reads with ambiguous mappings (found by Rob Egan). +Unified many of the differences between the MapThread variants, and added a new self-checking function (checkTopSite) to ensure a Read is self-consistent. +Added some bitflag fetch functions to SamLine and fixed 'pairedOnSameChrom()' which was not handling the '=' symbol. +TODO: Make GENERATE_BASE_SCORES_FROM_QUALITY a parameter, default false in BBMapPacBio and true elsewhere. (I verified this should work fine) +TODO: Make GENERATE_KEY_SCORES_FROM_QUALITY a parameter, default true (probably even in BBMapPacBio). 
(I verified this should work fine) +Updated LongM (merged with LongM from Dedupe). +Fixed bug in SamLine in which clipped leading indels were not considered, causing potential negative coordinates. (Found by Brian Foster) +TODO: Match strings like NNNNNNDDDDDNNNNNmmmmmmmmmmmmmmmmm...mmmmmmm should never exist in the first place. Why did that happen? +Added "strictmaxindel" flag (default: strictmaxindel=f). Attempts to kill mappings in which there is a single indel event longer than the "maxindel" setting. Requested by James Han. +TODO: Ensure strictmaxindel works in all situations, including rescued paired ends and recursively regenerated padded match strings. +TODO: Redo msa to be strictly subtractive. Start with score=100*bases, then use e.g. 0 for match, -1 for del, -370 for sub, -100 for N, etc. No need for negative values. +Changed TIMEBITS in MultiStateAligner9PacBio from 10 to 9 to address a score underflow assertion error found by Alicia Clum. The underflow occurred around length 5240; new limit should be around 10480. +TODO: Alicia found an error of exceeding gref bounds. +Fixed race condition in TextStreamWriter. +Improved functionality of splitter. Now you can index once and map subsequently using "basename" without specifying "ref=" every single time. +"Reads Used" in output now displays the number of reads used. Before, for paired reads, it would display the number of pairs (half as many). +Added bases used to reads used at Kurt's request. +Improved bam script generation. Now correctly sets samtools memory based on detected memory, and warns user that crashes may be memory-related. +Fixed an obsolete assertion in SamLine found by Alicia. +Added XS tag option ("xstag=t") for Cufflinks; the need for this was noted by Vasanth Singan. +Added 'N' cigar operation for deletions longer than X bases (intronlen=X). Also needed by Cufflinks. +Secondary alignments now get "*" for bases and qualities, as recommended by the SAM spec. 
This saves space, but may cause problems when converting sam into other formats. +Fixed bug that caused interleaved=true to override in2. Now if you set in and in2, interleaved input will be disabled. (noted by Andrew Tritt). +Fixed some low-level bugs in I/O streams. When shutting down streams I was waiting until !Thread.isAlive() rather than Thread.getState()==Thread.State.TERMINATED, which caused a race condition (since a thread is not alive before it starts execution). +Added debugging file with random name written to /ref/ directory. This should help debugging if somewhere deep in a pipeline multiple processes try to index at the same location simultaneously. Suggested by Bryce Foster. +Fixed log file generation causing a crash if the /ref/ directory did not exist, found by Vasanth Singan. Also logging is now disabled by default but enabled if you set "log=t". +Input sequence data will now translate '.' and '-' to 'N' automatically, as some fasta databases appear to use '.' instead of 'N'. (Thanks to Kecia Duffy and James Han) +Added capability to convert lowercase reads to upper case (crash on lowercase noted by Vasanth Singan). + + +v25. +Increased BBMapPacBio max read length to 6000, and BBMapPacBioSkimmer to 4000. +Fixed bugs in padding calculations during match string generation. +Improved some assertion error output. +Added flag "maxsites" for max alignments to print. +Added match field to sitescore. +Made untrim() affect sitescores as well. +Decreased read array buffer from 500 to 20 in MapPacBio. +TODO: sam secondary alignments. (PARTIALLY DONE) +TODO: sam secondary match strings. +TODO: stitcher for super long reads. +TODO: msa superclass, "MSA". +TODO: wrapper for split reference mapping and merging. +Improved fillAndScoreLimited to return additional information. +Added flag "secondary" to print secondary alignments. Does not yet ensure that all secondary alignments will get cigar strings, but most do. 
+Added flag "quickmatch" to generate match strings for SiteScores during slow align. Speeds up the overall process somewhat (at least on my PC; have not tested it on cluster). +Improved pruning during slow align by dynamically increasing msa limit. +Addressed a bug in which reads sometimes have additional sites aligned to the same coordinates as the primary site. The bug can still occur (typically during match generation or as a result of padding), but is detected and corrected during runtime. +Tracked down and fixed a bug relating to negative coordinates in sam output for unmapped reads paired with reads mapped off the beginning of a scaffold, with help from Rob Egan. +Disabled frowny-face warning message which had caused some confusion. +TODO: Add verification of match strings on site scores. +Made superclass for MSA. This will allow merging of redundant code over the various BBMap versions. +Fixed a crash-hang out-of-memory error caused by initialization order. Now crashes cleanly and terminates. Found by James Han. +Fixed bug in output related to detecting cigar string length under sam 1.4 specification (found by Rob Egan). +Added flag "killbadpairs"/"kbp". +Added flag "fakequality" for fasta. +Permanently fixed bugs related to unexpected short match strings caused by error messages. +Increased speed of dynamic program phase when dealing with lots of Ns. +TODO: In-line generation of short match string when printing a read, rather than mutating the read. (mutation is now temporary) +Added flag, "stoptag". Allows generation of SAM tag YS:i: +Added flag, "idtag". Allows generation of SAM tag YI:f: + +v24. +Fixed bug that slightly reduced accuracy for reads with exactly 1 mismatch. They were always skipping slow align, sometimes preventing ambiguous reads from being detected. +Increased speed of MakeRocCurve (for automatic grading of sam files from synthetic reads). Had used 1 pass per quality level; now it uses only 1 pass total. 
+Increased accuracy of processing reads and contigs with ambiguous bases (in mapping phase). +Adjusted clearzones to use gradient functions and asymptotes rather than step functions. Reduces false positives and increases true positives, especially near the old step cutoffs. +Fixed trimSitesBelowCutoff assertion that failed for paired reads. +Added single scaffold toggle to RandomReads. Default 'singlescaffold=true'; forces reads to come from a single scaffold). This can cause non-termination if no scaffolds are long enough, and may bias against shorter scaffolds. +Added min scaffold overlap to RandomReads. Default 'overlap=1'; forces reads to overlap a scaffold at least this much. This can cause non-termination if no scaffolds are long enough, and may bias against shorter scaffolds. +Fixed setPerfect(). Previously, reads with 'N' overlapping 'N' in the reference could be considered perfect matches, but no reads containing 'N' should ever be considered a perfect mapping to anything. +Formalized definition of semiperfect to require read having no ambiguous bases, and fixed "isSemiperfect()" function accordingly. +Shortened and clarified executable names. +Fixed soft-clipped read start position calculation (mainly relevant to grading). +Prevented reads from being double-counted when grading, when a program gives multiple primary alignments for a read. +Fixed a bug in splitter initialization. +Added "ambiguous2". Reads that map to multiple references can now be written to distinct files (prefixed by "AMBIGUOUS_") or thrown away, independantly of whether they are ambiguous in the normal sense (which includes ambiguous within a single reference). +Added statistics tracking per reference and per scaffold. Enable with "scafstats=" or "refstats=". +"ambiguous" may now be shortened to "ambig" on the command line. +"true" and "false" may now be shortened to t, 1, or f, 0. If omitted entirely, "true" is assumed; e.g. "overwrite" is equivalent to "overwrite=true". 
+Added stderr as a valid output destination specified from the command line. +BBSplitter now has a flag, "mapmode"; can be set to normal, accurate, pacbio, or pacbioskimmer. +Fixed issue where stuff was being written to stdout instead of stderr and ended up in SAM files (found by Brian Foster). +TODO: Add secondary alignments. +TODO: Unlimited length reads. +TODO: Protein mapping. +TODO: Soft clipping in both bbmap and GradeSamFile. Should universally adjust coords by soft-clip amount when reported in SAM format. +Fixed assertion error concerning reads containing Ns marked as perfect, when aligned to reference Ns (found by Rob Egan). +Fixed potential null-pointer error in "showprogress" flag. + +v23. +Created BBSplitter wrapper for BBMap that allows merging any number of references together and splitting the output into different streams. +Added support for ambiguous=random with paired reads (before it was limited to unpaired). +TODO: Iterative anchored alignment for very long reads, with a full master gref. +TODO: untrim=c/m/s/n/r +TODO: mode=vfast/veryfast: k=14 minratio=0.8 minhits=2 maxindel=20 +TODO: mode=fast: k=13 minratio=0.7 minhits=2 maxindel=200 +TODO: mode=normal: k=13 minratio=0.56 minhits=1 maxindel=16000 +TODO: mode=slow/accurate: BBMapi +TODO: mode=pacbio: BBMapPacBio k=12 +TODO: mode=perfect +TODO: mode=semiperfect +TODO: mode=rnaseq +TODO: Put untrim in caclStatistics section +TODO: Test with MEGAN. +Finished new random read generator. Much faster, and solves coordinate problem with multiple indels. +Improved error message on read parsing failures. +TODO: Insert size histogram +TODO: "outp=", output for reads that mapped paired +TODO: "outs=", output for reads that mapped singly +Corrected assertion in "isSingleScaffold()" +Fixed a rare bug preventing recursive realignment when ambiguous=random (found by Brian Foster) +Added samversion/samv flag. Set to 1.3 for cigar strings with 'M' or 1.4 for cigar strings with '=' and 'X'. Default is 1.3. 
+Added enforcement of thread limit when indexing. +Added internal autodetection of gpint machines. Set default threadcount for gpints at 2. +Improved ability to map with maxindel=0 +Added XM:i: optional SAM flag because some programs seem to demand it. Like all extra flags, this is omitted if the read is not mapped. Otherwise, it is set to 1 for unambiguously mapped reads, and 2 or more for ambiguously mapped reads. The number can range as high as the total number of equal-scoring sites, but this is not guaranteed unless the "ambiguous=random" flag is used. +Fixed bug in autodetection of paired ends, found by Rob Egan. + + + +v22. +Added match histogram support. +Added quality histogram support. +Added interleaving support to random read generator. +Added ability to disable pair rescue ("rescue=false" flag), which can speed things up in some cases. +Disabled dynamic-programming slow alignment phase when no indels are allowed. +Accelerated rescue in perfect and semiperfect mode. +Vastly accelerated paired mapping against references with a very low expected mapping rate. +Fixed crash in rescue caused by reads without quality strings (e.g. paired fasta files). (found by Brian Foster) + + +v21. +If reference specified is same as already-processed reference, the old index will not be deleted. +Added BBMap memory usage estimator to assembly statistics tool: java -Xmx120m jgi.AssemblyStats2 k= +Added support for multiple output read streams: all reads (set by out=), mapped reads (set by outm=), and unmapped reads (set by outu=). They can be in different formats and any combination can be used at once. You can set pair output to secondary files with out2, outm2, and outu2. +Changed definition of "out=". You can no longer specify split output streams implicitly by using a "#" in the filename; it must be explicit. the "#" wildcard is still allowed for input streams. +Fixed a bug with sam input not working. 
(found by Brian Foster) +Added additional interleaved autodetection pattern for reads named "xxxxx 1:xxxx" and "xxxxx 2:xxxx" +Fixed a bug with soft-clipped deletions causing an incorrect cigar length. (found by Brian Foster) +Fixed a bug with parsing of negative numbers in byte arrays. +TODO: Found a new situation in which poly-N reads preferentially map to poly-N reference (probably tip search?) +Fixed a bug in which paired reads occasionally are incorrectly considered non-semiperfect. (found by Brian Foster) +Added more assertion tests for perfection/imperfection status. +Added blacklist support. This allows selection of output stream based on the name of the scaffold to which a read maps. +Created Blacklist class, allowing creation of blacklists and whitelists. +Added outb (aka outblacklist) and outb2 streams, to output reads that mapped to blacklisted scaffolds. +Added flag "outputblacklisted=" which controls whether blacklisted reads are printed to the "out=" stream. Default is true. +Added support for streaming references. e.g. "cat ref1.fa ref2.fa | java BBMap ref=stdin.fa" +Updated and reorganized this readme. +Removed a dependency on Java 7 libraries (so that the code runs in Java 6). +Added per-read error rate histogram. Enable with qhist= +TODO: generate standard deviation. +Added per-base-position M/S/D/I/N rate tracking. Enable with mhist= +Added quality trimming. Reads may be trimmed prior to mapping, and optionally untrimmed after mapping, so that no data is lost. Trimmed bases are reported as soft-clipped in this case. +Trimming will extend until at least 2 consecutive bases have a quality greater than trimq (default 5). +Added flags: trim=, trimq=<5>, untrim= +TODO: Correct insert size in realtime for trim length. +TODO: Consider adding a TrimRead pointer to reads, rather than using obj. +TODO: Consider extending match string as 'M' rather than 'C' as long as clipped bases match.
+Found and made safe some instances where reads could be trimmed to less than kmer length. +Found and fixed instance where rescue was attempted for length-zero reads. +Fixed an instance where perfect reads were not marked perfect (while making match string). + + +v20.1 (not differentiated from v20 since the differences are minor) +Fixed a minor, longstanding bug that prevented minus-strand alignment of reads that only had a single valid key (due to low complexity or low quality). +Increased accuracy of perfectmode and semiperfectmode, by allowing mapping of reads with only one valid key, without loss of speed. They still don't quite match normal mode since they use fewer keys. +Added detection of and error messages for reads that are too long to map. +Improved shell script usage information. + + +v20. +Made all MapThreads subclasses of MapThread, eliminating duplicate code. +Any exception thrown by a MapThread will now be detected, allowing the process to complete normally without hanging. +Exceptions (e.g. OutOfMemory) when loading reference genome are now detected, typically causing a crash exit instead of a hang. +Exceptions (e.g. OutOfMemory) when generating index are now detected, causing a crash exit instead of a hang. +Exceptions in output stream (RTextOutputStream) subthreads are now detected, throwing an exception. +Added support for soft clipping. All reads that go off the ends of scaffolds will be soft-clipped when output to SAM format. (The necessity of this was noted by Rob Egan, as negative scaffold indices can cause software such as samtools to crash) + + +v19. +Added support for leading FASTA comments (denoted by semicolon). +Fixed potential problem in FASTA read input stream with very long reads. +Recognizes additional FASTA file extensions: .seq, .fna, .ffn, .frn, .fsa, .fas +Disabled gzip subprocesses to circumvent a bug in UGE: Forking can cause a program to be terminated. Gzip is still supported.
+Slightly reduced memory allocation in shellscript. +Ported "Analyze Index" improvement over to all versions (except v5). +Added flags: fastaminread, showprogress +Fixed problem noted by Rob Egan in which paired-end reads containing mostly 'N' could be rescued by aligning to the poly-N section off the end of a contig. +Fixed: Synthetic read headers were being improperly parsed by new FASTQ input stream. +Made a new, faster, more correct version of "isSemiperfect". +Added "semiperfect" test for reads changed during findDeletions. +Identified locations in "scoreNoIndels" where call 'N' == ref 'N' is considered a match. Does not seem to cause problems. +Noted that SAM flag 0x40 and 0x80 definitions differ from my usage. + + +v18. +Fastq read input speed doubled. +Fasta read input speed increased 50%. +Increased speed of "Analyze Index" by a factor of 3+ (just for BBMap so far; have not yet ported change over to other versions). +Fixed an array out-of-bounds bug found by Alicia Clum. +Added bam output option (relies on Samtools being installed). +Allows gzip subprocesses, which can sometimes improve gzipping and gunzipping speed over Java's implementation (will be used automatically if gzip is installed). This can be disabled with the flags "usegzip=false" and "usegunzip=false". +Started a 32-bit mode which allows 4GB per block instead of 2GB, for a slight memory savings (not finished yet). +Added nondeterministic random read sampling option. +Added flags: minscaf, startpad, stoppad, samplerate, sampleseed, kfilter, usegzip, usegunzip + + +v17. +Changed the way error rate statistics are displayed. All now use match string length as denominator. +Identified error in random read generator regarding multiple insertions. It will be hard to fix but does not matter much. +Found out-of-bounds error when filling gref. Fixed (but maybe not everywhere...). +Added random mapping for ambiguous reads. +Changed index from 2d array to single array (saves a lot of memory).
+Increased speed by ~10%. +Improved index generation and loading speed (typically more than doubled). +Changed chrom format to gzipped. +Added "nodisk" flag; index is not written to disk. +Fixed a rare out-of-bounds error. +Increased speed of perfect read mapping. +Fixed rare human PAR bug. + + +v16. Changes since last version: +Supports unlimited number of unscaffolded contigs. +Supports piping in and out. Set "out=stdout.sam" and "in=stdin.fq" to pipe in a fastq file and pipe out a sam file (other extensions are also supported). +Ambiguously named files (without proper extensions) will be autodetected as fasta or fastq (though I suggest not relying on that). +Added additional flags (described in parameters section): minapproxhits, padding, tipsearch, maxindel. +minapproxhits has a huge impact on speed. Going from 1 to 2 will typically at least double the speed (on a large genome) at some cost to accuracy. + + +v15. Changes since last version: +Contig names are retained for output. +SAM header @SQ tags fixed. +SAM header @PG tag added. +An out-of-bounds error was fixed. +An error related to short match strings was found and possibly handled. +All versions now give full statistics related to %matches, %substitutions, %deletions, and %insertions (unless match string generation is disabled). +Increased speed and accuracy for tiny (<20MB) genomes. +Added dynamic detection of scaffold sizes to better partition index, reducing memory in some cases. +Added command-line specification of kmer length. +Added more command line flags and described them in this readme. +Allowed overwriting of existing indices, for ease of use (only when overwrite=true). For efficiency you should still only specify "ref=" the first time you map to a particular reference, and just specify the build number subsequently. 
diff --git a/docs/readme_reformat.txt b/docs/readme_reformat.txt new file mode 100755 index 0000000..f6547a3 --- /dev/null +++ b/docs/readme_reformat.txt @@ -0,0 +1,19 @@ +ReformatReads readme by Brian Bushnell +Last updated February 18, 2014. +Please contact me at bbushnell@lbl.gov if you have any questions or encounter any errors. + +This is currently a stub. + +Change Log: + +Moved processing phase from constructor into a new function. +Added pigz (parallel gzip) support, at suggestion of Rob Egan. +Added ftr/ftl (force trim to a certain base position), at request of Alicia Clum. +Added shellscript support for java -Xmx flag (Suggested by James Han). +Fixed stdout.fa.gz writing uncompressed via ReadStreamWriter. +Added scarf input support. +Major refactoring. +Output quality switch (qout) was being ignored and ASCII33 was always used; this has been fixed. +TrimRead.testOptimal() mode added, and made default when quality trimming is performed; old mode can be used with 'otf=f' flag. +Fixed bug in ByteBuilder for reads with no quality values. Noted by Brian Foster. +All program message information now defaults to stderr. \ No newline at end of file diff --git a/license.txt b/license.txt new file mode 100755 index 0000000..b9653a2 --- /dev/null +++ b/license.txt @@ -0,0 +1,25 @@ +BBTools Copyright (c) 2014, The Regents of the University of California, through Lawrence Berkeley National Laboratory (subject to receipt of any required approvals from the U.S. Dept. of Energy). All rights reserved. + + + +Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: + + + +(1) Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 
+ + + +(2) Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. + + + +(3) Neither the name of the University of California, Lawrence Berkeley National Laboratory, U.S. Dept. of Energy nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. + + + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + + +You are under no obligation whatsoever to provide any bug fixes, patches, or upgrades to the features, functionality or performance of the source code ("Enhancements") to anyone; however, if you choose to make your Enhancements available either publicly, or directly to Lawrence Berkeley National Laboratory, without imposing a separate written license agreement for such Enhancements, then you hereby grant the following license: a non-exclusive, royalty-free perpetual license to install, use, modify, prepare derivative works, incorporate into other computer software, distribute, and sublicense such enhancements or derivative works thereof, in binary and source code form. 
diff --git a/resources/primes.txt.gz b/resources/primes.txt.gz new file mode 100755 index 0000000..d337921 Binary files /dev/null and b/resources/primes.txt.gz differ diff --git a/sh/bbduk.sh b/sh/bbduk.sh new file mode 100755 index 0000000..b2cea87 --- /dev/null +++ b/sh/bbduk.sh @@ -0,0 +1,138 @@ +#!/bin/bash -l +#bbduk in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 2G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 1>&2 + y=2000 + else + #Known limit: use 42% of (ulimit - 20000 KB), converted to MB; must not overwrite the 2000 MB fallback above. + mult=42; + y=$(( ((x-20000)*mult/100)/1000 )) + fi + + if [ $y -ge 15000 ]; then + y=15000 + elif [ 100 -ge $y ]; then + y=100 + fi + + #echo "y=$y" + z="-Xmx${y}m" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +bbduk() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -ea $z -cp $CP jgi.BBDukF $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "This script is designed for Genepool nodes." + echo "Last modified December 11, 2013" + echo "" + echo "Description: Compares reads to the kmers in a reference dataset, optionally allowing an edit distance." + echo "Splits the reads into two outputs - those that match the reference, and those that don't." + echo "Can also trim (remove) the matching parts of the reads rather than binning the reads." + echo "" + echo "Usage: bbduk.sh in= out= ref=" + echo "" + echo "Input may be stdin or a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a file." + echo "If you pipe via stdin/stdout, please include the file type; e.g.
for gzipped fasta input, set in=stdin.fa.gz" + echo "" + echo "Optional parameters (and their defaults)" + echo "" + echo "Input parameters:" + echo "in= The 'in=' flag is needed only if the input file is not the first parameter. 'in=stdin.fq' will pipe from standard in." + echo "in2= Use this if 2nd read of pairs are in a different file." + echo "ref= Comma-delimited list of reference files." + echo "touppercase=f (tuc) Change all letters in reads and reference to upper-case." + echo "interleaved=auto (int) If true, forces fastq input to be paired and interleaved." + echo "qin=auto ASCII offset for input quality. May be 33 (Sanger), 64 (Illumina), or auto." + echo "reads=-1 If set to a positive number, only process this many reads (or pairs), then quit." +# echo "skipreads=0 Ignore this many initial reads (or pairs) and process the rest." + echo "" + echo "Output parameters:" + echo "out= (outnonmatch) Write reads here that do not contain kmers matching the database. 'out=stdout.fq' will pipe to standard out." + echo "out2= (outnonmatch2) Use this to write 2nd read of pairs to a different file." + echo "outmatch= (outm or outb) Write reads here that contain kmers matching the database." + echo "outmatch2= (outm2 or outb2) Use this to write 2nd read of pairs to a different file." + echo "stats= Write statistics about which contamininants were detected." + echo "duk= Write statistics in duk's format." + echo "overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file." + echo "showspeed=t (ss) Set to 'f' to suppress display of processing speed." + echo "ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster." + echo "fastawrap=80 Length of lines in fasta output." + echo "qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input)." 
+ echo "" + echo "Processing parameters:" + echo "threads=auto (t) Set number of threads to use; default is number of logical processors." + echo "k=28 Kmer length used for finding contaminants. Contaminants shorter than k will not be found. k must be at least 1." + echo "rcomp=t Look for reverse-complements of kmers in addition to forward kmers." + echo "maskmiddle=t (mm) Treat the middle base of a kmer as a wildcard, to increase sensitivity in the presence of errors." + echo "maxbadkmers=0 (mbk) Reads with more than this many contaminant kmers will be discarded." + echo "hammingdistance=0 (hdist) Maximum Hamming distance from ref kmers (subs only). Memory use is proportional to (3*K)^hdist." + echo "editdistance=0 (edist) Maximum edit distance from ref kmers (subs and indels). Memory use is proportional to (8*K)^edist." + echo "forbidn=f (fn) Forbids matching of read kmers containing N. By default, these will match a reference 'A' if hdist>0 or edist>0, to increase sensitivity." + echo "minskip=1 (mns) Force minimal skip interval when indexing reference kmers. 1 means use all, 2 means use every other kmer, etc." + echo "maxskip=99 (mxs) Restrict maximal skip interval when indexing reference kmers." + echo " Normally all are used for scaffolds<100kb, but with longer scaffolds, up to K-1 are skipped." + echo "removeifeitherbad=t (rieb) Paired reads get sent to 'outmatch' if either is match (or either is trimmed shorter than minlen). Set to false to require both." + echo "" + echo "Trimming parameters:" + echo "ktrim=f Trim reads to remove bases matching reference kmers." + echo " Values: f (don't trim), r (trim right end), l (trim left end), n (convert to N instead of trimming)." + echo " Any single non-whitespace character other than t, f, r, l, n: convert to that symbol rather than trimming." + echo "useshortkmers=f (usk) Look for shorter kmers at read tips (only for k-trimming). Enabling this will disable maskmiddle." 
+ echo "mink=4 Minimum length of short kmers. Setting this automatically sets useshortkmers=t." + echo "qtrim=f Trim read ends to remove bases with quality below minq. Performed AFTER looking for kmers." + echo " Values: t (trim both ends), f (neither end), r (right end only), l (left end only)." + echo "trimq=4 Trim quality threshold." + echo "minlength=20 (ml) Reads shorter than this after trimming will be discarded. Pairs will be discarded only if both are shorter." + echo "minavgquality=0 (maq) Reads with average quality (before trimming) below this will be discarded." + echo "otm=f (outputtrimmedtomatch) Output reads trimmed to shorter than minlength to outm rather than discarding." + echo "" +# echo "Other parameters:" +# echo "array=t Use HashArray data structure." +# echo "forest=f Use HashForest data structure." +# echo "table=f Use KmerTable data structure." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "There is a changelog at /global/projectb/sandbox/gaag/bbtools/docs/changelog_bbduk.txt" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." 
+ echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +bbduk "$@" diff --git a/sh/bbest.sh b/sh/bbest.sh new file mode 100755 index 0000000..caf787d --- /dev/null +++ b/sh/bbest.sh @@ -0,0 +1,42 @@ +#!/bin/bash -l +#bbest in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +function bbest() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load samtools + module load pigz + local CMD="java -ea -Xmx64m -cp $CP jgi.SamToEst $@" +# echo $CMD >&2 + $CMD +} + +function usage(){ + echo "Written by Brian Bushnell" + echo "Last modified December 5, 2013" + echo "" + echo "Description: Calculates EST (expressed sequence tags) capture by an assembly from a sam file." + echo "Designed to use BBMap output generated with these flags: k=13 maxindel=100000 customtag ordered" + echo "" + echo "Usage: bbest.sh in= out=" + echo "" + echo "Parameters:" + echo "in= Specify a sam file (or stdin) containing mapped ests." + echo "out= Specify the output stats file (default is stdout)." + echo "ref= Specify the reference file (optional)." + echo "est= Specify the est fasta file (optional)." + echo "fraction=<0.98> Min fraction of bases mapped to ref to be considered 'all mapped'." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." +} + +if [ -z "$1" ]; then + usage + exit +fi + +bbest "$@" diff --git a/sh/bbmap.sh b/sh/bbmap.sh new file mode 100755 index 0000000..09e8ed2 --- /dev/null +++ b/sh/bbmap.sh @@ -0,0 +1,189 @@ +#!/bin/bash -l +#bbmap in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 
1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=84 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=84 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=84 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=84 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=84 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=84 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + + +bbmap() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -ea $z -cp $CP align2.BBMap build=1 overwrite=true match=long fastareadlen=500 $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "BBMap v31" + echo "Written by Brian Bushnell, from Dec. 2010 - present" + echo "Last modified January 6, 2014" + echo "" + echo "Description: Fast and accurate short-read aligner for DNA and RNA." + echo "" + echo "To index: bbmap.sh ref=" + echo "To map: bbmap.sh in= out=" + echo "To map without an index: bbmap.sh ref= in= out= nodisk" + echo "" + echo "in=stdin will accept reads from standard in, and out=stdout will write to standard out," + echo "but file extensions are still needed to specify the format of the input and output files." + echo "e.g. in=stdin.fa.gz will read gzipped fasta from standard in; out=stdout.sam.gz will write gzipped sam." + echo "" + echo "Indexing Parameters (required when building the index):" + echo "nodisk=f Set to true to build index in memory and write nothing to disk except output." + echo "ref= Specify the reference sequence. Only do this ONCE, when building the index (unless using 'nodisk')." 
+ echo "build=1 If multiple references are indexed in the same directory, each needs a unique numeric ID (unless using 'nodisk')." + echo "k=13 Kmer length, range 8-15. Longer is faster but uses more memory. Shorter is more sensitive." + #echo " I suggest 13 for most cases; 14 for large genomes >3GB; and 12 for PacBio or cross-species mapping." + #echo " You can have multiple kmer lengths per build number in the same directory." + #echo "startpad=2000 Pad the beginning of the reference array this many Ns prior to the first scaffold." + #echo "stoppad=8000 Pad the end of the reference array this many Ns after the end of the last scaffold." + #echo "midpad=300 Pad this many Ns between adjacent scaffolds. Higher is better, but wastes memory with tons of tiny scaffolds." + #echo "colorspace=f Set to true to build a SOLiD colorspace index. Probably does not work any more." + echo "path=<.> Specify the location to write the index, if you don't want it in the current working directory." + #echo "minscaf=1 Throw away scaffolds shorter than this when indexing." + echo "" + echo "Input Parameters:" + echo "build=1 Designate index to use. Corresponds to the number specified when building the index." + echo "in= Primary reads input; required parameter." + echo "in2= For paired reads in two files." + echo "qin=auto Set to 33 or 64 to specify input quality value ASCII offset." + echo "interleaved=auto True forces paired/interleaved input; false forces single-ended mapping." + echo " If not specified, interleaved status will be autodetected from read names." + echo "fastareadlen=500 Break up FASTA reads longer than this. Max is 500. Only works for FASTA input." + echo "fakequality=-1 Set to a positive number 1-50 to generate fake quality strings for fasta input reads." + #echo "parsecustom=f Specially process read headers from my random read generator, to determine true and false positive rates." 
+ echo "" + echo "Sampling Parameters:" + echo "reads=-1 Set to a positive number N to only process the first N reads (or pairs), then quit. -1 means use all reads." + #echo "idmodulo=1 Set to a number N to only map every Nth read (for deterministic sampling)." + echo "samplerate=1 Set to a number from 0 to 1 to randomly select that fraction of reads for mapping. 1 uses all reads." + #echo "sampleseed=1 Set to a positive number N set the RNG seed for sampling at the samplerate," + #echo " or a negative number to select a random seed (for nondeterministic sampling)." + echo "skipreads=0 Set to a number N to skip the first N reads (or pairs), then map the rest." + echo "" + echo "Mapping Parameters:" + echo "fast=f This flag is a macro which sets other paramters to run faster, at reduced sensitivity. Bad for RNA-seq." + echo "maxindel=16000 Don't look for indels longer than this. Lower is faster. Set to >=100k for RNAseq with long introns like mammals." + echo "strictmaxindel=f When enabled, do not allow indels longer than 'maxindel'. By default these are not sought, but may be found anyway." + #echo "minratio=0.56 Fraction of max alignment score required to keep a site. Higher is faster." + echo "minid=0.76 Approximate minimum alignment identity to look for. Higher is faster and less sensitive." + echo "minhits=1 Minimum number of seed hits required for candidate sites. Higher is faster." + echo "k=13 Kmer length of index. Higher is faster (for large genomes), less sensitive, and uses more RAM. Max is 15." + echo "local=f Set to true to use local, rather than global, alignments. This will soft-clip ugly ends of poor alignments." + echo "perfectmode=f Allow only perfect mappings when set to true (very fast)." + echo "semiperfectmode=f Allow only perfect and semiperfect (perfect except for N's in reference) mappings." + echo "threads=auto (t) Set to number of threads desired. By default, uses all cores available." 
+ echo "ambiguous= (ambig) Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations)." + echo " best (use the first best site)" + echo " toss (consider unmapped)" + echo " random (select one top-scoring site randomly)" + echo " all (retain all top-scoring sites. Does not work yet with SAM output)" + #echo "kfilter=-1 Set to a positive number N to require minimum N contiguous matches for a mapped read." + echo "samestrandpairs=f (ssp) Specify whether paired reads should map to the same strand or opposite strands." + echo "requirecorrectstrand=t (rcs) Forbid pairing of reads without correct strand orientation. Set to false for long-mate-pair libraries." + echo "killbadpairs=f (kbp) If a read pair is mapped with an inappropriate insert size or orientation, " + echo " the read with the lower mapping quality is marked unmapped." + echo "pairedonly=f (po) Treat unpaired reads as unmapped. Thus they will be sent to 'outu' but not 'outm'." + echo "rcompmate=f Reverse complement second read in each pair prior to mapping." + echo "pairlen=32000 Set max allowed distance between paired reads. (insert size)=(pairlen)+(read1 length)+(read2 length)" + echo "trim=f Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both)." + echo "untrim=f Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings." + echo "bandwidthratio=0 (bwr) If above zero, restrict alignment band to this fraction of read length. Faster but less accurate." + echo "" + echo "Output Parameters:" + echo "outputunmapped=t Set to false if unmapped reads should not be printed to 'out=' target (saves time and disk space)." + echo "out= Write all reads to this file (unmapped reads are excluded if outputunmapped=f)." + echo "outu= Write only unmapped reads to this file. Does not include unmapped paired reads with a mapped mate." + echo "outm= Write only mapped reads to this file. Includes unmapped paired reads with a mapped mate."
+ echo "scafstats= Write statistics on how many reads mapped to which scaffold to this file." + echo "refstats= Write statistics on how many reads mapped to which reference to this file (for BBSplitter)." + echo "qualityhistogram= (qhist) Write histogram of quality score by read location to this file." + echo "matchhistogram= (mhist) Write histogram of base match, substitution, deletion, and insertion rates by read location." + echo "inserthistogram= (ihist) Write histogram of insert sizes (for paired reads)." + echo "bamscript= (bs) Write a shell script to that will turn the sam output into a sorted, indexed bam file." + echo "ordered=f Set to true to output reads in same order as input. Slower and uses more memory." + #echo " Only relevant with multiple mapping threads." + #echo "showprogress=0 Set to a positive number N to print a '.' once per N reads processed." + echo "cigar=t Set to 'f' to skip generation of cigar strings (faster)." + #echo " 'none' is faster, but prevents generation of match and error rate statistics." + echo "overwrite=f (ow) Allow process to overwrite existing files." + echo "secondary=f Print secondary alignments." + echo "maxsites=5 Maximum number of total alignments to print per read. Only relevant when secondary=t." + echo "quickmatch=f Generate cigar strings more quickly. Must be true to generate secondary site cigar strings." + echo "keepnames=f Keep original names of paired reads, rather than ensuring both reads have the same name." + echo "trimreaddescriptions=f (trd) Truncate read names at the first whitespace, assuming that the remaineder is a comment or description." + echo "sam=1.3 Set to 1.4 to write Sam version 1.4 cigar strings, with = and X instead of M." + echo "md=f Write MD tags." + echo "xs=f Set to 'xs=fs', 'xs=ss', or 'xs=us' to write XS tags for RNAseq using firststrand," + echo " secondstrand, or unstranded libraries. Needed by Cufflinks. JGI mainly uses 'firststrand'." 
+ echo "stoptag=t Write a tag indicating read stop location, prefixed by YS:i:" + echo "idtag=t Write a tag indicating percent identity, prefixed by YI:f:" + echo "ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster." + echo "machineout=f Set to true to output statistics in machine-friendly 'key=value' format." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "This list is not complete. For more information, please consult $DIR""docs/readme.txt" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." +} + +if [ -z "$1" ]; then + usage + exit +fi + +bbmap "$@" diff --git a/sh/bbmapskimmer.sh b/sh/bbmapskimmer.sh new file mode 100755 index 0000000..a247527 --- /dev/null +++ b/sh/bbmapskimmer.sh @@ -0,0 +1,75 @@ +#!/bin/bash -l + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool."
1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=84 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=84 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=84 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=84 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=84 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=84 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +mapPacBioSkimmer() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -ea $z -cp $CP align2.BBMapPacBioSkimmer build=1 overwrite=true minratio=0.40 match=long fastareadlen=6000 dprr=false ambiguous=best minscaf=100 startpad=10000 stoppad=10000 midpad=6000 $@" + echo $CMD >&2 + $CMD +} + +usage(){ + bash "$DIR"bbmap.sh +} + +if [ -z "$1" ]; then + usage + exit +fi + +mapPacBioSkimmer "$@" diff --git a/sh/bbmerge.sh b/sh/bbmerge.sh new file mode 100755 index 0000000..a6b8b47 --- /dev/null +++ b/sh/bbmerge.sh @@ -0,0 +1,86 @@ +#!/bin/bash -l +#merge in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx200m" +calcXmx () { + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +function merge() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -ea $z -cp $CP jgi.MateReadsMT $@" + echo $CMD >&2 + $CMD +} + +function usage(){ + echo "BBMerge v1.4" + echo "This script is designed for Genepool nodes." + echo "Last modified February 12, 2014" + echo "" + echo "Description: Merges paired reads into single reads by overlap detection." 
+ echo "With sufficient coverage, can also merge nonoverlapping reads using gapped kmers." + echo "" + echo "Usage: bbmerge.sh in= out= outbad=" + echo "" + echo "Input may be stdin or a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a file." + echo "" + echo "Optional parameters (and their defaults)" + echo "" + echo "Input parameters:" + echo "in2=null Second input file for paired reads." + echo "extra=null Additional files to use for input (generating hash table) but not for output." + echo "interleaved=auto May be set to true or false to force the input read file to override autodetection of the input file as paired interleaved." + echo "reads=-1 Only process this number of reads, then quit (-1 means all)." + echo "" + echo "Output parameters:" + echo "out= File for merged reads." + echo "outbad= File for unmerged reads." + echo "outinsert= File list of read names and their insert sizes." + echo "hist=null Insert length histogram output file." + echo "ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster." + echo "" + echo "Trimming parameters:" + echo "qtrim=f Trim read ends to remove bases with quality below minq. Performed BEFORE merging." + echo " Values: t (trim both ends), f (neither end), r (right end only), l (left end only)." + echo "trimq=10 Trim quality threshold." + echo "minlength=20 (ml) Reads shorter than this after trimming (before merging) will be discarded. Pairs will be discarded only if both are shorter." + echo "trimonfailure=t (tof) If detecting insert size by overlap fails, the reads will be trimmed and this will be re-attempted." + echo "" + echo "Other parameters:" + echo "join=t Create merged reads. If set to false, you can still generate an insert histogram." + echo "useoverlap=t Attempt merge based on paired read overlap." + echo "minoverlapbases=12 Minimum number of overlapping bases to merge reads."
+ echo "mininsert=0 Reads with insert sizes less than this (after merging) will be discarded." + echo "gap=null Sets gap size for merging via gapped kmers." + echo " 'gap=50;50,100' would run one pass with a gap size of 50 and another with both 50 and 100." + echo " This script sets memory appropriately for ungapped merging only, though." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +merge "$@" diff --git a/sh/bbnorm.sh b/sh/bbnorm.sh new file mode 100755 index 0000000..5567ef3 --- /dev/null +++ b/sh/bbnorm.sh @@ -0,0 +1,170 @@ +#!/bin/bash -l +#normalize in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 
1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=85 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=85 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=85 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=85 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=85 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=85 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +normalize() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + module load pigz + local CMD="java -ea $z -cp $CP jgi.KmerNormalize bits=32 $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "This script is designed for Genepool nodes." + echo "Last modified February 12, 2014" + echo "" + echo "Description: Normalizes read depth based on kmer counts." + echo "Can also error-correct, bin reads by kmer depth, and generate a kmer depth histogram." + echo "" + echo "Usage: bbnorm.sh in= out= outt= hist=" + echo "" + echo "Input may be a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a file. 'out' and 'hist' are both optional." + echo "" + echo "Optional parameters (and their defaults)" + echo "" + echo "Input parameters:" + echo "in2=null Second input file for paired reads" + echo "extra=null Additional files to use for input (generating hash table) but not for output" + echo "fastareadlen=2^31 Break up FASTA reads longer than this. 
Can be useful when processing scaffolded genomes" + echo "tablereads=-1 Use at most this many reads when building the hashtable (-1 means all)" + echo "kmersample=1 Process every nth kmer, and skip the rest" + echo "readsample=1 Process every nth read, and skip the rest" + echo "interleaved=auto May be set to true or false to force the input read file to override autodetection of the input file as paired interleaved." + echo "" + echo "Output parameters:" + echo "out= File for normalized reads" + echo "outt= (outtoss) File for reads that were excluded from primary output" + echo "reads=-1 Only process this number of reads, then quit (-1 means all)" + echo "sampleoutput=t Use sampling on output as well as input (not used if sample rates are 1)" + echo "keepall=f Set to true to keep all reads (e.g. if you just want error correction)." + echo "zerobin=f Set to true if you want kmers with a count of 0 to go in the 0 bin instead of the 1 bin in histograms." + echo " Default is false, to prevent confusion about how there can be 0-count kmers." + echo " The reason is that based on the 'minq' and 'minprob' settings, some kmers may be excluded from the bloom filter." + echo "tmpdir=$TMPDIR This will specify a directory for temp files (only needed for multipass runs). If null, they will be written to the output directory." + echo "usetempdir=t Allows enabling/disabling of temporary directory; if disabled, temp files will be written to the output directory." + echo "" + echo "Hashing parameters:" + echo "k=31 Kmer length (values under 32 are most efficient, but arbitrarily high values are supported)" + echo "bits=32 Bits per cell in bloom filter; must be 2, 4, 8, 16, or 32. Maximum kmer depth recorded is 2^cbits. Automatically reduced to 16 in 2-pass." + echo " Large values decrease accuracy for a fixed amount of memory, so use the lowest number you can that will still capture highest-depth kmers." + echo "hashes=3 Number of times each kmer is hashed and stored.
Higher is slower." + echo " Higher is MORE accurate if there is enough memory, and LESS accurate if there is not enough memory." + echo "prefilter=f True is slower, but generally more accurate; filters out low-depth kmers from the main hashtable. The prefilter is more memory-efficient because it uses 2-bit cells." + echo "prehashes=2 Number of hashes for prefilter." + echo "buildpasses=1 More passes can sometimes increase accuracy by iteratively removing low-depth kmers" + echo "minq=6 Ignore kmers containing bases with quality below this" + echo "minprob=0.5 Ignore kmers with overall probability of correctness below this" + echo "threads=X Spawn exactly X hashing threads (default is number of logical processors). Total active threads may exceed X due to I/O threads." + echo "rdk=t (removeduplicatekmers) When true, a kmer's count will only be incremented once per read pair, even if that kmer occurs more than once." + echo "" + echo "Normalization parameters:" + echo "fixspikes=f (fs) Do a slower, high-precision bloom filter lookup of kmers that appear to have an abnormally high depth due to collisions." + echo "target=40 (tgt) Target normalization depth. NOTE: All depth parameters control kmer depth, not read depth." + echo " For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))" + echo "maxdepth=-1 (max) Reads will not be downsampled when below this depth, even if they are above the target depth." + echo "mindepth=6 (min) Kmers with depth below this number will not be included when calculating the depth of a read." + echo "minkmers=15 (mgkpr) Reads must have at least this many kmers over min depth to be retained. Aka 'mingoodkmersperread'." + echo "percentile=54.0 (dp) Read depth is by default inferred from the 54th percentile of kmer depth, but this may be changed to any number 1-100." + echo "deterministic=t (dr) Generate random numbers deterministically to ensure identical output between multiple runs.
May decrease speed with a huge number of threads." + echo "passes=2 (p) 1 pass is the basic mode. 2 passes (default) allows greater accuracy, error detection, better control of output depth." + echo "" + echo "Error detection parameters:" + echo "hdp=90.0 (highdepthpercentile) Position in sorted kmer depth array used as proxy of a read's high kmer depth." + echo "ldp=25.0 (lowdepthpercentile) Position in sorted kmer depth array used as proxy of a read's low kmer depth." + echo "tossbadreads=f (tbr) Throw away reads detected as containing errors. Only controls behavior of final pass." + echo "errordetectratio=125 (edr) Reads with a ratio of at least this much between their high and low depth kmers will be classified as error reads." + echo "highthresh=12 (ht) Threshold for high kmer. A high kmer at this or above is considered non-error." + echo "lowthresh=3 (lt) Threshold for low kmer. Kmers at this and below are always considered errors." + echo "" + echo "Error correction parameters:" + echo "ecc=f Set to true to correct errors." + echo "ecclimit=3 Correct up to this many errors per read. If more are detected, the read will remain unchanged." + echo "errorcorrectratio=140 (ecr) Adjacent kmers with a depth ratio of at least this much between them will be classified as an error." + echo "echighthresh=22 (echt) Threshold for high kmer. A kmer at this or above may be considered non-error." + echo "eclowthresh=2 (eclt) Threshold for low kmer. Kmers at this and below are considered errors." + echo "eccmaxqual=127 Do not correct bases with quality above this value." + echo "aec=f (aggressiveErrorCorrection) Sets more aggressive values of ecr=100, ecclimit=7, echt=16, eclt=3." + echo "meo=f (markErrorsOnly) Marks errors by reducing quality value of suspected errors; does not correct anything." + echo "mue=t (markUncorrectableErrors) Marks errors only on uncorrectable reads; requires 'ecc=t'."
+ echo "" + echo "Depth binning parameters:" + echo "lowbindepth=10 (lbd) Cutoff for low depth bin." + echo "highbindepth=80 (hbd) Cutoff for high depth bin." + echo "outlow= Pairs in which both reads have a median below lbd go into this file." + echo "outhigh= Pairs in which both reads have a median above hbd go into this file." + echo "outmid= All other pairs go into this file." + echo "" + echo "Histogram parameters:" + echo "hist= Specify a file to write the input kmer depth histogram" + echo "histout= Specify a file to write the output kmer depth histogram" + echo "pzc=f (printzerocoverage) Print lines in the histogram with zero coverage." + echo "histlen=1048576 Max kmer depth displayed in histogram. Also affects statistics displayed, but does not affect normalization." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +normalize "$@" diff --git a/sh/bbsplit.sh b/sh/bbsplit.sh new file mode 100755 index 0000000..a28f4b1 --- /dev/null +++ b/sh/bbsplit.sh @@ -0,0 +1,177 @@ +#!/bin/bash -l + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 
1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=85 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=85 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=85 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=85 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=85 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=85 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +function bbsplit() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -ea $z -cp $CP align2.BBSplitter build=1 overwrite=true match=long fastareadlen=500 minhits=2 minratio=0.9 maxindel=20 trim=both untrim=true $@" + echo $CMD >&2 + $CMD +} + +function usage(){ + echo "BBSplit / BBMap v30" + echo "Written by Brian Bushnell, from Dec. 2010 - present" + echo "Last modified December 17, 2013" + echo "" + echo "Description: Maps reads to multiple references simultaneously." + echo "Outputs reads to a file for the reference they best match, with multiple options for dealing with ambiguous mappings." + echo "" + echo "To index: bbsplit.sh build=<1> ref_x= ref_y=" + echo "To map: bbsplit.sh build=<1> in= out_x= out_y=" + echo "" + echo "To be concise, and do everything in one command:" + echo "bbsplit.sh ref=x.fa,y.fa in=reads.fq basename=o%.sam" + echo "which is equivalent to" + echo "bbsplit.sh build=1 in=reads.fq ref_x=x.fa ref_y=y.fa out_x=ox.sam out_y=oy.sam" + echo "" + echo "in=stdin will accept reads from standard in, and out=stdout will write to standard out," + echo "but file extensions are still needed to specify the format of the input and output files." + echo "e.g. 
in=stdin.fa.gz will read gzipped fasta from standard in; out=stdout.sam.gz will write gzipped sam." + echo "" + echo "Indexing Parameters (required when building the index):" + echo "ref_= Specify the reference sequence for the given name; e.g., ref_ecoli=ecoli.fasta" + echo " These can also be comma-delimited lists of files; e.g., ref_a=a1.fa,a2.fa,a3.fa" + echo "build=<1> If multiple references are indexed in the same directory, each needs a unique build ID." + echo "k=<13> Kmer length, range 8-15. Longer is faster but uses more memory. Shorter is more sensitive." + #echo " I suggest 13 for most cases; 14 for large genomes >3GB; and 12 for PacBio or cross-species mapping." + #echo " You can have multiple kmer lengths per build number in the same directory." + #echo "startpad=<2000> Pad the beginning of the reference array this many Ns prior to the first scaffold." + #echo "stoppad=<8000> Pad the end of the reference array this many Ns after the end of the last scaffold." + #echo "midpad=<300> Pad this many Ns between adjacent scaffolds. Higher is better, but wastes memory with tons of tiny scaffolds." + #echo "colorspace= Set to true to build a SOLiD colorspace index. Probably does not work any more." + #echo "path=<.> Specify the location to write the index, if you don't want it in the current directory." + #echo "minscaf=<1> Throw away scaffolds shorter than this when indexing." + echo "" + echo "Input Parameters:" + echo "build=<1> Designate index to use. Corresponds to the number specified when building the index." + echo "in= Primary reads input; required parameter." + echo "in2= For paired reads in two files." + echo "qin= Set to 33 or 64 to specify input quality value ASCII offset." + echo "interleaved= True forces paired/interleaved input; false forces single-ended mapping." + echo " If not specified, interleaved status will be autodetected from read names." + #echo "fastareadlen=<500> Break up FASTA reads longer than this. Max is 500. 
Only works for FASTA input." + #echo "parsecustom= Specially process read headers from my random read generator, to determine true and false positive rates." + #echo "" + #echo "Sampling Parameters:" + #echo "reads=<-1> Set to a positive number N to only process the first N reads, then quit. -1 means use all reads." + #echo "idmodulo=<1> Set to a number N to only map every Nth read (for deterministic sampling)." + #echo "samplerate=<1> Set to a number from 0-1 to randomly select that fraction of reads for mapping." + #echo "sampleseed=<1> Set to a positive number N set the RNG seed for sampling at the samplerate," + #echo " or a negative number to select a random seed (for nondeterministic sampling)." + echo "" + echo "Mapping Parameters:" + echo "maxindel=<16000> Don't look for indels longer than this. Lower is faster. Set to >=100k for RNA-seq." + echo "minratio=<0.56> Fraction of max alignment score required to keep a site. Higher is faster." + echo "minhits=<1> Minimum number of seed hits required for candidate sites. Higher is faster." + echo "k=<13> Key length for index. Higher is faster (for large genomes) but uses more RAM. Max is 15." + echo "local= Set to true to use local, rather than global, alignments. This will soft-clip ugly ends of poor alignments." + #echo "perfectmode= Allow only perfect mappings when set to true (very fast)." + #echo "semiperfectmode= Allow only perfect and semiperfect (perfect except for N's and off-end-of-contig) mappings." + #echo "threads= Set to number of threads desired. By default, uses all cores available." + echo "ambiguous= Set behavior on ambiguously-mapped reads (with multiple top-scoring mapping locations)." + echo " best (use the first best site)" + echo " toss (consider unmapped)" + echo " random (select one top-scoring site randomly)" + echo " all (retain all top-scoring sites. 
Does not work yet with SAM output)" + echo "ambiguous2= Set behavior only for reads that map ambiguously to multiple different references." + echo " Normal 'ambiguous=' controls behavior on all ambiguous reads;" + echo " Ambiguous2 excludes reads that map ambiguously within a single reference." + echo " best (use the first best site)" + echo " toss (consider unmapped)" + echo " all (write a copy to the output for each reference to which it maps)" + echo " split (write a copy to the AMBIGUOUS_ output for each reference to which it maps)" + #echo "kfilter=<-1> Set to a positive number N to require minimum N contiguous matches for a mapped read." + #echo "samestrandpairs= Specify whether paired reads should map to the same strand or opposite strands." + #echo "requirecorrectstrand= Forbid pairing of reads without correct strand orientation." + #echo "rcompmate= Reverse complement second read in each pair prior to mapping." + #echo "pairlen=<24000> Set max allowed distance between paired reads. (insert size)=(pairlen)+(read1 length)+(read2 length)" + echo "trim= Quality-trim ends to Q5 before mapping. Options are 'l' (left), 'r' (right), and 'lr' (both)." + echo "untrim= Undo trimming after mapping. Untrimmed bases will be soft-clipped in cigar strings." + echo "" + echo "Output Parameters:" + echo "out_= Output reads that map to the reference to ." + echo "outputunmapped= Set to false if unmapped reads should not be printed (saves time and disk space)." + echo "mdtag= Set to true for applications that need an MD tag in SAM files. Not recommended for RNAseq on euks." + echo "basename=prefix%suffix Equivalent to multiple out_%=prefix%suffix expressions, in which each % is replaced by the name of a reference file." + #echo "ordered= Set to true to output reads in same order as input. Slower and uses more memory." + #echo " Only relevant with multiple mapping threads." + #echo "showprogress=<0> Set to a positive number N to print a '.' once per N reads processed." 
+ #echo "match= Set to 'none' to skip generation of cigar strings. " + #echo " 'none' is faster, prevents generation of match and error rate statistics." + #echo "overwrite= Allow process to overwrite existing files." + echo "secondary= Print secondary alignments." + echo "maxsites=<5> Maximum number of total alignments to print per read. Only relevant when secondary=t." + echo "quickmatch= Generate cigar strings more quickly. Must be true to generate secondary site cigar strings." + echo "bs= Write a shell script to 'file' that will turn the sam output into a sorted, indexed bam file." + echo "scafstats= Write statistics on how many reads mapped to which scaffold to this file." + echo "refstats= Write statistics on how many reads mapped to which reference to this file (for BBSplitter)." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "For more information, please consult /global/projectb/sandbox/gaag/bbtools/docs/readme.txt or the bbmap.sh usage information." + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." 
+} + +if [ -z "$1" ]; then + usage + exit +fi + +bbsplit "$@" diff --git a/sh/bbsplitpairs.sh b/sh/bbsplitpairs.sh new file mode 100755 index 0000000..63efe61 --- /dev/null +++ b/sh/bbsplitpairs.sh @@ -0,0 +1,70 @@ +#!/bin/bash -l +#splitpairs in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx120m" +calcXmx () { + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +splitpairs() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + module load pigz + local CMD="java -ea $z -cp $CP jgi.SplitPairsAndSingles $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "This script is designed for Genepool nodes." + echo "Last modified December 11, 2013" + echo "" + echo "Description: Separates paired reads into files of 'good' pairs and 'good' singletons by removing 'bad' reads that are shorter than a min length." + echo "Designed to handle situations where reads become too short to be useful after trimming. This program also optionally performs quality trimming." + echo "" + echo "Usage: bbsplitpairs.sh in= out= outs= minlen=" + echo "" + echo "Input may be stdin or a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a file." + echo "" + echo "Optional parameters (and their defaults)" + echo "" + echo "in= The 'in=' flag is needed if the input file is not the first parameter. 'in=stdin' will pipe from standard in." + echo "in2= Use this if 2nd read of pairs are in a different file." + echo "out= The 'out=' flag is needed if the output file is not the second parameter. 'out=stdout' will pipe to standard out." + echo "out2= Use this to write 2nd read of pairs to a different file." + echo "outsingle= (outs) Write singleton reads here." + echo "" + echo "overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file." + echo "showspeed=t (ss) Set to 'f' to suppress display of processing speed." 
+ echo "interleaved=auto (int) If true, forces fastq input to be paired and interleaved." + echo "qtrim=f Trim read ends to remove bases with quality below minq." + echo " Values: rl (trim both ends), f (neither end), r (right end only), l (left end only)." + echo "trimq=4 Trim quality threshold." + echo "minlen=20 (ml) Reads shorter than this after trimming will be discarded." + echo "ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster." + echo "fixpairs=f (fp, fint) Fixes corrupted interleaved files by examining pair names. Only use on files with broken interleaving." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." 
+ echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +splitpairs "$@" diff --git a/sh/bbstats.sh b/sh/bbstats.sh new file mode 100755 index 0000000..e7b26b5 --- /dev/null +++ b/sh/bbstats.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +#For more information, please see stats.sh + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" + +"$DIR"stats.sh $@ diff --git a/sh/compile.sh b/sh/compile.sh new file mode 100755 index 0000000..32ed19a --- /dev/null +++ b/sh/compile.sh @@ -0,0 +1,2 @@ +#!/bin/bash +javac -J-Xmx256m current/*/*.java diff --git a/sh/countgc.sh b/sh/countgc.sh new file mode 100755 index 0000000..8b94811 --- /dev/null +++ b/sh/countgc.sh @@ -0,0 +1,49 @@ +#!/bin/bash -l +#countgc in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx120m" +calcXmx () { + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +countgc() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + local CMD="java -ea $z -cp $CP jgi.CountGC $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "Written by Brian Bushnell" + echo "Last modified December 11, 2013" + echo "" + echo "Description: Counts GC content of reads or scaffolds." + echo "" + echo "Usage: countgc in= out= format=" + echo "" + echo "Input may be stdin or a fasta or fastq file, compressed or uncompressed." + echo "Output (which is optional) may be stdout or a file." + echo "format=1: name start stop A C G T N" + echo "format=2: name GC" + echo "format=4: name length GC" + echo "Note that in format 1, A+C+G+T=1 even when N is nonzero." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." 
+} + +if [ -z "$1" ]; then + usage + exit +fi + +countgc "$@" diff --git a/sh/current b/sh/current new file mode 120000 index 0000000..2592c81 --- /dev/null +++ b/sh/current @@ -0,0 +1 @@ +../current \ No newline at end of file diff --git a/sh/dedupe.sh b/sh/dedupe.sh new file mode 100755 index 0000000..ffc1727 --- /dev/null +++ b/sh/dedupe.sh @@ -0,0 +1,149 @@ +#!/bin/bash -l +#khist in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +z2="-Xms1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=85 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=85 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=85 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=85 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=85 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=85 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + z2="-Xms${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + elif [[ "$arg" == -Xms* ]]; then + z2="$arg" + fi + done +} +calcXmx "$@" + +dedupe() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + module load pigz + local CMD="java -ea $z $z2 -cp $CP jgi.Dedupe $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "This script is designed for Genepool nodes." + echo "Last modified December 11, 2013" + echo "" + echo "Description: Accepts one or more files containing sets of sequences (reads or scaffolds)." 
+ echo "Removes duplicate sequences, which may be specified to be exact matches, subsequences, or sequences within some percent identity." + echo "Can also find overlapping sequences and group them into clusters." + echo "" + echo "Usage: dedupe.sh in= out=" + echo "" + echo "Input may be stdin or a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a file. With no output parameter, data will be written to stdout." + echo "If 'out=null', there will be no output, but statistics will still be printed." + echo "You can also use 'dedupe ' without the 'in=' and 'out='." + echo "" + echo "I/O Parameters" + echo "" + echo "in= A single file or a comma-delimited list of files." + echo "out= Destination for all output contigs." + echo "threads=auto (t) Set number of threads to use; default is number of logical processors." + echo "overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file." + echo "showspeed=t (ss) Set to 'f' to suppress display of processing speed." + echo "minscaf=0 (ms) Ignore contigs/scaffolds shorter than this." + echo "interleaved=auto If true, forces fastq input to be paired and interleaved." + echo "ziplevel=2 Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster." + echo "storename=t (sn) Store scaffold names (set false to save memory)." + echo "storequality=t (sq) Store quality values for fastq assemblies (set false to save memory)." + echo "uniquenames=t (un) Ensure all output scaffolds have unique names. Uses more memory." + echo "sort=f Sort output by scaffold length (otherwise it will be random)." + echo " 'a' for ascending, 'd' for descending, 'f' for false (no sorting)." + echo "" + echo "Processing Parameters" + echo "" + echo "absorbrc=t (arc) Absorb reverse-complements as well as normal orientation." + echo "absorbmatch=t (am) Absorb exact matches of contigs." 
+ echo "absorbcontainment=t (ac) Absorb full containments of contigs." +# echo "absorboverlap=f (ao) Absorb (merge) non-contained overlaps of contigs (TODO)." + echo "findoverlap=f (fo) Find overlaps between contigs (containments and non-containments). Necessary for clustering." + echo "cluster=f (c) Group overlapping contigs into clusters." + echo "" + echo "fixmultijoins=t (fmj) Remove redundant overlaps between the same two contigs." + echo "removecycles=t (rc) Remove all cycles so clusters form trees." + echo "renameclusters=f (rnc) Rename contigs to indicate which cluster they are in." + echo "cc=t (canonicizeclusters) Flip contigs so clusters have a single orientation." + echo "fcc=f (fixcanoncontradictions) Truncate graph at nodes with canonization disputes." + echo "foc=f (fixoffsetcontradictions) Truncate graph at nodes with offset disputes." + echo "" + echo "Overlap Detection Parameters" + echo "" + echo "exact=t (ex) Only allow exact symbol matches. When false, an 'N' will match any symbol." + echo "touppercase=f (tuc) Change all input bases to upper case." + echo "maxsubs=0 (s) Allow up to this many mismatches (substitutions only, no indels). May be set higher than maxedits." + echo "maxedits=0 (e) Allow up to this many edits (subs or indels). Higher is slower." + echo "minidentity=100 (mid) Absorb contained sequences with percent identity of at least this (includes indels)." + echo "minlengthpercent=0 (mlp) Smaller contig must be at least this percent of larger contig's length to be absorbed." + echo "minoverlappercent=0 (mop) Overlap must be at least this percent of smaller contig's length to cluster and merge." + echo "minoverlap=200 (mo) Overlap must be at least this long to cluster and merge." + echo "depthratio=0 (dr) When non-zero, overlaps will only be formed between reads with a depth ratio of at most this." + echo " Should be above 1. 
Depth is determined by parsing the read names; this information can be added" + echo " by running KmerNormalize (khist.sh, bbnorm.sh, or ecc.sh) with the flag 'rename'" + echo "k=31 Seed length used for finding containments and overlaps. Anything shorter than k will not be found." +# echo "numaffixmaps=1 (nam) Set to 2 to index two prefixes and suffixes per contig." +# echo "ignoreaffix1=f (ia1) Ignore first affix (for testing)." +# echo "storesuffix=f (ss) Store suffix as well as prefix. Automatically set to true when doing inexact matches." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "-Xms If you use the -Xmx flag, also set -Xms to the same value." + echo "" + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +dedupe "$@" diff --git a/sh/ecc.sh b/sh/ecc.sh new file mode 100755 index 0000000..b9fd25a --- /dev/null +++ b/sh/ecc.sh @@ -0,0 +1,171 @@ +#!/bin/bash -l +#correct in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 
1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=85 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=85 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=85 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=85 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=85 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=85 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +correct() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + module load pigz + local CMD="java -ea $z -cp $CP jgi.KmerNormalize bits=16 ecc=t passes=1 keepall dr=f prefilter $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "This script is designed for Genepool nodes." + echo "Last modified February 12, 2014" + echo "" + echo "Description: Corrects substitution errors in reads using kmer depth information." + echo "Can also normalize and/or bin reads by kmer depth." + echo "" + echo "Usage: ecc.sh in= out= outt= hist=" + echo "" + echo "Input may be a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a file. 'out' and 'hist' are both optional." + echo "" + echo "Optional parameters (and their defaults)" + echo "" + echo "Input parameters:" + echo "in2=null Second input file for paired reads" + echo "extra=null Additional files to use for input (generating hash table) but not for output" + echo "fastareadlen=2^31 Break up FASTA reads longer than this. 
Can be useful when processing scaffolded genomes" + echo "tablereads=-1 Use at most this many reads when building the hashtable (-1 means all)" + echo "kmersample=1 Process every nth kmer, and skip the rest" + echo "readsample=1 Process every nth read, and skip the rest" + echo "interleaved=auto May be set to true or false to force the input read file to ovverride autodetection of the input file as paired interleaved." + echo "" + echo "Output parameters:" + echo "out= File for corrected reads" + echo "outt= (outtoss) File for reads that were excluded from primary output" + echo "reads=-1 Only process this number of reads, then quit (-1 means all)" + echo "sampleoutput=t Use sampling on output as well as input (not used if sample rates are 1)" + echo "keepall=f Set to true to keep all reads (e.g. if you just want error correction)." + echo "zerobin=f Set to true if you want kmers with a count of 0 to go in the 0 bin instead of the 1 bin in histograms." + echo " Default is false, to prevent confusion about how there can be 0-count kmers." + echo " The reason is that based on the 'minq' and 'minprob' settings, some kmers may be excluded from the bloom filter." + echo "tmpdir=$TMPDIR This will specify a directory for temp files (only needed for multipass runs). If null, they will be written to the output directory." + echo "usetempdir=t Allows enabling/disabling of temporary directory; if disabled, temp files will be written to the output directory." + echo "" + echo "Hashing parameters:" + echo "k=31 Kmer length (values under 32 are most efficient, but arbitrarily high values are supported)" + echo "bits=32 Bits per cell in bloom filter; must be 2, 4, 8, 16, or 32. Maximum kmer depth recorded is 2^cbits. Automatically reduced to 16 in 2-pass." + echo " Large values decrease accuracy for a fixed amount of memory, so use the lowest number you can that will still capture highest-depth kmers." + echo "hashes=3 Number of times each kmer is hashed and stored. 
Higher is slower." + echo " Higher is MORE accurate if there is enough memory, and LESS accurate if there is not enough memory." + echo "prefiliter=f True is slower, but generally more accurate; filters out low-depth kmers from the main hashtable. The prefilter is more memory-efficient because it uses 2-bit cells." + echo "prehashes=2 Number of hashes for prefilter." + echo "buildpasses=1 More passes can sometimes increase accuracy by iteratively removing low-depth kmers" + echo "minq=6 Ignore kmers containing bases with quality below this" + echo "minprob=0.5 Ignore kmers with overall probability of correctness below this" + echo "threads=X Spawn exactly X hashing threads (default is number of logical processors). Total active threads may exceed X due to I/O threads." + echo "rdk=t (removeduplicatekmers) When true, a kmer's count will only be incremented once per read pair, even if that kmer occurs more than once." + echo "" + echo "Normalization parameters:" + echo "fixspikes=f (fs) Do a slower, high-precision bloom filter lookup of kmers that appear to have an abnormally high depth due to collisions." + echo "target=40 (tgt) Target normalization depth. NOTE: All depth parameters control kmer depth, not read depth." + echo " For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))" + echo "maxdepth=-1 (max) Reads will not be downsampled when below this depth, even if they are above the target depth." + echo "mindepth=6 (min) Kmers with depth below this number will not be included when calculating the depth of a read." + echo "minkmers=15 (mgkpr) Reads must have at least this many kmers over min depth to be retained. Aka 'mingoodkmersperread'." + echo "percentile=54.0 (dp) Read depth is by default inferred from the 54th percentile of kmer depth, but this may be changed to any number 1-100." + echo "deterministic=t (dr) Generate random numbers deterministically to ensure identical output between multiple runs. 
May decrease speed with a huge number of threads." + echo "passes=2 (p) 1 pass is the basic mode. 2 passes (default) allows greater accuracy, error detection, better contol of output depth." + echo "" + echo "Error detection parameters:" + echo "hdp=90.0 (highdepthpercentile) Position in sorted kmer depth array used as proxy of a read's high kmer depth." + echo "ldp=25.0 (lowdepthpercentile) Position in sorted kmer depth array used as proxy of a read's low kmer depth." + echo "tossbadreads=f (tbr) Throw away reads detected as containing errors. Only controls behavior of final pass." + echo "errordetectratio=125 (edr) Reads with a ratio of at least this much between their high and low depth kmers will be classified as error reads." + echo "highthresh=12 (ht) Threshold for high kmer. A high kmer at this or above are considered non-error." + echo "lowthresh=3 (lt) Threshold for low kmer. Kmers at this and below are always considered errors." + echo "" + echo "Error correction parameters:" + echo "ecc=f Set to true to correct errors." + echo "ecclimit=3 Correct up to this many errors per read. If more are detected, the read will remain unchanged." + echo "errorcorrectratio=140 (ecr) Adjacent kmers with a depth ratio of at least this much between will be classified as an error." + echo "echighthresh=22 (echt) Threshold for high kmer. A kmer at this or above may be considered non-error." + echo "eclowthresh=2 (eclt) Threshold for low kmer. Kmers at this and below are considered errors." + echo "eccmaxqual=127 Do not correct bases with quality above this value." + echo "aec=f (aggressiveErrorCorrection) Sets more aggressive values of ecr=100, ecclimit=7, echt=16, eclt=3, pl=2." + echo "cec=f (conservativeErrorCorrection) Sets more conservative values of ecr=180, ecclimit=2, echt=30, eclt=1, sl=4, pl=4." + echo "meo=f (markErrorsOnly) Marks errors by reducing quality value of suspected errors; does not correct anything." 
+ echo "mue=t (markUncorrectableErrors) Marks errors only on uncorrectable reads; requires 'ecc=t'." + echo "" + echo "Depth binning parameters:" + echo "lowbindepth=10 (lbd) Cutoff for low depth bin." + echo "highbindepth=80 (hbd) Cutoff for high depth bin." + echo "outlow= Pairs in which both reads have a median below lbd go into this file." + echo "outhigh= Pairs in which both reads have a median above hbd go into this file." + echo "outmid= All other pairs go into this file." + echo "" + echo "Histogram parameters:" + echo "hist= Specify a file to write the input kmer depth histogram" + echo "histout= Specify a file to write the output kmer depth histogram" + echo "pzc=f (printzerocoverage) Print lines in the histogram with zero coverage." + echo "histlen=1048576 Max kmer depth displayed in histogram. Also affects statistics displayed, but does not affect normalization." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +correct "$@" diff --git a/sh/getreads.sh b/sh/getreads.sh new file mode 100755 index 0000000..0d6ea92 --- /dev/null +++ b/sh/getreads.sh @@ -0,0 +1,30 @@ +#!/bin/bash + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +function tf() { + module load oracle-jdk/1.7_64bit + local CMD="java -ea -Xmx120m -cp $CP jgi.GetReads $@" + echo $CMD >&2 + $CMD +} + +function usage(){ + echo "Selects reads with designated numeric IDs." + echo "Last modified December 18, 2013." + echo "" + echo "Usage: getreads.sh in= id= out=" + echo "" + echo "The first read (or pair) has ID 0, the second read (or pair) has ID 1, etc." 
+ echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +tf "$@" \ No newline at end of file diff --git a/sh/gradesam.sh b/sh/gradesam.sh new file mode 100755 index 0000000..358ac75 --- /dev/null +++ b/sh/gradesam.sh @@ -0,0 +1,44 @@ +#!/bin/bash -l +#gradesam in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +function gradesam() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load samtools + local CMD="java -ea -Xmx200m -cp $CP align2.GradeSamFile $@" +# echo $CMD >&2 + $CMD +} + +function usage(){ + echo "Written by Brian Bushnell" + echo "Last modified November 7, 2013" + echo "" + echo "Description: Grades mapping correctness of a sam file of synthetic reads with headers generated by RandomReads3.java" + echo "" + echo "Usage: gradesam.sh in= reads=" + echo "" + echo "Parameters:" + echo "in= Specify the input sam file, or stdin." + echo "reads= Number of reads in mapper's input (i.e., the fastq file)." + echo "thresh=20 Max deviation from correct location to be considered 'loosely correct'." + echo "blasr=f Set to 't' for BLASR output; fixes extra information added to read names." + echo "ssaha2=f Set to 't' for SSAHA2 or SMALT output; fixes incorrect soft-clipped read locations." + echo "quality=3 Reads with a mapping quality of this or below will be considered ambiguously mapped." + echo "bitset=t Track read ID's to detect secondary alignments." + echo " Necessary for mappers that incorrectly output multiple primary alignments per read." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." 
+ echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +gradesam "$@" diff --git a/sh/khist.sh b/sh/khist.sh new file mode 100755 index 0000000..c5d934c --- /dev/null +++ b/sh/khist.sh @@ -0,0 +1,169 @@ +#!/bin/bash -l +#khist in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=85 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=85 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=85 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=85 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=85 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=85 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +khist() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + module load pigz + local CMD="java -ea $z -cp $CP jgi.KmerNormalize bits=32 ecc=f passes=1 keepall dr=f prefilter hist=stdout minprob=0 minqual=0 mindepth=0 minkmers=1 hashes=3 $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "This script is designed for Genepool nodes." + echo "Last modified February 12, 2014" + echo "" + echo "Description: Generates a histogram of kmer counts for the input reads or assemblies." + echo "Can also normalize, error-correct, and/or bin reads by kmer depth." 
+ echo "" + echo "Usage: khist.sh in= out= outt= hist=" + echo "" + echo "Input may be a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a file. 'out' and 'hist' are both optional." + echo "" + echo "Optional parameters (and their defaults)" + echo "" + echo "Input parameters:" + echo "in2=null Second input file for paired reads" + echo "extra=null Additional files to use for input (generating hash table) but not for output" + echo "fastareadlen=2^31 Break up FASTA reads longer than this. Can be useful when processing scaffolded genomes" + echo "tablereads=-1 Use at most this many reads when building the hashtable (-1 means all)" + echo "kmersample=1 Process every nth kmer, and skip the rest" + echo "readsample=1 Process every nth read, and skip the rest" + echo "interleaved=auto May be set to true or false to force the input read file to ovverride autodetection of the input file as paired interleaved." + echo "" + echo "Output parameters:" + echo "out= File for normalized reads" + echo "outt= (outtoss) File for reads that were excluded from primary output" + echo "reads=-1 Only process this number of reads, then quit (-1 means all)" + echo "sampleoutput=t Use sampling on output as well as input (not used if sample rates are 1)" + echo "keepall=f Set to true to keep all reads (e.g. if you just want error correction)." + echo "zerobin=f Set to true if you want kmers with a count of 0 to go in the 0 bin instead of the 1 bin in histograms." + echo " Default is false, to prevent confusion about how there can be 0-count kmers." + echo " The reason is that based on the 'minq' and 'minprob' settings, some kmers may be excluded from the bloom filter." + echo "tmpdir=$TMPDIR This will specify a directory for temp files (only needed for multipass runs). If null, they will be written to the output directory." 
+ echo "usetempdir=t Allows enabling/disabling of temporary directory; if disabled, temp files will be written to the output directory." + echo "" + echo "Hashing parameters:" + echo "k=31 Kmer length (values under 32 are most efficient, but arbitrarily high values are supported)" + echo "bits=32 Bits per cell in bloom filter; must be 2, 4, 8, 16, or 32. Maximum kmer depth recorded is 2^cbits. Automatically reduced to 16 in 2-pass." + echo " Large values decrease accuracy for a fixed amount of memory, so use the lowest number you can that will still capture highest-depth kmers." + echo "hashes=3 Number of times each kmer is hashed and stored. Higher is slower." + echo " Higher is MORE accurate if there is enough memory, and LESS accurate if there is not enough memory." + echo "prefiliter=f True is slower, but generally more accurate; filters out low-depth kmers from the main hashtable. The prefilter is more memory-efficient because it uses 2-bit cells." + echo "prehashes=2 Number of hashes for prefilter." + echo "buildpasses=1 More passes can sometimes increase accuracy by iteratively removing low-depth kmers" + echo "minq=6 Ignore kmers containing bases with quality below this" + echo "minprob=0.5 Ignore kmers with overall probability of correctness below this" + echo "threads=X Spawn exactly X hashing threads (default is number of logical processors). Total active threads may exceed X due to I/O threads." + echo "rdk=t (removeduplicatekmers) When true, a kmer's count will only be incremented once per read pair, even if that kmer occurs more than once." + echo "" + echo "Normalization parameters:" + echo "fixspikes=f (fs) Do a slower, high-precision bloom filter lookup of kmers that appear to have an abnormally high depth due to collisions." + echo "target=40 (tgt) Target normalization depth. NOTE: All depth parameters control kmer depth, not read depth." 
+ echo " For kmer depth Dk, read depth Dr, read length R, and kmer size K: Dr=Dk*(R/(R-K+1))" + echo "maxdepth=-1 (max) Reads will not be downsampled when below this depth, even if they are above the target depth." + echo "mindepth=6 (min) Kmers with depth below this number will not be included when calculating the depth of a read." + echo "minkmers=15 (mgkpr) Reads must have at least this many kmers over min depth to be retained. Aka 'mingoodkmersperread'." + echo "percentile=54.0 (dp) Read depth is by default inferred from the 54th percentile of kmer depth, but this may be changed to any number 1-100." + echo "deterministic=t (dr) Generate random numbers deterministically to ensure identical output between multiple runs. May decrease speed with a huge number of threads." + echo "passes=2 (p) 1 pass is the basic mode. 2 passes (default) allows greater accuracy, error detection, better contol of output depth." + echo "" + echo "Error detection parameters:" + echo "hdp=90.0 (highdepthpercentile) Position in sorted kmer depth array used as proxy of a read's high kmer depth." + echo "ldp=25.0 (lowdepthpercentile) Position in sorted kmer depth array used as proxy of a read's low kmer depth." + echo "tossbadreads=f (tbr) Throw away reads detected as containing errors. Only controls behavior of final pass." + echo "errordetectratio=125 (edr) Reads with a ratio of at least this much between their high and low depth kmers will be classified as error reads." + echo "highthresh=12 (ht) Threshold for high kmer. A high kmer at this or above are considered non-error." + echo "lowthresh=3 (lt) Threshold for low kmer. Kmers at this and below are always considered errors." + echo "" + echo "Error correction parameters:" + echo "ecc=f Set to true to correct errors." + echo "ecclimit=3 Correct up to this many errors per read. If more are detected, the read will remain unchanged." 
+ echo "errorcorrectratio=140 (ecr) Adjacent kmers with a depth ratio of at least this much between will be classified as an error." + echo "echighthresh=22 (echt) Threshold for high kmer. A kmer at this or above may be considered non-error." + echo "eclowthresh=2 (eclt) Threshold for low kmer. Kmers at this and below are considered errors." + echo "eccmaxqual=127 Do not correct bases with quality above this value." + echo "meo=f (markErrorsOnly) Marks errors by reducing quality value of suspected errors; does not correct anything." + echo "mue=t (markUncorrectableErrors) Marks errors only on uncorrectable reads; requires 'ecc=t'." + echo "" + echo "Depth binning parameters:" + echo "lowbindepth=10 (lbd) Cutoff for low depth bin." + echo "highbindepth=80 (hbd) Cutoff for high depth bin." + echo "outlow= Pairs in which both reads have a median below lbd go into this file." + echo "outhigh= Pairs in which both reads have a median above hbd go into this file." + echo "outmid= All other pairs go into this file." + echo "" + echo "Histogram parameters:" + echo "hist= Specify a file to write the input kmer depth histogram" + echo "histout= Specify a file to write the output kmer depth histogram" + echo "pzc=f (printzerocoverage) Print lines in the histogram with zero coverage." + echo "histlen=1048576 Max kmer depth displayed in histogram. Also affects statistics displayed, but does not affect normalization." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." 
+ echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +khist "$@" diff --git a/sh/kmercount.sh b/sh/kmercount.sh new file mode 100755 index 0000000..4a2dfb8 --- /dev/null +++ b/sh/kmercount.sh @@ -0,0 +1,123 @@ +#!/bin/bash -l +#kmercount in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + y=1 + if [ "$x" = "unlimited" ]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=86 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=85 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=85 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=85 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=85 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=92 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done + +} +calcXmx "$@" + +kmercount() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + module load pigz + local CMD="java -ea $z -cp $CP jgi.KmerCoverage prefilter=true bits=16 interleaved=false $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "Last modified December 11, 2013" + echo "This script is designed for Genepool nodes." + echo "" + echo "Usage: kmercount in= out= hist=" + echo "" + echo "Input may be stdin or a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a file. 'out' and 'hist' are both optional." 
+ echo "" + echo "Optional parameters (and their defaults)" + echo "" + echo "Input parameters:" + echo "in2=null Second input file for paired reads" + echo "extra=null Additional files to use for input (generating hash table) but not for output" + echo "fastareadlen=2^31 Break up FASTA reads longer than this. Can be useful when processing scaffolded genomes" + echo "tablereads=-1 Use at most this many reads when building the hashtable (-1 means all)" + echo "kmersample=1 Process every nth kmer, and skip the rest" + echo "readsample=1 Process every nth read, and skip the rest" + echo "" + echo "Output parameters:" + echo "hist=null Specify a file to output the depth histogram" + echo "histlen=10000 Max depth displayed on histogram" + echo "reads=-1 Only process this number of reads, then quit (-1 means all)" + echo "sampleoutput=true Use sampling on output as well as input (not used if sample rates are 1)" + echo "printcoverage=false Only print coverage information instead of reads" + echo "useheader=false Append coverage info to the read's header" + echo "minmedian=0 Don't output reads with median coverage below this" + echo "minaverage=0 Don't output reads with average coverage below this" + echo "zerobin=false Set to true if you want kmers with a count of 0 to go in the 0 bin instead of the 1 bin in histograms." + echo " Default is false, to prevent confusion about how there can be 0-count kmers." + echo " The reason is that based on the 'minq' and 'minprob' settings, some kmers may be excluded from the bloom filter." + echo "" + echo "Hashing parameters:" + echo "k=31 Kmer length (values under 32 are most efficient, but arbitrarily high values are supported)" + echo "cbits=8 Bits per cell in bloom filter; must be 2, 4, 8, 16, or 32. Maximum kmer depth recorded is 2^cbits." + echo " Large values decrease accuracy for a fixed amount of memory." + echo "hashes=4 Number of times a kmer is hashed. Higher is slower." 
+ echo " Higher is MORE accurate if there is enough memory, and LESS accurate if there is not enough memory." + echo "prefiliter=false True is slower, but generally more accurate; filters out low-depth kmers from the main hashtable." + echo "prehashes=2 Number of hashes for prefilter." + echo "passes=1 More passes can sometimes increase accuracy by iteratively removing low-depth kmers" + echo "minq=7 Ignore kmers containing bases with quality below this" + echo "minprob=0.5 Ignore kmers with overall probability of correctness below this" + echo "threads=X Spawn exactly X hashing threads (default is number of logical processors). Total active threads may exceed X by up to 4." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +kmercount "$@" diff --git a/sh/mapPacBio.sh b/sh/mapPacBio.sh new file mode 100755 index 0000000..431b927 --- /dev/null +++ b/sh/mapPacBio.sh @@ -0,0 +1,76 @@ +#!/bin/bash -l +#mapPacBio in= out= ref= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 
1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=84 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=84 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=84 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=84 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=84 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=84 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +mapPacBio() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -da $z -cp $CP align2.BBMapPacBio build=1 overwrite=true minratio=0.40 match=long fastareadlen=500 dprr=false ambiguous=best minscaf=100 startpad=4000 stoppad=4000 midpad=1000 $@" + echo $CMD >&2 + $CMD +} + +usage(){ + bash "$DIR"bbmap.sh +} + +if [ -z "$1" ]; then + usage + exit +fi + +mapPacBio "$@" diff --git a/sh/mapPacBio8k.sh b/sh/mapPacBio8k.sh new file mode 100755 index 0000000..c664480 --- /dev/null +++ b/sh/mapPacBio8k.sh @@ -0,0 +1,76 @@ +#!/bin/bash -l +#mapPacBio in= out= ref= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 
1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=84 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=84 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=84 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=84 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=84 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=84 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +mapPacBio() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -ea $z -cp $CP align2.BBMapPacBio build=1 overwrite=true minratio=0.40 match=long fastareadlen=6000 dprr=false ambiguous=best minscaf=100 startpad=10000 stoppad=10000 midpad=6000 $@" + echo $CMD >&2 + $CMD +} + +usage(){ + bash "$DIR"bbmap.sh +} + +if [ -z "$1" ]; then + usage + exit +fi + +mapPacBio "$@" diff --git a/sh/pileup.sh b/sh/pileup.sh new file mode 100755 index 0000000..2a97865 --- /dev/null +++ b/sh/pileup.sh @@ -0,0 +1,132 @@ +#!/bin/bash -l +#pileup in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 
1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=85 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=85 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=85 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=85 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=85 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=85 + #echo "ram is under 40g" + fi + mult=$(( mult-5 )) #to save room for samtools if needed. + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +pileup() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -ea $z -cp $CP jgi.SamPileup $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "Written by Brian Bushnell" + echo "Last modified December 11, 2013" + echo "" + echo "Description: Calculates per-scaffold coverage information from an unsorted sam file." + echo "" + echo "Usage: pileup.sh in= out=" + echo "" + echo "Input may be stdin or a SAM file, compressed or uncompressed." + echo "Output may be stdout or a file." + echo "" + echo "Input Parameters:" + echo "in= The input sam; this is the only required parameter." + echo "ref= Scans a reference fasta for per-scaffold GC counts, not otherwise needed." + echo "fastaorf= A fasta file with ORF header information in PRODIGAL's output format. Must also specify 'outorf'." + echo "" + echo "Output Parameters:" + echo "out= Prints per-scaffold coverage info to this file." + echo "outorf= Prints per-orf coverage info to this file (only if 'fastaorf' is specified)." + echo "twocolumn= Change to true to print only ID and Avg_fold instead of all 6 columns to the 'out=' file." + echo "outsam= Prints the input sam stream to this file (or stdout). Useful for piping data." 
+ echo "hist= Prints a histogram of # occurrences of each depth level." + echo "basecov= Prints coverage per base location (excluding bases where coverage was the same as the previous base)." + echo "bincov= Prints binned coverage per location (one line per X bases)." + echo "binsize=<1000> Set the binsize for binned coverage output." + echo "" + echo "Other parameters:" + echo "32bit= Set to true if you need per-base coverage over 64k; does not affect per-scaffold coverage precision." + echo " This option will double RAM usage (when calculating per-base coverage)." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "Output format:" + echo "ID Length Ref_GC Avg_fold Base_Coverage Read_GC" + echo "" + echo "ID: Scaffold ID" + echo "Length: Scaffold length" + echo "Ref_GC: GC ratio of reference" + echo "Avg_fold: Average fold coverage of this scaffold" + echo "Base_Coverage: Percent of scaffold with any coverage" + echo "Read_GC: Average GC ratio of reads mapped to this scaffold" + echo "" + echo "Notes:" + echo "" + echo "Only supports SAM format for reads and FASTA for reference (though either may be gzipped)." + echo "Sorting is not needed, so output may be streamed directly from a mapping program." + echo "Requires approximately 1 bit per reference base plus 100 bytes per scaffold (even if no reference is specified)." + echo "This script will attempt to autodetect and correctly specify the -Xmx parameter to use all memory on the target node." + echo "If this fails with a message including 'Error: Could not create the Java Virtual Machine.', then..." + echo "Please decrease the -Xmx parameter. It should be set to around 85% of the available memory." 
+ echo "For example, -Xmx20g needs around 23 GB of virtual (and physical) memory when qsubbed." + echo "If the program fails with a message including 'java.lang.OutOfMemoryError:', then..." + echo "-Xmx needs to be increased, which probably also means it needs to be qsubbed with a higher memory allocation." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." +} + +if [ -z "$1" ]; then + usage + exit +fi + +pileup "$@" diff --git a/sh/printtime.sh b/sh/printtime.sh new file mode 100755 index 0000000..6645392 --- /dev/null +++ b/sh/printtime.sh @@ -0,0 +1,29 @@ +#!/bin/bash -l +#printtime in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +function printtime() { + module load oracle-jdk/1.7_64bit + local CMD="java -ea -Xmx8m -cp $CP align2.PrintTime $@" + echo $CMD >&2 + $CMD +} + +function usage(){ + echo "Prints time elapsed since last called on the same file." + echo "Written by Brian Bushnell" + echo "Last modified October 24, 2013" + echo "" + echo "Usage: printtime.sh " + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." +} + +if [ -z "$1" ]; then + usage + exit +fi + +printtime "$@" diff --git a/sh/randomreads.sh b/sh/randomreads.sh new file mode 100755 index 0000000..00acd8b --- /dev/null +++ b/sh/randomreads.sh @@ -0,0 +1,90 @@ +#!/bin/bash -l + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx1g" +calcXmx () { + x=$(ulimit -v) + #echo "x=$x" + HOSTNAME=`hostname` + y=1 + if [[ $x == unlimited ]] || [[ $HOSTNAME == gpint* ]]; then + #echo "ram is unlimited" + echo "This system does not have ulimit set, so max memory cannot be determined. Attempting to use 4G." 1>&2 + echo "If this fails, please set ulimit or run this program qsubbed or from a qlogin session on Genepool." 
1>&2 + y=4 + else + mult=75; + if [ $x -ge 1000000000 ]; then + mult=85 + #echo "ram is 1000g+" + elif [ $x -ge 500000000 ]; then + mult=85 + #echo "ram is 500g+" + elif [ $x -ge 250000000 ]; then + mult=85 + #echo "ram is 250g+" + elif [ $x -ge 144000000 ]; then + mult=85 + #echo "ram is 144g+" + elif [ $x -ge 120000000 ]; then + mult=85 + #echo "ram is 120g+" + elif [ $x -ge 40000000 ]; then + mult=80 + #echo "ram is 40g+" + else + mult=85 + #echo "ram is under 40g" + fi + y=$(( ((x-500000)*mult/100)/1000000 )) + fi + #echo "y=$y" + z="-Xmx${y}g" + + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +randomreads() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + module load pigz + local CMD="java -ea $z -cp $CP align2.RandomReads3 build=1 $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "This script is designed for Genepool nodes. It will by default attempt to use all memory on the target node." + echo "Last modified December 11, 2013." + echo "" + echo "Description: Generates random synthetic reads from a reference genome. A read's name indicates its genomic location." + echo "Allows precise customization of things like insert size and synthetic mutation type, sizes, and rates." + echo "" + echo "Usage: randomreads.sh ref= out= minlen= maxlen= reads=" + echo "" + echo "Optional parameters:" + echo "paired= Set to true for paired reads." + echo "interleaved= Set to true if paired output is interleaved (rather than in two files)." + echo "build=<1> If multiple references will be used when running in the same working directory, each needs a unique build ID." + echo "replacenoref= Set to true to replace N in the reference sequence with random letters." + echo "seed= Use this to set the random number generator seed; use -1 for a random seed." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." 
+ echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." +} + +if [ -z "$1" ]; then + usage + exit +fi + +randomreads "$@" diff --git a/sh/readlength.sh b/sh/readlength.sh new file mode 100755 index 0000000..d4ae079 --- /dev/null +++ b/sh/readlength.sh @@ -0,0 +1,36 @@ +#!/bin/bash -l +#stats in= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +stats() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + local CMD="java -ea -Xmx120m -cp $CP jgi.MakeLengthHistogram $@" +# echo $CMD >&2 + $CMD +} + +usage(){ + echo "Generates a length histogram of input reads." + echo "Written by Brian Bushnell" + echo "Last modified December 11, 2013" + echo "" + echo "Usage: readlength.sh in=" + echo "" + echo "in= The 'in=' flag is needed only if the input file is not the first parameter. 'in=stdin.fq' will pipe from standard in." + echo "in2= Use this if 2nd read of pairs are in a different file." + echo "out= Write the histogram to this file. Default is stdout." + echo "bin=10 Set the histogram bin size. Default is 10." + echo "max=4000 Set the histogram's max readlength bin. Default is 4000." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." 
+} + +if [ -z "$1" ]; then + usage + exit +fi + +stats "$@" diff --git a/sh/reformat.sh b/sh/reformat.sh new file mode 100755 index 0000000..5292973 --- /dev/null +++ b/sh/reformat.sh @@ -0,0 +1,104 @@ +#!/bin/bash -l +#reformat in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx200m" +calcXmx () { + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +function reformat() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + module load samtools + local CMD="java -ea $z -cp $CP jgi.ReformatReads $@" + echo $CMD >&2 + $CMD +} + +function usage(){ + echo "Written by Brian Bushnell" + echo "Last modified December 11, 2013" + echo "" + echo "Description: Reformats reads to change ASCII quality encoding, interleaving, file format, or compression format." + echo "Optionally performs additional functions such as quality trimming, subsetting, and subsampling." + echo "Supports sam, fastq, fasta, fasta+qual, gzip, zip, and bz2." + echo "" + echo "Usage: reformat.sh in= in2= out= out2=" + echo "" + echo "in2 and out2 are for paired reads and are optional." + echo "If input is paired and there is only one output file, it will be written interleaved." + echo "Other parameters and their defaults:" + echo "" + echo "ow=f (overwrite) Overwrites files that already exist." + echo "zl=4 (ziplevel) Set compression level, 1 (low) to 9 (max)." + echo "int=f (interleaved) Determines whether INPUT file is considered interleaved." + echo "fastawrap=100 Length of lines in fasta output." + echo "fastareadlen=0 Set to a non-zero number to break fasta files into reads of at most this length." + echo "minscaf=1 Ignore fasta reads shorter than this." + echo "tuc=f (touppercase) Change lowercase letters in reads to uppercase." + echo "rcm=f (rcompmate) Reverse-complement read 2 in pairs." + echo "qin=auto ASCII offset for input quality. 
May be 33 (Sanger), 64 (Illumina), or auto." + echo "qout=auto ASCII offset for output quality. May be 33 (Sanger), 64 (Illumina), or auto (same as input)." + echo "qfin=<.qual file> Read qualities from this qual file, for the reads coming from 'in='" + echo "qfin2=<.qual file> Read qualities from this qual file, for the reads coming from 'in2='" + echo "qfout=<.qual file> Write qualities from this qual file, for the reads going to 'out='" + echo "qfout2=<.qual file> Write qualities from this qual file, for the reads coming from 'out2='" + echo "verifyinterleaved=f (vint) When true, checks a file to see if the names look paired. Prints an error message if not." + echo "tossbrokenreads=f (tbr) Discard reads that have different numbers of bases and qualities. By default this will be detected and cause a crash." + echo "" + echo "Sampling parameters:" + echo "reads=-1 Set to a positive number to only process this many INPUT reads (or pairs), then quit." + echo "samplerate=1 Randomly output only this fraction of reads; 1 means sampling is disabled." + echo "sampleseed=-1 Set to a positive number to use that prng seed for sampling (allowing deterministic sampling)." + echo "samplereadstarget=0 (srt) Exact number of OUTPUT reads (or pairs) desired." + echo "samplebasestarget=0 (sbt) Exact number of OUTPUT bases desired." + echo " Important: srt/sbt flags should not be used with stdin, samplerate, qtrim, minlength, or minavgquality." + echo "" + echo "Trimming parameters:" + echo "qtrim=f Trim read ends to remove bases with quality below minq." + echo " Values: t (trim both ends), f (neither end), r (right end only), l (left end only)." + echo "trimq=4 Trim quality threshold." + echo "minlength=20 (ml) Reads shorter than this after trimming will be discarded. Pairs will be discarded only if both are shorter." + echo "minavgquality=0 (maq) Reads with average quality (before trimming) below this will be discarded." 
+ echo "forcetrimleft=0 (ftl) If nonzero, trim left bases of the read to this position (exclusive, 0-based)." + echo "forcetrimright=0 (ftr) If nonzero, trim right bases of the read after this position (exclusive, 0-based)." + echo "" + echo "Sam file reformatting options. Note that most of these will require an indexed reference." + echo "build= Assign a genome's build id. You can index like this: bbmap.sh ref= build=1" + echo "sam=1.3 Set to 1.4 to write Sam version 1.4 cigar strings, with = and X instead of M." + echo "md=f Set to true to write MD tags." + echo "xs=f Set to 'ss', 'fs', or 'us' to write XS tags for RNAseq using secondstrand, firststrand," + echo " or unstranded libraries. Needed by Cufflinks. JGI mainly uses 'firststrand'." + echo "stoptag=t Set to true to write a tag indicating read stop location, prefixed by YS:i:" + echo "idtag=t Set to true to write a tag indicating percent identity, prefixed by YI:f:" + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. The max is typically 85% of physical memory." + echo "" + echo "Supported input formats are fastq, fasta, fast+qual, scarf, and bread (BBMap's native format)" + echo "Supported output formats are fastq, fasta, fast+qual, bread, sam, and bam (bam only if samtools is installed)" + echo "Supported compression formats are gz, zip, and bz2" + echo "To read from stdin, set 'in=stdin'. The format should be specified with an extension, like 'in=stdin.fq.gz'" + echo "To write to stdout, set 'out=stdout'. The format should be specified with an extension, like 'out=stdout.fasta'" + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." 
+} + +if [ -z "$1" ]; then + usage + exit +fi + +reformat "$@" diff --git a/sh/removehuman.sh b/sh/removehuman.sh new file mode 100755 index 0000000..38d54ee --- /dev/null +++ b/sh/removehuman.sh @@ -0,0 +1,51 @@ +#!/bin/bash -l +#removehuman in= out= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +function removehuman() { + module unload oracle-jdk + #module unload samtools + module load oracle-jdk/1.7_64bit + module load pigz + #module load samtools + local CMD="java -ea -Xmx23g -cp $CP align2.BBMap minratio=0.75 maxindel=20 bwr=0.18 bw=20 quickmatch minhits=2 outputunmapped=f path=/global/projectb/sandbox/gaag/bbtools/hg19 pigz unpigz $@" + echo $CMD >&2 + $CMD +} + +function usage(){ + echo "This script is designed for Genepool nodes. Requires at least 24GB RAM." + echo "Last modified January 29, 2014" + echo "" + echo "Description: Removes all reads that map to the human genome with at least 88% identity." + echo "" + echo "Usage: removehuman.sh in= outu=" + echo "" + echo "Input may be stdin or a fasta, fastq, or sam file, compressed or uncompressed." + echo "Output may be stdout or a fasta, fastq, or sam file." + echo "" + echo "Optional parameters (and their defaults)" + echo "" + echo "threads=auto (t) Set number of threads to use; default is number of logical processors." + echo "overwrite=t (ow) Set to false to force the program to abort rather than overwrite an existing file." + echo "interleaved=auto (int) If true, forces fastq input to be paired and interleaved." + #echo "kfilter=47 Require at least this many contiguous exact matches to the reference." + echo "trim=t Trim read ends to remove bases with quality below minq." + echo " Values: t (trim both ends), f (neither end), r (right end only), l (left end only)." + echo "untrim=t Undo the trimming after mapping." + echo "minq=4 Trim quality threshold." 
+ echo "ziplevel=2 (zl) Set to 1 (lowest) through 9 (max) to change compression level; lower compression is faster." + echo "outm= File to output the reads that mapped to human." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +removehuman "$@" diff --git a/sh/samtoroc.sh b/sh/samtoroc.sh new file mode 100755 index 0000000..aa01bdb --- /dev/null +++ b/sh/samtoroc.sh @@ -0,0 +1,57 @@ +#!/bin/bash -l +#gradesam in= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx200m" +calcXmx () { + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +samtoroc() { + module unload oracle-jdk + module unload samtools + module load oracle-jdk/1.7_64bit + module load samtools + local CMD="java -ea $z -cp $CP align2.MakeRocCurve $@" +# echo $CMD >&2 + $CMD +} + +usage(){ + echo "Written by Brian Bushnell" + echo "Last modified December 11, 2013" + echo "" + echo "Description: Creates a ROC curve from a sam file of synthetic reads with headers generated by RandomReads3.java" + echo "" + echo "Usage: samtoroc.sh in= reads=" + echo "" + echo "Parameters:" + echo "in= Specify the input sam file, or stdin." + echo "thresh=20 Max deviation from correct location to be considered 'loosely correct'." + echo "blasr=f Set to 't' for BLASR output; fixes extra information added to read names." + echo "ssaha2=f Set to 't' for SSAHA2 or SMALT output; fixes incorrect soft-clipped read locations." + echo "bitset=t Track read ID's to detect secondary alignments." + echo " Necessary for mappers that incorrectly output multiple primary alignments per read." + echo "" + echo "Java Parameters:" + echo "-Xmx This will be passed to Java to set memory usage, overriding the program's automatic memory detection." + echo " -Xmx20g will specify 20 gigs of RAM, and -Xmx200m will specify 200 megs. 
The max is typically 85% of physical memory." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +samtoroc "$@" diff --git a/sh/stats.sh b/sh/stats.sh new file mode 100755 index 0000000..be8a535 --- /dev/null +++ b/sh/stats.sh @@ -0,0 +1,70 @@ +#!/bin/bash -l +#stats in= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx120m" +calcXmx () { + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +stats() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + local CMD="java -ea $z -cp $CP jgi.AssemblyStats2 $@" +# echo $CMD >&2 + $CMD +} + +usage(){ + echo "Calculates basic statistics of assembly fasta files." + echo "Written by Brian Bushnell" + echo "Last modified December 11, 2013" + echo "" + echo "Description: Generates basic assembly statistics such as scaffold count, N50, L50, GC content, gap percent, etc." + echo "" + echo "Usage: stats.sh in=" + echo "" + echo "Parameters:" + echo "in= Specify the input fasta file, or stdin." + echo "gc= Writes ACGTN content per scaffold to a file." + echo "gchist= Filename to output scaffold gc content histogram." + echo "shist= Filename to output cumulative scaffold length histogram." + echo "gcbins=<200> Number of bins for gc histogram." + echo "n=<10> Number of contiguous Ns to signify a break between contigs." + echo "k=<13> Estimate memory usage of BBMap with this kmer length." + echo "minscaf=<0> Ignore scaffolds shorter than this." + echo "phs= (printheaderstats) Set to true to print total size of headers." + echo "pdl= (printduplicatelines) Set to true to print lines in the scaffold size table where the counts did not change." + echo "n_= This flag will prefix the terms 'contigs' and 'scaffolds' with 'n_' in formats 3-6." + echo "" + echo "format=<1 through 6> Format of the stats information." 
+ echo " format=1 uses variable units like MB and KB, and is designed for compatibility with existing tools." + echo " format=2 uses only whole numbers of bases, with no commas in numbers, and is designed for machine parsing." + echo " format=3 outputs stats in 2 rows of tab-delimited columns: a header row and a data row." + echo " format=4 is like 3 but with scaffold data only." + echo " format=5 is like 3 but with contig data only." + echo " format=6 is like 3 but the header starts with a #." + echo "" + echo "gcformat=<1 or 2> Select GC output format." + echo " gcformat=1: name start stop A C G T N GC" + echo " gcformat=2: name GC" + echo " gcformat=4: name length GC" + echo " Note that in gcformat 1, A+C+G+T=1 even when N is nonzero." + echo "" + echo "Please contact Brian Bushnell at bbushnell@lbl.gov if you encounter any problems." +} + +if [ -z "$1" ]; then + usage + exit +fi + +stats "$@" diff --git a/sh/statswrapper.sh b/sh/statswrapper.sh new file mode 100755 index 0000000..b4bbd1e --- /dev/null +++ b/sh/statswrapper.sh @@ -0,0 +1,64 @@ +#!/bin/bash -l +#stats in= + +DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )/" +CP="$DIR""current/" + +z="-Xmx200m" +calcXmx () { + for arg in "$@" + do + if [[ "$arg" == -Xmx* ]]; then + z="$arg" + fi + done +} +calcXmx "$@" + +stats() { + module unload oracle-jdk + module load oracle-jdk/1.7_64bit + local CMD="java -ea $z -cp $CP jgi.AssemblyStatsWrapper $@" + echo $CMD >&2 + $CMD +} + +usage(){ + echo "Last modified December 11, 2013." + echo "" + echo "Description: Runs stats.sh on multiple assemblies to produce one ouput line per file." + echo "" + echo "Usage: statswrapper.sh in=" + echo "" + echo "Parameters:" + echo "in= Specify the input fasta file, or stdin. For multiple files a, b, and c: 'statswrapper.sh in=a in=b in=c'." + echo " 'in=' may be omitted if this is the first arg, and asterisks may be used; e.g. statswrapper.sh *.fa" + echo "gc= Writes ACGTN content per scaffold to a file." 
+ echo "gchist= Filename to output scaffold gc content histogram." + echo "gcbins=<200> Number of bins for gc histogram." + echo "n=<10> Number of contiguous Ns to signify a break between contigs." + echo "k=<13> Estimate memory usage of BBMap with this kmer length." + echo "minscaf=<0> Ignore scaffolds shorter than this." + echo "n_= This flag will prefix the terms 'contigs' and 'scaffolds' with 'n_' in formats 3-6." + echo "" + echo "format=<1 through 6> Format of the stats information." + echo " format=1 uses variable units like MB and KB, and is designed for compatibility with existing tools." + echo " format=2 uses only whole numbers of bases, with no commas in numbers, and is designed for machine parsing." + echo " format=3 outputs stats in 2 rows of tab-delimited columns: a header row and a data row." + echo " format=4 is like 3 but with scaffold data only." + echo " format=5 is like 3 but with contig data only." + echo " format=6 is like 3 but the header starts with a #." + echo "" + echo "gcformat=<1 or 2> Select GC output format." + echo " gcformat=1: name start stop A C G T N GC" + echo " gcformat=2: name GC" + echo " Note that in gcformat 1, A+C+G+T=1 even when N is nonzero." + echo "" +} + +if [ -z "$1" ]; then + usage + exit +fi + +stats "$@"