From d940c2e57e0735c6acd3b8b9c9fe4b5b125ccb4a Mon Sep 17 00:00:00 2001 From: Wim Dumon Date: Tue, 13 Jun 2023 10:44:14 +0200 Subject: [PATCH] Fix for 2 bugs in the fastq reader: MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - \r\n handling may cause reading a byte past end of buffer, parser fails - checking end-of-file condition can only be reliably done after a call to getLine() returns NULL. One particular case is that some gzip files contain empty gzip blocks at the end of the file, which can't be predicted by the current eof() code Tested with files provided in issue #491. This reverts commit 0ee1b3b3af1c2c890b063261b2ff55305bb220f0, "fix a regression bug of FASTQ reader" --- src/fastqreader.cpp | 49 ++++++++++++++++++++++++++++++++------------- src/fastqreader.h | 1 + 2 files changed, 36 insertions(+), 14 deletions(-) diff --git a/src/fastqreader.cpp b/src/fastqreader.cpp index e65ea21..7326179 100644 --- a/src/fastqreader.cpp +++ b/src/fastqreader.cpp @@ -33,6 +33,7 @@ SOFTWARE. 
FastqReader::FastqReader(string filename, bool hasQuality, bool phred64){ mFilename = filename; + mSkipNewline = false; mZipped = false; mFile = NULL; mStdinMode = false; @@ -223,6 +224,15 @@ void FastqReader::getLine(string* line){ int end = start; while(end < mBufDataLen) { + // May still need to skip \n from a \r\n pair + if (mSkipNewline) { + mSkipNewline = false; + if (mFastqBuf[end] == '\n') { + start++; + end++; + continue; + } + } if(mFastqBuf[end] != '\r' && mFastqBuf[end] != '\n') end++; else @@ -236,9 +246,9 @@ void FastqReader::getLine(string* line){ // skip \n or \r end++; - // handle \r\n - if(end < mBufDataLen-1 && mFastqBuf[end-1]=='\r' && mFastqBuf[end] == '\n') - end++; + // handle \r\n - not now because we may be at end of buffer + if(mFastqBuf[end-1]=='\r') + mSkipNewline = true; mBufUsedLen = end; @@ -252,7 +262,18 @@ void FastqReader::getLine(string* line){ readToBuf(); start = 0; end = 0; + + while(end < mBufDataLen) { + // May still need to skip \n from a \r\n pair + if (mSkipNewline) { + mSkipNewline = false; + if (mFastqBuf[end] == '\n') { + start++; + end++; + continue; + } + } if(mFastqBuf[end] != '\r' && mFastqBuf[end] != '\n') end++; else @@ -265,9 +286,9 @@ void FastqReader::getLine(string* line){ // skip \n or \r end++; - // handle \r\n - if(end < mBufDataLen-1 && mFastqBuf[end] == '\n') - end++; + // handle \r\n - not now because we may be at end of buffer + if(mFastqBuf[end-1] == '\r') + mSkipNewline = true; mBufUsedLen = end; return; @@ -280,10 +301,6 @@ void FastqReader::getLine(string* line){ } Read* FastqReader::read(){ - if(mBufUsedLen >= mBufDataLen && bufferFinished()) { - return NULL; - } - string* name; string* sequence; string* strand; @@ -306,12 +323,16 @@ Read* FastqReader::read(){ } getLine(name); + if(name->empty() && mBufUsedLen >= mBufDataLen && bufferFinished()) { + // EOF is triggered only after reading past end of file; that + // can happen at the start of a new read. 
+ return NULL; + } // name should start with @ - while((name->empty() && !(mBufUsedLen >= mBufDataLen && bufferFinished())) || (!name->empty() && (*name)[0]!='@')){ - getLine(name); + if (name->empty() || (*name)[0]!='@') { + cerr << *name << endl; + error_exit("Read name line should start with '@'"); } - if(name->empty()) - return NULL; getLine(sequence); getLine(strand); diff --git a/src/fastqreader.h b/src/fastqreader.h index fe65dca..c6a25b3 100644 --- a/src/fastqreader.h +++ b/src/fastqreader.h @@ -65,6 +65,7 @@ class FastqReader{ private: string mFilename; + bool mSkipNewline; struct isal_gzip_header mGzipHeader; struct inflate_state mGzipState; unsigned char *mGzipInputBuffer;