improve message parsing: allow bare carriage return (unless in pedant…

…ic mode), allow empty header, and no longer treat a message with only headers as a message with only a body
mjl- · Aug 11, 2023 · 48eb530 · 48eb530
1 parent 79d0618
commit 48eb530
Show file tree

Hide file tree

Showing 10 changed files with 130 additions and 80 deletions.
diff --git a/ctl.go b/ctl.go
@@ -327,21 +327,16 @@ func servectlcmd(ctx context.Context, ctl *ctl, shutdown func()) {
 				log.Check(err, "closing temporary message file")
 			}
 		}()
-		mw := &message.Writer{Writer: msgFile}
+		mw := message.NewWriter(msgFile)
 		ctl.xwriteok()
 
 		ctl.xstreamto(mw)
 		err = msgFile.Sync()
 		ctl.xcheck(err, "syncing message to storage")
-		msgPrefix := []byte{}
-		if !mw.HaveHeaders {
-			msgPrefix = []byte("\r\n\r\n")
-		}
 
 		m := &store.Message{
-			Received:  time.Now(),
-			Size:      int64(len(msgPrefix)) + mw.Size,
-			MsgPrefix: msgPrefix,
+			Received: time.Now(),
+			Size:     mw.Size,
 		}
 
 		a.WithWLock(func() {

diff --git a/imapserver/server.go b/imapserver/server.go
@@ -2683,7 +2683,7 @@ func (c *conn) cmdAppend(tag, cmd string, p *parser) {
 		}
 	}()
 	defer c.xtrace(mlog.LevelTracedata)()
-	mw := &message.Writer{Writer: msgFile}
+	mw := message.NewWriter(msgFile)
 	msize, err := io.Copy(mw, io.LimitReader(c.br, size))
 	c.xtrace(mlog.LevelTrace) // Restore.
 	if err != nil {
@@ -2693,11 +2693,6 @@ func (c *conn) cmdAppend(tag, cmd string, p *parser) {
 	if msize != size {
 		xserverErrorf("read %d bytes for message, expected %d (%w)", msize, size, errIO)
 	}
-	msgPrefix := []byte{}
-	// todo: should we treat the message as body? i believe headers are required in messages, and bodies are optional. so would make more sense to treat the data as headers. perhaps only if the headers are valid?
-	if !mw.HaveHeaders {
-		msgPrefix = []byte("\r\n")
-	}
 
 	if utf8 {
 		line := c.readline(false)
@@ -2736,8 +2731,7 @@ func (c *conn) cmdAppend(tag, cmd string, p *parser) {
 				Received:      tm,
 				Flags:         storeFlags,
 				Keywords:      keywords,
-				Size:          size + int64(len(msgPrefix)),
-				MsgPrefix:     msgPrefix,
+				Size:          size,
 			}
 
 			mb.Add(m.MailboxCounts())

diff --git a/imapserver/status_test.go b/imapserver/status_test.go
@@ -25,10 +25,10 @@ func TestStatus(t *testing.T) {
 	// Again, now with a message in the mailbox.
 	tc.transactf("ok", "append inbox {4+}\r\ntest")
 	tc.transactf("ok", "status inbox (messages uidnext uidvalidity unseen deleted size recent appendlimit)")
-	tc.xuntagged(imapclient.UntaggedStatus{Mailbox: "Inbox", Attrs: map[string]int64{"MESSAGES": 1, "UIDVALIDITY": 1, "UIDNEXT": 2, "UNSEEN": 1, "DELETED": 0, "SIZE": 6, "RECENT": 0, "APPENDLIMIT": 0}})
+	tc.xuntagged(imapclient.UntaggedStatus{Mailbox: "Inbox", Attrs: map[string]int64{"MESSAGES": 1, "UIDVALIDITY": 1, "UIDNEXT": 2, "UNSEEN": 1, "DELETED": 0, "SIZE": 4, "RECENT": 0, "APPENDLIMIT": 0}})
 
 	tc.client.Select("inbox")
 	tc.client.StoreFlagsSet("1", true, `\Deleted`)
 	tc.transactf("ok", "status inbox (messages uidnext uidvalidity unseen deleted size recent appendlimit)")
-	tc.xuntagged(imapclient.UntaggedStatus{Mailbox: "Inbox", Attrs: map[string]int64{"MESSAGES": 1, "UIDVALIDITY": 1, "UIDNEXT": 2, "UNSEEN": 1, "DELETED": 1, "SIZE": 6, "RECENT": 0, "APPENDLIMIT": 0}})
+	tc.xuntagged(imapclient.UntaggedStatus{Mailbox: "Inbox", Attrs: map[string]int64{"MESSAGES": 1, "UIDVALIDITY": 1, "UIDNEXT": 2, "UNSEEN": 1, "DELETED": 1, "SIZE": 4, "RECENT": 0, "APPENDLIMIT": 0}})
 }
diff --git a/message/part.go b/message/part.go
@@ -4,7 +4,6 @@ package message
 // todo: allow more invalid content-type values, we now stop parsing on: empty media type (eg "content-type: ; name=..."), empty value for property (eg "charset=", missing quotes for characters that should be quoted (eg boundary containing "=" but without quotes), duplicate properties (two charsets), empty pairs (eg "text/html;;").
 // todo: what should our max line length be? rfc says 1000. messages exceed that. we should enforce 1000 for outgoing messages.
 // todo: should we be forgiving when closing boundary in multipart message is missing? seems like spam messages do this...
-// todo: allow bare \r (without \n)? this does happen in messages.
 // todo: should we allow base64 messages where a line starts with a space? and possibly more whitespace. is happening in messages. coreutils base64 accepts it, encoding/base64 does not.
 // todo: handle comments in headers?
 // todo: should we just always store messages with \n instead of \r\n? \r\n seems easier for use with imap.
@@ -45,7 +44,8 @@ var (
 	errLineTooLong            = errors.New("line too long")
 	errMissingBoundaryParam   = errors.New("missing/empty boundary content-type parameter")
 	errMissingClosingBoundary = errors.New("eof without closing boundary")
-	errHalfLineSep            = errors.New("invalid CR or LF without the other")
+	errBareLF                 = errors.New("invalid bare line feed")
+	errBareCR                 = errors.New("invalid bare carriage return")
 	errUnexpectedEOF          = errors.New("unexpected eof")
 )
 
@@ -269,8 +269,12 @@ func newPart(r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
 	hb := &bytes.Buffer{}
 	for {
 		line, _, err := b.ReadLine(true)
+		if err == io.EOF {
+			// No body is valid.
+			break
+		}
 		if err != nil {
-			return p, err
+			return p, fmt.Errorf("reading header line: %w", err)
 		}
 		hb.Write(line)
 		if len(line) == 2 {
@@ -279,13 +283,18 @@ func newPart(r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
 	}
 	p.BodyOffset = b.offset
 
-	h, err := parseHeader(hb)
-	if err != nil {
-		return p, fmt.Errorf("parsing header: %w", err)
+	// Don't attempt to parse empty header, mail.ReadMessage doesn't like it.
+	if p.HeaderOffset == p.BodyOffset {
+		p.header = textproto.MIMEHeader{}
+	} else {
+		h, err := parseHeader(hb)
+		if err != nil {
+			return p, fmt.Errorf("parsing header: %w", err)
+		}
+		p.header = h
 	}
-	p.header = h
 
-	ct := h.Get("Content-Type")
+	ct := p.header.Get("Content-Type")
 	mt, params, err := mime.ParseMediaType(ct)
 	if err != nil && ct != "" {
 		return p, fmt.Errorf("%w: %s: %q", ErrBadContentType, err, ct)
@@ -300,12 +309,12 @@ func newPart(r io.ReaderAt, offset int64, parent *Part) (p Part, rerr error) {
 		p.ContentTypeParams = params
 	}
 
-	p.ContentID = h.Get("Content-Id")
-	p.ContentDescription = h.Get("Content-Description")
-	p.ContentTransferEncoding = strings.ToUpper(h.Get("Content-Transfer-Encoding"))
+	p.ContentID = p.header.Get("Content-Id")
+	p.ContentDescription = p.header.Get("Content-Description")
+	p.ContentTransferEncoding = strings.ToUpper(p.header.Get("Content-Transfer-Encoding"))
 
 	if parent == nil {
-		p.Envelope, err = parseEnvelope(mail.Header(h))
+		p.Envelope, err = parseEnvelope(mail.Header(p.header))
 		if err != nil {
 			return p, err
 		}
@@ -347,6 +356,10 @@ func (p *Part) Header() (textproto.MIMEHeader, error) {
 	if p.header != nil {
 		return p.header, nil
 	}
+	if p.HeaderOffset == p.BodyOffset {
+		p.header = textproto.MIMEHeader{}
+		return p.header, nil
+	}
 	h, err := parseHeader(p.HeaderReader())
 	p.header = h
 	return h, err
@@ -357,6 +370,7 @@ func (p *Part) HeaderReader() io.Reader {
 	return io.NewSectionReader(p.r, p.HeaderOffset, p.BodyOffset-p.HeaderOffset)
 }
 
+// parse a header, only call this on non-empty input (even though that is a valid header).
 func parseHeader(r io.Reader) (textproto.MIMEHeader, error) {
 	// We read using mail.ReadMessage instead of textproto.ReadMIMEHeaders because the
 	// first handles email messages properly, while the second only works for HTTP
@@ -444,7 +458,7 @@ func parseAddressList(h mail.Header, k string) []Address {
 
 // ParseNextPart parses the next (sub)part of this multipart message.
 // ParseNextPart returns io.EOF and a nil part when there are no more parts.
-// Only use for initial parsing of message. Once parsed, use p.Parts.
+// Only used for initial parsing of message. Once parsed, use p.Parts.
 func (p *Part) ParseNextPart() (*Part, error) {
 	if len(p.bound) == 0 {
 		return nil, errNotMultipart
@@ -602,14 +616,15 @@ func (p *Part) RawReader() io.Reader {
 	}
 	p.RawLineCount = 0
 	if p.parent == nil {
-		return &offsetReader{p, p.BodyOffset, true}
+		return &offsetReader{p, p.BodyOffset, true, false}
 	}
-	return &boundReader{p: p, b: &bufAt{r: p.r, offset: p.BodyOffset}, lastnewline: true}
+	return &boundReader{p: p, b: &bufAt{r: p.r, offset: p.BodyOffset}, prevlf: true}
 }
 
 // bufAt is a buffered reader on an underlying ReaderAt.
+// bufAt verifies that lines end with crlf.
 type bufAt struct {
-	offset int64 // Offset in r currently consumed, i.e. ignoring any buffered data.
+	offset int64 // Offset in r currently consumed, i.e. not including any buffered data.
 
 	r       io.ReaderAt
 	buf     []byte // Buffered data.
@@ -649,7 +664,7 @@ func (b *bufAt) ensure() error {
 }
 
 // ReadLine reads a line until \r\n is found, returning the line including \r\n.
-// If not found, or a single \r or \n is encountered, ReadLine returns an error, e.g. io.EOF.
+// If not found, or a bare \n is encountered, or a bare \r is enountered in pedantic mode, ReadLine returns an error.
 func (b *bufAt) ReadLine(requirecrlf bool) (buf []byte, crlf bool, err error) {
 	return b.line(true, requirecrlf)
 }
@@ -664,14 +679,18 @@ func (b *bufAt) line(consume, requirecrlf bool) (buf []byte, crlf bool, err erro
 	}
 	for i, c := range b.buf[:b.nbuf] {
 		if c == '\n' {
-			return nil, false, errHalfLineSep
+			// Should have seen a \r, which should have been handled below.
+			return nil, false, errBareLF
 		}
 		if c != '\r' {
 			continue
 		}
 		i++
 		if i >= b.nbuf || b.buf[i] != '\n' {
-			return nil, false, errHalfLineSep
+			if moxvar.Pedantic {
+				return nil, false, errBareCR
+			}
+			continue
 		}
 		b.scratch = b.scratch[:i+1]
 		copy(b.scratch, b.buf[:i+1])
@@ -708,10 +727,13 @@ func (b *bufAt) PeekByte() (byte, error) {
 	return b.buf[0], nil
 }
 
+// offsetReader reads from p.r starting from offset, and RawLineCount on p.
+// offsetReader validates lines end with \r\n.
 type offsetReader struct {
-	p           *Part
-	offset      int64
-	lastnewline bool
+	p      *Part
+	offset int64
+	prevlf bool
+	prevcr bool
 }
 
 func (r *offsetReader) Read(buf []byte) (int, error) {
@@ -720,10 +742,18 @@ func (r *offsetReader) Read(buf []byte) (int, error) {
 		r.offset += int64(n)
 
 		for _, c := range buf[:n] {
-			if r.lastnewline {
+			if r.prevlf {
 				r.p.RawLineCount++
 			}
-			r.lastnewline = c == '\n'
+			if err == nil || err == io.EOF {
+				if c == '\n' && !r.prevcr {
+					err = errBareLF
+				} else if c != '\n' && r.prevcr && moxvar.Pedantic {
+					err = errBareCR
+				}
+			}
+			r.prevlf = c == '\n'
+			r.prevcr = c == '\r'
 		}
 	}
 	if err == io.EOF {
@@ -735,24 +765,34 @@ func (r *offsetReader) Read(buf []byte) (int, error) {
 var crlf = []byte("\r\n")
 
 // boundReader is a reader that stops at a closing multipart boundary.
+// boundReader ensures lines end with crlf through its use of bufAt.
 type boundReader struct {
-	p           *Part
-	b           *bufAt
-	buf         []byte // Data from previous line, to be served first.
-	nbuf        int    // Number of valid bytes in buf.
-	crlf        []byte // Possible crlf, to be returned if we do not yet encounter a boundary.
-	lastnewline bool   // If last char return was a newline. For counting lines.
+	p      *Part
+	b      *bufAt
+	buf    []byte // Data from previous line, to be served first.
+	nbuf   int    // Number of valid bytes in buf.
+	crlf   []byte // Possible crlf, to be returned if we do not yet encounter a boundary.
+	prevlf bool   // If last char returned was a newline. For counting lines.
+	prevcr bool
 }
 
 func (b *boundReader) Read(buf []byte) (count int, rerr error) {
 	origBuf := buf
 	defer func() {
 		if count > 0 {
 			for _, c := range origBuf[:count] {
-				if b.lastnewline {
+				if b.prevlf {
 					b.p.RawLineCount++
 				}
-				b.lastnewline = c == '\n'
+				if rerr == nil || rerr == io.EOF {
+					if c == '\n' && !b.prevcr {
+						rerr = errBareLF
+					} else if c != '\n' && b.prevcr && moxvar.Pedantic {
+						rerr = errBareCR
+					}
+				}
+				b.prevlf = c == '\n'
+				b.prevcr = c == '\r'
 			}
 		}
 	}()

diff --git a/message/part_test.go b/message/part_test.go
@@ -10,6 +10,8 @@ import (
 	"reflect"
 	"strings"
 	"testing"
+
+	"github.com/mjl-/mox/moxvar"
 )
 
 func tcheck(t *testing.T, err error, msg string) {
@@ -204,11 +206,29 @@ func TestLongLine(t *testing.T) {
 }
 
 func TestHalfCrLf(t *testing.T) {
-	_, err := Parse(strings.NewReader("test\rtest"))
-	tfail(t, err, errHalfLineSep)
-
-	_, err = Parse(strings.NewReader("test\ntest"))
-	tfail(t, err, errHalfLineSep)
+	parse := func(s string) error {
+		p, err := Parse(strings.NewReader(s))
+		if err != nil {
+			return err
+		}
+		return p.Walk(nil)
+	}
+	err := parse("subject: test\ntest\r\n")
+	tfail(t, err, errBareLF)
+	err = parse("\r\ntest\ntest\r\n")
+	tfail(t, err, errBareLF)
+
+	moxvar.Pedantic = true
+	err = parse("subject: test\rtest\r\n")
+	tfail(t, err, errBareCR)
+	err = parse("\r\ntest\rtest\r\n")
+	tfail(t, err, errBareCR)
+	moxvar.Pedantic = false
+
+	err = parse("subject: test\rtest\r\n")
+	tcheck(t, err, "header with bare cr")
+	err = parse("\r\ntest\rtest\r\n")
+	tcheck(t, err, "body with bare cr")
 }
 
 func TestMissingClosingBoundary(t *testing.T) {

diff --git a/message/writer.go b/message/writer.go
@@ -7,17 +7,24 @@ import (
 // Writer is a write-through helper, collecting properties about the written
 // message.
 type Writer struct {
-	Writer      io.Writer
-	HaveHeaders bool
-	Has8bit     bool // Whether a byte with the high/8bit has been read. So whether this is 8BITMIME instead of 7BIT.
-	Size        int64
-	tail        [3]byte // For detecting crlfcrlf.
+	writer io.Writer
+
+	HaveBody bool // Body is optional. ../rfc/5322:343
+	Has8bit  bool // Whether a byte with the high/8bit has been read. So whether this is 8BITMIME instead of 7BIT.
+	Size     int64
+
+	tail [3]byte // For detecting header/body-separating crlf.
 	// todo: should be parsing headers here, as we go
 }
 
+func NewWriter(w io.Writer) *Writer {
+	// Pretend we already saw \r\n, for handling empty header.
+	return &Writer{writer: w, tail: [3]byte{0, '\r', '\n'}}
+}
+
 // Write implements io.Writer.
 func (w *Writer) Write(buf []byte) (int, error) {
-	if !w.HaveHeaders && len(buf) > 0 {
+	if !w.HaveBody && len(buf) > 0 {
 		get := func(i int) byte {
 			if i < 0 {
 				return w.tail[3+i]
@@ -27,7 +34,7 @@ func (w *Writer) Write(buf []byte) (int, error) {
 
 		for i, b := range buf {
 			if b == '\n' && get(i-3) == '\r' && get(i-2) == '\n' && get(i-1) == '\r' {
-				w.HaveHeaders = true
+				w.HaveBody = true
 				break
 			}
 		}
@@ -47,7 +54,7 @@ func (w *Writer) Write(buf []byte) (int, error) {
 			}
 		}
 	}
-	n, err := w.Writer.Write(buf)
+	n, err := w.writer.Write(buf)
 	if n > 0 {
 		w.Size += int64(n)
 	}