446 lines
14 KiB
Go
446 lines
14 KiB
Go
package enmime
|
|
|
|
import (
|
|
"bufio"
|
|
"bytes"
|
|
"encoding/base64"
|
|
"io"
|
|
"io/ioutil"
|
|
"mime/quotedprintable"
|
|
"net/textproto"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/gogs/chardet"
|
|
"github.com/jhillyerd/enmime/internal/coding"
|
|
"github.com/jhillyerd/enmime/mediatype"
|
|
"github.com/pkg/errors"
|
|
)
|
|
|
|
const (
|
|
minCharsetConfidence = 85
|
|
minCharsetRuneLength = 100
|
|
)
|
|
|
|
// Part represents a node in the MIME multipart tree. The Content-Type, Disposition and File Name
|
|
// are parsed out of the header for easier access.
|
|
type Part struct {
|
|
PartID string // PartID labels this parts position within the tree.
|
|
Parent *Part // Parent of this part (can be nil.)
|
|
FirstChild *Part // FirstChild is the top most child of this part.
|
|
NextSibling *Part // NextSibling of this part.
|
|
Header textproto.MIMEHeader // Header for this Part.
|
|
|
|
Boundary string // Boundary marker used within this part.
|
|
ContentID string // ContentID header for cid URL scheme.
|
|
ContentType string // ContentType header without parameters.
|
|
ContentTypeParams map[string]string // Params, added to ContentType header.
|
|
Disposition string // Content-Disposition header without parameters.
|
|
FileName string // The file-name from disposition or type header.
|
|
FileModDate time.Time // The modification date of the file.
|
|
Charset string // The content charset encoding, may differ from charset in header.
|
|
OrigCharset string // The original content charset when a different charset was detected.
|
|
|
|
Errors []*Error // Errors encountered while parsing this part.
|
|
Content []byte // Content after decoding, UTF-8 conversion if applicable.
|
|
Epilogue []byte // Epilogue contains data following the closing boundary marker.
|
|
}
|
|
|
|
// NewPart creates a new Part object.
|
|
func NewPart(contentType string) *Part {
|
|
return &Part{
|
|
Header: make(textproto.MIMEHeader),
|
|
ContentType: contentType,
|
|
ContentTypeParams: make(map[string]string),
|
|
}
|
|
}
|
|
|
|
// AddChild adds a child part to either FirstChild or the end of the children NextSibling chain.
|
|
// The child may have siblings and children attached. This method will set the Parent field on
|
|
// child and all its siblings. Safe to call on nil.
|
|
func (p *Part) AddChild(child *Part) {
|
|
if p == child {
|
|
// Prevent paradox.
|
|
return
|
|
}
|
|
if p != nil {
|
|
if p.FirstChild == nil {
|
|
// Make it the first child.
|
|
p.FirstChild = child
|
|
} else {
|
|
// Append to sibling chain.
|
|
current := p.FirstChild
|
|
for current.NextSibling != nil {
|
|
current = current.NextSibling
|
|
}
|
|
if current == child {
|
|
// Prevent infinite loop.
|
|
return
|
|
}
|
|
current.NextSibling = child
|
|
}
|
|
}
|
|
// Update all new first-level children Parent pointers.
|
|
for c := child; c != nil; c = c.NextSibling {
|
|
if c == c.NextSibling {
|
|
// Prevent infinite loop.
|
|
return
|
|
}
|
|
c.Parent = p
|
|
}
|
|
}
|
|
|
|
// TextContent indicates whether the content is text based on its content type. This value
|
|
// determines what content transfer encoding scheme to use.
|
|
func (p *Part) TextContent() bool {
|
|
if p.ContentType == "" {
|
|
// RFC 2045: no CT is equivalent to "text/plain; charset=us-ascii"
|
|
return true
|
|
}
|
|
return strings.HasPrefix(p.ContentType, "text/") ||
|
|
strings.HasPrefix(p.ContentType, ctMultipartPrefix)
|
|
}
|
|
|
|
// setupHeaders reads the header, then populates the MIME header values for this Part.
|
|
func (p *Part) setupHeaders(r *bufio.Reader, defaultContentType string) error {
|
|
header, err := readHeader(r, p)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
p.Header = header
|
|
ctype := header.Get(hnContentType)
|
|
if ctype == "" {
|
|
if defaultContentType == "" {
|
|
p.addWarning(ErrorMissingContentType, "MIME parts should have a Content-Type header")
|
|
return nil
|
|
}
|
|
ctype = defaultContentType
|
|
}
|
|
// Parse Content-Type header.
|
|
mtype, mparams, minvalidParams, err := mediatype.Parse(ctype)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
for i := range minvalidParams {
|
|
p.addWarning(
|
|
ErrorMalformedHeader,
|
|
"Content-Type header has malformed parameter %q",
|
|
minvalidParams[i])
|
|
}
|
|
p.ContentType = mtype
|
|
// Set disposition, filename, charset if available.
|
|
p.setupContentHeaders(mparams)
|
|
p.Boundary = mparams[hpBoundary]
|
|
p.ContentID = coding.FromIDHeader(header.Get(hnContentID))
|
|
return nil
|
|
}
|
|
|
|
// setupContentHeaders uses Content-Type media params and Content-Disposition headers to populate
|
|
// the disposition, filename, and charset fields.
|
|
func (p *Part) setupContentHeaders(mediaParams map[string]string) {
|
|
// Determine content disposition, filename, character set.
|
|
disposition, dparams, _, err := mediatype.Parse(p.Header.Get(hnContentDisposition))
|
|
if err == nil {
|
|
// Disposition is optional
|
|
p.Disposition = disposition
|
|
p.FileName = coding.DecodeExtHeader(dparams[hpFilename])
|
|
}
|
|
if p.FileName == "" && mediaParams[hpName] != "" {
|
|
p.FileName = coding.DecodeExtHeader(mediaParams[hpName])
|
|
}
|
|
if p.FileName == "" && mediaParams[hpFile] != "" {
|
|
p.FileName = coding.DecodeExtHeader(mediaParams[hpFile])
|
|
}
|
|
if p.Charset == "" {
|
|
p.Charset = mediaParams[hpCharset]
|
|
}
|
|
if p.FileModDate.IsZero() {
|
|
p.FileModDate, _ = time.Parse(time.RFC822, mediaParams[hpModDate])
|
|
}
|
|
}
|
|
|
|
// convertFromDetectedCharset attempts to detect the character set for the given part, and returns
|
|
// an io.Reader that will convert from that charset to UTF-8. If the charset cannot be detected,
|
|
// this method adds a warning to the part and automatically falls back to using
|
|
// `convertFromStatedCharset` and returns the reader from that method.
|
|
func (p *Part) convertFromDetectedCharset(r io.Reader) (io.Reader, error) {
|
|
// Attempt to detect character set from part content.
|
|
var cd *chardet.Detector
|
|
switch p.ContentType {
|
|
case "text/html":
|
|
cd = chardet.NewHtmlDetector()
|
|
default:
|
|
cd = chardet.NewTextDetector()
|
|
}
|
|
|
|
buf, err := ioutil.ReadAll(r)
|
|
if err != nil {
|
|
return nil, errors.WithStack(err)
|
|
}
|
|
|
|
cs, err := cd.DetectBest(buf)
|
|
switch err {
|
|
case nil:
|
|
// Carry on
|
|
default:
|
|
p.addWarning(ErrorCharsetDeclaration, "charset could not be detected: %v", err)
|
|
}
|
|
|
|
// Restore r.
|
|
r = bytes.NewReader(buf)
|
|
|
|
if cs == nil || cs.Confidence < minCharsetConfidence || len(bytes.Runes(buf)) < minCharsetRuneLength {
|
|
// Low confidence or not enough characters, use declared character set.
|
|
return p.convertFromStatedCharset(r), nil
|
|
}
|
|
|
|
// Confidence exceeded our threshold, use detected character set.
|
|
if p.Charset != "" && !strings.EqualFold(cs.Charset, p.Charset) {
|
|
p.addWarning(ErrorCharsetDeclaration,
|
|
"declared charset %q, detected %q, confidence %d",
|
|
p.Charset, cs.Charset, cs.Confidence)
|
|
}
|
|
|
|
if reader, err := coding.NewCharsetReader(cs.Charset, r); err == nil {
|
|
r = reader
|
|
p.OrigCharset = p.Charset
|
|
p.Charset = cs.Charset
|
|
}
|
|
|
|
return r, nil
|
|
}
|
|
|
|
// convertFromStatedCharset returns a reader that will convert from the charset specified for the
|
|
// current `*Part` to UTF-8. In case of error, or an unhandled character set, a warning will be
|
|
// added to the `*Part` and the original io.Reader will be returned.
|
|
func (p *Part) convertFromStatedCharset(r io.Reader) io.Reader {
|
|
if p.Charset == "" {
|
|
// US-ASCII. Just read.
|
|
return r
|
|
}
|
|
|
|
reader, err := coding.NewCharsetReader(p.Charset, r)
|
|
if err != nil {
|
|
// Failed to get a conversion reader.
|
|
p.addWarning(ErrorCharsetConversion, "failed to get reader for charset %q: %v", p.Charset, err)
|
|
} else {
|
|
return reader
|
|
}
|
|
|
|
// Try to parse charset again here to see if we can salvage some badly formed
|
|
// ones like charset="charset=utf-8".
|
|
charsetp := strings.Split(p.Charset, "=")
|
|
if strings.EqualFold(charsetp[0], "charset") && len(charsetp) > 1 ||
|
|
strings.EqualFold(charsetp[0], "iso") && len(charsetp) > 1 {
|
|
p.Charset = charsetp[1]
|
|
reader, err = coding.NewCharsetReader(p.Charset, r)
|
|
if err != nil {
|
|
// Failed to get a conversion reader.
|
|
p.addWarning(ErrorCharsetConversion, "failed to get reader for charset %q: %v", p.Charset, err)
|
|
} else {
|
|
return reader
|
|
}
|
|
}
|
|
|
|
return r
|
|
}
|
|
|
|
// decodeContent performs transport decoding (base64, quoted-printable) and charset decoding,
|
|
// placing the result into Part.Content. IO errors will be returned immediately; other errors
|
|
// and warnings will be added to Part.Errors.
|
|
func (p *Part) decodeContent(r io.Reader) error {
|
|
// contentReader will point to the end of the content decoding pipeline.
|
|
contentReader := r
|
|
// b64cleaner aggregates errors, must maintain a reference to it to get them later.
|
|
var b64cleaner *coding.Base64Cleaner
|
|
// Build content decoding reader.
|
|
encoding := p.Header.Get(hnContentEncoding)
|
|
validEncoding := true
|
|
switch strings.ToLower(encoding) {
|
|
case cteQuotedPrintable:
|
|
contentReader = coding.NewQPCleaner(contentReader)
|
|
contentReader = quotedprintable.NewReader(contentReader)
|
|
case cteBase64:
|
|
b64cleaner = coding.NewBase64Cleaner(contentReader)
|
|
contentReader = base64.NewDecoder(base64.RawStdEncoding, b64cleaner)
|
|
case cte8Bit, cte7Bit, cteBinary, "":
|
|
// No decoding required.
|
|
default:
|
|
// Unknown encoding.
|
|
validEncoding = false
|
|
p.addWarning(
|
|
ErrorContentEncoding,
|
|
"Unrecognized Content-Transfer-Encoding type %q",
|
|
encoding)
|
|
}
|
|
// Build charset decoding reader.
|
|
if validEncoding && strings.HasPrefix(p.ContentType, "text/") {
|
|
var err error
|
|
contentReader, err = p.convertFromDetectedCharset(contentReader)
|
|
if err != nil {
|
|
return p.base64CorruptInputCheck(err)
|
|
}
|
|
}
|
|
// Decode and store content.
|
|
content, err := ioutil.ReadAll(contentReader)
|
|
if err != nil {
|
|
return p.base64CorruptInputCheck(errors.WithStack(err))
|
|
}
|
|
p.Content = content
|
|
// Collect base64 errors.
|
|
if b64cleaner != nil {
|
|
for _, err := range b64cleaner.Errors {
|
|
p.addWarning(ErrorMalformedBase64, err.Error())
|
|
}
|
|
}
|
|
// Set empty content-type error.
|
|
if p.ContentType == "" {
|
|
p.addWarning(
|
|
ErrorMissingContentType, "content-type is empty for part id: %s", p.PartID)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// base64CorruptInputCheck will avoid fatal failure on corrupt base64 input
|
|
//
|
|
// This is a switch on errors.Cause(err).(type) for base64.CorruptInputError
|
|
func (p *Part) base64CorruptInputCheck(err error) error {
|
|
switch errors.Cause(err).(type) {
|
|
case base64.CorruptInputError:
|
|
p.Content = nil
|
|
p.addError(ErrorMalformedBase64, err.Error())
|
|
return nil
|
|
default:
|
|
return err
|
|
}
|
|
}
|
|
|
|
// Clone returns a clone of the current Part.
|
|
func (p *Part) Clone(parent *Part) *Part {
|
|
if p == nil {
|
|
return nil
|
|
}
|
|
|
|
newPart := &Part{
|
|
PartID: p.PartID,
|
|
Header: p.Header,
|
|
Parent: parent,
|
|
Boundary: p.Boundary,
|
|
ContentID: p.ContentID,
|
|
ContentType: p.ContentType,
|
|
Disposition: p.Disposition,
|
|
FileName: p.FileName,
|
|
Charset: p.Charset,
|
|
Errors: p.Errors,
|
|
Content: p.Content,
|
|
Epilogue: p.Epilogue,
|
|
}
|
|
newPart.FirstChild = p.FirstChild.Clone(newPart)
|
|
newPart.NextSibling = p.NextSibling.Clone(parent)
|
|
|
|
return newPart
|
|
}
|
|
|
|
// ReadParts reads a MIME document from the provided reader and parses it into tree of Part objects.
|
|
func ReadParts(r io.Reader) (*Part, error) {
|
|
return defaultParser.ReadParts(r)
|
|
}
|
|
|
|
// ReadParts reads a MIME document from the provided reader and parses it into tree of Part objects.
|
|
func (p Parser) ReadParts(r io.Reader) (*Part, error) {
|
|
br := bufio.NewReader(r)
|
|
root := &Part{PartID: "0"}
|
|
// Read header; top-level default CT is text/plain us-ascii according to RFC 822.
|
|
err := root.setupHeaders(br, `text/plain; charset="us-ascii"`)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
|
|
if detectMultipartMessage(root, p.multipartWOBoundaryAsSinglePart) {
|
|
// Content is multipart, parse it.
|
|
err = parseParts(root, br, p.skipMalformedParts)
|
|
if err != nil {
|
|
return nil, err
|
|
}
|
|
} else {
|
|
// Content is text or data, decode it.
|
|
if err := root.decodeContent(br); err != nil {
|
|
return nil, err
|
|
}
|
|
}
|
|
return root, nil
|
|
}
|
|
|
|
// parseParts recursively parses a MIME multipart document and sets each Parts PartID.
|
|
func parseParts(parent *Part, reader *bufio.Reader, skipMalformedParts bool) error {
|
|
firstRecursion := parent.Parent == nil
|
|
// Loop over MIME boundaries.
|
|
br := newBoundaryReader(reader, parent.Boundary)
|
|
for indexPartID := 1; true; indexPartID++ {
|
|
next, err := br.Next()
|
|
if err != nil && errors.Cause(err) != io.EOF {
|
|
return err
|
|
}
|
|
if br.unbounded {
|
|
parent.addWarning(ErrorMissingBoundary, "Boundary %q was not closed correctly",
|
|
parent.Boundary)
|
|
}
|
|
if !next {
|
|
break
|
|
}
|
|
p := &Part{}
|
|
// Set this Part's PartID, indicating its position within the MIME Part tree.
|
|
if firstRecursion {
|
|
p.PartID = strconv.Itoa(indexPartID)
|
|
} else {
|
|
p.PartID = parent.PartID + "." + strconv.Itoa(indexPartID)
|
|
}
|
|
// Look for part header.
|
|
bbr := bufio.NewReader(br)
|
|
if err = p.setupHeaders(bbr, ""); err != nil {
|
|
if skipMalformedParts {
|
|
parent.addError(ErrorMalformedChildPart, "read header: %s", err.Error())
|
|
continue
|
|
}
|
|
|
|
return err
|
|
}
|
|
// Insert this Part into the MIME tree.
|
|
if p.Boundary == "" {
|
|
// Content is text or data, decode it.
|
|
if err = p.decodeContent(bbr); err != nil {
|
|
if skipMalformedParts {
|
|
parent.addError(ErrorMalformedChildPart, "decode content: %s", err.Error())
|
|
continue
|
|
}
|
|
return err
|
|
}
|
|
parent.AddChild(p)
|
|
continue
|
|
}
|
|
|
|
parent.AddChild(p)
|
|
// Content is another multipart.
|
|
if err = parseParts(p, bbr, skipMalformedParts); err != nil {
|
|
if skipMalformedParts {
|
|
parent.addError(ErrorMalformedChildPart, "parse parts: %s", err.Error())
|
|
continue
|
|
}
|
|
return err
|
|
}
|
|
}
|
|
// Store any content following the closing boundary marker into the epilogue.
|
|
epilogue, err := ioutil.ReadAll(reader)
|
|
if err != nil {
|
|
return errors.WithStack(err)
|
|
}
|
|
parent.Epilogue = epilogue
|
|
// If a Part is "multipart/" Content-Type, it will have .0 appended to its PartID
|
|
// i.e. it is the root of its MIME Part subtree.
|
|
if !firstRecursion {
|
|
parent.PartID += ".0"
|
|
}
|
|
return nil
|
|
}
|