Files
postmoogle/vendor/github.com/jhillyerd/enmime/part.go
2022-11-16 12:08:51 +02:00

446 lines
14 KiB
Go

package enmime
import (
	"bufio"
	"bytes"
	"encoding/base64"
	"io"
	"io/ioutil"
	"mime/quotedprintable"
	"net/textproto"
	"strconv"
	"strings"
	"time"
	"unicode/utf8"

	"github.com/gogs/chardet"
	"github.com/jhillyerd/enmime/internal/coding"
	"github.com/jhillyerd/enmime/mediatype"
	"github.com/pkg/errors"
)
// Thresholds consulted by convertFromDetectedCharset when deciding whether to trust the
// automatically detected charset over the declared one.
const (
// minCharsetConfidence is the lowest detector confidence at which the detected charset is used;
// below it the declared charset is used instead.
minCharsetConfidence = 85
// minCharsetRuneLength is the minimum number of runes the content must hold for the detected
// charset to be used; shorter content falls back to the declared charset.
minCharsetRuneLength = 100
)
// Part represents a node in the MIME multipart tree. The Content-Type, Disposition and File Name
// are parsed out of the header for easier access.
//
// Parts form a tree: the children of a part are reached through FirstChild and then by following
// each child's NextSibling pointer. AddChild maintains these links together with Parent.
type Part struct {
PartID string // PartID labels this part's position within the tree.
Parent *Part // Parent of this part (may be nil).
FirstChild *Part // FirstChild is the top most child of this part.
NextSibling *Part // NextSibling of this part.
Header textproto.MIMEHeader // Header for this Part.
Boundary string // Boundary marker used within this part.
ContentID string // ContentID header for cid URL scheme.
ContentType string // ContentType header without parameters.
ContentTypeParams map[string]string // Params, added to ContentType header.
Disposition string // Content-Disposition header without parameters.
FileName string // The file-name from disposition or type header.
FileModDate time.Time // The modification date of the file.
Charset string // The content charset encoding, may differ from charset in header.
OrigCharset string // The original content charset when a different charset was detected.
Errors []*Error // Errors encountered while parsing this part.
Content []byte // Content after decoding, UTF-8 conversion if applicable.
Epilogue []byte // Epilogue contains data following the closing boundary marker.
}
// NewPart creates a new Part with the given content type. The Header and ContentTypeParams maps
// are allocated up front, so entries can be added to the returned Part right away.
func NewPart(contentType string) *Part {
	part := new(Part)
	part.Header = textproto.MIMEHeader{}
	part.ContentType = contentType
	part.ContentTypeParams = map[string]string{}
	return part
}
// AddChild attaches child beneath p: it becomes FirstChild when p has no children yet, otherwise
// it is linked after the last entry of the existing NextSibling chain. The child may bring its own
// siblings and children; the Parent field of child and of every sibling chained after it is set to
// p. Safe to call on a nil receiver, in which case only the Parent pointers are updated.
func (p *Part) AddChild(child *Part) {
	if p == child {
		// A part cannot be its own child.
		return
	}

	switch {
	case p == nil:
		// No tree to link into; fall through to the Parent update below.
	case p.FirstChild == nil:
		// First child of this part.
		p.FirstChild = child
	default:
		// Walk to the tail of the sibling chain and append there.
		tail := p.FirstChild
		for tail.NextSibling != nil {
			tail = tail.NextSibling
		}
		if tail == child {
			// Already the last child; linking it to itself would create a cycle.
			return
		}
		tail.NextSibling = child
	}

	// Point every newly attached first-level part at its parent.
	sibling := child
	for sibling != nil {
		if sibling.NextSibling == sibling {
			// Self-referencing sibling; stop instead of looping forever.
			return
		}
		sibling.Parent = p
		sibling = sibling.NextSibling
	}
}
// TextContent reports whether this part's content type marks it as text. The result is used to
// choose a content transfer encoding scheme. An empty content type, any "text/" type and any type
// starting with ctMultipartPrefix all count as text.
func (p *Part) TextContent() bool {
	switch ct := p.ContentType; {
	case ct == "":
		// RFC 2045: no CT is equivalent to "text/plain; charset=us-ascii"
		return true
	case strings.HasPrefix(ct, "text/"):
		return true
	default:
		return strings.HasPrefix(ct, ctMultipartPrefix)
	}
}
// setupHeaders reads the header block from r, stores it on the Part and fills in the fields
// derived from it: ContentType, Boundary, ContentID and everything set by setupContentHeaders.
//
// defaultContentType is used when the header carries no Content-Type. If it is empty as well, a
// warning is recorded and the derived fields are left untouched. Errors from reading the header
// or parsing the Content-Type value are returned.
func (p *Part) setupHeaders(r *bufio.Reader, defaultContentType string) error {
	hdr, err := readHeader(r, p)
	if err != nil {
		return err
	}
	p.Header = hdr

	rawType := hdr.Get(hnContentType)
	if rawType == "" {
		rawType = defaultContentType
	}
	if rawType == "" {
		// Neither the header nor the caller supplied a content type.
		p.addWarning(ErrorMissingContentType, "MIME parts should have a Content-Type header")
		return nil
	}

	// Split the Content-Type value into media type and parameters.
	mediaType, params, badParams, err := mediatype.Parse(rawType)
	if err != nil {
		return err
	}
	for _, bad := range badParams {
		p.addWarning(
			ErrorMalformedHeader,
			"Content-Type header has malformed parameter %q",
			bad)
	}
	p.ContentType = mediaType

	// Disposition, filename and charset, where present.
	p.setupContentHeaders(params)

	p.Boundary = params[hpBoundary]
	p.ContentID = coding.FromIDHeader(hdr.Get(hnContentID))
	return nil
}
// setupContentHeaders fills in Disposition, FileName, Charset and FileModDate from the part's
// Content-Disposition header and from mediaParams, the parameters of its Content-Type header.
// Charset and FileModDate are only written when they are still unset.
func (p *Part) setupContentHeaders(mediaParams map[string]string) {
	// Content-Disposition is optional, so a parse failure just leaves these fields alone.
	dispType, dispParams, _, err := mediatype.Parse(p.Header.Get(hnContentDisposition))
	if err == nil {
		p.Disposition = dispType
		p.FileName = coding.DecodeExtHeader(dispParams[hpFilename])
	}

	// Without a disposition filename, fall back to the Content-Type parameters in order.
	for _, key := range []string{hpName, hpFile} {
		if p.FileName != "" {
			break
		}
		if value := mediaParams[key]; value != "" {
			p.FileName = coding.DecodeExtHeader(value)
		}
	}

	if p.Charset == "" {
		p.Charset = mediaParams[hpCharset]
	}

	if p.FileModDate.IsZero() {
		// The parse error is dropped on purpose: a missing or malformed date simply leaves
		// FileModDate at its zero value.
		modified, _ := time.Parse(time.RFC822, mediaParams[hpModDate])
		p.FileModDate = modified
	}
}
// convertFromDetectedCharset attempts to detect the character set for the given part, and returns
// an io.Reader that will convert from that charset to UTF-8. If the charset cannot be detected,
// this method adds a warning to the part and automatically falls back to using
// `convertFromStatedCharset` and returns the reader from that method.
//
// The whole of r is read into memory so the detector can inspect it; the returned reader replays
// that buffered content. Only a failure to read r is returned as an error. When the detected
// charset is used, the previously declared value is kept in OrigCharset and Charset is updated.
func (p *Part) convertFromDetectedCharset(r io.Reader) (io.Reader, error) {
	// Attempt to detect character set from part content.
	var cd *chardet.Detector
	switch p.ContentType {
	case "text/html":
		cd = chardet.NewHtmlDetector()
	default:
		cd = chardet.NewTextDetector()
	}

	buf, err := ioutil.ReadAll(r)
	if err != nil {
		return nil, errors.WithStack(err)
	}

	cs, err := cd.DetectBest(buf)
	if err != nil {
		p.addWarning(ErrorCharsetDeclaration, "charset could not be detected: %v", err)
	}

	// Restore r.
	r = bytes.NewReader(buf)

	// utf8.RuneCount yields the same number as len(bytes.Runes(buf)) without allocating a rune
	// slice the size of the whole body just to measure it.
	if cs == nil || cs.Confidence < minCharsetConfidence || utf8.RuneCount(buf) < minCharsetRuneLength {
		// Low confidence or not enough characters, use declared character set.
		return p.convertFromStatedCharset(r), nil
	}

	// Confidence exceeded our threshold, use detected character set.
	if p.Charset != "" && !strings.EqualFold(cs.Charset, p.Charset) {
		p.addWarning(ErrorCharsetDeclaration,
			"declared charset %q, detected %q, confidence %d",
			p.Charset, cs.Charset, cs.Confidence)
	}

	// If no converter exists for the detected charset, the content is passed through unchanged
	// and the declared charset is left as is.
	if reader, err := coding.NewCharsetReader(cs.Charset, r); err == nil {
		r = reader
		p.OrigCharset = p.Charset
		p.Charset = cs.Charset
	}

	return r, nil
}
// convertFromStatedCharset wraps r in a reader that converts from the charset recorded on this
// `*Part` to UTF-8. With no charset set, r is returned as is. When no converter can be created, a
// warning is added to the `*Part`; the charset is then re-examined for a malformed "name=value"
// form, and if that does not help either, the original io.Reader is returned.
func (p *Part) convertFromStatedCharset(r io.Reader) io.Reader {
	if p.Charset == "" {
		// US-ASCII. Just read.
		return r
	}

	converted, err := coding.NewCharsetReader(p.Charset, r)
	if err == nil {
		return converted
	}
	// Failed to get a conversion reader.
	p.addWarning(ErrorCharsetConversion, "failed to get reader for charset %q: %v", p.Charset, err)

	// Salvage attempt for badly formed values such as charset="charset=utf-8": when the text
	// before the first "=" is "charset" or "iso", retry with the text that follows it.
	pieces := strings.Split(p.Charset, "=")
	if len(pieces) < 2 {
		return r
	}
	if !strings.EqualFold(pieces[0], "charset") && !strings.EqualFold(pieces[0], "iso") {
		return r
	}

	p.Charset = pieces[1]
	converted, err = coding.NewCharsetReader(p.Charset, r)
	if err == nil {
		return converted
	}
	// Failed to get a conversion reader.
	p.addWarning(ErrorCharsetConversion, "failed to get reader for charset %q: %v", p.Charset, err)
	return r
}
// decodeContent runs r through transfer decoding (base64, quoted-printable) and, for "text/"
// parts, charset conversion, then stores the outcome in Part.Content. IO errors are returned
// immediately; corrupt base64 input and other problems are recorded in Part.Errors instead.
func (p *Part) decodeContent(r io.Reader) error {
	var (
		// pipeline is the tail of the decoding chain built below.
		pipeline = r
		// cleaner collects base64 problems; the reference is kept so they can be read afterwards.
		cleaner *coding.Base64Cleaner
	)

	// Stage 1: undo the content transfer encoding.
	transferEnc := p.Header.Get(hnContentEncoding)
	recognized := true
	switch strings.ToLower(transferEnc) {
	case cte8Bit, cte7Bit, cteBinary, "":
		// No decoding required.
	case cteQuotedPrintable:
		pipeline = quotedprintable.NewReader(coding.NewQPCleaner(pipeline))
	case cteBase64:
		cleaner = coding.NewBase64Cleaner(pipeline)
		pipeline = base64.NewDecoder(base64.RawStdEncoding, cleaner)
	default:
		// Unknown encoding.
		recognized = false
		p.addWarning(
			ErrorContentEncoding,
			"Unrecognized Content-Transfer-Encoding type %q",
			transferEnc)
	}

	// Stage 2: charset conversion, only for text parts with a recognized encoding.
	if recognized && strings.HasPrefix(p.ContentType, "text/") {
		converted, err := p.convertFromDetectedCharset(pipeline)
		if err != nil {
			return p.base64CorruptInputCheck(err)
		}
		pipeline = converted
	}

	// Stage 3: drain the pipeline and keep the result.
	body, err := ioutil.ReadAll(pipeline)
	if err != nil {
		return p.base64CorruptInputCheck(errors.WithStack(err))
	}
	p.Content = body

	// Surface whatever the base64 cleaner noticed.
	if cleaner != nil {
		for _, cleanErr := range cleaner.Errors {
			p.addWarning(ErrorMalformedBase64, cleanErr.Error())
		}
	}

	// Flag parts that ended up without a content type.
	if p.ContentType == "" {
		p.addWarning(
			ErrorMissingContentType, "content-type is empty for part id: %s", p.PartID)
	}
	return nil
}
// base64CorruptInputCheck keeps corrupt base64 input from being a fatal failure.
//
// When the root cause of err (per errors.Cause) is a base64.CorruptInputError, the part's Content
// is cleared, the error is recorded on the part and nil is returned. Any other error is handed
// back unchanged.
func (p *Part) base64CorruptInputCheck(err error) error {
	if _, corrupt := errors.Cause(err).(base64.CorruptInputError); !corrupt {
		return err
	}
	p.Content = nil
	p.addError(ErrorMalformedBase64, err.Error())
	return nil
}
// Clone returns a copy of the current Part, attached to the given parent, together with
// recursively cloned copies of its children and of the siblings that follow it. A nil receiver
// yields nil.
//
// The copy is shallow with respect to reference-typed fields: Header, ContentTypeParams, Errors,
// Content and Epilogue are shared with the original rather than duplicated.
func (p *Part) Clone(parent *Part) *Part {
	if p == nil {
		return nil
	}

	newPart := &Part{
		PartID:      p.PartID,
		Header:      p.Header,
		Parent:      parent,
		Boundary:    p.Boundary,
		ContentID:   p.ContentID,
		ContentType: p.ContentType,
		// ContentTypeParams, FileModDate and OrigCharset are part of a Part's state too; without
		// them the clone would silently lose information held by the original.
		ContentTypeParams: p.ContentTypeParams,
		Disposition:       p.Disposition,
		FileName:          p.FileName,
		FileModDate:       p.FileModDate,
		Charset:           p.Charset,
		OrigCharset:       p.OrigCharset,
		Errors:            p.Errors,
		Content:           p.Content,
		Epilogue:          p.Epilogue,
	}

	// Children hang off the new part; following siblings share the same parent as this clone.
	newPart.FirstChild = p.FirstChild.Clone(newPart)
	newPart.NextSibling = p.NextSibling.Clone(parent)

	return newPart
}
// ReadParts parses the MIME document supplied by r into a tree of Part objects by delegating to
// defaultParser, and returns the root of that tree.
func ReadParts(r io.Reader) (*Part, error) {
	parser := defaultParser
	return parser.ReadParts(r)
}
// ReadParts parses the MIME document supplied by r into a tree of Part objects and returns its
// root, which carries PartID "0". A multipart document is parsed recursively; anything else is
// decoded into the root part's Content.
func (p Parser) ReadParts(r io.Reader) (*Part, error) {
	buffered := bufio.NewReader(r)
	root := &Part{PartID: "0"}

	// Read header; top-level default CT is text/plain us-ascii according to RFC 822.
	if err := root.setupHeaders(buffered, `text/plain; charset="us-ascii"`); err != nil {
		return nil, err
	}

	if !detectMultipartMessage(root, p.multipartWOBoundaryAsSinglePart) {
		// Content is text or data, decode it.
		if err := root.decodeContent(buffered); err != nil {
			return nil, err
		}
		return root, nil
	}

	// Content is multipart, parse it.
	if err := parseParts(root, buffered, p.skipMalformedParts); err != nil {
		return nil, err
	}
	return root, nil
}
// parseParts walks the boundary-delimited sections of a multipart body read from reader, turning
// each into a child of parent and recursing into children that are themselves multipart. Every
// child receives a PartID describing its position in the tree.
//
// With skipMalformedParts set, a child whose header, content or nested parts fail to parse is
// recorded as an error on parent and parsing moves on; otherwise the first such error is returned.
// Data remaining in reader once the boundaries are exhausted is stored as parent.Epilogue.
func parseParts(parent *Part, reader *bufio.Reader, skipMalformedParts bool) error {
	// A parent with no Parent of its own marks the outermost call.
	atRoot := parent.Parent == nil

	// Iterate over the sections delimited by the parent's boundary.
	boundaries := newBoundaryReader(reader, parent.Boundary)
	for seq := 1; ; seq++ {
		more, err := boundaries.Next()
		if err != nil && errors.Cause(err) != io.EOF {
			return err
		}
		if boundaries.unbounded {
			parent.addWarning(ErrorMissingBoundary, "Boundary %q was not closed correctly",
				parent.Boundary)
		}
		if !more {
			break
		}

		// Derive the child's PartID from its position; below the top level it is prefixed with
		// the parent's PartID.
		id := strconv.Itoa(seq)
		if !atRoot {
			id = parent.PartID + "." + id
		}
		child := &Part{PartID: id}

		// Look for part header.
		section := bufio.NewReader(boundaries)
		if err = child.setupHeaders(section, ""); err != nil {
			if !skipMalformedParts {
				return err
			}
			parent.addError(ErrorMalformedChildPart, "read header: %s", err.Error())
			continue
		}

		if child.Boundary != "" {
			// Content is another multipart: attach first, then descend.
			parent.AddChild(child)
			if err = parseParts(child, section, skipMalformedParts); err != nil {
				if !skipMalformedParts {
					return err
				}
				parent.addError(ErrorMalformedChildPart, "parse parts: %s", err.Error())
			}
			continue
		}

		// Content is text or data: decode it, and attach only once decoding succeeded.
		if err = child.decodeContent(section); err != nil {
			if !skipMalformedParts {
				return err
			}
			parent.addError(ErrorMalformedChildPart, "decode content: %s", err.Error())
			continue
		}
		parent.AddChild(child)
	}

	// Store any content following the closing boundary marker into the epilogue.
	trailing, err := ioutil.ReadAll(reader)
	if err != nil {
		return errors.WithStack(err)
	}
	parent.Epilogue = trailing

	// If a Part is "multipart/" Content-Type, it will have .0 appended to its PartID
	// i.e. it is the root of its MIME Part subtree.
	if !atRoot {
		parent.PartID += ".0"
	}
	return nil
}