Files
postmoogle/vendor/github.com/kvannotten/mailstrip/mailstrip.go
2024-02-26 20:42:37 +02:00

269 lines
8.4 KiB
Go

// Package mailstrip parses plaintext email and strips it of signatures and
// reply quotes. It is a port of email_reply_parser, GitHub's library for
// parsing email replies.
//
// See https://github.com/github/email_reply_parser
package mailstrip
import (
"bufio"
"fmt"
"io"
"regexp"
"strings"
"unicode"
)
// Parse splits the plaintext email in text into fragments and returns them as
// an Email. Every call works on a fresh parser, so no state is shared between
// calls.
func Parse(text string) Email {
	return new(parser).Parse(text)
}
// parser holds the working state of one parse run. Parse creates a new parser
// for every email it is given.
type parser struct {
// foundVisible records whether any 'visible' Fragment has been found. Once
// one has been found, finishFragment stops marking Fragments as hidden.
foundVisible bool
// fragment points to the Fragment currently being built. If the scanned
// line fits, it is added to this Fragment. Otherwise the Fragment is
// finished and a new one is started. It is nil when no Fragment is open.
fragment *Fragment
// fragments lists the Fragments finished so far, in the order they were
// finished. Parse reverses this slice once scanning is done.
fragments []*Fragment
}
// > I define UNIX as “30 definitions of regular expressions living under one
// > roof.”
// —Don Knuth
//
// Porting the Ruby regular expressions from email_reply_parser to Go required
// making the following changes:
//
// - Unlike most regexp flavors I'm familiar with, ^ and $ stand for beginning
// and end of line respectively in Ruby. Getting the same behavior in Go
// required enabling Go's multiline mode "(?m)" for these expressions.
// - Ruby's multiline mode "/m" is the same as Go's "(?s)" flag. Both are used
// to make "." match "\n" characters.
var (
// multiLineReplyHeaderRegexps is used to join quote headers that were
// broken into multiple lines by the e-mail client. e.g. gmail does that for
// lines exceeding 80 chars. Parse applies these patterns to the text before
// it is reversed, so unlike the patterns below they are written forwards.
multiLineReplyHeaderRegexps = []*regexp.Regexp{
// e.g. On Aug 22, 2011, at 7:37 PM, defunkt<reply@reply.github.com> wrote:
regexp.MustCompile("(?sm)^(On\\s(?:.+)wrote:)$"),
// e.g. 2013/11/13 John Smith <john@smith.org>
regexp.MustCompile("(?sm)^(\\d{4}/\\d{1,2}/\\d{1,2} .*<.+@.+>)$"),
}
// The remaining patterns are matched by scanLine against lines of the
// reversed email text, which is why their literals are spelled backwards
// or built with reverseString.

// sigRegexp matches a reversed line that indicates a signature: one
// containing a reversed "Sent from Mail<...> for Windows <digits>" footer,
// "--" or "__", one ending in a word character followed by "-" (i.e. the
// original line starts with "-" and a word character), or a reversed
// "Sent from my" followed by one to three words.
sigRegexp = regexp.MustCompile("(\\d+ swodniW rof >.*<liaM morf tneS|--|__|(?m)\\w-$)|(?m)(^(\\w+\\s*){1,3} " + reverseString("Sent from my") + "$)")
// fwdRegexp matches, case-insensitively, a reversed line of the form
// "-- Forwarded message --" with two or more dashes on either side.
fwdRegexp = regexp.MustCompile("(?mi)^--+\\s*" + reverseString("Forwarded message") + "\\s*--+$")
// quotedRegexp matches a reversed line that ends in one or more ">", i.e.
// a line whose original form begins with ">".
quotedRegexp = regexp.MustCompile("(?m)(>+)$")
// quoteHeaderRegexp matches reversed quote header lines. The first
// alternative is a reversed "On ... wrote:" line; the remaining
// alternatives appear to cover header lines carrying a date in various
// layouts.
quoteHeaderRegexp = regexp.MustCompile("(?m)^:etorw.*nO$|^.*[0-9]{4}\\s\\.\\w{2,4}\\s\\d{1,2}\\s.{3,4}$|^\\w{3,4}\\s\\d{1,2}\\s\\w{3,4}\\.\\s[0-9]{4}.*$|^>.*\\d{1,2}/\\d{1,2}/\\d{4}$|^(?m)^.*?[0-9]{4}\\s\\.\\w+\\s\\d\\s.*n\\.*$")
)
// Parse splits text into fragments and returns them in the order they appear
// in the email. The text is scanned bottom-up: it is reversed as a whole, fed
// to scanLine one line at a time, and the resulting fragments are put back
// into top-to-bottom order at the end.
func (p *parser) Parse(text string) Email {
	// Work with "\n" line endings only.
	text = strings.ReplaceAll(text, "\r\n", "\n")

	// Some clients wrap the "On DATE, NAME <EMAIL> wrote:" line (and similar
	// quote headers) over several lines. For each pattern that matches, put
	// the matched header back onto a single line.
	for _, re := range multiLineReplyHeaderRegexps {
		match := re.FindStringSubmatch(text)
		if len(match) != 2 {
			continue
		}
		header := match[1]
		joined := strings.ReplaceAll(header, "\n", "")
		text = strings.ReplaceAll(text, header, joined)
	}

	// Scanning runs over the reversed text because hidden fragments are
	// detected from the bottom of the email upwards.
	reader := bufio.NewReader(strings.NewReader(reverseString(text)))
	for {
		raw, err := reader.ReadBytes('\n')
		// The final chunk is scanned too, even when it arrives with io.EOF.
		p.scanLine(strings.TrimRight(string(raw), "\n"))
		if err == nil {
			continue
		}
		if err != io.EOF {
			// The underlying reader is a strings.Reader, which never returns
			// errors other than io.EOF, so this is merely a sanity check.
			panic(fmt.Sprintf("Bug: ReadBytes returned an error other than io.EOF: %#v", err))
		}
		break
	}

	// Close the last open fragment. Finishing a fragment settles its hidden
	// flag and joins its lines into the final content string.
	p.finishFragment()

	// Fragments were collected bottom-up; restore top-to-bottom order.
	reverseFragments(p.fragments)
	return Email(p.fragments)
}
// scanLine takes one line of the reversed email text and decides which
// fragment it belongs to: it is either appended to the fragment currently
// being built, or that fragment is finished and the line opens a new one.
func (p *parser) scanLine(line string) {
	// Lines that look like a signature keep their leading whitespace; all
	// other lines are left-trimmed.
	if !sigRegexp.MatchString(line) {
		line = strings.TrimLeftFunc(line, unicode.IsSpace)
	}

	// A reversed line ending in `>` is a line that starts with `>` in the
	// email, i.e. part of a quoted fragment.
	quoted := quotedRegexp.MatchString(line)
	blank := line == ""

	// On an empty line, close the current fragment if the line collected just
	// before it marks a forwarded message or a signature. Because lines are
	// still reversed here, that most recently added line is really the first
	// line of the fragment as it appears in the email.
	if p.fragment != nil && blank {
		first := p.fragment.lines[len(p.fragment.lines)-1]
		switch {
		case fwdRegexp.MatchString(first):
			p.fragment.forwarded = true
			p.finishFragment()
		case sigRegexp.MatchString(first):
			p.fragment.signature = true
			p.finishFragment()
		}
	}

	header := p.quoteHeader(line)

	belongs := false
	if cur := p.fragment; cur != nil {
		// Yahoo! does not use the '>' quote indicator in replies, so when a
		// quote header shows up in an otherwise unquoted fragment, treat the
		// fragment as quoted from now on.
		if header {
			cur.quoted = true
		}
		// The line fits when its quoting matches the fragment's. A quoted
		// fragment additionally absorbs reply headers and empty lines even
		// though they do not start with `>`.
		belongs = cur.quoted == quoted || (cur.quoted && (header || blank))
	}

	if belongs {
		p.fragment.lines = append(p.fragment.lines, line)
		return
	}

	// The line does not fit: finish the current fragment, if any, and start a
	// new one with this line.
	p.finishFragment()
	p.fragment = &Fragment{quoted: quoted, lines: []string{line}}
}
// quoteHeader reports whether line matches quoteHeaderRegexp, i.e. whether it
// looks like a header above a quoted area (such as "On ... wrote:"). scanLine
// calls it for every line it scans, and the line it passes in is still in
// reversed form.
func (p *parser) quoteHeader(line string) bool {
return quoteHeaderRegexp.MatchString(line)
}
// finishFragment closes the fragment currently being built, if there is one:
// it builds the fragment's content string, decides whether the fragment is
// hidden, and appends it to the list of parsed fragments. Afterwards no
// fragment is open. Calling it with no open fragment does nothing.
//
// The hidden check works from the bottom of the email to the top. Quoted,
// signature and blank Fragments are marked hidden as long as no visible
// Fragment has been found yet, i.e. when they sit below all visible
// Fragments. Visible Fragments are expected to contain original content by
// the author. A quoted Fragment above a visible one stays visible to give
// context to the reply.
//
//	some original text (visible)
//
//	> do you have any two's? (quoted, visible)
//
//	Go fish! (visible)
//
//	> --
//	> Player 1 (quoted, hidden)
//
//	--
//	Player 2 (signature, hidden)
func (p *parser) finishFragment() {
	done := p.fragment
	p.fragment = nil
	if done == nil {
		return
	}

	done.finish()

	if !p.foundVisible {
		switch {
		case done.quoted, done.signature, strings.TrimSpace(done.String()) == "":
			done.hidden = true
		default:
			// First fragment with real content: nothing above it is hidden.
			p.foundVisible = true
		}
	}

	p.fragments = append(p.fragments, done)
}
// reverseString returns s with its runes in reverse order. The string is
// reversed rune by rune rather than byte by byte, so multi-byte characters
// stay intact.
func reverseString(s string) string {
	forward := []rune(s)
	last := len(forward) - 1
	backward := make([]rune, len(forward))
	for i, r := range forward {
		backward[last-i] = r
	}
	return string(backward)
}
// reverseFragments reverses the order of the elements of f in place.
func reverseFragments(f []*Fragment) {
	for lo := 0; lo < len(f)/2; lo++ {
		hi := len(f) - 1 - lo
		f[lo], f[hi] = f[hi], f[lo]
	}
}
// Email contains the parsed contents of an email as a list of Fragments.
// Parse returns the Fragments in the order they appear in the email.
type Email []*Fragment
// String returns the content of all fragments of the Email that are not
// Hidden(), joined by newlines, with trailing whitespace removed.
func (e Email) String() string {
	visible := make([]string, 0, len(e))
	for _, frag := range e {
		if !frag.Hidden() {
			visible = append(visible, frag.String())
		}
	}
	return strings.TrimRightFunc(strings.Join(visible, "\n"), unicode.IsSpace)
}
// Fragment contains a parsed section of an email.
type Fragment struct {
// lines collects the lines of the fragment while it is being scanned. The
// lines are stored in reversed form and bottom-up order; finish clears the
// slice once content has been built.
lines []string
// content is the final text of the fragment, set by finish.
content string
// hidden is set by the parser for quoted, signature or blank fragments
// that were finished before any visible fragment was found.
hidden bool
// signature is set when the fragment was recognized as a signature.
signature bool
// forwarded is set when the fragment was recognized as a forwarded
// message marker.
forwarded bool
// quoted is set when the fragment consists of quoted text.
quoted bool
}
// finish turns the collected lines into the fragment's content and drops the
// lines. Joining the lines and then reversing the whole string restores both
// the order of the lines and the text within each line.
func (f *Fragment) finish() {
	joined := strings.Join(f.lines, "\n")
	f.content = reverseString(joined)
	f.lines = nil
}
// Forwarded reports whether the fragment was recognized as a forwarded
// message marker.
func (f *Fragment) Forwarded() bool {
return f.forwarded
}
// Signature reports whether the fragment is a signature.
func (f *Fragment) Signature() bool {
return f.signature
}
// Quoted reports whether the fragment is a quote.
func (f *Fragment) Quoted() bool {
return f.quoted
}
// Hidden reports whether the fragment is considered hidden. Email.String
// leaves hidden fragments out of its result.
func (f *Fragment) Hidden() bool {
return f.hidden
}
// String returns the content of the fragment. The content is built by finish,
// so it is empty for a fragment that has not been finished yet.
func (f *Fragment) String() string {
return f.content
}