Email-Triage: Lernen aus IMAP-Ordnern, manuelle Korrektur, reichere Daten
- Automatisches Triage-Lernen aus Archiv-Ordnern im Nacht-Ingest: retention_days=0 (Archiv) → wichtig, retention_days>0 → unwichtig
- Drei neue Discord-Commands: /email triage-history, triage-correct, triage-search
- StoreDecision speichert jetzt Datum + Body-Zusammenfassung (max 200 Zeichen)
- MIME-Multipart-Parsing mit PDF-Attachment-Extraktion (FetchWithBodyAndAttachments)
- Deterministische IDs basierend auf Absender+Betreff (idempotente Upserts)
- Rueckwaertskompatibles Parsing fuer alte Triage-Eintraege ohne Datum/Body

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,16 +2,23 @@
|
||||
package email
|
||||
|
||||
import (
|
||||
"bytes"
|
||||
"crypto/tls"
|
||||
"encoding/base64"
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"mime"
|
||||
"mime/multipart"
|
||||
"mime/quotedprintable"
|
||||
"net/mail"
|
||||
"os"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
imap "github.com/emersion/go-imap/v2"
|
||||
"github.com/emersion/go-imap/v2/imapclient"
|
||||
"github.com/ledongthuc/pdf"
|
||||
|
||||
"my-brain-importer/internal/config"
|
||||
)
|
||||
@@ -144,6 +151,36 @@ func (cl *Client) FetchRecent(n uint32) ([]Message, error) {
|
||||
return parseMessages(msgs), nil
|
||||
}
|
||||
|
||||
// FetchRecentFromFolder holt die letzten n Emails aus einem bestimmten IMAP-Ordner.
|
||||
func (cl *Client) FetchRecentFromFolder(folder string, n uint32) ([]Message, error) {
|
||||
if folder == "" {
|
||||
folder = "INBOX"
|
||||
}
|
||||
|
||||
selectData, err := cl.c.Select(folder, &imap.SelectOptions{ReadOnly: true}).Wait()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("IMAP select %s: %w", folder, err)
|
||||
}
|
||||
if selectData.NumMessages == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
start := uint32(1)
|
||||
if selectData.NumMessages > n {
|
||||
start = selectData.NumMessages - n + 1
|
||||
}
|
||||
|
||||
var seqSet imap.SeqSet
|
||||
seqSet.AddRange(start, selectData.NumMessages)
|
||||
|
||||
msgs, err := cl.c.Fetch(seqSet, &imap.FetchOptions{Envelope: true}).Collect()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("IMAP fetch %s: %w", folder, err)
|
||||
}
|
||||
|
||||
return parseMessages(msgs), nil
|
||||
}
|
||||
|
||||
// FetchUnread holt ungelesene Emails (Envelope-Daten, kein Body).
|
||||
func (cl *Client) FetchUnread() ([]Message, error) {
|
||||
folder := cl.folder
|
||||
@@ -546,6 +583,212 @@ func parseMessage(msg *imapclient.FetchMessageBuffer) Message {
|
||||
return m
|
||||
}
|
||||
|
||||
// FetchWithBodyAndAttachments holt bis zu n Emails aus dem angegebenen Ordner mit Text-Body
|
||||
// und extrahiert Text aus PDF-Anhängen. Der kombinierte Text wird in MessageWithBody.Body gespeichert.
|
||||
// Nutzt stdlib mime/multipart — keine externen Abhängigkeiten außer dem bereits vorhandenen PDF-Parser.
|
||||
func (cl *Client) FetchWithBodyAndAttachments(folder string, n uint32) ([]MessageWithBody, error) {
|
||||
if folder == "" {
|
||||
folder = "INBOX"
|
||||
}
|
||||
|
||||
selectData, err := cl.c.Select(folder, &imap.SelectOptions{ReadOnly: true}).Wait()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("IMAP select %s: %w", folder, err)
|
||||
}
|
||||
if selectData.NumMessages == 0 {
|
||||
return nil, nil
|
||||
}
|
||||
|
||||
total := selectData.NumMessages
|
||||
start := uint32(1)
|
||||
if total > n {
|
||||
start = total - n + 1
|
||||
}
|
||||
|
||||
// Komplette RFC822-Nachricht fetchen (Header + Body + Attachments)
|
||||
fullMsgSec := &imap.FetchItemBodySection{}
|
||||
|
||||
var result []MessageWithBody
|
||||
batchSize := uint32(50)
|
||||
|
||||
for i := start; i <= total; i += batchSize {
|
||||
end := i + batchSize - 1
|
||||
if end > total {
|
||||
end = total
|
||||
}
|
||||
var seqSet imap.SeqSet
|
||||
seqSet.AddRange(i, end)
|
||||
|
||||
msgs, err := cl.c.Fetch(seqSet, &imap.FetchOptions{
|
||||
Envelope: true,
|
||||
BodySection: []*imap.FetchItemBodySection{fullMsgSec},
|
||||
}).Collect()
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("IMAP fetch batch %d-%d: %w", i, end, err)
|
||||
}
|
||||
|
||||
for _, msg := range msgs {
|
||||
if msg.Envelope == nil {
|
||||
continue
|
||||
}
|
||||
m := MessageWithBody{Message: parseMessage(msg)}
|
||||
|
||||
rawMsg := msg.FindBodySection(fullMsgSec)
|
||||
if rawMsg != nil {
|
||||
body, err := extractBodyAndAttachments(rawMsg)
|
||||
if err != nil {
|
||||
slog.Warn("[Email] MIME-Parsing fehlgeschlagen", "betreff", m.Subject, "fehler", err)
|
||||
} else {
|
||||
m.Body = body
|
||||
}
|
||||
}
|
||||
result = append(result, m)
|
||||
}
|
||||
}
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// extractBodyAndAttachments parst eine rohe RFC822-Nachricht und gibt den kombinierten Text zurück.
|
||||
// Text/plain-Teile werden direkt übernommen, PDF-Anhänge werden in Text extrahiert.
|
||||
func extractBodyAndAttachments(rawMsg []byte) (string, error) {
|
||||
parsed, err := mail.ReadMessage(bytes.NewReader(rawMsg))
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("mail parsen: %w", err)
|
||||
}
|
||||
|
||||
contentType := parsed.Header.Get("Content-Type")
|
||||
mediaType, params, err := mime.ParseMediaType(contentType)
|
||||
if err != nil {
|
||||
// Kein valider Content-Type — Body direkt lesen
|
||||
body, readErr := io.ReadAll(parsed.Body)
|
||||
if readErr != nil {
|
||||
return "", fmt.Errorf("body lesen: %w", readErr)
|
||||
}
|
||||
enc := strings.ToLower(parsed.Header.Get("Content-Transfer-Encoding"))
|
||||
return decodeBody(body, enc), nil
|
||||
}
|
||||
|
||||
var parts []string
|
||||
|
||||
if strings.HasPrefix(mediaType, "multipart/") {
|
||||
boundary := params["boundary"]
|
||||
mr := multipart.NewReader(parsed.Body, boundary)
|
||||
for {
|
||||
part, err := mr.NextPart()
|
||||
if err == io.EOF {
|
||||
break
|
||||
}
|
||||
if err != nil {
|
||||
slog.Warn("[Email] Multipart-Teil fehlgeschlagen", "fehler", err)
|
||||
break
|
||||
}
|
||||
|
||||
partContentType := part.Header.Get("Content-Type")
|
||||
partMediaType, _, parseErr := mime.ParseMediaType(partContentType)
|
||||
if parseErr != nil {
|
||||
part.Close()
|
||||
continue
|
||||
}
|
||||
|
||||
enc := strings.ToLower(part.Header.Get("Content-Transfer-Encoding"))
|
||||
data, readErr := io.ReadAll(part)
|
||||
part.Close()
|
||||
if readErr != nil {
|
||||
continue
|
||||
}
|
||||
|
||||
switch {
|
||||
case partMediaType == "text/plain":
|
||||
parts = append(parts, decodeBody(data, enc))
|
||||
case partMediaType == "application/pdf":
|
||||
pdfText, pdfErr := extractPDFTextFromBytes(data, enc)
|
||||
if pdfErr != nil {
|
||||
slog.Warn("[Email] PDF-Anhang konnte nicht gelesen werden", "fehler", pdfErr)
|
||||
} else if pdfText != "" {
|
||||
parts = append(parts, "[PDF-Anhang] "+pdfText)
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Einfache (nicht-multipart) Nachricht
|
||||
body, readErr := io.ReadAll(parsed.Body)
|
||||
if readErr != nil {
|
||||
return "", fmt.Errorf("body lesen: %w", readErr)
|
||||
}
|
||||
enc := strings.ToLower(parsed.Header.Get("Content-Transfer-Encoding"))
|
||||
parts = append(parts, decodeBody(body, enc))
|
||||
}
|
||||
|
||||
combined := strings.TrimSpace(strings.Join(parts, "\n"))
|
||||
if len(combined) > 2000 {
|
||||
combined = combined[:2000]
|
||||
}
|
||||
return combined, nil
|
||||
}
|
||||
|
||||
// extractPDFTextFromBytes dekodiert die rohen Anhang-Bytes (ggf. base64) und extrahiert PDF-Text.
|
||||
func extractPDFTextFromBytes(data []byte, enc string) (string, error) {
|
||||
// PDF-Anhänge sind fast immer base64-kodiert
|
||||
var pdfBytes []byte
|
||||
switch enc {
|
||||
case "base64":
|
||||
cleaned := strings.ReplaceAll(strings.TrimSpace(string(data)), "\r\n", "")
|
||||
cleaned = strings.ReplaceAll(cleaned, "\n", "")
|
||||
decoded, err := base64.StdEncoding.DecodeString(cleaned)
|
||||
if err != nil {
|
||||
decoded, err = base64.RawStdEncoding.DecodeString(cleaned)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("base64 dekodieren: %w", err)
|
||||
}
|
||||
}
|
||||
pdfBytes = decoded
|
||||
default:
|
||||
pdfBytes = data
|
||||
}
|
||||
|
||||
// PDF in temporäre Datei schreiben, da die pdf-Bibliothek einen Datei-Pfad erwartet
|
||||
tmp, err := os.CreateTemp("", "email-pdf-*.pdf")
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("temp-Datei anlegen: %w", err)
|
||||
}
|
||||
tmpPath := tmp.Name()
|
||||
defer os.Remove(tmpPath)
|
||||
|
||||
if _, err := tmp.Write(pdfBytes); err != nil {
|
||||
tmp.Close()
|
||||
return "", fmt.Errorf("temp-Datei schreiben: %w", err)
|
||||
}
|
||||
tmp.Close()
|
||||
|
||||
return extractPDFTextFromFile(tmpPath)
|
||||
}
|
||||
|
||||
// extractPDFTextFromFile liest alle Seiten einer PDF-Datei und gibt den Plain-Text zurück.
|
||||
// Dupliziert die Logik aus brain/ingest_pdf.go um Import-Zyklen zu vermeiden.
|
||||
func extractPDFTextFromFile(filePath string) (string, error) {
|
||||
f, r, err := pdf.Open(filePath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var sb strings.Builder
|
||||
totalPages := r.NumPage()
|
||||
for pageNum := 1; pageNum <= totalPages; pageNum++ {
|
||||
page := r.Page(pageNum)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
text, err := page.GetPlainText(nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sb.WriteString(text)
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
return strings.TrimSpace(sb.String()), nil
|
||||
}
|
||||
|
||||
func parseMessages(msgs []*imapclient.FetchMessageBuffer) []Message {
|
||||
result := make([]Message, 0, len(msgs))
|
||||
for _, msg := range msgs {
|
||||
|
||||
@@ -328,7 +328,8 @@ func ClassifyImportance(msg Message, model string) bool {
|
||||
)
|
||||
|
||||
// Entscheidung für künftiges Lernen in Qdrant speichern
|
||||
if err := triage.StoreDecision(msg.Subject, msg.From, isImportant); err != nil {
|
||||
// Body ist bei ClassifyImportance nicht verfügbar (nur Envelope), daher leer.
|
||||
if err := triage.StoreDecision(msg.Subject, msg.From, msg.Date, "", isImportant); err != nil {
|
||||
slog.Warn("[Triage] Entscheidung nicht gespeichert", "fehler", err)
|
||||
}
|
||||
|
||||
@@ -646,3 +647,64 @@ func fallbackList(msgs []Message) string {
|
||||
}
|
||||
return sb.String()
|
||||
}
|
||||
|
||||
// LearnFromFolders scannt die Archiv-Ordner eines Accounts und speichert Triage-Entscheidungen in Qdrant.
|
||||
// Ordner mit retention_days == 0 (Archiv/dauerhaft) → wichtig, retention_days > 0 → unwichtig.
|
||||
// Pro Ordner werden maximal die letzten 50 Emails verarbeitet.
|
||||
func LearnFromFolders(acc config.EmailAccount) (wichtig, unwichtig int, err error) {
|
||||
if len(acc.ArchiveFolders) == 0 {
|
||||
return 0, 0, nil
|
||||
}
|
||||
|
||||
cl, err := ConnectAccount(acc)
|
||||
if err != nil {
|
||||
return 0, 0, fmt.Errorf("IMAP verbinden: %w", err)
|
||||
}
|
||||
defer cl.Close()
|
||||
|
||||
for _, af := range acc.ArchiveFolders {
|
||||
isImportant := af.RetentionDays == 0
|
||||
|
||||
msgs, err := cl.FetchWithBodyAndAttachments(af.IMAPFolder, 50)
|
||||
if err != nil {
|
||||
slog.Warn("[Triage-Learn] Ordner nicht lesbar", "ordner", af.IMAPFolder, "fehler", err)
|
||||
continue
|
||||
}
|
||||
|
||||
for _, m := range msgs {
|
||||
if err := triage.StoreDecision(m.Subject, m.From, m.Date, m.Body, isImportant); err != nil {
|
||||
slog.Warn("[Triage-Learn] Speichern fehlgeschlagen", "betreff", m.Subject, "fehler", err)
|
||||
continue
|
||||
}
|
||||
if isImportant {
|
||||
wichtig++
|
||||
} else {
|
||||
unwichtig++
|
||||
}
|
||||
}
|
||||
|
||||
slog.Info("[Triage-Learn] Ordner verarbeitet",
|
||||
"account", accountLabel(acc),
|
||||
"ordner", af.IMAPFolder,
|
||||
"emails", len(msgs),
|
||||
"wichtig", isImportant,
|
||||
)
|
||||
}
|
||||
|
||||
return wichtig, unwichtig, nil
|
||||
}
|
||||
|
||||
// LearnFromFoldersAllAccounts führt LearnFromFolders für alle konfigurierten Accounts aus.
|
||||
func LearnFromFoldersAllAccounts() (wichtig, unwichtig int, err error) {
|
||||
accounts := config.AllEmailAccounts()
|
||||
for _, acc := range accounts {
|
||||
w, u, accErr := LearnFromFolders(acc)
|
||||
if accErr != nil {
|
||||
slog.Error("[Triage-Learn] Account fehlgeschlagen", "account", accountLabel(acc), "fehler", accErr)
|
||||
continue
|
||||
}
|
||||
wichtig += w
|
||||
unwichtig += u
|
||||
}
|
||||
return wichtig, unwichtig, nil
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user