zwischenstand

2026-03-20 23:24:56 +01:00
parent b1a576f61e
commit 905981cd1e
25 changed files with 3607 additions and 217 deletions
--- a/internal/brain/ingest_url.go
+++ b/internal/brain/ingest_url.go
@@ -0,0 +1,124 @@
+// ingest_url.go – fetches a URL and imports its text content into Qdrant.
+package brain
+
+import (
+	"context"
+	"fmt"
+	"io"
+	"net/http"
+	"strings"
+	"time"
+
+	pb "github.com/qdrant/go-client/qdrant"
+	"golang.org/x/net/html"
+	"google.golang.org/grpc/metadata"
+
+	"my-brain-importer/internal/config"
+)
+
+// IngestURL fetches rawURL over HTTP, extracts its text content and imports it
+// into Qdrant as chunks of type "url" whose source is rawURL.
+//
+// Responses whose Content-Type contains "text/html" (compared
+// case-insensitively) are parsed and reduced to their text via extractHTMLText;
+// every other content type is ingested as raw text. Both paths read only a
+// bounded number of bytes, so an oversized response cannot exhaust memory.
+//
+// It returns the number of imported chunks. An error is returned when the
+// request fails, the status is not 2xx, the body cannot be read or parsed,
+// fewer than 20 bytes of usable text remain, or the ingest step fails.
+func IngestURL(rawURL string) (int, error) {
+	const (
+		maxPlainBytes = 1 << 20  // 1 MiB cap for non-HTML bodies
+		maxHTMLBytes  = 10 << 20 // 10 MiB cap for HTML; markup makes pages much larger than their text
+		minTextBytes  = 20       // shorter texts and chunks are discarded
+	)
+
+	client := &http.Client{Timeout: 30 * time.Second}
+	resp, err := client.Get(rawURL)
+	if err != nil {
+		return 0, fmt.Errorf("HTTP-Fehler: %w", err)
+	}
+	defer resp.Body.Close()
+
+	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
+		// resp.Status already carries the numeric code (e.g. "404 Not Found"),
+		// so the code is not printed a second time.
+		return 0, fmt.Errorf("HTTP %s", resp.Status)
+	}
+
+	// Media type names are case-insensitive, so normalise before matching.
+	contentType := strings.ToLower(resp.Header.Get("Content-Type"))
+	var text string
+	if strings.Contains(contentType, "text/html") {
+		// Bound the parser input as well; the HTML parser is tolerant of a
+		// document that ends abruptly at the limit.
+		text, err = extractHTMLText(io.LimitReader(resp.Body, maxHTMLBytes))
+		if err != nil {
+			return 0, fmt.Errorf("HTML-Parsing fehlgeschlagen: %w", err)
+		}
+	} else {
+		raw, readErr := io.ReadAll(io.LimitReader(resp.Body, maxPlainBytes))
+		if readErr != nil {
+			return 0, fmt.Errorf("Lesen fehlgeschlagen: %w", readErr)
+		}
+		text = string(raw)
+	}
+
+	text = strings.TrimSpace(text)
+	if len(text) < minTextBytes {
+		return 0, fmt.Errorf("kein verwertbarer Inhalt gefunden")
+	}
+
+	// Attach the configured Qdrant API key as outgoing gRPC metadata.
+	ctx := context.Background()
+	ctx = metadata.AppendToOutgoingContext(ctx, "api-key", config.Cfg.Qdrant.APIKey)
+
+	embClient := config.NewEmbeddingClient()
+	conn := config.NewQdrantConn()
+	defer conn.Close()
+
+	// NOTE(review): ensureCollection is defined elsewhere; presumably it creates
+	// the target collection when missing — confirm how it reports failures.
+	ensureCollection(ctx, pb.NewCollectionsClient(conn))
+	pointsClient := pb.NewPointsClient(conn)
+
+	// Split the text into sections and drop the ones too short to be useful.
+	var chunks []chunk
+	for _, part := range splitLongSection(text) {
+		part = strings.TrimSpace(part)
+		if len(part) < minTextBytes {
+			continue
+		}
+		chunks = append(chunks, chunk{Text: part, Source: rawURL, Type: "url"})
+	}
+
+	if len(chunks) == 0 {
+		return 0, fmt.Errorf("kein verwertbarer Inhalt nach Aufteilung")
+	}
+
+	if err := ingestChunks(ctx, embClient, pointsClient, chunks); err != nil {
+		return 0, fmt.Errorf("Ingest fehlgeschlagen: %w", err)
+	}
+	return len(chunks), nil
+}
+
+// extractHTMLText parses r as an HTML document and returns the text collected
+// by extractTextNode, with every line trimmed and empty lines removed. The
+// remaining lines are joined with a single newline. A parse failure is
+// returned unchanged.
+func extractHTMLText(r io.Reader) (string, error) {
+	root, parseErr := html.Parse(r)
+	if parseErr != nil {
+		return "", parseErr
+	}
+
+	var collected strings.Builder
+	extractTextNode(root, &collected)
+
+	// Text nodes may contain newlines of their own, so normalise line by line
+	// instead of relying on the separators written during collection.
+	var kept []string
+	for _, candidate := range strings.Split(collected.String(), "\n") {
+		trimmed := strings.TrimSpace(candidate)
+		if trimmed == "" {
+			continue
+		}
+		kept = append(kept, trimmed)
+	}
+	return strings.Join(kept, "\n"), nil
+}
+
+// skipTags lists the HTML element names whose content is not extracted.
+var skipTags = map[string]bool{
+	"script":   true,
+	"style":    true,
+	"noscript": true,
+	"head":     true,
+	"meta":     true,
+	"link":     true,
+	"nav":      true,
+	"footer":   true,
+	"header":   true,
+}
+
+// extractTextNode walks the tree rooted at n depth-first and appends the
+// trimmed data of every non-blank text node to sb, each followed by a newline.
+// Element nodes listed in skipTags are skipped together with their children.
+func extractTextNode(n *html.Node, sb *strings.Builder) {
+	switch n.Type {
+	case html.TextNode:
+		// Text nodes are leaves: emit their content (if any) and stop.
+		if trimmed := strings.TrimSpace(n.Data); trimmed != "" {
+			sb.WriteString(trimmed)
+			sb.WriteString("\n")
+		}
+		return
+	case html.ElementNode:
+		if skipTags[n.Data] {
+			return
+		}
+	}
+
+	for child := n.FirstChild; child != nil; child = child.NextSibling {
+		extractTextNode(child, sb)
+	}
+}