Files
ai-agent/internal/brain/ingest_url.go
Christoph K. 905981cd1e zwischenstand
2026-03-20 23:24:56 +01:00

125 lines
3.1 KiB
Go
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// ingest_url.go Fetcht eine URL und importiert den Textinhalt in Qdrant
package brain
import (
"context"
"fmt"
"io"
"net/http"
"strings"
"time"
pb "github.com/qdrant/go-client/qdrant"
"golang.org/x/net/html"
"google.golang.org/grpc/metadata"
"my-brain-importer/internal/config"
)
// IngestURL fetches a URL, extracts its text content and imports it into
// Qdrant. It returns the number of imported chunks.
//
// The response body is capped at 1 MiB for both HTML and non-HTML content
// (the original only limited the non-HTML path, letting html.Parse read an
// unbounded body).
func IngestURL(rawURL string) (int, error) {
	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Get(rawURL)
	if err != nil {
		return 0, fmt.Errorf("HTTP-Fehler: %w", err)
	}
	defer resp.Body.Close()
	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		// resp.Status already includes the numeric code ("404 Not Found"),
		// so printing resp.StatusCode separately would duplicate it.
		return 0, fmt.Errorf("HTTP %s", resp.Status)
	}
	// Cap the body size regardless of content type (max 1 MiB).
	body := io.LimitReader(resp.Body, 1<<20)
	contentType := resp.Header.Get("Content-Type")
	var text string
	if strings.Contains(contentType, "text/html") {
		text, err = extractHTMLText(body)
		if err != nil {
			return 0, fmt.Errorf("HTML-Parsing fehlgeschlagen: %w", err)
		}
	} else {
		raw, err := io.ReadAll(body)
		if err != nil {
			return 0, fmt.Errorf("Lesen fehlgeschlagen: %w", err)
		}
		text = string(raw)
	}
	text = strings.TrimSpace(text)
	if len(text) < 20 {
		return 0, fmt.Errorf("kein verwertbarer Inhalt gefunden")
	}
	// Qdrant authentication travels as gRPC metadata on the outgoing context.
	ctx := context.Background()
	ctx = metadata.AppendToOutgoingContext(ctx, "api-key", config.Cfg.Qdrant.APIKey)
	embClient := config.NewEmbeddingClient()
	conn := config.NewQdrantConn()
	defer conn.Close()
	ensureCollection(ctx, pb.NewCollectionsClient(conn))
	pointsClient := pb.NewPointsClient(conn)
	// Split the text and drop fragments too short to carry useful content.
	var chunks []chunk
	for _, part := range splitLongSection(text) {
		part = strings.TrimSpace(part)
		if len(part) < 20 {
			continue
		}
		chunks = append(chunks, chunk{Text: part, Source: rawURL, Type: "url"})
	}
	if len(chunks) == 0 {
		return 0, fmt.Errorf("kein verwertbarer Inhalt nach Aufteilung")
	}
	if err := ingestChunks(ctx, embClient, pointsClient, chunks); err != nil {
		return 0, fmt.Errorf("Ingest fehlgeschlagen: %w", err)
	}
	return len(chunks), nil
}
// extractHTMLText extracts the visible text from an HTML document and
// collapses blank lines, returning one non-empty line per extracted
// text node joined by single newlines.
func extractHTMLText(r io.Reader) (string, error) {
	root, err := html.Parse(r)
	if err != nil {
		return "", err
	}
	var raw strings.Builder
	extractTextNode(root, &raw)
	// Drop empty lines so repeated whitespace doesn't bloat the result.
	var kept []string
	for _, ln := range strings.Split(raw.String(), "\n") {
		if ln = strings.TrimSpace(ln); ln != "" {
			kept = append(kept, ln)
		}
	}
	return strings.Join(kept, "\n"), nil
}
// skipTags lists the HTML elements whose entire subtree is excluded from
// text extraction: script/style payloads plus non-content page chrome.
var skipTags = map[string]bool{
	"script":   true,
	"style":    true,
	"noscript": true,
	"head":     true,
	"meta":     true,
	"link":     true,
	"nav":      true,
	"footer":   true,
	"header":   true,
}
// extractTextNode walks the node tree depth-first, appending each
// non-blank text node to sb followed by a newline. Subtrees rooted at
// an element contained in skipTags are skipped entirely.
func extractTextNode(n *html.Node, sb *strings.Builder) {
	switch n.Type {
	case html.TextNode:
		if trimmed := strings.TrimSpace(n.Data); trimmed != "" {
			sb.WriteString(trimmed)
			sb.WriteByte('\n')
		}
		return
	case html.ElementNode:
		if skipTags[n.Data] {
			return
		}
	}
	for child := n.FirstChild; child != nil; child = child.NextSibling {
		extractTextNode(child, sb)
	}
}