ai-agent/internal/brain/ingest_url.go

// ingest_url.go – Fetcht eine URL und importiert den Textinhalt in Qdrant
package brain

import (
	"context"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"

	pb "github.com/qdrant/go-client/qdrant"
	"golang.org/x/net/html"
	"google.golang.org/grpc/metadata"

	"my-brain-importer/internal/config"
)

// IngestURL fetcht eine URL, extrahiert den Textinhalt und importiert ihn in Qdrant.
// Gibt Anzahl der importierten Chunks zurück.
func IngestURL(rawURL string) (int, error) {
	client := &http.Client{Timeout: 30 * time.Second}
	resp, err := client.Get(rawURL)
	if err != nil {
		return 0, fmt.Errorf("HTTP-Fehler: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode < 200 || resp.StatusCode >= 300 {
		return 0, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
	}

	contentType := resp.Header.Get("Content-Type")
	var text string
	if strings.Contains(contentType, "text/html") {
		text, err = extractHTMLText(resp.Body)
		if err != nil {
			return 0, fmt.Errorf("HTML-Parsing fehlgeschlagen: %w", err)
		}
	} else {
		raw, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) // max 1MB
		if err != nil {
			return 0, fmt.Errorf("Lesen fehlgeschlagen: %w", err)
		}
		text = string(raw)
	}

	text = strings.TrimSpace(text)
	if len(text) < 20 {
		return 0, fmt.Errorf("kein verwertbarer Inhalt gefunden")
	}

	ctx := context.Background()
	ctx = metadata.AppendToOutgoingContext(ctx, "api-key", config.Cfg.Qdrant.APIKey)

	embClient := config.NewEmbeddingClient()
	conn := config.NewQdrantConn()
	defer conn.Close()

	ensureCollection(ctx, pb.NewCollectionsClient(conn))
	pointsClient := pb.NewPointsClient(conn)

	var chunks []chunk
	for _, part := range splitLongSection(text) {
		part = strings.TrimSpace(part)
		if len(part) < 20 {
			continue
		}
		chunks = append(chunks, chunk{Text: part, Source: rawURL, Type: "url"})
	}

	if len(chunks) == 0 {
		return 0, fmt.Errorf("kein verwertbarer Inhalt nach Aufteilung")
	}

	if err := ingestChunks(ctx, embClient, pointsClient, chunks); err != nil {
		return 0, fmt.Errorf("Ingest fehlgeschlagen: %w", err)
	}
	return len(chunks), nil
}

// extractHTMLText extrahiert sichtbaren Text aus einem HTML-Dokument.
func extractHTMLText(r io.Reader) (string, error) {
	doc, err := html.Parse(r)
	if err != nil {
		return "", err
	}
	var sb strings.Builder
	extractTextNode(doc, &sb)
	// Mehrfach-Leerzeilen reduzieren
	lines := strings.Split(sb.String(), "\n")
	var cleaned []string
	for _, line := range lines {
		line = strings.TrimSpace(line)
		if line != "" {
			cleaned = append(cleaned, line)
		}
	}
	return strings.Join(cleaned, "\n"), nil
}

// skipTags sind HTML-Elemente deren Inhalt nicht extrahiert wird.
var skipTags = map[string]bool{
	"script": true, "style": true, "noscript": true,
	"head": true, "meta": true, "link": true,
	"nav": true, "footer": true, "header": true,
}

func extractTextNode(n *html.Node, sb *strings.Builder) {
	if n.Type == html.TextNode {
		text := strings.TrimSpace(n.Data)
		if text != "" {
			sb.WriteString(text)
			sb.WriteString("\n")
		}
		return
	}
	if n.Type == html.ElementNode && skipTags[n.Data] {
		return
	}
	for c := n.FirstChild; c != nil; c = c.NextSibling {
		extractTextNode(c, sb)
	}
}