zwischenstand
This commit is contained in:
124
internal/brain/ingest_url.go
Normal file
124
internal/brain/ingest_url.go
Normal file
@@ -0,0 +1,124 @@
|
||||
// ingest_url.go – fetches a URL and imports its text content into Qdrant
|
||||
package brain
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"io"
|
||||
"net/http"
|
||||
"strings"
|
||||
"time"
|
||||
|
||||
pb "github.com/qdrant/go-client/qdrant"
|
||||
"golang.org/x/net/html"
|
||||
"google.golang.org/grpc/metadata"
|
||||
|
||||
"my-brain-importer/internal/config"
|
||||
)
|
||||
|
||||
// IngestURL fetcht eine URL, extrahiert den Textinhalt und importiert ihn in Qdrant.
|
||||
// Gibt Anzahl der importierten Chunks zurück.
|
||||
func IngestURL(rawURL string) (int, error) {
|
||||
client := &http.Client{Timeout: 30 * time.Second}
|
||||
resp, err := client.Get(rawURL)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("HTTP-Fehler: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode < 200 || resp.StatusCode >= 300 {
|
||||
return 0, fmt.Errorf("HTTP %d: %s", resp.StatusCode, resp.Status)
|
||||
}
|
||||
|
||||
contentType := resp.Header.Get("Content-Type")
|
||||
var text string
|
||||
if strings.Contains(contentType, "text/html") {
|
||||
text, err = extractHTMLText(resp.Body)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("HTML-Parsing fehlgeschlagen: %w", err)
|
||||
}
|
||||
} else {
|
||||
raw, err := io.ReadAll(io.LimitReader(resp.Body, 1<<20)) // max 1MB
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("Lesen fehlgeschlagen: %w", err)
|
||||
}
|
||||
text = string(raw)
|
||||
}
|
||||
|
||||
text = strings.TrimSpace(text)
|
||||
if len(text) < 20 {
|
||||
return 0, fmt.Errorf("kein verwertbarer Inhalt gefunden")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
ctx = metadata.AppendToOutgoingContext(ctx, "api-key", config.Cfg.Qdrant.APIKey)
|
||||
|
||||
embClient := config.NewEmbeddingClient()
|
||||
conn := config.NewQdrantConn()
|
||||
defer conn.Close()
|
||||
|
||||
ensureCollection(ctx, pb.NewCollectionsClient(conn))
|
||||
pointsClient := pb.NewPointsClient(conn)
|
||||
|
||||
var chunks []chunk
|
||||
for _, part := range splitLongSection(text) {
|
||||
part = strings.TrimSpace(part)
|
||||
if len(part) < 20 {
|
||||
continue
|
||||
}
|
||||
chunks = append(chunks, chunk{Text: part, Source: rawURL, Type: "url"})
|
||||
}
|
||||
|
||||
if len(chunks) == 0 {
|
||||
return 0, fmt.Errorf("kein verwertbarer Inhalt nach Aufteilung")
|
||||
}
|
||||
|
||||
if err := ingestChunks(ctx, embClient, pointsClient, chunks); err != nil {
|
||||
return 0, fmt.Errorf("Ingest fehlgeschlagen: %w", err)
|
||||
}
|
||||
return len(chunks), nil
|
||||
}
|
||||
|
||||
// extractHTMLText extrahiert sichtbaren Text aus einem HTML-Dokument.
|
||||
func extractHTMLText(r io.Reader) (string, error) {
|
||||
doc, err := html.Parse(r)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
var sb strings.Builder
|
||||
extractTextNode(doc, &sb)
|
||||
// Mehrfach-Leerzeilen reduzieren
|
||||
lines := strings.Split(sb.String(), "\n")
|
||||
var cleaned []string
|
||||
for _, line := range lines {
|
||||
line = strings.TrimSpace(line)
|
||||
if line != "" {
|
||||
cleaned = append(cleaned, line)
|
||||
}
|
||||
}
|
||||
return strings.Join(cleaned, "\n"), nil
|
||||
}
|
||||
|
||||
// skipTags lists the HTML element names whose subtree is not descended into
// when text is extracted.
var skipTags = map[string]bool{
	"script":   true,
	"style":    true,
	"noscript": true,
	"head":     true,
	"meta":     true,
	"link":     true,
	"nav":      true,
	"footer":   true,
	"header":   true,
}
|
||||
|
||||
func extractTextNode(n *html.Node, sb *strings.Builder) {
|
||||
if n.Type == html.TextNode {
|
||||
text := strings.TrimSpace(n.Data)
|
||||
if text != "" {
|
||||
sb.WriteString(text)
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
return
|
||||
}
|
||||
if n.Type == html.ElementNode && skipTags[n.Data] {
|
||||
return
|
||||
}
|
||||
for c := n.FirstChild; c != nil; c = c.NextSibling {
|
||||
extractTextNode(c, sb)
|
||||
}
|
||||
}
|
||||
Reference in New Issue
Block a user