zwischenstand
This commit is contained in:
82
internal/brain/ingest_pdf.go
Normal file
82
internal/brain/ingest_pdf.go
Normal file
@@ -0,0 +1,82 @@
|
||||
// ingest_pdf.go – Extracts text from a PDF file and imports it into Qdrant.
|
||||
package brain
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"github.com/ledongthuc/pdf"
|
||||
pb "github.com/qdrant/go-client/qdrant"
|
||||
"google.golang.org/grpc/metadata"
|
||||
|
||||
"my-brain-importer/internal/config"
|
||||
)
|
||||
|
||||
// IngestPDF extrahiert Text aus einer PDF-Datei und importiert ihn in Qdrant.
|
||||
// source ist der Anzeigename der Quelle (z.B. Dateiname).
|
||||
// Gibt Anzahl der importierten Chunks zurück.
|
||||
func IngestPDF(filePath, source string) (int, error) {
|
||||
text, err := extractPDFText(filePath)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("PDF-Parsing fehlgeschlagen: %w", err)
|
||||
}
|
||||
|
||||
text = strings.TrimSpace(text)
|
||||
if len(text) < 20 {
|
||||
return 0, fmt.Errorf("kein verwertbarer Text in PDF gefunden")
|
||||
}
|
||||
|
||||
ctx := context.Background()
|
||||
ctx = metadata.AppendToOutgoingContext(ctx, "api-key", config.Cfg.Qdrant.APIKey)
|
||||
|
||||
embClient := config.NewEmbeddingClient()
|
||||
conn := config.NewQdrantConn()
|
||||
defer conn.Close()
|
||||
|
||||
ensureCollection(ctx, pb.NewCollectionsClient(conn))
|
||||
pointsClient := pb.NewPointsClient(conn)
|
||||
|
||||
var chunks []chunk
|
||||
for _, part := range splitLongSection(text) {
|
||||
part = strings.TrimSpace(part)
|
||||
if len(part) < 20 {
|
||||
continue
|
||||
}
|
||||
chunks = append(chunks, chunk{Text: part, Source: source, Type: "pdf"})
|
||||
}
|
||||
|
||||
if len(chunks) == 0 {
|
||||
return 0, fmt.Errorf("kein verwertbarer Inhalt nach Aufteilung")
|
||||
}
|
||||
|
||||
if err := ingestChunks(ctx, embClient, pointsClient, chunks); err != nil {
|
||||
return 0, fmt.Errorf("Ingest fehlgeschlagen: %w", err)
|
||||
}
|
||||
return len(chunks), nil
|
||||
}
|
||||
|
||||
// extractPDFText liest alle Seiten einer PDF-Datei und gibt den Text zurück.
|
||||
func extractPDFText(filePath string) (string, error) {
|
||||
f, r, err := pdf.Open(filePath)
|
||||
if err != nil {
|
||||
return "", err
|
||||
}
|
||||
defer f.Close()
|
||||
|
||||
var sb strings.Builder
|
||||
totalPages := r.NumPage()
|
||||
for pageNum := 1; pageNum <= totalPages; pageNum++ {
|
||||
page := r.Page(pageNum)
|
||||
if page.V.IsNull() {
|
||||
continue
|
||||
}
|
||||
text, err := page.GetPlainText(nil)
|
||||
if err != nil {
|
||||
continue
|
||||
}
|
||||
sb.WriteString(text)
|
||||
sb.WriteString("\n")
|
||||
}
|
||||
return sb.String(), nil
|
||||
}
|
||||
Reference in New Issue
Block a user