sync

2026-03-12 17:34:49 +01:00
parent fa5c15b607
commit 92f520101a
5 changed files with 180 additions and 32 deletions
--- a/internal/brain/ask.go
+++ b/internal/brain/ask.go
@@ -77,7 +77,7 @@ Basierend auf diesen Informationen, beantworte bitte folgende Frage:
 	}
 	defer stream.Close()

-	fmt.Println("\n💬 Antwort:\n")
+	fmt.Print("\n💬 Antwort:\n\n")
 	for {
 		response, err := stream.Recv()
 		if err != nil {
@@ -88,15 +88,15 @@ Basierend auf diesen Informationen, beantworte bitte folgende Frage:
 		}
 	}

-	fmt.Println("\n")
+	fmt.Print("\n\n")
 	fmt.Println(strings.Repeat("═", 80))
-	fmt.Println("\n📚 Verwendete Quellen:")
+	fmt.Print("\n📚 Verwendete Quellen:\n")
 	for i, chunk := range chunks {
 		preview := chunk.Text
 		if len(preview) > 80 {
 			preview = preview[:80] + "..."
 		}
-		fmt.Printf("  [%d] %.1f%% - %s\n", i+1, chunk.Score*100, preview)
+		fmt.Printf("  [%d] %.1f%% - %s\n      %s\n", i+1, chunk.Score*100, chunk.Source, preview)
 	}
 }

@@ -120,7 +120,7 @@ func searchKnowledge(ctx context.Context, embClient *openai.Client, query string
 		WithPayload: &pb.WithPayloadSelector{
 			SelectorOptions: &pb.WithPayloadSelector_Enable{Enable: true},
 		},
-		ScoreThreshold: floatPtr(0.5),
+		ScoreThreshold: floatPtr(config.Cfg.ScoreThreshold),
 	})
 	if err != nil {
 		log.Printf("❌ Suche fehlgeschlagen: %v", err)
--- a/internal/brain/ingest.go
+++ b/internal/brain/ingest.go
@@ -143,7 +143,7 @@ func splitByHeadings(text string) []string {
 	var sections []string
 	var current strings.Builder
 	for _, line := range lines {
-		if strings.HasPrefix(line, "# ") || strings.HasPrefix(line, "## ") {
+		if strings.HasPrefix(line, "# ") || strings.HasPrefix(line, "## ") || strings.HasPrefix(line, "### ") {
 			if current.Len() > 0 {
 				sections = append(sections, current.String())
 				current.Reset()
--- a/internal/brain/ingest_json.go
+++ b/internal/brain/ingest_json.go
@@ -51,12 +51,26 @@ func RunIngestJSON(inputFile string) {

 	fmt.Printf("🤖 Embedding: %s (%s)\n\n", config.Cfg.Embedding.Model, config.Cfg.Embedding.URL)

+	// Batched embedding – identisch zur Markdown-Ingest-Logik
+	batchSize := 10
 	success := 0
-	for i, entry := range entries {
-		fmt.Printf("[%d/%d] 🔄 %s\n", i+1, len(entries), entry.FileName)
+
+	for i := 0; i < len(entries); i += batchSize {
+		end := i + batchSize
+		if end > len(entries) {
+			end = len(entries)
+		}
+		batch := entries[i:end]
+
+		texts := make([]string, len(batch))
+		for j, e := range batch {
+			texts[j] = e.Description
+		}
+
+		fmt.Printf("[%d–%d/%d] 🔄 Embedding-Batch...\n", i+1, end, len(entries))

 		embResp, err := embClient.CreateEmbeddings(ctx, openai.EmbeddingRequest{
-			Input: []string{entry.Description},
+			Input: texts,
 			Model: openai.EmbeddingModel(config.Cfg.Embedding.Model),
 		})
 		if err != nil {
@@ -64,33 +78,38 @@ func RunIngestJSON(inputFile string) {
 			continue
 		}

-		_, err = pointsClient.Upsert(ctx, &pb.UpsertPoints{
-			CollectionName: config.Cfg.Qdrant.Collection,
-			Points: []*pb.PointStruct{
-				{
-					Id: &pb.PointId{
-						PointIdOptions: &pb.PointId_Uuid{
-							Uuid: generateID(entry.Description, entry.FileName),
-						},
-					},
-					Vectors: &pb.Vectors{
-						VectorsOptions: &pb.Vectors_Vector{
-							Vector: &pb.Vector{Data: embResp.Data[0].Embedding},
-						},
-					},
-					Payload: map[string]*pb.Value{
-						"text":   {Kind: &pb.Value_StringValue{StringValue: entry.Description}},
-						"source": {Kind: &pb.Value_StringValue{StringValue: entry.FileName}},
-						"path":   {Kind: &pb.Value_StringValue{StringValue: entry.FilePath}},
-						"type":   {Kind: &pb.Value_StringValue{StringValue: "image"}},
+		var points []*pb.PointStruct
+		for j, emb := range embResp.Data {
+			e := batch[j]
+			points = append(points, &pb.PointStruct{
+				Id: &pb.PointId{
+					PointIdOptions: &pb.PointId_Uuid{
+						Uuid: generateID(e.Description, e.FileName),
 					},
 				},
-			},
+				Vectors: &pb.Vectors{
+					VectorsOptions: &pb.Vectors_Vector{
+						Vector: &pb.Vector{Data: emb.Embedding},
+					},
+				},
+				Payload: map[string]*pb.Value{
+					"text":   {Kind: &pb.Value_StringValue{StringValue: e.Description}},
+					"source": {Kind: &pb.Value_StringValue{StringValue: e.FileName}},
+					"path":   {Kind: &pb.Value_StringValue{StringValue: e.FilePath}},
+					"type":   {Kind: &pb.Value_StringValue{StringValue: "image"}},
+				},
+			})
+		}
+
+		_, err = pointsClient.Upsert(ctx, &pb.UpsertPoints{
+			CollectionName: config.Cfg.Qdrant.Collection,
+			Points:         points,
+			Wait:           boolPtr(true),
 		})
 		if err != nil {
 			log.Printf("   ❌ Speichern Fehler: %v\n", err)
 		} else {
-			success++
+			success += len(batch)
 		}
 	}

--- a/internal/brain/ingest_test.go
+++ b/internal/brain/ingest_test.go
@@ -0,0 +1,128 @@
+package brain
+
+import (
+	"strings"
+	"testing"
+)
+
+// TestSplitByHeadings_H1 expects two "# " headings to give two sections, each holding its own heading.
+func TestSplitByHeadings_H1(t *testing.T) {
+	got := splitByHeadings("# Abschnitt A\nText A\n# Abschnitt B\nText B")
+	if n := len(got); n != 2 {
+		t.Fatalf("erwartet 2 Abschnitte, bekam %d", n)
+	}
+	if first := got[0]; !strings.Contains(first, "Abschnitt A") {
+		t.Errorf("Abschnitt 0 enthält nicht 'Abschnitt A': %q", first)
+	}
+	if second := got[1]; !strings.Contains(second, "Abschnitt B") {
+		t.Errorf("Abschnitt 1 enthält nicht 'Abschnitt B': %q", second)
+	}
+}
+
+// TestSplitByHeadings_H2 expects two "## " headings to yield two sections.
+func TestSplitByHeadings_H2(t *testing.T) {
+	const input = "## Intro\nText\n## Detail\nMehr Text"
+	if n := len(splitByHeadings(input)); n != 2 {
+		t.Fatalf("erwartet 2 Abschnitte, bekam %d", n)
+	}
+}
+
+// TestSplitByHeadings_H3 expects two "### " headings to yield two sections.
+func TestSplitByHeadings_H3(t *testing.T) {
+	const input = "### Unterabschnitt A\nText A\n### Unterabschnitt B\nText B"
+	if n := len(splitByHeadings(input)); n != 2 {
+		t.Fatalf("H3-Split: erwartet 2 Abschnitte, bekam %d", n)
+	}
+}
+
+// TestSplitByHeadings_Mixed expects one "# ", one "## " and one "### " heading to yield three sections.
+func TestSplitByHeadings_Mixed(t *testing.T) {
+	const input = "# H1\nText\n## H2\nText2\n### H3\nText3"
+	if n := len(splitByHeadings(input)); n != 3 {
+		t.Fatalf("gemischter Split: erwartet 3, bekam %d", n)
+	}
+}
+
+// TestSplitByHeadings_NoHeadings expects heading-free text to stay a single section.
+func TestSplitByHeadings_NoHeadings(t *testing.T) {
+	const input = "Kein Heading hier\nNur Text."
+	if n := len(splitByHeadings(input)); n != 1 {
+		t.Fatalf("ohne Headings: erwartet 1 Abschnitt, bekam %d", n)
+	}
+}
+
+// TestSplitLongSection_ShortPassthrough expects input one byte below
+// maxChunkSize to come back as a single chunk with unchanged content.
+func TestSplitLongSection_ShortPassthrough(t *testing.T) {
+	input := strings.Repeat("x", maxChunkSize-1)
+	if got := splitLongSection(input); len(got) != 1 {
+		t.Fatalf("kurzer Abschnitt: erwartet 1 Chunk, bekam %d", len(got))
+	} else if got[0] != input {
+		t.Errorf("Inhalt verändert")
+	}
+}
+
+// TestSplitLongSection_SplitsByParagraph expects two paragraphs that together
+// exceed maxChunkSize to be split into chunks that each fit within the limit.
+func TestSplitLongSection_SplitsByParagraph(t *testing.T) {
+	half := strings.Repeat("a", maxChunkSize/2+10)
+	got := splitLongSection(half + "\n\n" + half)
+	if n := len(got); n < 2 {
+		t.Fatalf("erwarte >= 2 Chunks für überlangen Text, bekam %d", n)
+	}
+	for i := range got {
+		if size := len(got[i]); size > maxChunkSize {
+			t.Errorf("Chunk überschreitet maxChunkSize: len=%d", size)
+		}
+	}
+}
+
+// TestSplitLongSection_SingleLongParagraph expects a single paragraph longer
+// than maxChunkSize, with no paragraph break in it, to stay in one chunk.
+func TestSplitLongSection_SingleLongParagraph(t *testing.T) {
+	oversized := strings.Repeat("b", maxChunkSize+50)
+	if n := len(splitLongSection(oversized)); n != 1 {
+		t.Fatalf("einzelner langer Paragraph: erwartet 1 Chunk, bekam %d", n)
+	}
+}
+
+// TestGenerateID_Deterministic expects identical inputs to yield identical IDs.
+func TestGenerateID_Deterministic(t *testing.T) {
+	first, second := generateID("text", "source.md"), generateID("text", "source.md")
+	if first != second {
+		t.Errorf("IDs nicht deterministisch: %s != %s", first, second)
+	}
+}
+
+// TestGenerateID_DifferentForDifferentInput expects two different texts with
+// the same source name to produce different IDs.
+func TestGenerateID_DifferentForDifferentInput(t *testing.T) {
+	if generateID("text A", "source.md") == generateID("text B", "source.md") {
+		t.Errorf("verschiedene Texte ergeben gleiche ID")
+	}
+}
+
+// TestGenerateID_DifferentSourceSameText expects the same text under two
+// different source names to produce different IDs.
+func TestGenerateID_DifferentSourceSameText(t *testing.T) {
+	if generateID("same text", "file1.md") == generateID("same text", "file2.md") {
+		t.Errorf("verschiedene Quellen mit gleichem Text ergeben gleiche ID")
+	}
+}
+
+// TestReadAndChunk_FiltersShortSections splits a two-heading document and
+// applies a 20-byte minimum length inline; exactly one section must survive.
+// NOTE(review): this mirrors the filter readAndChunk is said to apply, but it
+// never calls readAndChunk or splitLongSection — confirm that is intended.
+func TestReadAndChunk_FiltersShortSections(t *testing.T) {
+	const input = "# A\nKurz\n# Abschnitt mit genug Text zum Indizieren hier"
+	kept := 0
+	for _, section := range splitByHeadings(input) {
+		if len(strings.TrimSpace(section)) >= 20 {
+			kept++
+		}
+	}
+	if kept != 1 {
+		t.Fatalf("erwartet 1 beibehaltenen Abschnitt, bekam %d", kept)
+	}
+}
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -31,8 +31,9 @@ type Config struct {
 		Model string `yaml:"model"`
 	} `yaml:"chat"`

-	BrainRoot string `yaml:"brain_root"`
-	TopK      uint64 `yaml:"top_k"`
+	BrainRoot      string  `yaml:"brain_root"`
+	TopK           uint64  `yaml:"top_k"`
+	ScoreThreshold float32 `yaml:"score_threshold"`
 }

 var Cfg Config