diff --git a/internal/brain/ask.go b/internal/brain/ask.go index 5174b5a..5db0d6b 100755 --- a/internal/brain/ask.go +++ b/internal/brain/ask.go @@ -77,7 +77,7 @@ Basierend auf diesen Informationen, beantworte bitte folgende Frage: } defer stream.Close() - fmt.Println("\nšŸ’¬ Antwort:\n") + fmt.Print("\nšŸ’¬ Antwort:\n\n") for { response, err := stream.Recv() if err != nil { @@ -88,15 +88,15 @@ Basierend auf diesen Informationen, beantworte bitte folgende Frage: } } - fmt.Println("\n") + fmt.Print("\n\n") fmt.Println(strings.Repeat("═", 80)) - fmt.Println("\nšŸ“š Verwendete Quellen:") + fmt.Print("\nšŸ“š Verwendete Quellen:\n") for i, chunk := range chunks { preview := chunk.Text if len(preview) > 80 { preview = preview[:80] + "..." } - fmt.Printf(" [%d] %.1f%% - %s\n", i+1, chunk.Score*100, preview) + fmt.Printf(" [%d] %.1f%% - %s\n %s\n", i+1, chunk.Score*100, chunk.Source, preview) } } @@ -120,7 +120,7 @@ func searchKnowledge(ctx context.Context, embClient *openai.Client, query string WithPayload: &pb.WithPayloadSelector{ SelectorOptions: &pb.WithPayloadSelector_Enable{Enable: true}, }, - ScoreThreshold: floatPtr(0.5), + ScoreThreshold: floatPtr(config.Cfg.ScoreThreshold), }) if err != nil { log.Printf("āŒ Suche fehlgeschlagen: %v", err) diff --git a/internal/brain/ingest.go b/internal/brain/ingest.go index 15e7afa..60d76be 100755 --- a/internal/brain/ingest.go +++ b/internal/brain/ingest.go @@ -143,7 +143,7 @@ func splitByHeadings(text string) []string { var sections []string var current strings.Builder for _, line := range lines { - if strings.HasPrefix(line, "# ") || strings.HasPrefix(line, "## ") { + if strings.HasPrefix(line, "# ") || strings.HasPrefix(line, "## ") || strings.HasPrefix(line, "### ") { if current.Len() > 0 { sections = append(sections, current.String()) current.Reset() diff --git a/internal/brain/ingest_json.go b/internal/brain/ingest_json.go index c44f87e..3db5359 100755 --- a/internal/brain/ingest_json.go +++ b/internal/brain/ingest_json.go @@ -51,12 +51,26 @@ func RunIngestJSON(inputFile string) { fmt.Printf("šŸ¤– Embedding: %s (%s)\n\n", config.Cfg.Embedding.Model, config.Cfg.Embedding.URL) + // Batched embedding – identisch zur Markdown-Ingest-Logik + batchSize := 10 success := 0 - for i, entry := range entries { - fmt.Printf("[%d/%d] šŸ”„ %s\n", i+1, len(entries), entry.FileName) + + for i := 0; i < len(entries); i += batchSize { + end := i + batchSize + if end > len(entries) { + end = len(entries) + } + batch := entries[i:end] + + texts := make([]string, len(batch)) + for j, e := range batch { + texts[j] = e.Description + } + + fmt.Printf("[%d–%d/%d] šŸ”„ Embedding-Batch...\n", i+1, end, len(entries)) embResp, err := embClient.CreateEmbeddings(ctx, openai.EmbeddingRequest{ - Input: []string{entry.Description}, + Input: texts, Model: openai.EmbeddingModel(config.Cfg.Embedding.Model), }) if err != nil { @@ -64,33 +78,38 @@ func RunIngestJSON(inputFile string) { continue } - _, err = pointsClient.Upsert(ctx, &pb.UpsertPoints{ - CollectionName: config.Cfg.Qdrant.Collection, - Points: []*pb.PointStruct{ - { - Id: &pb.PointId{ - PointIdOptions: &pb.PointId_Uuid{ - Uuid: generateID(entry.Description, entry.FileName), - }, - }, - Vectors: &pb.Vectors{ - VectorsOptions: &pb.Vectors_Vector{ - Vector: &pb.Vector{Data: embResp.Data[0].Embedding}, - }, - }, - Payload: map[string]*pb.Value{ - "text": {Kind: &pb.Value_StringValue{StringValue: entry.Description}}, - "source": {Kind: &pb.Value_StringValue{StringValue: entry.FileName}}, - "path": {Kind: &pb.Value_StringValue{StringValue: entry.FilePath}}, - "type": {Kind: &pb.Value_StringValue{StringValue: "image"}}, + var points []*pb.PointStruct + for j, emb := range embResp.Data { + e := batch[j] + points = append(points, &pb.PointStruct{ + Id: &pb.PointId{ + PointIdOptions: &pb.PointId_Uuid{ + Uuid: generateID(e.Description, e.FileName), }, }, - }, + Vectors: &pb.Vectors{ + VectorsOptions: &pb.Vectors_Vector{ + Vector: &pb.Vector{Data: emb.Embedding}, + }, + }, + Payload: map[string]*pb.Value{ + "text": {Kind: &pb.Value_StringValue{StringValue: e.Description}}, + "source": {Kind: &pb.Value_StringValue{StringValue: e.FileName}}, + "path": {Kind: &pb.Value_StringValue{StringValue: e.FilePath}}, + "type": {Kind: &pb.Value_StringValue{StringValue: "image"}}, + }, + }) + } + + _, err = pointsClient.Upsert(ctx, &pb.UpsertPoints{ + CollectionName: config.Cfg.Qdrant.Collection, + Points: points, + Wait: boolPtr(true), }) if err != nil { log.Printf(" āŒ Speichern Fehler: %v\n", err) } else { - success++ + success += len(batch) } } diff --git a/internal/brain/ingest_test.go b/internal/brain/ingest_test.go new file mode 100644 index 0000000..1c59ab4 --- /dev/null +++ b/internal/brain/ingest_test.go @@ -0,0 +1,128 @@ +package brain + +import ( + "strings" + "testing" +) + +func TestSplitByHeadings_H1(t *testing.T) { + text := "# Abschnitt A\nText A\n# Abschnitt B\nText B" + sections := splitByHeadings(text) + if len(sections) != 2 { + t.Fatalf("erwartet 2 Abschnitte, bekam %d", len(sections)) + } + if !strings.Contains(sections[0], "Abschnitt A") { + t.Errorf("Abschnitt 0 enthƤlt nicht 'Abschnitt A': %q", sections[0]) + } + if !strings.Contains(sections[1], "Abschnitt B") { + t.Errorf("Abschnitt 1 enthƤlt nicht 'Abschnitt B': %q", sections[1]) + } +} + +func TestSplitByHeadings_H2(t *testing.T) { + text := "## Intro\nText\n## Detail\nMehr Text" + sections := splitByHeadings(text) + if len(sections) != 2 { + t.Fatalf("erwartet 2 Abschnitte, bekam %d", len(sections)) + } +} + +func TestSplitByHeadings_H3(t *testing.T) { + text := "### Unterabschnitt A\nText A\n### Unterabschnitt B\nText B" + sections := splitByHeadings(text) + if len(sections) != 2 { + t.Fatalf("H3-Split: erwartet 2 Abschnitte, bekam %d", len(sections)) + } +} + +func TestSplitByHeadings_Mixed(t *testing.T) { + text := "# H1\nText\n## H2\nText2\n### H3\nText3" + sections := splitByHeadings(text) + if len(sections) != 3 { + t.Fatalf("gemischter Split: erwartet 3, bekam %d", len(sections)) + } +} + +func TestSplitByHeadings_NoHeadings(t *testing.T) { + text := "Kein Heading hier\nNur Text." + sections := splitByHeadings(text) + if len(sections) != 1 { + t.Fatalf("ohne Headings: erwartet 1 Abschnitt, bekam %d", len(sections)) + } +} + +func TestSplitLongSection_ShortPassthrough(t *testing.T) { + short := strings.Repeat("x", maxChunkSize-1) + chunks := splitLongSection(short) + if len(chunks) != 1 { + t.Fatalf("kurzer Abschnitt: erwartet 1 Chunk, bekam %d", len(chunks)) + } + if chunks[0] != short { + t.Errorf("Inhalt verƤndert") + } +} + +func TestSplitLongSection_SplitsByParagraph(t *testing.T) { + // Erzeuge zwei Paragraphen, die zusammen > maxChunkSize sind + para := strings.Repeat("a", maxChunkSize/2+10) + text := para + "\n\n" + para + chunks := splitLongSection(text) + if len(chunks) < 2 { + t.Fatalf("erwarte >= 2 Chunks für überlangen Text, bekam %d", len(chunks)) + } + for _, c := range chunks { + if len(c) > maxChunkSize { + t.Errorf("Chunk überschreitet maxChunkSize: len=%d", len(c)) + } + } +} + +func TestSplitLongSection_SingleLongParagraph(t *testing.T) { + // Ein einzelner Paragraph > maxChunkSize kann nicht weiter gesplittet werden + long := strings.Repeat("b", maxChunkSize+50) + chunks := splitLongSection(long) + if len(chunks) != 1 { + t.Fatalf("einzelner langer Paragraph: erwartet 1 Chunk, bekam %d", len(chunks)) + } +} + +func TestGenerateID_Deterministic(t *testing.T) { + id1 := generateID("text", "source.md") + id2 := generateID("text", "source.md") + if id1 != id2 { + t.Errorf("IDs nicht deterministisch: %s != %s", id1, id2) + } +} + +func TestGenerateID_DifferentForDifferentInput(t *testing.T) { + id1 := generateID("text A", "source.md") + id2 := generateID("text B", "source.md") + if id1 == id2 { + t.Errorf("verschiedene Texte ergeben gleiche ID") + } +} + +func TestGenerateID_DifferentSourceSameText(t *testing.T) { + id1 := generateID("same text", "file1.md") + id2 := generateID("same text", "file2.md") + if id1 == id2 { + t.Errorf("verschiedene Quellen mit gleichem Text ergeben gleiche ID") + } +} + +func TestReadAndChunk_FiltersShortSections(t *testing.T) { + // readAndChunk filtert Abschnitte < 20 Zeichen + // Wir testen splitByHeadings + splitLongSection direkt + content := "# A\nKurz\n# Abschnitt mit genug Text zum Indizieren hier" + sections := splitByHeadings(content) + var kept []string + for _, s := range sections { + s = strings.TrimSpace(s) + if len(s) >= 20 { + kept = append(kept, s) + } + } + if len(kept) != 1 { + t.Fatalf("erwartet 1 beibehaltenen Abschnitt, bekam %d", len(kept)) + } +} diff --git a/internal/config/config.go b/internal/config/config.go index 54f73e6..d6413af 100755 --- a/internal/config/config.go +++ b/internal/config/config.go @@ -31,8 +31,9 @@ type Config struct { Model string `yaml:"model"` } `yaml:"chat"` - BrainRoot string `yaml:"brain_root"` - TopK uint64 `yaml:"top_k"` + BrainRoot string `yaml:"brain_root"` + TopK uint64 `yaml:"top_k"` + ScoreThreshold float32 `yaml:"score_threshold"` } var Cfg Config