sync
This commit is contained in:
@@ -77,7 +77,7 @@ Basierend auf diesen Informationen, beantworte bitte folgende Frage:
|
|||||||
}
|
}
|
||||||
defer stream.Close()
|
defer stream.Close()
|
||||||
|
|
||||||
fmt.Println("\n💬 Antwort:\n")
|
fmt.Print("\n💬 Antwort:\n\n")
|
||||||
for {
|
for {
|
||||||
response, err := stream.Recv()
|
response, err := stream.Recv()
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -88,15 +88,15 @@ Basierend auf diesen Informationen, beantworte bitte folgende Frage:
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fmt.Println("\n")
|
fmt.Print("\n\n")
|
||||||
fmt.Println(strings.Repeat("═", 80))
|
fmt.Println(strings.Repeat("═", 80))
|
||||||
fmt.Println("\n📚 Verwendete Quellen:")
|
fmt.Print("\n📚 Verwendete Quellen:\n")
|
||||||
for i, chunk := range chunks {
|
for i, chunk := range chunks {
|
||||||
preview := chunk.Text
|
preview := chunk.Text
|
||||||
if len(preview) > 80 {
|
if len(preview) > 80 {
|
||||||
preview = preview[:80] + "..."
|
preview = preview[:80] + "..."
|
||||||
}
|
}
|
||||||
fmt.Printf(" [%d] %.1f%% - %s\n", i+1, chunk.Score*100, preview)
|
fmt.Printf(" [%d] %.1f%% - %s\n %s\n", i+1, chunk.Score*100, chunk.Source, preview)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -120,7 +120,7 @@ func searchKnowledge(ctx context.Context, embClient *openai.Client, query string
|
|||||||
WithPayload: &pb.WithPayloadSelector{
|
WithPayload: &pb.WithPayloadSelector{
|
||||||
SelectorOptions: &pb.WithPayloadSelector_Enable{Enable: true},
|
SelectorOptions: &pb.WithPayloadSelector_Enable{Enable: true},
|
||||||
},
|
},
|
||||||
ScoreThreshold: floatPtr(0.5),
|
ScoreThreshold: floatPtr(config.Cfg.ScoreThreshold),
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf("❌ Suche fehlgeschlagen: %v", err)
|
log.Printf("❌ Suche fehlgeschlagen: %v", err)
|
||||||
|
|||||||
@@ -143,7 +143,7 @@ func splitByHeadings(text string) []string {
|
|||||||
var sections []string
|
var sections []string
|
||||||
var current strings.Builder
|
var current strings.Builder
|
||||||
for _, line := range lines {
|
for _, line := range lines {
|
||||||
if strings.HasPrefix(line, "# ") || strings.HasPrefix(line, "## ") {
|
if strings.HasPrefix(line, "# ") || strings.HasPrefix(line, "## ") || strings.HasPrefix(line, "### ") {
|
||||||
if current.Len() > 0 {
|
if current.Len() > 0 {
|
||||||
sections = append(sections, current.String())
|
sections = append(sections, current.String())
|
||||||
current.Reset()
|
current.Reset()
|
||||||
|
|||||||
@@ -51,12 +51,26 @@ func RunIngestJSON(inputFile string) {
|
|||||||
|
|
||||||
fmt.Printf("🤖 Embedding: %s (%s)\n\n", config.Cfg.Embedding.Model, config.Cfg.Embedding.URL)
|
fmt.Printf("🤖 Embedding: %s (%s)\n\n", config.Cfg.Embedding.Model, config.Cfg.Embedding.URL)
|
||||||
|
|
||||||
|
// Batched embedding – identisch zur Markdown-Ingest-Logik
|
||||||
|
batchSize := 10
|
||||||
success := 0
|
success := 0
|
||||||
for i, entry := range entries {
|
|
||||||
fmt.Printf("[%d/%d] 🔄 %s\n", i+1, len(entries), entry.FileName)
|
for i := 0; i < len(entries); i += batchSize {
|
||||||
|
end := i + batchSize
|
||||||
|
if end > len(entries) {
|
||||||
|
end = len(entries)
|
||||||
|
}
|
||||||
|
batch := entries[i:end]
|
||||||
|
|
||||||
|
texts := make([]string, len(batch))
|
||||||
|
for j, e := range batch {
|
||||||
|
texts[j] = e.Description
|
||||||
|
}
|
||||||
|
|
||||||
|
fmt.Printf("[%d–%d/%d] 🔄 Embedding-Batch...\n", i+1, end, len(entries))
|
||||||
|
|
||||||
embResp, err := embClient.CreateEmbeddings(ctx, openai.EmbeddingRequest{
|
embResp, err := embClient.CreateEmbeddings(ctx, openai.EmbeddingRequest{
|
||||||
Input: []string{entry.Description},
|
Input: texts,
|
||||||
Model: openai.EmbeddingModel(config.Cfg.Embedding.Model),
|
Model: openai.EmbeddingModel(config.Cfg.Embedding.Model),
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
@@ -64,33 +78,38 @@ func RunIngestJSON(inputFile string) {
|
|||||||
continue
|
continue
|
||||||
}
|
}
|
||||||
|
|
||||||
_, err = pointsClient.Upsert(ctx, &pb.UpsertPoints{
|
var points []*pb.PointStruct
|
||||||
CollectionName: config.Cfg.Qdrant.Collection,
|
for j, emb := range embResp.Data {
|
||||||
Points: []*pb.PointStruct{
|
e := batch[j]
|
||||||
{
|
points = append(points, &pb.PointStruct{
|
||||||
Id: &pb.PointId{
|
Id: &pb.PointId{
|
||||||
PointIdOptions: &pb.PointId_Uuid{
|
PointIdOptions: &pb.PointId_Uuid{
|
||||||
Uuid: generateID(entry.Description, entry.FileName),
|
Uuid: generateID(e.Description, e.FileName),
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Vectors: &pb.Vectors{
|
Vectors: &pb.Vectors{
|
||||||
VectorsOptions: &pb.Vectors_Vector{
|
VectorsOptions: &pb.Vectors_Vector{
|
||||||
Vector: &pb.Vector{Data: embResp.Data[0].Embedding},
|
Vector: &pb.Vector{Data: emb.Embedding},
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
Payload: map[string]*pb.Value{
|
Payload: map[string]*pb.Value{
|
||||||
"text": {Kind: &pb.Value_StringValue{StringValue: entry.Description}},
|
"text": {Kind: &pb.Value_StringValue{StringValue: e.Description}},
|
||||||
"source": {Kind: &pb.Value_StringValue{StringValue: entry.FileName}},
|
"source": {Kind: &pb.Value_StringValue{StringValue: e.FileName}},
|
||||||
"path": {Kind: &pb.Value_StringValue{StringValue: entry.FilePath}},
|
"path": {Kind: &pb.Value_StringValue{StringValue: e.FilePath}},
|
||||||
"type": {Kind: &pb.Value_StringValue{StringValue: "image"}},
|
"type": {Kind: &pb.Value_StringValue{StringValue: "image"}},
|
||||||
},
|
},
|
||||||
},
|
})
|
||||||
},
|
}
|
||||||
|
|
||||||
|
_, err = pointsClient.Upsert(ctx, &pb.UpsertPoints{
|
||||||
|
CollectionName: config.Cfg.Qdrant.Collection,
|
||||||
|
Points: points,
|
||||||
|
Wait: boolPtr(true),
|
||||||
})
|
})
|
||||||
if err != nil {
|
if err != nil {
|
||||||
log.Printf(" ❌ Speichern Fehler: %v\n", err)
|
log.Printf(" ❌ Speichern Fehler: %v\n", err)
|
||||||
} else {
|
} else {
|
||||||
success++
|
success += len(batch)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
128
internal/brain/ingest_test.go
Normal file
128
internal/brain/ingest_test.go
Normal file
@@ -0,0 +1,128 @@
|
|||||||
|
package brain
|
||||||
|
|
||||||
|
import (
|
||||||
|
"strings"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TestSplitByHeadings_H1 feeds splitByHeadings a text with two "# " headings
// and checks that exactly two sections come back, the first containing
// "Abschnitt A" and the second containing "Abschnitt B".
func TestSplitByHeadings_H1(t *testing.T) {
|
||||||
|
text := "# Abschnitt A\nText A\n# Abschnitt B\nText B"
|
||||||
|
sections := splitByHeadings(text)
|
||||||
|
// Fatalf (not Errorf): the index accesses below would panic on a shorter slice.
if len(sections) != 2 {
|
||||||
|
t.Fatalf("erwartet 2 Abschnitte, bekam %d", len(sections))
|
||||||
|
}
|
||||||
|
if !strings.Contains(sections[0], "Abschnitt A") {
|
||||||
|
t.Errorf("Abschnitt 0 enthält nicht 'Abschnitt A': %q", sections[0])
|
||||||
|
}
|
||||||
|
if !strings.Contains(sections[1], "Abschnitt B") {
|
||||||
|
t.Errorf("Abschnitt 1 enthält nicht 'Abschnitt B': %q", sections[1])
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSplitByHeadings_H2 checks that a text with two "## " headings is split
// into exactly two sections. Only the section count is asserted.
func TestSplitByHeadings_H2(t *testing.T) {
|
||||||
|
text := "## Intro\nText\n## Detail\nMehr Text"
|
||||||
|
sections := splitByHeadings(text)
|
||||||
|
if len(sections) != 2 {
|
||||||
|
t.Fatalf("erwartet 2 Abschnitte, bekam %d", len(sections))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSplitByHeadings_H3 checks that a text with two "### " headings is split
// into exactly two sections. Only the section count is asserted.
func TestSplitByHeadings_H3(t *testing.T) {
|
||||||
|
text := "### Unterabschnitt A\nText A\n### Unterabschnitt B\nText B"
|
||||||
|
sections := splitByHeadings(text)
|
||||||
|
if len(sections) != 2 {
|
||||||
|
t.Fatalf("H3-Split: erwartet 2 Abschnitte, bekam %d", len(sections))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSplitByHeadings_Mixed checks that a text containing one "# ", one "## "
// and one "### " heading yields three sections, i.e. each heading level
// starts its own section. Only the section count is asserted.
func TestSplitByHeadings_Mixed(t *testing.T) {
|
||||||
|
text := "# H1\nText\n## H2\nText2\n### H3\nText3"
|
||||||
|
sections := splitByHeadings(text)
|
||||||
|
if len(sections) != 3 {
|
||||||
|
t.Fatalf("gemischter Split: erwartet 3, bekam %d", len(sections))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSplitByHeadings_NoHeadings checks that a text without any heading line
// is returned as a single section.
func TestSplitByHeadings_NoHeadings(t *testing.T) {
|
||||||
|
text := "Kein Heading hier\nNur Text."
|
||||||
|
sections := splitByHeadings(text)
|
||||||
|
if len(sections) != 1 {
|
||||||
|
t.Fatalf("ohne Headings: erwartet 1 Abschnitt, bekam %d", len(sections))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSplitLongSection_ShortPassthrough passes splitLongSection an input of
// maxChunkSize-1 bytes and expects a single chunk that equals the input.
func TestSplitLongSection_ShortPassthrough(t *testing.T) {
|
||||||
|
short := strings.Repeat("x", maxChunkSize-1)
|
||||||
|
chunks := splitLongSection(short)
|
||||||
|
// Fatalf (not Errorf): chunks[0] below would panic on an empty result.
if len(chunks) != 1 {
|
||||||
|
t.Fatalf("kurzer Abschnitt: erwartet 1 Chunk, bekam %d", len(chunks))
|
||||||
|
}
|
||||||
|
if chunks[0] != short {
|
||||||
|
t.Errorf("Inhalt verändert")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSplitLongSection_SplitsByParagraph joins two paragraphs with a blank
// line so that the combined text is longer than maxChunkSize, then expects
// splitLongSection to return at least two chunks, none of which is longer
// than maxChunkSize bytes.
func TestSplitLongSection_SplitsByParagraph(t *testing.T) {
|
||||||
|
// Build two paragraphs that together exceed maxChunkSize
|
||||||
|
para := strings.Repeat("a", maxChunkSize/2+10)
|
||||||
|
text := para + "\n\n" + para
|
||||||
|
chunks := splitLongSection(text)
|
||||||
|
if len(chunks) < 2 {
|
||||||
|
t.Fatalf("erwarte >= 2 Chunks für überlangen Text, bekam %d", len(chunks))
|
||||||
|
}
|
||||||
|
for _, c := range chunks {
|
||||||
|
if len(c) > maxChunkSize {
|
||||||
|
t.Errorf("Chunk überschreitet maxChunkSize: len=%d", len(c))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestSplitLongSection_SingleLongParagraph passes splitLongSection one
// paragraph of maxChunkSize+50 bytes with no blank line in it and expects it
// to come back as a single chunk, even though it is longer than maxChunkSize.
func TestSplitLongSection_SingleLongParagraph(t *testing.T) {
|
||||||
|
// A single paragraph longer than maxChunkSize cannot be split any further
|
||||||
|
long := strings.Repeat("b", maxChunkSize+50)
|
||||||
|
chunks := splitLongSection(long)
|
||||||
|
if len(chunks) != 1 {
|
||||||
|
t.Fatalf("einzelner langer Paragraph: erwartet 1 Chunk, bekam %d", len(chunks))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGenerateID_Deterministic checks that two calls to generateID with the
// same pair of arguments return equal IDs.
func TestGenerateID_Deterministic(t *testing.T) {
|
||||||
|
id1 := generateID("text", "source.md")
|
||||||
|
id2 := generateID("text", "source.md")
|
||||||
|
if id1 != id2 {
|
||||||
|
t.Errorf("IDs nicht deterministisch: %s != %s", id1, id2)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGenerateID_DifferentForDifferentInput checks that generateID returns
// different IDs when only the first argument (the text) differs and the
// second argument (the source name) stays the same.
func TestGenerateID_DifferentForDifferentInput(t *testing.T) {
|
||||||
|
id1 := generateID("text A", "source.md")
|
||||||
|
id2 := generateID("text B", "source.md")
|
||||||
|
if id1 == id2 {
|
||||||
|
t.Errorf("verschiedene Texte ergeben gleiche ID")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestGenerateID_DifferentSourceSameText checks that generateID returns
// different IDs when the text is the same but the second argument (the
// source name) differs.
func TestGenerateID_DifferentSourceSameText(t *testing.T) {
|
||||||
|
id1 := generateID("same text", "file1.md")
|
||||||
|
id2 := generateID("same text", "file2.md")
|
||||||
|
if id1 == id2 {
|
||||||
|
t.Errorf("verschiedene Quellen mit gleichem Text ergeben gleiche ID")
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// TestReadAndChunk_FiltersShortSections splits a two-section text with
// splitByHeadings, trims each section, keeps only those of at least 20 bytes,
// and expects exactly one section to survive.
//
// NOTE(review): despite its name this test never calls readAndChunk or
// splitLongSection; the length filter is re-implemented inline below, so a
// change to the real filter would not be caught here — confirm intent.
func TestReadAndChunk_FiltersShortSections(t *testing.T) {
|
||||||
|
// readAndChunk filters out sections shorter than 20 characters
// (per the original author; readAndChunk itself is not visible here)
|
||||||
|
// We test splitByHeadings + splitLongSection directly
// NOTE(review): only splitByHeadings is actually invoked in this test.
|
||||||
|
content := "# A\nKurz\n# Abschnitt mit genug Text zum Indizieren hier"
|
||||||
|
sections := splitByHeadings(content)
|
||||||
|
var kept []string
|
||||||
|
for _, s := range sections {
|
||||||
|
s = strings.TrimSpace(s)
|
||||||
|
// len counts bytes, so the threshold is 20 bytes rather than 20 runes.
if len(s) >= 20 {
|
||||||
|
kept = append(kept, s)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
if len(kept) != 1 {
|
||||||
|
t.Fatalf("erwartet 1 beibehaltenen Abschnitt, bekam %d", len(kept))
|
||||||
|
}
|
||||||
|
}
|
||||||
@@ -33,6 +33,7 @@ type Config struct {
|
|||||||
|
|
||||||
BrainRoot string `yaml:"brain_root"`
|
BrainRoot string `yaml:"brain_root"`
|
||||||
TopK uint64 `yaml:"top_k"`
|
TopK uint64 `yaml:"top_k"`
|
||||||
|
ScoreThreshold float32 `yaml:"score_threshold"`
|
||||||
}
|
}
|
||||||
|
|
||||||
var Cfg Config
|
var Cfg Config
|
||||||
|
|||||||
Reference in New Issue
Block a user