KevinStephenson
Adding in weaviate code
b110593
raw
history blame
41.7 kB
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: [email protected]
//
//go:build integrationTest
package db
import (
"context"
"fmt"
"testing"
"github.com/go-openapi/strfmt"
"github.com/google/uuid"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/weaviate/weaviate/adapters/repos/db/inverted"
"github.com/weaviate/weaviate/entities/additional"
"github.com/weaviate/weaviate/entities/filters"
"github.com/weaviate/weaviate/entities/models"
"github.com/weaviate/weaviate/entities/schema"
"github.com/weaviate/weaviate/entities/searchparams"
enthnsw "github.com/weaviate/weaviate/entities/vectorindex/hnsw"
)
func BM25FinvertedConfig(k1, b float32, stopWordPreset string) *models.InvertedIndexConfig {
return &models.InvertedIndexConfig{
Bm25: &models.BM25Config{
K1: k1,
B: b,
},
CleanupIntervalSeconds: 60,
Stopwords: &models.StopwordConfig{
Preset: stopWordPreset,
},
IndexNullState: true,
IndexPropertyLength: true,
}
}
func SetupClass(t require.TestingT, repo *DB, schemaGetter *fakeSchemaGetter, logger logrus.FieldLogger, k1, b float32,
) {
vFalse := false
vTrue := true
class := &models.Class{
VectorIndexConfig: enthnsw.NewDefaultUserConfig(),
InvertedIndexConfig: BM25FinvertedConfig(k1, b, "none"),
Class: "MyClass",
Properties: []*models.Property{
{
Name: "title",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "description",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "review",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "textField",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationField,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "textWhitespace",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWhitespace,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "relatedToGolf",
DataType: schema.DataTypeBoolean.PropString(),
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "multiTitles",
DataType: schema.DataTypeTextArray.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "multiTextWhitespace",
DataType: schema.DataTypeTextArray.PropString(),
Tokenization: models.PropertyTokenizationWhitespace,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
},
}
schema := schema.Schema{
Objects: &models.Schema{
Classes: []*models.Class{class},
},
}
schemaGetter.schema = schema
migrator := NewMigrator(repo, logger)
migrator.AddClass(context.Background(), class, schemaGetter.shardState)
testData := []map[string]interface{}{}
testData = append(testData, map[string]interface{}{"title": "Our journey to BM25F", "description": "This is how we get to BM25F", "review": "none none none", "multiTitles": []string{"breakfast", "dinner"}})
testData = append(testData, map[string]interface{}{"title": "Why I dont like journey", "description": "This is about how we get somewhere", "multiTitles": []string{"going to a restaurant for dinner", "sandwiches and desert are a great lunch"}})
testData = append(testData, map[string]interface{}{"title": "My journeys in Journey", "description": "A journey story about journeying"})
testData = append(testData, map[string]interface{}{"title": "An unrelated title", "description": "Actually all about journey"})
testData = append(testData, map[string]interface{}{"title": "journey journey", "description": "journey journey journey"})
testData = append(testData, map[string]interface{}{"title": "journey", "description": "journey journey", "multiTextWhitespace": []string{"totally irrelevant:)", "we all MuuultiYell! together"}})
testData = append(testData, map[string]interface{}{"title": "JOURNEY", "description": "A LOUD JOURNEY", "multiTextWhitespace": []string{"MuuultiYell!", "is fun"}})
testData = append(testData, map[string]interface{}{"title": "An unrelated title", "description": "Absolutely nothing to do with the topic", "textField": "*&^$@#$%^&*()(Offtopic!!!!"})
testData = append(testData, map[string]interface{}{"title": "none", "description": "other", "textField": "YELLING IS FUN"})
testData = append(testData, map[string]interface{}{"title": "something", "description": "none none", "review": "none none none none none none"})
for i, data := range testData {
id := strfmt.UUID(uuid.MustParse(fmt.Sprintf("%032d", i)).String())
obj := &models.Object{Class: "MyClass", ID: id, Properties: data, CreationTimeUnix: 1565612833955, LastUpdateTimeUnix: 10000020}
vector := []float32{1, 3, 5, 0.4}
//{title: "Our journey to BM25F", description: " This is how we get to BM25F"}}
err := repo.PutObject(context.Background(), obj, vector, nil)
require.Nil(t, err)
}
}
// DuplicatedFrom SetupClass to make sure this new test does not alter the results of the existing one
func SetupClassForFilterScoringTest(t require.TestingT, repo *DB, schemaGetter *fakeSchemaGetter, logger logrus.FieldLogger, k1, b float32,
) {
vFalse := false
vTrue := true
class := &models.Class{
VectorIndexConfig: enthnsw.NewDefaultUserConfig(),
InvertedIndexConfig: BM25FinvertedConfig(k1, b, "none"),
Class: "FilterClass",
Properties: []*models.Property{
{
Name: "description",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "relatedToGolf",
DataType: schema.DataTypeBoolean.PropString(),
IndexFilterable: &vTrue,
},
},
}
schema := schema.Schema{
Objects: &models.Schema{
Classes: []*models.Class{class},
},
}
schemaGetter.schema = schema
migrator := NewMigrator(repo, logger)
migrator.AddClass(context.Background(), class, schemaGetter.shardState)
testData := []map[string]interface{}{}
testData = append(testData, map[string]interface{}{"description": "Brooks Koepka appeared a lot in the ms marco dataset. I was surprised to see golf content in there. I assume if the dataset was newer, we'd see a lot more Rory though.", "relatedToGolf": true})
testData = append(testData, map[string]interface{}{"description": "While one would expect Koepka to be a somewhat rare name, it did appear in msmarco also outside the context of Brooks.", "relatedToGolf": false})
for i, data := range testData {
id := strfmt.UUID(uuid.MustParse(fmt.Sprintf("%032d", i)).String())
obj := &models.Object{Class: "FilterClass", ID: id, Properties: data, CreationTimeUnix: 1565612833955, LastUpdateTimeUnix: 10000020}
vector := []float32{1, 3, 5, 0.4}
err := repo.PutObject(context.Background(), obj, vector, nil)
require.Nil(t, err)
}
}
func TestBM25FJourney(t *testing.T) {
dirName := t.TempDir()
logger := logrus.New()
schemaGetter := &fakeSchemaGetter{
schema: schema.Schema{Objects: &models.Schema{Classes: nil}},
shardState: singleShardState(),
}
repo, err := New(logger, Config{
MemtablesFlushIdleAfter: 60,
RootPath: dirName,
QueryMaximumResults: 10000,
MaxImportGoroutinesFactor: 1,
}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil)
require.Nil(t, err)
repo.SetSchemaGetter(schemaGetter)
require.Nil(t, repo.WaitForStartup(context.TODO()))
defer repo.Shutdown(context.Background())
SetupClass(t, repo, schemaGetter, logger, 1.2, 0.75)
idx := repo.GetIndex("MyClass")
require.NotNil(t, idx)
// Check basic search
addit := additional.Properties{}
t.Run("bm25f journey", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title", "description", "textField"}, Query: "journey"}
res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// Print results
t.Log("--- Start results for basic search ---")
for _, r := range res {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), r.Score(), r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
// Check results in correct order
require.Equal(t, uint64(4), res[0].DocID())
require.Equal(t, uint64(5), res[1].DocID())
require.Equal(t, uint64(6), res[2].DocID())
require.Equal(t, uint64(3), res[3].DocID())
require.Equal(t, uint64(0), res[4].DocID())
require.Equal(t, uint64(2), res[5].DocID())
// Without additionalExplanations no explainScore entry should be present
require.Contains(t, res[0].Object.Additional, "score")
require.NotContains(t, res[0].Object.Additional, "explainScore")
})
// Check non-alpha search on string field
// text/field are tokenized entirely, so we can search for non-alpha characters
t.Run("bm25f textField non-alpha", func(t *testing.T) {
kwrTextField := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title", "description", "textField"}, Query: "*&^$@#$%^&*()(Offtopic!!!!"}
addit = additional.Properties{}
resTextField, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwrTextField, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// Print results
t.Log("--- Start results for textField search ---")
for _, r := range resTextField {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), r.Score(), r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
// Check results in correct order
require.Equal(t, uint64(7), resTextField[0].DocID())
})
// text/field are not lower-cased before indexing, so upper case searches must be passed through unchanged.
t.Run("bm25f textField caps", func(t *testing.T) {
kwrTextField := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"textField"}, Query: "YELLING IS FUN"}
addit := additional.Properties{}
resTextField, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwrTextField, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// Print results
t.Log("--- Start results for textField caps search ---")
for _, r := range resTextField {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), r.Score(), r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
// Check results in correct order
require.Equal(t, uint64(8), resTextField[0].DocID())
})
// Check basic text search WITH CAPS
t.Run("bm25f text with caps", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title", "description"}, Query: "JOURNEY"}
res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
// Print results
t.Log("--- Start results for search with caps ---")
for _, r := range res {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), r.Score(), r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
require.Nil(t, err)
// Check results in correct order
require.Equal(t, uint64(4), res[0].DocID())
require.Equal(t, uint64(5), res[1].DocID())
require.Equal(t, uint64(6), res[2].DocID())
require.Equal(t, uint64(2), res[3].DocID())
require.Equal(t, uint64(3), res[4].DocID())
require.Equal(t, uint64(0), res[5].DocID())
require.Equal(t, uint64(1), res[6].DocID())
})
t.Run("bm25f journey boosted", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title^3", "description"}, Query: "journey"}
res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// Print results
t.Log("--- Start results for boosted search ---")
for _, r := range res {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), r.Score(), r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
// Check results in correct order
require.Equal(t, uint64(4), res[0].DocID())
require.Equal(t, uint64(5), res[1].DocID())
require.Equal(t, uint64(6), res[2].DocID())
require.Equal(t, uint64(0), res[3].DocID())
require.Equal(t, uint64(1), res[4].DocID())
require.Equal(t, uint64(2), res[5].DocID())
require.Equal(t, uint64(3), res[6].DocID())
})
t.Run("Check search with two terms", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title", "description"}, Query: "journey somewhere"}
res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// Check results in correct order
require.Equal(t, uint64(1), res[0].DocID())
require.Equal(t, uint64(4), res[1].DocID())
require.Equal(t, uint64(5), res[2].DocID())
require.Equal(t, uint64(6), res[3].DocID())
require.Equal(t, uint64(2), res[4].DocID())
})
t.Run("bm25f journey somewhere no properties", func(t *testing.T) {
// Check search with no properties (should include all properties)
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{}, Query: "journey somewhere"}
res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// Check results in correct order
require.Equal(t, uint64(1), res[0].DocID())
require.Equal(t, uint64(4), res[1].DocID())
require.Equal(t, uint64(5), res[2].DocID())
require.Equal(t, uint64(6), res[3].DocID())
})
t.Run("bm25f non alphanums", func(t *testing.T) {
// Check search with no properties (should include all properties)
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{}, Query: "*&^$@#$%^&*()(Offtopic!!!!"}
res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Equal(t, uint64(7), res[0].DocID())
})
t.Run("First result has high score", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "about BM25F"}
res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Equal(t, uint64(0), res[0].DocID())
require.Len(t, res, 4) // four results have one of the terms
})
t.Run("More results than limit", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "journey"}
res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Equal(t, uint64(4), res[0].DocID())
require.Equal(t, uint64(5), res[1].DocID())
require.Equal(t, uint64(6), res[2].DocID())
require.Equal(t, uint64(3), res[3].DocID())
require.Equal(t, uint64(2), res[4].DocID())
require.Len(t, res, 5) // four results have one of the terms
})
t.Run("Results from three properties", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Query: "none"}
res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Equal(t, uint64(9), res[0].DocID())
require.Equal(t, uint64(0), res[1].DocID())
require.Equal(t, uint64(8), res[2].DocID())
require.Len(t, res, 3)
})
t.Run("Include additional explanations", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "journey", AdditionalExplanations: true}
res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// With additionalExplanations explainScore entry should be present
require.Contains(t, res[0].Object.Additional, "score")
require.Contains(t, res[0].Object.Additional, "explainScore")
require.Contains(t, res[0].Object.Additional["explainScore"], "BM25")
})
t.Run("Array fields text", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"multiTitles"}, Query: "dinner"}
res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Len(t, res, 2)
require.Equal(t, uint64(0), res[0].DocID())
require.Equal(t, uint64(1), res[1].DocID())
})
t.Run("Array fields string", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"multiTextWhitespace"}, Query: "MuuultiYell!"}
res, _, err := idx.objectSearch(context.TODO(), 5, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Len(t, res, 2)
require.Equal(t, uint64(6), res[0].DocID())
require.Equal(t, uint64(5), res[1].DocID())
})
t.Run("With autocut", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Query: "journey", Properties: []string{"description"}}
resNoAutoCut, _, err := idx.objectSearch(context.TODO(), 10, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
resAutoCut, _, err := idx.objectSearch(context.TODO(), 10, nil, kwr, nil, nil, addit, nil, "", 1)
require.Nil(t, err)
require.Less(t, len(resAutoCut), len(resNoAutoCut))
require.EqualValues(t, 0.5868752, resNoAutoCut[0].Score())
require.EqualValues(t, 0.5450892, resNoAutoCut[1].Score()) // <= autocut last element
require.EqualValues(t, 0.34149727, resNoAutoCut[2].Score())
require.EqualValues(t, 0.3049518, resNoAutoCut[3].Score())
require.EqualValues(t, 0.27547202, resNoAutoCut[4].Score())
require.Len(t, resAutoCut, 2)
require.EqualValues(t, 0.5868752, resAutoCut[0].Score())
require.EqualValues(t, 0.5450892, resAutoCut[1].Score())
})
}
func TestBM25FSingleProp(t *testing.T) {
dirName := t.TempDir()
logger := logrus.New()
schemaGetter := &fakeSchemaGetter{
schema: schema.Schema{Objects: &models.Schema{Classes: nil}},
shardState: singleShardState(),
}
repo, err := New(logger, Config{
MemtablesFlushIdleAfter: 60,
RootPath: dirName,
QueryMaximumResults: 10000,
MaxImportGoroutinesFactor: 1,
}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil)
require.Nil(t, err)
repo.SetSchemaGetter(schemaGetter)
require.Nil(t, repo.WaitForStartup(context.TODO()))
defer repo.Shutdown(context.Background())
SetupClass(t, repo, schemaGetter, logger, 0.5, 100)
idx := repo.GetIndex("MyClass")
require.NotNil(t, idx)
// Check boosted
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "journey"}
addit := additional.Properties{}
res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
t.Log("--- Start results for singleprop search ---")
for _, r := range res {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), r.Score(), r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
require.Nil(t, err)
// Check results in correct order
require.Equal(t, uint64(3), res[0].DocID())
require.Equal(t, uint64(4), res[3].DocID())
// Check scores
EqualFloats(t, float32(0.1248), res[0].Score(), 5)
EqualFloats(t, float32(0.0363), res[1].Score(), 5)
}
func TestBM25FWithFilters(t *testing.T) {
dirName := t.TempDir()
logger := logrus.New()
schemaGetter := &fakeSchemaGetter{
schema: schema.Schema{Objects: &models.Schema{Classes: nil}},
shardState: singleShardState(),
}
repo, err := New(logger, Config{
MemtablesFlushIdleAfter: 60,
RootPath: dirName,
QueryMaximumResults: 10000,
MaxImportGoroutinesFactor: 1,
}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil)
require.Nil(t, err)
repo.SetSchemaGetter(schemaGetter)
require.Nil(t, repo.WaitForStartup(context.TODO()))
defer repo.Shutdown(context.Background())
SetupClass(t, repo, schemaGetter, logger, 0.5, 100)
idx := repo.GetIndex("MyClass")
require.NotNil(t, idx)
filter := &filters.LocalFilter{
Root: &filters.Clause{
Operator: filters.OperatorOr,
Operands: []filters.Clause{
{
Operator: filters.OperatorEqual,
On: &filters.Path{
Class: schema.ClassName("MyClass"),
Property: schema.PropertyName("title"),
},
Value: &filters.Value{
Value: "My",
Type: schema.DataType("text"),
},
},
{
Operator: filters.OperatorEqual,
On: &filters.Path{
Class: schema.ClassName("MyClass"),
Property: schema.PropertyName("title"),
},
Value: &filters.Value{
Value: "journeys",
Type: schema.DataType("text"),
},
},
},
},
}
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"description"}, Query: "journey"}
addit := additional.Properties{}
res, _, err := idx.objectSearch(context.TODO(), 1000, filter, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.True(t, len(res) == 1)
require.Equal(t, uint64(2), res[0].DocID())
}
func TestBM25FWithFilters_ScoreIsIdenticalWithOrWithoutFilter(t *testing.T) {
dirName := t.TempDir()
logger := logrus.New()
schemaGetter := &fakeSchemaGetter{
schema: schema.Schema{Objects: &models.Schema{Classes: nil}},
shardState: singleShardState(),
}
repo, err := New(logger, Config{
MemtablesFlushIdleAfter: 60,
RootPath: dirName,
QueryMaximumResults: 10000,
MaxImportGoroutinesFactor: 1,
}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil)
require.Nil(t, err)
repo.SetSchemaGetter(schemaGetter)
require.Nil(t, repo.WaitForStartup(context.TODO()))
defer repo.Shutdown(context.Background())
SetupClassForFilterScoringTest(t, repo, schemaGetter, logger, 1.2, 0.75)
idx := repo.GetIndex("FilterClass")
require.NotNil(t, idx)
filter := &filters.LocalFilter{
Root: &filters.Clause{
On: &filters.Path{
Class: schema.ClassName("FilterClass"),
Property: schema.PropertyName("relatedToGolf"),
},
Operator: filters.OperatorEqual,
Value: &filters.Value{
Value: true,
Type: dtBool,
},
},
}
kwr := &searchparams.KeywordRanking{
Type: "bm25",
Properties: []string{"description"},
Query: "koepka golf",
}
addit := additional.Properties{}
filtered, _, err := idx.objectSearch(context.TODO(), 1000, filter, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
unfiltered, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Len(t, filtered, 1) // should match exactly one element
require.Len(t, unfiltered, 2) // contains irrelevant result
assert.Equal(t, uint64(0), filtered[0].DocID()) // brooks koepka result
assert.Equal(t, uint64(0), unfiltered[0].DocID()) // brooks koepka result
assert.Equal(t, filtered[0].Score(), unfiltered[0].Score())
}
func TestBM25FDifferentParamsJourney(t *testing.T) {
dirName := t.TempDir()
logger := logrus.New()
schemaGetter := &fakeSchemaGetter{
schema: schema.Schema{Objects: &models.Schema{Classes: nil}},
shardState: singleShardState(),
}
repo, err := New(logger, Config{
MemtablesFlushIdleAfter: 60,
RootPath: dirName,
QueryMaximumResults: 10000,
MaxImportGoroutinesFactor: 1,
}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil)
require.Nil(t, err)
repo.SetSchemaGetter(schemaGetter)
require.Nil(t, repo.WaitForStartup(context.TODO()))
defer repo.Shutdown(context.Background())
SetupClass(t, repo, schemaGetter, logger, 0.5, 100)
idx := repo.GetIndex("MyClass")
require.NotNil(t, idx)
// Check boosted
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title^2", "description"}, Query: "journey"}
addit := additional.Properties{}
res, _, err := idx.objectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit, nil, "", 0)
// Print results
t.Log("--- Start results for boosted search ---")
for _, r := range res {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), r.Score(), r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
require.Nil(t, err)
// Check results in correct order
require.Equal(t, uint64(6), res[0].DocID())
require.Equal(t, uint64(1), res[3].DocID())
// Print results
t.Log("--- Start results for boosted search ---")
for _, r := range res {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), r.Score(), r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
// Check scores
EqualFloats(t, float32(0.06023), res[0].Score(), 6)
EqualFloats(t, float32(0.04238), res[1].Score(), 6)
}
func EqualFloats(t *testing.T, expected, actual float32, significantFigures int) {
s1 := fmt.Sprintf("%v", expected)
s2 := fmt.Sprintf("%v", actual)
if len(s1) < 2 || len(s2) < 2 {
t.Fail()
}
if len(s1) <= significantFigures {
significantFigures = len(s1) - 1
}
if len(s2) <= significantFigures {
significantFigures = len(s2) - 1
}
require.Equal(t, s1[:significantFigures+1], s2[:significantFigures+1])
}
// Compare with previous BM25 version to ensure the algorithm functions correctly
func TestBM25FCompare(t *testing.T) {
dirName := t.TempDir()
logger := logrus.New()
schemaGetter := &fakeSchemaGetter{
schema: schema.Schema{Objects: &models.Schema{Classes: nil}},
shardState: singleShardState(),
}
repo, err := New(logger, Config{
MemtablesFlushIdleAfter: 60,
RootPath: dirName,
QueryMaximumResults: 10000,
MaxImportGoroutinesFactor: 1,
}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil)
require.Nil(t, err)
repo.SetSchemaGetter(schemaGetter)
require.Nil(t, repo.WaitForStartup(context.TODO()))
defer repo.Shutdown(context.Background())
SetupClass(t, repo, schemaGetter, logger, 0.5, 100)
idx := repo.GetIndex("MyClass")
require.NotNil(t, idx)
shardNames := idx.getSchema.CopyShardingState(idx.Config.ClassName.String()).AllPhysicalShards()
for _, shardName := range shardNames {
shard := idx.shards.Load(shardName)
t.Logf("------ BM25F --------\n")
kwr := &searchparams.KeywordRanking{Type: "bm25", Properties: []string{"title"}, Query: "journey"}
addit := additional.Properties{}
withBM25Fobjs, withBM25Fscores, err := shard.ObjectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit)
require.Nil(t, err)
for i, r := range withBM25Fobjs {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), withBM25Fscores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
t.Logf("------ BM25 --------\n")
kwr.Type = ""
objs, scores, err := shard.ObjectSearch(context.TODO(), 1000, nil, kwr, nil, nil, addit)
require.Nil(t, err)
for i, r := range objs {
t.Logf("Result id: %v, score: %v, title: %v, description: %v, additional %+v\n", r.DocID(), scores[i], r.Object.Properties.(map[string]interface{})["title"], r.Object.Properties.(map[string]interface{})["description"], r.Object.Additional)
}
require.Equal(t, len(withBM25Fobjs), len(objs))
for i := range objs {
t.Logf("%v: BM25F score: %v, BM25 score: %v", i, withBM25Fscores[i], scores[i])
EqualFloats(t, withBM25Fscores[i], scores[i], 9)
}
// Not all the scores are unique and the search is not stable, so pick ones that don't move
require.Equal(t, uint64(4), objs[0].DocID())
require.Equal(t, uint64(5), objs[1].DocID())
require.Equal(t, uint64(6), objs[2].DocID())
require.Equal(t, uint64(1), objs[3].DocID())
require.Equal(t, uint64(2), objs[4].DocID())
require.Equal(t, uint64(0), objs[5].DocID())
require.Equal(t, uint64(4), withBM25Fobjs[0].DocID())
require.Equal(t, uint64(5), withBM25Fobjs[1].DocID())
require.Equal(t, uint64(6), withBM25Fobjs[2].DocID())
require.Equal(t, uint64(1), withBM25Fobjs[3].DocID())
require.Equal(t, uint64(2), withBM25Fobjs[4].DocID())
require.Equal(t, uint64(0), withBM25Fobjs[5].DocID())
}
}
func Test_propertyHasSearchableIndex(t *testing.T) {
vFalse := false
vTrue := true
class := &models.Class{
VectorIndexConfig: enthnsw.NewDefaultUserConfig(),
InvertedIndexConfig: BM25FinvertedConfig(1, 1, "none"),
Class: "MyClass",
Properties: []*models.Property{
{
Name: "title",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: nil,
},
{
Name: "description",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "textField",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationField,
IndexFilterable: &vFalse,
IndexSearchable: &vFalse,
},
},
}
ClassSchema := &models.Schema{
Classes: []*models.Class{class},
}
t.Run("Property index", func(t *testing.T) {
if got := inverted.PropertyHasSearchableIndex(ClassSchema, "MyClass", "description"); got != true {
t.Errorf("PropertyHasSearchableIndex() = %v, want %v", got, true)
}
if got := inverted.PropertyHasSearchableIndex(ClassSchema, "MyClass", "description^2"); got != true {
t.Errorf("PropertyHasSearchableIndex() = %v, want %v", got, true)
}
if got := inverted.PropertyHasSearchableIndex(ClassSchema, "MyClass", "textField"); got != false {
t.Errorf("PropertyHasSearchableIndex() = %v, want %v", got, false)
}
if got := inverted.PropertyHasSearchableIndex(ClassSchema, "MyClass", "title"); got != true {
t.Errorf("PropertyHasSearchableIndex() = %v, want %v", got, true)
}
})
}
func SetupClassDocuments(t require.TestingT, repo *DB, schemaGetter *fakeSchemaGetter, logger logrus.FieldLogger, k1, b float32, preset string,
) string {
vFalse := false
vTrue := true
className := "DocumentsPreset_" + preset
class := &models.Class{
VectorIndexConfig: enthnsw.NewDefaultUserConfig(),
InvertedIndexConfig: BM25FinvertedConfig(k1, b, preset),
Class: className,
Properties: []*models.Property{
{
Name: "document",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
},
}
schemaGetter.schema = schema.Schema{
Objects: &models.Schema{
Classes: []*models.Class{class},
},
}
migrator := NewMigrator(repo, logger)
migrator.AddClass(context.Background(), class, schemaGetter.shardState)
testData := []map[string]interface{}{}
testData = append(testData, map[string]interface{}{"document": "No matter what you do, the question of \"\"what is income\"\" is *always* going to be an extremely complex question. To use this particular example, is paying a royalty fee to an external party a legitimate business expense that is part of the cost of doing business and which subtracts from your \"\"income\"\"?"})
testData = append(testData, map[string]interface{}{"document": "test"})
testData = append(testData, map[string]interface{}{"document": "As long as the losing business is not considered \"\"passive activity\"\" or \"\"hobby\"\", then yes. Passive Activity is an activity where you do not have to actively do anything to generate income. For example - royalties or rentals. Hobby is an activity that doesn't generate profit. Generally, if your business doesn't consistently generate profit (the IRS looks at 3 out of the last 5 years), it may be characterized as hobby. For hobby, loss deduction is limited by the hobby income and the 2% AGI threshold."})
testData = append(testData, map[string]interface{}{"document": "So you're basically saying that average market fluctuations have an affect on individual stocks, because individual stocks are often priced in relation to the growth of the market as a whole? Also, what kinds of investments would be considered \"\"risk free\"\" in this nomenclature?"})
for i, data := range testData {
id := strfmt.UUID(uuid.MustParse(fmt.Sprintf("%032d", i)).String())
obj := &models.Object{Class: className, ID: id, Properties: data, CreationTimeUnix: 1565612833955, LastUpdateTimeUnix: 10000020}
vector := []float32{1, 3, 5, 0.4}
//{title: "Our journey to BM25F", description: " This is how we get to BM25F"}}
err := repo.PutObject(context.Background(), obj, vector, nil)
require.Nil(t, err)
}
return className
}
func TestBM25F_ComplexDocuments(t *testing.T) {
dirName := t.TempDir()
logger := logrus.New()
schemaGetter := &fakeSchemaGetter{
schema: schema.Schema{Objects: &models.Schema{Classes: nil}},
shardState: singleShardState(),
}
schemaGetter.schema = schema.Schema{
Objects: &models.Schema{
Classes: []*models.Class{},
},
}
repo, err := New(logger, Config{
MemtablesFlushIdleAfter: 60,
RootPath: dirName,
QueryMaximumResults: 10000,
MaxImportGoroutinesFactor: 1,
}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil)
require.Nil(t, err)
repo.SetSchemaGetter(schemaGetter)
require.Nil(t, repo.WaitForStartup(context.TODO()))
defer repo.Shutdown(context.Background())
classNone := SetupClassDocuments(t, repo, schemaGetter, logger, 0.5, 0.75, "none")
idxNone := repo.GetIndex(schema.ClassName(classNone))
require.NotNil(t, idxNone)
addit := additional.Properties{}
t.Run("single term", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Query: "considered a"}
res, _, err := idxNone.objectSearch(context.TODO(), 10, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// Print results
t.Log("--- Start results for boosted search ---")
for _, r := range res {
t.Logf("Result id: %v, score: %v, \n", r.DocID(), r.Score())
}
// Check results in correct order
require.Equal(t, uint64(3), res[0].DocID())
require.Equal(t, uint64(0), res[1].DocID())
require.Equal(t, uint64(2), res[2].DocID())
require.Len(t, res, 3)
// Check scores
EqualFloats(t, float32(0.8914), res[0].Score(), 5)
EqualFloats(t, float32(0.5425), res[1].Score(), 5)
EqualFloats(t, float32(0.3952), res[2].Score(), 5)
})
t.Run("Results without stopwords", func(t *testing.T) {
kwrNoStopwords := &searchparams.KeywordRanking{Type: "bm25", Query: "example losing business"}
resNoStopwords, _, err := idxNone.objectSearch(context.TODO(), 10, nil, kwrNoStopwords, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
classEn := SetupClassDocuments(t, repo, schemaGetter, logger, 0.5, 0.75, "en")
idxEn := repo.GetIndex(schema.ClassName(classEn))
require.NotNil(t, idxEn)
kwrStopwords := &searchparams.KeywordRanking{Type: "bm25", Query: "an example on losing the business"}
resStopwords, _, err := idxEn.objectSearch(context.TODO(), 10, nil, kwrStopwords, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Equal(t, len(resNoStopwords), len(resStopwords))
for i, resNo := range resNoStopwords {
resYes := resStopwords[i]
require.Equal(t, resNo.DocID(), resYes.DocID())
require.Equal(t, resNo.Score(), resYes.Score())
}
kwrStopwordsDuplicate := &searchparams.KeywordRanking{Type: "bm25", Query: "on an example on losing the business on"}
resStopwordsDuplicate, _, err := idxEn.objectSearch(context.TODO(), 10, nil, kwrStopwordsDuplicate, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
require.Equal(t, len(resNoStopwords), len(resStopwordsDuplicate))
for i, resNo := range resNoStopwords {
resYes := resStopwordsDuplicate[i]
require.Equal(t, resNo.DocID(), resYes.DocID())
require.Equal(t, resNo.Score(), resYes.Score())
}
})
}
func MultiPropClass(t require.TestingT, repo *DB, schemaGetter *fakeSchemaGetter, logger logrus.FieldLogger, k1, b float32) string {
vFalse := false
vTrue := true
className := "MultiProps"
class := &models.Class{
VectorIndexConfig: enthnsw.NewDefaultUserConfig(),
InvertedIndexConfig: BM25FinvertedConfig(k1, b, "none"),
Class: className,
Properties: []*models.Property{
{
Name: "document",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
{
Name: "title",
DataType: schema.DataTypeText.PropString(),
Tokenization: models.PropertyTokenizationWord,
IndexFilterable: &vFalse,
IndexSearchable: &vTrue,
},
},
}
schemaGetter.schema = schema.Schema{
Objects: &models.Schema{
Classes: []*models.Class{class},
},
}
migrator := NewMigrator(repo, logger)
migrator.AddClass(context.Background(), class, schemaGetter.shardState)
testData := []map[string]interface{}{}
testData = append(testData, map[string]interface{}{"document": "test", "title": "pepper"})
testData = append(testData, map[string]interface{}{"document": "banana", "title": "pepper"})
testData = append(testData, map[string]interface{}{"document": "apple", "title": "banana taste great"})
testData = append(testData, map[string]interface{}{"document": "banana burger", "title": "test"})
testData = append(testData, map[string]interface{}{"document": "carotte", "title": "great"})
for i, data := range testData {
id := strfmt.UUID(uuid.MustParse(fmt.Sprintf("%032d", i)).String())
obj := &models.Object{Class: className, ID: id, Properties: data, CreationTimeUnix: 1565612833955, LastUpdateTimeUnix: 10000020}
vector := []float32{1, 3, 5, 0.4}
err := repo.PutObject(context.Background(), obj, vector, nil)
require.Nil(t, err)
}
return className
}
func TestBM25F_SortMultiProp(t *testing.T) {
t.Skip("Currently failing")
dirName := t.TempDir()
logger := logrus.New()
schemaGetter := &fakeSchemaGetter{
schema: schema.Schema{Objects: &models.Schema{Classes: nil}},
shardState: singleShardState(),
}
schemaGetter.schema = schema.Schema{
Objects: &models.Schema{
Classes: []*models.Class{},
},
}
repo, err := New(logger, Config{
MemtablesFlushIdleAfter: 60,
RootPath: dirName,
QueryMaximumResults: 10000,
MaxImportGoroutinesFactor: 1,
}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, nil, nil)
require.Nil(t, err)
repo.SetSchemaGetter(schemaGetter)
require.Nil(t, repo.WaitForStartup(context.TODO()))
defer repo.Shutdown(context.Background())
idx := repo.GetIndex(schema.ClassName(MultiPropClass(t, repo, schemaGetter, logger, 0.5, 0.75)))
require.NotNil(t, idx)
addit := additional.Properties{}
t.Run("single term", func(t *testing.T) {
kwr := &searchparams.KeywordRanking{Type: "bm25", Query: "pepper banana"}
res, _, err := idx.objectSearch(context.TODO(), 1, nil, kwr, nil, nil, addit, nil, "", 0)
require.Nil(t, err)
// Print results
t.Log("--- Start results for boosted search ---")
for _, r := range res {
t.Logf("Result id: %v, score: %v, \n", r.DocID(), r.Score())
}
// Document 1 is a result for both terms
require.Len(t, res, 1)
require.Equal(t, uint64(1), res[0].DocID())
})
}