SemanticSearchPOC / adapters /repos /db /crud_update_integration_test.go
KevinStephenson
Adding in weaviate code
b110593
raw
history blame
12.6 kB
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//
//go:build integrationTest
// +build integrationTest
package db
import (
"context"
"testing"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/weaviate/weaviate/adapters/repos/db/inverted"
"github.com/weaviate/weaviate/entities/additional"
"github.com/weaviate/weaviate/entities/dto"
"github.com/weaviate/weaviate/entities/filters"
"github.com/weaviate/weaviate/entities/models"
"github.com/weaviate/weaviate/entities/schema"
libschema "github.com/weaviate/weaviate/entities/schema"
"github.com/weaviate/weaviate/entities/search"
enthnsw "github.com/weaviate/weaviate/entities/vectorindex/hnsw"
)
// Updates are non-trivial, because vector indices are built under the
// assumption that items are immutable (this is true for HNSW, the assumption
// is that this is generally true in the majority of cases). Therefore an
// update is essentially a delete and a new import with a new doc ID. This
// needs to be tested extensively because there's a lot of room for error
// regarding the clean up of Doc ID pointers in the inverted indices, etc.
//
// TestUpdateJourney imports the four objects from updateTestData, then
// re-puts two of them (first with only a new vector, then with a new vector
// and a changed intProp). After every step it verifies that vector search,
// filtered object search and the property-length tally for "name" still
// describe exactly four objects. Finally it clears the tally and checks that
// RecountProperties rebuilds it.
func TestUpdateJourney(t *testing.T) {
	const className = "UpdateTestClass"

	dirName := t.TempDir()
	logger := logrus.New()
	schemaGetter := &fakeSchemaGetter{
		schema:     schema.Schema{Objects: &models.Schema{Classes: nil}},
		shardState: singleShardState(),
	}
	repo, err := New(logger, Config{
		MemtablesFlushIdleAfter:   60,
		RootPath:                  dirName,
		QueryMaximumResults:       10000,
		MaxImportGoroutinesFactor: 1,
	}, &fakeRemoteClient{}, &fakeNodeResolver{}, &fakeRemoteNodeClient{}, &fakeReplicationClient{}, nil)
	require.Nil(t, err)
	repo.SetSchemaGetter(schemaGetter)
	require.Nil(t, repo.WaitForStartup(testCtx()))
	defer repo.Shutdown(context.Background())
	migrator := NewMigrator(repo, logger)

	// Deliberately not named "schema": that would shadow the imported
	// schema package for the remainder of the function.
	classSchema := libschema.Schema{
		Objects: &models.Schema{
			Classes: []*models.Class{updateTestClass()},
		},
	}

	searchVector := []float32{0.1, 0.1, 0.1}
	allNames := []interface{}{"element-0", "element-1", "element-2", "element-3"}

	// assertNameTally verifies that the property-length tracker counts the
	// "name" property for exactly four objects of length one each. Updates
	// must not inflate these numbers, as an update replaces an object rather
	// than adding one.
	assertNameTally := func(t *testing.T) {
		t.Helper()
		tracker := getTracker(repo, className)
		sum, count, mean, err := tracker.PropertyTally("name")
		require.Nil(t, err)
		assert.Equal(t, 4, sum)
		assert.Equal(t, 4, count)
		assert.InEpsilon(t, 1, mean, 0.1)
	}

	// vectorSearchNames runs a vector search around searchVector and returns
	// the "name" values of the hits. The limit (100) exceeds the number of
	// imported objects, so all four objects are expected no matter how far an
	// updated vector moved away; the length check guards against stale
	// duplicates left behind by an update.
	vectorSearchNames := func(t *testing.T) []interface{} {
		t.Helper()
		res, err := repo.VectorSearch(context.Background(), dto.GetParams{
			ClassName:    className,
			SearchVector: searchVector,
			Pagination: &filters.Pagination{
				Limit: 100,
			},
		})
		require.Nil(t, err)
		require.Len(t, res, 4)
		return extractPropValues(res, "name")
	}

	// searchInv filters on intProp using the given operator and value and
	// returns the "name" values of all matching objects.
	searchInv := func(t *testing.T, op filters.Operator, value int) []interface{} {
		res, err := repo.ObjectSearch(context.Background(), 0, 100,
			&filters.LocalFilter{
				Root: &filters.Clause{
					Operator: op,
					On: &filters.Path{
						Class:    className,
						Property: libschema.PropertyName("intProp"),
					},
					Value: &filters.Value{
						Type:  libschema.DataTypeInt,
						Value: value,
					},
				},
			}, nil, additional.Properties{}, "")
		require.Nil(t, err)
		return extractPropValues(res, "name")
	}

	// invExpectation pairs one intProp filter with the names it must match.
	type invExpectation struct {
		op    filters.Operator
		value int
		want  []interface{}
	}

	// assertInv runs every filter and compares the matches order-independently.
	assertInv := func(t *testing.T, expectations []invExpectation) {
		t.Helper()
		for _, exp := range expectations {
			assert.ElementsMatch(t, exp.want, searchInv(t, exp.op, exp.value))
		}
	}

	// Expected filter results as long as no intProp value has been changed.
	originalInv := []invExpectation{
		{filters.OperatorGreaterThanEqual, 0, allNames},
		{filters.OperatorEqual, 0, []interface{}{"element-0"}},
		{filters.OperatorEqual, 10, []interface{}{"element-1"}},
		{filters.OperatorEqual, 20, []interface{}{"element-2"}},
		{filters.OperatorEqual, 30, []interface{}{"element-3"}},
	}

	t.Run("add schema", func(t *testing.T) {
		err := migrator.AddClass(context.Background(), updateTestClass(), schemaGetter.shardState)
		require.Nil(t, err)
	})
	schemaGetter.schema = classSchema

	t.Run("import some objects", func(t *testing.T) {
		for _, res := range updateTestData() {
			err := repo.PutObject(context.Background(), res.Object(), res.Vector, nil)
			require.Nil(t, err)
		}
		assertNameTally(t)
	})

	t.Run("verify vector search results are initially as expected",
		func(t *testing.T) {
			assert.ElementsMatch(t, allNames, vectorSearchNames(t))
		})

	t.Run("verify invert index results are initially as expected",
		func(t *testing.T) {
			assertInv(t, originalInv)
		})

	t.Run("update vector position of one item to move it into a different direction",
		func(t *testing.T) {
			// updating element-0 to be very far away from our search vector
			updatedVec := []float32{-0.1, -0.12, -0.105}
			id := updateTestData()[0].ID

			old, err := repo.ObjectByID(context.Background(), id, search.SelectProperties{}, additional.Properties{}, "")
			require.Nil(t, err)

			err = repo.PutObject(context.Background(), old.Object(), updatedVec, nil)
			require.Nil(t, err)

			assertNameTally(t)
		})

	t.Run("verify new vector search results are as expected", func(t *testing.T) {
		assert.ElementsMatch(t, allNames, vectorSearchNames(t))
	})

	t.Run("verify invert results still work properly", func(t *testing.T) {
		// only the vector changed, so the filter results must be untouched
		assertInv(t, originalInv)
	})

	t.Run("update a second object and modify vector and invert props at the same time",
		func(t *testing.T) {
			// this time we are updating element-2 and move it away from the search
			// vector, as well as updating an invert prop
			updatedVec := []float32{-0.1, -0.12, -0.105123}
			id := updateTestData()[2].ID

			old, err := repo.ObjectByID(context.Background(), id, search.SelectProperties{}, additional.Properties{}, "")
			require.Nil(t, err)

			old.Schema.(map[string]interface{})["intProp"] = int64(21)
			err = repo.PutObject(context.Background(), old.Object(), updatedVec, nil)
			require.Nil(t, err)

			assertNameTally(t)
		})

	t.Run("verify new vector search results are as expected", func(t *testing.T) {
		assert.ElementsMatch(t, allNames, vectorSearchNames(t))
	})

	t.Run("verify invert results have been updated correctly", func(t *testing.T) {
		assertInv(t, []invExpectation{
			{filters.OperatorGreaterThanEqual, 0, allNames},
			{filters.OperatorEqual, 0, []interface{}{"element-0"}},
			{filters.OperatorEqual, 10, []interface{}{"element-1"}},
			{filters.OperatorEqual, 20, []interface{}{}}, // value is no longer 20, but 21
			{filters.OperatorEqual, 21, []interface{}{"element-2"}},
			{filters.OperatorEqual, 30, []interface{}{"element-3"}},
		})
	})

	t.Run("test recount", func(t *testing.T) {
		tracker := getTracker(repo, className)

		sum, count, mean, err := tracker.PropertyTally("name")
		require.Nil(t, err)
		assert.Equal(t, 4, sum)
		assert.Equal(t, 4, count)
		assert.InEpsilon(t, 1, mean, 0.1)

		// wipe the tally so the recount below has to rebuild it from scratch
		tracker.Clear()
		sum, count, mean, err = tracker.PropertyTally("name")
		require.Nil(t, err)
		assert.Equal(t, 0, sum)
		assert.Equal(t, 0, count)
		assert.Equal(t, float64(0), mean)

		migrator.RecountProperties(context.Background())

		sum, count, mean, err = tracker.PropertyTally("name")
		require.Nil(t, err)
		assert.Equal(t, 4, sum)
		assert.Equal(t, 4, count)
		assert.Equal(t, float64(1), mean)
	})
}
// updateTestClass builds the class definition used by the update tests: a
// class named "UpdateTestClass" with the default HNSW user config, an
// inverted-index cleanup interval of 3 seconds, and two properties — the int
// property "intProp" and the whitespace-tokenized text property "name".
// Every call returns a freshly allocated class.
func updateTestClass() *models.Class {
	intProp := &models.Property{
		Name:     "intProp",
		DataType: []string{string(schema.DataTypeInt)},
	}
	nameProp := &models.Property{
		Name:         "name",
		DataType:     schema.DataTypeText.PropString(),
		Tokenization: models.PropertyTokenizationWhitespace,
	}

	class := &models.Class{Class: "UpdateTestClass"}
	class.VectorIndexConfig = enthnsw.NewDefaultUserConfig()
	class.InvertedIndexConfig = &models.InvertedIndexConfig{CleanupIntervalSeconds: 3}
	class.Properties = []*models.Property{intProp, nameProp}
	return class
}
// updateTestData returns the four fixture objects of "UpdateTestClass". The
// objects are named "element-0" through "element-3", carry the intProp values
// 0, 10, 20 and 30 respectively, and each has a fixed ID and a
// three-dimensional vector. The result is rebuilt on every call, so callers
// may mutate it freely.
func updateTestData() search.Results {
	const class = "UpdateTestClass"

	element0 := search.Result{
		ClassName: class,
		ID:        "426b0b29-9ded-40b6-b786-da3d1fec412f",
		Schema: map[string]interface{}{
			"intProp": int64(0),
			"name":    "element-0",
		},
		Vector: []float32{0.89379513, 0.67022973, 0.57360715},
	}
	element1 := search.Result{
		ClassName: class,
		ID:        "a1560f12-f0f0-4439-b5b8-b7bcecf5fed7",
		Schema: map[string]interface{}{
			"intProp": int64(10),
			"name":    "element-1",
		},
		Vector: []float32{0.9660323, 0.35887036, 0.6072966},
	}
	element2 := search.Result{
		ClassName: class,
		ID:        "0c73f145-5dc4-49a9-bd58-82725f8b13fa",
		Schema: map[string]interface{}{
			"intProp": int64(20),
			"name":    "element-2",
		},
		Vector: []float32{0.8194746, 0.56142205, 0.5130103},
	}
	element3 := search.Result{
		ClassName: class,
		ID:        "aec8462e-276a-4989-a612-8314c35d163a",
		Schema: map[string]interface{}{
			"intProp": int64(30),
			"name":    "element-3",
		},
		Vector: []float32{0.42401955, 0.8278863, 0.5952888},
	}

	return search.Results{element0, element1, element2, element3}
}
// extractPropValues collects the value stored under propName from every
// result, preserving the order of in. A result that lacks the property
// contributes a nil entry. The returned slice is never nil.
//
// Each result's Schema must be a map[string]interface{}; any other dynamic
// type makes the type assertion panic.
func extractPropValues(in search.Results, propName string) []interface{} {
	values := make([]interface{}, 0, len(in))
	for idx := range in {
		props := in[idx].Schema.(map[string]interface{})
		values = append(values, props[propName])
	}
	return values
}
// getTracker returns the property-length tracker of a shard belonging to the
// index of className.
//
// The index is looked up by the className argument; previously the argument
// was ignored in favour of a hard-coded "UpdateTestClass", which would have
// handed back the wrong tracker for any other class.
//
// If the index has several shards, the tracker of the last shard visited by
// ForEachShard is returned. There is no guard for a missing index or an index
// without shards.
// NOTE(review): presumably either case ends in a nil dereference — confirm
// against GetIndex/ForEachShard before using this outside single-shard tests.
func getTracker(repo *DB, className string) *inverted.JsonShardMetaData {
	index := repo.GetIndex(schema.ClassName(className))

	var shard ShardLike
	index.ForEachShard(func(name string, shardv ShardLike) error {
		shard = shardv
		return nil
	})

	return shard.GetPropertyLengthTracker()
}