KevinStephenson
Adding in weaviate code
b110593
raw
history blame
42.8 kB
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: [email protected]
//
package hnsw
import (
"context"
"fmt"
"sort"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/weaviate/weaviate/adapters/repos/db/helpers"
"github.com/weaviate/weaviate/adapters/repos/db/lsmkv"
"github.com/weaviate/weaviate/adapters/repos/db/vector/common"
"github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer"
"github.com/weaviate/weaviate/adapters/repos/db/vector/testinghelpers"
"github.com/weaviate/weaviate/entities/cyclemanager"
"github.com/weaviate/weaviate/entities/storobj"
ent "github.com/weaviate/weaviate/entities/vectorindex/hnsw"
)
func TempVectorForIDThunk(vectors [][]float32) func(context.Context, uint64, *common.VectorSlice) ([]float32, error) {
return func(ctx context.Context, id uint64, container *common.VectorSlice) ([]float32, error) {
copy(container.Slice, vectors[int(id)])
return vectors[int(id)], nil
}
}
func TestDelete_WithoutCleaningUpTombstones(t *testing.T) {
vectors := vectorsForDeleteTest()
var vectorIndex *hnsw
store := testinghelpers.NewDummyStore(t)
t.Run("import the test vectors", func(t *testing.T) {
index, err := New(Config{
RootPath: "doesnt-matter-as-committlogger-is-mocked-out",
ID: "delete-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewCosineDistanceProvider(),
VectorForIDThunk: func(ctx context.Context, id uint64) ([]float32, error) {
return vectors[int(id)], nil
},
TempVectorForIDThunk: TempVectorForIDThunk(vectors),
}, ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), store)
require.Nil(t, err)
vectorIndex = index
for i, vec := range vectors {
err := vectorIndex.Add(uint64(i), vec)
require.Nil(t, err)
}
})
var control []uint64
t.Run("vectors are cached correctly", func(t *testing.T) {
assert.Equal(t, len(vectors), int(vectorIndex.cache.CountVectors()))
})
t.Run("doing a control search before delete with the respective allow list", func(t *testing.T) {
allowList := helpers.NewAllowList()
for i := range vectors {
if i%2 == 0 {
continue
}
allowList.Insert(uint64(i))
}
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, allowList)
require.Nil(t, err)
require.True(t, len(res) > 0)
control = res
})
t.Run("deleting every even element", func(t *testing.T) {
for i := range vectors {
if i%2 != 0 {
continue
}
err := vectorIndex.Delete(uint64(i))
require.Nil(t, err)
}
})
t.Run("vector cache holds half the original vectors", func(t *testing.T) {
vectorIndex.CleanUpTombstonedNodes(neverStop)
assert.Equal(t, len(vectors)/2, int(vectorIndex.cache.CountVectors()))
})
t.Run("start a search that should only contain the remaining elements", func(t *testing.T) {
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, nil)
require.Nil(t, err)
require.True(t, len(res) > 0)
for _, elem := range res {
if elem%2 == 0 {
t.Errorf("search result contained an even element: %d", elem)
}
}
assert.Equal(t, control, res)
})
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, vectorIndex.Drop(context.Background()))
})
t.Run("vector cache holds no vectors", func(t *testing.T) {
assert.Equal(t, 0, int(vectorIndex.cache.CountVectors()))
})
}
func TestDelete_WithCleaningUpTombstonesOnce(t *testing.T) {
// there is a single bulk clean event after all the deletes
vectors := vectorsForDeleteTest()
var vectorIndex *hnsw
store := testinghelpers.NewDummyStore(t)
t.Run("import the test vectors", func(t *testing.T) {
index, err := New(Config{
RootPath: "doesnt-matter-as-committlogger-is-mocked-out",
ID: "delete-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewCosineDistanceProvider(),
VectorForIDThunk: func(ctx context.Context, id uint64) ([]float32, error) {
return vectors[int(id)], nil
},
TempVectorForIDThunk: TempVectorForIDThunk(vectors),
}, ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), store)
require.Nil(t, err)
vectorIndex = index
for i, vec := range vectors {
err := vectorIndex.Add(uint64(i), vec)
require.Nil(t, err)
}
})
var control []uint64
var bfControl []uint64
t.Run("doing a control search before delete with the respective allow list", func(t *testing.T) {
allowList := helpers.NewAllowList()
for i := range vectors {
if i%2 == 0 {
continue
}
allowList.Insert(uint64(i))
}
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, allowList)
require.Nil(t, err)
require.True(t, len(res) > 0)
require.Len(t, res, 20)
control = res
})
t.Run("brute force control", func(t *testing.T) {
bf := bruteForceCosine(vectors, []float32{0.1, 0.1, 0.1}, 100)
bfControl = make([]uint64, len(bf))
i := 0
for _, elem := range bf {
if elem%2 == 0 {
continue
}
bfControl[i] = elem
i++
}
if i > 20 {
i = 20
}
bfControl = bfControl[:i]
assert.Equal(t, bfControl, control, "control should match bf control")
})
fmt.Printf("entrypoint before %d\n", vectorIndex.entryPointID)
t.Run("deleting every even element", func(t *testing.T) {
for i := range vectors {
if i%2 != 0 {
continue
}
err := vectorIndex.Delete(uint64(i))
require.Nil(t, err)
}
})
t.Run("running the cleanup", func(t *testing.T) {
err := vectorIndex.CleanUpTombstonedNodes(neverStop)
require.Nil(t, err)
})
t.Run("start a search that should only contain the remaining elements", func(t *testing.T) {
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, nil)
require.Nil(t, err)
require.True(t, len(res) > 0)
for _, elem := range res {
if elem%2 == 0 {
t.Errorf("search result contained an even element: %d", elem)
}
}
assert.Equal(t, control, res)
})
t.Run("verify the graph no longer has any tombstones", func(t *testing.T) {
assert.Len(t, vectorIndex.tombstones, 0)
})
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, vectorIndex.Drop(context.Background()))
})
}
func TestDelete_WithCleaningUpTombstonesInBetween(t *testing.T) {
// there is a single bulk clean event after all the deletes
vectors := vectorsForDeleteTest()
var vectorIndex *hnsw
store := testinghelpers.NewDummyStore(t)
t.Run("import the test vectors", func(t *testing.T) {
index, err := New(Config{
RootPath: "doesnt-matter-as-committlogger-is-mocked-out",
ID: "delete-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewCosineDistanceProvider(),
VectorForIDThunk: func(ctx context.Context, id uint64) ([]float32, error) {
return vectors[int(id)], nil
},
TempVectorForIDThunk: TempVectorForIDThunk(vectors),
}, ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), store)
// makes sure index is build only with level 0. To be removed after fixing WEAVIATE-179
index.randFunc = func() float64 { return 0.1 }
require.Nil(t, err)
vectorIndex = index
for i, vec := range vectors {
err := vectorIndex.Add(uint64(i), vec)
require.Nil(t, err)
}
})
var control []uint64
t.Run("doing a control search before delete with the respective allow list", func(t *testing.T) {
allowList := helpers.NewAllowList()
for i := range vectors {
if i%2 == 0 {
continue
}
allowList.Insert(uint64(i))
}
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, allowList)
require.Nil(t, err)
require.True(t, len(res) > 0)
control = res
})
t.Run("deleting every even element", func(t *testing.T) {
for i := range vectors {
if i%10 == 0 {
// occasionally run clean up
err := vectorIndex.CleanUpTombstonedNodes(neverStop)
require.Nil(t, err)
}
if i%2 != 0 {
continue
}
err := vectorIndex.Delete(uint64(i))
require.Nil(t, err)
}
// finally run one final cleanup
err := vectorIndex.CleanUpTombstonedNodes(neverStop)
require.Nil(t, err)
})
t.Run("start a search that should only contain the remaining elements", func(t *testing.T) {
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, nil)
require.Nil(t, err)
require.True(t, len(res) > 0)
for _, elem := range res {
if elem%2 == 0 {
t.Errorf("search result contained an even element: %d", elem)
}
}
assert.Equal(t, control, res)
})
t.Run("verify the graph no longer has any tombstones", func(t *testing.T) {
assert.Len(t, vectorIndex.tombstones, 0)
})
t.Run("delete the remaining elements", func(t *testing.T) {
for i := range vectors {
if i%2 == 0 {
continue
}
err := vectorIndex.Delete(uint64(i))
require.Nil(t, err)
}
err := vectorIndex.CleanUpTombstonedNodes(neverStop)
require.Nil(t, err)
})
t.Run("try to insert again and search", func(t *testing.T) {
for i := 0; i < 5; i++ {
err := vectorIndex.Add(uint64(i), vectors[i])
require.Nil(t, err)
}
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, nil)
require.Nil(t, err)
assert.ElementsMatch(t, []uint64{0, 1, 2, 3, 4}, res)
})
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, vectorIndex.Drop(context.Background()))
})
}
func createIndexImportAllVectorsAndDeleteEven(t *testing.T, vectors [][]float32, store *lsmkv.Store) (index *hnsw, remainingResult []uint64) {
index, err := New(Config{
RootPath: "doesnt-matter-as-committlogger-is-mocked-out",
ID: "delete-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewCosineDistanceProvider(),
VectorForIDThunk: func(ctx context.Context, id uint64) ([]float32, error) {
return vectors[int(id)], nil
},
TempVectorForIDThunk: TempVectorForIDThunk(vectors),
}, ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), store)
require.Nil(t, err)
// makes sure index is build only with level 0. To be removed after fixing WEAVIATE-179
index.randFunc = func() float64 { return 0.1 }
// to speed up test execution, size of nodes array is decreased
// from default 25k to little over number of vectors
index.nodes = make([]*vertex, int(1.2*float64(len(vectors))))
for i, vec := range vectors {
err := index.Add(uint64(i), vec)
require.Nil(t, err)
}
for i := range vectors {
if i%2 != 0 {
continue
}
err := index.Delete(uint64(i))
require.Nil(t, err)
}
res, _, err := index.SearchByVector([]float32{0.1, 0.1, 0.1}, len(vectors), nil)
require.Nil(t, err)
require.True(t, len(res) > 0)
for _, elem := range res {
if elem%2 == 0 {
t.Errorf("search result contained an even element: %d", elem)
}
}
return index, res
}
func genStopAtFunc(i int) func() bool {
counter := 0
return func() bool {
if counter < i {
counter++
return false
}
return true
}
}
func TestDelete_WithCleaningUpTombstonesStopped(t *testing.T) {
vectors := vectorsForDeleteTest()
var index *hnsw
var possibleStopsCount int
// due to not yet resolved bug (https://semi-technology.atlassian.net/browse/WEAVIATE-179)
// db can return less vectors than are actually stored after tombstones cleanup
// controlRemainingResult contains all odd vectors (before cleanup was performed)
// controlRemainingResultAfterCleanup contains most of odd vectors (after cleanup was performed)
//
// this test verifies if partial cleanup will not change search output, therefore depending on
// where cleanup method was stopped, subset of controlRemainingResult is expected, though all
// vectors from controlRemainingResultAfterCleanup should be returned
// TODO to be simplified after fixing WEAVIATE-179, all results should be the same
var controlRemainingResult []uint64
var controlRemainingResultAfterCleanup []uint64
store := testinghelpers.NewDummyStore(t)
t.Run("create control index", func(t *testing.T) {
index, controlRemainingResult = createIndexImportAllVectorsAndDeleteEven(t, vectors, store)
})
t.Run("count all cleanup tombstones stops", func(t *testing.T) {
counter := 0
countingStopFunc := func() bool {
counter++
return false
}
err := index.CleanUpTombstonedNodes(countingStopFunc)
require.Nil(t, err)
possibleStopsCount = counter
})
t.Run("search remaining elements after cleanup", func(t *testing.T) {
res, _, err := index.SearchByVector([]float32{0.1, 0.1, 0.1}, len(vectors), nil)
require.Nil(t, err)
require.True(t, len(res) > 0)
for _, elem := range res {
if elem%2 == 0 {
t.Errorf("search result contained an even element: %d", elem)
}
}
controlRemainingResultAfterCleanup = res
})
t.Run("destroy the control index", func(t *testing.T) {
require.Nil(t, index.Drop(context.Background()))
})
for i := 0; i < possibleStopsCount; i++ {
index, _ = createIndexImportAllVectorsAndDeleteEven(t, vectors, store)
t.Run("stop cleanup at place", func(t *testing.T) {
require.Nil(t, index.CleanUpTombstonedNodes(genStopAtFunc(i)))
})
t.Run("search remaining elements after partial cleanup", func(t *testing.T) {
res, _, err := index.SearchByVector([]float32{0.1, 0.1, 0.1}, len(vectors), nil)
require.Nil(t, err)
require.Subset(t, controlRemainingResult, res)
require.Subset(t, res, controlRemainingResultAfterCleanup)
})
t.Run("run complete cleanup", func(t *testing.T) {
require.Nil(t, index.CleanUpTombstonedNodes(neverStop))
})
t.Run("search remaining elements after complete cleanup", func(t *testing.T) {
res, _, err := index.SearchByVector([]float32{0.1, 0.1, 0.1}, len(vectors), nil)
require.Nil(t, err)
require.Subset(t, controlRemainingResult, res)
require.Subset(t, res, controlRemainingResultAfterCleanup)
})
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, index.Drop(context.Background()))
})
}
}
func TestDelete_InCompressedIndex_WithCleaningUpTombstonesOnce(t *testing.T) {
var (
vectorIndex *hnsw
// there is a single bulk clean event after all the deletes
vectors = vectorsForDeleteTest()
rootPath = t.TempDir()
userConfig = ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
PQ: ent.PQConfig{
Enabled: true,
Encoder: ent.PQEncoder{
Type: ent.PQEncoderTypeTile,
Distribution: ent.PQEncoderDistributionNormal,
},
},
}
)
store := testinghelpers.NewDummyStore(t)
t.Run("import the test vectors", func(t *testing.T) {
index, err := New(Config{
RootPath: rootPath,
ID: "delete-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewCosineDistanceProvider(),
VectorForIDThunk: func(ctx context.Context, id uint64) ([]float32, error) {
if int(id) >= len(vectors) {
return nil, storobj.NewErrNotFoundf(id, "out of range")
}
return vectors[int(id)], nil
},
TempVectorForIDThunk: TempVectorForIDThunk(vectors),
}, userConfig, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), store)
require.Nil(t, err)
vectorIndex = index
for i, vec := range vectors {
err := vectorIndex.Add(uint64(i), vec)
require.Nil(t, err)
}
cfg := ent.PQConfig{
Enabled: true,
Encoder: ent.PQEncoder{
Type: ent.PQEncoderTypeTile,
Distribution: ent.PQEncoderDistributionLogNormal,
},
BitCompression: false,
Segments: 3,
Centroids: 256,
}
userConfig.PQ = cfg
index.compress(userConfig)
})
var control []uint64
var bfControl []uint64
t.Run("doing a control search before delete with the respective allow list", func(t *testing.T) {
allowList := helpers.NewAllowList()
for i := range vectors {
if i%2 == 0 {
continue
}
allowList.Insert(uint64(i))
}
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, allowList)
require.Nil(t, err)
require.True(t, len(res) > 0)
require.Len(t, res, 20)
control = res
})
t.Run("brute force control", func(t *testing.T) {
bf := bruteForceCosine(vectors, []float32{0.1, 0.1, 0.1}, 100)
bfControl = make([]uint64, len(bf))
i := 0
for _, elem := range bf {
if elem%2 == 0 {
continue
}
bfControl[i] = elem
i++
}
if i > 20 {
i = 20
}
bfControl = bfControl[:i]
recall := float32(testinghelpers.MatchesInLists(bfControl, control)) / float32(len(bfControl))
fmt.Println(recall)
assert.True(t, recall > 0.6, "control should match bf control")
})
fmt.Printf("entrypoint before %d\n", vectorIndex.entryPointID)
t.Run("deleting every even element", func(t *testing.T) {
for i := range vectors {
if i%2 != 0 {
continue
}
err := vectorIndex.Delete(uint64(i))
require.Nil(t, err)
}
})
t.Run("running the cleanup", func(t *testing.T) {
err := vectorIndex.CleanUpTombstonedNodes(neverStop)
require.Nil(t, err)
})
t.Run("start a search that should only contain the remaining elements", func(t *testing.T) {
res, _, err := vectorIndex.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, nil)
require.Nil(t, err)
require.True(t, len(res) > 0)
for _, elem := range res {
if elem%2 == 0 {
t.Errorf("search result contained an even element: %d", elem)
}
}
recall := float32(testinghelpers.MatchesInLists(res, control)) / float32(len(control))
assert.True(t, recall > 0.6)
})
t.Run("verify the graph no longer has any tombstones", func(t *testing.T) {
assert.Len(t, vectorIndex.tombstones, 0)
})
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, vectorIndex.Drop(context.Background()))
})
}
func TestDelete_InCompressedIndex_WithCleaningUpTombstonesOnce_DoesNotCrash(t *testing.T) {
var (
vectorIndex *hnsw
// there is a single bulk clean event after all the deletes
vectors = vectorsForDeleteTest()
rootPath = t.TempDir()
userConfig = ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
PQ: ent.PQConfig{Enabled: true, Encoder: ent.PQEncoder{Type: "tile", Distribution: "normal"}},
}
)
store := testinghelpers.NewDummyStore(t)
t.Run("import the test vectors", func(t *testing.T) {
index, err := New(Config{
RootPath: rootPath,
ID: "delete-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewCosineDistanceProvider(),
VectorForIDThunk: func(ctx context.Context, id uint64) ([]float32, error) {
return vectors[int(id%uint64(len(vectors)))], nil
},
TempVectorForIDThunk: TempVectorForIDThunk(vectors),
}, userConfig, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), store)
require.Nil(t, err)
vectorIndex = index
for i, vec := range vectors {
err := vectorIndex.Add(uint64(i), vec)
require.Nil(t, err)
}
cfg := ent.PQConfig{
Enabled: true,
Encoder: ent.PQEncoder{
Type: ent.PQEncoderTypeTile,
Distribution: ent.PQEncoderDistributionLogNormal,
},
BitCompression: false,
Segments: 3,
Centroids: 256,
}
userConfig.PQ = cfg
index.compress(userConfig)
for i := len(vectors); i < 1000; i++ {
err := vectorIndex.Add(uint64(i), vectors[i%len(vectors)])
require.Nil(t, err)
}
})
t.Run("deleting every even element", func(t *testing.T) {
for i := range vectors {
if i%2 != 0 {
continue
}
err := vectorIndex.Delete(uint64(i))
require.Nil(t, err)
}
})
t.Run("running the cleanup", func(t *testing.T) {
err := vectorIndex.CleanUpTombstonedNodes(neverStop)
require.Nil(t, err)
})
t.Run("verify the graph no longer has any tombstones", func(t *testing.T) {
assert.Len(t, vectorIndex.tombstones, 0)
})
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, vectorIndex.Drop(context.Background()))
})
}
// we need a certain number of elements so that we can make sure that nodes
// from all layers will eventually be deleted, otherwise our test only tests
// edge cases which aren't very common in real life, but ignore the most common
// deletes
func vectorsForDeleteTest() [][]float32 {
return [][]float32{
{0.27335858, 0.42670676, 0.12599982},
{0.34369454, 0.78510034, 0.78000546},
{0.2342731, 0.076864816, 0.6405078},
{0.07597838, 0.7752282, 0.87022865},
{0.78632426, 0.06902865, 0.7423889},
{0.3055758, 0.3901508, 0.9399572},
{0.48687622, 0.26338226, 0.06495104},
{0.5384028, 0.35410047, 0.8821815},
{0.25123185, 0.62722564, 0.86443096},
{0.58484185, 0.13103616, 0.4034975},
{0.0019696166, 0.46822622, 0.42492124},
{0.42401955, 0.8278863, 0.5952888},
{0.15367928, 0.70778894, 0.0070928824},
{0.95760256, 0.45898128, 0.1541115},
{0.9125976, 0.9021616, 0.21607016},
{0.9876307, 0.5243228, 0.37294936},
{0.8194746, 0.56142205, 0.5130103},
{0.805065, 0.62250346, 0.63715476},
{0.9969276, 0.5115748, 0.18916714},
{0.16419733, 0.15029702, 0.36020836},
{0.9660323, 0.35887036, 0.6072966},
{0.72765416, 0.27891788, 0.9094314},
{0.8626208, 0.3540126, 0.3100354},
{0.7153876, 0.17094712, 0.7801294},
{0.23180388, 0.107446484, 0.69542855},
{0.54731685, 0.8949827, 0.68316746},
{0.15049729, 0.1293767, 0.0574729},
{0.89379513, 0.67022973, 0.57360715},
{0.725353, 0.25326362, 0.44264215},
{0.2568602, 0.4986094, 0.9759933},
{0.7300015, 0.70019704, 0.49546525},
{0.54314494, 0.2004176, 0.63803226},
{0.6180191, 0.5260845, 0.9373999},
{0.63356537, 0.81430644, 0.78373694},
{0.69995105, 0.84198904, 0.17851257},
{0.5197941, 0.11502675, 0.95129955},
{0.15791401, 0.07516741, 0.113447875},
{0.06811827, 0.4450082, 0.98595786},
{0.7153448, 0.41833848, 0.06332495},
{0.6704102, 0.28931814, 0.031580303},
{0.47773632, 0.73334247, 0.6925025},
{0.7976896, 0.9499536, 0.6394833},
{0.3074854, 0.14025249, 0.35961738},
{0.49956197, 0.093575336, 0.790093},
{0.4641653, 0.21276893, 0.528895},
{0.1021849, 0.9416305, 0.46738508},
{0.3790398, 0.50099677, 0.98233247},
{0.39650732, 0.020929832, 0.53968865},
{0.77604437, 0.8554197, 0.24056046},
{0.07174444, 0.28758526, 0.67587185},
{0.22292718, 0.66624546, 0.6077909},
{0.22090498, 0.36197436, 0.40415043},
{0.04838009, 0.120789215, 0.17928012},
{0.55166364, 0.3400502, 0.43698996},
{0.7638108, 0.47014108, 0.23208627},
{0.9239513, 0.8418566, 0.23518613},
{0.289589, 0.85010827, 0.055741556},
{0.32436147, 0.18756394, 0.4217864},
{0.041671168, 0.37824047, 0.66486764},
{0.5052222, 0.07982704, 0.64345413},
{0.62675995, 0.20138603, 0.8231867},
{0.86306876, 0.9698708, 0.11398846},
{0.68566775, 0.22026269, 0.13525572},
{0.57706076, 0.32325208, 0.6122228},
{0.80035216, 0.18560356, 0.6328281},
{0.87145543, 0.19380389, 0.8863942},
{0.33777508, 0.6056442, 0.9110077},
{0.3961719, 0.49714503, 0.14191929},
{0.5344362, 0.8166916, 0.75880384},
{0.015749464, 0.63223976, 0.5470922},
{0.10512444, 0.2212036, 0.24995685},
{0.10831311, 0.27044898, 0.8668174},
{0.3272971, 0.6659298, 0.87119603},
{0.42913893, 0.14528985, 0.69957525},
{0.33012474, 0.81964344, 0.092787445},
{0.093618214, 0.90637344, 0.94406706},
{0.12161567, 0.75131124, 0.40563175},
{0.9154454, 0.75925833, 0.8406739},
{0.81649286, 0.9025715, 0.3105051},
{0.2927649, 0.22649862, 0.9708593},
{0.30813727, 0.0079439245, 0.39662006},
{0.94943213, 0.36778906, 0.217876},
{0.716794, 0.3811725, 0.18448676},
{0.66879725, 0.29722908, 0.0031202603},
{0.11104216, 0.13094379, 0.0787222},
{0.8508966, 0.86416596, 0.15885831},
{0.2303136, 0.56660503, 0.17114973},
{0.8632685, 0.4229249, 0.1936724},
{0.03060897, 0.35226125, 0.8115969},
}
}
func TestDelete_EntrypointIssues(t *testing.T) {
// This test is motivated by flakyness of other tests. We seemed to have
// experienced a failure with the following structure
//
// Entrypoint: 6
// Max Level: 1
// Tombstones map[]
// Nodes and Connections:
// Node 0
// Level 0: Connections: [1 2 3 4 5 6 7 8]
// Node 1
// Level 0: Connections: [0 2 3 4 5 6 7 8]
// Node 2
// Level 0: Connections: [1 0 3 4 5 6 7 8]
// Node 3
// Level 0: Connections: [2 1 0 4 5 6 7 8]
// Node 4
// Level 0: Connections: [3 2 1 0 5 6 7 8]
// Node 5
// Level 0: Connections: [3 4 2 1 0 6 7 8]
// Node 6
// Level 0: Connections: [4 2 1 3 5 0 7 8]
// Level 1: Connections: [7]
// Node 7
// Level 1: Connections: [6]
// Level 0: Connections: [6 4 3 5 2 1 0 8]
// Node 8
// Level 0: Connections: [7 6 4 3 5 2 1 0]
//
// This test aims to rebuild this tree exactly (manually) and verifies that
// deletion of the old entrypoint (element 6), works without issue
//
// The underlying test set can be found in vectors_for_test.go
index, err := New(Config{
RootPath: "doesnt-matter-as-committlogger-is-mocked-out",
ID: "delete-entrypoint-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewCosineDistanceProvider(),
VectorForIDThunk: testVectorForID,
}, ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
require.Nil(t, err)
// manually build the index
index.entryPointID = 6
index.currentMaximumLayer = 1
index.nodes = make([]*vertex, 50)
index.nodes[0] = &vertex{
id: 0,
connections: [][]uint64{
{1, 2, 3, 4, 5, 6, 7, 8},
},
}
index.nodes[1] = &vertex{
id: 1,
connections: [][]uint64{
{0, 2, 3, 4, 5, 6, 7, 8},
},
}
index.nodes[2] = &vertex{
id: 2,
connections: [][]uint64{
{1, 0, 3, 4, 5, 6, 7, 8},
},
}
index.nodes[3] = &vertex{
id: 3,
connections: [][]uint64{
{2, 1, 0, 4, 5, 6, 7, 8},
},
}
index.nodes[4] = &vertex{
id: 4,
connections: [][]uint64{
{3, 2, 1, 0, 5, 6, 7, 8},
},
}
index.nodes[5] = &vertex{
id: 5,
connections: [][]uint64{
{3, 4, 2, 1, 0, 6, 7, 8},
},
}
index.nodes[6] = &vertex{
id: 6,
connections: [][]uint64{
{4, 3, 1, 3, 5, 0, 7, 8},
{7},
},
level: 1,
}
index.nodes[7] = &vertex{
id: 7,
connections: [][]uint64{
{6, 4, 3, 5, 2, 1, 0, 8},
{6},
},
level: 1,
}
index.nodes[8] = &vertex{
id: 8,
connections: [][]uint64{
8: {7, 6, 4, 3, 5, 2, 1, 0},
},
}
dumpIndex(index, "before delete")
t.Run("delete some elements and permanently delete tombstoned elements",
func(t *testing.T) {
err := index.Delete(6)
require.Nil(t, err)
err = index.Delete(8)
require.Nil(t, err)
err = index.CleanUpTombstonedNodes(neverStop)
require.Nil(t, err)
})
dumpIndex(index, "after delete")
expectedResults := []uint64{
3, 5, 4, // cluster 2
7, // cluster 3 with element 6 and 8 deleted
2, 1, 0, // cluster 1
}
t.Run("verify that the results are correct", func(t *testing.T) {
position := 3
res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil)
require.Nil(t, err)
assert.Equal(t, expectedResults, res)
})
// t.Fail()
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, index.Drop(context.Background()))
})
}
func TestDelete_MoreEntrypointIssues(t *testing.T) {
vectors := [][]float32{
{7, 1},
{8, 2},
{23, 14},
{6.5, -1},
}
vecForID := func(ctx context.Context, id uint64) ([]float32, error) {
return vectors[int(id)], nil
}
// This test is motivated by flakyness of other tests. We seemed to have
// experienced a failure with the following structure
//
// ID: thing_geoupdatetestclass_single_location
// Entrypoint: 2
// Max Level: 1
// Tombstones map[0:{} 1:{}]
//
// Nodes and Connections:
// Node 0
// Level 0: Connections: [1]
// Node 1
// Level 0: Connections: [0 2]
// Level 1: Connections: [2]
// Node 2
// Level 1: Connections: [1]
// Level 0: Connections: [1]
index, err := New(Config{
RootPath: "doesnt-matter-as-committlogger-is-mocked-out",
ID: "more-delete-entrypoint-flakyness-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewGeoProvider(),
VectorForIDThunk: vecForID,
TempVectorForIDThunk: TempVectorForIDThunk(vectors),
}, ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
require.Nil(t, err)
// manually build the index
index.entryPointID = 2
index.currentMaximumLayer = 1
index.tombstones = map[uint64]struct{}{
0: {},
1: {},
}
index.nodes = make([]*vertex, 50)
index.nodes[0] = &vertex{
id: 0,
connections: [][]uint64{
0: {1},
},
}
index.nodes[1] = &vertex{
id: 1,
connections: [][]uint64{
0: {0, 2},
1: {2},
},
}
index.nodes[2] = &vertex{
id: 2,
connections: [][]uint64{
0: {1},
1: {1},
},
}
dumpIndex(index, "before adding another element")
t.Run("adding a third element", func(t *testing.T) {
vec, _ := testVectorForID(context.TODO(), 3)
index.Add(3, vec)
})
expectedResults := []uint64{
3, 2,
}
t.Run("verify that the results are correct", func(t *testing.T) {
position := 3
res, _, err := index.knnSearchByVector(testVectors[position], 50, 36, nil)
require.Nil(t, err)
assert.Equal(t, expectedResults, res)
})
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, index.Drop(context.Background()))
})
}
func TestDelete_TombstonedEntrypoint(t *testing.T) {
vecForID := func(ctx context.Context, id uint64) ([]float32, error) {
// always return same vec for all elements
return []float32{0.1, 0.2}, nil
}
index, err := New(Config{
RootPath: "doesnt-matter-as-committlogger-is-mocked-out",
ID: "tombstoned-entrypoint-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewCosineDistanceProvider(),
VectorForIDThunk: vecForID,
TempVectorForIDThunk: TempVectorForIDThunk([][]float32{{0.1, 0.2}}),
}, ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// explicitly turn off, so we only focus on the tombstoned periods
CleanupIntervalSeconds: 0,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), testinghelpers.NewDummyStore(t))
require.Nil(t, err)
objVec := []float32{0.1, 0.2}
searchVec := []float32{0.05, 0.05}
require.Nil(t, index.Add(0, objVec))
require.Nil(t, index.Delete(0))
require.Nil(t, index.Add(1, objVec))
res, _, err := index.SearchByVector(searchVec, 100, nil)
require.Nil(t, err)
assert.Equal(t, []uint64{1}, res, "should contain the only result")
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, index.Drop(context.Background()))
})
}
func TestDelete_Flakyness_gh_1369(t *testing.T) {
// parse a snapshot form a flaky test
snapshotBefore := []byte(`{"labels":["ran a cleanup cycle"],"id":"delete-test","entrypoint":3,"currentMaximumLayer":3,"tombstones":{},"nodes":[{"id":1,"level":0,"connections":{"0":[11,25,33,3,29,32,5,19,30,7,17,27,21,31,36,34,35,23,15,9,13]}},{"id":3,"level":3,"connections":{"0":[1,29,11,5,25,33,19,32,7,17,30,21,35,31,27,36,23,34,9,15,13],"1":[29,36,13],"2":[29,36],"3":[36]}},{"id":5,"level":0,"connections":{"0":[29,19,7,32,35,21,1,31,3,33,23,25,11,17,36,27,30,9,15,34,13]}},{"id":7,"level":0,"connections":{"0":[32,19,21,31,5,35,23,29,33,36,17,1,9,27,25,30,11,3,15,13,34]}},{"id":9,"level":0,"connections":{"0":[36,23,31,21,15,17,27,7,32,35,30,13,19,33,5,25,29,11,1,34,3]}},{"id":11,"level":0,"connections":{"0":[25,33,1,30,17,3,27,32,34,29,19,7,5,36,15,21,31,23,9,13,35]}},{"id":13,"level":1,"connections":{"0":[15,27,34,36,30,17,9,33,25,31,23,21,11,32,7,1,19,35,5,29,3],"1":[36,29,3]}},{"id":15,"level":0,"connections":{"0":[13,27,36,17,30,9,34,33,31,23,25,21,32,11,7,1,19,35,5,29,3]}},{"id":17,"level":0,"connections":{"0":[27,30,36,33,15,32,25,31,9,11,21,7,23,1,34,13,19,5,29,35,3]}},{"id":19,"level":0,"connections":{"0":[5,7,32,29,35,21,31,23,1,33,17,3,25,36,11,27,9,30,15,34,13]}},{"id":21,"level":0,"connections":{"0":[31,23,7,35,32,19,9,36,5,17,27,33,29,30,15,1,25,11,3,13,34]}},{"id":23,"level":0,"connections":{"0":[31,21,9,35,7,36,32,19,17,5,27,33,15,29,30,25,1,13,11,3,34]}},{"id":25,"level":0,"connections":{"0":[11,33,1,30,17,27,32,3,34,29,7,19,36,5,15,21,31,23,9,13,35]}},{"id":27,"level":0,"connections":{"0":[17,30,36,15,33,25,13,9,34,32,11,31,21,7,23,1,19,5,29,35,3]}},{"id":29,"level":2,"connections":{"0":[5,19,32,7,3,1,33,35,21,25,31,11,23,17,30,36,27,9,15,34,13],"1":[3,36,13],"2":[3,36]}},{"id":30,"level":0,"connections":{"0":[27,17,33,25,15,36,11,34,32,1,13,9,31,7,21,23,19,29,5,3,35]}},{"id":31,"level":0,"connections":{"0":[21,23,7,32,35,9,36,19,17,5,27,33,29,30,15,25,1,11,13,3,34]}},{"id":32,"level":0,"connections":{"0":[7,19,21,31,5,33,29,17,23,1,35,36,25,27,30,11,9,3,15,34,13]}},{"id":33,"level":0,"connections":{"0":[25,11,1,17,30,32,27,7,19,36,29,5,21,31,3,34,15,23,9,35,13]}},{"id":34,"level":0,"connections":{"0":[30,27,15,13,25,17,11,33,36,1,32,9,31,7,21,3,23,19,29,5,35]}},{"id":35,"level":0,"connections":{"0":[21,7,31,23,19,5,32,29,9,36,17,33,1,27,25,30,3,11,15,13,34]}},{"id":36,"level":3,"connections":{"0":[17,9,27,15,31,23,21,30,32,7,33,13,25,19,35,11,34,1,5,29,3],"1":[13,29,3],"2":[29,3],"3":[3]}}]}
`)
vectors := vectorsForDeleteTest()
vecForID := func(ctx context.Context, id uint64) ([]float32, error) {
return vectors[int(id)], nil
}
index, err := NewFromJSONDumpMap(snapshotBefore, vecForID)
require.Nil(t, err)
index.forbidFlat = true
var control []uint64
t.Run("control search before delete with the respective allow list", func(t *testing.T) {
allowList := helpers.NewAllowList()
for i := range vectors {
if i%2 == 0 {
continue
}
allowList.Insert(uint64(i))
}
res, _, err := index.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, allowList)
require.Nil(t, err)
require.True(t, len(res) > 0)
control = res
})
t.Run("delete the remaining even entries", func(t *testing.T) {
require.Nil(t, index.Delete(30))
require.Nil(t, index.Delete(32))
require.Nil(t, index.Delete(34))
require.Nil(t, index.Delete(36))
})
t.Run("verify against control BEFORE Tombstone Cleanup", func(t *testing.T) {
res, _, err := index.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, nil)
require.Nil(t, err)
require.True(t, len(res) > 0)
assert.Equal(t, control, res)
})
t.Run("clean up tombstoned nodes", func(t *testing.T) {
require.Nil(t, index.CleanUpTombstonedNodes(neverStop))
})
t.Run("verify against control AFTER Tombstone Cleanup", func(t *testing.T) {
res, _, err := index.SearchByVector([]float32{0.1, 0.1, 0.1}, 20, nil)
require.Nil(t, err)
require.True(t, len(res) > 0)
assert.Equal(t, control, res)
})
t.Run("now delete the entrypoint", func(t *testing.T) {
require.Nil(t, index.Delete(index.entryPointID))
})
t.Run("clean up tombstoned nodes", func(t *testing.T) {
require.Nil(t, index.CleanUpTombstonedNodes(neverStop))
})
t.Run("now delete the entrypoint", func(t *testing.T) {
// this verifies that our findNewLocalEntrypoint also works when the global
// entrypoint is affected
require.Nil(t, index.Delete(index.entryPointID))
})
t.Run("clean up tombstoned nodes", func(t *testing.T) {
require.Nil(t, index.CleanUpTombstonedNodes(neverStop))
})
t.Run("destroy the index", func(t *testing.T) {
require.Nil(t, index.Drop(context.Background()))
})
}
func bruteForceCosine(vectors [][]float32, query []float32, k int) []uint64 {
type distanceAndIndex struct {
distance float32
index uint64
}
distances := make([]distanceAndIndex, len(vectors))
d := distancer.NewCosineDistanceProvider().New(distancer.Normalize(query))
for i, vec := range vectors {
dist, _, _ := d.Distance(distancer.Normalize(vec))
distances[i] = distanceAndIndex{
index: uint64(i),
distance: dist,
}
}
sort.Slice(distances, func(a, b int) bool {
return distances[a].distance < distances[b].distance
})
if len(distances) < k {
k = len(distances)
}
out := make([]uint64, k)
for i := 0; i < k; i++ {
out[i] = distances[i].index
}
return out
}
func neverStop() bool {
return false
}
// This test simulates what happens when the EP is removed from the
// VectorForID-serving store
func Test_DeleteEPVecInUnderlyingObjectStore(t *testing.T) {
var vectorIndex *hnsw
vectors := [][]float32{
{1, 1},
{2, 2},
{3, 3},
}
vectorErrors := []error{
nil,
nil,
nil,
}
store := testinghelpers.NewDummyStore(t)
t.Run("import the test vectors", func(t *testing.T) {
index, err := New(Config{
RootPath: "doesnt-matter-as-committlogger-is-mocked-out",
ID: "delete-ep-in-underlying-store-test",
MakeCommitLoggerThunk: MakeNoopCommitLogger,
DistanceProvider: distancer.NewL2SquaredProvider(),
VectorForIDThunk: func(ctx context.Context, id uint64) ([]float32, error) {
fmt.Printf("vec for pos=%d is %v\n", id, vectors[int(id)])
return vectors[int(id)], vectorErrors[int(id)]
},
TempVectorForIDThunk: TempVectorForIDThunk(vectors),
}, ent.UserConfig{
MaxConnections: 30,
EFConstruction: 128,
// The actual size does not matter for this test, but if it defaults to
// zero it will constantly think it's full and needs to be deleted - even
// after just being deleted, so make sure to use a positive number here.
VectorCacheMaxObjects: 100000,
}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(),
cyclemanager.NewCallbackGroupNoop(), store)
require.Nil(t, err)
vectorIndex = index
for i, vec := range vectors {
err := vectorIndex.Add(uint64(i), vec)
require.Nil(t, err)
}
fmt.Printf("ep is %d\n", vectorIndex.entryPointID)
})
t.Run("simulate ep vec deletion in object store", func(t *testing.T) {
vectors[0] = nil
vectorErrors[0] = storobj.NewErrNotFoundf(0, "deleted")
vectorIndex.cache.Delete(context.Background(), 0)
})
t.Run("try to insert a fourth vector", func(t *testing.T) {
vectors = append(vectors, []float32{4, 4})
vectorErrors = append(vectorErrors, nil)
pos := len(vectors) - 1
err := vectorIndex.Add(uint64(pos), vectors[pos])
require.Nil(t, err)
})
}