SemanticSearchPOC / adapters /repos /db /inverted /prop_length_tracker_test.go
KevinStephenson
Adding in weaviate code
b110593
raw
history blame
18.2 kB
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: hello@weaviate.io
//
package inverted
import (
"fmt"
"path"
"testing"
"github.com/sirupsen/logrus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// Test_PropertyLengthTracker exercises the bookkeeping of the JSON-backed
// tracker: the mean of a single property, untracking previously tracked
// values, several properties on one tracker, and the 20-property scenario
// implemented by create20PropsAndVerify.
func Test_PropertyLengthTracker(t *testing.T) {
	dirName := t.TempDir()
	trackerPath := path.Join(dirName, "my_test_shard")
	l := logrus.New()

	// Persistence is not the subject of this suite, there is a separate one.
	// A valid path is still required to construct a tracker.
	// NOTE(review): the "test untrack" sub-test calls Flush, so the earlier
	// assumption that nothing is ever written may not hold. The sub-tests
	// that share trackerPath use distinct property names.

	t.Run("single prop", func(t *testing.T) {
		type test struct {
			values       []float32
			name         string
			floatCompare bool
		}

		tests := []test{
			{
				values:       []float32{2, 2, 3, 100, 100, 500, 7},
				name:         "mixed_values",
				floatCompare: true,
			},
			{
				values: []float32{
					1000, 1200, 1000, 1300, 800, 2000, 2050,
					2070, 900,
				},
				name:         "high_values",
				floatCompare: true,
			},
			{
				values: []float32{
					60000, 50000, 65000,
				},
				name:         "very_high_values",
				floatCompare: true,
			},
			{
				values: []float32{
					1, 2, 4, 3, 4, 2, 1, 5, 6, 7, 8, 2, 7, 2, 3, 5,
					6, 3, 5, 9, 3, 4, 8,
				},
				name:         "very_low_values",
				floatCompare: true,
			},
			{
				// The expected mean is zero here, so a relative-error
				// comparison is not usable and exact equality is asserted.
				values:       []float32{0, 0},
				name:         "zeros",
				floatCompare: false,
			},
		}

		for _, tc := range tests {
			t.Run(tc.name, func(t *testing.T) {
				// Each case gets its own file by suffixing the case name.
				tracker, err := NewJsonShardMetaData(trackerPath+tc.name, l)
				require.Nil(t, err)

				actualMean := float32(0)
				for _, v := range tc.values {
					tracker.TrackProperty("my-very-first-prop", v)
					actualMean += v
				}
				actualMean = actualMean / float32(len(tc.values))

				res, err := tracker.PropertyMean("my-very-first-prop")
				require.Nil(t, err)

				if tc.floatCompare {
					assert.InEpsilon(t, actualMean, res, 0.1)
				} else {
					assert.Equal(t, actualMean, res)
				}

				require.Nil(t, tracker.Close())
			})
		}
	})

	t.Run("test untrack", func(t *testing.T) {
		tracker, err := NewJsonShardMetaData(trackerPath, l)
		require.Nil(t, err)

		tracker.TrackProperty("test-prop", 1)
		tracker.TrackProperty("test-prop", 2)
		tracker.TrackProperty("test-prop", 3)
		// A failed flush must fail the test instead of being dropped.
		require.Nil(t, tracker.Flush(false))

		sum, count, mean, err := tracker.PropertyTally("test-prop")
		require.Nil(t, err)
		assert.Equal(t, 6, sum)
		assert.Equal(t, 3, count)
		assert.InEpsilon(t, 2, mean, 0.1)

		tracker.UnTrackProperty("test-prop", 2)
		sum, count, mean, err = tracker.PropertyTally("test-prop")
		require.Nil(t, err)
		assert.Equal(t, 4, sum)
		assert.Equal(t, 2, count)
		assert.InEpsilon(t, 2, mean, 0.1)

		tracker.UnTrackProperty("test-prop", 1)
		sum, count, mean, err = tracker.PropertyTally("test-prop")
		require.Nil(t, err)
		assert.Equal(t, 3, sum)
		assert.Equal(t, 1, count)
		assert.InEpsilon(t, 3, mean, 0.1)

		require.Nil(t, tracker.Close())
	})

	t.Run("multiple properties (can all fit on one page)", func(t *testing.T) {
		type prop struct {
			values   []float32
			propName string
		}

		props := []prop{
			{
				values:   []float32{2, 2, 3, 100, 100, 500, 7},
				propName: "property-numero-uno",
			}, {
				values: []float32{
					1000, 1200, 1000, 1300, 800, 2000, 2050,
					2070, 900,
				},
				propName: "the-second-of-the-properties",
			}, {
				values: []float32{
					60000, 50000, 65000,
				},
				propName: "property_nummer_DREI",
			},
		}

		// This time we use a single tracker
		tracker, err := NewJsonShardMetaData(trackerPath, l)
		require.Nil(t, err)

		for _, p := range props {
			for _, v := range p.values {
				tracker.TrackProperty(p.propName, v)
			}
		}

		for _, p := range props {
			actualMean := float32(0)
			for _, v := range p.values {
				actualMean += v
			}
			actualMean = actualMean / float32(len(p.values))

			res, err := tracker.PropertyMean(p.propName)
			require.Nil(t, err)
			assert.InEpsilon(t, actualMean, res, 0.1)
		}

		require.Nil(t, tracker.Close())
	})

	t.Run("with more properties that can fit on one page", func(t *testing.T) {
		// This time we use a single tracker
		tracker, err := NewJsonShardMetaData(trackerPath, l)
		require.Nil(t, err)

		create20PropsAndVerify(t, tracker)

		require.Nil(t, tracker.Close())
	})
}
// create20PropsAndVerify tracks the values {1, 4, 3, 17} for twenty
// properties named prop_0 through prop_19 on the given tracker and asserts
// that the reported mean of each is within 10% of the arithmetic mean. It
// then tracks one more value for prop_19 and asserts the updated mean.
func create20PropsAndVerify(t *testing.T, tracker *JsonShardMetaData) {
	t.Helper()

	type prop struct {
		values   []float32
		propName string
	}

	// the most props we could ever fit on a single page is 16 if there was no
	// index, which is impossible. This means the practical max is 15, so at
	// least 5 props should overflow to the second page.
	propCount := 20
	props := make([]prop, propCount)
	for i := range props {
		props[i] = prop{
			values:   []float32{1, 4, 3, 17},
			propName: fmt.Sprintf("prop_%d", i),
		}
	}

	for _, p := range props {
		for _, v := range p.values {
			tracker.TrackProperty(p.propName, v)
		}
	}

	for _, p := range props {
		actualMean := float32(0)
		for _, v := range p.values {
			actualMean += v
		}
		actualMean = actualMean / float32(len(p.values))

		res, err := tracker.PropertyMean(p.propName)
		require.Nil(t, err)
		assert.InEpsilon(t, actualMean, res, 0.1)
	}

	// modify a prop on page 2 and verify
	//
	// The tracked value has to match the 25 used in the expected mean below,
	// which is also what the persistence test assumes when it reads prop_19
	// back. Tracking 24 here previously only passed because of the 10%
	// tolerance.
	tracker.TrackProperty("prop_19", 25)
	expectedMeanForProp19 := float32(1+4+3+17+25) / 5.0
	res, err := tracker.PropertyMean("prop_19")
	require.Nil(t, err)
	assert.InEpsilon(t, expectedMeanForProp19, res, 0.1)
}
// Test_PropertyLengthTracker_Persistence fills a JSON-backed tracker, flushes
// and closes it, then opens a second tracker on the same path and checks that
// the means of a first-page and a later-page property are still reported.
// The sub-tests share state and are meant to run in the order written.
func Test_PropertyLengthTracker_Persistence(t *testing.T) {
	dirName := t.TempDir()
	// Named trackerPath so the imported "path" package is not shadowed.
	trackerPath := path.Join(dirName, "my_test_shard")

	var tracker *JsonShardMetaData
	l := logrus.New()

	t.Run("initializing an empty tracker, no file present", func(t *testing.T) {
		tr, err := NewJsonShardMetaData(trackerPath, l)
		require.Nil(t, err)
		tracker = tr
	})

	t.Run("importing multi-page data and verifying", func(t *testing.T) {
		create20PropsAndVerify(t, tracker)
	})

	t.Run("commit the state to disk", func(t *testing.T) {
		require.Nil(t, tracker.Flush(false))
	})

	t.Run("shut down the tracker", func(t *testing.T) {
		require.Nil(t, tracker.Close())
	})

	var secondTracker *JsonShardMetaData
	t.Run("initializing a new tracker from the same file", func(t *testing.T) {
		tr, err := NewJsonShardMetaData(trackerPath, l)
		require.Nil(t, err)
		secondTracker = tr
	})

	t.Run("verify data is correct after read from disk", func(t *testing.T) {
		// root page
		actualMeanForProp0 := float32(1+4+3+17) / 4.0
		res, err := secondTracker.PropertyMean("prop_0")
		require.Nil(t, err)
		assert.InEpsilon(t, actualMeanForProp0, res, 0.1)

		// later page
		actualMeanForProp20 := float32(1+4+3+17+25) / 5.0
		res, err = secondTracker.PropertyMean("prop_19")
		require.Nil(t, err)
		assert.InEpsilon(t, actualMeanForProp20, res, 0.1)
	})

	// Close the re-opened tracker as well, mirroring the old-tracker
	// persistence test, so it is not left open when the test ends.
	t.Run("shut down the second tracker", func(t *testing.T) {
		require.Nil(t, secondTracker.Close())
	})
}
// TestFormatConversion tests the switch from the old property length tracker
// to the new one: data is written and flushed with the old tracker, then the
// same path is opened with the JSON-backed tracker and the values are read
// back. The sub-tests share state and are meant to run in the order written.
func TestFormatConversion(t *testing.T) {
	dirName := t.TempDir()
	// Named trackerPath so the imported "path" package is not shadowed.
	trackerPath := path.Join(dirName, "my_test_shard")

	var tracker *PropertyLengthTracker

	t.Run("initializing an empty tracker, no file present", func(t *testing.T) {
		tr, err := NewPropertyLengthTracker(trackerPath)
		require.Nil(t, err)
		tracker = tr
	})

	t.Run("importing multi-page data and verifying", func(t *testing.T) {
		create20PropsAndVerify_old(t, tracker)
	})

	t.Run("commit the state to disk", func(t *testing.T) {
		require.Nil(t, tracker.Flush())
	})

	t.Run("shut down the tracker", func(t *testing.T) {
		require.Nil(t, tracker.Close())
	})

	var newTracker *JsonShardMetaData
	l := logrus.New()

	t.Run("initializing a new tracker from the same file", func(t *testing.T) {
		tr, err := NewJsonShardMetaData(trackerPath, l)
		require.Nil(t, err)
		newTracker = tr
	})

	t.Run("verify data is correct after read from disk", func(t *testing.T) {
		// root page
		actualMeanForProp0 := float32(1+4+3+17) / 4.0
		res, err := newTracker.PropertyMean("prop_0")
		require.Nil(t, err)
		assert.InEpsilon(t, actualMeanForProp0, res, 0.1)

		// later page
		actualMeanForProp20 := float32(1+4+3+17+25) / 5.0
		res, err = newTracker.PropertyMean("prop_19")
		require.Nil(t, err)
		assert.InEpsilon(t, actualMeanForProp20, res, 0.1)

		// prop_22 only ever received zero values.
		res, err = newTracker.PropertyMean("prop_22")
		require.Nil(t, err)
		// Expected value first, as the assertion signature requires.
		assert.EqualValues(t, 0, res)

		sum, count, average, err := newTracker.PropertyTally("prop_22")
		require.Nil(t, err)
		assert.EqualValues(t, 0, sum)
		assert.EqualValues(t, 3, count)
		assert.EqualValues(t, 0, average)
	})

	// Close the converted tracker so it is not left open when the test ends.
	t.Run("shut down the new tracker", func(t *testing.T) {
		require.Nil(t, newTracker.Close())
	})
}
// create20PropsAndVerify_old is the old-tracker counterpart of
// create20PropsAndVerify. It tracks the values {1, 4, 3, 17} for twenty
// properties named prop_0 through prop_19, tracks three zero values for
// prop_22, and asserts that each reported mean is within 10% of the
// arithmetic mean. It then tracks one more value for prop_19 and checks the
// updated mean, and finally checks that prop_22 reports zero.
func create20PropsAndVerify_old(t *testing.T, tracker *PropertyLengthTracker) {
	t.Helper()

	type prop struct {
		values   []float32
		propName string
	}

	// the most props we could ever fit on a single page is 16 if there was no
	// index, which is impossible. This means the practical max is 15, so at
	// least 5 props should overflow to the second page.
	propCount := 20
	props := make([]prop, propCount)
	for i := range props {
		props[i] = prop{
			values:   []float32{1, 4, 3, 17},
			propName: fmt.Sprintf("prop_%d", i),
		}
	}

	for _, p := range props {
		for _, v := range p.values {
			tracker.TrackProperty(p.propName, v)
		}
	}

	tracker.TrackProperty("prop_22", 0)
	tracker.TrackProperty("prop_22", 0)
	tracker.TrackProperty("prop_22", 0)

	for _, p := range props {
		actualMean := float32(0)
		for _, v := range p.values {
			actualMean += v
		}
		actualMean = actualMean / float32(len(p.values))

		res, err := tracker.PropertyMean(p.propName)
		require.Nil(t, err)
		assert.InEpsilon(t, actualMean, res, 0.1)
	}

	// modify a prop on page 2 and verify
	//
	// The tracked value has to match the 25 used in the expected mean below,
	// which is also what the persistence and conversion tests assume when
	// they read prop_19 back. Tracking 24 here previously only passed because
	// of the 10% tolerance.
	tracker.TrackProperty("prop_19", 25)
	expectedMeanForProp19 := float32(1+4+3+17+25) / 5.0
	res, err := tracker.PropertyMean("prop_19")
	require.Nil(t, err)
	assert.InEpsilon(t, expectedMeanForProp19, res, 0.1)

	res, err = tracker.PropertyMean("prop_22")
	require.Nil(t, err)
	// Expected value first, as the assertion signature requires.
	assert.EqualValues(t, 0, res)

	// The count and error results are left unasserted here, matching the
	// count check that is kept disabled below.
	sum, _, average, _ := tracker.PropertyTally("prop_22")
	assert.EqualValues(t, 0, sum)
	// assert.EqualValues(t, 3, count)
	assert.EqualValues(t, 0, average)
}
// TestOldPropertyLengthTracker tests the old property length tracker: the
// mean of a single property, untracking previously tracked values, several
// properties on one tracker, and the 20-property scenario implemented by
// create20PropsAndVerify_old.
func TestOldPropertyLengthTracker(t *testing.T) {
	dirName := t.TempDir()
	trackerPath := path.Join(dirName, "my_test_shard")

	// Persistence is not the subject of this suite, there is a separate one.
	// A valid path is still required to construct a tracker.
	// NOTE(review): the "test untrack" sub-test calls Flush, so the earlier
	// assumption that nothing is ever written may not hold. The sub-tests
	// that share trackerPath use distinct property names.

	t.Run("single prop", func(t *testing.T) {
		type test struct {
			values       []float32
			name         string
			floatCompare bool
		}

		tests := []test{
			{
				values:       []float32{2, 2, 3, 100, 100, 500, 7},
				name:         "mixed_values",
				floatCompare: true,
			}, {
				values: []float32{
					1000, 1200, 1000, 1300, 800, 2000, 2050,
					2070, 900,
				},
				name:         "high_values",
				floatCompare: true,
			}, {
				values: []float32{
					60000, 50000, 65000,
				},
				name:         "very_high_values",
				floatCompare: true,
			}, {
				values: []float32{
					1, 2, 4, 3, 4, 2, 1, 5, 6, 7, 8, 2, 7, 2, 3, 5,
					6, 3, 5, 9, 3, 4, 8,
				},
				name:         "very_low_values",
				floatCompare: true,
			}, {
				// The expected mean is zero here, so a relative-error
				// comparison is not usable and exact equality is asserted.
				values:       []float32{0, 0},
				name:         "zeros",
				floatCompare: false,
			},
		}

		for _, tc := range tests {
			t.Run(tc.name, func(t *testing.T) {
				// Each case gets its own file by suffixing the case name.
				tracker, err := NewPropertyLengthTracker(trackerPath + tc.name)
				require.Nil(t, err)

				actualMean := float32(0)
				for _, v := range tc.values {
					tracker.TrackProperty("my-very-first-prop", v)
					actualMean += v
				}
				actualMean = actualMean / float32(len(tc.values))

				res, err := tracker.PropertyMean("my-very-first-prop")
				require.Nil(t, err)

				if tc.floatCompare {
					assert.InEpsilon(t, actualMean, res, 0.1)
				} else {
					assert.Equal(t, actualMean, res)
				}

				require.Nil(t, tracker.Close())
			})
		}
	})

	t.Run("test untrack", func(t *testing.T) {
		tracker, err := NewPropertyLengthTracker(trackerPath)
		require.Nil(t, err)

		tracker.TrackProperty("test-prop", 1)
		tracker.TrackProperty("test-prop", 2)
		tracker.TrackProperty("test-prop", 3)
		// A failed flush must fail the test instead of being dropped.
		require.Nil(t, tracker.Flush())

		sum, count, mean, err := tracker.PropertyTally("test-prop")
		require.Nil(t, err)
		assert.Equal(t, 6, sum)
		assert.Equal(t, 3, count)
		assert.InEpsilon(t, 2, mean, 0.1)

		tracker.UnTrackProperty("test-prop", 2)
		sum, count, mean, err = tracker.PropertyTally("test-prop")
		require.Nil(t, err)
		assert.Equal(t, 4, sum)
		assert.Equal(t, 2, count)
		assert.InEpsilon(t, 2, mean, 0.1)

		tracker.UnTrackProperty("test-prop", 1)
		sum, count, mean, err = tracker.PropertyTally("test-prop")
		require.Nil(t, err)
		assert.Equal(t, 3, sum)
		assert.Equal(t, 1, count)
		assert.InEpsilon(t, 3, mean, 0.1)

		require.Nil(t, tracker.Close())
	})

	t.Run("multiple properties (can all fit on one page)", func(t *testing.T) {
		type prop struct {
			values   []float32
			propName string
		}

		props := []prop{
			{
				values:   []float32{2, 2, 3, 100, 100, 500, 7},
				propName: "property-numero-uno",
			}, {
				values: []float32{
					1000, 1200, 1000, 1300, 800, 2000, 2050,
					2070, 900,
				},
				propName: "the-second-of-the-properties",
			}, {
				values: []float32{
					60000, 50000, 65000,
				},
				propName: "property_nummer_DREI",
			},
		}

		// This time we use a single tracker
		tracker, err := NewPropertyLengthTracker(trackerPath)
		require.Nil(t, err)

		for _, p := range props {
			for _, v := range p.values {
				tracker.TrackProperty(p.propName, v)
			}
		}

		for _, p := range props {
			actualMean := float32(0)
			for _, v := range p.values {
				actualMean += v
			}
			actualMean = actualMean / float32(len(p.values))

			res, err := tracker.PropertyMean(p.propName)
			require.Nil(t, err)
			assert.InEpsilon(t, actualMean, res, 0.1)
		}

		require.Nil(t, tracker.Close())
	})

	t.Run("with more properties that can fit on one page", func(t *testing.T) {
		// This time we use a single tracker
		tracker, err := NewPropertyLengthTracker(trackerPath)
		require.Nil(t, err)

		create20PropsAndVerify_old(t, tracker)

		require.Nil(t, tracker.Close())
	})
}
// TestOldPropertyLengthTracker_Persistence writes multi-page data with the
// old tracker, flushes and closes it, then opens a second old tracker on the
// same path and checks the means of a first-page and a later-page property.
// The sub-tests share state and are meant to run in the order written.
func TestOldPropertyLengthTracker_Persistence(t *testing.T) {
	shardPath := path.Join(t.TempDir(), "my_test_shard")

	var first *PropertyLengthTracker

	t.Run("initializing an empty tracker, no file present", func(t *testing.T) {
		created, err := NewPropertyLengthTracker(shardPath)
		require.Nil(t, err)
		first = created
	})

	t.Run("importing multi-page data and verifying", func(t *testing.T) {
		create20PropsAndVerify_old(t, first)
	})

	t.Run("commit the state to disk", func(t *testing.T) {
		flushErr := first.Flush()
		require.Nil(t, flushErr)
	})

	t.Run("shut down the tracker", func(t *testing.T) {
		closeErr := first.Close()
		require.Nil(t, closeErr)
	})

	var reopened *PropertyLengthTracker

	t.Run("initializing a new tracker from the same file", func(t *testing.T) {
		loaded, err := NewPropertyLengthTracker(shardPath)
		require.Nil(t, err)
		reopened = loaded
	})

	t.Run("verify data is correct after read from disk", func(t *testing.T) {
		// root page
		wantProp0 := float32(1+4+3+17) / 4.0
		gotProp0, err := reopened.PropertyMean("prop_0")
		require.Nil(t, err)
		assert.InEpsilon(t, wantProp0, gotProp0, 0.1)

		// later page
		wantProp19 := float32(1+4+3+17+25) / 5.0
		gotProp19, err := reopened.PropertyMean("prop_19")
		require.Nil(t, err)
		assert.InEpsilon(t, wantProp19, gotProp19, 0.1)
	})

	t.Run("shut down the second tracker", func(t *testing.T) {
		closeErr := reopened.Close()
		require.Nil(t, closeErr)
	})
}
// Test_PropertyLengthTracker_Overflow tracks 16*15 distinct properties on an
// old-format tracker, requiring each call to succeed, and then requires that
// tracking one further property returns an error.
func Test_PropertyLengthTracker_Overflow(t *testing.T) {
	shardPath := path.Join(t.TempDir(), "my_test_shard")

	tracker, err := NewPropertyLengthTracker(shardPath)
	require.Nil(t, err)

	const propCount = 16 * 15
	for i := 0; i < propCount; i++ {
		name := fmt.Sprintf("prop_%v", i)
		trackErr := tracker.TrackProperty(name, float32(i))
		require.Nil(t, trackErr)
	}

	// Check that property that would cause the internal counter to overflow is not added
	err = tracker.TrackProperty("OVERFLOW", float32(123))
	require.NotNil(t, err)

	closeErr := tracker.Close()
	require.Nil(t, closeErr)
}
// Test_PropertyLengthTracker_ObjectTracking tests that object tracking works:
// the object tally follows positive and negative TrackObjects deltas, and the
// tally is still reported after the tracker is closed and a new tracker is
// opened on the same path.
func Test_PropertyLengthTracker_ObjectTracking(t *testing.T) {
	dirName := t.TempDir()
	// Named trackerPath so the imported "path" package is not shadowed.
	trackerPath := path.Join(dirName, "my_test_shard")

	var tracker *JsonShardMetaData
	l := logrus.New()

	t.Run("initializing an empty tracker, no file present", func(t *testing.T) {
		tr, err := NewJsonShardMetaData(trackerPath, l)
		require.Nil(t, err)
		tracker = tr
	})

	t.Run("test object tracking", func(t *testing.T) {
		// Expected value first in every assertion, as the assertion
		// signature requires, so failure messages label the values correctly.
		require.Equal(t, 0, tracker.ObjectTally())

		tracker.TrackObjects(1)
		require.Equal(t, 1, tracker.ObjectTally())

		tracker.TrackObjects(1)
		require.Equal(t, 2, tracker.ObjectTally())

		tracker.TrackObjects(-1)
		require.Equal(t, 1, tracker.ObjectTally())

		tracker.TrackObjects(-1)
		require.Equal(t, 0, tracker.ObjectTally())

		tracker.TrackObjects(2)
		require.Equal(t, 2, tracker.ObjectTally())

		// Close, then re-open from the same path and check the tally again.
		err := tracker.Close()
		require.Nil(t, err)

		tr, err := NewJsonShardMetaData(trackerPath, l)
		require.Nil(t, err)
		tracker = tr
		require.Equal(t, 2, tracker.ObjectTally())

		// Close the re-opened tracker so it is not left open at test end.
		require.Nil(t, tracker.Close())
	})
}