KevinStephenson
Adding in weaviate code
b110593
raw
history blame
9.39 kB
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: hello@weaviate.io
//
package hnsw
import (
"flag"
"io"
"net/http"
"net/url"
"os"
"path/filepath"
"sort"
"strconv"
"strings"
"testing"
"time"
"github.com/pkg/errors"
"github.com/stretchr/testify/require"
"github.com/weaviate/weaviate/adapters/repos/db/vector/compressionhelpers"
"gopkg.in/yaml.v2"
)
var (
	// download is the -download test flag. When it is set, dataset files that
	// are missing locally are fetched instead of the benchmark being skipped.
	download = flag.Bool("download", false, "download datasets if not found locally")

	// datasets maps a dataset name, as used in the runbooks, to the local path
	// of the file holding its base vectors.
	datasets = map[string]string{
		"random-xs":              "datasets/big-ann-benchmarks/random10000/data_10000_20",
		"random-xs-clustered":    "datasets/big-ann-benchmarks/random-clustered10000/clu-random.fbin.crop_nb_10000",
		"msturing-1M":            "datasets/big-ann-benchmarks/MSTuringANNS/base1b.fbin.crop_nb_1000000",
		"msturing-10M":           "datasets/big-ann-benchmarks/MSTuringANNS/base1b.fbin.crop_nb_10000000",
		"msspacev-1M":            "datasets/big-ann-benchmarks/MSSPACEV1B/spacev1b_base.i8bin.crop_nb_1000000",
		"msspacev-10M":           "datasets/big-ann-benchmarks/MSSPACEV1B/spacev1b_base.i8bin.crop_nb_10000000",
		"msturing-10M-clustered": "datasets/big-ann-benchmarks/MSTuring-10M-clustered/msturing-10M-clustered.fbin",
	}

	// queries maps a dataset name to the local path of the file holding the
	// query vectors used by "search" operations on that dataset.
	queries = map[string]string{
		"random-xs":              "datasets/big-ann-benchmarks/random10000/queries_1000_20",
		"random-xs-clustered":    "datasets/big-ann-benchmarks/random-clustered10000/queries_1000_20.fbin",
		"msturing-1M":            "datasets/big-ann-benchmarks/MSTuringANNS/query100K.fbin",
		"msturing-10M":           "datasets/big-ann-benchmarks/MSTuringANNS/query100K.fbin",
		"msspacev-1M":            "datasets/big-ann-benchmarks/MSSPACEV1B/query.i8bin",
		"msspacev-10M":           "datasets/big-ann-benchmarks/MSSPACEV1B/query.i8bin",
		"msturing-10M-clustered": "datasets/big-ann-benchmarks/MSTuring-10M-clustered/testQuery10K.fbin",
	}
)
// BenchmarkHnswNeurips23 replays the runbooks under datasets/neurips23 against
// a freshly created HNSW test index.
//
// Each runbook becomes a sub-benchmark, and each dataset step inside it a
// nested sub-benchmark. One benchmark iteration creates an empty index and
// executes the step's operations ("insert", "delete", "search") in order.
//
// A step is skipped when its dataset is unknown or its file is not present
// locally and the -download flag is not set.
func BenchmarkHnswNeurips23(b *testing.B) {
	runbooks := []string{
		"datasets/neurips23/simple_runbook.yaml",
		"datasets/neurips23/clustered_runbook.yaml",
	}

	type datasetPoints struct {
		dataset string
		points  int
	}

	// Base vectors are cached per (dataset, max points) so that repeated
	// invocations of the same sub-benchmark do not re-read the file.
	readDatasets := make(map[datasetPoints][][]float32)

	for _, runbookFile := range runbooks {
		b.Run(runbookFile, func(b *testing.B) {
			rb := readRunbook(b, runbookFile)

			for _, step := range rb.Steps {
				b.Run(step.Dataset, func(b *testing.B) {
					// Read the dataset if we haven't already
					key := datasetPoints{step.Dataset, step.MaxPts}
					vectors, cached := readDatasets[key]
					if !cached {
						file, known := datasets[step.Dataset]
						if !known {
							b.Skipf("Neurips23 dataset %s not found", step.Dataset)
						}

						if _, err := os.Stat(file); err != nil {
							if !*download {
								b.Skipf(`Neurips23 dataset %s not found.
Run test with -download to automatically download the dataset.
Ex: go test -v -benchmem -bench ^BenchmarkHnswNeurips23$ -download`, step.Dataset)
							}

							downloadDataset(b, step.Dataset)
						}

						vectors = readBigAnnDataset(b, file, step.MaxPts)
						readDatasets[key] = vectors
					}

					// Query vectors are loaded lazily by the first "search"
					// operation and then reused for the remaining iterations.
					var queryVectors [][]float32

					b.ResetTimer()

					for iter := 0; iter < b.N; iter++ {
						index := createEmptyHnswIndexForTests(b, idVectorSize(len(vectors[0])))

						for _, op := range step.Operations {
							// NOTE(review): require.NoError below runs on the
							// goroutines started by Concurrently; the testing
							// package only supports FailNow from the goroutine
							// running the benchmark. Left as-is.
							switch op.Operation {
							case "insert":
								compressionhelpers.Concurrently(uint64(op.End-op.Start), func(offset uint64) {
									id := op.Start + int(offset)
									err := index.Add(uint64(id), vectors[id])
									require.NoError(b, err)
								})
							case "delete":
								compressionhelpers.Concurrently(uint64(op.End-op.Start), func(offset uint64) {
									err := index.Delete(uint64(op.Start + int(offset)))
									require.NoError(b, err)
								})
							case "search":
								if len(queryVectors) == 0 {
									queryFile, known := queries[step.Dataset]
									if !known {
										// Stop here: without a path the read
										// below could only fail on an empty
										// file name.
										b.Fatalf("query file: not found for %s dataset", step.Dataset)
									}

									queryVectors = readBigAnnDataset(b, queryFile, 0)
								}

								compressionhelpers.Concurrently(uint64(len(queryVectors)), func(q uint64) {
									_, _, err := index.SearchByVector(queryVectors[q], 0, nil)
									require.NoError(b, err)
								})
							default:
								b.Errorf("Unknown operation %s", op.Operation)
							}
						}
					}
				})
			}
		})
	}
}
// downloadDataset makes sure both the base-vector file and the query file of
// the named dataset are available locally, downloading whichever is missing.
// The test fails immediately if the name has no entry in datasets or queries.
func downloadDataset(t testing.TB, name string) {
	t.Helper()

	dataFile, found := datasets[name]
	if !found {
		t.Fatalf("Dataset %s not found", name)
	}

	queryFile, found := queries[name]
	if !found {
		t.Fatalf("Query file not found for %s dataset", name)
	}

	// Base vectors first, then queries.
	downloadDatasetFile(t, dataFile)
	downloadDatasetFile(t, queryFile)
}
// downloadDatasetFile downloads a single dataset file into its local path
// unless a file already exists there.
//
// The remote location is the local path with the leading "datasets/" removed,
// joined onto the ann-datasets storage bucket URL. The body is streamed into
// "<file>.part" and renamed to its final name only after the copy and close
// succeeded, so an interrupted download is never mistaken for a complete
// dataset by the existence check on a later run.
//
// Any failure (directory creation, request, non-200 status, write, rename)
// fails the test immediately.
func downloadDatasetFile(t testing.TB, file string) {
	t.Helper()

	if _, err := os.Stat(file); err == nil {
		// Already present, nothing to do.
		return
	}

	err := os.MkdirAll(filepath.Dir(file), 0o755)
	require.NoError(t, err)

	path := strings.TrimPrefix(file, "datasets/")
	u, err := url.JoinPath("https://storage.googleapis.com/ann-datasets/", path)
	require.NoError(t, err)

	t.Logf("Downloading dataset from %s", u)

	// NOTE(review): Client.Timeout bounds the whole exchange, including
	// reading the body; the larger datasets may need more than 60 seconds.
	client := http.Client{
		Timeout: 60 * time.Second,
	}

	resp, err := client.Get(u)
	require.NoError(t, err)
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		t.Fatalf("Could not download dataset. Status code: %d", resp.StatusCode)
	}

	partial := file + ".part"
	out, err := os.Create(partial)
	require.NoError(t, err)

	// require.* aborts through FailNow, which still runs deferred calls, so
	// this cleanup also covers the failure paths below.
	done := false
	defer func() {
		if done {
			return
		}
		// Best-effort cleanup; the original failure is already reported.
		_ = out.Close()
		_ = os.Remove(partial)
	}()

	_, err = io.Copy(out, resp.Body)
	require.NoError(t, err)

	// A failed Close can mean buffered data never reached the disk.
	require.NoError(t, out.Close())
	require.NoError(t, os.Rename(partial, file))
	done = true

	t.Logf("Downloaded dataset %s", file)
}
func readBigAnnDataset(t testing.TB, file string, maxObjects int) [][]float32 {
t.Helper()
var vectors [][]float32
f, err := os.Open(file)
if err != nil {
panic(errors.Wrap(err, "Could not open SIFT file"))
}
defer f.Close()
fi, err := f.Stat()
if err != nil {
panic(errors.Wrap(err, "Could not get SIFT file properties"))
}
fileSize := fi.Size()
b := make([]byte, 4)
// The data is a binary file containing either floating point vectors or int8 vectors
// It starts with 8 bytes of header data
// The first 4 bytes are the number of vectors in the file
// The second 4 bytes are the dimensionality of the vectors in the file
// If the file is in fbin format, the vector data needs to be converted from bytes to float.
// If the file is in i8bin format, the vector data needs to be converted from bytes to int8 then to float.
// The first 4 bytes are the number of vectors in the file
_, err = f.Read(b)
require.NoError(t, err)
n := int32FromBytes(b)
// The second 4 bytes are the dimensionality of the vectors in the file
_, err = f.Read(b)
require.NoError(t, err)
d := int32FromBytes(b)
var bytesPerVector int
switch {
case strings.Contains(file, "i8bin"):
bytesPerVector = 1
case strings.Contains(file, "fbin"):
fallthrough
default:
bytesPerVector = 4
}
require.Equal(t, 8+n*d*bytesPerVector, int(fileSize))
vectorBytes := make([]byte, d*bytesPerVector)
if maxObjects > 0 && maxObjects < n {
n = maxObjects
}
for i := 0; i < n; i++ {
_, err = f.Read(vectorBytes)
if err == io.EOF {
break
}
require.NoError(t, err)
vectorFloat := make([]float32, 0, d)
for j := 0; j < d; j++ {
start := j * bytesPerVector
var f float32
if bytesPerVector == 1 {
f = float32(vectorBytes[start])
} else {
f = float32FromBytes(vectorBytes[start : start+bytesPerVector])
}
vectorFloat = append(vectorFloat, f)
}
vectors = append(vectors, vectorFloat)
}
if maxObjects > 0 {
require.Equal(t, maxObjects, len(vectors))
}
return vectors
}
type (
	// runbook is the parsed form of a runbook file: one step per dataset.
	runbook struct {
		Steps []runbookStep
	}

	// runbookStep holds the operations to run against a single dataset.
	runbookStep struct {
		// Dataset is the dataset name, used as key into datasets and queries.
		Dataset string
		// MaxPts is the runbook's "max_pts" value for this dataset.
		MaxPts int
		// Operations are the step's operations in execution order.
		Operations []runbookOperation
	}

	// runbookOperation is a single operation of a runbook step.
	runbookOperation struct {
		// Operation is the operation name, e.g. "insert", "delete" or "search".
		Operation string
		// Start and End are only populated for "insert" and "delete"; the
		// benchmark treats them as the half-open vector range [Start, End).
		Start, End int
	}
)
// readRunbook parses a runbook YAML file into a runbook.
//
// The top level of the file is a mapping from dataset name to step
// description. A step description carries an integer "max_pts" plus its
// operations under the consecutive keys "1", "2", ...; parsing of a step
// stops at the first missing number. Each operation is a mapping with a
// string "operation" and, for "insert" and "delete", integer "start" and
// "end" values.
//
// Steps are returned sorted by dataset name so the order is deterministic.
// A file that cannot be opened or decoded, or that does not have the shape
// described above, fails the test with a message naming the offending entry.
func readRunbook(t testing.TB, file string) *runbook {
	t.Helper()

	f, err := os.Open(file)
	require.NoError(t, err, "Could not open runbook file")
	defer f.Close()

	var raw map[string]map[string]any
	err = yaml.NewDecoder(f).Decode(&raw)
	require.NoError(t, err)

	// Map iteration order is random; sort the names for a stable result.
	names := make([]string, 0, len(raw))
	for name := range raw {
		names = append(names, name)
	}
	sort.Strings(names)

	var rb runbook
	for _, name := range names {
		stepInfo := raw[name]

		maxPts, ok := stepInfo["max_pts"].(int)
		if !ok {
			t.Fatalf("runbook %s: dataset %s: max_pts is missing or not an integer", file, name)
		}

		step := runbookStep{Dataset: name, MaxPts: maxPts}

		for i := 1; ; i++ {
			key := strconv.Itoa(i)

			entry, present := stepInfo[key]
			if !present {
				break
			}

			opInfo, ok := entry.(map[any]any)
			if !ok {
				t.Fatalf("runbook %s: dataset %s: operation %s is not a mapping", file, name, key)
			}

			var op runbookOperation

			op.Operation, ok = opInfo["operation"].(string)
			if !ok {
				t.Fatalf("runbook %s: dataset %s: operation %s has no string \"operation\" field", file, name, key)
			}

			// Only insert and delete carry a vector range.
			if op.Operation == "insert" || op.Operation == "delete" {
				op.Start, ok = opInfo["start"].(int)
				if !ok {
					t.Fatalf("runbook %s: dataset %s: operation %s: start is missing or not an integer", file, name, key)
				}

				op.End, ok = opInfo["end"].(int)
				if !ok {
					t.Fatalf("runbook %s: dataset %s: operation %s: end is missing or not an integer", file, name, key)
				}
			}

			step.Operations = append(step.Operations, op)
		}

		rb.Steps = append(rb.Steps, step)
	}

	return &rb
}