//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: [email protected]
//
package hnsw

import (
	"flag"
	"io"
	"net/http"
	"net/url"
	"os"
	"path/filepath"
	"sort"
	"strconv"
	"strings"
	"testing"
	"time"

	"github.com/pkg/errors"
	"github.com/stretchr/testify/require"
	"github.com/weaviate/weaviate/adapters/repos/db/vector/compressionhelpers"
	"gopkg.in/yaml.v2"
)
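
// download controls whether missing benchmark datasets are fetched from the
// remote bucket (see downloadDatasetFile) instead of skipping the benchmark.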
var download = flag.Bool("download", false, "download datasets if not found locally")

var datasets = map[string]string{
	"random-xs":              "datasets/big-ann-benchmarks/random10000/data_10000_20",
	"random-xs-clustered":    "datasets/big-ann-benchmarks/random-clustered10000/clu-random.fbin.crop_nb_10000",
	"msturing-1M":            "datasets/big-ann-benchmarks/MSTuringANNS/base1b.fbin.crop_nb_1000000",
	"msturing-10M":           "datasets/big-ann-benchmarks/MSTuringANNS/base1b.fbin.crop_nb_10000000",
	"msspacev-1M":            "datasets/big-ann-benchmarks/MSSPACEV1B/spacev1b_base.i8bin.crop_nb_1000000",
	"msspacev-10M":           "datasets/big-ann-benchmarks/MSSPACEV1B/spacev1b_base.i8bin.crop_nb_10000000",
	"msturing-10M-clustered": "datasets/big-ann-benchmarks/MSTuring-10M-clustered/msturing-10M-clustered.fbin",
}

var queries = map[string]string{
	"random-xs":              "datasets/big-ann-benchmarks/random10000/queries_1000_20",
	"random-xs-clustered":    "datasets/big-ann-benchmarks/random-clustered10000/queries_1000_20.fbin",
	"msturing-1M":            "datasets/big-ann-benchmarks/MSTuringANNS/query100K.fbin",
	"msturing-10M":           "datasets/big-ann-benchmarks/MSTuringANNS/query100K.fbin",
	"msspacev-1M":            "datasets/big-ann-benchmarks/MSSPACEV1B/query.i8bin",
	"msspacev-10M":           "datasets/big-ann-benchmarks/MSSPACEV1B/query.i8bin",
	"msturing-10M-clustered": "datasets/big-ann-benchmarks/MSTuring-10M-clustered/testQuery10K.fbin",
}
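
// BenchmarkHnswNeurips23 replays the NeurIPS'23 streaming runbooks against a
// fresh HNSW index: for every step it loads the referenced big-ann-benchmarks
// dataset and executes the step's insert/delete/search operations concurrently.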
func BenchmarkHnswNeurips23(b *testing.B) {
	runbooks := []string{
		"datasets/neurips23/simple_runbook.yaml",
		"datasets/neurips23/clustered_runbook.yaml",
	}

	type datasetPoints struct {
		dataset string
		points  int
	}
	readDatasets := make(map[datasetPoints][][]float32)

	for _, runbookFile := range runbooks {
		b.Run(runbookFile, func(b *testing.B) {
			runbook := readRunbook(b, runbookFile)

			for _, step := range runbook.Steps {
				b.Run(step.Dataset, func(b *testing.B) {
					// Read the dataset if we haven't already
					vectors, ok := readDatasets[datasetPoints{step.Dataset, step.MaxPts}]
					if !ok {
						file, ok := datasets[step.Dataset]
						if !ok {
							b.Skipf("Neurips23 dataset %s not found", step.Dataset)
						}

						if _, err := os.Stat(file); err != nil {
							if !*download {
								b.Skipf(`Neurips23 dataset %s not found.
								Run test with -download to automatically download the dataset.
								Ex: go test -v -benchmem -bench ^BenchmarkHnswNeurips23$ -download`, step.Dataset)
							}

							downloadDataset(b, step.Dataset)
						}

						readDatasets[datasetPoints{step.Dataset, step.MaxPts}] = readBigAnnDataset(b, file, step.MaxPts)
						vectors = readDatasets[datasetPoints{step.Dataset, step.MaxPts}]
					}

					var queryVectors [][]float32

					b.ResetTimer()

					for i := 0; i < b.N; i++ {
						index := createEmptyHnswIndexForTests(b, idVectorSize(len(vectors[0])))

						for _, op := range step.Operations {
							switch op.Operation {
							case "insert":
								compressionhelpers.Concurrently(uint64(op.End-op.Start), func(i uint64) {
									err := index.Add(uint64(op.Start+int(i)), vectors[op.Start+int(i)])
									require.NoError(b, err)
								})
							case "delete":
								compressionhelpers.Concurrently(uint64(op.End-op.Start), func(i uint64) {
									err := index.Delete(uint64(op.Start + int(i)))
									require.NoError(b, err)
								})
							case "search":
								if len(queryVectors) == 0 {
									file, ok := queries[step.Dataset]
									if !ok {
										b.Errorf("query file: not found for %s dataset", step.Dataset)
									}

									queryVectors = readBigAnnDataset(b, file, 0)
								}

								compressionhelpers.Concurrently(uint64(len(queryVectors)), func(i uint64) {
									_, _, err := index.SearchByVector(queryVectors[i], 0, nil)
									require.NoError(b, err)
								})
							default:
								b.Errorf("Unknown operation %s", op.Operation)
							}
						}
					}
				})
			}
		})
	}
}
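
// downloadDataset fetches both the base vectors and the query vectors for the
// named dataset, failing the test if the name is not registered above.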
func downloadDataset(t testing.TB, name string) {
	t.Helper()

	ds, ok := datasets[name]
	if !ok {
		t.Fatalf("Dataset %s not found", name)
	}

	qs, ok := queries[name]
	if !ok {
		t.Fatalf("Query file not found for %s dataset", name)
	}

	for _, f := range []string{ds, qs} {
		downloadDatasetFile(t, f)
	}
}
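
// downloadDatasetFile downloads a single dataset file from the public
// ann-datasets bucket into its expected location under datasets/, skipping
// the download if the file already exists locally.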
func downloadDatasetFile(t testing.TB, file string) {
	t.Helper()

	if _, err := os.Stat(file); err == nil {
		return
	}

	err := os.MkdirAll(filepath.Dir(file), 0o755)
	require.NoError(t, err)

	path := strings.TrimPrefix(file, "datasets/")

	u, err := url.JoinPath("https://storage.googleapis.com/ann-datasets/", path)
	require.NoError(t, err)

	t.Logf("Downloading dataset from %s", u)

	client := http.Client{
		Timeout: 60 * time.Second,
	}

	resp, err := client.Get(u)
	require.NoError(t, err)
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		t.Fatalf("Could not download dataset. Status code: %d", resp.StatusCode)
	}

	f, err := os.Create(file)
	require.NoError(t, err)
	defer f.Close()

	_, err = io.Copy(f, resp.Body)
	require.NoError(t, err)

	t.Logf("Downloaded dataset %s", file)
}
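
// readBigAnnDataset reads up to maxObjects vectors from a big-ann-benchmarks
// binary file (fbin or i8bin format) and returns them as float32 slices.
// A maxObjects of 0 reads the whole file.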
func readBigAnnDataset(t testing.TB, file string, maxObjects int) [][]float32 {
	t.Helper()

	var vectors [][]float32

	f, err := os.Open(file)
	if err != nil {
		panic(errors.Wrap(err, "Could not open dataset file"))
	}
	defer f.Close()

	fi, err := f.Stat()
	if err != nil {
		panic(errors.Wrap(err, "Could not get dataset file properties"))
	}
	fileSize := fi.Size()

	b := make([]byte, 4)

	// The data is a binary file containing either floating point vectors or int8 vectors.
	// It starts with 8 bytes of header data:
	// the first 4 bytes are the number of vectors in the file,
	// the second 4 bytes are the dimensionality of the vectors in the file.
	// If the file is in fbin format, the vector data needs to be converted from bytes to float.
	// If the file is in i8bin format, the vector data needs to be converted from bytes to int8 and then to float.

	// The first 4 bytes are the number of vectors in the file
	_, err = f.Read(b)
	require.NoError(t, err)
	n := int32FromBytes(b)

	// The second 4 bytes are the dimensionality of the vectors in the file
	_, err = f.Read(b)
	require.NoError(t, err)
	d := int32FromBytes(b)

	var bytesPerVector int
	switch {
	case strings.Contains(file, "i8bin"):
		bytesPerVector = 1
	case strings.Contains(file, "fbin"):
		fallthrough
	default:
		bytesPerVector = 4
	}

	require.Equal(t, 8+n*d*bytesPerVector, int(fileSize))

	vectorBytes := make([]byte, d*bytesPerVector)

	if maxObjects > 0 && maxObjects < n {
		n = maxObjects
	}

	for i := 0; i < n; i++ {
		_, err = f.Read(vectorBytes)
		if err == io.EOF {
			break
		}
		require.NoError(t, err)

		vectorFloat := make([]float32, 0, d)
		for j := 0; j < d; j++ {
			start := j * bytesPerVector

			var f float32
			if bytesPerVector == 1 {
				// i8bin: one byte per dimension
				f = float32(vectorBytes[start])
			} else {
				// fbin: four bytes (float32) per dimension
				f = float32FromBytes(vectorBytes[start : start+bytesPerVector])
			}

			vectorFloat = append(vectorFloat, f)
		}

		vectors = append(vectors, vectorFloat)
	}

	if maxObjects > 0 {
		require.Equal(t, maxObjects, len(vectors))
	}

	return vectors
}
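
// runbook mirrors the structure of a NeurIPS'23 runbook YAML file: a sequence
// of per-dataset steps, each holding the maximum number of points and an
// ordered list of insert/delete/search operations.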
type runbook struct {
	Steps []runbookStep
}

type runbookStep struct {
	Dataset    string
	MaxPts     int
	Operations []runbookOperation
}

type runbookOperation struct {
	Operation string
	Start     int
	End       int
}
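
// readRunbook parses a runbook YAML file into the structs above. The YAML maps
// dataset names to a "max_pts" entry plus numbered operation entries ("1", "2",
// ...), which are read in order until a number is missing.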
func readRunbook(t testing.TB, file string) *runbook {
	f, err := os.Open(file)
	require.NoError(t, err, "Could not open runbook file")
	defer f.Close()

	d := yaml.NewDecoder(f)

	var runbook runbook

	var m map[string]map[string]any
	err = d.Decode(&m)
	require.NoError(t, err)

	var datasets []string
	for datasetName := range m {
		datasets = append(datasets, datasetName)
	}
	sort.Strings(datasets)

	for _, datasetName := range datasets {
		stepInfo := m[datasetName]

		var step runbookStep
		step.Dataset = datasetName
		step.MaxPts = stepInfo["max_pts"].(int)

		i := 1
		for {
			s := strconv.Itoa(i)
			if _, ok := stepInfo[s]; !ok {
				break
			}

			opInfo := stepInfo[s].(map[any]any)

			var op runbookOperation
			op.Operation = opInfo["operation"].(string)
			if op.Operation == "insert" || op.Operation == "delete" {
				op.Start = opInfo["start"].(int)
				op.End = opInfo["end"].(int)
			}

			step.Operations = append(step.Operations, op)

			i++
		}

		runbook.Steps = append(runbook.Steps, step)
	}

	return &runbook
}