KevinStephenson
Adding in weaviate code
b110593
raw
history blame
6.38 kB
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: hello@weaviate.io
//
package hnsw
import (
"encoding/json"
"fmt"
"strings"
"github.com/weaviate/weaviate/adapters/repos/db/vector/common"
"github.com/weaviate/weaviate/adapters/repos/db/vector/hnsw/distancer"
"github.com/weaviate/weaviate/entities/cyclemanager"
ent "github.com/weaviate/weaviate/entities/vectorindex/hnsw"
)
// Dump writes a human-readable snapshot of the index to stdout for debugging
// purposes. If any labels are given they are joined with ", " and printed as a
// heading above the snapshot. The snapshot lists the index id, entrypoint,
// current maximum layer and tombstones, followed by every non-nil node with
// its connections per level.
func (h *hnsw) Dump(labels ...string) {
	const separator = "--------------------------------------------------\n"

	// Optional heading, only emitted when the caller supplied labels.
	if len(labels) > 0 {
		fmt.Print(separator)
		fmt.Printf("-- %s\n", strings.Join(labels, ", "))
	}

	// Index-level summary.
	fmt.Print(separator)
	fmt.Printf("ID: %s\n", h.id)
	fmt.Printf("Entrypoint: %d\n", h.entryPointID)
	fmt.Printf("Max Level: %d\n", h.currentMaximumLayer)
	fmt.Printf("Tombstones %v\n", h.tombstones)

	// Per-node listing; nil slots in the node list are skipped.
	fmt.Printf("\nNodes and Connections:\n")
	for _, n := range h.nodes {
		if n != nil {
			fmt.Printf(" Node %d (level %d)\n", n.id, n.level)
			for lvl, links := range n.connections {
				fmt.Printf(" Level %d: Connections: %v\n", lvl, links)
			}
		}
	}

	fmt.Print(separator)
}
// DumpJSON writes a JSON snapshot of the index to stdout for debugging
// purposes. The snapshot is a JSONDump: the given labels, the index id,
// entrypoint, current maximum layer and tombstones, plus one JSONDumpNode per
// non-nil node. The output is a single line terminated by a newline and uses
// the format that NewFromJSONDump decodes.
//
// If the snapshot cannot be marshalled, the error is printed and nothing else
// is written.
func (h *hnsw) DumpJSON(labels ...string) {
	dump := JSONDump{
		Labels:              labels,
		ID:                  h.id,
		Entrypoint:          h.entryPointID,
		CurrentMaximumLayer: h.currentMaximumLayer,
		Tombstones:          h.tombstones,
	}

	// Nil slots in the node list carry no information and are left out.
	for _, node := range h.nodes {
		if node == nil {
			continue
		}

		dump.Nodes = append(dump.Nodes, JSONDumpNode{
			ID:          node.id,
			Level:       node.level,
			Connections: node.connections,
		})
	}

	out, err := json.Marshal(dump)
	if err != nil {
		fmt.Println(err)
		// There is no payload to print; stop here instead of emitting an
		// empty line that could be mistaken for output.
		return
	}

	// %s prints the marshalled bytes as-is, no string conversion needed.
	fmt.Printf("%s\n", out)
}
// JSONDump is the JSON representation of an index snapshot. DumpJSON fills
// and prints it; NewFromJSONDump decodes it to rebuild an index.
type JSONDump struct {
// Labels holds the labels the caller passed to DumpJSON.
Labels []string `json:"labels"`
// ID is the id of the dumped index.
ID string `json:"id"`
// Entrypoint is the index's entry point id at dump time.
Entrypoint uint64 `json:"entrypoint"`
// CurrentMaximumLayer is the index's current maximum layer at dump time.
CurrentMaximumLayer int `json:"currentMaximumLayer"`
// Tombstones is the index's tombstone set, copied as-is
// (presumably keyed by node id — confirm against the index type).
Tombstones map[uint64]struct{} `json:"tombstones"`
// Nodes contains one entry per non-nil node of the index.
Nodes []JSONDumpNode `json:"nodes"`
}
// JSONDumpNode is the JSON representation of a single node inside a JSONDump.
type JSONDumpNode struct {
// ID is the node's id; NewFromJSONDump also uses it as the node's position
// in the rebuilt index.
ID uint64 `json:"id"`
// Level is the node's level.
Level int `json:"level"`
// Connections holds the node's connections, one slice per level, indexed by
// level.
Connections [][]uint64 `json:"connections"`
}
// JSONDumpMap is a variant of JSONDump whose nodes store their connections in
// a map keyed by level instead of a slice. NewFromJSONDumpMap decodes it to
// rebuild an index; no producer of this format is visible in this file.
type JSONDumpMap struct {
// Labels is free-form labelling; it is not used when rebuilding an index.
Labels []string `json:"labels"`
// ID becomes the id of the rebuilt index.
ID string `json:"id"`
// Entrypoint becomes the entry point id of the rebuilt index.
Entrypoint uint64 `json:"entrypoint"`
// CurrentMaximumLayer becomes the current maximum layer of the rebuilt index.
CurrentMaximumLayer int `json:"currentMaximumLayer"`
// Tombstones becomes the tombstone set of the rebuilt index
// (presumably keyed by node id — confirm against the index type).
Tombstones map[uint64]struct{} `json:"tombstones"`
// Nodes lists the nodes to place into the rebuilt index.
Nodes []JSONDumpNodeMap `json:"nodes"`
}
// JSONDumpNodeMap is the JSON representation of a single node inside a
// JSONDumpMap.
type JSONDumpNodeMap struct {
// ID is the node's id; NewFromJSONDumpMap also uses it as the node's
// position in the rebuilt index.
ID uint64 `json:"id"`
// Level is the node's level.
Level int `json:"level"`
// Connections holds the node's connections keyed by level.
// NewFromJSONDumpMap stores each entry at the slice index equal to its key.
Connections map[int][]uint64 `json:"connections"`
}
// NewFromJSONDump rebuilds an index from a JSON document in the JSONDump
// format, i.e. the format printed by DumpJSON.
//
// The index is created with a no-op commit logger, a cosine distance provider,
// no-op cycle callback groups, MaxConnections 30 and EFConstruction 128;
// vecForID is installed as the index's vector lookup. The entrypoint, current
// maximum layer, tombstones and nodes are then taken from the dump.
//
// It returns an error if dumpBytes is not valid JSON for a JSONDump or if the
// index cannot be constructed.
func NewFromJSONDump(dumpBytes []byte, vecForID common.VectorForID[float32]) (*hnsw, error) {
	var dump JSONDump
	if err := json.Unmarshal(dumpBytes, &dump); err != nil {
		return nil, err
	}

	index, err := New(Config{
		RootPath:              "doesnt-matter-as-committlogger-is-mocked-out",
		ID:                    dump.ID,
		MakeCommitLoggerThunk: MakeNoopCommitLogger,
		DistanceProvider:      distancer.NewCosineDistanceProvider(),
		VectorForIDThunk:      vecForID,
	}, ent.UserConfig{
		MaxConnections: 30,
		EFConstruction: 128,
	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), nil)
	if err != nil {
		return nil, err
	}

	index.currentMaximumLayer = dump.CurrentMaximumLayer
	index.entryPointID = dump.Entrypoint

	// A dump with `"tombstones": null` (or without the key) decodes to a nil
	// map. Installing that nil map would make any later write to the
	// tombstones panic, so fall back to an empty map.
	index.tombstones = dump.Tombstones
	if index.tombstones == nil {
		index.tombstones = make(map[uint64]struct{})
	}

	// NOTE(review): n.ID is used directly as the position in index.nodes;
	// presumably every id in the dump must fit the node storage that New
	// allocates — confirm.
	for _, n := range dump.Nodes {
		index.nodes[n.ID] = &vertex{
			id:          n.ID,
			level:       n.Level,
			connections: n.Connections,
		}
	}

	return index, nil
}
// NewFromJSONDumpMap rebuilds an index from a JSON document in the
// JSONDumpMap format, where each node's connections are a map keyed by level.
//
// The index is created with a no-op commit logger, a cosine distance provider,
// no-op cycle callback groups, MaxConnections 30 and EFConstruction 128;
// vecForID is installed as the index's vector lookup. The entrypoint, current
// maximum layer, tombstones and nodes are then taken from the dump.
//
// Each node's connection map is converted to a slice indexed by level. The
// slice is sized by the highest level present, so levels missing from the map
// end up as nil entries.
//
// It returns an error if dumpBytes is not valid JSON for a JSONDumpMap, if a
// node has a negative connection level, or if the index cannot be constructed.
func NewFromJSONDumpMap(dumpBytes []byte, vecForID common.VectorForID[float32]) (*hnsw, error) {
	var dump JSONDumpMap
	if err := json.Unmarshal(dumpBytes, &dump); err != nil {
		return nil, err
	}

	// Convert the per-node level maps up front so that invalid input is
	// rejected before any index is constructed.
	perNode := make([][][]uint64, len(dump.Nodes))
	for i, n := range dump.Nodes {
		// Size by the highest level rather than by the number of map entries:
		// with sparse keys (e.g. levels 0 and 2 only) len(map) is too small
		// and indexing by level would go out of range.
		maxLevel := -1
		for level := range n.Connections {
			if level < 0 {
				return nil, fmt.Errorf("node %d: negative connection level %d", n.ID, level)
			}
			if level > maxLevel {
				maxLevel = level
			}
		}

		byLevel := make([][]uint64, maxLevel+1)
		for level, conns := range n.Connections {
			byLevel[level] = conns
		}
		perNode[i] = byLevel
	}

	index, err := New(Config{
		RootPath:              "doesnt-matter-as-committlogger-is-mocked-out",
		ID:                    dump.ID,
		MakeCommitLoggerThunk: MakeNoopCommitLogger,
		DistanceProvider:      distancer.NewCosineDistanceProvider(),
		VectorForIDThunk:      vecForID,
	}, ent.UserConfig{
		MaxConnections: 30,
		EFConstruction: 128,
	}, cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), cyclemanager.NewCallbackGroupNoop(), nil)
	if err != nil {
		return nil, err
	}

	index.currentMaximumLayer = dump.CurrentMaximumLayer
	index.entryPointID = dump.Entrypoint

	// A dump with `"tombstones": null` (or without the key) decodes to a nil
	// map. Installing that nil map would make any later write to the
	// tombstones panic, so fall back to an empty map.
	index.tombstones = dump.Tombstones
	if index.tombstones == nil {
		index.tombstones = make(map[uint64]struct{})
	}

	// NOTE(review): n.ID is used directly as the position in index.nodes;
	// presumably every id in the dump must fit the node storage that New
	// allocates — confirm.
	for i, n := range dump.Nodes {
		index.nodes[n.ID] = &vertex{
			id:          n.ID,
			level:       n.Level,
			connections: perNode[i],
		}
	}

	return index, nil
}
// ValidateLinkIntegrity was added as part of
// https://github.com/weaviate/weaviate/issues/1868 for debugging. It holds the
// index read lock for the whole run and visits every node, logging a warning
// for each node level whose connection count exceeds the configured maximum
// (the layer-zero maximum for level 0, the regular maximum otherwise), and an
// info message once the scan is done.
//
// It is not currently in use anywhere as it is somewhat costly: locking the
// entire graph and iterating over every node would lead to disruptions in
// production. However, keeping this method around may be valuable for future
// investigations where the amount of links may be a problem.
func (h *hnsw) ValidateLinkIntegrity() {
	h.RLock()
	defer h.RUnlock()

	for pos, vtx := range h.nodes {
		if vtx == nil {
			// empty slot, nothing to check
			continue
		}

		for lvl, links := range vtx.connections {
			// Level 0 has its own connection limit.
			limit := h.maximumConnections
			if lvl == 0 {
				limit = h.maximumConnectionsLayerZero
			}

			if len(links) <= limit {
				continue
			}

			h.logger.Warnf("node %d at level %d has %d connections", pos, lvl, len(links))
		}
	}

	h.logger.Infof("completed link integrity check")
}