Spaces:
Running
Running
// _ _ | |
// __ _____ __ ___ ___ __ _| |_ ___ | |
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ | |
// \ V V / __/ (_| |\ V /| | (_| | || __/ | |
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| | |
// | |
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. | |
// | |
// CONTACT: [email protected] | |
// | |
package sempath | |
import ( | |
"context" | |
"fmt" | |
"math" | |
"sort" | |
"time" | |
"github.com/danaugrs/go-tsne/tsne" | |
"github.com/pkg/errors" | |
"github.com/tailor-inc/graphql/language/ast" | |
"github.com/weaviate/weaviate/entities/models" | |
"github.com/weaviate/weaviate/entities/moduletools" | |
"github.com/weaviate/weaviate/entities/search" | |
txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models" | |
"gonum.org/v1/gonum/mat" | |
) | |
func New(c11y Remote) *PathBuilder { | |
return &PathBuilder{ | |
fixedSeed: time.Now().UnixNano(), | |
c11y: c11y, | |
} | |
} | |
type PathBuilder struct { | |
fixedSeed int64 | |
c11y Remote | |
} | |
type Remote interface { | |
MultiNearestWordsByVector(ctx context.Context, vectors [][]float32, k, n int) ([]*txt2vecmodels.NearestNeighbors, error) | |
} | |
func (pb *PathBuilder) AdditionalPropertyDefaultValue() interface{} { | |
return &Params{} | |
} | |
func (pb *PathBuilder) AdditionalPropertyFn(ctx context.Context, | |
in []search.Result, params interface{}, limit *int, | |
argumentModuleParams map[string]interface{}, cfg moduletools.ClassConfig, | |
) ([]search.Result, error) { | |
if parameters, ok := params.(*Params); ok { | |
return pb.CalculatePath(in, parameters) | |
} | |
return nil, errors.New("unknown params") | |
} | |
func (pb *PathBuilder) ExtractAdditionalFn(param []*ast.Argument) interface{} { | |
return &Params{} | |
} | |
func (pb *PathBuilder) CalculatePath(in []search.Result, params *Params) ([]search.Result, error) { | |
if len(in) == 0 { | |
return nil, nil | |
} | |
if params == nil { | |
return nil, fmt.Errorf("no params provided") | |
} | |
dims := len(in[0].Vector) | |
if err := params.SetDefaultsAndValidate(len(in), dims); err != nil { | |
return nil, errors.Wrap(err, "invalid params") | |
} | |
searchNeighbors, err := pb.addSearchNeighbors(params) | |
if err != nil { | |
return nil, err | |
} | |
for i, obj := range in { | |
path, err := pb.calculatePathPerObject(obj, in, params, searchNeighbors) | |
if err != nil { | |
return nil, fmt.Errorf("object %d: %v", i, err) | |
} | |
if in[i].AdditionalProperties == nil { | |
in[i].AdditionalProperties = models.AdditionalProperties{} | |
} | |
in[i].AdditionalProperties["semanticPath"] = path | |
} | |
return in, nil | |
} | |
func (pb *PathBuilder) calculatePathPerObject(obj search.Result, allObjects []search.Result, params *Params, | |
searchNeighbors []*txt2vecmodels.NearestNeighbor, | |
) (*txt2vecmodels.SemanticPath, error) { | |
dims := len(obj.Vector) | |
matrix, neighbors, err := pb.vectorsToMatrix(obj, allObjects, dims, params, searchNeighbors) | |
if err != nil { | |
return nil, err | |
} | |
inputRows := matrix.RawMatrix().Rows | |
t := tsne.NewTSNE(2, float64(inputRows/2), 100, 100, false) | |
res := t.EmbedData(matrix, nil) | |
rows, cols := res.Dims() | |
if rows != inputRows { | |
return nil, fmt.Errorf("have different output results than input %d != %d", inputRows, rows) | |
} | |
// create an explicit copy of the neighbors, so we don't mutate them. | |
// Otherwise the 2nd round will have been influenced by the first | |
projectedNeighbors := copyNeighbors(neighbors) | |
var projectedSearchVector []float32 | |
var projectedTargetVector []float32 | |
for i := 0; i < rows; i++ { | |
vector := make([]float32, cols) | |
for j := range vector { | |
vector[j] = float32(res.At(i, j)) | |
} | |
if i == 0 { // the input object | |
projectedTargetVector = vector | |
} else if i < 1+len(neighbors) { | |
// these must be neighbor props | |
projectedNeighbors[i-1].Vector = vector | |
} else { | |
// is now the very last element which is the search vector | |
projectedSearchVector = vector | |
} | |
} | |
path := pb.buildPath(projectedNeighbors, projectedSearchVector, projectedTargetVector) | |
return pb.addDistancesToPath(path, neighbors, params.SearchVector, obj.Vector) | |
} | |
func (pb *PathBuilder) addSearchNeighbors(params *Params) ([]*txt2vecmodels.NearestNeighbor, error) { | |
nn, err := pb.c11y.MultiNearestWordsByVector(context.TODO(), [][]float32{params.SearchVector}, 36, 50) | |
if err != nil { | |
return nil, err | |
} | |
return nn[0].Neighbors, nil | |
} | |
// TODO: document behavior if it actually stays like this | |
func (pb *PathBuilder) vectorsToMatrix(obj search.Result, allObjects []search.Result, dims int, | |
params *Params, searchNeighbors []*txt2vecmodels.NearestNeighbor, | |
) (*mat.Dense, []*txt2vecmodels.NearestNeighbor, error) { | |
items := 1 // the initial object | |
var neighbors []*txt2vecmodels.NearestNeighbor | |
neighbors = pb.extractNeighbors(allObjects) | |
neighbors = append(neighbors, searchNeighbors...) | |
neighbors = pb.removeDuplicateNeighborsAndDollarNeighbors(neighbors) | |
items += len(neighbors) + 1 // The +1 is for the search vector which we append last | |
// concat all vectors to build gonum dense matrix | |
mergedVectors := make([]float64, items*dims) | |
if l := len(obj.Vector); l != dims { | |
return nil, nil, fmt.Errorf("object: inconsistent vector lengths found: dimensions=%d and object=%d", dims, l) | |
} | |
for j, dim := range obj.Vector { | |
mergedVectors[j] = float64(dim) | |
} | |
withoutNeighbors := 1 * dims | |
for i, neighbor := range neighbors { | |
neighborVector := neighbor.Vector | |
if l := len(neighborVector); l != dims { | |
return nil, nil, fmt.Errorf("neighbor: inconsistent vector lengths found: dimensions=%d and object=%d", dims, l) | |
} | |
for j, dim := range neighborVector { | |
mergedVectors[withoutNeighbors+i*dims+j] = float64(dim) | |
} | |
} | |
for i, dim := range params.SearchVector { | |
mergedVectors[len(mergedVectors)-dims+i] = float64(dim) | |
} | |
return mat.NewDense(items, dims, mergedVectors), neighbors, nil | |
} | |
func (pb *PathBuilder) extractNeighbors(in []search.Result) []*txt2vecmodels.NearestNeighbor { | |
var out []*txt2vecmodels.NearestNeighbor | |
for _, obj := range in { | |
if obj.AdditionalProperties == nil || obj.AdditionalProperties["nearestNeighbors"] == nil { | |
continue | |
} | |
if neighbors, ok := obj.AdditionalProperties["nearestNeighbors"]; ok { | |
if nearestNeighbors, ok := neighbors.(*txt2vecmodels.NearestNeighbors); ok { | |
out = append(out, nearestNeighbors.Neighbors...) | |
} | |
} | |
} | |
return out | |
} | |
func (pb *PathBuilder) removeDuplicateNeighborsAndDollarNeighbors(in []*txt2vecmodels.NearestNeighbor) []*txt2vecmodels.NearestNeighbor { | |
seen := map[string]struct{}{} | |
out := make([]*txt2vecmodels.NearestNeighbor, len(in)) | |
i := 0 | |
for _, candidate := range in { | |
if _, ok := seen[candidate.Concept]; ok { | |
continue | |
} | |
if candidate.Concept[0] == '$' { | |
continue | |
} | |
out[i] = candidate | |
i++ | |
seen[candidate.Concept] = struct{}{} | |
} | |
return out[:i] | |
} | |
func (pb *PathBuilder) buildPath(neighbors []*txt2vecmodels.NearestNeighbor, searchVector []float32, | |
target []float32, | |
) *txt2vecmodels.SemanticPath { | |
var path []*txt2vecmodels.SemanticPathElement | |
minDist := float32(math.MaxFloat32) | |
current := searchVector // initial search point | |
for { | |
nn := pb.nearestNeighbors(current, neighbors, 10) | |
nn = pb.discardFurtherThan(nn, minDist, target) | |
if len(nn) == 0 { | |
break | |
} | |
nn = pb.nearestNeighbors(current, nn, 1) | |
current = nn[0].Vector | |
minDist = pb.distance(current, target) | |
path = append(path, &txt2vecmodels.SemanticPathElement{ | |
Concept: nn[0].Concept, | |
}) | |
} | |
return &txt2vecmodels.SemanticPath{ | |
Path: path, | |
} | |
} | |
func (pb *PathBuilder) nearestNeighbors(search []float32, candidates []*txt2vecmodels.NearestNeighbor, length int) []*txt2vecmodels.NearestNeighbor { | |
sort.Slice(candidates, func(a, b int) bool { | |
return pb.distance(candidates[a].Vector, search) < pb.distance(candidates[b].Vector, search) | |
}) | |
return candidates[:length] | |
} | |
func (pb *PathBuilder) distance(a, b []float32) float32 { | |
var sums float32 | |
for i := range a { | |
sums += (a[i] - b[i]) * (a[i] - b[i]) | |
} | |
return float32(math.Sqrt(float64(sums))) | |
} | |
func (pb *PathBuilder) discardFurtherThan(candidates []*txt2vecmodels.NearestNeighbor, threshold float32, target []float32) []*txt2vecmodels.NearestNeighbor { | |
out := make([]*txt2vecmodels.NearestNeighbor, len(candidates)) | |
i := 0 | |
for _, c := range candidates { | |
if pb.distance(c.Vector, target) >= threshold { | |
continue | |
} | |
out[i] = c | |
i++ | |
} | |
return out[:i] | |
} | |
// create an explicit deep copy that does not keep any references | |
func copyNeighbors(in []*txt2vecmodels.NearestNeighbor) []*txt2vecmodels.NearestNeighbor { | |
out := make([]*txt2vecmodels.NearestNeighbor, len(in)) | |
for i, n := range in { | |
out[i] = &txt2vecmodels.NearestNeighbor{ | |
Concept: n.Concept, | |
Distance: n.Distance, | |
Vector: n.Vector, | |
} | |
} | |
return out | |
} | |
func (pb *PathBuilder) addDistancesToPath(path *txt2vecmodels.SemanticPath, neighbors []*txt2vecmodels.NearestNeighbor, | |
searchVector, targetVector []float32, | |
) (*txt2vecmodels.SemanticPath, error) { | |
for i, elem := range path.Path { | |
vec, ok := neighborVecByConcept(neighbors, elem.Concept) | |
if !ok { | |
return nil, fmt.Errorf("no vector present for concept: %s", elem.Concept) | |
} | |
if i != 0 { | |
// include previous | |
previousVec, ok := neighborVecByConcept(neighbors, path.Path[i-1].Concept) | |
if !ok { | |
return nil, fmt.Errorf("no vector present for previous concept: %s", path.Path[i-1].Concept) | |
} | |
d, err := cosineDist(vec, previousVec) | |
if err != nil { | |
return nil, errors.Wrap(err, "calculate distance between current path and previous element") | |
} | |
path.Path[i].DistanceToPrevious = &d | |
} | |
// target | |
d, err := cosineDist(vec, targetVector) | |
if err != nil { | |
return nil, errors.Wrap(err, "calculate distance between current path and result element") | |
} | |
path.Path[i].DistanceToResult = d | |
// query | |
d, err = cosineDist(vec, searchVector) | |
if err != nil { | |
return nil, errors.Wrap(err, "calculate distance between current path and query element") | |
} | |
path.Path[i].DistanceToQuery = d | |
if i != len(path.Path)-1 { | |
// include next | |
nextVec, ok := neighborVecByConcept(neighbors, path.Path[i+1].Concept) | |
if !ok { | |
return nil, fmt.Errorf("no vector present for next concept: %s", path.Path[i+1].Concept) | |
} | |
d, err := cosineDist(vec, nextVec) | |
if err != nil { | |
return nil, errors.Wrap(err, "calculate distance between current path and next element") | |
} | |
path.Path[i].DistanceToNext = &d | |
} | |
} | |
return path, nil | |
} | |
func neighborVecByConcept(neighbors []*txt2vecmodels.NearestNeighbor, concept string) ([]float32, bool) { | |
for _, n := range neighbors { | |
if n.Concept == concept { | |
return n.Vector, true | |
} | |
} | |
return nil, false | |
} | |
func cosineSim(a, b []float32) (float32, error) { | |
if len(a) != len(b) { | |
return 0, fmt.Errorf("vectors have different dimensions") | |
} | |
var ( | |
sumProduct float64 | |
sumASquare float64 | |
sumBSquare float64 | |
) | |
for i := range a { | |
sumProduct += float64(a[i] * b[i]) | |
sumASquare += float64(a[i] * a[i]) | |
sumBSquare += float64(b[i] * b[i]) | |
} | |
return float32(sumProduct / (math.Sqrt(sumASquare) * math.Sqrt(sumBSquare))), nil | |
} | |
func cosineDist(a, b []float32) (float32, error) { | |
sim, err := cosineSim(a, b) | |
if err != nil { | |
return 0, err | |
} | |
return 1 - sim, nil | |
} | |