//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: [email protected]
//
package vectorizer

import (
	"context"
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/weaviate/weaviate/entities/models"
	txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models"
)

// InspectorClient is the subset of the contextionary client the Inspector
// needs: word/corpus vectorization, nearest-neighbor lookup, and presence checks.
type InspectorClient interface {
	VectorForWord(ctx context.Context, word string) ([]float32, error)
	VectorForCorpi(ctx context.Context, words []string,
		overrides map[string]string) ([]float32, []txt2vecmodels.InterpretationSource, error)
	NearestWordsByVector(ctx context.Context, vector []float32, n int, k int) ([]string, []float32, error)
	IsWordPresent(ctx context.Context, word string) (bool, error)
}

// Inspector answers c11y word inspections by delegating to an InspectorClient.
type Inspector struct {
	client InspectorClient
}

func NewInspector(client InspectorClient) *Inspector {
	return &Inspector{client: client}
}

// GetWords inspects the (possibly camelCased) input: it splits it into
// individual words and, if there is more than one, additionally describes the
// concatenated corpus.
func (i *Inspector) GetWords(ctx context.Context, words string) (*models.C11yWordsResponse, error) {
	wordArray, err := i.validateAndSplit(words)
	if err != nil {
		return nil, err
	}

	concatWord, err := i.concatWord(ctx, words, wordArray)
	if err != nil {
		return nil, err
	}

	individualWords, err := i.individualWords(ctx, wordArray)
	if err != nil {
		return nil, err
	}

	return &models.C11yWordsResponse{
		ConcatenatedWord: concatWord,
		IndividualWords:  individualWords,
	}, nil
}

func (i *Inspector) validateAndSplit(words string) ([]string, error) {
	// guard against empty input: indexing the first rune below would panic otherwise
	if words == "" {
		return nil, fmt.Errorf("invalid word input: words must not be empty")
	}

	// set first character to lowercase
	wordChars := []rune(words)
	wordChars[0] = unicode.ToLower(wordChars[0])
	words = string(wordChars)

	for _, r := range words {
		if !unicode.IsLetter(r) && !unicode.IsNumber(r) {
			return nil, fmt.Errorf("invalid word input: words must only contain unicode letters and digits")
		}
	}

	return split(words), nil
}

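// For illustration (behavior derived from the checks above, not additional
// logic): validateAndSplit lowercases the first rune, rejects anything that is
// not a unicode letter or digit (so spaces and punctuation are errors), and
// then splits on camelCase boundaries:
//
//	i.validateAndSplit("MagicBeans")  // -> []string{"magic", "beans"}, nil
//	i.validateAndSplit("magic beans") // -> nil, error (space is not a letter or digit)
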
func (i *Inspector) concatWord(ctx context.Context, words string,
	wordArray []string,
) (*models.C11yWordsResponseConcatenatedWord, error) {
	if len(wordArray) < 2 {
		// only build a concat response if we have more than a single word
		return nil, nil
	}

	// Join the words into a single corpus. While the contextionary also
	// supports building a centroid from multiple corpi (hence the []string
	// parameter of VectorForCorpi), occurrence-based weighing can only happen
	// within a single corpus. It is therefore preferable to concatenate the
	// words into one corpus rather than treating each word as its own.
	corpus := strings.Join(wordArray, " ")
	vector, _, err := i.client.VectorForCorpi(ctx, []string{corpus}, nil)
	if err != nil {
		return nil, err
	}

	nearestNeighbors, err := i.nearestNeighbors(ctx, vector)
	if err != nil {
		return nil, err
	}

	return &models.C11yWordsResponseConcatenatedWord{
		ConcatenatedWord:             words,
		SingleWords:                  wordArray,
		ConcatenatedVector:           vector,
		ConcatenatedNearestNeighbors: nearestNeighbors,
	}, nil
}

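// To make the single-corpus choice above concrete (an illustrative note, not
// additional behavior): for the input "fastCar", wordArray is
// []string{"fast", "car"} and the client receives the single corpus
// []string{"fast car"}, so occurrence-based weighing applies across the joined
// text rather than averaging two independent single-word corpi.
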
func (i *Inspector) nearestNeighbors(ctx context.Context,
	vector []float32,
) ([]*models.C11yNearestNeighborsItems0, error) {
	// relate words of centroid
	words, dists, err := i.client.NearestWordsByVector(ctx, vector, 12, 32)
	if err != nil {
		return nil, err
	}

	nearestNeighbors := []*models.C11yNearestNeighborsItems0{}

	// loop over the nearest-neighbor results and build the response items
	for i, word := range words {
		item := models.C11yNearestNeighborsItems0{
			Word:     word,
			Distance: dists[i],
		}

		nearestNeighbors = append(nearestNeighbors, &item)
	}

	return nearestNeighbors, nil
}

func (i *Inspector) individualWords(ctx context.Context,
	wordArray []string,
) ([]*models.C11yWordsResponseIndividualWordsItems0, error) {
	var res []*models.C11yWordsResponseIndividualWordsItems0
	for _, word := range wordArray {
		iw, err := i.individualWord(ctx, word)
		if err != nil {
			return nil, fmt.Errorf("word '%s': %v", word, err)
		}

		res = append(res, iw)
	}

	return res, nil
}

func (i *Inspector) individualWord(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0, error) {
	ok, err := i.client.IsWordPresent(ctx, word)
	if err != nil {
		return nil, fmt.Errorf("could not check word presence: %v", err)
	}

	if !ok {
		return i.individualWordNotPresent(word), nil
	}

	return i.individualWordPresent(ctx, word)
}

func (i *Inspector) individualWordNotPresent(word string) *models.C11yWordsResponseIndividualWordsItems0 {
	return &models.C11yWordsResponseIndividualWordsItems0{
		Word:    word,
		Present: false,
	}
}

func (i *Inspector) individualWordPresent(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0, error) {
	info, err := i.individualWordInfo(ctx, word)
	if err != nil {
		return nil, err
	}

	return &models.C11yWordsResponseIndividualWordsItems0{
		Word:    word,
		Present: true,
		Info:    info,
	}, nil
}

func (i *Inspector) individualWordInfo(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0Info, error) {
	vector, err := i.client.VectorForWord(ctx, word)
	if err != nil {
		return nil, err
	}

	nns, err := i.nearestNeighbors(ctx, vector)
	if err != nil {
		return nil, err
	}

	return &models.C11yWordsResponseIndividualWordsItems0Info{
		Vector:           vector,
		NearestNeighbors: nns,
	}, nil
}

// split breaks a camelCase string into its lowercase components.
// Based on: https://github.com/fatih/camelcase
func split(src string) (entries []string) {
	// don't split invalid utf8
	if !utf8.ValidString(src) {
		return []string{src}
	}

	entries = []string{}
	var runes [][]rune
	lastClass := 0
	class := 0

	// split into fields based on the class of each unicode character; digits
	// share a class with lowercase letters, so they stay attached to the
	// preceding word
	for _, r := range src {
		switch {
		case unicode.IsLower(r):
			class = 1
		case unicode.IsUpper(r):
			class = 2
		case unicode.IsDigit(r):
			class = 1
		default:
			class = 4
		}

		if class == lastClass {
			runes[len(runes)-1] = append(runes[len(runes)-1], r)
		} else {
			runes = append(runes, []rune{r})
		}

		lastClass = class
	}

	// handle upper case -> lower case sequences, e.g.
	// "PDFL", "oader" -> "PDF", "Loader"
	for i := 0; i < len(runes)-1; i++ {
		if unicode.IsUpper(runes[i][0]) && unicode.IsLower(runes[i+1][0]) {
			runes[i+1] = append([]rune{runes[i][len(runes[i])-1]}, runes[i+1]...)
			runes[i] = runes[i][:len(runes[i])-1]
		}
	}

	// construct []string from results
	for _, s := range runes {
		if len(s) > 0 {
			entries = append(entries, strings.ToLower(string(s)))
		}
	}

	return
}
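
// Worked examples for split (illustrative only): the upper -> lower fix-up
// above turns a run of capitals followed by a lowercase run into an acronym
// plus a word before everything is lowercased:
//
//	split("PDFLoader") // -> []string{"pdf", "loader"}
//	split("carRepair") // -> []string{"car", "repair"}
//	split("area51")    // -> []string{"area51"} (digits share a class with lowercase letters)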