KevinStephenson
Adding in weaviate code
b110593
raw
history blame
6.88 kB
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: [email protected]
//
package vectorizer
import (
"context"
"fmt"
"strings"
"unicode"
"unicode/utf8"
"github.com/weaviate/weaviate/entities/models"
txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models"
)
type InspectorClient interface {
VectorForWord(ctx context.Context, word string) ([]float32, error)
VectorForCorpi(ctx context.Context, words []string,
overrides map[string]string) ([]float32, []txt2vecmodels.InterpretationSource, error)
NearestWordsByVector(ctx context.Context, vector []float32, n int, k int) ([]string, []float32, error)
IsWordPresent(ctx context.Context, word string) (bool, error)
}
type Inspector struct {
client InspectorClient
}
func NewInspector(client InspectorClient) *Inspector {
return &Inspector{client: client}
}
func (i *Inspector) GetWords(ctx context.Context, words string) (*models.C11yWordsResponse, error) {
wordArray, err := i.validateAndSplit(words)
if err != nil {
return nil, err
}
concatWord, err := i.concatWord(ctx, words, wordArray)
if err != nil {
return nil, err
}
individualWords, err := i.individualWords(ctx, wordArray)
if err != nil {
return nil, err
}
return &models.C11yWordsResponse{
ConcatenatedWord: concatWord,
IndividualWords: individualWords,
}, nil
}
func (i *Inspector) validateAndSplit(words string) ([]string, error) {
// set first character to lowercase
wordChars := []rune(words)
wordChars[0] = unicode.ToLower(wordChars[0])
words = string(wordChars)
for _, r := range words {
if !unicode.IsLetter(r) && !unicode.IsNumber(r) {
return nil, fmt.Errorf("invalid word input: words must only contain unicode letters and digits")
}
}
return split(words), nil
}
func (i *Inspector) concatWord(ctx context.Context, words string,
wordArray []string,
) (*models.C11yWordsResponseConcatenatedWord, error) {
if len(wordArray) < 2 {
// only build a concat response if we have more than a single word
return nil, nil
}
// join the words into a single corpus. While the contextionary also supports
// building a centroid from multiple corpi (thus []string for Corpi, an
// occurrence-based weighing can only happen within a corpus. It is thus - by
// far - preferable in this case, to concat the words into one corpus, rather
// than treating each word as its own.
corpus := strings.Join(wordArray, " ")
vector, _, err := i.client.VectorForCorpi(ctx, []string{corpus}, nil)
if err != nil {
return nil, err
}
nearestNeighbors, err := i.nearestNeighbors(ctx, vector)
if err != nil {
return nil, err
}
return &models.C11yWordsResponseConcatenatedWord{
ConcatenatedWord: words,
SingleWords: wordArray,
ConcatenatedVector: vector,
ConcatenatedNearestNeighbors: nearestNeighbors,
}, nil
}
func (i *Inspector) nearestNeighbors(ctx context.Context,
vector []float32,
) ([]*models.C11yNearestNeighborsItems0, error) {
// relate words of centroid
words, dists, err := i.client.NearestWordsByVector(ctx, vector, 12, 32)
if err != nil {
return nil, err
}
nearestNeighbors := []*models.C11yNearestNeighborsItems0{}
// loop over NN Idx' and append to the return object
for i, word := range words {
item := models.C11yNearestNeighborsItems0{
Word: word,
Distance: dists[i],
}
nearestNeighbors = append(nearestNeighbors, &item)
}
return nearestNeighbors, nil
}
func (i *Inspector) individualWords(ctx context.Context,
wordArray []string,
) ([]*models.C11yWordsResponseIndividualWordsItems0, error) {
var res []*models.C11yWordsResponseIndividualWordsItems0
for _, word := range wordArray {
iw, err := i.individualWord(ctx, word)
if err != nil {
return nil, fmt.Errorf("word '%s': %v", word, err)
}
res = append(res, iw)
}
return res, nil
}
func (i *Inspector) individualWord(ctx context.Context,
word string,
) (*models.C11yWordsResponseIndividualWordsItems0, error) {
ok, err := i.client.IsWordPresent(ctx, word)
if err != nil {
return nil, fmt.Errorf("could not check word presence: %v", err)
}
if !ok {
return i.individualWordNotPresent(word), nil
}
return i.individualWordPresent(ctx, word)
}
func (i *Inspector) individualWordNotPresent(word string) *models.C11yWordsResponseIndividualWordsItems0 {
return &models.C11yWordsResponseIndividualWordsItems0{
Word: word,
Present: false,
}
}
func (i *Inspector) individualWordPresent(ctx context.Context,
word string,
) (*models.C11yWordsResponseIndividualWordsItems0, error) {
info, err := i.individualWordInfo(ctx, word)
if err != nil {
return nil, err
}
return &models.C11yWordsResponseIndividualWordsItems0{
Word: word,
Present: true,
Info: info,
}, nil
}
func (i *Inspector) individualWordInfo(ctx context.Context,
word string,
) (*models.C11yWordsResponseIndividualWordsItems0Info, error) {
vector, err := i.client.VectorForWord(ctx, word)
if err != nil {
return nil, err
}
nns, err := i.nearestNeighbors(ctx, vector)
if err != nil {
return nil, err
}
return &models.C11yWordsResponseIndividualWordsItems0Info{
Vector: vector,
NearestNeighbors: nns,
}, nil
}
// Splits a CamelCase string to an array
// Based on: https://github.com/fatih/camelcase
func split(src string) (entries []string) {
// don't split invalid utf8
if !utf8.ValidString(src) {
return []string{src}
}
entries = []string{}
var runes [][]rune
lastClass := 0
class := 0
// split into fields based on class of unicode character
for _, r := range src {
switch true {
case unicode.IsLower(r):
class = 1
case unicode.IsUpper(r):
class = 2
case unicode.IsDigit(r):
class = 1
default:
class = 4
}
if class == lastClass {
runes[len(runes)-1] = append(runes[len(runes)-1], r)
} else {
runes = append(runes, []rune{r})
}
lastClass = class
}
// handle upper case -> lower case sequences, e.g.
// "PDFL", "oader" -> "PDF", "Loader"
for i := 0; i < len(runes)-1; i++ {
if unicode.IsUpper(runes[i][0]) && unicode.IsLower(runes[i+1][0]) {
runes[i+1] = append([]rune{runes[i][len(runes[i])-1]}, runes[i+1]...)
runes[i] = runes[i][:len(runes[i])-1]
}
}
// construct []string from results
for _, s := range runes {
if len(s) > 0 {
entries = append(entries, strings.ToLower(string(s)))
}
}
return
}