Spaces:

MVPilgrim
/

SemanticSearchPOC

Running

File size: 6,884 Bytes

b110593

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package vectorizer

import (
	"context"
	"fmt"
	"strings"
	"unicode"
	"unicode/utf8"

	"github.com/weaviate/weaviate/entities/models"
	txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models"
)

// InspectorClient lists the operations Inspector needs from its backing
// client. Every method takes a context as its first argument and reports
// failure through its trailing error result.
type InspectorClient interface {
	// VectorForWord returns the vector for a single word. Inspector only
	// calls it for words that IsWordPresent reported as present.
	VectorForWord(ctx context.Context, word string) ([]float32, error)

	// VectorForCorpi returns one vector for the given corpi together with
	// the interpretation sources. Inspector passes a single corpus and nil
	// overrides, and discards the interpretation sources.
	VectorForCorpi(
		ctx context.Context,
		words []string,
		overrides map[string]string,
	) ([]float32, []txt2vecmodels.InterpretationSource, error)

	// NearestWordsByVector returns words and a parallel slice of float32
	// values that Inspector reports as each word's distance.
	// NOTE(review): the exact meaning of n and k is not visible in this
	// file - confirm against the client implementation.
	NearestWordsByVector(ctx context.Context, vector []float32, n, k int) ([]string, []float32, error)

	// IsWordPresent reports whether the client knows the given word.
	IsWordPresent(ctx context.Context, word string) (bool, error)
}

// Inspector answers word-inspection requests (see GetWords) by delegating all
// vector and vocabulary lookups to an InspectorClient.
type Inspector struct {
	// client performs the actual lookups; it is set once by NewInspector.
	client InspectorClient
}

// NewInspector returns an Inspector that performs all of its lookups through
// the given client.
func NewInspector(client InspectorClient) *Inspector {
	insp := new(Inspector)
	insp.client = client
	return insp
}

// GetWords inspects a single word or a camelCased compound of several words.
//
// The input is validated and split into its individual words first. The
// response then carries a concatenated-word entry (left nil when the input
// consists of a single word) and one entry per individual word. The first
// error encountered aborts the request and is returned with a nil response.
func (i *Inspector) GetWords(ctx context.Context, words string) (*models.C11yWordsResponse, error) {
	parts, err := i.validateAndSplit(words)
	if err != nil {
		return nil, err
	}

	res := &models.C11yWordsResponse{}

	// The caller's original spelling of words is passed on here, not the
	// first-letter-lowercased copy that validateAndSplit works on.
	if res.ConcatenatedWord, err = i.concatWord(ctx, words, parts); err != nil {
		return nil, err
	}

	if res.IndividualWords, err = i.individualWords(ctx, parts); err != nil {
		return nil, err
	}

	return res, nil
}

// validateAndSplit checks that words is a non-empty string made up solely of
// unicode letters and numbers and, if so, splits it at its camelCase
// boundaries via split.
//
// The first rune is lowercased before splitting so that a leading capital
// ("CarWheel") is treated like the camelCase form ("carWheel").
//
// It returns an error for empty input and for input containing any rune that
// is neither a letter nor a number (this includes whitespace and invalid
// UTF-8, which decodes to the replacement character).
func (i *Inspector) validateAndSplit(words string) ([]string, error) {
	// Guard against empty input: indexing the first rune below would
	// otherwise panic with an index-out-of-range error.
	if words == "" {
		return nil, fmt.Errorf("invalid word input: words must not be empty")
	}

	// set first character to lowercase
	wordChars := []rune(words)
	wordChars[0] = unicode.ToLower(wordChars[0])
	words = string(wordChars)

	for _, r := range words {
		if !unicode.IsLetter(r) && !unicode.IsNumber(r) {
			return nil, fmt.Errorf("invalid word input: words must only contain unicode letters and digits")
		}
	}

	return split(words), nil
}

// concatWord builds the "concatenated word" part of the response: a single
// vector for all words of the compound together with that vector's nearest
// neighbors.
//
// words is the compound as it is reported back in the response; wordArray
// holds its individual words. When wordArray has fewer than two entries there
// is nothing to concatenate and (nil, nil) is returned.
func (i *Inspector) concatWord(ctx context.Context, words string,
	wordArray []string,
) (*models.C11yWordsResponseConcatenatedWord, error) {
	// a concat response only makes sense for more than a single word
	if len(wordArray) <= 1 {
		return nil, nil
	}

	// Join the words into a single corpus. While the contextionary also
	// supports building a centroid from multiple corpi (thus []string for
	// Corpi), an occurrence-based weighing can only happen within a corpus.
	// It is thus - by far - preferable in this case to concat the words into
	// one corpus, rather than treating each word as its own.
	corpi := []string{strings.Join(wordArray, " ")}

	// The second result (interpretation sources) is not an error and is not
	// needed for this response.
	centroid, _, err := i.client.VectorForCorpi(ctx, corpi, nil)
	if err != nil {
		return nil, err
	}

	neighbors, err := i.nearestNeighbors(ctx, centroid)
	if err != nil {
		return nil, err
	}

	res := &models.C11yWordsResponseConcatenatedWord{}
	res.ConcatenatedWord = words
	res.SingleWords = wordArray
	res.ConcatenatedVector = centroid
	res.ConcatenatedNearestNeighbors = neighbors
	return res, nil
}

// nearestNeighbors asks the client for the words nearest to vector and pairs
// each returned word with its distance.
//
// The returned slice is never nil on success; it is empty when the client
// returns no words. An error is returned if the client call fails or if the
// client returns fewer distances than words.
func (i *Inspector) nearestNeighbors(ctx context.Context,
	vector []float32,
) ([]*models.C11yNearestNeighborsItems0, error) {
	// relate words of centroid
	// NOTE(review): 12 and 32 are passed as the client's n and k arguments;
	// their exact meaning is not visible in this file.
	words, dists, err := i.client.NearestWordsByVector(ctx, vector, 12, 32)
	if err != nil {
		return nil, err
	}

	// Every word needs a matching distance. Without this check a short
	// distance slice would cause an index-out-of-range panic below.
	if len(dists) < len(words) {
		return nil, fmt.Errorf("nearest neighbors: got %d words but only %d distances",
			len(words), len(dists))
	}

	nearestNeighbors := make([]*models.C11yNearestNeighborsItems0, 0, len(words))

	// pair each word with the distance at the same position
	for idx, word := range words {
		nearestNeighbors = append(nearestNeighbors, &models.C11yNearestNeighborsItems0{
			Word:     word,
			Distance: dists[idx],
		})
	}

	return nearestNeighbors, nil
}

// individualWords looks up every word in wordArray, in order, and returns one
// response item per word.
//
// Processing stops at the first failing word; the returned error names that
// word and wraps the underlying error so callers can still inspect it with
// errors.Is / errors.As. The result is nil when wordArray is empty.
func (i *Inspector) individualWords(ctx context.Context,
	wordArray []string,
) ([]*models.C11yWordsResponseIndividualWordsItems0, error) {
	var res []*models.C11yWordsResponseIndividualWordsItems0

	for _, word := range wordArray {
		iw, err := i.individualWord(ctx, word)
		if err != nil {
			// %w (instead of %v) keeps the error chain intact; the
			// rendered message is unchanged.
			return nil, fmt.Errorf("word '%s': %w", word, err)
		}

		res = append(res, iw)
	}

	return res, nil
}

// individualWord builds the response item for a single word.
//
// It first asks the client whether the word is present. Words that are not
// present yield an item without further lookups; present words are enriched
// via individualWordPresent. A failing presence check is returned as an error
// that wraps the client's error.
func (i *Inspector) individualWord(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0, error) {
	ok, err := i.client.IsWordPresent(ctx, word)
	if err != nil {
		// %w (instead of %v) keeps the error chain intact; the rendered
		// message is unchanged.
		return nil, fmt.Errorf("could not check word presence:  %w", err)
	}

	if !ok {
		return i.individualWordNotPresent(word), nil
	}

	return i.individualWordPresent(ctx, word)
}

// individualWordNotPresent returns the response item for a word that the
// client does not know: only the word itself is set and Present is false.
func (i *Inspector) individualWordNotPresent(word string) *models.C11yWordsResponseIndividualWordsItems0 {
	item := new(models.C11yWordsResponseIndividualWordsItems0)
	item.Word = word
	item.Present = false
	return item
}

// individualWordPresent returns the response item for a word that the client
// knows: Present is true and Info carries the result of individualWordInfo.
// Any error from that lookup is returned unchanged.
func (i *Inspector) individualWordPresent(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0, error) {
	details, err := i.individualWordInfo(ctx, word)
	if err != nil {
		return nil, err
	}

	item := &models.C11yWordsResponseIndividualWordsItems0{Word: word, Present: true}
	item.Info = details
	return item, nil
}

// individualWordInfo fetches the vector of word from the client and the
// nearest neighbors of that vector. Errors from either step are returned
// unchanged.
func (i *Inspector) individualWordInfo(ctx context.Context,
	word string,
) (*models.C11yWordsResponseIndividualWordsItems0Info, error) {
	vec, err := i.client.VectorForWord(ctx, word)
	if err != nil {
		return nil, err
	}

	neighbors, err := i.nearestNeighbors(ctx, vec)
	if err != nil {
		return nil, err
	}

	info := new(models.C11yWordsResponseIndividualWordsItems0Info)
	info.Vector = vec
	info.NearestNeighbors = neighbors
	return info, nil
}

// split breaks a CamelCase string into its lowercased components.
// Based on: https://github.com/fatih/camelcase
//
// Runes are grouped by class: lowercase letters and digits share one class,
// uppercase letters form another, and every other rune falls into a third.
// A run of capitals followed by a lowercase run hands its last capital to the
// following group, so "PDFLoader" becomes "pdf", "loader".
//
// A string that is not valid UTF-8 is returned unchanged as the only entry.
// An empty string yields an empty, non-nil slice.
func split(src string) (entries []string) {
	// don't split invalid utf8
	if !utf8.ValidString(src) {
		return []string{src}
	}

	const (
		classNone = iota // no rune seen yet
		classLowerOrDigit
		classUpper
		classOther
	)

	classOf := func(r rune) int {
		switch {
		case unicode.IsLower(r):
			return classLowerOrDigit
		case unicode.IsUpper(r):
			return classUpper
		case unicode.IsDigit(r):
			// digits deliberately stay attached to lowercase runs
			return classLowerOrDigit
		}
		return classOther
	}

	// group consecutive runes of the same class
	var groups [][]rune
	prev := classNone
	for _, r := range src {
		cur := classOf(r)
		if cur != prev {
			groups = append(groups, nil)
		}
		last := len(groups) - 1
		groups[last] = append(groups[last], r)
		prev = cur
	}

	// move the trailing capital of an uppercase group to the front of a
	// following lowercase group, e.g. "PDFL", "oader" -> "PDF", "Loader"
	for idx := 1; idx < len(groups); idx++ {
		head, tail := groups[idx-1], groups[idx]
		if !unicode.IsUpper(head[0]) || !unicode.IsLower(tail[0]) {
			continue
		}
		pivot := len(head) - 1
		groups[idx] = append([]rune{head[pivot]}, tail...)
		groups[idx-1] = head[:pivot]
	}

	// lowercase every non-empty group; groups emptied by the step above
	// are dropped
	entries = make([]string, 0, len(groups))
	for _, g := range groups {
		if len(g) == 0 {
			continue
		}
		entries = append(entries, strings.ToLower(string(g)))
	}
	return entries
}