Spaces:

MVPilgrim
/

SemanticSearchPOC

Running

File size: 5,230 Bytes

6b502ec

//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: [email protected]
//

package modtransformers

import (
	"context"
	"fmt"

	"github.com/pkg/errors"
	"github.com/sirupsen/logrus"
	"github.com/weaviate/weaviate/entities/models"
	"github.com/weaviate/weaviate/entities/modulecapabilities"
	"github.com/weaviate/weaviate/entities/moduletools"
	"github.com/weaviate/weaviate/entities/schema"
	"github.com/weaviate/weaviate/modules/text2vec-transformers/vectorizer"
)

func (m *TransformersModule) ClassConfigDefaults() map[string]interface{} {
	return map[string]interface{}{
		"vectorizeClassName": vectorizer.DefaultVectorizeClassName,
		"poolingStrategy":    vectorizer.DefaultPoolingStrategy,
	}
}

func (m *TransformersModule) PropertyConfigDefaults(
	dt *schema.DataType,
) map[string]interface{} {
	return map[string]interface{}{
		"skip":                  !vectorizer.DefaultPropertyIndexed,
		"vectorizePropertyName": vectorizer.DefaultVectorizePropertyName,
	}
}

func (m *TransformersModule) ValidateClass(ctx context.Context,
	class *models.Class, cfg moduletools.ClassConfig,
) error {
	settings := vectorizer.NewClassSettings(cfg)
	return NewConfigValidator(m.logger).Do(ctx, class, cfg, settings)
}

var _ = modulecapabilities.ClassConfigurator(New())

type ConfigValidator struct {
	logger logrus.FieldLogger
}

type ClassSettings interface {
	VectorizeClassName() bool
	VectorizePropertyName(propName string) bool
	PropertyIndexed(propName string) bool
}

func NewConfigValidator(logger logrus.FieldLogger) *ConfigValidator {
	return &ConfigValidator{logger: logger}
}

func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class,
	cfg moduletools.ClassConfig, settings ClassSettings,
) error {
	// In text2vec-transformers (as opposed to e.g. text2vec-contextionary) the
	// assumption is that the models will be able to deal with any words, even
	// previously unseen ones. Therefore we do not need to validate individual
	// properties, but only the overall "index state"

	if err := cv.validateIndexState(ctx, class, settings); err != nil {
		return errors.Errorf("invalid combination of properties")
	}

	cv.checkForPossibilityOfDuplicateVectors(ctx, class, settings)

	return nil
}

func (cv *ConfigValidator) validateIndexState(ctx context.Context,
	class *models.Class, settings ClassSettings,
) error {
	if settings.VectorizeClassName() {
		// if the user chooses to vectorize the classname, vector-building will
		// always be possible, no need to investigate further

		return nil
	}

	// search if there is at least one indexed, string/text prop. If found pass
	// validation
	for _, prop := range class.Properties {
		if len(prop.DataType) < 1 {
			return errors.Errorf("property %s must have at least one datatype: "+
				"got %v", prop.Name, prop.DataType)
		}

		if prop.DataType[0] != string(schema.DataTypeText) {
			// we can only vectorize text-like props
			continue
		}

		if settings.PropertyIndexed(prop.Name) {
			// found at least one, this is a valid schema
			return nil
		}
	}

	return fmt.Errorf("invalid properties: didn't find a single property which is " +
		"of type string or text and is not excluded from indexing. In addition the " +
		"class name is excluded from vectorization as well, meaning that it cannot be " +
		"used to determine the vector position. To fix this, set 'vectorizeClassName' " +
		"to true if the class name is contextionary-valid. Alternatively add at least " +
		"contextionary-valid text/string property which is not excluded from " +
		"indexing.")
}

func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors(
	ctx context.Context, class *models.Class, settings ClassSettings,
) {
	if !settings.VectorizeClassName() {
		// if the user choses not to vectorize the class name, this means they must
		// have chosen something else to vectorize, otherwise the validation would
		// have error'd before we ever got here. We can skip further checking.

		return
	}

	// search if there is at least one indexed, string/text prop. If found exit
	for _, prop := range class.Properties {
		// length check skipped, because validation has already passed
		if prop.DataType[0] != string(schema.DataTypeText) {
			// we can only vectorize text-like props
			continue
		}

		if settings.PropertyIndexed(prop.Name) {
			// found at least one
			return
		}
	}

	cv.logger.WithField("module", "text2vec-transformers").
		WithField("class", class.Class).
		Warnf("text2vec-contextionary: Class %q does not have any properties "+
			"indexed (or only non text-properties indexed) and the vector position is "+
			"only determined by the class name. Each object will end up with the same "+
			"vector which leads to a severe performance penalty on imports. Consider "+
			"setting vectorIndexConfig.skip=true for this property", class.Class)
}