Spaces:
Running
Running
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//
package modtransformers | |
import ( | |
"context" | |
"fmt" | |
"github.com/pkg/errors" | |
"github.com/sirupsen/logrus" | |
"github.com/weaviate/weaviate/entities/models" | |
"github.com/weaviate/weaviate/entities/modulecapabilities" | |
"github.com/weaviate/weaviate/entities/moduletools" | |
"github.com/weaviate/weaviate/entities/schema" | |
"github.com/weaviate/weaviate/modules/text2vec-transformers/vectorizer" | |
) | |
func (m *TransformersModule) ClassConfigDefaults() map[string]interface{} { | |
return map[string]interface{}{ | |
"vectorizeClassName": vectorizer.DefaultVectorizeClassName, | |
"poolingStrategy": vectorizer.DefaultPoolingStrategy, | |
} | |
} | |
func (m *TransformersModule) PropertyConfigDefaults( | |
dt *schema.DataType, | |
) map[string]interface{} { | |
return map[string]interface{}{ | |
"skip": !vectorizer.DefaultPropertyIndexed, | |
"vectorizePropertyName": vectorizer.DefaultVectorizePropertyName, | |
} | |
} | |
func (m *TransformersModule) ValidateClass(ctx context.Context, | |
class *models.Class, cfg moduletools.ClassConfig, | |
) error { | |
settings := vectorizer.NewClassSettings(cfg) | |
return NewConfigValidator(m.logger).Do(ctx, class, cfg, settings) | |
} | |
var _ = modulecapabilities.ClassConfigurator(New()) | |
type ConfigValidator struct { | |
logger logrus.FieldLogger | |
} | |
// ClassSettings is the view of a class' vectorizer settings that the
// ConfigValidator depends on.
type ClassSettings interface {
	// PropertyIndexed reports whether the property called propName is
	// indexed, i.e. not excluded from indexing.
	PropertyIndexed(propName string) bool
	// VectorizePropertyName is not called by the validator in this file;
	// presumably it reports whether the name of the property is vectorized
	// (confirm against the implementation).
	VectorizePropertyName(propName string) bool
	// VectorizeClassName reports whether the user chose to vectorize the
	// class name.
	VectorizeClassName() bool
}
func NewConfigValidator(logger logrus.FieldLogger) *ConfigValidator { | |
return &ConfigValidator{logger: logger} | |
} | |
func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class, | |
cfg moduletools.ClassConfig, settings ClassSettings, | |
) error { | |
// In text2vec-transformers (as opposed to e.g. text2vec-contextionary) the | |
// assumption is that the models will be able to deal with any words, even | |
// previously unseen ones. Therefore we do not need to validate individual | |
// properties, but only the overall "index state" | |
if err := cv.validateIndexState(ctx, class, settings); err != nil { | |
return errors.Errorf("invalid combination of properties") | |
} | |
cv.checkForPossibilityOfDuplicateVectors(ctx, class, settings) | |
return nil | |
} | |
func (cv *ConfigValidator) validateIndexState(ctx context.Context, | |
class *models.Class, settings ClassSettings, | |
) error { | |
if settings.VectorizeClassName() { | |
// if the user chooses to vectorize the classname, vector-building will | |
// always be possible, no need to investigate further | |
return nil | |
} | |
// search if there is at least one indexed, string/text prop. If found pass | |
// validation | |
for _, prop := range class.Properties { | |
if len(prop.DataType) < 1 { | |
return errors.Errorf("property %s must have at least one datatype: "+ | |
"got %v", prop.Name, prop.DataType) | |
} | |
if prop.DataType[0] != string(schema.DataTypeText) { | |
// we can only vectorize text-like props | |
continue | |
} | |
if settings.PropertyIndexed(prop.Name) { | |
// found at least one, this is a valid schema | |
return nil | |
} | |
} | |
return fmt.Errorf("invalid properties: didn't find a single property which is " + | |
"of type string or text and is not excluded from indexing. In addition the " + | |
"class name is excluded from vectorization as well, meaning that it cannot be " + | |
"used to determine the vector position. To fix this, set 'vectorizeClassName' " + | |
"to true if the class name is contextionary-valid. Alternatively add at least " + | |
"contextionary-valid text/string property which is not excluded from " + | |
"indexing.") | |
} | |
func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors( | |
ctx context.Context, class *models.Class, settings ClassSettings, | |
) { | |
if !settings.VectorizeClassName() { | |
// if the user choses not to vectorize the class name, this means they must | |
// have chosen something else to vectorize, otherwise the validation would | |
// have error'd before we ever got here. We can skip further checking. | |
return | |
} | |
// search if there is at least one indexed, string/text prop. If found exit | |
for _, prop := range class.Properties { | |
// length check skipped, because validation has already passed | |
if prop.DataType[0] != string(schema.DataTypeText) { | |
// we can only vectorize text-like props | |
continue | |
} | |
if settings.PropertyIndexed(prop.Name) { | |
// found at least one | |
return | |
} | |
} | |
cv.logger.WithField("module", "text2vec-transformers"). | |
WithField("class", class.Class). | |
Warnf("text2vec-contextionary: Class %q does not have any properties "+ | |
"indexed (or only non text-properties indexed) and the vector position is "+ | |
"only determined by the class name. Each object will end up with the same "+ | |
"vector which leads to a severe performance penalty on imports. Consider "+ | |
"setting vectorIndexConfig.skip=true for this property", class.Class) | |
} | |