Spaces:
Running
Running
// _ _ | |
// __ _____ __ ___ ___ __ _| |_ ___ | |
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ | |
// \ V V / __/ (_| |\ V /| | (_| | || __/ | |
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| | |
// | |
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. | |
// | |
//  CONTACT: hello@weaviate.io
// | |
package modopenai | |
import ( | |
"context" | |
"github.com/weaviate/weaviate/entities/models" | |
"github.com/weaviate/weaviate/entities/modulecapabilities" | |
"github.com/weaviate/weaviate/entities/moduletools" | |
"github.com/weaviate/weaviate/entities/schema" | |
"github.com/weaviate/weaviate/modules/text2vec-openai/vectorizer" | |
) | |
func (m *OpenAIModule) ClassConfigDefaults() map[string]interface{} { | |
return map[string]interface{}{ | |
"vectorizeClassName": vectorizer.DefaultVectorizeClassName, | |
"type": vectorizer.DefaultOpenAIDocumentType, | |
"model": vectorizer.DefaultOpenAIModel, | |
"baseURL": vectorizer.DefaultBaseURL, | |
"modelVersion": vectorizer.PickDefaultModelVersion(vectorizer.DefaultOpenAIModel, | |
vectorizer.DefaultOpenAIDocumentType), | |
} | |
} | |
func (m *OpenAIModule) PropertyConfigDefaults( | |
dt *schema.DataType, | |
) map[string]interface{} { | |
return map[string]interface{}{ | |
"skip": !vectorizer.DefaultPropertyIndexed, | |
"vectorizePropertyName": vectorizer.DefaultVectorizePropertyName, | |
} | |
} | |
func (m *OpenAIModule) ValidateClass(ctx context.Context, | |
class *models.Class, cfg moduletools.ClassConfig, | |
) error { | |
settings := vectorizer.NewClassSettings(cfg) | |
return settings.Validate(class) | |
} | |
var _ = modulecapabilities.ClassConfigurator(New()) | |
// type ConfigValidator struct { | |
// logger logrus.FieldLogger | |
// } | |
// type ClassSettings interface { | |
// VectorizeClassName() bool | |
// VectorizePropertyName(propName string) bool | |
// PropertyIndexed(propName string) bool | |
// } | |
// func NewConfigValidator(logger logrus.FieldLogger) *ConfigValidator { | |
// return &ConfigValidator{logger: logger} | |
// } | |
// func (cv *ConfigValidator) Do(ctx context.Context, class *models.Class, | |
// cfg moduletools.ClassConfig, settings ClassSettings) error { | |
// // In text2vec-openai (as opposed to e.g. text2vec-contextionary) the | |
// // assumption is that the models will be able to deal with any words, even | |
// // previously unseen ones. Therefore we do not need to validate individual | |
// // properties, but only the overall "index state" | |
// if err := cv.validateIndexState(ctx, class, settings); err != nil { | |
// return errors.Errorf("invalid combination of properties") | |
// } | |
// cv.checkForPossibilityOfDuplicateVectors(ctx, class, settings) | |
// return nil | |
// } | |
// func (cv *ConfigValidator) validateIndexState(ctx context.Context, | |
// class *models.Class, settings ClassSettings) error { | |
// if settings.VectorizeClassName() { | |
// // if the user chooses to vectorize the classname, vector-building will | |
// // always be possible, no need to investigate further | |
// return nil | |
// } | |
// // search if there is at least one indexed, string/text prop. If found pass | |
// // validation | |
// for _, prop := range class.Properties { | |
// if len(prop.DataType) < 1 { | |
// return errors.Errorf("property %s must have at least one datatype: "+ | |
// "got %v", prop.Name, prop.DataType) | |
// } | |
// if prop.DataType[0] != string(schema.DataTypeText) { | |
// // we can only vectorize text-like props | |
// continue | |
// } | |
// if settings.PropertyIndexed(prop.Name) { | |
// // found at least one, this is a valid schema | |
// return nil | |
// } | |
// } | |
// return fmt.Errorf("invalid properties: didn't find a single property which is " + | |
// "of type string or text and is not excluded from indexing. In addition the " + | |
// "class name is excluded from vectorization as well, meaning that it cannot be " + | |
// "used to determine the vector position. To fix this, set 'vectorizeClassName' " + | |
// "to true if the class name is contextionary-valid. Alternatively add at least " + | |
// "contextionary-valid text/string property which is not excluded from " + | |
// "indexing.") | |
// } | |
// func (cv *ConfigValidator) checkForPossibilityOfDuplicateVectors( | |
// ctx context.Context, class *models.Class, settings ClassSettings) { | |
// if !settings.VectorizeClassName() { | |
// // if the user chooses not to vectorize the class name, this means they must
// // have chosen something else to vectorize, otherwise the validation would | |
// // have error'd before we ever got here. We can skip further checking. | |
// return | |
// } | |
// // search if there is at least one indexed, string/text prop. If found exit | |
// for _, prop := range class.Properties { | |
// // length check skipped, because validation has already passed | |
// if prop.DataType[0] != string(schema.DataTypeText) { | |
// // we can only vectorize text-like props | |
// continue | |
// } | |
// if settings.PropertyIndexed(prop.Name) { | |
// // found at least one | |
// return | |
// } | |
// } | |
// cv.logger.WithField("module", "text2vec-openai"). | |
// WithField("class", class.Class). | |
// Warnf("text2vec-openai: Class %q does not have any properties "+ | |
// "indexed (or only non text-properties indexed) and the vector position is "+ | |
// "only determined by the class name. Each object will end up with the same "+ | |
// "vector which leads to a severe performance penalty on imports. Consider "+ | |
// "setting vectorIndexConfig.skip=true for this property", class.Class) | |
// } | |