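// NOTE: the following lines are page residue from the hosting site's file
// viewer, kept here as a comment so the file remains valid Go:
//   KevinStephenson
//   Adding in weaviate code
//   b110593
//   raw
//   history blame
//   13.6 kB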
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//
package vectorizer
import (
"context"
"testing"
"github.com/sirupsen/logrus"
ltest "github.com/sirupsen/logrus/hooks/test"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"github.com/weaviate/weaviate/entities/models"
"github.com/weaviate/weaviate/entities/schema"
)
// TestConfigValidator runs the config validator's Do method against the fakes
// defined in this file (fakeRemote as the c11y client, fakeIndexChecker as the
// index checker) and checks whether a class definition is accepted or
// rejected.
//
// It covers four groups of cases:
//   - class names made of present, absent and stop words
//   - property names made of present, absent and stop words
//   - a class where every property is reported as not indexed and neither the
//     class name nor the property names are vectorized (must be rejected)
//   - a class that only has text-array properties (must be accepted)
func TestConfigValidator(t *testing.T) {
	// nameCase is one row of the class-name / property-name tables below.
	type nameCase struct {
		name      string // subtest label
		input     string // class or property name under test
		valid     bool   // whether Do is expected to return a nil error
		vectorize bool   // whether the name itself is reported as vectorized
	}

	t.Run("validate class names", func(t *testing.T) {
		// for all test cases keep in mind that the word "carrot" is not present in
		// the fake c11y, but every other word is.
		//
		// Additionally, the word "the" is a stopword
		//
		// all inputs represent class names (!)
		tests := []nameCase{
			// valid names
			{name: "Single uppercase word present in the c11y", input: "Car", valid: true, vectorize: true},
			{name: "Single lowercase word present in the c11y, stored as uppercase", input: "car", valid: true, vectorize: true},
			{name: "combination of valid words starting with uppercase letter", input: "CarGarage", valid: true, vectorize: true},
			{name: "combination of valid words starting with lowercase letter, stored as uppercase", input: "carGarage", valid: true, vectorize: true},
			{name: "combination of valid words and stopwords, starting with uppercase", input: "TheCarGarage", valid: true, vectorize: true},
			{name: "combination of valid words and stopwords starting with lowercase letter, stored as uppercase", input: "carTheGarage", valid: true, vectorize: true},

			// invalid names
			{name: "Single uppercase word NOT present in the c11y", input: "Carrot", valid: false, vectorize: true},
			{name: "Single lowercase word NOT present in the c11y", input: "carrot", valid: false, vectorize: true},
			{name: "Single uppercase stopword", input: "The", valid: false, vectorize: true},
			{name: "Single lowercase stopword", input: "the", valid: false, vectorize: true},
			{name: "combination of valid and invalid words, valid word first lowercased", input: "potatoCarrot", valid: false, vectorize: true},
			{name: "combination of valid and invalid words, valid word first uppercased", input: "PotatoCarrot", valid: false, vectorize: true},
			{name: "combination of valid and invalid words, invalid word first lowercased", input: "carrotPotato", valid: false, vectorize: true},
			{name: "combination of valid and invalid words, invalid word first uppercased", input: "CarrotPotato", valid: false, vectorize: true},
			{name: "combination of only stopwords, starting with lowercase", input: "theThe", valid: false, vectorize: true},
			{name: "combination of only stopwords, starting with uppercase", input: "TheThe", valid: false, vectorize: true},

			// vectorize turned off
			{name: "non-vectorized: combination of only stopwords, starting with uppercase", input: "TheThe", valid: true, vectorize: false},
			{name: "non-vectorized: excluded word", input: "carrot", valid: true, vectorize: false},
		}

		for _, test := range tests {
			t.Run(test.name+" object class", func(t *testing.T) {
				class := &models.Class{
					Class: test.input,
					Properties: []*models.Property{{
						Name:         "dummyPropSoWeDontRunIntoAllNoindexedError",
						DataType:     schema.DataTypeText.PropString(),
						Tokenization: models.PropertyTokenizationWhitespace,
					}},
				}

				logger, _ := ltest.NewNullLogger()
				v := NewConfigValidator(&fakeRemote{}, logger)
				err := v.Do(context.Background(), class, nil, &fakeIndexChecker{
					vectorizeClassName: test.vectorize,
					propertyIndexed:    true,
				})
				// Include the input and the error so a failure is diagnosable
				// without re-running the case by hand.
				assert.Equal(t, test.valid, err == nil,
					"class name %q: unexpected validation result, err: %v", test.input, err)
			})
		}
	})

	t.Run("validate property names", func(t *testing.T) {
		// for all test cases keep in mind that the word "carrot" is not present in
		// the fake c11y, but every other word is
		//
		// all inputs represent property names (!)
		tests := []nameCase{
			// valid names
			{name: "Single uppercase word present in the c11y, stored as lowercase", input: "Brand", valid: true, vectorize: true},
			{name: "Single lowercase word present in the c11y", input: "brand", valid: true, vectorize: true},
			{name: "combination of valid words starting with uppercase letter, stored as lowercase", input: "BrandGarage", valid: true, vectorize: true},
			{name: "combination of valid words starting with lowercase letter", input: "brandGarage", valid: true, vectorize: true},
			{name: "combination of valid words and stop words starting with uppercase letter, stored as lowercase", input: "TheGarage", valid: true, vectorize: true},
			{name: "combination of valid words and stop words starting with lowercase letter", input: "theGarage", valid: true, vectorize: true},

			// invalid names
			{name: "Single uppercase word NOT present in the c11y", input: "Carrot", valid: false, vectorize: true},
			{name: "Single lowercase word NOT present in the c11y", input: "carrot", valid: false, vectorize: true},
			{name: "Single lowercase stop word", input: "the", valid: false, vectorize: true},
			{name: "combination of valid and invalid words, valid word first lowercased", input: "potatoCarrot", valid: false, vectorize: true},
			{name: "combination of valid and invalid words, valid word first uppercased", input: "PotatoCarrot", valid: false, vectorize: true},
			{name: "combination of valid and invalid words, invalid word first lowercased", input: "carrotPotato", valid: false, vectorize: true},
			{name: "combination of valid and invalid words, invalid word first uppercased", input: "CarrotPotato", valid: false, vectorize: true},
			{name: "combination of only stop words, first lowercased", input: "theThe", valid: false, vectorize: true},
			{name: "combination of only stop words, first uppercased", input: "TheThe", valid: false, vectorize: true},

			// without vectorizing
			{name: "non-vectorizing: combination of only stop words, first uppercased", input: "TheThe", valid: true, vectorize: false},
			{name: "non-vectorizing: excluded word", input: "carrot", valid: true, vectorize: false},
		}

		for _, test := range tests {
			t.Run(test.name+" object class", func(t *testing.T) {
				class := &models.Class{
					Class: "ValidName",
					Properties: []*models.Property{{
						DataType:     schema.DataTypeText.PropString(),
						Tokenization: models.PropertyTokenizationWhitespace,
						Name:         test.input,
					}},
				}

				logger, _ := ltest.NewNullLogger()
				v := NewConfigValidator(&fakeRemote{}, logger)
				err := v.Do(context.Background(), class, nil, &fakeIndexChecker{
					vectorizePropertyName: test.vectorize,
					propertyIndexed:       true,
				})
				assert.Equal(t, test.valid, err == nil,
					"property name %q: unexpected validation result, err: %v", test.input, err)
			})
		}
	})

	t.Run("all usable props no-indexed", func(t *testing.T) {
		t.Run("all schema vectorization turned off", func(t *testing.T) {
			class := &models.Class{
				Vectorizer: "text2vec-contextionary",
				Class:      "ValidName",
				Properties: []*models.Property{
					{
						DataType: []string{"text"},
						Name:     "description",
					},
					{
						DataType:     schema.DataTypeText.PropString(),
						Tokenization: models.PropertyTokenizationWhitespace,
						Name:         "name",
					},
					{
						DataType: []string{"int"},
						Name:     "amount",
					},
				},
			}

			logger, _ := ltest.NewNullLogger()
			v := NewConfigValidator(&fakeRemote{}, logger)
			// Nothing is vectorized and no property is indexed, so the
			// validator is expected to reject the class.
			err := v.Do(context.Background(), class, nil, &fakeIndexChecker{
				vectorizePropertyName: false,
				vectorizeClassName:    false,
				propertyIndexed:       false,
			})
			assert.NotNil(t, err)
		})
	})

	t.Run("with only array types", func(t *testing.T) {
		class := &models.Class{
			Vectorizer: "text2vec-contextionary",
			Class:      "ValidName",
			Properties: []*models.Property{
				{
					DataType: []string{"text[]"},
					Name:     "descriptions",
				},
				{
					DataType:     schema.DataTypeTextArray.PropString(),
					Tokenization: models.PropertyTokenizationWhitespace,
					Name:         "names",
				},
			},
		}

		logger, _ := ltest.NewNullLogger()
		v := NewConfigValidator(&fakeRemote{}, logger)
		err := v.Do(context.Background(), class, nil, &fakeIndexChecker{
			vectorizePropertyName: false,
			vectorizeClassName:    false,
			propertyIndexed:       true,
		})
		assert.Nil(t, err)
	})
}
// TestConfigValidator_RiskOfDuplicateVectors checks the logging side of the
// config validator's Do method. Every scenario must validate without error;
// the scenarios differ in whether the last entry captured by the logrus test
// hook is expected to be a warning or whether no entry is expected at all.
//
// In all scenarios the class name is reported as vectorized and property
// names are not; only the class's properties and the "indexed" answer vary.
func TestConfigValidator_RiskOfDuplicateVectors(t *testing.T) {
	type scenario struct {
		name          string
		in            *models.Class
		indexChecker  *fakeIndexChecker
		expectWarning bool
	}

	// classWith builds the class under test; calling it without arguments
	// leaves Properties nil.
	classWith := func(props ...*models.Property) *models.Class {
		return &models.Class{Class: "ValidName", Properties: props}
	}
	// prop builds a property with a single data type.
	prop := func(dataType, propName string) *models.Property {
		return &models.Property{DataType: []string{dataType}, Name: propName}
	}
	// checker builds the index checker shared by all scenarios, varying only
	// the answer given for PropertyIndexed.
	checker := func(indexed bool) *fakeIndexChecker {
		return &fakeIndexChecker{
			vectorizePropertyName: false,
			vectorizeClassName:    true,
			propertyIndexed:       indexed,
		}
	}

	scenarios := []scenario{
		{
			name:          "usable properties",
			in:            classWith(prop(string(schema.DataTypeText), "textProp")),
			indexChecker:  checker(true),
			expectWarning: false,
		},
		{
			name:          "no properties",
			in:            classWith(),
			indexChecker:  checker(false),
			expectWarning: true,
		},
		{
			name:          "usable properties, but they are no-indexed",
			in:            classWith(prop(string(schema.DataTypeText), "textProp")),
			indexChecker:  checker(false),
			expectWarning: true,
		},
		{
			name:          "only unusable properties",
			in:            classWith(prop(string(schema.DataTypeInt), "intProp")),
			indexChecker:  checker(false),
			expectWarning: true,
		},
	}

	for _, sc := range scenarios {
		t.Run(sc.name, func(t *testing.T) {
			logger, hook := ltest.NewNullLogger()
			validator := NewConfigValidator(&fakeRemote{}, logger)

			// Validation itself has to succeed in every scenario.
			require.Nil(t, validator.Do(context.Background(), sc.in, nil, sc.indexChecker))

			last := hook.LastEntry()
			if !sc.expectWarning {
				assert.Nil(t, last)
				return
			}

			// require (not assert) so a missing entry stops the subtest
			// before last.Level is dereferenced.
			require.NotNil(t, last)
			assert.Equal(t, logrus.WarnLevel, last.Level)
		})
	}
}
// fakeIndexChecker is a test double whose methods return the fixed values
// stored in its fields, regardless of which property is asked about.
type fakeIndexChecker struct {
	vectorizeClassName    bool // answer given by VectorizeClassName
	vectorizePropertyName bool // answer given by VectorizePropertyName
	propertyIndexed       bool // answer given by PropertyIndexed
}

// VectorizeClassName returns the configured vectorizeClassName value.
func (c *fakeIndexChecker) VectorizeClassName() bool {
	return c.vectorizeClassName
}

// VectorizePropertyName returns the configured vectorizePropertyName value;
// propName is ignored.
func (c *fakeIndexChecker) VectorizePropertyName(propName string) bool {
	return c.vectorizePropertyName
}

// PropertyIndexed returns the configured propertyIndexed value; propName is
// ignored.
func (c *fakeIndexChecker) PropertyIndexed(propName string) bool {
	return c.propertyIndexed
}
// fakeRemote is a fake c11y remote client for tests. It reports every word as
// present except the exact lowercase strings "carrot" and "the", and it treats
// "the" as the only stop word. Matching is case-sensitive; neither method ever
// returns an error, and the context argument is not used.
type fakeRemote struct{}

// IsWordPresent reports false for "carrot" and "the" and true for any other
// word. The returned error is always nil.
func (r *fakeRemote) IsWordPresent(ctx context.Context, word string) (bool, error) {
	switch word {
	case "carrot", "the":
		return false, nil
	default:
		return true, nil
	}
}

// IsStopWord reports true only for the word "the". The returned error is
// always nil.
func (r *fakeRemote) IsStopWord(ctx context.Context, word string) (bool, error) {
	if word == "the" {
		return true, nil
	}
	return false, nil
}