File size: 6,635 Bytes
b110593
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: [email protected]
//

package vectorizer

// TODO: This entire package should be part of the text2vec-contextionary
// module, if methods/objects in here are used from non-modular code, they
// probably shouldn't be in here

import (
	"context"
	"fmt"
	"strings"

	"github.com/fatih/camelcase"
	"github.com/weaviate/weaviate/entities/models"
	"github.com/weaviate/weaviate/entities/moduletools"
	txt2vecmodels "github.com/weaviate/weaviate/modules/text2vec-contextionary/additional/models"
	objectsvectorizer "github.com/weaviate/weaviate/usecases/modulecomponents/vectorizer"
)

// Vectorizer turns objects into vectors
type Vectorizer struct {
	client           client
	objectVectorizer *objectsvectorizer.ObjectVectorizer
}

type ErrNoUsableWords struct {
	Err error
}

func (e ErrNoUsableWords) Error() string {
	return e.Err.Error()
}

func NewErrNoUsableWordsf(pattern string, args ...interface{}) ErrNoUsableWords {
	return ErrNoUsableWords{Err: fmt.Errorf(pattern, args...)}
}

type client interface {
	VectorForCorpi(ctx context.Context, corpi []string,
		overrides map[string]string) ([]float32, []txt2vecmodels.InterpretationSource, error)
}

// IndexCheck returns whether a property of a class should be indexed
type ClassIndexCheck interface {
	PropertyIndexed(property string) bool
	VectorizeClassName() bool
	VectorizePropertyName(propertyName string) bool
}

// New from c11y client
func New(client client) *Vectorizer {
	return &Vectorizer{
		client:           client,
		objectVectorizer: objectsvectorizer.New(),
	}
}

func (v *Vectorizer) Texts(ctx context.Context, inputs []string,
	cfg moduletools.ClassConfig,
) ([]float32, error) {
	return v.Corpi(ctx, inputs)
}

// Object object to vector
func (v *Vectorizer) Object(ctx context.Context, object *models.Object,
	objectDiff *moduletools.ObjectDiff, cfg moduletools.ClassConfig,
) error {
	var overrides map[string]string
	if object.VectorWeights != nil {
		overrides = object.VectorWeights.(map[string]string)
	}

	icheck := NewIndexChecker(cfg)
	vec, sources, err := v.object(ctx, object.Class, object.Properties, objectDiff, overrides,
		icheck)
	if err != nil {
		return err
	}

	object.Vector = vec

	if object.Additional == nil {
		object.Additional = models.AdditionalProperties{}
	}

	object.Additional["interpretation"] = &txt2vecmodels.Interpretation{
		Source: sourceFromInputElements(sources),
	}

	return nil
}

func (v *Vectorizer) object(ctx context.Context, className string,
	schema interface{}, objDiff *moduletools.ObjectDiff, overrides map[string]string,
	icheck ClassIndexCheck,
) ([]float32, []txt2vecmodels.InterpretationSource, error) {
	corpi, vector, err := v.objectVectorizer.TextsOrVector(ctx, className, schema, objDiff, icheck)
	if err != nil {
		return nil, nil, err
	}
	// no property was changed, old vector can be used
	if vector != nil {
		// dont' re-vectorize
		return vector, []txt2vecmodels.InterpretationSource{}, nil
	}
	// vectorize text
	vector, ie, err := v.client.VectorForCorpi(ctx, []string{corpi}, overrides)
	if err != nil {
		switch err.(type) {
		case ErrNoUsableWords:
			return nil, nil, fmt.Errorf("The object is invalid, as weaviate could not extract "+
				"any contextionary-valid words from it. This is the case when you have "+
				"set the options 'vectorizeClassName: false' and 'vectorizePropertyName: false' in this class' schema definition "+
				"and not a single property's value "+
				"contains at least one contextionary-valid word. To fix this, you have several "+
				"options:\n\n1.) Make sure that the schema class name or the set properties are "+
				"a contextionary-valid term and include them in vectorization using the "+
				"'vectorizeClassName' or 'vectorizePropertyName' setting. In this case the vector position "+
				"will be composed of both the class/property names and the values for those fields. "+
				"Even if no property values are contextionary-valid, the overall word corpus is still valid "+
				"due to the contextionary-valid class/property names."+
				"\n\n2.) Alternatively, if you do not want to include schema class/property names "+
				"in vectorization, you must make sure that at least one text/string property contains "+
				"at least one contextionary-valid word."+
				"\n\n3.) If the word corpus weaviate extracted from your object "+
				"(see below) does contain enough meaning to build a vector position, but the contextionary "+
				"did not recognize the words, you can extend the contextionary using the "+
				"REST API. This is the case	when you use mostly industry-specific terms which are "+
				"not known to the common language contextionary. Once extended, simply reimport this object."+
				"\n\nThe following words were extracted from your object: %v"+
				"\n\nTo learn more about the contextionary and how it behaves, check out: https://www.semi.technology/documentation/weaviate/current/contextionary.html"+
				"\n\nOriginal error: %v", corpi, err)
		default:
			return nil, nil, fmt.Errorf("vectorizing object with corpus '%+v': %v", corpi, err)
		}
	}

	return vector, ie, nil
}

// Corpi takes any list of strings and builds a common vector for all of them
func (v *Vectorizer) Corpi(ctx context.Context, corpi []string,
) ([]float32, error) {
	for i, corpus := range corpi {
		corpi[i] = camelCaseToLower(corpus)
	}

	vector, _, err := v.client.VectorForCorpi(ctx, corpi, nil)
	if err != nil {
		return nil, fmt.Errorf("vectorizing corpus '%+v': %v", corpi, err)
	}

	return vector, nil
}

func camelCaseToLower(in string) string {
	parts := camelcase.Split(in)
	var sb strings.Builder
	for i, part := range parts {
		if part == " " {
			continue
		}

		if i > 0 {
			sb.WriteString(" ")
		}

		sb.WriteString(strings.ToLower(part))
	}

	return sb.String()
}

func sourceFromInputElements(in []txt2vecmodels.InterpretationSource) []*txt2vecmodels.InterpretationSource {
	out := make([]*txt2vecmodels.InterpretationSource, len(in))
	for i, elem := range in {
		out[i] = &txt2vecmodels.InterpretationSource{
			Concept:    elem.Concept,
			Occurrence: elem.Occurrence,
			Weight:     float64(elem.Weight),
		}
	}

	return out
}