Spaces:
Running
Running
// _ _ | |
// __ _____ __ ___ ___ __ _| |_ ___ | |
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ | |
// \ V V / __/ (_| |\ V /| | (_| | || __/ | |
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| | |
// | |
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. | |
// | |
// CONTACT: [email protected] | |
// | |
package helpers | |
import ( | |
"strings" | |
"unicode" | |
"github.com/weaviate/weaviate/entities/models" | |
) | |
var Tokenizations []string = []string{ | |
models.PropertyTokenizationWord, | |
models.PropertyTokenizationLowercase, | |
models.PropertyTokenizationWhitespace, | |
models.PropertyTokenizationField, | |
} | |
func Tokenize(tokenization string, in string) []string { | |
switch tokenization { | |
case models.PropertyTokenizationWord: | |
return tokenizeWord(in) | |
case models.PropertyTokenizationLowercase: | |
return tokenizeLowercase(in) | |
case models.PropertyTokenizationWhitespace: | |
return tokenizeWhitespace(in) | |
case models.PropertyTokenizationField: | |
return tokenizeField(in) | |
default: | |
return []string{} | |
} | |
} | |
func TokenizeWithWildcards(tokenization string, in string) []string { | |
switch tokenization { | |
case models.PropertyTokenizationWord: | |
return tokenizeWordWithWildcards(in) | |
case models.PropertyTokenizationLowercase: | |
return tokenizeLowercase(in) | |
case models.PropertyTokenizationWhitespace: | |
return tokenizeWhitespace(in) | |
case models.PropertyTokenizationField: | |
return tokenizeField(in) | |
default: | |
return []string{} | |
} | |
} | |
// tokenizeField returns the whole input as a single token, with leading
// and trailing white space removed; casing is preserved
// (former DataTypeString/Field)
func tokenizeField(in string) []string {
	// strings.TrimSpace is defined as TrimFunc(s, unicode.IsSpace)
	return []string{strings.TrimSpace(in)}
}
// tokenizeWhitespace splits the input around runs of white space,
// leaving the casing of each token untouched
// (former DataTypeString/Word)
func tokenizeWhitespace(in string) []string {
	// strings.Fields splits around runs of unicode.IsSpace characters
	return strings.Fields(in)
}
// tokenizeLowercase splits the input on white space and lowercases every
// resulting token
func tokenizeLowercase(in string) []string {
	fields := strings.Fields(in)
	out := make([]string, len(fields))
	for i, field := range fields {
		out[i] = strings.ToLower(field)
	}
	return out
}
// tokenizeWord splits the input at every non-alphanumeric rune and
// lowercases the resulting tokens
// (former DataTypeText/Word)
func tokenizeWord(in string) []string {
	// a rune separates tokens unless it is a letter or a digit
	isSeparator := func(r rune) bool {
		return !(unicode.IsLetter(r) || unicode.IsNumber(r))
	}
	tokens := strings.FieldsFunc(in, isSeparator)
	for i, token := range tokens {
		tokens[i] = strings.ToLower(token)
	}
	return tokens
}
// tokenizeWordWithWildcards splits the input at every non-alphanumeric
// rune — except the wildcard symbols '?' and '*', which are kept inside
// tokens — and lowercases the resulting tokens
func tokenizeWordWithWildcards(in string) []string {
	isSeparator := func(r rune) bool {
		if r == '?' || r == '*' {
			// wildcards stay part of the token
			return false
		}
		return !(unicode.IsLetter(r) || unicode.IsNumber(r))
	}
	tokens := strings.FieldsFunc(in, isSeparator)
	for i, token := range tokens {
		tokens[i] = strings.ToLower(token)
	}
	return tokens
}
// lowercase converts every term to lower case in place and returns the
// same (mutated) slice for convenient chaining.
func lowercase(terms []string) []string {
	for i, term := range terms {
		terms[i] = strings.ToLower(term)
	}
	return terms
}
func TokenizeAndCountDuplicates(tokenization string, in string) ([]string, []int) { | |
counts := map[string]int{} | |
for _, term := range Tokenize(tokenization, in) { | |
counts[term]++ | |
} | |
unique := make([]string, len(counts)) | |
boosts := make([]int, len(counts)) | |
i := 0 | |
for term, boost := range counts { | |
unique[i] = term | |
boosts[i] = boost | |
i++ | |
} | |
return unique, boosts | |
} | |