// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: [email protected]
//
package helpers
import (
"strings"
"unicode"
"github.com/weaviate/weaviate/entities/models"
)
// Tokenizations lists every tokenization scheme understood by Tokenize
// and TokenizeWithWildcards.
var Tokenizations = []string{
	models.PropertyTokenizationWord,
	models.PropertyTokenizationLowercase,
	models.PropertyTokenizationWhitespace,
	models.PropertyTokenizationField,
}
// Tokenize splits in into tokens according to the given tokenization scheme.
// An unrecognized scheme yields an empty (non-nil) slice.
func Tokenize(tokenization string, in string) []string {
	var tokens []string
	switch tokenization {
	case models.PropertyTokenizationWord:
		tokens = tokenizeWord(in)
	case models.PropertyTokenizationLowercase:
		tokens = tokenizeLowercase(in)
	case models.PropertyTokenizationWhitespace:
		tokens = tokenizeWhitespace(in)
	case models.PropertyTokenizationField:
		tokens = tokenizeField(in)
	default:
		tokens = []string{}
	}
	return tokens
}
// TokenizeWithWildcards behaves like Tokenize, except that for the word
// scheme the wildcard symbols '?' and '*' are kept inside tokens.
// An unrecognized scheme yields an empty (non-nil) slice.
func TokenizeWithWildcards(tokenization string, in string) []string {
	var tokens []string
	switch tokenization {
	case models.PropertyTokenizationWord:
		tokens = tokenizeWordWithWildcards(in)
	case models.PropertyTokenizationLowercase:
		tokens = tokenizeLowercase(in)
	case models.PropertyTokenizationWhitespace:
		tokens = tokenizeWhitespace(in)
	case models.PropertyTokenizationField:
		tokens = tokenizeField(in)
	default:
		tokens = []string{}
	}
	return tokens
}
// tokenizeField treats the whole input as a single token, trimming
// leading and trailing white space.
// (former DataTypeString/Field)
func tokenizeField(in string) []string {
	// strings.TrimSpace trims exactly the runes for which unicode.IsSpace
	// reports true, so this matches TrimFunc(in, unicode.IsSpace).
	return []string{strings.TrimSpace(in)}
}
// tokenizeWhitespace splits the input on white space, preserving the
// original casing of each token.
// (former DataTypeString/Word)
func tokenizeWhitespace(in string) []string {
	// strings.Fields splits around runs of runes satisfying unicode.IsSpace,
	// identical to FieldsFunc(in, unicode.IsSpace).
	return strings.Fields(in)
}
// tokenizeLowercase splits the input on white space and lowercases
// every resulting token.
func tokenizeLowercase(in string) []string {
	tokens := strings.Fields(in)
	for i, tok := range tokens {
		tokens[i] = strings.ToLower(tok)
	}
	return tokens
}
// tokenizeWord splits the input on every non-alphanumeric rune and
// lowercases the resulting tokens.
// (former DataTypeText/Word)
func tokenizeWord(in string) []string {
	notAlphanumeric := func(r rune) bool {
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	}
	tokens := strings.FieldsFunc(in, notAlphanumeric)
	for i, tok := range tokens {
		tokens[i] = strings.ToLower(tok)
	}
	return tokens
}
// tokenizeWordWithWildcards splits the input on every non-alphanumeric
// rune except the wildcard symbols '?' and '*', and lowercases the
// resulting tokens.
func tokenizeWordWithWildcards(in string) []string {
	isSeparator := func(r rune) bool {
		if r == '?' || r == '*' {
			return false // keep wildcard symbols inside tokens
		}
		return !unicode.IsLetter(r) && !unicode.IsNumber(r)
	}
	tokens := strings.FieldsFunc(in, isSeparator)
	for i, tok := range tokens {
		tokens[i] = strings.ToLower(tok)
	}
	return tokens
}
// lowercase lowercases every element of terms in place and returns the
// same slice for convenience.
func lowercase(terms []string) []string {
	for i, term := range terms {
		terms[i] = strings.ToLower(term)
	}
	return terms
}
// TokenizeAndCountDuplicates tokenizes in with the given scheme and
// collapses duplicate tokens, returning the unique tokens together with
// a parallel slice of their occurrence counts. The order of the returned
// tokens is unspecified (map iteration order).
func TokenizeAndCountDuplicates(tokenization string, in string) ([]string, []int) {
	occurrences := make(map[string]int)
	for _, token := range Tokenize(tokenization, in) {
		occurrences[token]++
	}
	terms := make([]string, 0, len(occurrences))
	counts := make([]int, 0, len(occurrences))
	for token, n := range occurrences {
		terms = append(terms, token)
		counts = append(counts, n)
	}
	return terms, counts
}