Spaces:
Running
Running
File size: 4,221 Bytes
b110593 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 |
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: hello@weaviate.io
//
package inverted
import (
"runtime"
"strings"
"github.com/pkg/errors"
"github.com/weaviate/weaviate/adapters/repos/db/inverted/stopwords"
"github.com/weaviate/weaviate/entities/models"
"github.com/weaviate/weaviate/entities/schema"
"github.com/weaviate/weaviate/usecases/config"
)
// _NUMCPU holds the value of runtime.NumCPU(), evaluated once when the
// package is initialized.
// NOTE(review): not referenced in the visible part of this file; presumably
// used elsewhere in the package — confirm before removing.
var _NUMCPU = runtime.NumCPU()
// ValidateConfig checks the given inverted index config and returns the
// first problem it finds, or nil if the config is acceptable.
//
// The checks run in this order:
//  1. CleanupIntervalSeconds must not be negative (zero is accepted),
//  2. the BM25 section is checked by validateBM25Config,
//  3. the stopwords section is checked by validateStopwordConfig.
//
// conf must not be nil.
func ValidateConfig(conf *models.InvertedIndexConfig) error {
	if conf.CleanupIntervalSeconds < 0 {
		// the condition above accepts zero, so the message has to say
		// ">= 0" rather than "> 0"
		return errors.Errorf("cleanup interval seconds must be >= 0")
	}

	if err := validateBM25Config(conf.Bm25); err != nil {
		return err
	}

	if err := validateStopwordConfig(conf.Stopwords); err != nil {
		return err
	}

	return nil
}
// ConfigFromModel converts the API model of the inverted index config into
// the schema representation.
//
// The index flags are copied over unchanged. A missing BM25 section falls
// back to config.DefaultBM25k1 / config.DefaultBM25b, and a missing
// stopwords section falls back to the English preset without additions or
// removals.
func ConfigFromModel(iicm *models.InvertedIndexConfig) schema.InvertedIndexConfig {
	result := schema.InvertedIndexConfig{
		IndexTimestamps:     iicm.IndexTimestamps,
		IndexNullState:      iicm.IndexNullState,
		IndexPropertyLength: iicm.IndexPropertyLength,
	}

	// BM25: start from the defaults, override with user values if present
	k1, b := float64(config.DefaultBM25k1), float64(config.DefaultBM25b)
	if bm25 := iicm.Bm25; bm25 != nil {
		k1, b = float64(bm25.K1), float64(bm25.B)
	}
	result.BM25.K1 = k1
	result.BM25.B = b

	// stopwords: take the user's settings if present, else the default preset
	if sw := iicm.Stopwords; sw != nil {
		result.Stopwords.Preset = sw.Preset
		result.Stopwords.Additions = sw.Additions
		result.Stopwords.Removals = sw.Removals
	} else {
		result.Stopwords = models.StopwordConfig{
			Preset: stopwords.EnglishPreset,
		}
	}

	return result
}
// validateBM25Config checks the BM25 parameters of an inverted index config.
//
// A nil config is valid. Otherwise k1 must not be negative and b must lie
// in the closed interval [0, 1]. The first violated constraint is returned
// as an error.
func validateBM25Config(conf *models.BM25Config) error {
	if conf == nil {
		return nil
	}

	if conf.K1 < 0 {
		return errors.Errorf("BM25.k1 must be >= 0")
	}

	if conf.B < 0 || conf.B > 1 {
		// the message must describe the accepted range, i.e. 0 <= b <= 1
		return errors.Errorf("BM25.b must be >= 0 and <= 1")
	}

	return nil
}
// validateStopwordConfig checks the stopword section of an inverted index
// config.
//
// A nil config is replaced by an empty one for the duration of the check.
// An empty preset name is set to the English preset on the passed config
// itself. The preset must then be a key of stopwords.Presets; finally the
// additions and removals are checked by validateStopwordAdditionsRemovals.
func validateStopwordConfig(conf *models.StopwordConfig) error {
	if conf == nil {
		conf = &models.StopwordConfig{}
	}

	// default the preset in place, so the caller's config is updated too
	if conf.Preset == "" {
		conf.Preset = stopwords.EnglishPreset
	}

	_, known := stopwords.Presets[conf.Preset]
	if !known {
		return errors.Errorf("stopwordPreset '%s' does not exist", conf.Preset)
	}

	return validateStopwordAdditionsRemovals(conf)
}
// validateStopwordAdditionsRemovals checks the user-supplied stopword
// additions and removals.
//
// No entry may be empty or consist only of whitespace, and no word may
// appear in both lists. If both lists pass, the additions are handed to
// removeStopwordAdditionsIfInPreset, which may rewrite conf.Additions.
func validateStopwordAdditionsRemovals(conf *models.StopwordConfig) error {
	// maps every addition to its position in conf.Additions; used below
	// for the overlap check and passed on to the preset trimming step
	additionIndex := make(map[string]int)

	for i, word := range conf.Additions {
		if len(strings.TrimSpace(word)) == 0 {
			return errors.Errorf("cannot use whitespace in stopword.additions")
		}
		additionIndex[word] = i
	}

	for _, word := range conf.Removals {
		if len(strings.TrimSpace(word)) == 0 {
			return errors.Errorf("cannot use whitespace in stopword.removals")
		}

		// a word cannot be added and removed at the same time
		_, alsoAdded := additionIndex[word]
		if alsoAdded {
			return errors.Errorf(
				"found '%s' in both stopwords.additions and stopwords.removals", word)
		}
	}

	removeStopwordAdditionsIfInPreset(conf, additionIndex)
	return nil
}
// removeStopwordAdditionsIfInPreset drops every entry from conf.Additions
// that is already part of the preset selected by conf.Preset.
//
// foundAdditions must contain every word of conf.Additions as a key. Only
// the keys are consulted: the map keeps a single index per word, so an
// index-based removal would leave earlier duplicates of a preset word in
// place. Filtering by value removes all of them.
//
// If no addition is part of the preset, conf.Additions is left untouched.
// The relative order of the remaining additions is preserved.
func removeStopwordAdditionsIfInPreset(conf *models.StopwordConfig, foundAdditions map[string]int) {
	// collect the preset words that were also supplied as additions
	redundant := make(map[string]struct{})
	for _, word := range stopwords.Presets[conf.Preset] {
		if _, ok := foundAdditions[word]; ok {
			redundant[word] = struct{}{}
		}
	}

	if len(redundant) == 0 {
		return
	}

	// take remaining additions, build new list. The list stays nil if
	// every addition turned out to be redundant.
	var trimmedAdditions []string
	for _, add := range conf.Additions {
		if _, ok := redundant[add]; !ok {
			trimmedAdditions = append(trimmedAdditions, add)
		}
	}
	conf.Additions = trimmedAdditions
}
|