File size: 1,774 Bytes
b110593
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
//  Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//

package stopwords

import (
	"sync"

	"github.com/weaviate/weaviate/entities/models"

	"github.com/pkg/errors"
)

// StopwordDetector is satisfied by any type that can decide whether a single
// word is a stopword.
type StopwordDetector interface {
	// IsStopword reports whether word is a stopword.
	IsStopword(word string) bool
}

// Detector is a mutable set of stopwords. SetAdditions, SetRemovals and
// IsStopword each take the embedded mutex before touching the set. Because the
// mutex is embedded, Lock and Unlock are also promoted onto Detector itself.
// A Detector must not be copied once it is in use, since that would copy the
// mutex along with it.
type Detector struct {
	sync.Mutex
	// stopwords holds every stopword as a key; the empty-struct values carry
	// no data.
	stopwords map[string]struct{}
}

// NewDetectorFromConfig builds a Detector from config. It starts from the
// preset named by config.Preset and then applies config.Additions followed by
// config.Removals. An error from loading the preset is wrapped and returned.
func NewDetectorFromConfig(config models.StopwordConfig) (*Detector, error) {
	detector, err := NewDetectorFromPreset(config.Preset)
	if err != nil {
		return nil, errors.Wrap(err, "failed to create new detector from config")
	}

	// Additions go in before removals, so a word listed in both ends up removed.
	detector.SetAdditions(config.Additions)
	detector.SetRemovals(config.Removals)
	return detector, nil
}

// NewDetectorFromPreset returns a Detector seeded with the words of the named
// preset. An empty preset name is accepted and yields a Detector with no
// stopwords. Any other name that is not a key of Presets is an error.
func NewDetectorFromPreset(preset string) (*Detector, error) {
	detector := &Detector{stopwords: make(map[string]struct{})}

	// No preset requested: hand back the empty detector without a lookup.
	if preset == "" {
		return detector, nil
	}

	words, known := Presets[preset]
	if !known {
		return nil, errors.Errorf("preset %q not known to stopword detector", preset)
	}

	for _, w := range words {
		detector.stopwords[w] = struct{}{}
	}
	return detector, nil
}

// SetAdditions adds every word in additions to the stopword set while holding
// the detector's lock. Words that are already present stay as they are, and a
// nil or empty slice adds nothing.
//
// The set is allocated on first use, so calling this on a Detector that did
// not come from one of the constructors (for example a zero-value Detector)
// no longer panics with a write to a nil map.
func (d *Detector) SetAdditions(additions []string) {
	d.Lock()
	defer d.Unlock()

	if d.stopwords == nil {
		d.stopwords = make(map[string]struct{}, len(additions))
	}

	for _, add := range additions {
		d.stopwords[add] = struct{}{}
	}
}

// SetRemovals drops every word in removals from the stopword set while holding
// the detector's lock. Words that are not in the set are ignored.
func (d *Detector) SetRemovals(removals []string) {
	d.Lock()
	defer d.Unlock()

	for i := range removals {
		delete(d.stopwords, removals[i])
	}
}

// IsStopword reports whether word is currently in the stopword set. The lookup
// is an exact match on the string and is done while holding the detector's lock.
func (d *Detector) IsStopword(word string) bool {
	d.Lock()
	_, found := d.stopwords[word]
	d.Unlock()

	return found
}