Spaces:
Running
Running
File size: 5,276 Bytes
b110593 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 |
// _ _
// __ _____ __ ___ ___ __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
// \ V V / __/ (_| |\ V /| | (_| | || __/
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
// CONTACT: [email protected]
//
package hybrid
import (
"fmt"
"sort"
"github.com/go-openapi/strfmt"
)
func FusionRanked(weights []float64, resultSets [][]*Result, setNames []string) []*Result {
combinedResults := map[strfmt.UUID]*Result{}
for resultSetIndex, resultSet := range resultSets {
for i, res := range resultSet {
tempResult := res
docId := tempResult.ID
score := weights[resultSetIndex] / float64(i+60+1) // TODO replace 60 with a class configured variable
if tempResult.AdditionalProperties == nil {
tempResult.AdditionalProperties = map[string]interface{}{}
}
// Get previous results from the map, if any
previousResult, ok := combinedResults[docId]
if ok {
tempResult.AdditionalProperties["explainScore"] = fmt.Sprintf(
"%v\n(Result Set %v) Document %v contributed %v to the score",
previousResult.AdditionalProperties["explainScore"], setNames[resultSetIndex], tempResult.ID, score)
score += float64(previousResult.Score)
} else {
tempResult.AdditionalProperties["explainScore"] = fmt.Sprintf(
"%v\n(Result Set %v) Document %v contributed %v to the score",
tempResult.ExplainScore, setNames[resultSetIndex], tempResult.ID, score)
}
tempResult.AdditionalProperties["rank_score"] = score
tempResult.AdditionalProperties["score"] = score
tempResult.Score = float32(score)
combinedResults[docId] = tempResult
}
}
// Sort the results
var (
concat = make([]*Result, len(combinedResults))
i = 0
)
for _, res := range combinedResults {
res.ExplainScore = res.AdditionalProperties["explainScore"].(string)
concat[i] = res
i++
}
sort.Slice(concat, func(i, j int) bool {
if concat[j].Score == concat[i].Score {
return concat[i].SecondarySortValue > concat[j].SecondarySortValue
}
return float64(concat[i].Score) > float64(concat[j].Score)
})
return concat
}
// FusionRelativeScore uses the relative differences in the scores from keyword and vector search to combine the
// results. This method retains more information than ranked fusion and should result in better results.
//
// The scores from each result are normalized between 0 and 1, e.g. the maximum score becomes 1 and the minimum 0 and the
// other scores are in between, keeping their relative distance to the other scores.
// Example:
//
// Input score = [1, 8, 6, 11] => [0, 0.7, 0.5, 1]
//
// The normalized scores are then combined using their respective weight and the combined scores are sorted
func FusionRelativeScore(weights []float64, resultSets [][]*Result, names []string) []*Result {
if len(resultSets[0]) == 0 && (len(resultSets) == 1 || len(resultSets[1]) == 0) {
return []*Result{}
}
var maximum []float32
var minimum []float32
for i := range resultSets {
if len(resultSets[i]) > 0 {
maximum = append(maximum, resultSets[i][0].SecondarySortValue)
minimum = append(minimum, resultSets[i][0].SecondarySortValue)
} else { // dummy values so the indices match
maximum = append(maximum, 0)
minimum = append(minimum, 0)
}
for _, res := range resultSets[i] {
if res.SecondarySortValue > maximum[i] {
maximum[i] = res.SecondarySortValue
}
if res.SecondarySortValue < minimum[i] {
minimum[i] = res.SecondarySortValue
}
}
}
// normalize scores between 0 and 1 and sum uo the normalized scores from different sources
// pre-allocate map, at this stage we do not know how many total, combined results there are, but it is at least the
// length of the longer input list
numResults := len(resultSets[0])
if len(resultSets) > 1 && len(resultSets[1]) > numResults {
numResults = len(resultSets[1])
}
mapResults := make(map[strfmt.UUID]*Result, numResults)
for i := range resultSets {
weight := float32(weights[i])
for _, res := range resultSets[i] {
// If all scores are identical min and max are the same => just set score to the weight.
score := weight
if maximum[i] != minimum[i] {
score *= (res.SecondarySortValue - minimum[i]) / (maximum[i] - minimum[i])
}
previousResult, ok := mapResults[res.ID]
explainScore := fmt.Sprintf("(Result Set '%v') Document %v: original score %v, normalized score: %v", names[i], res.ID, res.SecondarySortValue, score)
if ok {
score += previousResult.Score
explainScore += " - " + previousResult.ExplainScore
}
res.Score = score
res.ExplainScore = explainScore
mapResults[res.ID] = res
}
}
concat := make([]*Result, 0, len(mapResults))
for _, res := range mapResults {
concat = append(concat, res)
}
sort.Slice(concat, func(i, j int) bool {
a_b := float64(concat[j].Score - concat[i].Score)
if a_b*a_b < 1e-14 {
return concat[i].SecondarySortValue > concat[j].SecondarySortValue
}
return float64(concat[i].Score) > float64(concat[j].Score)
})
return concat
}
|