Spaces:
Running
Running
// _ _ | |
// __ _____ __ ___ ___ __ _| |_ ___ | |
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ | |
// \ V V / __/ (_| |\ V /| | (_| | || __/ | |
// \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| | |
// | |
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. | |
// | |
// CONTACT: [email protected] | |
// | |
package compressionhelpers | |
import ( | |
"encoding/binary" | |
"math" | |
"sync/atomic" | |
"gonum.org/v1/gonum/stat/distuv" | |
) | |
// distribution is the statistical model a TileEncoder relies on to turn the
// values of one vector dimension into codes and codes back into values.
type distribution interface {
	// Transform maps a raw value into the space in which the model's
	// parameters (mean and standard deviation) are accumulated.
	Transform(x float64) float64
	// CDF returns the cumulative probability of the model at x.
	CDF(x float64) float64
	// Quantile returns the value located at cumulative probability x.
	Quantile(x float64) float64
}
type logNormalDistribution struct { | |
dist *distuv.LogNormal | |
} | |
func newLogNormalDistribution(mean float64, std float64) distribution { | |
return &logNormalDistribution{ | |
dist: &distuv.LogNormal{ | |
Mu: mean, | |
Sigma: std, | |
}, | |
} | |
} | |
func (d *logNormalDistribution) Transform(x float64) float64 { | |
if x > 0 { | |
return math.Log(x) | |
} | |
return 0 | |
} | |
func (d *logNormalDistribution) CDF(x float64) float64 { | |
return d.dist.CDF(x) | |
} | |
func (d *logNormalDistribution) Quantile(x float64) float64 { | |
return d.dist.Quantile(x) | |
} | |
type normalDistribution struct { | |
dist *distuv.Normal | |
} | |
func newNormalDistribution(mean float64, std float64) distribution { | |
return &normalDistribution{ | |
dist: &distuv.Normal{ | |
Mu: mean, | |
Sigma: std, | |
}, | |
} | |
} | |
func (d *normalDistribution) Transform(x float64) float64 { | |
return x | |
} | |
func (d *normalDistribution) CDF(x float64) float64 { | |
return d.dist.CDF(x) | |
} | |
func (d *normalDistribution) Quantile(x float64) float64 { | |
return d.dist.Quantile(x) | |
} | |
// Centroid is one slot of a lazily filled centroid cache.
type Centroid struct {
	// Center holds the cached centroid value.
	Center []float32
	// Calculated reports whether Center has been filled in.
	Calculated atomic.Bool
}
// EncoderDistribution selects the statistical model a TileEncoder uses. The
// value is stored as a single byte in the encoder's serialized form, so the
// numeric values below are spelled out explicitly and must stay stable.
type EncoderDistribution byte

const (
	// NormalEncoderDistribution models values with a normal distribution.
	NormalEncoderDistribution = EncoderDistribution(0)
	// LogNormalEncoderDistribution models values with a log-normal distribution.
	LogNormalEncoderDistribution = EncoderDistribution(1)
)
type TileEncoder struct { | |
bins float64 | |
mean float64 | |
stdDev float64 | |
size float64 | |
s1 float64 | |
s2 float64 | |
segment int | |
centroids []Centroid | |
encoderDistribution EncoderDistribution | |
distribution distribution | |
} | |
func NewTileEncoder(bits int, segment int, encoderDistribution EncoderDistribution) *TileEncoder { | |
centroids := math.Pow(2, float64(bits)) | |
te := &TileEncoder{ | |
bins: centroids, | |
mean: 0, | |
stdDev: 0, | |
size: 0, | |
s1: 0, | |
s2: 0, | |
segment: segment, | |
centroids: make([]Centroid, int(centroids)), | |
encoderDistribution: encoderDistribution, | |
} | |
te.setEncoderDistribution() | |
return te | |
} | |
func RestoreTileEncoder(bins float64, mean float64, stdDev float64, size float64, s1 float64, s2 float64, segment uint16, encoderDistribution byte) *TileEncoder { | |
te := &TileEncoder{ | |
bins: bins, | |
mean: mean, | |
stdDev: stdDev, | |
size: size, | |
s1: s1, | |
s2: s2, | |
segment: int(segment), | |
encoderDistribution: EncoderDistribution(encoderDistribution), | |
} | |
te.setEncoderDistribution() | |
return te | |
} | |
func (te *TileEncoder) ExposeDataForRestore() []byte { | |
buffer := make([]byte, 51) | |
binary.LittleEndian.PutUint64(buffer[0:8], math.Float64bits(te.bins)) | |
binary.LittleEndian.PutUint64(buffer[8:16], math.Float64bits(te.mean)) | |
binary.LittleEndian.PutUint64(buffer[16:24], math.Float64bits(te.stdDev)) | |
binary.LittleEndian.PutUint64(buffer[24:32], math.Float64bits(te.size)) | |
binary.LittleEndian.PutUint64(buffer[32:40], math.Float64bits(te.s1)) | |
binary.LittleEndian.PutUint64(buffer[40:48], math.Float64bits(te.s2)) | |
binary.LittleEndian.PutUint16(buffer[48:50], uint16(te.segment)) | |
buffer[50] = byte(te.encoderDistribution) | |
return buffer | |
} | |
func (te *TileEncoder) Fit(data [][]float32) error { | |
te.setEncoderDistribution() | |
return nil | |
} | |
func (te *TileEncoder) setEncoderDistribution() { | |
switch te.encoderDistribution { | |
case LogNormalEncoderDistribution: | |
te.distribution = newLogNormalDistribution(te.mean, te.stdDev) | |
case NormalEncoderDistribution: | |
te.distribution = newNormalDistribution(te.mean, te.stdDev) | |
} | |
} | |
func (te *TileEncoder) Add(x []float32) { | |
// calculate mean and stddev iteratively | |
x64 := te.distribution.Transform(float64(x[te.segment])) | |
te.s1 += x64 | |
te.s2 += x64 * x64 | |
te.size++ | |
te.mean = te.s1 / te.size | |
sum := te.s2 + te.size*te.mean*te.mean | |
prod := 2 * te.mean * te.s1 | |
te.stdDev = math.Sqrt((sum - prod) / te.size) | |
} | |
func (te *TileEncoder) Encode(x []float32) byte { | |
cdf := te.distribution.CDF(float64(x[te.segment])) | |
intPart, _ := math.Modf(cdf * float64(te.bins)) | |
return byte(intPart) | |
} | |
func (te *TileEncoder) centroid(b byte) []float32 { | |
res := make([]float32, 0, 1) | |
if b == 0 { | |
res = append(res, float32(te.distribution.Quantile(1/te.bins))) | |
} else if b == byte(te.bins) { | |
res = append(res, float32(te.distribution.Quantile((te.bins-1)/te.bins))) | |
} else { | |
b64 := float64(b) | |
mean := (b64/te.bins + (b64+1)/te.bins) / 2 | |
res = append(res, float32(te.distribution.Quantile(mean))) | |
} | |
return res | |
} | |
func (te *TileEncoder) Centroid(b byte) []float32 { | |
if te.centroids[b].Calculated.Load() { | |
return te.centroids[b].Center | |
} | |
te.centroids[b].Center = te.centroid(b) | |
te.centroids[b].Calculated.Store(true) | |
return te.centroids[b].Center | |
} | |