KevinStephenson
Adding in weaviate code
b110593
raw
history blame
16.1 kB
//                           _       _
// __      _____  __ ___   ___  __ _| |_ ___
// \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \
//  \ V  V /  __/ (_| |\ V /| | (_| | ||  __/
//   \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___|
//
// Copyright © 2016 - 2024 Weaviate B.V. All rights reserved.
//
//  CONTACT: hello@weaviate.io
//
package monitoring
import (
"sync"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
"github.com/weaviate/weaviate/usecases/config"
)
// PrometheusMetrics bundles every Prometheus collector exposed by this
// package. Each field holds one collector; the metric name, help text and
// label set of each one are defined where the struct is constructed.
type PrometheusMetrics struct {
	// Batch and single-object operation timings.
	BatchTime       *prometheus.HistogramVec
	BatchDeleteTime *prometheus.SummaryVec
	ObjectsTime     *prometheus.SummaryVec

	// LSM store and async-operation metrics.
	LSMBloomFilters        *prometheus.SummaryVec
	AsyncOperations        *prometheus.GaugeVec
	LSMSegmentCount        *prometheus.GaugeVec
	LSMSegmentCountByLevel *prometheus.GaugeVec
	LSMSegmentObjects      *prometheus.GaugeVec
	LSMSegmentSize         *prometheus.GaugeVec
	LSMMemtableSize        *prometheus.GaugeVec
	LSMMemtableDurations   *prometheus.SummaryVec

	// Vector index metrics.
	VectorIndexTombstones              *prometheus.GaugeVec
	VectorIndexTombstoneCleanupThreads *prometheus.GaugeVec
	VectorIndexTombstoneCleanedCount   *prometheus.CounterVec
	VectorIndexOperations              *prometheus.GaugeVec
	VectorIndexDurations               *prometheus.SummaryVec
	VectorIndexSize                    *prometheus.GaugeVec
	VectorIndexMaintenanceDurations    *prometheus.SummaryVec

	// Object, query and request metrics.
	ObjectCount                    *prometheus.GaugeVec
	QueriesCount                   *prometheus.GaugeVec
	RequestsTotal                  *prometheus.GaugeVec
	QueriesDurations               *prometheus.HistogramVec
	QueriesFilteredVectorDurations *prometheus.SummaryVec
	QueryDimensions                *prometheus.CounterVec
	// QueryDimensionsCombined is the only label-less collector in the struct.
	QueryDimensionsCombined prometheus.Counter
	GoroutinesCount         *prometheus.GaugeVec

	// Backup and restore metrics.
	BackupRestoreDurations            *prometheus.SummaryVec
	BackupStoreDurations              *prometheus.SummaryVec
	BucketPauseDurations              *prometheus.SummaryVec
	BackupRestoreClassDurations       *prometheus.SummaryVec
	BackupRestoreBackupInitDurations  *prometheus.SummaryVec
	BackupRestoreFromStorageDurations *prometheus.SummaryVec
	BackupRestoreDataTransferred      *prometheus.CounterVec
	BackupStoreDataTransferred        *prometheus.CounterVec

	// Per-shard vector totals.
	VectorDimensionsSum *prometheus.GaugeVec
	VectorSegmentsSum   *prometheus.GaugeVec

	// Startup metrics.
	StartupProgress  *prometheus.GaugeVec
	StartupDurations *prometheus.SummaryVec
	StartupDiskIO    *prometheus.SummaryVec

	// Shard load-state metrics.
	ShardsLoaded    *prometheus.GaugeVec
	ShardsUnloaded  *prometheus.GaugeVec
	ShardsLoading   *prometheus.GaugeVec
	ShardsUnloading *prometheus.GaugeVec

	// Group mirrors config.Monitoring.Group; it is set by InitConfig.
	// NOTE(review): presumably controls grouping of class/shard labels by
	// the code that records metrics - confirm against callers.
	Group bool
}
// DeleteShard deletes existing label combinations that match both the
// shard and class name. If a metric is not collected at the shard level
// it is unaffected. This is to make sure that deleting a single shard
// (e.g. multi-tenancy) does not affect metrics for existing shards.
//
// In addition, there are some metrics that we explicitly keep, such as
// vector_dimensions_sum as they can be used in billing decisions.
//
// Calling DeleteShard on a nil receiver is a no-op. The returned error is
// always nil.
func (pm *PrometheusMetrics) DeleteShard(className, shardName string) error {
	if pm == nil {
		return nil
	}

	// A partial match removes every series whose labels include both of
	// these values, whatever the remaining labels (operation, path, ...) are.
	labels := prometheus.Labels{
		"class_name": className,
		"shard_name": shardName,
	}

	// Object and query metrics.
	pm.BatchTime.DeletePartialMatch(labels)
	pm.BatchDeleteTime.DeletePartialMatch(labels)
	pm.ObjectsTime.DeletePartialMatch(labels)
	pm.ObjectCount.DeletePartialMatch(labels)
	pm.QueriesFilteredVectorDurations.DeletePartialMatch(labels)
	pm.AsyncOperations.DeletePartialMatch(labels)

	// LSM metrics. Every shard-labelled LSM collector is listed exactly
	// once, including LSMSegmentObjects.
	pm.LSMBloomFilters.DeletePartialMatch(labels)
	pm.LSMMemtableDurations.DeletePartialMatch(labels)
	pm.LSMMemtableSize.DeletePartialMatch(labels)
	pm.LSMSegmentObjects.DeletePartialMatch(labels)
	pm.LSMSegmentCount.DeletePartialMatch(labels)
	pm.LSMSegmentSize.DeletePartialMatch(labels)
	pm.LSMSegmentCountByLevel.DeletePartialMatch(labels)

	// Vector index metrics.
	pm.VectorIndexTombstones.DeletePartialMatch(labels)
	pm.VectorIndexTombstoneCleanupThreads.DeletePartialMatch(labels)
	pm.VectorIndexTombstoneCleanedCount.DeletePartialMatch(labels)
	pm.VectorIndexOperations.DeletePartialMatch(labels)
	pm.VectorIndexMaintenanceDurations.DeletePartialMatch(labels)
	pm.VectorIndexDurations.DeletePartialMatch(labels)
	pm.VectorIndexSize.DeletePartialMatch(labels)

	// Startup metrics.
	pm.StartupProgress.DeletePartialMatch(labels)
	pm.StartupDurations.DeletePartialMatch(labels)
	pm.StartupDiskIO.DeletePartialMatch(labels)

	return nil
}
// DeleteClass removes every series carrying the given class name from the
// class-level collectors listed below. See [DeleteShard] for the
// shard-scoped counterpart.
//
// Because only class_name is matched, the shard-labelled
// QueriesFilteredVectorDurations collector loses the series of all shards
// of the class.
//
// Calling DeleteClass on a nil receiver is a no-op. The returned error is
// always nil.
func (pm *PrometheusMetrics) DeleteClass(className string) error {
	if pm == nil {
		return nil
	}

	// partialDeleter is the one method needed from each metric vector.
	type partialDeleter interface {
		DeletePartialMatch(prometheus.Labels) int
	}

	classVecs := []partialDeleter{
		pm.QueriesCount,
		pm.QueriesDurations,
		pm.GoroutinesCount,
		pm.BackupRestoreClassDurations,
		pm.BackupRestoreBackupInitDurations,
		pm.BackupRestoreFromStorageDurations,
		pm.BackupStoreDurations,
		pm.BackupRestoreDataTransferred,
		pm.BackupStoreDataTransferred,
		pm.QueriesFilteredVectorDurations,
	}

	match := prometheus.Labels{"class_name": className}
	for _, vec := range classVecs {
		vec.DeletePartialMatch(match)
	}
	return nil
}
var (
	// msBuckets holds the histogram bucket boundaries shared by the
	// millisecond-based histograms in this package.
	msBuckets = []float64{10, 50, 100, 500, 1000, 5000}

	// metrics is the package-wide collector set; it is assigned in init
	// and handed out by GetMetrics.
	metrics *PrometheusMetrics
)
// init builds the package-wide collector set once, at package load time,
// so that GetMetrics and InitConfig always see a non-nil value.
func init() {
	metrics = newPrometheusMetrics()
}
// InitConfig applies the monitoring configuration to the package-wide
// collector set. Only the Group flag is copied over.
func InitConfig(cfg config.Monitoring) {
	group := cfg.Group
	metrics.Group = group
}
// GetMetrics returns the package-wide collector set created in init.
// Every caller receives the same pointer.
func GetMetrics() *PrometheusMetrics {
	shared := metrics
	return shared
}
// newPrometheusMetrics creates every collector of PrometheusMetrics through
// promauto, which also registers each one as it is created. The Group field
// is left at its zero value; InitConfig sets it later.
//
// Metric names and label sets are part of the exported /metrics contract and
// must not be changed casually. Help strings describe the metric they are
// attached to.
func newPrometheusMetrics() *PrometheusMetrics {
	return &PrometheusMetrics{
		BatchTime: promauto.NewHistogramVec(prometheus.HistogramOpts{
			Name:    "batch_durations_ms",
			Help:    "Duration in ms of a single batch",
			Buckets: msBuckets,
		}, []string{"operation", "class_name", "shard_name"}),
		BatchDeleteTime: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "batch_delete_durations_ms",
			Help: "Duration in ms of a single delete batch",
		}, []string{"operation", "class_name", "shard_name"}),

		ObjectsTime: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "objects_durations_ms",
			Help: "Duration of an individual object operation. Also as part of batches.",
		}, []string{"operation", "step", "class_name", "shard_name"}),
		ObjectCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "object_count",
			Help: "Number of objects per shard",
		}, []string{"class_name", "shard_name"}),

		QueriesCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "concurrent_queries_count",
			Help: "Number of concurrently running query operations",
		}, []string{"class_name", "query_type"}),

		RequestsTotal: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "requests_total",
			Help: "Number of all requests made",
		}, []string{"status", "class_name", "api", "query_type"}),

		QueriesDurations: promauto.NewHistogramVec(prometheus.HistogramOpts{
			Name:    "queries_durations_ms",
			Help:    "Duration of queries in milliseconds",
			Buckets: msBuckets,
		}, []string{"class_name", "query_type"}),

		QueriesFilteredVectorDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "queries_filtered_vector_durations_ms",
			Help: "Duration of queries in milliseconds",
		}, []string{"class_name", "shard_name", "operation"}),

		GoroutinesCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "concurrent_goroutines",
			Help: "Number of concurrently running goroutines",
		}, []string{"class_name", "query_type"}),

		AsyncOperations: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "async_operations_running",
			Help: "Number of currently ongoing async operations",
		}, []string{"operation", "class_name", "shard_name", "path"}),

		// LSM metrics
		LSMSegmentCount: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "lsm_active_segments",
			Help: "Number of currently present segments per shard",
		}, []string{"strategy", "class_name", "shard_name", "path"}),
		LSMBloomFilters: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "lsm_bloom_filters_duration_ms",
			Help: "Duration of bloom filter operations",
		}, []string{"operation", "strategy", "class_name", "shard_name"}),
		LSMSegmentObjects: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "lsm_segment_objects",
			Help: "Number of objects/entries of segment by level",
		}, []string{"strategy", "class_name", "shard_name", "path", "level"}),
		LSMSegmentSize: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "lsm_segment_size",
			Help: "Size of segment by level and unit",
		}, []string{"strategy", "class_name", "shard_name", "path", "level", "unit"}),
		LSMSegmentCountByLevel: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "lsm_segment_count",
			Help: "Number of segments by level",
		}, []string{"strategy", "class_name", "shard_name", "path", "level"}),
		LSMMemtableSize: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "lsm_memtable_size",
			Help: "Size of memtable by path",
		}, []string{"strategy", "class_name", "shard_name", "path"}),
		LSMMemtableDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "lsm_memtable_durations_ms",
			Help: "Time in ms for a bucket operation to complete",
		}, []string{"strategy", "class_name", "shard_name", "path", "operation"}),

		// Vector index metrics
		VectorIndexTombstones: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "vector_index_tombstones",
			Help: "Number of active vector index tombstones",
		}, []string{"class_name", "shard_name"}),
		VectorIndexTombstoneCleanupThreads: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "vector_index_tombstone_cleanup_threads",
			Help: "Number of threads in use to clean up tombstones",
		}, []string{"class_name", "shard_name"}),
		VectorIndexTombstoneCleanedCount: promauto.NewCounterVec(prometheus.CounterOpts{
			Name: "vector_index_tombstone_cleaned",
			Help: "Total number of deleted objects that have been cleaned up",
		}, []string{"class_name", "shard_name"}),
		VectorIndexOperations: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "vector_index_operations",
			Help: "Total number of mutating operations on the vector index",
		}, []string{"operation", "class_name", "shard_name"}),
		VectorIndexSize: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "vector_index_size",
			Help: "The size of the vector index. Typically larger than number of vectors, as it grows proactively.",
		}, []string{"class_name", "shard_name"}),
		VectorIndexMaintenanceDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "vector_index_maintenance_durations_ms",
			Help: "Duration of a sync or async vector index maintenance operation",
		}, []string{"operation", "class_name", "shard_name"}),
		VectorIndexDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "vector_index_durations_ms",
			Help: "Duration of typical vector index operations (insert, delete)",
		}, []string{"operation", "step", "class_name", "shard_name"}),
		VectorDimensionsSum: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "vector_dimensions_sum",
			Help: "Total dimensions in a shard",
		}, []string{"class_name", "shard_name"}),
		VectorSegmentsSum: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "vector_segments_sum",
			Help: "Total segments in a shard if quantization enabled",
		}, []string{"class_name", "shard_name"}),

		// Startup metrics
		StartupProgress: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "startup_progress",
			Help: "A ratio (percentage) of startup progress for a particular component in a shard",
		}, []string{"operation", "class_name", "shard_name"}),
		StartupDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "startup_durations_ms",
			Help: "Duration of individual startup operations in ms",
		}, []string{"operation", "class_name", "shard_name"}),
		StartupDiskIO: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "startup_diskio_throughput",
			Help: "Disk I/O throughput in bytes per second",
		}, []string{"operation", "class_name", "shard_name"}),

		// Query dimension metrics
		QueryDimensions: promauto.NewCounterVec(prometheus.CounterOpts{
			Name: "query_dimensions_total",
			Help: "The vector dimensions used by any read-query that involves vectors",
		}, []string{"query_type", "operation", "class_name"}),
		QueryDimensionsCombined: promauto.NewCounter(prometheus.CounterOpts{
			Name: "query_dimensions_combined_total",
			Help: "The vector dimensions used by any read-query that involves vectors, aggregated across all classes and shards. The sum of all labels for query_dimensions_total should always match this labelless metric",
		}),

		// Backup/restore metrics
		BackupRestoreDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "backup_restore_ms",
			Help: "Duration of a backup restore",
		}, []string{"backend_name", "class_name"}),
		BackupRestoreClassDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "backup_restore_class_ms",
			Help: "Duration restoring class",
		}, []string{"class_name"}),
		BackupRestoreBackupInitDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "backup_restore_init_ms",
			Help: "startup phase of a backup restore",
		}, []string{"backend_name", "class_name"}),
		BackupRestoreFromStorageDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "backup_restore_from_backend_ms",
			Help: "file transfer stage of a backup restore",
		}, []string{"backend_name", "class_name"}),
		BackupStoreDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "backup_store_to_backend_ms",
			Help: "file transfer stage of a backup store",
		}, []string{"backend_name", "class_name"}),
		BucketPauseDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{
			Name: "bucket_pause_durations_ms",
			Help: "bucket pause durations",
		}, []string{"bucket_dir"}),
		BackupRestoreDataTransferred: promauto.NewCounterVec(prometheus.CounterOpts{
			Name: "backup_restore_data_transferred",
			Help: "Total number of bytes transferred during a backup restore",
		}, []string{"backend_name", "class_name"}),
		BackupStoreDataTransferred: promauto.NewCounterVec(prometheus.CounterOpts{
			Name: "backup_store_data_transferred",
			Help: "Total number of bytes transferred during a backup store",
		}, []string{"backend_name", "class_name"}),

		// Shard metrics
		ShardsLoaded: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "shards_loaded",
			Help: "Number of shards loaded",
		}, []string{"class_name"}),
		ShardsUnloaded: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "shards_unloaded",
			Help: "Number of shards not loaded",
		}, []string{"class_name"}),
		ShardsLoading: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "shards_loading",
			Help: "Number of shards in process of loading",
		}, []string{"class_name"}),
		ShardsUnloading: promauto.NewGaugeVec(prometheus.GaugeOpts{
			Name: "shards_unloading",
			Help: "Number of shards in process of unloading",
		}, []string{"class_name"}),
	}
}
// OnceUponATimer pairs a Prometheus timer with a sync.Once so that the
// timer's duration can be observed at most one time through
// ObserveDurationOnce.
type OnceUponATimer struct {
	// Once guards the single observation; it is embedded, so its Do
	// method is promoted onto OnceUponATimer.
	sync.Once

	// Timer is the wrapped Prometheus timer whose duration is observed.
	Timer *prometheus.Timer
}
// NewOnceTimer wraps promTimer in a freshly allocated OnceUponATimer whose
// embedded sync.Once has not fired yet.
func NewOnceTimer(promTimer *prometheus.Timer) *OnceUponATimer {
	return &OnceUponATimer{Timer: promTimer}
}
// ObserveDurationOnce calls ObserveDuration on the wrapped timer the first
// time it is invoked; every later call on the same OnceUponATimer is a
// no-op.
func (o *OnceUponATimer) ObserveDurationOnce() {
	// ObserveDuration returns the measured duration, so it cannot be passed
	// to Do directly; the closure discards the result.
	observe := func() {
		o.Timer.ObserveDuration()
	}
	o.Once.Do(observe)
}