Spaces:
Sleeping
Sleeping
| // _ _ | |
| // __ _____ __ ___ ___ __ _| |_ ___ | |
| // \ \ /\ / / _ \/ _` \ \ / / |/ _` | __/ _ \ | |
| // \ V V / __/ (_| |\ V /| | (_| | || __/ | |
| // \_/\_/ \___|\__,_| \_/ |_|\__,_|\__\___| | |
| // | |
| // Copyright © 2016 - 2024 Weaviate B.V. All rights reserved. | |
| // | |
| // CONTACT: hello@weaviate.io | |
| // | |
| package monitoring | |
| import ( | |
| "sync" | |
| "github.com/prometheus/client_golang/prometheus" | |
| "github.com/prometheus/client_golang/prometheus/promauto" | |
| "github.com/weaviate/weaviate/usecases/config" | |
| ) | |
| type PrometheusMetrics struct { | |
| BatchTime *prometheus.HistogramVec | |
| BatchDeleteTime *prometheus.SummaryVec | |
| ObjectsTime *prometheus.SummaryVec | |
| LSMBloomFilters *prometheus.SummaryVec | |
| AsyncOperations *prometheus.GaugeVec | |
| LSMSegmentCount *prometheus.GaugeVec | |
| LSMSegmentCountByLevel *prometheus.GaugeVec | |
| LSMSegmentObjects *prometheus.GaugeVec | |
| LSMSegmentSize *prometheus.GaugeVec | |
| LSMMemtableSize *prometheus.GaugeVec | |
| LSMMemtableDurations *prometheus.SummaryVec | |
| VectorIndexTombstones *prometheus.GaugeVec | |
| VectorIndexTombstoneCleanupThreads *prometheus.GaugeVec | |
| VectorIndexTombstoneCleanedCount *prometheus.CounterVec | |
| VectorIndexOperations *prometheus.GaugeVec | |
| VectorIndexDurations *prometheus.SummaryVec | |
| VectorIndexSize *prometheus.GaugeVec | |
| VectorIndexMaintenanceDurations *prometheus.SummaryVec | |
| ObjectCount *prometheus.GaugeVec | |
| QueriesCount *prometheus.GaugeVec | |
| RequestsTotal *prometheus.GaugeVec | |
| QueriesDurations *prometheus.HistogramVec | |
| QueriesFilteredVectorDurations *prometheus.SummaryVec | |
| QueryDimensions *prometheus.CounterVec | |
| QueryDimensionsCombined prometheus.Counter | |
| GoroutinesCount *prometheus.GaugeVec | |
| BackupRestoreDurations *prometheus.SummaryVec | |
| BackupStoreDurations *prometheus.SummaryVec | |
| BucketPauseDurations *prometheus.SummaryVec | |
| BackupRestoreClassDurations *prometheus.SummaryVec | |
| BackupRestoreBackupInitDurations *prometheus.SummaryVec | |
| BackupRestoreFromStorageDurations *prometheus.SummaryVec | |
| BackupRestoreDataTransferred *prometheus.CounterVec | |
| BackupStoreDataTransferred *prometheus.CounterVec | |
| VectorDimensionsSum *prometheus.GaugeVec | |
| VectorSegmentsSum *prometheus.GaugeVec | |
| StartupProgress *prometheus.GaugeVec | |
| StartupDurations *prometheus.SummaryVec | |
| StartupDiskIO *prometheus.SummaryVec | |
| ShardsLoaded *prometheus.GaugeVec | |
| ShardsUnloaded *prometheus.GaugeVec | |
| ShardsLoading *prometheus.GaugeVec | |
| ShardsUnloading *prometheus.GaugeVec | |
| Group bool | |
| } | |
| // Delete Shard deletes existing label combinations that match both | |
| // the shard and class name. If a metric is not collected at the shard | |
| // level it is unaffected. This is to make sure that deleting a single | |
| // shard (e.g. multi-tenancy) does not affect metrics for existing | |
| // shards. | |
| // | |
| // In addition, there are some metrics that we explicitly keep, such | |
| // as vector_dimensions_sum as they can be used in billing decisions. | |
| func (pm *PrometheusMetrics) DeleteShard(className, shardName string) error { | |
| if pm == nil { | |
| return nil | |
| } | |
| labels := prometheus.Labels{ | |
| "class_name": className, | |
| "shard_name": shardName, | |
| } | |
| pm.BatchTime.DeletePartialMatch(labels) | |
| pm.BatchDeleteTime.DeletePartialMatch(labels) | |
| pm.ObjectsTime.DeletePartialMatch(labels) | |
| pm.ObjectCount.DeletePartialMatch(labels) | |
| pm.QueriesFilteredVectorDurations.DeletePartialMatch(labels) | |
| pm.AsyncOperations.DeletePartialMatch(labels) | |
| pm.LSMBloomFilters.DeletePartialMatch(labels) | |
| pm.LSMMemtableDurations.DeletePartialMatch(labels) | |
| pm.LSMMemtableSize.DeletePartialMatch(labels) | |
| pm.LSMMemtableDurations.DeletePartialMatch(labels) | |
| pm.LSMSegmentCount.DeletePartialMatch(labels) | |
| pm.LSMSegmentSize.DeletePartialMatch(labels) | |
| pm.LSMSegmentCountByLevel.DeletePartialMatch(labels) | |
| pm.VectorIndexTombstones.DeletePartialMatch(labels) | |
| pm.VectorIndexTombstoneCleanupThreads.DeletePartialMatch(labels) | |
| pm.VectorIndexTombstoneCleanedCount.DeletePartialMatch(labels) | |
| pm.VectorIndexOperations.DeletePartialMatch(labels) | |
| pm.VectorIndexMaintenanceDurations.DeletePartialMatch(labels) | |
| pm.VectorIndexDurations.DeletePartialMatch(labels) | |
| pm.VectorIndexSize.DeletePartialMatch(labels) | |
| pm.StartupProgress.DeletePartialMatch(labels) | |
| pm.StartupDurations.DeletePartialMatch(labels) | |
| pm.StartupDiskIO.DeletePartialMatch(labels) | |
| return nil | |
| } | |
| // DeleteClass deletes all metrics that match the class name, but do | |
| // not have a shard-specific label. See [DeleteShard] for more | |
| // information. | |
| func (pm *PrometheusMetrics) DeleteClass(className string) error { | |
| if pm == nil { | |
| return nil | |
| } | |
| labels := prometheus.Labels{ | |
| "class_name": className, | |
| } | |
| pm.QueriesCount.DeletePartialMatch(labels) | |
| pm.QueriesDurations.DeletePartialMatch(labels) | |
| pm.GoroutinesCount.DeletePartialMatch(labels) | |
| pm.BackupRestoreClassDurations.DeletePartialMatch(labels) | |
| pm.BackupRestoreBackupInitDurations.DeletePartialMatch(labels) | |
| pm.BackupRestoreFromStorageDurations.DeletePartialMatch(labels) | |
| pm.BackupStoreDurations.DeletePartialMatch(labels) | |
| pm.BackupRestoreDataTransferred.DeletePartialMatch(labels) | |
| pm.BackupStoreDataTransferred.DeletePartialMatch(labels) | |
| pm.QueriesFilteredVectorDurations.DeletePartialMatch(labels) | |
| return nil | |
| } | |
| var ( | |
| msBuckets = []float64{10, 50, 100, 500, 1000, 5000} | |
| metrics *PrometheusMetrics = nil | |
| ) | |
| func init() { | |
| metrics = newPrometheusMetrics() | |
| } | |
| func InitConfig(cfg config.Monitoring) { | |
| metrics.Group = cfg.Group | |
| } | |
| func GetMetrics() *PrometheusMetrics { | |
| return metrics | |
| } | |
| func newPrometheusMetrics() *PrometheusMetrics { | |
| return &PrometheusMetrics{ | |
| BatchTime: promauto.NewHistogramVec(prometheus.HistogramOpts{ | |
| Name: "batch_durations_ms", | |
| Help: "Duration in ms of a single batch", | |
| Buckets: msBuckets, | |
| }, []string{"operation", "class_name", "shard_name"}), | |
| BatchDeleteTime: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "batch_delete_durations_ms", | |
| Help: "Duration in ms of a single delete batch", | |
| }, []string{"operation", "class_name", "shard_name"}), | |
| ObjectsTime: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "objects_durations_ms", | |
| Help: "Duration of an individual object operation. Also as part of batches.", | |
| }, []string{"operation", "step", "class_name", "shard_name"}), | |
| ObjectCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "object_count", | |
| Help: "Number of currently ongoing async operations", | |
| }, []string{"class_name", "shard_name"}), | |
| QueriesCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "concurrent_queries_count", | |
| Help: "Number of concurrently running query operations", | |
| }, []string{"class_name", "query_type"}), | |
| RequestsTotal: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "requests_total", | |
| Help: "Number of all requests made", | |
| }, []string{"status", "class_name", "api", "query_type"}), | |
| QueriesDurations: promauto.NewHistogramVec(prometheus.HistogramOpts{ | |
| Name: "queries_durations_ms", | |
| Help: "Duration of queries in milliseconds", | |
| Buckets: msBuckets, | |
| }, []string{"class_name", "query_type"}), | |
| QueriesFilteredVectorDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "queries_filtered_vector_durations_ms", | |
| Help: "Duration of queries in milliseconds", | |
| }, []string{"class_name", "shard_name", "operation"}), | |
| GoroutinesCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "concurrent_goroutines", | |
| Help: "Number of concurrently running goroutines", | |
| }, []string{"class_name", "query_type"}), | |
| AsyncOperations: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "async_operations_running", | |
| Help: "Number of currently ongoing async operations", | |
| }, []string{"operation", "class_name", "shard_name", "path"}), | |
| // LSM metrics | |
| LSMSegmentCount: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "lsm_active_segments", | |
| Help: "Number of currently present segments per shard", | |
| }, []string{"strategy", "class_name", "shard_name", "path"}), | |
| LSMBloomFilters: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "lsm_bloom_filters_duration_ms", | |
| Help: "Duration of bloom filter operations", | |
| }, []string{"operation", "strategy", "class_name", "shard_name"}), | |
| LSMSegmentObjects: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "lsm_segment_objects", | |
| Help: "Number of objects/entries of segment by level", | |
| }, []string{"strategy", "class_name", "shard_name", "path", "level"}), | |
| LSMSegmentSize: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "lsm_segment_size", | |
| Help: "Size of segment by level and unit", | |
| }, []string{"strategy", "class_name", "shard_name", "path", "level", "unit"}), | |
| LSMSegmentCountByLevel: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "lsm_segment_count", | |
| Help: "Number of segments by level", | |
| }, []string{"strategy", "class_name", "shard_name", "path", "level"}), | |
| LSMMemtableSize: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "lsm_memtable_size", | |
| Help: "Size of memtable by path", | |
| }, []string{"strategy", "class_name", "shard_name", "path"}), | |
| LSMMemtableDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "lsm_memtable_durations_ms", | |
| Help: "Time in ms for a bucket operation to complete", | |
| }, []string{"strategy", "class_name", "shard_name", "path", "operation"}), | |
| // Vector index metrics | |
| VectorIndexTombstones: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "vector_index_tombstones", | |
| Help: "Number of active vector index tombstones", | |
| }, []string{"class_name", "shard_name"}), | |
| VectorIndexTombstoneCleanupThreads: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "vector_index_tombstone_cleanup_threads", | |
| Help: "Number of threads in use to clean up tombstones", | |
| }, []string{"class_name", "shard_name"}), | |
| VectorIndexTombstoneCleanedCount: promauto.NewCounterVec(prometheus.CounterOpts{ | |
| Name: "vector_index_tombstone_cleaned", | |
| Help: "Total number of deleted objects that have been cleaned up", | |
| }, []string{"class_name", "shard_name"}), | |
| VectorIndexOperations: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "vector_index_operations", | |
| Help: "Total number of mutating operations on the vector index", | |
| }, []string{"operation", "class_name", "shard_name"}), | |
| VectorIndexSize: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "vector_index_size", | |
| Help: "The size of the vector index. Typically larger than number of vectors, as it grows proactively.", | |
| }, []string{"class_name", "shard_name"}), | |
| VectorIndexMaintenanceDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "vector_index_maintenance_durations_ms", | |
| Help: "Duration of a sync or async vector index maintenance operation", | |
| }, []string{"operation", "class_name", "shard_name"}), | |
| VectorIndexDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "vector_index_durations_ms", | |
| Help: "Duration of typical vector index operations (insert, delete)", | |
| }, []string{"operation", "step", "class_name", "shard_name"}), | |
| VectorDimensionsSum: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "vector_dimensions_sum", | |
| Help: "Total dimensions in a shard", | |
| }, []string{"class_name", "shard_name"}), | |
| VectorSegmentsSum: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "vector_segments_sum", | |
| Help: "Total segments in a shard if quantization enabled", | |
| }, []string{"class_name", "shard_name"}), | |
| // Startup metrics | |
| StartupProgress: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "startup_progress", | |
| Help: "A ratio (percentage) of startup progress for a particular component in a shard", | |
| }, []string{"operation", "class_name", "shard_name"}), | |
| StartupDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "startup_durations_ms", | |
| Help: "Duration of individual startup operations in ms", | |
| }, []string{"operation", "class_name", "shard_name"}), | |
| StartupDiskIO: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "startup_diskio_throughput", | |
| Help: "Disk I/O throuhput in bytes per second", | |
| }, []string{"operation", "class_name", "shard_name"}), | |
| QueryDimensions: promauto.NewCounterVec(prometheus.CounterOpts{ | |
| Name: "query_dimensions_total", | |
| Help: "The vector dimensions used by any read-query that involves vectors", | |
| }, []string{"query_type", "operation", "class_name"}), | |
| QueryDimensionsCombined: promauto.NewCounter(prometheus.CounterOpts{ | |
| Name: "query_dimensions_combined_total", | |
| Help: "The vector dimensions used by any read-query that involves vectors, aggregated across all classes and shards. The sum of all labels for query_dimensions_total should always match this labelless metric", | |
| }), | |
| // Backup/restore metrics | |
| BackupRestoreDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "backup_restore_ms", | |
| Help: "Duration of a backup restore", | |
| }, []string{"backend_name", "class_name"}), | |
| BackupRestoreClassDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "backup_restore_class_ms", | |
| Help: "Duration restoring class", | |
| }, []string{"class_name"}), | |
| BackupRestoreBackupInitDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "backup_restore_init_ms", | |
| Help: "startup phase of a backup restore", | |
| }, []string{"backend_name", "class_name"}), | |
| BackupRestoreFromStorageDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "backup_restore_from_backend_ms", | |
| Help: "file transfer stage of a backup restore", | |
| }, []string{"backend_name", "class_name"}), | |
| BackupStoreDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "backup_store_to_backend_ms", | |
| Help: "file transfer stage of a backup restore", | |
| }, []string{"backend_name", "class_name"}), | |
| BucketPauseDurations: promauto.NewSummaryVec(prometheus.SummaryOpts{ | |
| Name: "bucket_pause_durations_ms", | |
| Help: "bucket pause durations", | |
| }, []string{"bucket_dir"}), | |
| BackupRestoreDataTransferred: promauto.NewCounterVec(prometheus.CounterOpts{ | |
| Name: "backup_restore_data_transferred", | |
| Help: "Total number of bytes transferred during a backup restore", | |
| }, []string{"backend_name", "class_name"}), | |
| BackupStoreDataTransferred: promauto.NewCounterVec(prometheus.CounterOpts{ | |
| Name: "backup_store_data_transferred", | |
| Help: "Total number of bytes transferred during a backup store", | |
| }, []string{"backend_name", "class_name"}), | |
| // Shard metrics | |
| ShardsLoaded: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "shards_loaded", | |
| Help: "Number of shards loaded", | |
| }, []string{"class_name"}), | |
| ShardsUnloaded: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "shards_unloaded", | |
| Help: "Number of shards on not loaded", | |
| }, []string{"class_name"}), | |
| ShardsLoading: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "shards_loading", | |
| Help: "Number of shards in process of loading", | |
| }, []string{"class_name"}), | |
| ShardsUnloading: promauto.NewGaugeVec(prometheus.GaugeOpts{ | |
| Name: "shards_unloading", | |
| Help: "Number of shards in process of unloading", | |
| }, []string{"class_name"}), | |
| } | |
| } | |
| type OnceUponATimer struct { | |
| sync.Once | |
| Timer *prometheus.Timer | |
| } | |
| func NewOnceTimer(promTimer *prometheus.Timer) *OnceUponATimer { | |
| o := OnceUponATimer{} | |
| o.Timer = promTimer | |
| return &o | |
| } | |
| func (o *OnceUponATimer) ObserveDurationOnce() { | |
| o.Do(func() { | |
| o.Timer.ObserveDuration() | |
| }) | |
| } | |