Blackbox exporter alerting rules (each entry shows the rule, then its state, error, last evaluation, and evaluation time):

alert: BlackboxProbeFailed
expr: probe_success == 0
for: 3m
labels:
  severity: critical
annotations:
  description: |-
    Probe failed
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe failed (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.63s ago | Evaluation time: 454µs

alert: BlackboxConfigurationReloadFailure
expr: blackbox_exporter_config_last_reload_successful != 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox configuration reload failure
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.629s ago | Evaluation time: 105.1µs

alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox probe took more than 1s to complete
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox slow probe (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.629s ago | Evaluation time: 623.1µs

alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 3m
labels:
  severity: critical
annotations:
  description: |-
    HTTP status code is not 200-399
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.629s ago | Evaluation time: 439.6µs

alert: BlackboxSslCertificateWillExpireSoon
expr: 3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    SSL certificate expires in less than 20 days
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.628s ago | Evaluation time: 386.9µs

alert: BlackboxSslCertificateWillExpireSoon
expr: 0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    SSL certificate expires in less than 3 days
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.628s ago | Evaluation time: 325.9µs

alert: BlackboxSslCertificateExpired
expr: round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    SSL certificate has expired already
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.628s ago | Evaluation time: 265.4µs

alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    HTTP request took more than 1s
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.628s ago | Evaluation time: 619.3µs

alert: BlackboxProbeSlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox ping took more than 1s
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe slow ping (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.627s ago | Evaluation time: 82.93µs

Rule group evaluation: 2.072s ago (total 3.617ms).

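This listing is a snapshot of the Prometheus rules page; in the configuration, each group lives in an ordinary rule file referenced from prometheus.yml. A minimal sketch of that wiring, using the first rule above — the file path rules/blackbox.yml and group name blackbox-exporter are assumptions, not taken from the snapshot:

```yaml
# prometheus.yml (excerpt) -- the rule file path is an assumption
rule_files:
  - "rules/blackbox.yml"
```

```yaml
# rules/blackbox.yml -- group wrapper (group name assumed); rule copied from the listing above
groups:
  - name: blackbox-exporter
    rules:
      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          description: |-
            Probe failed
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
          summary: Blackbox probe failed (instance {{ $labels.instance }})
```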
Prometheus self-monitoring alerting rules (each entry shows the rule, then its state, error, last evaluation, and evaluation time):

alert: PrometheusJobMissing
expr: absent(up{job="prometheus"})
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    A Prometheus job has disappeared
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus job missing (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.072s ago | Evaluation time: 275.2µs

alert: PrometheusTargetMissing
expr: up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    A Prometheus target has disappeared. An exporter might have crashed.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target missing (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.072s ago | Evaluation time: 428.6µs

alert: PrometheusAllTargetsMissing
expr: sum by (job) (up) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    A Prometheus job no longer has any living targets.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus all targets missing (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.071s ago | Evaluation time: 303.9µs

alert: PrometheusTargetMissingWithWarmupTime
expr: sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600))
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Allow a job time to start up (10 minutes) before alerting that it's down.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.071s ago | Evaluation time: 691.2µs

alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus configuration reload error
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 85.76µs

alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus too many restarts (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 156.4µs

alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 114.7µs

alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} template text expansion failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 83.58µs

alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus rule evaluation took longer than the scheduled interval. This indicates slow storage backend access or an overly complex query.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 354.4µs

alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    The Prometheus notification queue has not been empty for 10 minutes
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus notifications backlog (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 169.5µs

alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Alertmanager is failing to send notifications
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 70.46µs

alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus has no targets in service discovery
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target empty (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 156.7µs

alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus is scraping exporters slowly: the actual scrape interval exceeds the configured one. The Prometheus server may be under-provisioned.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target scraping slow (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 141.2µs

alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many scrapes that exceed the sample limit
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus large scrape (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 78.82µs

alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many samples rejected due to duplicate timestamps but different values
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 56.26µs

alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} checkpoint creation failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 52.44µs

alert: PrometheusTsdbCheckpointDeletionFailures
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} checkpoint deletion failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 56.14µs

alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB compaction failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 51.51µs

alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB head truncation failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 76.1µs

alert: PrometheusTsdbReloadFailures
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB reload failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 59.06µs

alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB WAL corruptions
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 53.06µs

alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB WAL truncation failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 52.84µs

Rule group evaluation: 13.093s ago (total 1.083ms).

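The PrometheusConfigurationReloadFailure and PrometheusRuleEvaluationFailures rules above fire after a bad rule or configuration file has been loaded; the same problems can be caught beforehand with promtool. A minimal sketch — the file paths are examples, and the HTTP reload endpoint only works when Prometheus runs with --web.enable-lifecycle (otherwise send SIGHUP to the process):

```bash
# Validate rule files and the main configuration before reloading (paths are examples)
promtool check rules /etc/prometheus/rules/*.yml
promtool check config /etc/prometheus/prometheus.yml

# Trigger a live reload; requires --web.enable-lifecycle
curl -X POST http://localhost:9090/-/reload
```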
MySQL (mysqld exporter) alerting rules (each entry shows the rule, then its state, error, last evaluation, and evaluation time):

alert: MysqlDown
expr: mysql_up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL instance is down on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL down (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.743s ago | Evaluation time: 392.9µs

alert: MysqlTooManyConnections(>80%)
expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    More than 80% of MySQL connections are in use on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 218.9µs

alert: MysqlHighPreparedStatementsUtilization(>80%)
expr: max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    High utilization of prepared statements (>80%) on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 194.6µs

alert: MysqlHighThreadsRunning
expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    More than 60% of MySQL connections are in running state on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL high threads running (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 174.7µs

alert: MysqlSlaveIoThreadNotRunning
expr: (mysql_slave_status_slave_io_running and on (instance) mysql_slave_status_master_server_id > 0) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL Slave IO thread not running on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 161.1µs

alert: MysqlSlaveSqlThreadNotRunning
expr: (mysql_slave_status_slave_sql_running and on (instance) mysql_slave_status_master_server_id > 0) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL Slave SQL thread not running on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 192µs

alert: MysqlSlaveReplicationLag
expr: ((mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and on (instance) mysql_slave_status_master_server_id > 0) > 30
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL replication lag on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL Slave replication lag (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 232.2µs

alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[1m]) > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL server has new slow queries on {{ $labels.instance }}.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL slow queries (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 117.3µs

alert: MysqlInnodbLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    MySQL InnoDB log writes are stalling
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 133.3µs

alert: MysqlRestarted
expr: mysql_global_status_uptime < 60
for: 1m
labels:
  severity: info
annotations:
  description: |-
    MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL restarted (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 99.09µs

alert: MysqlHighQps
expr: irate(mysql_global_status_questions[1m]) > 10000
for: 2m
labels:
  severity: info
annotations:
  description: |-
    MySQL is under unusually high load (> 10k QPS).
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL High QPS (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 117.8µs

alert: MysqlTooManyOpenFiles
expr: mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL has too many open files; consider increasing the open_files_limit variable on {{ $labels.instance }}.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL too many open files (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 222.1µs

alert: MysqlInnodbForceRecoveryIsEnabled
expr: mysql_global_variables_innodb_force_recovery != 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL InnoDB force recovery is enabled on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 109.3µs

alert: MysqlInnodbHistory_lenTooLong
expr: mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL history_len (undo log) too long on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 64.93µs

Rule group evaluation: 6.833s ago (total 20.32ms).

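Every rule in these groups attaches a severity label (info, warning, or critical), which is the natural key for Alertmanager routing. A minimal routing sketch — the receiver names mail and pager are assumptions, not part of the snapshot:

```yaml
# alertmanager.yml (excerpt) -- route on the severity label set by the rules above
route:
  receiver: mail            # default receiver (assumed name)
  routes:
    - matchers:
        - severity="critical"
      receiver: pager       # assumed paging receiver for critical alerts
receivers:
  - name: mail
  - name: pager
```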
Host (node exporter) alerting rules (each entry shows the rule, then its state, error, last evaluation, and evaluation time):

alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Node memory is filling up (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.833s ago | Evaluation time: 687.8µs

alert: HostMemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[5m]) > 1000)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    The node is under heavy memory pressure, with a high rate of major page faults (memory pages loaded from disk).
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host memory under memory pressure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.832s ago | Evaluation time: 297µs

alert: HostMemoryIsUnderutilized
expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * 0.8
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Node memory usage has been below 20% for one week. Consider reducing the allocated memory. (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Memory is underutilized (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.832s ago | Evaluation time: 7.866ms

alert: HostUnusualNetworkThroughputIn
expr: ((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host receive bandwidth is high (>80%).
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual network throughput in (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.824s ago | Evaluation time: 426.5µs

alert: HostUnusualNetworkThroughputOut
expr: ((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host transmit bandwidth is high (>80%).
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual network throughput out (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.824s ago | Evaluation time: 289.8µs

alert: HostUnusualDiskReadRate
expr: (rate(node_disk_io_time_seconds_total[5m]) > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Disk is too busy (IO wait > 80%)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk read rate (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.824s ago | Evaluation time: 242.5µs

alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < 0.1 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Disk is almost full (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host out of disk space (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.824s ago | Evaluation time: 911.7µs

alert: HostDiskMayFillIn24Hours
expr: predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Filesystem will likely run out of space within the next 24 hours.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.823s ago | Evaluation time: 2.033ms

alert: HostOutOfInodes
expr: (node_filesystem_files_free / node_filesystem_files < 0.1 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Disk is almost running out of available inodes (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host out of inodes (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.821s ago | Evaluation time: 675.5µs

alert: HostFilesystemDeviceError
expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Error stat-ing the {{ $labels.mountpoint }} filesystem
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host filesystem device error (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.82s ago | Evaluation time: 236.1µs

alert: HostInodesMayFillIn24Hours
expr: predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Filesystem will likely run out of inodes within the next 24 hours at current write rate
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.82s ago | Evaluation time: 869µs

alert: HostUnusualDiskReadLatency
expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (read operations > 100ms)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk read latency (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.819s ago | Evaluation time: 439.2µs

alert: HostUnusualDiskWriteLatency
expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (write operations > 100ms)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk write latency (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.819s ago | Evaluation time: 436.5µs

alert: HostHighCpuLoad
expr: 1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.8
for: 10m
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 80%
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host high CPU load (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.819s ago | Evaluation time: 610.3µs

alert: HostCpuStealNoisyNeighbor
expr: avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    CPU steal is > 10%. A noisy neighbor is degrading VM performance, or a spot instance may be out of CPU credits.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.818s ago | Evaluation time: 588.4µs

alert: HostCpuHighIowait
expr: avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    CPU iowait > 10%. The CPU is idle while waiting for storage to respond.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host CPU high iowait (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.818s ago | Evaluation time: 451.4µs

alert: HostUnusualDiskIo
expr: rate(node_disk_io_time_seconds_total[5m]) > 0.8
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Disk I/O usage is > 80%. Check storage for issues or increase IOPS capacity.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk IO (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 148.5µs

alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Swap is filling up (>80%)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host swap is filling up (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 335.1µs

alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > node_hwmon_temp_max_celsius
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Physical hardware component too hot
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host physical component too hot (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 63.7µs

alert: HostNodeOvertemperatureAlarm
expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Physical node temperature alarm triggered
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host node overtemperature alarm (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 79.46µs

alert: HostSoftwareRaidInsufficientDrives
expr: ((node_md_disks_required - on (device, instance) node_md_disks{state="active"}) > 0)
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 76µs

alert: HostSoftwareRaidDiskFailure
expr: (node_md_disks{state="failed"} > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host software RAID disk failure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 50.96µs

alert: HostKernelVersionDeviations
expr: changes(node_uname_info[1h]) > 0
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Kernel version for {{ $labels.instance }} has changed.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host kernel version deviations (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 306.6µs

alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    OOM kill detected
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host OOM kill detected (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 162.1µs

alert: HostEdacCorrectableErrorsDetected
expr: (increase(node_edac_correctable_errors_total[1m]) > 0)
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 68.31µs

alert: HostEdacUncorrectableErrorsDetected
expr: (node_edac_uncorrectable_errors_total > 0)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 49.05µs

alert: HostNetworkReceiveErrors
expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Receive Errors (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 339.8µs

alert: HostNetworkTransmitErrors
expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Transmit Errors (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.816s ago | Evaluation time: 325.7µs

alert: HostNetworkBondDegraded
expr: ((node_bonding_active - node_bonding_slaves) != 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Bond Degraded (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.816s ago | Evaluation time: 81.71µs

alert: HostConntrackLimit
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    The number of conntrack entries is approaching the limit
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host conntrack limit (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.816s ago | Evaluation time: 312.9µs

alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
for: 10m
labels:
  severity: warning
annotations:
  description: |-
    Clock skew detected: the clock is out of sync. Ensure NTP is configured correctly on this host.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host clock skew (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.816s ago | Evaluation time: 505µs

alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Clock is not synchronising. Ensure NTP is configured on this host.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host clock not synchronising (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.815s ago | Evaluation time: 298.3µs
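
The PromQL behind rules like HostOutOfMemory can be unit-tested offline with promtool test rules, which evaluates expressions against synthetic series. A minimal sketch — the file names, instance label, and sample values below are made up for illustration, and rules/host.yml is assumed to contain the host rules listed above:

```yaml
# host-alerts-test.yml -- run with: promtool test rules host-alerts-test.yml
rule_files:
  - rules/host.yml          # assumed location of the host rules

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # 0.5 GB available out of 10 GB total -> 5% free, below the 10% threshold of HostOutOfMemory
      - series: node_memory_MemAvailable_bytes{instance="host1"}
        values: "500000000+0x10"
      - series: node_memory_MemTotal_bytes{instance="host1"}
        values: "10000000000+0x10"
    promql_expr_test:
      - expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
        eval_time: 1m
        exp_samples:
          - labels: '{instance="host1"}'
            value: 0.05
```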