Rules

BakinsFpmExporter (last evaluation: 5.998s ago, evaluation time: 334.9us)

Rule State Error Last Evaluation Evaluation Time
alert: Php-fpmMax-childrenReached expr: sum by (instance) (phpfpm_max_children_reached_total) > 0 for: 1m labels: severity: warning annotations: description: |- PHP-FPM reached max children - {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: PHP-FPM max-children reached (instance {{ $labels.instance }}) ok 5.998s ago 317.2us
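
The UI table flattens each rule onto a single line; the same rule is easier to read in the multi-line form it takes in a rule file on disk. A minimal sketch, assuming a hypothetical file such as /etc/prometheus/rules/php-fpm.yml loaded via rule_files in prometheus.yml (the path is an assumption, and the line breaks inside the description block are reconstructed from the flattened row):

groups:
  - name: BakinsFpmExporter
    rules:
      - alert: Php-fpmMax-childrenReached
        expr: sum by (instance) (phpfpm_max_children_reached_total) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          description: |-
            PHP-FPM reached max children - {{ $labels.instance }}
              VALUE = {{ $value }}
              LABELS = {{ $labels }}
          summary: PHP-FPM max-children reached (instance {{ $labels.instance }})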

BlackboxExporter (last evaluation: 3.63s ago, evaluation time: 3.335ms)

Rule State Error Last Evaluation Evaluation Time
alert: BlackboxProbeFailed expr: probe_success == 0 for: 3m labels: severity: critical annotations: description: |- Probe failed VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox probe failed (instance {{ $labels.instance }}) ok 3.63s ago 454us
alert: BlackboxConfigurationReloadFailure expr: blackbox_exporter_config_last_reload_successful != 1 for: 1m labels: severity: warning annotations: description: |- Blackbox configuration reload failure VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox configuration reload failure (instance {{ $labels.instance }}) ok 3.629s ago 105.1us
alert: BlackboxSlowProbe expr: avg_over_time(probe_duration_seconds[1m]) > 1 for: 1m labels: severity: warning annotations: description: |- Blackbox probe took more than 1s to complete VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox slow probe (instance {{ $labels.instance }}) ok 3.629s ago 623.1us
alert: BlackboxProbeHttpFailure expr: probe_http_status_code <= 199 or probe_http_status_code >= 400 for: 3m labels: severity: critical annotations: description: |- HTTP status code is not 200-399 VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox probe HTTP failure (instance {{ $labels.instance }}) ok 3.629s ago 439.6us
alert: BlackboxSslCertificateWillExpireSoon expr: 3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20 for: 1m labels: severity: warning annotations: description: |- SSL certificate expires in less than 20 days VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) ok 3.628s ago 386.9us
alert: BlackboxSslCertificateWillExpireSoon expr: 0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3 for: 1m labels: severity: critical annotations: description: |- SSL certificate expires in less than 3 days VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }}) ok 3.628s ago 325.9us
alert: BlackboxSslCertificateExpired expr: round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0 for: 1m labels: severity: critical annotations: description: |- SSL certificate has expired already VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox SSL certificate expired (instance {{ $labels.instance }}) ok 3.628s ago 265.4us
alert: BlackboxProbeSlowHttp expr: avg_over_time(probe_http_duration_seconds[1m]) > 1 for: 1m labels: severity: warning annotations: description: |- HTTP request took more than 1s VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox probe slow HTTP (instance {{ $labels.instance }}) ok 3.628s ago 619.3us
alert: BlackboxProbeSlowPing expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1 for: 1m labels: severity: warning annotations: description: |- Blackbox ping took more than 1s VALUE = {{ $value }} LABELS = {{ $labels }} summary: Blackbox probe slow ping (instance {{ $labels.instance }}) ok 3.627s ago 82.93us
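
All of these alerts key off probe_* metrics, which only exist when Prometheus scrapes blackbox_exporter through its /probe endpoint with the probe target passed as a URL parameter. A minimal scrape-job sketch, assuming the exporter runs at localhost:9115 and an http_2xx module is defined in blackbox.yml (address, module name, and target are placeholders):

scrape_configs:
  - job_name: blackbox
    metrics_path: /probe
    params:
      module: [http_2xx]                # assumed module name from blackbox.yml
    static_configs:
      - targets:
          - https://example.org         # placeholder probe target
    relabel_configs:
      - source_labels: [__address__]    # the listed target becomes the ?target= parameter...
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance          # ...and the instance label used in the summaries above
      - target_label: __address__
        replacement: localhost:9115     # ...while the exporter itself is what gets scraped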

EmbeddedExporter (last evaluation: 2.072s ago, evaluation time: 3.617ms)

Rule State Error Last Evaluation Evaluation Time
alert: PrometheusJobMissing expr: absent(up{job="prometheus"}) for: 1m labels: severity: warning annotations: description: |- A Prometheus job has disappeared VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus job missing (instance {{ $labels.instance }}) ok 2.072s ago 275.2us
alert: PrometheusTargetMissing expr: up == 0 for: 1m labels: severity: critical annotations: description: |- A Prometheus target has disappeared. An exporter might have crashed. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus target missing (instance {{ $labels.instance }}) ok 2.072s ago 428.6us
alert: PrometheusAllTargetsMissing expr: sum by (job) (up) == 0 for: 1m labels: severity: critical annotations: description: |- A Prometheus job no longer has any living targets. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus all targets missing (instance {{ $labels.instance }}) ok 2.071s ago 303.9us
alert: PrometheusTargetMissingWithWarmupTime expr: sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600)) for: 1m labels: severity: critical annotations: description: |- Allow a job time to start up (10 minutes) before alerting that it's down. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus target missing with warmup time (instance {{ $labels.instance }}) ok 2.071s ago 691.2us
alert: PrometheusConfigurationReloadFailure expr: prometheus_config_last_reload_successful != 1 for: 1m labels: severity: warning annotations: description: |- Prometheus configuration reload error VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus configuration reload failure (instance {{ $labels.instance }}) ok 2.07s ago 85.76us
alert: PrometheusTooManyRestarts expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2 for: 1m labels: severity: warning annotations: description: |- Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus too many restarts (instance {{ $labels.instance }}) ok 2.07s ago 156.4us
alert: PrometheusRuleEvaluationFailures expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus rule evaluation failures (instance {{ $labels.instance }}) ok 2.07s ago 114.7us
alert: PrometheusTemplateTextExpansionFailures expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} template text expansion failures VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus template text expansion failures (instance {{ $labels.instance }}) ok 2.07s ago 83.58us
alert: PrometheusRuleEvaluationSlow expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds for: 5m labels: severity: warning annotations: description: |- Prometheus rule evaluation took longer than the scheduled interval. This indicates slow storage backend access or an overly complex query. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus rule evaluation slow (instance {{ $labels.instance }}) ok 2.07s ago 354.4us
alert: PrometheusNotificationsBacklog expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0 for: 1m labels: severity: warning annotations: description: |- The Prometheus notification queue has not been empty for 10 minutes VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus notifications backlog (instance {{ $labels.instance }}) ok 2.07s ago 169.5us
alert: PrometheusAlertmanagerNotificationFailing expr: rate(alertmanager_notifications_failed_total[1m]) > 0 for: 1m labels: severity: critical annotations: description: |- Alertmanager is failing to send notifications VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }}) ok 2.07s ago 70.46us
alert: PrometheusTargetEmpty expr: prometheus_sd_discovered_targets == 0 for: 1m labels: severity: critical annotations: description: |- Prometheus has no target in service discovery VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus target empty (instance {{ $labels.instance }}) ok 2.07s ago 156.7us
alert: PrometheusTargetScrapingSlow expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05 for: 5m labels: severity: warning annotations: description: |- Prometheus is scraping targets slowly: the actual scrape interval exceeds the configured one. Your Prometheus server may be under-provisioned. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus target scraping slow (instance {{ $labels.instance }}) ok 2.07s ago 141.2us
alert: PrometheusLargeScrape expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10 for: 5m labels: severity: warning annotations: description: |- Prometheus has many scrapes that exceed the sample limit VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus large scrape (instance {{ $labels.instance }}) ok 2.07s ago 78.82us
alert: PrometheusTargetScrapeDuplicate expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0 for: 1m labels: severity: warning annotations: description: |- Prometheus has many samples rejected due to duplicate timestamps but different values VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus target scrape duplicate (instance {{ $labels.instance }}) ok 2.07s ago 56.26us
alert: PrometheusTsdbCheckpointCreationFailures expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} checkpoint creation failures VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }}) ok 2.07s ago 52.44us
alert: PrometheusTsdbCheckpointDeletionFailures expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} checkpoint deletion failures VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }}) ok 2.07s ago 56.14us
alert: PrometheusTsdbCompactionsFailed expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} TSDB compactions failures VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }}) ok 2.07s ago 51.51us
alert: PrometheusTsdbHeadTruncationsFailed expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} TSDB head truncation failures VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }}) ok 2.07s ago 76.1us
alert: PrometheusTsdbReloadFailures expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} TSDB reload failures VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus TSDB reload failures (instance {{ $labels.instance }}) ok 2.07s ago 59.06us
alert: PrometheusTsdbWalCorruptions expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} TSDB WAL corruptions VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }}) ok 2.07s ago 53.06us
alert: PrometheusTsdbWalTruncationsFailed expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0 for: 1m labels: severity: critical annotations: description: |- Prometheus encountered {{ $value }} TSDB WAL truncation failures VALUE = {{ $value }} LABELS = {{ $labels }} summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }}) ok 2.07s ago 52.84us
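
Rules like these can be verified offline before deployment with promtool's rule unit tests. A minimal sketch using a promql_expr_test against the PrometheusTargetMissing expression (the rule file name and input series values are assumptions):

# tests.yml — run with: promtool test rules tests.yml
rule_files:
  - embedded-exporter.yml   # assumed file containing the group above
evaluation_interval: 1m
tests:
  - interval: 1m
    input_series:
      - series: 'up{job="prometheus", instance="localhost:9090"}'
        values: '1 1 0 0 0'   # target disappears after two minutes
    promql_expr_test:
      - expr: up == 0
        eval_time: 4m
        exp_samples:
          - labels: 'up{job="prometheus", instance="localhost:9090"}'
            value: 0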

KnyarNginxExporter (last evaluation: 13.093s ago, evaluation time: 1.083ms)

Rule State Error Last Evaluation Evaluation Time
alert: NginxHighHttp4xxErrorRate expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 for: 1m labels: severity: critical annotations: description: |- Too many HTTP requests with status 4xx (> 5%) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }}) ok 13.093s ago 576.3us
alert: NginxHighHttp5xxErrorRate expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5 for: 1m labels: severity: critical annotations: description: |- Too many HTTP requests with status 5xx (> 5%) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }}) ok 13.093s ago 344.5us
alert: NginxLatencyHigh expr: histogram_quantile(0.99, sum by (host, node, le) (rate(nginx_http_request_duration_seconds_bucket[2m]))) > 3 for: 2m labels: severity: warning annotations: description: |- Nginx p99 latency is higher than 3 seconds VALUE = {{ $value }} LABELS = {{ $labels }} summary: Nginx latency high (instance {{ $labels.instance }}) ok 13.093s ago 123.6us
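
The 4xx and 5xx alerts each evaluate the total-traffic denominator separately; the ratios could instead be precomputed once as recording rules and the alerts pointed at the recorded series. A sketch, with hypothetical record names following the level:metric:operations convention:

groups:
  - name: nginx-recording   # hypothetical group name
    rules:
      - record: job:nginx_http_4xx:ratio_rate1m
        expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m]))
      - record: job:nginx_http_5xx:ratio_rate1m
        expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m]))

The alert expressions would then reduce to, e.g., job:nginx_http_4xx:ratio_rate1m * 100 > 5.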

MysqldExporter (last evaluation: 8.742s ago, evaluation time: 2.47ms)

Rule State Error Last Evaluation Evaluation Time
alert: MysqlDown expr: mysql_up == 0 for: 1m labels: severity: critical annotations: description: |- MySQL instance is down on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL down (instance {{ $labels.instance }}) ok 8.743s ago 392.9us
alert: MysqlTooManyConnections(>80%) expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80 for: 2m labels: severity: warning annotations: description: |- More than 80% of MySQL connections are in use on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }}) ok 8.742s ago 218.9us
alert: MysqlHighPreparedStatementsUtilization(>80%) expr: max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 for: 2m labels: severity: warning annotations: description: |- High utilization of prepared statements (>80%) on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }}) ok 8.742s ago 194.6us
alert: MysqlHighThreadsRunning expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60 for: 2m labels: severity: warning annotations: description: |- More than 60% of MySQL connections are in running state on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL high threads running (instance {{ $labels.instance }}) ok 8.742s ago 174.7us
alert: MysqlSlaveIoThreadNotRunning expr: (mysql_slave_status_slave_io_running and on (instance) mysql_slave_status_master_server_id > 0) == 0 for: 1m labels: severity: critical annotations: description: |- MySQL Slave IO thread not running on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL Slave IO thread not running (instance {{ $labels.instance }}) ok 8.742s ago 161.1us
alert: MysqlSlaveSqlThreadNotRunning expr: (mysql_slave_status_slave_sql_running and on (instance) mysql_slave_status_master_server_id > 0) == 0 for: 1m labels: severity: critical annotations: description: |- MySQL Slave SQL thread not running on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }}) ok 8.742s ago 192us
alert: MysqlSlaveReplicationLag expr: ((mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and on (instance) mysql_slave_status_master_server_id > 0) > 30 for: 1m labels: severity: critical annotations: description: |- MySQL replication lag on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL Slave replication lag (instance {{ $labels.instance }}) ok 8.742s ago 232.2us
alert: MysqlSlowQueries expr: increase(mysql_global_status_slow_queries[1m]) > 0 for: 2m labels: severity: warning annotations: description: |- The MySQL server has new slow queries. VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL slow queries (instance {{ $labels.instance }}) ok 8.742s ago 117.3us
alert: MysqlInnodbLogWaits expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10 for: 1m labels: severity: warning annotations: description: |- MySQL InnoDB log writes are stalling VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL InnoDB log waits (instance {{ $labels.instance }}) ok 8.741s ago 133.3us
alert: MysqlRestarted expr: mysql_global_status_uptime < 60 for: 1m labels: severity: info annotations: description: |- MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}. VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL restarted (instance {{ $labels.instance }}) ok 8.741s ago 99.09us
alert: MysqlHighQps expr: irate(mysql_global_status_questions[1m]) > 10000 for: 2m labels: severity: info annotations: description: |- MySQL is handling an unusually high query load (> 10k QPS). VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL High QPS (instance {{ $labels.instance }}) ok 8.741s ago 117.8us
alert: MysqlTooManyOpenFiles expr: mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75 for: 2m labels: severity: warning annotations: description: |- MySQL has too many open files; consider increasing the open_files_limit variable on {{ $labels.instance }}. VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL too many open files (instance {{ $labels.instance }}) ok 8.741s ago 222.1us
alert: MysqlInnodbForceRecoveryIsEnabled expr: mysql_global_variables_innodb_force_recovery != 0 for: 2m labels: severity: warning annotations: description: |- MySQL InnoDB force recovery is enabled on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }}) ok 8.741s ago 109.3us
alert: MysqlInnodbHistory_lenTooLong expr: mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000 for: 2m labels: severity: warning annotations: description: |- MySQL history_len (undo log) too long on {{ $labels.instance }} VALUE = {{ $value }} LABELS = {{ $labels }} summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }}) ok 8.741s ago 64.93us
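
Every rule in these groups carries a severity label (info, warning, or critical), which is what Alertmanager would typically route on. A minimal routing sketch; the receiver names are placeholders and would need real notification configs attached:

route:
  receiver: default
  routes:
    - matchers: ['severity="critical"']
      receiver: pager
    - matchers: ['severity="warning"']
      receiver: chat
    - matchers: ['severity="info"']
      receiver: ticket

receivers:        # placeholders — attach email/webhook/etc. configs as needed
  - name: default
  - name: pager
  - name: chat
  - name: ticket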

NodeExporter (last evaluation: 6.833s ago, evaluation time: 20.32ms)

Rule State Error Last Evaluation Evaluation Time
alert: HostOutOfMemory expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1) for: 2m labels: severity: warning annotations: description: |- Node memory is filling up (< 10% left) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host out of memory (instance {{ $labels.instance }}) ok 6.833s ago 687.8us
alert: HostMemoryUnderMemoryPressure expr: (rate(node_vmstat_pgmajfault[5m]) > 1000) for: 1m labels: severity: warning annotations: description: |- The node is under heavy memory pressure: a high rate of major page faults indicates heavy paging from disk. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host memory under memory pressure (instance {{ $labels.instance }}) ok 6.832s ago 297us
alert: HostMemoryIsUnderutilized expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * 0.8 for: 1m labels: severity: info annotations: description: |- Node memory usage has been < 20% for one week. Consider reducing the allocated memory. (instance {{ $labels.instance }}) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host Memory is underutilized (instance {{ $labels.instance }}) ok 6.832s ago 7.866ms
alert: HostUnusualNetworkThroughputIn expr: ((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > 0.8) for: 1m labels: severity: warning annotations: description: |- Host receive bandwidth is high (>80%). VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host unusual network throughput in (instance {{ $labels.instance }}) ok 6.824s ago 426.5us
alert: HostUnusualNetworkThroughputOut expr: ((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > 0.8) for: 1m labels: severity: warning annotations: description: |- Host transmit bandwidth is high (>80%). VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host unusual network throughput out (instance {{ $labels.instance }}) ok 6.824s ago 289.8us
alert: HostUnusualDiskReadRate expr: (rate(node_disk_io_time_seconds_total[5m]) > 0.8) for: 1m labels: severity: warning annotations: description: |- Disk is too busy (I/O time > 80%) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host unusual disk read rate (instance {{ $labels.instance }}) ok 6.824s ago 242.5us
alert: HostOutOfDiskSpace expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < 0.1 and on (instance, device, mountpoint) node_filesystem_readonly == 0) for: 2m labels: severity: critical annotations: description: |- Disk is almost full (< 10% left) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host out of disk space (instance {{ $labels.instance }}) ok 6.824s ago 911.7us
alert: HostDiskMayFillIn24Hours expr: predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0 for: 2m labels: severity: warning annotations: description: |- Filesystem will likely run out of space within the next 24 hours. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host disk may fill in 24 hours (instance {{ $labels.instance }}) ok 6.823s ago 2.033ms
alert: HostOutOfInodes expr: (node_filesystem_files_free / node_filesystem_files < 0.1 and on (instance, device, mountpoint) node_filesystem_readonly == 0) for: 2m labels: severity: critical annotations: description: |- Disk is almost running out of available inodes (< 10% left) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host out of inodes (instance {{ $labels.instance }}) ok 6.821s ago 675.5us
alert: HostFilesystemDeviceError expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1 for: 2m labels: severity: critical annotations: description: |- Error stat-ing the {{ $labels.mountpoint }} filesystem VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host filesystem device error (instance {{ $labels.instance }}) ok 6.82s ago 236.1us
alert: HostInodesMayFillIn24Hours expr: predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0 for: 2m labels: severity: warning annotations: description: |- Filesystem will likely run out of inodes within the next 24 hours at current write rate VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }}) ok 6.82s ago 869us
alert: HostUnusualDiskReadLatency expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0) for: 2m labels: severity: warning annotations: description: |- Disk latency is growing (read operations > 100ms) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host unusual disk read latency (instance {{ $labels.instance }}) ok 6.819s ago 439.2us
alert: HostUnusualDiskWriteLatency expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0) for: 2m labels: severity: warning annotations: description: |- Disk latency is growing (write operations > 100ms) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host unusual disk write latency (instance {{ $labels.instance }}) ok 6.819s ago 436.5us
alert: HostHighCpuLoad expr: 1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.8 for: 10m labels: severity: warning annotations: description: |- CPU load is > 80% VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host high CPU load (instance {{ $labels.instance }}) ok 6.819s ago 610.3us
alert: HostCpuStealNoisyNeighbor expr: avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10 for: 1m labels: severity: warning annotations: description: |- CPU steal is > 10%. A noisy neighbor is degrading VM performance, or a burstable instance may have run out of CPU credits. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }}) ok 6.818s ago 588.4us
alert: HostCpuHighIowait expr: avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.1 for: 1m labels: severity: warning annotations: description: |- CPU iowait > 10%. Your CPU is idling waiting for storage to respond. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host CPU high iowait (instance {{ $labels.instance }}) ok 6.818s ago 451.4us
alert: HostUnusualDiskIo expr: rate(node_disk_io_time_seconds_total[5m]) > 0.8 for: 5m labels: severity: warning annotations: description: |- Disk is busy more than 80% of the time. Check storage for issues or increase IOPS capacity. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host unusual disk IO (instance {{ $labels.instance }}) ok 6.817s ago 148.5us
alert: HostSwapIsFillingUp expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80) for: 2m labels: severity: warning annotations: description: |- Swap is filling up (>80%) VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host swap is filling up (instance {{ $labels.instance }}) ok 6.817s ago 335.1us
alert: HostPhysicalComponentTooHot expr: node_hwmon_temp_celsius > node_hwmon_temp_max_celsius for: 5m labels: severity: warning annotations: description: |- Physical hardware component too hot VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host physical component too hot (instance {{ $labels.instance }}) ok 6.817s ago 63.7us
alert: HostNodeOvertemperatureAlarm expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1)) for: 1m labels: severity: critical annotations: description: |- Physical node temperature alarm triggered VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host node overtemperature alarm (instance {{ $labels.instance }}) ok 6.817s ago 79.46us
alert: HostSoftwareRaidInsufficientDrives expr: ((node_md_disks_required - on (device, instance) node_md_disks{state="active"}) > 0) for: 1m labels: severity: critical annotations: description: |- MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host software RAID insufficient drives (instance {{ $labels.instance }}) ok 6.817s ago 76us
alert: HostSoftwareRaidDiskFailure expr: (node_md_disks{state="failed"} > 0) for: 2m labels: severity: warning annotations: description: |- MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host software RAID disk failure (instance {{ $labels.instance }}) ok 6.817s ago 50.96us
alert: HostKernelVersionDeviations expr: changes(node_uname_info[1h]) > 0 for: 1m labels: severity: info annotations: description: |- Kernel version for {{ $labels.instance }} has changed. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host kernel version deviations (instance {{ $labels.instance }}) ok 6.817s ago 306.6us
alert: HostOomKillDetected expr: (increase(node_vmstat_oom_kill[1m]) > 0) for: 1m labels: severity: warning annotations: description: |- OOM kill detected VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host OOM kill detected (instance {{ $labels.instance }}) ok 6.817s ago 162.1us
alert: HostEdacCorrectableErrorsDetected expr: (increase(node_edac_correctable_errors_total[1m]) > 0) for: 1m labels: severity: info annotations: description: |- Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }}) ok 6.817s ago 68.31us
alert: HostEdacUncorrectableErrorsDetected expr: (node_edac_uncorrectable_errors_total > 0) for: 1m labels: severity: warning annotations: description: |- Host {{ $labels.instance }} has {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }}) ok 6.817s ago 49.05us
alert: HostNetworkReceiveErrors expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01) for: 2m labels: severity: warning annotations: description: |- Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error rate above 1% over the last two minutes. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host Network Receive Errors (instance {{ $labels.instance }}) ok 6.817s ago 339.8us
alert: HostNetworkTransmitErrors expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01) for: 2m labels: severity: warning annotations: description: |- Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error rate above 1% over the last two minutes. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host Network Transmit Errors (instance {{ $labels.instance }}) ok 6.816s ago 325.7us
alert: HostNetworkBondDegraded expr: ((node_bonding_active - node_bonding_slaves) != 0) for: 2m labels: severity: warning annotations: description: |- Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}". VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host Network Bond Degraded (instance {{ $labels.instance }}) ok 6.816s ago 81.71us
alert: HostConntrackLimit expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8) for: 5m labels: severity: warning annotations: description: |- The number of conntrack entries is approaching the limit VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host conntrack limit (instance {{ $labels.instance }}) ok 6.816s ago 312.9us
alert: HostClockSkew expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)) for: 10m labels: severity: warning annotations: description: |- Clock skew detected. Clock is out of sync. Ensure NTP is configured correctly on this host. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host clock skew (instance {{ $labels.instance }}) ok 6.816s ago 505us
alert: HostClockNotSynchronising expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16) for: 2m labels: severity: warning annotations: description: |- Clock not synchronising. Ensure NTP is configured on this host. VALUE = {{ $value }} LABELS = {{ $labels }} summary: Host clock not synchronising (instance {{ $labels.instance }}) ok 6.815s ago 298.3us
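
Several of these alerts overlap by design (for example, HostOutOfDiskSpace at critical alongside HostDiskMayFillIn24Hours at warning). One common way to cut the resulting noise is an Alertmanager inhibition rule that mutes warnings for an instance while a critical alert is already firing there. A sketch, assuming the matchers syntax of Alertmanager >= 0.22:

inhibit_rules:
  - source_matchers: ['severity="critical"']   # while any critical alert fires...
    target_matchers: ['severity="warning"']    # ...suppress warning alerts...
    equal: ['instance']                        # ...on the same instance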