Alerts


/etc/prometheus/rules/bakins-fpm-exporter.yml > BakinsFpmExporter
Php-fpmMax-childrenReached (0 active)
alert: Php-fpmMax-childrenReached
expr: sum
  by (instance) (phpfpm_max_children_reached_total) > 0
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    PHP-FPM reached max children - {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: PHP-FPM max-children reached (instance {{ $labels.instance }})
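The listing above shows each rule as the Prometheus UI renders it, without the enclosing groups/rules wrapper that the file on disk needs. A minimal sketch of how the rule above would sit inside /etc/prometheus/rules/bakins-fpm-exporter.yml; the wrapper follows the standard rule-file format, and only the path and group name are taken from this page:

# Sketch of the on-disk rule file; the groups/rules wrapper is inferred from the standard format.
groups:
  - name: BakinsFpmExporter
    rules:
      - alert: Php-fpmMax-childrenReached
        expr: sum by (instance) (phpfpm_max_children_reached_total) > 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: PHP-FPM max-children reached (instance {{ $labels.instance }})
          description: |-
            PHP-FPM reached max children - {{ $labels.instance }}
              VALUE = {{ $value }}
              LABELS = {{ $labels }}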
/etc/prometheus/rules/blackbox-exporter.yml > BlackboxExporter
BlackboxConfigurationReloadFailure (0 active)
alert: BlackboxConfigurationReloadFailure
expr: blackbox_exporter_config_last_reload_successful
  != 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox configuration reload failure
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
BlackboxProbeFailed (0 active)
alert: BlackboxProbeFailed
expr: probe_success
  == 0
for: 3m
labels:
  severity: critical
annotations:
  description: |-
    Probe failed
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox probe failed (instance {{ $labels.instance }})
BlackboxProbeHttpFailure (0 active)
alert: BlackboxProbeHttpFailure
expr: probe_http_status_code
  <= 199 or probe_http_status_code >= 400
for: 3m
labels:
  severity: critical
annotations:
  description: |-
    HTTP status code is not 200-399
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
BlackboxProbeSlowHttp (0 active)
alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m])
  > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    HTTP request took more than 1s
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
BlackboxProbeSlowPing (0 active)
alert: BlackboxProbeSlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m])
  > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox ping took more than 1s
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox probe slow ping (instance {{ $labels.instance }})
BlackboxSlowProbe (0 active)
alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m])
  > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox probe took more than 1s to complete
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox slow probe (instance {{ $labels.instance }})
BlackboxSslCertificateExpired (0 active)
alert: BlackboxSslCertificateExpired
expr: round((last_over_time(probe_ssl_earliest_cert_expiry[10m])
  - time()) / 86400, 0.1) < 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    SSL certificate has already expired
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
BlackboxSslCertificateWillExpireSoon (0 active)
alert: BlackboxSslCertificateWillExpireSoon
expr: 3
  <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400,
  0.1) < 20
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    SSL certificate expires in less than 20 days
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance
    }})
BlackboxSslCertificateWillExpireSoon (0 active)
alert: BlackboxSslCertificateWillExpireSoon
expr: 0
  <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400,
  0.1) < 3
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    SSL certificate expires in less than 3 days
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance
    }})
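The probe_success, probe_http_status_code, probe_duration_seconds and probe_ssl_earliest_cert_expiry series used by the Blackbox rules above come from scraping the blackbox exporter through a relabelled scrape job. A minimal sketch, assuming the exporter listens on localhost:9115 and an http_2xx module is defined in its blackbox.yml; the job name and probe target are placeholders:

# Sketch of a prometheus.yml scrape job for the blackbox exporter (job name, module, target and exporter address are assumptions).
scrape_configs:
  - job_name: blackbox-http
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://example.org
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: localhost:9115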
/etc/prometheus/rules/embedded-exporter.yml > EmbeddedExporter
PrometheusAlertmanagerNotificationFailing (0 active)
alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Alertmanager is failing to send notifications
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus AlertManager notification failing (instance {{ $labels.instance
    }})
PrometheusAllTargetsMissing (0 active)
alert: PrometheusAllTargetsMissing
expr: sum
  by (job) (up) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    A Prometheus job no longer has any living targets.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus all targets missing (instance {{ $labels.instance }})
PrometheusConfigurationReloadFailure (0 active)
alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful
  != 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus configuration reload error
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
PrometheusJobMissing (0 active)
alert: PrometheusJobMissing
expr: absent(up{job="prometheus"})
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    A Prometheus job has disappeared
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus job missing (instance {{ $labels.instance }})
PrometheusLargeScrape (0 active)
alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m])
  > 10
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many scrapes that exceed the sample limit
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus large scrape (instance {{ $labels.instance }})
PrometheusNotificationsBacklog (0 active)
alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m])
  > 0
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    The Prometheus notification queue has not been empty for 10 minutes
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus notifications backlog (instance {{ $labels.instance }})
PrometheusRuleEvaluationFailures (0 active)
alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
PrometheusRuleEvaluationSlow (0 active)
alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds
  > prometheus_rule_group_interval_seconds
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus rule evaluation took longer than the scheduled interval. This indicates slow storage backend access or overly complex queries.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
PrometheusTargetEmpty (0 active)
alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets
  == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus has no targets in service discovery
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus target empty (instance {{ $labels.instance }})
PrometheusTargetMissing (0 active)
alert: PrometheusTargetMissing
expr: up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    A Prometheus target has disappeared. An exporter may have crashed.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus target missing (instance {{ $labels.instance }})
PrometheusTargetMissingWithWarmupTime (0 active)
alert: PrometheusTargetMissingWithWarmupTime
expr: sum
  by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds
  - node_boot_time_seconds > 600))
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Allow a job time to start up (10 minutes) before alerting that it's down.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus target missing with warmup time (instance {{ $labels.instance
    }})
PrometheusTargetScrapeDuplicate (0 active)
alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m])
  > 0
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many samples rejected due to duplicate timestamps but different values
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
PrometheusTargetScrapingSlow (0 active)
alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"}
  / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"}
  > 1.05
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus is scraping exporters slowly: the observed scrape interval exceeds the configured one. Your Prometheus server may be under-provisioned.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus target scraping slow (instance {{ $labels.instance }})
PrometheusTemplateTextExpansionFailures (0 active)
alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} template text expansion failures
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus template text expansion failures (instance {{ $labels.instance
    }})
PrometheusTooManyRestarts (0 active)
alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m])
  > 2
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus too many restarts (instance {{ $labels.instance }})
PrometheusTsdbCheckpointCreationFailures (0 active)
alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} checkpoint creation failures
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance
    }})
PrometheusTsdbCheckpointDeletionFailures (0 active)
alert: PrometheusTsdbCheckpointDeletionFailures
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} checkpoint deletion failures
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance
    }})
PrometheusTsdbCompactionsFailed (0 active)
alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB compaction failures
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
PrometheusTsdbHeadTruncationsFailed (0 active)
alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB head truncation failures
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
PrometheusTsdbReloadFailures (0 active)
alert: PrometheusTsdbReloadFailures
expr: increase(prometheus_tsdb_reloads_failures_total[1m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB reload failures
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
PrometheusTsdbWalCorruptions (0 active)
alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB WAL corruptions
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
PrometheusTsdbWalTruncationsFailed (0 active)
alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m])
  > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB WAL truncation failures
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
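These embedded-exporter rules, like the other files listed on this page, only take effect once the main Prometheus configuration points at them and reloads cleanly (which is what PrometheusConfigurationReloadFailure watches). A minimal sketch of the relevant stanza in /etc/prometheus/prometheus.yml; the 15s intervals are assumptions, and PrometheusRuleEvaluationSlow compares each group's last evaluation duration against this configured interval. Rule files can be validated with promtool check rules before reloading.

# Sketch of the prometheus.yml stanza that loads these rule files (intervals are assumptions).
global:
  scrape_interval: 15s
  evaluation_interval: 15s
rule_files:
  - /etc/prometheus/rules/*.yml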
/etc/prometheus/rules/knyar-nginx-exporter.yml > KnyarNginxExporter
NginxHighHttp4xxErrorRate (0 active)
alert: NginxHighHttp4xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^4.."}[1m]))
  / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Too many HTTP requests with status 4xx (> 5%)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})
NginxHighHttp5xxErrorRate (0 active)
alert: NginxHighHttp5xxErrorRate
expr: sum(rate(nginx_http_requests_total{status=~"^5.."}[1m]))
  / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Too many HTTP requests with status 5xx (> 5%)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})
NginxLatencyHigh (0 active)
alert: NginxLatencyHigh
expr: histogram_quantile(0.99,
  sum by (host, node, le) (rate(nginx_http_request_duration_seconds_bucket[2m])))
  > 3
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Nginx p99 latency is higher than 3 seconds
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Nginx latency high (instance {{ $labels.instance }})
/etc/prometheus/rules/mysqld-exporter.yml > MysqldExporter
MysqlDown (0 active)
alert: MysqlDown
expr: mysql_up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL instance is down on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL down (instance {{ $labels.instance }})
MysqlHighPreparedStatementsUtilization(>80%) (0 active)
alert: MysqlHighPreparedStatementsUtilization(>80%)
expr: max_over_time(mysql_global_status_prepared_stmt_count[1m])
  / mysql_global_variables_max_prepared_stmt_count * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    High utilization of prepared statements (>80%) on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance
    }})
MysqlHighQps (0 active)
alert: MysqlHighQps
expr: irate(mysql_global_status_questions[1m])
  > 10000
for: 2m
labels:
  severity: info
annotations:
  description: |-
    MySQL is being overloaded with unusually high QPS (> 10k QPS).
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL High QPS (instance {{ $labels.instance }})
MysqlHighThreadsRunning (0 active)
alert: MysqlHighThreadsRunning
expr: max_over_time(mysql_global_status_threads_running[1m])
  / mysql_global_variables_max_connections * 100 > 60
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    More than 60% of MySQL connections are in running state on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL high threads running (instance {{ $labels.instance }})
MysqlInnodbForceRecoveryIsEnabled (0 active)
alert: MysqlInnodbForceRecoveryIsEnabled
expr: mysql_global_variables_innodb_force_recovery
  != 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL InnoDB force recovery is enabled on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
MysqlInnodbHistory_lenTooLong (0 active)
alert: MysqlInnodbHistory_lenTooLong
expr: mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len
  > 50000
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL history_len (undo log) too long on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
MysqlInnodbLogWaits (0 active)
alert: MysqlInnodbLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m])
  > 10
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    MySQL InnoDB log writes are stalling
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
MysqlRestarted (0 active)
alert: MysqlRestarted
expr: mysql_global_status_uptime
  < 60
for: 1m
labels:
  severity: info
annotations:
  description: |-
    MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL restarted (instance {{ $labels.instance }})
MysqlSlaveIoThreadNotRunning (0 active)
alert: MysqlSlaveIoThreadNotRunning
expr: (mysql_slave_status_slave_io_running
  and on (instance) mysql_slave_status_master_server_id > 0) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL Slave IO thread not running on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
MysqlSlaveReplicationLag (0 active)
alert: MysqlSlaveReplicationLag
expr: ((mysql_slave_status_seconds_behind_master
  - mysql_slave_status_sql_delay) and on (instance) mysql_slave_status_master_server_id
  > 0) > 30
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL replication lag on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL Slave replication lag (instance {{ $labels.instance }})
MysqlSlaveSqlThreadNotRunning (0 active)
alert: MysqlSlaveSqlThreadNotRunning
expr: (mysql_slave_status_slave_sql_running
  and on (instance) mysql_slave_status_master_server_id > 0) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL Slave SQL thread not running on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }})
MysqlSlowQueries (0 active)
alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[1m])
  > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL server has new slow queries.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL slow queries (instance {{ $labels.instance }})
MysqlTooManyConnections(>80%) (0 active)
alert: MysqlTooManyConnections(>80%)
expr: max_over_time(mysql_global_status_threads_connected[1m])
  / mysql_global_variables_max_connections * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    More than 80% of MySQL connections are in use on {{ $labels.instance }}
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
MysqlTooManyOpenFiles (0 active)
alert: MysqlTooManyOpenFiles
expr: mysql_global_status_innodb_num_open_files
  / mysql_global_variables_open_files_limit * 100 > 75
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL has too many open files; consider increasing open_files_limit on {{ $labels.instance }}.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: MySQL too many open files (instance {{ $labels.instance }})
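The mysql_* series behind these rules are exposed by mysqld_exporter, which Prometheus scrapes like any other target; how the exporter itself is pointed at the database (DSN, credentials file) depends on the exporter version and is not shown here. A minimal scrape-job sketch, assuming the exporter's default port 9104 and a placeholder hostname:

# Sketch of a prometheus.yml scrape job for mysqld_exporter (job name, host and port are assumptions).
scrape_configs:
  - job_name: mysqld
    static_configs:
      - targets:
          - db1.example.org:9104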
/etc/prometheus/rules/node-exporter.yml > NodeExporter
HostClockNotSynchronising (0 active)
alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m])
  == 0 and node_timex_maxerror_seconds >= 16)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Clock not synchronising. Ensure NTP is configured on this host.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host clock not synchronising (instance {{ $labels.instance }})
HostClockSkew (0 active)
alert: HostClockSkew
expr: ((node_timex_offset_seconds
  > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds
  < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
for: 10m
labels:
  severity: warning
annotations:
  description: |-
    Clock skew detected; the clock is out of sync. Ensure NTP is configured correctly on this host.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host clock skew (instance {{ $labels.instance }})
HostConntrackLimit (0 active)
alert: HostConntrackLimit
expr: (node_nf_conntrack_entries
  / node_nf_conntrack_entries_limit > 0.8)
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    The number of conntrack entries is approaching the limit
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host conntrack limit (instance {{ $labels.instance }})
HostCpuHighIowait (0 active)
alert: HostCpuHighIowait
expr: avg
  without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    CPU iowait > 10%. The CPU is idling while waiting for storage to respond.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host CPU high iowait (instance {{ $labels.instance }})
HostCpuStealNoisyNeighbor (0 active)
alert: HostCpuStealNoisyNeighbor
expr: avg
  without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 >
  10
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    CPU steal is > 10%. A noisy neighbor may be degrading VM performance, or the instance may have run out of CPU credits.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
HostDiskMayFillIn24Hours (0 active)
alert: HostDiskMayFillIn24Hours
expr: predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h],
  86400) <= 0 and node_filesystem_avail_bytes > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Filesystem will likely run out of space within the next 24 hours.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
HostEdacCorrectableErrorsDetected (0 active)
alert: HostEdacCorrectableErrorsDetected
expr: (increase(node_edac_correctable_errors_total[1m])
  > 0)
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
HostEdacUncorrectableErrorsDetected (0 active)
alert: HostEdacUncorrectableErrorsDetected
expr: (node_edac_uncorrectable_errors_total
  > 0)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} has {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
HostFilesystemDeviceError (0 active)
alert: HostFilesystemDeviceError
expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}
  == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Error stat-ing the {{ $labels.mountpoint }} filesystem
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host filesystem device error (instance {{ $labels.instance }})
HostHighCpuLoad (0 active)
alert: HostHighCpuLoad
expr: 1
  - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) >
  0.8
for: 10m
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 80%
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host high CPU load (instance {{ $labels.instance }})
HostInodesMayFillIn24Hours (0 active)
alert: HostInodesMayFillIn24Hours
expr: predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h],
  86400) <= 0 and node_filesystem_files_free > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Filesystem will likely run out of inodes within the next 24 hours at current write rate
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
HostKernelVersionDeviations (0 active)
alert: HostKernelVersionDeviations
expr: changes(node_uname_info[1h])
  > 0
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Kernel version for {{ $labels.instance }} has changed.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host kernel version deviations (instance {{ $labels.instance }})
HostMemoryIsUnderutilized (0 active)
alert: HostMemoryIsUnderutilized
expr: min_over_time(node_memory_MemFree_bytes[1w])
  > node_memory_MemTotal_bytes * 0.8
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Node memory usage has been < 20% for one week. Consider reducing the allocated memory. (instance {{ $labels.instance }})
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host Memory is underutilized (instance {{ $labels.instance }})
HostMemoryUnderMemoryPressure (0 active)
alert: HostMemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[5m])
  > 1000)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    The node is under heavy memory pressure. High rate of loading memory pages from disk.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host memory under memory pressure (instance {{ $labels.instance }})
HostNetworkBondDegraded (0 active)
alert: HostNetworkBondDegraded
expr: ((node_bonding_active
  - node_bonding_slaves) != 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host Network Bond Degraded (instance {{ $labels.instance }})
HostNetworkReceiveErrors (0 active)
alert: HostNetworkReceiveErrors
expr: (rate(node_network_receive_errs_total[2m])
  / rate(node_network_receive_packets_total[2m]) > 0.01)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has a receive error ratio of {{ printf "%.2f" $value }} over the last two minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host Network Receive Errors (instance {{ $labels.instance }})
HostNetworkTransmitErrors (0 active)
alert: HostNetworkTransmitErrors
expr: (rate(node_network_transmit_errs_total[2m])
  / rate(node_network_transmit_packets_total[2m]) > 0.01)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has a transmit error ratio of {{ printf "%.2f" $value }} over the last two minutes.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host Network Transmit Errors (instance {{ $labels.instance }})
HostNodeOvertemperatureAlarm (0 active)
alert: HostNodeOvertemperatureAlarm
expr: ((node_hwmon_temp_crit_alarm_celsius
  == 1) or (node_hwmon_temp_alarm == 1))
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Physical node temperature alarm triggered
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host node overtemperature alarm (instance {{ $labels.instance }})
HostOomKillDetected (0 active)
alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m])
  > 0)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    OOM kill detected
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host OOM kill detected (instance {{ $labels.instance }})
HostOutOfDiskSpace (0 active)
alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}
  / node_filesystem_size_bytes < 0.1 and on (instance, device, mountpoint) node_filesystem_readonly
  == 0)
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Disk is almost full (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host out of disk space (instance {{ $labels.instance }})
HostOutOfInodes (0 active)
alert: HostOutOfInodes
expr: (node_filesystem_files_free
  / node_filesystem_files < 0.1 and on (instance, device, mountpoint) node_filesystem_readonly
  == 0)
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Disk is almost running out of available inodes (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host out of inodes (instance {{ $labels.instance }})
HostOutOfMemory (0 active)
alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes
  / node_memory_MemTotal_bytes < 0.1)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Node memory is filling up (< 10% left)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
HostPhysicalComponentTooHot (0 active)
alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius
  > node_hwmon_temp_max_celsius
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Physical hardware component too hot
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host physical component too hot (instance {{ $labels.instance }})
HostSoftwareRaidDiskFailure (0 active)
alert: HostSoftwareRaidDiskFailure
expr: (node_md_disks{state="failed"}
  > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host software RAID disk failure (instance {{ $labels.instance }})
HostSoftwareRaidInsufficientDrives (0 active)
alert: HostSoftwareRaidInsufficientDrives
expr: ((node_md_disks_required
  - on (device, instance) node_md_disks{state="active"}) > 0)
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
HostSwapIsFillingUp (0 active)
alert: HostSwapIsFillingUp
expr: ((1
  - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Swap is filling up (>80%)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host swap is filling up (instance {{ $labels.instance }})
HostUnusualDiskIo (0 active)
alert: HostUnusualDiskIo
expr: rate(node_disk_io_time_seconds_total[5m])
  > 0.8
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Disk is busy with I/O more than 80% of the time. Check storage for issues or increase IOPS capacity.
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual disk IO (instance {{ $labels.instance }})
HostUnusualDiskReadLatency (0 active)
alert: HostUnusualDiskReadLatency
expr: (rate(node_disk_read_time_seconds_total[1m])
  / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m])
  > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (read operations > 100ms)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual disk read latency (instance {{ $labels.instance }})
HostUnusualDiskReadRate (0 active)
alert: HostUnusualDiskReadRate
expr: (rate(node_disk_io_time_seconds_total[5m])
  > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Disk is too busy (IO wait > 80%)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual disk read rate (instance {{ $labels.instance }})
HostUnusualDiskWriteLatency (0 active)
alert: HostUnusualDiskWriteLatency
expr: (rate(node_disk_write_time_seconds_total[1m])
  / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m])
  > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (write operations > 100ms)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual disk write latency (instance {{ $labels.instance }})
HostUnusualNetworkThroughputIn (0 active)
alert: HostUnusualNetworkThroughputIn
expr: ((rate(node_network_receive_bytes_total[5m])
  / node_network_speed_bytes) > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host receive bandwidth is high (>80%).
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual network throughput in (instance {{ $labels.instance }})
HostUnusualNetworkThroughputOut (0 active)
alert: HostUnusualNetworkThroughputOut
expr: ((rate(node_network_transmit_bytes_total[5m])
  / node_network_speed_bytes) > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host transmit bandwidth is high (>80%)
      VALUE = {{ $value }}
      LABELS = {{ $labels }}
  summary: Host unusual network throughput out (instance {{ $labels.instance }})
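Every rule on this page carries a severity label (info, warning or critical), which is what Alertmanager routing typically keys on. A minimal routing sketch for alertmanager.yml; the receiver names are placeholders, no notification integrations are configured, and the matchers syntax requires Alertmanager 0.22 or later:

# Sketch of severity-based routing in alertmanager.yml (receiver names are placeholders).
route:
  receiver: default
  routes:
    - matchers:
        - severity="critical"
      receiver: pager
    - matchers:
        - severity=~"warning|info"
      receiver: chat
receivers:
  - name: default
  - name: pager
  - name: chat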