Blackbox exporter alerting rules (each entry shows the rule, then its state, error, last evaluation, and evaluation time):

alert: BlackboxProbeFailed
expr: probe_success == 0
for: 3m
labels:
  severity: critical
annotations:
  description: |-
    Probe failed
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe failed (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.63s ago | Evaluation time: 454µs

alert: BlackboxConfigurationReloadFailure
expr: blackbox_exporter_config_last_reload_successful != 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox configuration reload failure
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox configuration reload failure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.629s ago | Evaluation time: 105.1µs

alert: BlackboxSlowProbe
expr: avg_over_time(probe_duration_seconds[1m]) > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox probe took more than 1s to complete
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox slow probe (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.629s ago | Evaluation time: 623.1µs

alert: BlackboxProbeHttpFailure
expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
for: 3m
labels:
  severity: critical
annotations:
  description: |-
    HTTP status code is not 200-399
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe HTTP failure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.629s ago | Evaluation time: 439.6µs

alert: BlackboxSslCertificateWillExpireSoon
expr: 3 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 20
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    SSL certificate expires in less than 20 days
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.628s ago | Evaluation time: 386.9µs

alert: BlackboxSslCertificateWillExpireSoon
expr: 0 <= round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 3
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    SSL certificate expires in less than 3 days
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox SSL certificate will expire soon (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.628s ago | Evaluation time: 325.9µs

alert: BlackboxSslCertificateExpired
expr: round((last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) / 86400, 0.1) < 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    SSL certificate has expired already
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox SSL certificate expired (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.628s ago | Evaluation time: 265.4µs

alert: BlackboxProbeSlowHttp
expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    HTTP request took more than 1s
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe slow HTTP (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.628s ago | Evaluation time: 619.3µs

alert: BlackboxProbeSlowPing
expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Blackbox ping took more than 1s
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Blackbox probe slow ping (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 3.627s ago | Evaluation time: 82.93µs

Rule group evaluation: 2.072s ago (total 3.617ms).

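This listing is a snapshot of the Prometheus rules page; in the configuration, each group lives in an ordinary rule file referenced from prometheus.yml. A minimal sketch of that wiring, using the first rule above — the file path rules/blackbox.yml and group name blackbox-exporter are assumptions, not taken from the snapshot:

```yaml
# prometheus.yml (excerpt) -- the rule file path is an assumption
rule_files:
  - "rules/blackbox.yml"
```

```yaml
# rules/blackbox.yml -- group wrapper (group name assumed); rule copied from the listing above
groups:
  - name: blackbox-exporter
    rules:
      - alert: BlackboxProbeFailed
        expr: probe_success == 0
        for: 3m
        labels:
          severity: critical
        annotations:
          description: |-
            Probe failed
            VALUE = {{ $value }}
            LABELS = {{ $labels }}
          summary: Blackbox probe failed (instance {{ $labels.instance }})
```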
Prometheus self-monitoring alerting rules (each entry shows the rule, then its state, error, last evaluation, and evaluation time):

alert: PrometheusJobMissing
expr: absent(up{job="prometheus"})
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    A Prometheus job has disappeared
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus job missing (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.072s ago | Evaluation time: 275.2µs

alert: PrometheusTargetMissing
expr: up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    A Prometheus target has disappeared. An exporter might have crashed.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target missing (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.072s ago | Evaluation time: 428.6µs

alert: PrometheusAllTargetsMissing
expr: sum by (job) (up) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    A Prometheus job no longer has any living targets.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus all targets missing (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.071s ago | Evaluation time: 303.9µs

alert: PrometheusTargetMissingWithWarmupTime
expr: sum by (instance, job) ((up == 0) * on (instance) group_left (__name__) (node_time_seconds - node_boot_time_seconds > 600))
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Allow a job time to start up (10 minutes) before alerting that it's down.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target missing with warmup time (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.071s ago | Evaluation time: 691.2µs

alert: PrometheusConfigurationReloadFailure
expr: prometheus_config_last_reload_successful != 1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus configuration reload error
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus configuration reload failure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 85.76µs

alert: PrometheusTooManyRestarts
expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has restarted more than twice in the last 15 minutes. It might be crashlooping.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus too many restarts (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 156.4µs

alert: PrometheusRuleEvaluationFailures
expr: increase(prometheus_rule_evaluation_failures_total[3m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} rule evaluation failures, leading to potentially ignored alerts.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus rule evaluation failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 114.7µs

alert: PrometheusTemplateTextExpansionFailures
expr: increase(prometheus_template_text_expansion_failures_total[3m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} template text expansion failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus template text expansion failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 83.58µs

alert: PrometheusRuleEvaluationSlow
expr: prometheus_rule_group_last_duration_seconds > prometheus_rule_group_interval_seconds
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus rule evaluation took longer than the scheduled interval. This indicates slow storage backend access or an overly complex query.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus rule evaluation slow (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 354.4µs

alert: PrometheusNotificationsBacklog
expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    The Prometheus notification queue has not been empty for 10 minutes
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus notifications backlog (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 169.5µs

alert: PrometheusAlertmanagerNotificationFailing
expr: rate(alertmanager_notifications_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Alertmanager is failing to send notifications
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus AlertManager notification failing (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 70.46µs

alert: PrometheusTargetEmpty
expr: prometheus_sd_discovered_targets == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus has no targets in service discovery
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target empty (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 156.7µs

alert: PrometheusTargetScrapingSlow
expr: prometheus_target_interval_length_seconds{quantile="0.9"} / on (interval, instance, job) prometheus_target_interval_length_seconds{quantile="0.5"} > 1.05
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus is scraping exporters slowly: the actual scrape interval exceeds the configured one. The Prometheus server may be under-provisioned.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target scraping slow (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 141.2µs

alert: PrometheusLargeScrape
expr: increase(prometheus_target_scrapes_exceeded_sample_limit_total[10m]) > 10
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many scrapes that exceed the sample limit
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus large scrape (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 78.82µs

alert: PrometheusTargetScrapeDuplicate
expr: increase(prometheus_target_scrapes_sample_duplicate_timestamp_total[5m]) > 0
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Prometheus has many samples rejected due to duplicate timestamps but different values
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus target scrape duplicate (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 56.26µs

alert: PrometheusTsdbCheckpointCreationFailures
expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} checkpoint creation failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB checkpoint creation failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 52.44µs

alert: PrometheusTsdbCheckpointDeletionFailures
expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} checkpoint deletion failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB checkpoint deletion failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 56.14µs

alert: PrometheusTsdbCompactionsFailed
expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB compaction failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB compactions failed (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 51.51µs

alert: PrometheusTsdbHeadTruncationsFailed
expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB head truncation failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB head truncations failed (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 76.1µs

alert: PrometheusTsdbReloadFailures
expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB reload failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB reload failures (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 59.06µs

alert: PrometheusTsdbWalCorruptions
expr: increase(prometheus_tsdb_wal_corruptions_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB WAL corruptions
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB WAL corruptions (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 53.06µs

alert: PrometheusTsdbWalTruncationsFailed
expr: increase(prometheus_tsdb_wal_truncations_failed_total[1m]) > 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Prometheus encountered {{ $value }} TSDB WAL truncation failures
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Prometheus TSDB WAL truncations failed (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 2.07s ago | Evaluation time: 52.84µs

Rule group evaluation: 13.093s ago (total 1.083ms).

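The PrometheusConfigurationReloadFailure and PrometheusRuleEvaluationFailures rules above fire after a bad rule or configuration file has been loaded; the same problems can be caught beforehand with promtool. A minimal sketch — the file paths are examples, and the HTTP reload endpoint only works when Prometheus runs with --web.enable-lifecycle (otherwise send SIGHUP to the process):

```bash
# Validate rule files and the main configuration before reloading (paths are examples)
promtool check rules /etc/prometheus/rules/*.yml
promtool check config /etc/prometheus/prometheus.yml

# Trigger a live reload; requires --web.enable-lifecycle
curl -X POST http://localhost:9090/-/reload
```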
MySQL (mysqld exporter) alerting rules (each entry shows the rule, then its state, error, last evaluation, and evaluation time):

alert: MysqlDown
expr: mysql_up == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL instance is down on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL down (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.743s ago | Evaluation time: 392.9µs

alert: MysqlTooManyConnections(>80%)
expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    More than 80% of MySQL connections are in use on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL too many connections (> 80%) (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 218.9µs

alert: MysqlHighPreparedStatementsUtilization(>80%)
expr: max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    High utilization of prepared statements (>80%) on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL high prepared statements utilization (> 80%) (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 194.6µs

alert: MysqlHighThreadsRunning
expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 60
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    More than 60% of MySQL connections are in running state on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL high threads running (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 174.7µs

alert: MysqlSlaveIoThreadNotRunning
expr: (mysql_slave_status_slave_io_running and on (instance) mysql_slave_status_master_server_id > 0) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL Slave IO thread not running on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL Slave IO thread not running (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 161.1µs

alert: MysqlSlaveSqlThreadNotRunning
expr: (mysql_slave_status_slave_sql_running and on (instance) mysql_slave_status_master_server_id > 0) == 0
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL Slave SQL thread not running on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL Slave SQL thread not running (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 192µs

alert: MysqlSlaveReplicationLag
expr: ((mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) and on (instance) mysql_slave_status_master_server_id > 0) > 30
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MySQL replication lag on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL Slave replication lag (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 232.2µs

alert: MysqlSlowQueries
expr: increase(mysql_global_status_slow_queries[1m]) > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL server has new slow queries on {{ $labels.instance }}.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL slow queries (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.742s ago | Evaluation time: 117.3µs

alert: MysqlInnodbLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    MySQL InnoDB log writes are stalling
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL InnoDB log waits (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 133.3µs

alert: MysqlRestarted
expr: mysql_global_status_uptime < 60
for: 1m
labels:
  severity: info
annotations:
  description: |-
    MySQL has just been restarted, less than one minute ago on {{ $labels.instance }}.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL restarted (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 99.09µs

alert: MysqlHighQps
expr: irate(mysql_global_status_questions[1m]) > 10000
for: 2m
labels:
  severity: info
annotations:
  description: |-
    MySQL is under unusually high load (> 10k QPS).
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL High QPS (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 117.8µs

alert: MysqlTooManyOpenFiles
expr: mysql_global_status_innodb_num_open_files / mysql_global_variables_open_files_limit * 100 > 75
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL has too many open files; consider increasing the open_files_limit variable on {{ $labels.instance }}.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL too many open files (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 222.1µs

alert: MysqlInnodbForceRecoveryIsEnabled
expr: mysql_global_variables_innodb_force_recovery != 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL InnoDB force recovery is enabled on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL InnoDB Force Recovery is enabled (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 109.3µs

alert: MysqlInnodbHistory_lenTooLong
expr: mysql_info_schema_innodb_metrics_transaction_trx_rseg_history_len > 50000
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MySQL history_len (undo log) too long on {{ $labels.instance }}
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: MySQL InnoDB history_len too long (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 8.741s ago | Evaluation time: 64.93µs

Rule group evaluation: 6.833s ago (total 20.32ms).

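Every rule in these groups attaches a severity label (info, warning, or critical), which is the natural key for Alertmanager routing. A minimal routing sketch — the receiver names mail and pager are assumptions, not part of the snapshot:

```yaml
# alertmanager.yml (excerpt) -- route on the severity label set by the rules above
route:
  receiver: mail            # default receiver (assumed name)
  routes:
    - matchers:
        - severity="critical"
      receiver: pager       # assumed paging receiver for critical alerts
receivers:
  - name: mail
  - name: pager
```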
Host (node exporter) alerting rules (each entry shows the rule, then its state, error, last evaluation, and evaluation time):

alert: HostOutOfMemory
expr: (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Node memory is filling up (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host out of memory (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.833s ago | Evaluation time: 687.8µs

alert: HostMemoryUnderMemoryPressure
expr: (rate(node_vmstat_pgmajfault[5m]) > 1000)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    The node is under heavy memory pressure, with a high rate of major page faults (memory pages loaded from disk).
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host memory under memory pressure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.832s ago | Evaluation time: 297µs

alert: HostMemoryIsUnderutilized
expr: min_over_time(node_memory_MemFree_bytes[1w]) > node_memory_MemTotal_bytes * 0.8
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Node memory usage has been below 20% for one week. Consider reducing the allocated memory. (instance {{ $labels.instance }})
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Memory is underutilized (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.832s ago | Evaluation time: 7.866ms

alert: HostUnusualNetworkThroughputIn
expr: ((rate(node_network_receive_bytes_total[5m]) / node_network_speed_bytes) > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host receive bandwidth is high (>80%).
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual network throughput in (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.824s ago | Evaluation time: 426.5µs

alert: HostUnusualNetworkThroughputOut
expr: ((rate(node_network_transmit_bytes_total[5m]) / node_network_speed_bytes) > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host transmit bandwidth is high (>80%).
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual network throughput out (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.824s ago | Evaluation time: 289.8µs

alert: HostUnusualDiskReadRate
expr: (rate(node_disk_io_time_seconds_total[5m]) > 0.8)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Disk is too busy (IO wait > 80%)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk read rate (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.824s ago | Evaluation time: 242.5µs

alert: HostOutOfDiskSpace
expr: (node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} / node_filesystem_size_bytes < 0.1 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Disk is almost full (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host out of disk space (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.824s ago | Evaluation time: 911.7µs

alert: HostDiskMayFillIn24Hours
expr: predict_linear(node_filesystem_avail_bytes{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[3h], 86400) <= 0 and node_filesystem_avail_bytes > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Filesystem will likely run out of space within the next 24 hours.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host disk may fill in 24 hours (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.823s ago | Evaluation time: 2.033ms

alert: HostOutOfInodes
expr: (node_filesystem_files_free / node_filesystem_files < 0.1 and on (instance, device, mountpoint) node_filesystem_readonly == 0)
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Disk is almost running out of available inodes (< 10% left)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host out of inodes (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.821s ago | Evaluation time: 675.5µs

alert: HostFilesystemDeviceError
expr: node_filesystem_device_error{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"} == 1
for: 2m
labels:
  severity: critical
annotations:
  description: |-
    Error stat-ing the {{ $labels.mountpoint }} filesystem
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host filesystem device error (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.82s ago | Evaluation time: 236.1µs

alert: HostInodesMayFillIn24Hours
expr: predict_linear(node_filesystem_files_free{fstype!~"^(fuse.*|tmpfs|cifs|nfs)"}[1h], 86400) <= 0 and node_filesystem_files_free > 0
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Filesystem will likely run out of inodes within the next 24 hours at current write rate
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host inodes may fill in 24 hours (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.82s ago | Evaluation time: 869µs

alert: HostUnusualDiskReadLatency
expr: (rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (read operations > 100ms)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk read latency (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.819s ago | Evaluation time: 439.2µs

alert: HostUnusualDiskWriteLatency
expr: (rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Disk latency is growing (write operations > 100ms)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk write latency (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.819s ago | Evaluation time: 436.5µs

alert: HostHighCpuLoad
expr: 1 - (avg without (cpu) (rate(node_cpu_seconds_total{mode="idle"}[5m]))) > 0.8
for: 10m
labels:
  severity: warning
annotations:
  description: |-
    CPU load is > 80%
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host high CPU load (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.819s ago | Evaluation time: 610.3µs

alert: HostCpuStealNoisyNeighbor
expr: avg without (cpu) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    CPU steal is > 10%. A noisy neighbor is degrading VM performance, or a spot instance may be out of CPU credits.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host CPU steal noisy neighbor (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.818s ago | Evaluation time: 588.4µs

alert: HostCpuHighIowait
expr: avg without (cpu) (rate(node_cpu_seconds_total{mode="iowait"}[5m])) > 0.1
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    CPU iowait > 10%. The CPU is idle while waiting for storage to respond.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host CPU high iowait (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.818s ago | Evaluation time: 451.4µs

alert: HostUnusualDiskIo
expr: rate(node_disk_io_time_seconds_total[5m]) > 0.8
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Disk I/O usage is > 80%. Check storage for issues or increase IOPS capacity.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host unusual disk IO (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 148.5µs

alert: HostSwapIsFillingUp
expr: ((1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Swap is filling up (>80%)
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host swap is filling up (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 335.1µs

alert: HostPhysicalComponentTooHot
expr: node_hwmon_temp_celsius > node_hwmon_temp_max_celsius
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    Physical hardware component too hot
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host physical component too hot (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 63.7µs

alert: HostNodeOvertemperatureAlarm
expr: ((node_hwmon_temp_crit_alarm_celsius == 1) or (node_hwmon_temp_alarm == 1))
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    Physical node temperature alarm triggered
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host node overtemperature alarm (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 79.46µs

alert: HostSoftwareRaidInsufficientDrives
expr: ((node_md_disks_required - on (device, instance) node_md_disks{state="active"}) > 0)
for: 1m
labels:
  severity: critical
annotations:
  description: |-
    MD RAID array {{ $labels.device }} on {{ $labels.instance }} has insufficient drives remaining.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host software RAID insufficient drives (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 76µs

alert: HostSoftwareRaidDiskFailure
expr: (node_md_disks{state="failed"} > 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    MD RAID array {{ $labels.device }} on {{ $labels.instance }} needs attention.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host software RAID disk failure (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 50.96µs

alert: HostKernelVersionDeviations
expr: changes(node_uname_info[1h]) > 0
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Kernel version for {{ $labels.instance }} has changed.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host kernel version deviations (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 306.6µs

alert: HostOomKillDetected
expr: (increase(node_vmstat_oom_kill[1m]) > 0)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    OOM kill detected
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host OOM kill detected (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 162.1µs

alert: HostEdacCorrectableErrorsDetected
expr: (increase(node_edac_correctable_errors_total[1m]) > 0)
for: 1m
labels:
  severity: info
annotations:
  description: |-
    Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} correctable memory errors reported by EDAC in the last minute.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host EDAC Correctable Errors detected (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 68.31µs

alert: HostEdacUncorrectableErrorsDetected
expr: (node_edac_uncorrectable_errors_total > 0)
for: 1m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} has had {{ printf "%.0f" $value }} uncorrectable memory errors reported by EDAC.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host EDAC Uncorrectable Errors detected (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 49.05µs

alert: HostNetworkReceiveErrors
expr: (rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} receive errors in the last two minutes.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Receive Errors (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.817s ago | Evaluation time: 339.8µs

alert: HostNetworkTransmitErrors
expr: (rate(node_network_transmit_errs_total[2m]) / rate(node_network_transmit_packets_total[2m]) > 0.01)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Host {{ $labels.instance }} interface {{ $labels.device }} has encountered {{ printf "%.0f" $value }} transmit errors in the last two minutes.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Transmit Errors (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.816s ago | Evaluation time: 325.7µs

alert: HostNetworkBondDegraded
expr: ((node_bonding_active - node_bonding_slaves) != 0)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Bond "{{ $labels.device }}" degraded on "{{ $labels.instance }}".
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host Network Bond Degraded (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.816s ago | Evaluation time: 81.71µs

alert: HostConntrackLimit
expr: (node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8)
for: 5m
labels:
  severity: warning
annotations:
  description: |-
    The number of conntrack entries is approaching the limit
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host conntrack limit (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.816s ago | Evaluation time: 312.9µs

alert: HostClockSkew
expr: ((node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0))
for: 10m
labels:
  severity: warning
annotations:
  description: |-
    Clock skew detected: the clock is out of sync. Ensure NTP is configured correctly on this host.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host clock skew (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.816s ago | Evaluation time: 505µs

alert: HostClockNotSynchronising
expr: (min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16)
for: 2m
labels:
  severity: warning
annotations:
  description: |-
    Clock is not synchronising. Ensure NTP is configured on this host.
    VALUE = {{ $value }}
    LABELS = {{ $labels }}
  summary: Host clock not synchronising (instance {{ $labels.instance }})
State: ok | Error: none | Last evaluation: 6.815s ago | Evaluation time: 298.3µs
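
The PromQL behind rules like HostOutOfMemory can be unit-tested offline with promtool test rules, which evaluates expressions against synthetic series. A minimal sketch — the file names, instance label, and sample values below are made up for illustration, and rules/host.yml is assumed to contain the host rules listed above:

```yaml
# host-alerts-test.yml -- run with: promtool test rules host-alerts-test.yml
rule_files:
  - rules/host.yml          # assumed location of the host rules

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # 0.5 GB available out of 10 GB total -> 5% free, below the 10% threshold of HostOutOfMemory
      - series: node_memory_MemAvailable_bytes{instance="host1"}
        values: "500000000+0x10"
      - series: node_memory_MemTotal_bytes{instance="host1"}
        values: "10000000000+0x10"
    promql_expr_test:
      - expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes < 0.1
        eval_time: 1m
        exp_samples:
          - labels: '{instance="host1"}'
            value: 0.05
```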