Repository URL to install this package:
|
Version:
1.0.0b1 ▾
|
---
prometheus_version: 2.9.2
prometheus_config_dir: /etc/prometheus
prometheus_db_dir: /var/lib/prometheus
prometheus_web_listen_address: "0.0.0.0:9090"
prometheus_web_external_url: ''
prometheus_storage_retention: "30d"
# Available since Prometheus 2.7.0
# [EXPERIMENTAL] Maximum number of bytes that can be stored for blocks. Units
# supported: KB, MB, GB, TB, PB.
prometheus_storage_retention_size: "0"
prometheus_config_flags_extra: {}
# prometheus_config_flags_extra:
# storage.tsdb.retention: 15d
# alertmanager.timeout: 10s
prometheus_alertmanager_config: []
# prometheus_alertmanager_config:
# - scheme: https
# path_prefix: /alertmanager
# basic_auth:
# username: user
# password: pass
# static_configs:
# - targets: ["127.0.0.1:9093"]
# proxy_url: "127.0.0.2"
prometheus_alert_relabel_configs: []
# prometheus_alert_relabel_configs:
# - action: labeldrop
# regex: replica
prometheus_global:
scrape_interval: 15s
scrape_timeout: 10s
evaluation_interval: 15s
prometheus_remote_write: []
# prometheus_remote_write:
# - url: https://dev.kausal.co/prom/push
# basic_auth:
# password: FOO
prometheus_remote_read: []
# prometheus_remote_read:
# - url: https://demo.cloudalchemy.org:9201/read
# basic_auth:
# password: FOO
prometheus_external_labels:
environment: "{{ ansible_fqdn | default(ansible_host) | default(inventory_hostname) }}"
prometheus_targets: {}
# node:
# - targets:
# - localhost:9100
# labels:
# env: test
prometheus_scrape_configs:
- job_name: "prometheus"
metrics_path: "{{ prometheus_metrics_path }}"
static_configs:
- targets:
- "{{ ansible_fqdn | default(ansible_host) | default('localhost') }}:9090"
- job_name: "node"
file_sd_configs:
- files:
- "{{ prometheus_config_dir }}/file_sd/node.yml"
# Alternative config file name, searched in ansible templates path.
prometheus_config_file: 'prometheus.yml.j2'
prometheus_alert_rules_files:
- prometheus/rules/*.rules
prometheus_static_targets_files:
- prometheus/targets/*.yml
- prometheus/targets/*.json
prometheus_alert_rules:
- alert: Watchdog
expr: vector(1)
for: 10m
labels:
severity: warning
annotations:
description: 'This is an alert meant to ensure that the entire alerting pipeline is functional.
This alert is always firing, therefore it should always be firing in Alertmanager
and always fire against a receiver. There are integrations with various notification
mechanisms that send a notification when this alert is not firing. For example the
"DeadMansSnitch" integration in PagerDuty.'
summary: 'Ensure entire alerting pipeline is functional'
- alert: InstanceDown
expr: "up == 0"
for: 5m
labels:
severity: critical
annotations:
description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes.{% endraw %}"
summary: "{% raw %}Instance {{ $labels.instance }} down{% endraw %}"
- alert: CriticalCPULoad
expr: '100 - (avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m])) * 100) > 96'
for: 2m
labels:
severity: critical
annotations:
description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has Critical CPU load for more than 2 minutes.{% endraw %}"
summary: "{% raw %}Instance {{ $labels.instance }} - Critical CPU load{% endraw %}"
- alert: CriticalRAMUsage
expr: '(1 - ((node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes) / node_memory_MemTotal_bytes)) * 100 > 98'
for: 5m
labels:
severity: critical
annotations:
description: "{% raw %}{{ $labels.instance }} has Critical Memory Usage more than 5 minutes.{% endraw %}"
summary: "{% raw %}Instance {{ $labels.instance }} has Critical Memory Usage{% endraw %}"
- alert: CriticalDiskSpace
expr: 'node_filesystem_free_bytes{mountpoint!~"^/run(/.*|$)",fstype!~"(squashfs|fuse.*)",job="node"} / node_filesystem_size_bytes{job="node"} < 0.1'
for: 4m
labels:
severity: critical
annotations:
description: "{% raw %}{{ $labels.instance }} of job {{ $labels.job }} has less than 10% space remaining.{% endraw %}"
summary: "{% raw %}Instance {{ $labels.instance }} - Critical disk space usage{% endraw %}"
- alert: RebootRequired
expr: "node_reboot_required > 0"
labels:
severity: warning
annotations:
description: "{% raw %}{{ $labels.instance }} requires a reboot.{% endraw %}"
summary: "{% raw %}Instance {{ $labels.instance }} - reboot required{% endraw %}"
- alert: ClockSkewDetected
expr: 'abs(node_timex_offset_seconds) * 1000 > 30'
for: 2m
labels:
severity: warning
annotations:
description: "{% raw %}Clock skew detected on {{ $labels.instance }}. Ensure NTP is configured correctly on this host.{% endraw %}"
summary: "{% raw %}Instance {{ $labels.instance }} - Clock skew detected{% endraw %}"