ansible/roles/openshift/project/templates/prometheusRules.yml

69 lines
2.7 KiB
YAML

#jinja2:variable_start_string:'@@',variable_end_string:'@@'
---
# PrometheusRule manifest named "alerts", rendered through Jinja2.
# The override header on the first line replaces both Jinja2 variable
# delimiters with a double at-sign, so the double-curly-brace expressions in
# the alert annotations further down are not evaluated as Jinja2 variables.
# Keep that header as the very first line of the template.
# Which rule groups get emitted depends on the contents of `alert_rules`.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alerts
spec:
  groups:
{% if 'cronjob' in alert_rules %}
    # Rule group emitted only when 'cronjob' is listed in alert_rules.
    - name: CronjobFailed
      rules:
        # Warning raised once the expression has stayed positive for 5 minutes.
        # The expression is the difference between the last-schedule and
        # last-successful timestamps, multiplied by
        # (1 - kube_cronjob_status_active).
        - alert: CronjobFailed
          expr: >-
            (kube_cronjob_status_last_schedule_time
            - kube_cronjob_status_last_successful_time)
            * (1 - kube_cronjob_status_active)
            > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Latest job {{$labels.cronjob}} has failed.'
            description: >-
              Latest execution of CronJob
              {{$labels.namespace}}/{{$labels.cronjob}}
              failed to complete.
{% endif %}
{% if 'buildconfig' in alert_rules %}
    # Rule group emitted only when 'buildconfig' is listed in alert_rules.
    - name: BuildFailed
      rules:
        # The expression multiplies the topk(1) completion-timestamp series per
        # buildconfig with the matching build_phase="failed" series (joined on
        # build, buildconfig and namespace). A warning is raised when the
        # product stays above zero for 5 minutes.
        - alert: BuildFailed
          expr: >-
            topk(1,openshift_build_completed_timestamp_seconds) by (buildconfig)
            * ON(build,buildconfig,namespace)
            openshift_build_status_phase_total{build_phase="failed"}
            > 0
          for: 5m
          labels:
            severity: warning
          annotations:
            summary: 'Last build {{$labels.buildconfig}} has failed.'
            description: >-
              Last build {{$labels.namespace}}/{{$labels.buildconfig}}
              ({{$labels.build}}) failed to complete.
{% endif %}
{% if 'pod' in alert_rules %}
    # Rule group emitted only when 'pod' is listed in alert_rules.
    # All four rules carry severity "warning".
    - name: PodFailing
      rules:
        # A pod has reported the Pending phase for the whole `for` window.
        # The description quotes the same 15m as `for`, so the message agrees
        # with the moment the alert actually fires.
        - alert: PodPending
          expr: 'kube_pod_status_phase{phase="Pending"} > 0'
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: 'Pod {{$labels.pod}} is stuck in pending state.'
            description: >-
              Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state
              for more than 15m.

        # No `for` clause: fires as soon as the restart counter's per-second
        # rate over the trailing 10 minutes, scaled by 600 s, is above zero.
        - alert: PodRestarted
          expr: 'rate(kube_pod_container_status_restarts_total[10m]) * 60 * 10 > 0'
          labels:
            severity: warning
          annotations:
            summary: 'Containers in pod {{$labels.pod}} has restarted.'
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} has restarted.

        # Restart rate scaled to a 15-minute window must exceed 2 and stay
        # there for 15 minutes. The scaled value is printed in the description
        # with two decimals.
        - alert: PodCrashLoop
          expr: 'rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 2'
          for: 15m
          labels:
            severity: warning
          annotations:
            summary: 'Pod {{$labels.pod}} is in CrashLoop state.'
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} has restarted
              {{ printf "%.2f" $value }} times in the last 15 minutes.

        # No `for` clause: fires whenever the last-terminated-reason series
        # with reason="OOMKilled" is above zero.
        - alert: PodOOMKilled
          expr: 'kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0'
          labels:
            severity: warning
          annotations:
            summary: 'Containers in pod {{$labels.pod}} has been OOMKilled.'
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} ran out of memory and has
              been killed.
{% endif %}