69 lines
2.7 KiB
YAML
69 lines
2.7 KiB
YAML
#jinja2:variable_start_string:'@@',variable_end_string:'@@'
---
# PrometheusRule manifest template.
# The override on the first line replaces the template engine's variable
# delimiters with a double at-sign -- presumably so the double-curly-brace
# placeholders used in the alert annotations reach Prometheus unrendered.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alerts
spec:
  # Every rule group that follows is wrapped in a conditional on alert_rules,
  # so only the requested groups are rendered into this list.
  groups:
{% if 'cronjob' in alert_rules %}
  # Rendered only when 'cronjob' is listed in alert_rules.
  - name: CronjobFailed
    rules:
    # Fires when the last schedule time is later than the last successful
    # time and no job is currently active, sustained for 5 minutes.
    - alert: CronjobFailed
      # clamp_min keeps the "not active" factor at 0 whenever one or more
      # jobs are active. Without it the factor goes negative for two or more
      # active jobs, and negative * negative made a healthy CronJob alert.
      expr: >-
        (kube_cronjob_status_last_schedule_time
        - kube_cronjob_status_last_successful_time)
        * clamp_min(1 - kube_cronjob_status_active, 0) > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Latest job {{$labels.cronjob}} has failed.
        description: >-
          Latest execution of CronJob
          {{$labels.namespace}}/{{$labels.cronjob}} failed to complete.
{% endif %}
{% if 'buildconfig' in alert_rules %}
  # Rendered only when 'buildconfig' is listed in alert_rules.
  - name: BuildFailed
    rules:
    # Fires when the most recently completed build of a BuildConfig is in
    # the "failed" phase, sustained for 5 minutes.
    - alert: BuildFailed
      # topk(1, ...) keeps the series with the highest completion timestamp
      # in each group. Grouping by namespace as well as buildconfig keeps
      # same-named BuildConfigs in different namespaces from sharing a group;
      # the join below already matches on namespace.
      expr: >-
        topk(1,openshift_build_completed_timestamp_seconds) by (namespace, buildconfig)
        * ON(build,buildconfig,namespace) openshift_build_status_phase_total{build_phase="failed"}
        > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: Last build {{$labels.buildconfig}} has failed.
        description: >-
          Last build {{$labels.namespace}}/{{$labels.buildconfig}}
          ({{$labels.build}}) failed to complete.
{% endif %}
{% if 'pod' in alert_rules %}
  # Rendered only when 'pod' is listed in alert_rules.
  - name: PodFailing
    rules:
    # Pod has reported the Pending phase continuously for 15 minutes.
    - alert: PodPending
      expr: kube_pod_status_phase{phase="Pending"} > 0
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Pod {{$labels.pod}} is stuck in pending state.
        # The duration quoted here must match the "for" value above.
        description: >-
          Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state
          for more than 15m.

    # Any container restart within the last 10 minutes; there is no "for"
    # clause, so nothing delays the alert once the expression is true.
    - alert: PodRestarted
      expr: rate(kube_pod_container_status_restarts_total[10m]) * 60 * 10 > 0
      labels:
        severity: warning
      annotations:
        summary: Containers in pod {{$labels.pod}} has restarted.
        description: >-
          Container {{$labels.container}} in Pod
          {{$labels.namespace}}/{{$labels.pod}} has restarted.

    # More than 2 restarts per 15-minute window (per-second rate scaled by
    # 60 * 15), sustained for 15 minutes.
    - alert: PodCrashLoop
      expr: rate(kube_pod_container_status_restarts_total[15m]) * 60 * 15 > 2
      for: 15m
      labels:
        severity: warning
      annotations:
        summary: Pod {{$labels.pod}} is in CrashLoop state.
        # $value is the scaled restart count computed by the expression.
        description: >-
          Container {{$labels.container}} in Pod
          {{$labels.namespace}}/{{$labels.pod}} has restarted
          {{ printf "%.2f" $value }} times in the last 15 minutes.

    # Container whose last termination reason is OOMKilled.
    # NOTE(review): this looks like a "last reason" gauge, so the alert may
    # stay active long after the kill itself -- confirm this is intended.
    - alert: PodOOMKilled
      expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
      labels:
        severity: warning
      annotations:
        summary: Containers in pod {{$labels.pod}} has been OOMKilled.
        description: >-
          Container {{$labels.container}} in Pod
          {{$labels.namespace}}/{{$labels.pod}} ran out
          of memory and has been killed.
{% endif %}