websites: add alerts for pod/job/build errors

This commit is contained in:
Francois Andrieu 2023-01-23 14:32:37 +01:00 committed by kevin
parent de196fd597
commit 66726137ae
4 changed files with 86 additions and 0 deletions

View file

@@ -99,3 +99,9 @@
template: deployment.yml
objectname: deployment.yml
when: env == "staging"
# Deploy the websites PrometheusRule object (staging only).
# Unlike the preceding entry, this one passes `file:` instead of `template:`.
# NOTE(review): presumably so the Prometheus label placeholders in the rule
# file are shipped verbatim rather than rendered by Jinja - confirm against
# the openshift/object role.
- role: openshift/object
  app: websites
  objectname: prometheusRules.yml
  file: prometheusRules.yml
  when: env == "staging"

View file

@@ -0,0 +1,57 @@
# Alerting rules for the websites project: failed jobs, failed builds and
# unhealthy pods. Every alert carries `severity: warning`.
#
# The double-curly placeholders in the annotations are Prometheus alert
# templates ($labels / $value); they must reach the cluster unrendered.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alerts
spec:
  groups:
    - name: jobFailed
      rules:
        # Fires as soon as any Job reports a failure.
        - alert: JobFailed
          annotations:
            description: Job {{$labels.namespace}}/{{$labels.job_name}} has failed.
            summary: At least one job has failed.
          expr: kube_job_failed > 0
          labels:
            severity: warning

    - name: BuildFailed
      rules:
        # Fires while the count of builds in the "failed" phase is non-zero.
        - alert: BuildFailed
          annotations:
            description: >-
              Build {{$labels.namespace}}/{{$labels.buildconfig}}
              ({{$labels.build}}) has failed.
            summary: Build {{$labels.buildconfig}} has failed.
          expr: openshift_build_status_phase_total{build_phase="failed"} > 0
          labels:
            severity: warning

    - name: PodFailing
      rules:
        # A pod must stay Pending for the whole `for` window before firing.
        - alert: PodPending
          annotations:
            description: >-
              Pod {{$labels.namespace}}/{{$labels.pod}} has been in pending
              state for more than 10m.
            summary: Pod {{$labels.pod}} is in pending state.
          expr: kube_pod_status_phase{phase="Pending"} > 0
          for: 10m
          labels:
            severity: warning

        # Any restart within the last 10 minutes.
        # increase(x[10m]) is rate(x[10m]) multiplied by the window length in
        # seconds, i.e. the same value as the former `rate(...) * 60 * 10`.
        - alert: PodRestarted
          annotations:
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} has restarted.
            summary: A container in pod {{$labels.pod}} has restarted.
          expr: increase(kube_pod_container_status_restarts_total[10m]) > 0
          labels:
            severity: warning

        # More than two restarts per 15 minutes, sustained for 15 minutes.
        # $value is the (extrapolated, hence fractional) restart count.
        - alert: PodCrashLoop
          annotations:
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} has restarted
              {{ printf "%.2f" $value }} times in the last 15 minutes.
            summary: Pod {{$labels.pod}} is in CrashLoop state.
          expr: increase(kube_pod_container_status_restarts_total[15m]) > 2
          for: 15m
          labels:
            severity: warning

        # NOTE(review): the metric name suggests it reports the *last*
        # termination reason, so this alert may keep firing after the
        # container has recovered - confirm this is the intended behaviour.
        - alert: PodOOMKilled
          annotations:
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} ran out of memory and has
              been killed.
            summary: A container in pod {{$labels.pod}} has been OOMKilled.
          expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
          labels:
            severity: warning

View file

@@ -57,6 +57,13 @@
objectname: appowners.yml
template_fullpath: "{{roles_path}}/openshift/project/templates/appowners.yml"
# Create the project's alertmanager.yml object by handing the shared project
# template to the openshift/object role.
- name: alertmanager.yml
  include_role:
    name: openshift/object
  vars:
    template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
    objectname: alertmanager.yml
- name: egresspolicy.yml
include_role:
name: openshift/object

View file

@@ -0,0 +1,16 @@
# Per-project Alertmanager configuration (Jinja template).
# Sends every alert, grouped by alert name and namespace, to the app owners
# by e-mail. Keep Jinja delimiters out of these comments: the file is
# rendered as a template before it is applied.
apiVersion: monitoring.coreos.com/v1beta1
kind: AlertmanagerConfig
metadata:
  name: appowners-alerts
  namespace: "{{app}}"
spec:
  # Single route: everything goes to the `default` receiver below.
  route:
    receiver: default
    groupBy:
      - alertname
      - namespace
  receivers:
    - name: default
      emailConfigs:
        # Pairs each entry of `appowners` with the fedoraproject.org domain,
        # joins each pair with "@", then joins all addresses with ",".
        - to: "{{ appowners | product(['fedoraproject.org']) | map('join', '@') | join(',') }}"
          sendResolved: true