websites: add alerts for pod/job/build errors
This commit is contained in:
parent
de196fd597
commit
66726137ae
4 changed files with 86 additions and 0 deletions
|
@ -99,3 +99,9 @@
|
|||
template: deployment.yml
|
||||
objectname: deployment.yml
|
||||
when: env == "staging"
|
||||
|
||||
# Hand the static prometheusRules.yml manifest to the openshift/object role.
# Only applied when `env` is "staging".
# NOTE(review): this entry uses `file:` where the deployment entry above uses
# `template:` - presumably so the Prometheus `{{ $labels... }}` placeholders in
# the rules are not interpreted as Jinja; confirm against the role.
- role: openshift/object
  app: websites
  file: prometheusRules.yml
  objectname: prometheusRules.yml
  when: 'env == "staging"'
|
||||
|
|
57
roles/openshift-apps/websites/files/prometheusRules.yml
Normal file
57
roles/openshift-apps/websites/files/prometheusRules.yml
Normal file
|
@ -0,0 +1,57 @@
|
|||
# PrometheusRule with warning-level alerts for failed jobs, failed builds and
# unhealthy pods.
#
# The {{ ... }} placeholders in the annotations are Prometheus alert templates
# ($labels, $value). Values containing them are single-quoted or written as
# folded block scalars so they always parse as plain YAML strings.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: alerts
spec:
  groups:
    - name: jobFailed
      rules:
        - alert: JobFailed
          annotations:
            description: 'Job {{$labels.namespace}}/{{$labels.job_name}} has failed.'
            summary: At least one job has failed.
          expr: kube_job_failed > 0
          labels:
            severity: warning

    - name: BuildFailed
      rules:
        # NOTE(review): the annotations rely on `buildconfig` and `build`
        # labels being present on this metric - confirm against the exporter.
        - alert: BuildFailed
          annotations:
            description: >-
              Build {{$labels.namespace}}/{{$labels.buildconfig}}
              ({{$labels.build}}) has failed.
            summary: 'Build {{$labels.buildconfig}} has failed.'
          expr: openshift_build_status_phase_total{build_phase="failed"} > 0
          labels:
            severity: warning

    - name: PodFailing
      rules:
        - alert: PodPending
          annotations:
            description: >-
              Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state
              for more than 10m.
            summary: 'Pod {{$labels.pod}} is in pending state.'
          expr: kube_pod_status_phase{phase="Pending"} > 0
          for: 10m
          labels:
            severity: warning

        - alert: PodRestarted
          annotations:
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} has restarted.
            summary: 'A container in pod {{$labels.pod}} has restarted.'
          # increase() over the window replaces the hand-written
          # rate(...) * <window in seconds>.
          expr: increase(kube_pod_container_status_restarts_total[10m]) > 0
          labels:
            severity: warning

        - alert: PodCrashLoop
          annotations:
            # $value is the extrapolated restart count over the window, so it
            # may be fractional; hence the two-decimal format.
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} has restarted
              {{ printf "%.2f" $value }} times in the last 15 minutes.
            summary: 'Pod {{$labels.pod}} is in CrashLoop state.'
          expr: increase(kube_pod_container_status_restarts_total[15m]) > 2
          for: 15m
          labels:
            severity: warning

        - alert: PodOOMKilled
          annotations:
            description: >-
              Container {{$labels.container}} in Pod
              {{$labels.namespace}}/{{$labels.pod}} ran out of memory and has
              been killed.
            summary: 'A container in pod {{$labels.pod}} has been OOMKilled.'
          # The "last terminated reason" series describes the most recent
          # termination and stays set after the container is healthy again.
          # On its own it would keep this alert firing indefinitely, so it is
          # joined with a recent restart to fire only around the OOM kill.
          expr: >-
            (kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} == 1)
            and on (namespace, pod, container)
            (increase(kube_pod_container_status_restarts_total[10m]) > 0)
          labels:
            severity: warning
|
|
@ -57,6 +57,13 @@
|
|||
objectname: appowners.yml
|
||||
template_fullpath: "{{roles_path}}/openshift/project/templates/appowners.yml"
|
||||
|
||||
# Pass the project-level alertmanager.yml template to the openshift/object
# role, following the same include_role pattern as the neighbouring tasks.
- name: alertmanager.yml
  include_role:
    name: openshift/object
  vars:
    template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
    objectname: alertmanager.yml
|
||||
|
||||
- name: egresspolicy.yml
|
||||
include_role:
|
||||
name: openshift/object
|
||||
|
|
16
roles/openshift/project/templates/alertmanager.yml
Normal file
16
roles/openshift/project/templates/alertmanager.yml
Normal file
|
@ -0,0 +1,16 @@
|
|||
# AlertmanagerConfig created in the application's namespace ({{app}}).
# It declares one e-mail receiver named "default" and routes alerts to it,
# grouped by alert name and namespace.
apiVersion: monitoring.coreos.com/v1beta1
kind: AlertmanagerConfig
metadata:
  name: appowners-alerts
  namespace: "{{app}}"
spec:
  route:
    receiver: default
    groupBy:
      - alertname
      - namespace
  receivers:
    - name: default
      emailConfigs:
        # Each entry of `appowners` is paired with the fedoraproject.org
        # domain and joined with "@"; the resulting addresses are then joined
        # into a single comma-separated recipient string.
        - to: "{{ appowners | product(['fedoraproject.org']) | map('join', '@') | join(',') }}"
          sendResolved: true
|
Loading…
Add table
Add a link
Reference in a new issue