ocp_monitoring: move rules to openshift/project role

This commit is contained in:
Francois Andrieu 2023-01-26 22:35:40 +01:00 committed by kevin
parent 842db118e8
commit caa4f85ed2
5 changed files with 38 additions and 15 deletions

View file

@ -13,6 +13,7 @@
- role: openshift/project
app: websites
description: Fedora websites building
alerting: true
appowners:
- codeblock
- ryanlerch
@ -100,8 +101,3 @@
objectname: deployment.yml
when: env == "staging"
- role: openshift/object
app: websites
file: prometheusRules.yml
objectname: prometheusRules.yml
when: env == "staging"

View file

@ -3,3 +3,9 @@ allow_fas_db: false
allow_iad2: true
egress_policy_template: "{{roles_path}}/openshift/project/templates/egresspolicy.yml"
alerting: false
# Default set of alert groups to render into the PrometheusRule template.
# Each name listed here enables the template section guarded by the matching
# `{% if '<name>' in alert_rules %}` test (pod, cronjob, buildconfig).
alert_rules:
- pod
- cronjob
- buildconfig

View file

@ -64,6 +64,14 @@
objectname: alertmanager.yml
template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
# Include the openshift/object role with the project role's
# prometheusRules.yml template as the object to process.
# Guarded by the `alerting` variable, so nothing is included when it is false.
- name: prometheusRules.yml
include_role:
name: openshift/object
vars:
objectname: prometheusRules.yml
template_fullpath: "{{roles_path}}/openshift/project/templates/prometheusRules.yml"
when: alerting
- name: egresspolicy.yml
include_role:
name: openshift/object

View file

@ -14,3 +14,4 @@ spec:
- alertname
- namespace
receiver: default
repeat_interval: 72h

View file

@ -1,35 +1,46 @@
#jinja2:variable_start_string:'@@',variable_end_string:'@@'
---
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
name: alerts
spec:
groups:
- name: jobFailed
{% if 'cronjob' in alert_rules %}
- name: CronjobFailed
rules:
- alert: JobFailed
- alert: CronjobFailed
annotations:
description: Job {{$labels.namespace}}/{{$labels.job_name}} has failed.
summary: At least one job has failed.
expr: kube_job_failed > 0
description: Latest execution of CronJob {{$labels.namespace}}/{{$labels.cronjob}} failed to complete.
summary: Latest job {{$labels.cronjob}} has failed.
expr: kube_cronjob_status_last_schedule_time - kube_cronjob_status_last_successful_time > 0
for: 5m
labels:
severity: warning
{% endif %}
{% if 'buildconfig' in alert_rules %}
- name: BuildFailed
rules:
- alert: BuildFailed
annotations:
description: Build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) has failed.
summary: Build {{$labels.buildconfig}} has failed.
expr: openshift_build_status_phase_total{build_phase="failed"} > 0
description: Last build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) failed to complete.
summary: Last build {{$labels.buildconfig}} has failed.
expr: topk(1,openshift_build_completed_timestamp_seconds) by (buildconfig)
* ON(build,buildconfig,namespace) openshift_build_status_phase_total{build_phase="failed"}
> 0
for: 5m
labels:
severity: warning
{% endif %}
{% if 'pod' in alert_rules %}
- name: PodFailing
rules:
- alert: PodPending
annotations:
description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m.
summary: Pod {{$labels.pod}} is in pending state.
summary: Pod {{$labels.pod}} is stuck in pending state.
expr: kube_pod_status_phase{phase="Pending"} > 0
for: 10m
for: 15m
labels:
severity: warning
- alert: PodRestarted
@ -55,3 +66,4 @@ spec:
expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
labels:
severity: warning
{% endif %}