ocp_monitoring: move rules to openshift/project role

This commit is contained in:
Francois Andrieu 2023-01-26 22:35:40 +01:00 committed by kevin
parent 842db118e8
commit caa4f85ed2
5 changed files with 38 additions and 15 deletions

View file

@ -13,6 +13,7 @@
- role: openshift/project - role: openshift/project
app: websites app: websites
description: Fedora websites building description: Fedora websites building
alerting: true
appowners: appowners:
- codeblock - codeblock
- ryanlerch - ryanlerch
@ -100,8 +101,3 @@
objectname: deployment.yml objectname: deployment.yml
when: env == "staging" when: env == "staging"
- role: openshift/object
app: websites
file: prometheusRules.yml
objectname: prometheusRules.yml
when: env == "staging"

View file

@ -3,3 +3,9 @@ allow_fas_db: false
allow_iad2: true allow_iad2: true
egress_policy_template: "{{roles_path}}/openshift/project/templates/egresspolicy.yml" egress_policy_template: "{{roles_path}}/openshift/project/templates/egresspolicy.yml"
alerting: false
alert_rules:
- pod
- cronjob
- buildconfig

View file

@ -64,6 +64,14 @@
objectname: alertmanager.yml objectname: alertmanager.yml
template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml" template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
- name: prometheusRules.yml
include_role:
name: openshift/object
vars:
objectname: prometheusRules.yml
template_fullpath: "{{roles_path}}/openshift/project/templates/prometheusRules.yml"
when: alerting
- name: egresspolicy.yml - name: egresspolicy.yml
include_role: include_role:
name: openshift/object name: openshift/object

View file

@ -14,3 +14,4 @@ spec:
- alertname - alertname
- namespace - namespace
receiver: default receiver: default
repeat_interval: 72h

View file

@ -1,35 +1,46 @@
#jinja2:variable_start_string:'@@',variable_end_string:'@@'
---
apiVersion: monitoring.coreos.com/v1 apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule kind: PrometheusRule
metadata: metadata:
name: alerts name: alerts
spec: spec:
groups: groups:
- name: jobFailed {% if 'cronjob' in alert_rules %}
- name: CronjobFailed
rules: rules:
- alert: JobFailed - alert: CronjobFailed
annotations: annotations:
description: Job {{$labels.namespace}}/{{$labels.job_name}} has failed. description: Latest execution of CronJob {{$labels.namespace}}/{{$labels.cronjob}} failed to complete.
summary: At least one job has failed. summary: Latest job {{$labels.cronjob}} has failed.
expr: kube_job_failed > 0 expr: kube_cronjob_status_last_schedule_time - kube_cronjob_status_last_successful_time > 0
for: 5m
labels: labels:
severity: warning severity: warning
{% endif %}
{% if 'buildconfig' in alert_rules %}
- name: BuildFailed - name: BuildFailed
rules: rules:
- alert: BuildFailed - alert: BuildFailed
annotations: annotations:
description: Build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) has failed. description: Last build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) failed to complete.
summary: Build {{$labels.buildconfig}} has failed. summary: Last build {{$labels.buildconfig}} has failed.
expr: openshift_build_status_phase_total{build_phase="failed"} > 0 expr: topk(1,openshift_build_completed_timestamp_seconds) by (buildconfig)
* ON(build,buildconfig,namespace) openshift_build_status_phase_total{build_phase="failed"}
> 0
for: 5m
labels: labels:
severity: warning severity: warning
{% endif %}
{% if 'pod' in alert_rules %}
- name: PodFailing - name: PodFailing
rules: rules:
- alert: PodPending - alert: PodPending
annotations: annotations:
description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m. description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m.
summary: Pod {{$labels.pod}} is in pending state. summary: Pod {{$labels.pod}} is stuck in pending state.
expr: kube_pod_status_phase{phase="Pending"} > 0 expr: kube_pod_status_phase{phase="Pending"} > 0
for: 10m for: 15m
labels: labels:
severity: warning severity: warning
- alert: PodRestarted - alert: PodRestarted
@ -55,3 +66,4 @@ spec:
expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0 expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
labels: labels:
severity: warning severity: warning
{% endif %}