ocp_monitoring: move rules to openshift/project role
This commit is contained in:
parent
842db118e8
commit
caa4f85ed2
5 changed files with 38 additions and 15 deletions
|
@@ -13,6 +13,7 @@
|
|||
- role: openshift/project
|
||||
app: websites
|
||||
description: Fedora websites building
|
||||
alerting: true
|
||||
appowners:
|
||||
- codeblock
|
||||
- ryanlerch
|
||||
|
@@ -100,8 +101,3 @@
|
|||
objectname: deployment.yml
|
||||
when: env == "staging"
|
||||
|
||||
- role: openshift/object
|
||||
app: websites
|
||||
file: prometheusRules.yml
|
||||
objectname: prometheusRules.yml
|
||||
when: env == "staging"
|
||||
|
|
|
@@ -3,3 +3,9 @@ allow_fas_db: false
|
|||
allow_iad2: true
|
||||
|
||||
egress_policy_template: "{{roles_path}}/openshift/project/templates/egresspolicy.yml"
|
||||
|
||||
alerting: false
|
||||
alert_rules:
|
||||
- pod
|
||||
- cronjob
|
||||
- buildconfig
|
||||
|
|
|
@@ -64,6 +64,14 @@
|
|||
objectname: alertmanager.yml
|
||||
template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
|
||||
|
||||
- name: prometheusRules.yml
|
||||
include_role:
|
||||
name: openshift/object
|
||||
vars:
|
||||
objectname: prometheusRules.yml
|
||||
template_fullpath: "{{roles_path}}/openshift/project/templates/prometheusRules.yml"
|
||||
when: alerting
|
||||
|
||||
- name: egresspolicy.yml
|
||||
include_role:
|
||||
name: openshift/object
|
||||
|
|
|
@@ -14,3 +14,4 @@ spec:
|
|||
- alertname
|
||||
- namespace
|
||||
receiver: default
|
||||
repeat_interval: 72h
|
||||
|
|
|
@@ -1,35 +1,46 @@
|
|||
#jinja2:variable_start_string:'@@',variable_end_string:'@@'
|
||||
---
|
||||
apiVersion: monitoring.coreos.com/v1
|
||||
kind: PrometheusRule
|
||||
metadata:
|
||||
name: alerts
|
||||
spec:
|
||||
groups:
|
||||
- name: jobFailed
|
||||
{% if 'cronjob' in alert_rules %}
|
||||
- name: CronjobFailed
|
||||
rules:
|
||||
- alert: JobFailed
|
||||
- alert: CronjobFailed
|
||||
annotations:
|
||||
description: Job {{$labels.namespace}}/{{$labels.job_name}} has failed.
|
||||
summary: At least one job has failed.
|
||||
expr: kube_job_failed > 0
|
||||
description: Latest execution of CronJob {{$labels.namespace}}/{{$labels.cronjob}} failed to complete.
|
||||
summary: Latest job {{$labels.cronjob}} has failed.
|
||||
expr: kube_cronjob_status_last_schedule_time - kube_cronjob_status_last_successful_time > 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{% endif %}
|
||||
{% if 'buildconfig' in alert_rules %}
|
||||
- name: BuildFailed
|
||||
rules:
|
||||
- alert: BuildFailed
|
||||
annotations:
|
||||
description: Build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) has failed.
|
||||
summary: Build {{$labels.buildconfig}} has failed.
|
||||
expr: openshift_build_status_phase_total{build_phase="failed"} > 0
|
||||
description: Last build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) failed to complete.
|
||||
summary: Last build {{$labels.buildconfig}} has failed.
|
||||
expr: topk(1,openshift_build_completed_timestamp_seconds) by (buildconfig)
|
||||
* ON(build,buildconfig,namespace) openshift_build_status_phase_total{build_phase="failed"}
|
||||
> 0
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
{% endif %}
|
||||
{% if 'pod' in alert_rules %}
|
||||
- name: PodFailing
|
||||
rules:
|
||||
- alert: PodPending
|
||||
annotations:
|
||||
description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m.
|
||||
summary: Pod {{$labels.pod}} is in pending state.
|
||||
summary: Pod {{$labels.pod}} is stuck in pending state.
|
||||
expr: kube_pod_status_phase{phase="Pending"} > 0
|
||||
for: 10m
|
||||
for: 15m
|
||||
labels:
|
||||
severity: warning
|
||||
- alert: PodRestarted
|
||||
|
@@ -55,3 +66,4 @@ spec:
|
|||
expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
|
||||
labels:
|
||||
severity: warning
|
||||
{% endif %}
|
Loading…
Add table
Add a link
Reference in a new issue