ocp_monitoring: move rules to openshift/project role
This commit is contained in:
parent
842db118e8
commit
caa4f85ed2
5 changed files with 38 additions and 15 deletions
|
@ -13,6 +13,7 @@
|
||||||
- role: openshift/project
|
- role: openshift/project
|
||||||
app: websites
|
app: websites
|
||||||
description: Fedora websites building
|
description: Fedora websites building
|
||||||
|
alerting: true
|
||||||
appowners:
|
appowners:
|
||||||
- codeblock
|
- codeblock
|
||||||
- ryanlerch
|
- ryanlerch
|
||||||
|
@ -100,8 +101,3 @@
|
||||||
objectname: deployment.yml
|
objectname: deployment.yml
|
||||||
when: env == "staging"
|
when: env == "staging"
|
||||||
|
|
||||||
- role: openshift/object
|
|
||||||
app: websites
|
|
||||||
file: prometheusRules.yml
|
|
||||||
objectname: prometheusRules.yml
|
|
||||||
when: env == "staging"
|
|
||||||
|
|
|
@ -3,3 +3,9 @@ allow_fas_db: false
|
||||||
allow_iad2: true
|
allow_iad2: true
|
||||||
|
|
||||||
egress_policy_template: "{{roles_path}}/openshift/project/templates/egresspolicy.yml"
|
egress_policy_template: "{{roles_path}}/openshift/project/templates/egresspolicy.yml"
|
||||||
|
|
||||||
|
alerting: false
|
||||||
|
alert_rules:
|
||||||
|
- pod
|
||||||
|
- cronjob
|
||||||
|
- buildconfig
|
||||||
|
|
|
@ -64,6 +64,14 @@
|
||||||
objectname: alertmanager.yml
|
objectname: alertmanager.yml
|
||||||
template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
|
template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
|
||||||
|
|
||||||
|
- name: prometheusRules.yml
|
||||||
|
include_role:
|
||||||
|
name: openshift/object
|
||||||
|
vars:
|
||||||
|
objectname: prometheusRules.yml
|
||||||
|
template_fullpath: "{{roles_path}}/openshift/project/templates/prometheusRules.yml"
|
||||||
|
when: alerting
|
||||||
|
|
||||||
- name: egresspolicy.yml
|
- name: egresspolicy.yml
|
||||||
include_role:
|
include_role:
|
||||||
name: openshift/object
|
name: openshift/object
|
||||||
|
|
|
@ -14,3 +14,4 @@ spec:
|
||||||
- alertname
|
- alertname
|
||||||
- namespace
|
- namespace
|
||||||
receiver: default
|
receiver: default
|
||||||
|
repeat_interval: 72h
|
||||||
|
|
|
@ -1,35 +1,46 @@
|
||||||
|
#jinja2:variable_start_string:'@@',variable_end_string:'@@'
|
||||||
|
---
|
||||||
apiVersion: monitoring.coreos.com/v1
|
apiVersion: monitoring.coreos.com/v1
|
||||||
kind: PrometheusRule
|
kind: PrometheusRule
|
||||||
metadata:
|
metadata:
|
||||||
name: alerts
|
name: alerts
|
||||||
spec:
|
spec:
|
||||||
groups:
|
groups:
|
||||||
- name: jobFailed
|
{% if 'cronjob' in alert_rules %}
|
||||||
|
- name: CronjobFailed
|
||||||
rules:
|
rules:
|
||||||
- alert: JobFailed
|
- alert: CronjobFailed
|
||||||
annotations:
|
annotations:
|
||||||
description: Job {{$labels.namespace}}/{{$labels.job_name}} has failed.
|
description: Latest execution of CronJob {{$labels.namespace}}/{{$labels.cronjob}} failed to complete.
|
||||||
summary: At least one job has failed.
|
summary: Latest job {{$labels.cronjob}} has failed.
|
||||||
expr: kube_job_failed > 0
|
expr: kube_cronjob_status_last_schedule_time - kube_cronjob_status_last_successful_time > 0
|
||||||
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
{% endif %}
|
||||||
|
{% if 'buildconfig' in alert_rules %}
|
||||||
- name: BuildFailed
|
- name: BuildFailed
|
||||||
rules:
|
rules:
|
||||||
- alert: BuildFailed
|
- alert: BuildFailed
|
||||||
annotations:
|
annotations:
|
||||||
description: Build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) has failed.
|
description: Last build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) failed to complete.
|
||||||
summary: Build {{$labels.buildconfig}} has failed.
|
summary: Last build {{$labels.buildconfig}} has failed.
|
||||||
expr: openshift_build_status_phase_total{build_phase="failed"} > 0
|
expr: topk(1,openshift_build_completed_timestamp_seconds) by (buildconfig)
|
||||||
|
* ON(build,buildconfig,namespace) openshift_build_status_phase_total{build_phase="failed"}
|
||||||
|
> 0
|
||||||
|
for: 5m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
{% endif %}
|
||||||
|
{% if 'pod' in alert_rules %}
|
||||||
- name: PodFailing
|
- name: PodFailing
|
||||||
rules:
|
rules:
|
||||||
- alert: PodPending
|
- alert: PodPending
|
||||||
annotations:
|
annotations:
|
||||||
description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m.
|
description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m.
|
||||||
summary: Pod {{$labels.pod}} is in pending state.
|
summary: Pod {{$labels.pod}} is stuck in pending state.
|
||||||
expr: kube_pod_status_phase{phase="Pending"} > 0
|
expr: kube_pod_status_phase{phase="Pending"} > 0
|
||||||
for: 10m
|
for: 15m
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
- alert: PodRestarted
|
- alert: PodRestarted
|
||||||
|
@ -55,3 +66,4 @@ spec:
|
||||||
expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
|
expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
|
||||||
labels:
|
labels:
|
||||||
severity: warning
|
severity: warning
|
||||||
|
{% endif %}
|
Loading…
Add table
Add a link
Reference in a new issue