From caa4f85ed212c7b02cf68bde43f9702fa6470c53 Mon Sep 17 00:00:00 2001
From: Francois Andrieu
Date: Thu, 26 Jan 2023 22:35:40 +0100
Subject: [PATCH] ocp_monitoring: move rules to openshift/project role

Move the PrometheusRule definition out of the websites app and into the
openshift/project role, so that any project can opt in by passing
`alerting: true` to the role.

- Add the `alerting` (default: false) and `alert_rules` (default: pod,
  cronjob, buildconfig) role defaults. Each rule group in the template is
  rendered only when its name is listed in `alert_rules`.
- The template switches the Jinja2 variable delimiters to `@@` so that the
  Prometheus `{{$labels.*}}` placeholders are passed through untouched.
- Replace the JobFailed alert with CronjobFailed, and make BuildFailed
  consider only the most recently completed build of each buildconfig.
- Raise the PodPending hold time from 10m to 15m (description updated to
  match) and set a 72h repeat interval on the alertmanager route.

---
Reviewer notes (ignored by `git am`):

* The websites rules object used to be created only when
  `env == "staging"`; the new task is gated on `alerting` alone, so the
  rules are now created wherever the openshift/project role runs for
  websites, unless that is restricted elsewhere.
* `repeatInterval` uses the camelCase spelling of the AlertmanagerConfig
  CRD; this assumes alertmanager.yml is such a resource (the hunk sits
  under `spec:`) -- please confirm.
* Indentation of context lines was reconstructed; if the patch does not
  apply cleanly, try `git apply --ignore-whitespace`.

 playbooks/openshift-apps/websites.yml      |  6 +---
 roles/openshift/project/defaults/main.yml  |  6 ++++
 roles/openshift/project/tasks/main.yml     |  8 +++++
 .../project/templates/alertmanager.yml     |  1 +
 .../project/templates}/prometheusRules.yml | 34 +++++++++++++------
 5 files changed, 39 insertions(+), 16 deletions(-)
 rename roles/{openshift-apps/websites/files => openshift/project/templates}/prometheusRules.yml (62%)

diff --git a/playbooks/openshift-apps/websites.yml b/playbooks/openshift-apps/websites.yml
index 89842b6908..327e2bd83b 100644
--- a/playbooks/openshift-apps/websites.yml
+++ b/playbooks/openshift-apps/websites.yml
@@ -13,6 +13,7 @@
   - role: openshift/project
     app: websites
     description: Fedora websites building
+    alerting: true
     appowners:
     - codeblock
     - ryanlerch
@@ -100,8 +101,3 @@
     objectname: deployment.yml
     when: env == "staging"
 
-  - role: openshift/object
-    app: websites
-    file: prometheusRules.yml
-    objectname: prometheusRules.yml
-    when: env == "staging"
diff --git a/roles/openshift/project/defaults/main.yml b/roles/openshift/project/defaults/main.yml
index 25dc975d60..b347339dd8 100644
--- a/roles/openshift/project/defaults/main.yml
+++ b/roles/openshift/project/defaults/main.yml
@@ -3,3 +3,9 @@
 allow_fas_db: false
 allow_iad2: true
 egress_policy_template: "{{roles_path}}/openshift/project/templates/egresspolicy.yml"
+
+alerting: false
+alert_rules:
+- pod
+- cronjob
+- buildconfig
diff --git a/roles/openshift/project/tasks/main.yml b/roles/openshift/project/tasks/main.yml
index e764a7b526..8e84ea53b5 100644
--- a/roles/openshift/project/tasks/main.yml
+++ b/roles/openshift/project/tasks/main.yml
@@ -64,6 +64,14 @@
     objectname: alertmanager.yml
     template_fullpath: "{{roles_path}}/openshift/project/templates/alertmanager.yml"
 
+- name: prometheusRules.yml
+  include_role:
+    name: openshift/object
+  vars:
+    objectname: prometheusRules.yml
+    template_fullpath: "{{roles_path}}/openshift/project/templates/prometheusRules.yml"
+  when: alerting
+
 - name: egresspolicy.yml
   include_role:
     name: openshift/object
diff --git a/roles/openshift/project/templates/alertmanager.yml b/roles/openshift/project/templates/alertmanager.yml
index 0065c6bb6f..9451006ba0 100644
--- a/roles/openshift/project/templates/alertmanager.yml
+++ b/roles/openshift/project/templates/alertmanager.yml
@@ -14,3 +14,4 @@ spec:
     - alertname
     - namespace
     receiver: default
+    repeatInterval: 72h
diff --git a/roles/openshift-apps/websites/files/prometheusRules.yml b/roles/openshift/project/templates/prometheusRules.yml
similarity index 62%
rename from roles/openshift-apps/websites/files/prometheusRules.yml
rename to roles/openshift/project/templates/prometheusRules.yml
index 436870b4ef..dd14e2f7b6 100644
--- a/roles/openshift-apps/websites/files/prometheusRules.yml
+++ b/roles/openshift/project/templates/prometheusRules.yml
@@ -1,35 +1,46 @@
+#jinja2:variable_start_string:'@@',variable_end_string:'@@'
+---
 apiVersion: monitoring.coreos.com/v1
 kind: PrometheusRule
 metadata:
   name: alerts
 spec:
   groups:
-  - name: jobFailed
+{% if 'cronjob' in alert_rules %}
+  - name: CronjobFailed
     rules:
-    - alert: JobFailed
+    - alert: CronjobFailed
       annotations:
-        description: Job {{$labels.namespace}}/{{$labels.job_name}} has failed.
-        summary: At least one job has failed.
-      expr: kube_job_failed > 0
+        description: Latest execution of CronJob {{$labels.namespace}}/{{$labels.cronjob}} failed to complete.
+        summary: Latest job {{$labels.cronjob}} has failed.
+      expr: kube_cronjob_status_last_schedule_time - kube_cronjob_status_last_successful_time > 0
+      for: 5m
       labels:
         severity: warning
+{% endif %}
+{% if 'buildconfig' in alert_rules %}
   - name: BuildFailed
     rules:
     - alert: BuildFailed
       annotations:
-        description: Build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) has failed.
-        summary: Build {{$labels.buildconfig}} has failed.
-      expr: openshift_build_status_phase_total{build_phase="failed"} > 0
+        description: Last build {{$labels.namespace}}/{{$labels.buildconfig}} ({{$labels.build}}) failed to complete.
+        summary: Last build {{$labels.buildconfig}} has failed.
+      expr: topk(1,openshift_build_completed_timestamp_seconds) by (buildconfig)
+        * ON(build,buildconfig,namespace) openshift_build_status_phase_total{build_phase="failed"}
+        > 0
+      for: 5m
       labels:
         severity: warning
+{% endif %}
+{% if 'pod' in alert_rules %}
   - name: PodFailing
     rules:
     - alert: PodPending
       annotations:
-        description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 10m.
-        summary: Pod {{$labels.pod}} is in pending state.
+        description: Pod {{$labels.namespace}}/{{$labels.pod}} is in pending state for more than 15m.
+        summary: Pod {{$labels.pod}} is stuck in pending state.
       expr: kube_pod_status_phase{phase="Pending"} > 0
-      for: 10m
+      for: 15m
       labels:
         severity: warning
     - alert: PodRestarted
@@ -55,3 +66,4 @@ spec:
       expr: kube_pod_container_status_last_terminated_reason{reason="OOMKilled"} > 0
       labels:
         severity: warning
+{% endif %}