From 646a390c9a31e1a50aab49497a14782dd0200491 Mon Sep 17 00:00:00 2001
From: Adam Saleh
Date: Wed, 14 Apr 2021 16:26:21 +0200
Subject: [PATCH] Added more prometheus documentation

---
 .../monitoring_metrics/prometheus_for_dev.rst | 31 ++++++
 .../monitoring_metrics/prometheus_for_ops.rst | 94 ++++++++++++++++++-
 2 files changed, 124 insertions(+), 1 deletion(-)

diff --git a/docs/monitoring_metrics/prometheus_for_dev.rst b/docs/monitoring_metrics/prometheus_for_dev.rst
index 5d10dbe..5cc333d 100644
--- a/docs/monitoring_metrics/prometheus_for_dev.rst
+++ b/docs/monitoring_metrics/prometheus_for_dev.rst
@@ -10,6 +10,7 @@ This way, the metrics will be scraped into the configured prometheus and correctly
 As an example, let's look at the ServiceMonitor for bodhi:
 
 ::
+
     apiVersion: monitoring.coreos.com/v1
     kind: ServiceMonitor
     metadata:
@@ -30,6 +31,7 @@ machinery at our disposal, see `Matcher
+          ... > 1
+      labels:
+        severity: high
+
+would alert if there is more than 1% of responses with a 500 status code.
\ No newline at end of file

diff --git a/docs/monitoring_metrics/prometheus_for_ops.rst b/docs/monitoring_metrics/prometheus_for_ops.rst
index 8ba4373..38ec8bd 100644
--- a/docs/monitoring_metrics/prometheus_for_ops.rst
+++ b/docs/monitoring_metrics/prometheus_for_ops.rst
@@ -112,4 +112,96 @@ https://github.com/timescale/promscale
     remote_write:
     - url: "http://promscale:9201/write"
     remote_read:
-    - url: "http://promscale:9201/read"
\ No newline at end of file
+    - url: "http://promscale:9201/read"
+
+Notes on auxiliary services
+---------------------------
+
+Prometheus is primarily targeted at collecting metrics from services that have been
+instrumented to expose them. If your service is not instrumented, or it is not a
+long-running service at all, e.g. a batch job, you need an adapter to help with
+the metrics collection.
+
+There are two services that help with this:
+
+* the `blackbox exporter <https://github.com/prometheus/blackbox_exporter>`_, to monitor services that have not been instrumented, by probing their public API
+* the `push gateway <https://github.com/prometheus/pushgateway>`_, which helps collect metrics from batch jobs
+
+Maintaining the push gateway can be relegated to the application developer,
+as it is lightweight, and by collecting metrics from the namespace it is running in,
+the data will be correctly labeled.
+
+The blackbox exporter, on the other hand, can be beneficial to run as a prometheus
+side-car, in a similar fashion to how we configure the oauth-proxy, by adding this
+to the containers section of the prometheus definition:
+
+::
+
+    - name: blackbox-exporter
+      volumeMounts:
+      - name: configmap-blackbox
+        mountPath: /etc/blackbox-config
+      - mountPath: /etc/tls/private
+        name: secret-prometheus-k8s-tls
+      image: quay.io/prometheus/blackbox-exporter:4.4
+      args:
+      - '--config.file=/etc/blackbox-config/blackbox.yml'
+      ports:
+      - containerPort: 9115
+        name: blackbox
+
+We can then specify what is to be monitored through the configmap-blackbox; you can
+find `relevant examples <https://github.com/prometheus/blackbox_exporter>`_ in the
+project repo. Because the blackbox exporter runs in the same pod as prometheus, we
+need to use the additional-scrape-config to add it as a scrape target.
+
+Notes on alerting
+-----------------
+
+Prometheus, as is, can have rules configured that trigger alerts once a specific
+query evaluates to true. The definition of such a rule is explained in the companion
+docs on prometheus for developers, and the rule can be created in the namespace of
+the running application.
+
+Here, we need to focus on what happens with an alert after prometheus decides,
+based on a rule, that it should fire.
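+
+For concreteness, here is a minimal sketch of such a rule as a PrometheusRule
+object. It is an illustration only: the metric name ``http_requests_total`` and
+its ``status`` label are assumptions, so substitute whatever your application
+actually exports:
+
+::
+
+    apiVersion: monitoring.coreos.com/v1
+    kind: PrometheusRule
+    metadata:
+      name: example-alert-rules
+      labels:
+        app: example            # must match the ruleSelector of your prometheus
+    spec:
+      groups:
+      - name: example.rules
+        rules:
+        - alert: TooManyServerErrors
+          # share of 500 responses over the last 5 minutes, in percent
+          expr: >
+            100 * sum(rate(http_requests_total{status="500"}[5m]))
+            / sum(rate(http_requests_total[5m])) > 1
+          for: 5m
+          labels:
+            severity: high
+
+Once the query stays above the threshold for the whole ``for:`` duration,
+prometheus marks the alert as firing and hands it over to alertmanager, as
+described below.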
+
+In the prometheus CRD definition, there is a section about the alertmanager that is
+supposed to manage the forwarding of these alerts.
+
+::
+
+    alerting:
+      alertmanagers:
+      - bearerTokenFile: /var/run/secrets/kubernetes.io/serviceaccount/token
+        name: alertmanager-service
+        namespace: application-monitoring
+        port: web
+        scheme: https
+        tlsConfig:
+          caFile: /var/run/secrets/kubernetes.io/serviceaccount/service-ca.crt
+          serverName: alertmanager-service.application-monitoring.svc
+
+We already have alertmanager running and configured by the alertmanager-operator.
+Alertmanager itself is rather simplistic, with a simple UI and API that allow for
+silencing an alert for a given amount of time.
+
+It is expected that the actual user interaction happens elsewhere, either through
+services like OpsGenie, or through e.g. an integration with zabbix.
+
+More of a build-it-yourself solution is to use e.g. https://karma-dashboard.io/,
+but we haven't tried any of these as part of our POC.
+
+To be notified of an alert, you need to have the `correct receiver configuration
+<https://prometheus.io/docs/alerting/latest/configuration/>`_ in the alertmanager's
+secret:
+
+::
+
+    global:
+      resolve_timeout: 5m
+    route:
+      group_by: ['job']
+      group_wait: 10s
+      group_interval: 10s
+      repeat_interval: 30m
+      receiver: 'email'
+    receivers:
+    - name: 'email'
+      email_configs:
+      - to: 'asaleh@redhat.com'
\ No newline at end of file
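+
+As a variation on the above, here is a sketch of how the routing tree could fan
+out on the ``severity`` label that the alerting rules set. It is an illustration
+only: the receiver names and addresses are made up and the timings are not tuned:
+
+::
+
+    route:
+      group_by: ['job']
+      receiver: 'email'           # default for everything unmatched
+      routes:
+      - match:
+          severity: high          # label set by the alerting rule
+        receiver: 'email-oncall'
+        repeat_interval: 10m      # re-notify more often for high severity
+    receivers:
+    - name: 'email'
+      email_configs:
+      - to: 'team@example.com'    # illustrative address
+    - name: 'email-oncall'
+      email_configs:
+      - to: 'oncall@example.com'  # illustrative address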