MDstable
NoteSnippetChecklistPlaybook

Prometheus — Déploiement et configuration

Prometheus en Docker : scraping, exporters (Node, cAdvisor, Blackbox), alerting avec Alertmanager, PromQL

snippet · intermediate · 2026-05-26 · 4 min read
prometheus · alertmanager · promql · exporters · docker · metrics · alerting

Architecture

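Targets / exporters : exposent leurs métriques sur /metrics (HTTP).
Prometheus (9090) : interroge (scrape) les targets, stocke les séries et évalue les règles d'alerte.
Alertmanager (9093) : reçoit les alertes de Prometheus et notifie par Email, Slack ou PagerDuty.
Grafana (3000) : utilise Prometheus comme datasource.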

Déploiement Docker

yaml
# docker-compose.yml
# Prometheus + Alertmanager stack. Nesting matters: every service lives under
# `services:` and the named volume is declared under the top-level `volumes:`.
services:
  prometheus:
    image: prom/prometheus:v2.51.0
    ports:
      - "9090:9090"
    volumes:
      # Main configuration file and alerting rules, provided by the host.
      - ./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml
      - ./prometheus/rules:/etc/prometheus/rules
      # Named volume mounted on the container's /prometheus directory.
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.retention.time=30d'
      # Exposes the HTTP lifecycle endpoints (e.g. POST /-/reload).
      - '--web.enable-lifecycle'

  alertmanager:
    image: prom/alertmanager:v0.27.0
    ports:
      - "9093:9093"
    volumes:
      - ./prometheus/alertmanager.yml:/etc/alertmanager/alertmanager.yml

volumes:
  prometheus_data: {}
bash
# Start both services in the background.
docker compose up --detach prometheus alertmanager
# Web UI: http://localhost:9090

prometheus.yml

yaml
# prometheus.yml: scrape, rule and alerting configuration.
global:
  scrape_interval: 15s
  evaluation_interval: 15s

# Where firing alerts are sent.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ["alertmanager:9093"]

# Relative paths are resolved from the directory of this config file.
rule_files:
  - "rules/*.yml"

scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets: ["localhost:9090"]

  - job_name: node
    static_configs:
      - targets: ["node-exporter:9100"]
        labels:
          env: prod

  - job_name: cadvisor
    static_configs:
      - targets: ["cadvisor:8080"]

  - job_name: blackbox
    metrics_path: /probe
    params:
      module: [http_2xx]
    static_configs:
      - targets:
          - https://mon-app.example.com
          - https://api.example.com
    relabel_configs:
      # 1. The listed URL becomes the `target` query parameter of the probe.
      - source_labels: [__address__]
        target_label: __param_target
      # 2. Keep the probed URL as the `instance` label.
      - source_labels: [__param_target]
        target_label: instance
      # 3. The scrape itself is sent to the Blackbox exporter.
      - target_label: __address__
        replacement: blackbox-exporter:9115

Exporters courants

yaml
# Service definitions to add under the `services:` key of docker-compose.yml.

# Node Exporter: Linux host metrics
node-exporter:
  image: prom/node-exporter:v1.7.0
  ports:
    - "9100:9100"
  volumes:
    - /proc:/host/proc:ro
    - /sys:/host/sys:ro
    - /:/rootfs:ro
  command:
    - '--path.procfs=/host/proc'
    - '--path.sysfs=/host/sys'
    # Point the filesystem collector at the host root mounted above;
    # without it, mount points are resolved inside the container.
    - '--path.rootfs=/rootfs'
    # `$$` is Compose escaping for a literal `$` in the regex.
    - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc)($$|/)'

# cAdvisor: Docker container metrics
cadvisor:
  image: gcr.io/cadvisor/cadvisor:v0.49.1
  ports:
    - "8080:8080"
  volumes:
    - /:/rootfs:ro
    - /var/run:/var/run:ro
    - /sys:/sys:ro
    - /var/lib/docker/:/var/lib/docker:ro
  privileged: true

# Blackbox Exporter: HTTP/TCP/ICMP probes
blackbox-exporter:
  image: prom/blackbox-exporter:v0.24.0
  ports:
    - "9115:9115"
  volumes:
    - ./prometheus/blackbox.yml:/etc/blackbox_exporter/config.yml

PromQL — Requêtes essentielles

promql
# CPU used (%) per instance
100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
# Available RAM (GiB)
node_memory_MemAvailable_bytes / 1024 / 1024 / 1024
# Used RAM (%)
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
# Disk used (%) per mount point
(node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100
# Inbound network traffic (Mibit/s), loopback excluded
rate(node_network_receive_bytes_total{device!="lo"}[5m]) * 8 / 1024 / 1024
# Outbound network traffic (Mibit/s), loopback excluded
rate(node_network_transmit_bytes_total{device!="lo"}[5m]) * 8 / 1024 / 1024
# Running containers
count(container_last_seen{name!=""})
# HTTP probe up/down
probe_success{job="blackbox"}
# HTTP latency (p95)
histogram_quantile(0.95, rate(http_request_duration_seconds_bucket[5m]))
# HTTP error ratio (5xx / all requests).
# Both sides are aggregated with sum(): dividing the raw vectors would only
# match series carrying identical labels (the 5xx ones), yielding 1 every time.
sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m]))

Règles d'alerte

yaml
# prometheus/rules/alerts.yml
# Infrastructure alerts. Every rule carries a `severity` label (used for
# routing) and both `summary` and `description` annotations, so notification
# templates that print the description never render an empty message.
groups:
  - name: infra
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} ({{ $labels.job }}) inaccessible depuis 2 min"

      - alert: HighCPU
        expr: 100 - (avg by(instance)(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "CPU élevé sur {{ $labels.instance }}"
          description: "CPU à {{ $value | printf \"%.0f\" }}%"

      - alert: LowDiskSpace
        expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100 < 15
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Disque faible sur {{ $labels.instance }}"
          description: "Seulement {{ $value | printf \"%.0f\" }}% disponible sur /"

      - alert: HighMemory
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "RAM critique sur {{ $labels.instance }}"
          description: "RAM utilisée à {{ $value | printf \"%.0f\" }}%"

      - alert: SiteDown
        expr: probe_success{job="blackbox"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Site inaccessible : {{ $labels.instance }}"
          description: "{{ $labels.instance }} ne répond plus aux sondes Blackbox depuis 1 min"

Alertmanager

yaml
# prometheus/alertmanager.yml
# Routing: critical alerts go to Slack, warnings to e-mail, anything else to
# the default e-mail receiver.
global:
  resolve_timeout: 5m
  smtp_from: alertmanager@example.com
  smtp_smarthost: 'smtp.example.com:587'
  smtp_auth_username: alertmanager@example.com
  # Placeholder: inject the real secret at deploy time, do not commit it.
  smtp_auth_password: "<PASSWORD>"

route:
  group_by: [alertname, instance]
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h
  receiver: default
  routes:
    # `matchers` replaces the deprecated `match` / `match_re` keys.
    - matchers:
        - 'severity="critical"'
      receiver: critical-slack
    - matchers:
        - 'severity="warning"'
      receiver: email-ops

receivers:
  - name: default
    email_configs:
      - to: ops@example.com

  - name: critical-slack
    slack_configs:
      # Placeholder: the webhook URL is a secret.
      - api_url: "<SLACK_WEBHOOK_URL>"
        channel: "#alerts-critical"
        title: "CRITIQUE — {{ .GroupLabels.alertname }}"
        # One line per alert in the group, taken from its `description`.
        text: "{{ range .Alerts }}{{ .Annotations.description }}\n{{ end }}"

  - name: email-ops
    email_configs:
      - to: ops@example.com
        subject: "[WARNING] {{ .GroupLabels.alertname }}"

Commandes utiles

bash
# Reload the configuration without restarting (needs --web.enable-lifecycle)
curl -X POST http://localhost:9090/-/reload

# Validate the configuration
# (`docker compose exec` targets the service name, whatever the container is called)
docker compose exec prometheus promtool check config /etc/prometheus/prometheus.yml

# Validate the alerting rules
docker compose exec prometheus promtool check rules /etc/prometheus/rules/alerts.yml

# Fire a test alert manually
# (API v2: the v1 API no longer exists in Alertmanager 0.27.0)
curl -X POST http://localhost:9093/api/v2/alerts \
  -H "Content-Type: application/json" \
  -d '[{"labels":{"alertname":"TestAlert","severity":"warning"},"annotations":{"summary":"Test"}}]'

# Target status
curl -s http://localhost:9090/api/v1/targets | jq '.data.activeTargets[] | {job: .labels.job, health, lastError}'
💡 Tip —

Activer --web.enable-lifecycle pour recharger la config via POST /-/reload sans redémarrer le container. Indispensable en production pour éviter les interruptions lors d'ajout de targets.

⚠ Attention —

Par défaut Prometheus n'a pas d'authentification. En production, protéger l'interface via un reverse proxy (Nginx basic auth ou OAuth2 Proxy) et restreindre l'accès réseau.

OPS·BRAIN v1.05 notes · Monitoringlocal