Advanced Monitoring with Prometheus and AlertManager
Introduction
Prometheus and AlertManager provide comprehensive monitoring and alerting for distributed systems. This guide covers metrics collection, alerting rules, and notification routing.
Prerequisites
- Kubernetes cluster or Docker environment
- Basic understanding of monitoring concepts
Step 1: Prometheus Configuration
Create prometheus.yml:
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  external_labels:
    cluster: 'production'
    region: 'us-east-1'

rule_files:
  - "alert_rules.yml"
  - "recording_rules.yml"

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - alertmanager:9093

scrape_configs:
  # Prometheus itself
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Node Exporter
  - job_name: 'node'
    static_configs:
      - targets:
          - 'node-exporter:9100'
    scrape_interval: 30s
    metrics_path: /metrics

  # Application metrics
  - job_name: 'app'
    kubernetes_sd_configs:
      - role: pod
    relabel_configs:
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: true
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)
      - source_labels: [__address__, __meta_kubernetes_pod_annotation_prometheus_io_port]
        action: replace
        regex: ([^:]+)(?::\d+)?;(\d+)
        replacement: $1:$2
        target_label: __address__
      - action: labelmap
        regex: __meta_kubernetes_pod_label_(.+)
      - source_labels: [__meta_kubernetes_namespace]
        action: replace
        target_label: kubernetes_namespace
      - source_labels: [__meta_kubernetes_pod_name]
        action: replace
        target_label: kubernetes_pod_name

  # Kafka metrics
  - job_name: 'kafka'
    static_configs:
      - targets: ['kafka-exporter:9308']
    scrape_interval: 30s

  # PostgreSQL metrics
  - job_name: 'postgres'
    static_configs:
      - targets: ['postgres-exporter:9187']
    scrape_interval: 30s

  # Redis metrics
  - job_name: 'redis'
    static_configs:
      - targets: ['redis-exporter:9121']
    scrape_interval: 30s
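The app job only keeps pods that opt in via annotations, which is what the relabel rules above act on. As an illustration only (the Deployment, label, and port names below are hypothetical), the pod template metadata in your own manifests would carry annotations like these:

# Illustrative pod template metadata (e.g. inside a Deployment's spec.template);
# only the prometheus.io/* annotations matter to the relabel rules above.
metadata:
  labels:
    app: orders-service              # surfaced as label "app" by the labelmap rule
  annotations:
    prometheus.io/scrape: "true"     # keep rule: only annotated pods are scraped
    prometheus.io/path: "/metrics"   # rewritten into __metrics_path__
    prometheus.io/port: "8080"       # rewritten into the scrape address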
Step 2: Recording Rules
Create recording_rules.yml:
groups:
  - name: application_rules
    interval: 30s
    rules:
      # HTTP request rate
      - record: http_requests_per_second
        expr: sum(rate(http_requests_total[5m])) by (job, instance, method, status)

      # HTTP error rate
      - record: http_error_rate
        expr: |
          sum(rate(http_requests_total{status=~"5.."}[5m])) by (job, instance)
          /
          sum(rate(http_requests_total[5m])) by (job, instance)

      # HTTP request latency percentiles
      - record: http_request_duration_p95
        expr: histogram_quantile(0.95, sum(rate(http_request_duration_seconds_bucket[5m])) by (job, instance, le))
      - record: http_request_duration_p99
        expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (job, instance, le))

  - name: infrastructure_rules
    interval: 30s
    rules:
      # CPU utilization
      - record: cpu_utilization
        expr: 100 - (avg by (instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)

      # Memory utilization
      - record: memory_utilization
        expr: |
          (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100

      # Disk utilization
      - record: disk_utilization
        expr: |
          100 - ((node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) * 100)

      # Network throughput
      - record: network_throughput_rx
        expr: sum(rate(node_network_receive_bytes_total[5m])) by (instance)
      - record: network_throughput_tx
        expr: sum(rate(node_network_transmit_bytes_total[5m])) by (instance)

  - name: business_rules
    interval: 60s
    rules:
      # Order processing rate
      - record: orders_per_minute
        expr: sum(rate(orders_total[1m])) * 60

      # Revenue per hour
      - record: revenue_per_hour
        expr: sum(rate(revenue_total[1h])) * 3600

      # Active users
      - record: active_users_5m
        expr: count(increase(user_activity_total[5m]) > 0)
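Recording rules can be unit-tested with promtool before they reach production. A minimal sketch, assuming a hypothetical test file named recording_rules_test.yml (run with: promtool test rules recording_rules_test.yml), that checks the cpu_utilization rule against synthetic idle-CPU samples:

# Hypothetical test file: recording_rules_test.yml
rule_files:
  - recording_rules.yml
evaluation_interval: 30s
tests:
  - interval: 30s
    input_series:
      # A CPU that accumulates 15s of idle time per 30s sample, i.e. 50% utilized
      - series: 'node_cpu_seconds_total{instance="node-1", mode="idle", cpu="0"}'
        values: '0+15x20'
    promql_expr_test:
      - expr: cpu_utilization
        eval_time: 5m
        exp_samples:
          - labels: 'cpu_utilization{instance="node-1"}'
            value: 50

promtool check rules recording_rules.yml also catches syntax errors without running any tests.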
Step 3: Alert Rules
Create alert_rules.yml:
groups:
  - name: system_alerts
    rules:
      - alert: HighCPUUsage
        expr: cpu_utilization > 80
        for: 5m
        labels:
          severity: warning
          service: infrastructure
        annotations:
          summary: "High CPU usage detected"
          description: "CPU usage is {{ $value }}% on {{ $labels.instance }}"
          runbook_url: "https://wiki.company.com/runbooks/high-cpu"

      - alert: HighMemoryUsage
        expr: memory_utilization > 90
        for: 3m
        labels:
          severity: critical
          service: infrastructure
        annotations:
          summary: "High memory usage detected"
          description: "Memory usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: DiskSpaceLow
        expr: disk_utilization > 85
        for: 1m
        labels:
          severity: warning
          service: infrastructure
        annotations:
          summary: "Disk space is running low"
          description: "Disk usage is {{ $value }}% on {{ $labels.instance }}"

      - alert: NodeDown
        expr: up{job="node"} == 0
        for: 1m
        labels:
          severity: critical
          service: infrastructure
        annotations:
          summary: "Node is down"
          description: "Node {{ $labels.instance }} has been down for more than 1 minute"

  - name: application_alerts
    rules:
      - alert: HighErrorRate
        expr: http_error_rate > 0.05
        for: 5m
        labels:
          severity: critical
          service: application
        annotations:
          summary: "High HTTP error rate"
          description: "Error rate is {{ $value | humanizePercentage }} for {{ $labels.job }}"

      - alert: HighLatency
        expr: http_request_duration_p95 > 2
        for: 5m
        labels:
          severity: warning
          service: application
        annotations:
          summary: "High response latency"
          description: "95th percentile latency is {{ $value }}s for {{ $labels.job }}"

      - alert: ApplicationDown
        expr: up{job="app"} == 0
        for: 1m
        labels:
          severity: critical
          service: application
        annotations:
          summary: "Application is down"
          description: "Application {{ $labels.instance }} is not responding"

  - name: database_alerts
    rules:
      - alert: PostgreSQLDown
        expr: up{job="postgres"} == 0
        for: 1m
        labels:
          severity: critical
          service: database
        annotations:
          summary: "PostgreSQL is down"
          description: "PostgreSQL instance {{ $labels.instance }} is down"
- alert: HighDatabaseConnections
expr: pg_stat_database_numbackends / pg_settings_max_connections > 0.8
for: 5m
labels:
severity: warning
service: database
annotations:
summary: "High database connection usage"
description: "Database connections are {{ $value | humanizePercentage }} of maximum"
  - name: business_alerts
    rules:
      - alert: LowOrderVolume
        expr: orders_per_minute < 10
        for: 10m
        labels:
          severity: warning
          service: business
        annotations:
          summary: "Low order processing volume"
          description: "Only {{ $value }} orders per minute (expected > 10)"

      - alert: PaymentFailures
        expr: increase(payment_failures_total[5m]) > 5
        for: 2m
        labels:
          severity: critical
          service: business
        annotations:
          summary: "High payment failure rate"
          description: "{{ $value }} payment failures in the last 5 minutes"
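Alert rules can be tested the same way. A sketch of a hypothetical alert_rules_test.yml that feeds in a node with only 5% of its memory available and expects HighMemoryUsage to fire; both rule files are loaded because the alert reads the recorded memory_utilization series:

# Hypothetical test file: alert_rules_test.yml
rule_files:
  - recording_rules.yml
  - alert_rules.yml
evaluation_interval: 30s
tests:
  - interval: 30s
    input_series:
      # Synthetic samples: 5 of 100 bytes available, i.e. 95% memory utilization
      - series: 'node_memory_MemTotal_bytes{instance="node-1"}'
        values: '100+0x30'
      - series: 'node_memory_MemAvailable_bytes{instance="node-1"}'
        values: '5+0x30'
    alert_rule_test:
      - eval_time: 10m
        alertname: HighMemoryUsage
        exp_alerts:
          - exp_labels:
              severity: critical
              service: infrastructure
              instance: node-1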
Step 4: AlertManager Configuration
Create alertmanager.yml:
global:
  smtp_smarthost: 'mail.company.com:587'
  smtp_from: 'alerts@company.com'
  smtp_auth_username: 'alerts@company.com'
  smtp_auth_password: 'password'

# Inhibition rules to reduce noise
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['instance', 'service']

# Route configuration
route:
  group_by: ['alertname', 'service']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'default'
  routes:
    # Critical alerts go to on-call
    - match:
        severity: critical
      receiver: 'critical-alerts'
      group_wait: 0s
      repeat_interval: 5m
    # Infrastructure alerts
    - match:
        service: infrastructure
      receiver: 'infrastructure-team'
    # Application alerts
    - match:
        service: application
      receiver: 'development-team'
    # Database alerts
    - match:
        service: database
      receiver: 'dba-team'
    # Business alerts
    - match:
        service: business
      receiver: 'business-team'
# Receiver configurations
receivers:
  - name: 'default'
    email_configs:
      - to: 'team@company.com'
        # email_configs set the subject via the Subject header
        headers:
          Subject: 'Alert: {{ .GroupLabels.alertname }}'
        html: |
          <!DOCTYPE html>
          <html>
          <body>
            <h2>Alert Summary</h2>
            <p><strong>Alert:</strong> {{ .GroupLabels.alertname }}</p>
            <p><strong>Service:</strong> {{ .GroupLabels.service }}</p>
            <h3>Firing Alerts</h3>
            <ul>
            {{ range .Alerts.Firing }}
              <li>{{ .Annotations.summary }} ({{ .Labels.instance }})</li>
            {{ end }}
            </ul>
          </body>
          </html>

  - name: 'critical-alerts'
    email_configs:
      - to: 'oncall@company.com'
        headers:
          Subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
        html: |
          <h1 style="color: red;">CRITICAL ALERT</h1>
          <p><strong>Alert:</strong> {{ .GroupLabels.alertname }}</p>
          {{ range .Alerts.Firing }}
          <p><strong>Description:</strong> {{ .Annotations.description }}</p>
          <p><strong>Instance:</strong> {{ .Labels.instance }}</p>
          {{ end }}
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/SLACK/WEBHOOK'
        channel: '#alerts'
        title: 'Critical Alert: {{ .GroupLabels.alertname }}'
        text: |
          {{ range .Alerts.Firing }}
          *Instance:* {{ .Labels.instance }}
          *Description:* {{ .Annotations.description }}
          {{ end }}
        color: 'danger'

  - name: 'infrastructure-team'
    email_configs:
      - to: 'infrastructure@company.com'
        headers:
          Subject: 'Infrastructure Alert: {{ .GroupLabels.alertname }}'

  - name: 'development-team'
    slack_configs:
      - api_url: 'https://hooks.slack.com/services/YOUR/DEV/WEBHOOK'
        channel: '#development'
        title: 'App Alert: {{ .GroupLabels.alertname }}'

  - name: 'dba-team'
    email_configs:
      - to: 'dba@company.com'
        headers:
          Subject: 'Database Alert: {{ .GroupLabels.alertname }}'

  - name: 'business-team'
    email_configs:
      - to: 'business@company.com'
        headers:
          Subject: 'Business Metric Alert: {{ .GroupLabels.alertname }}'
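Note that match, match_re, source_match, and target_match still work but are deprecated in current Alertmanager releases in favour of matchers. As a sketch, the critical route could equivalently be written as:

route:
  routes:
    # Equivalent critical route using the newer matchers syntax
    - matchers:
        - severity = "critical"
      receiver: 'critical-alerts'
      group_wait: 0s
      repeat_interval: 5m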
Step 5: Docker Compose Setup
Create docker-compose.yml:
version: '3.8'

services:
  prometheus:
    image: prom/prometheus:v2.47.0
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alert_rules.yml:/etc/prometheus/alert_rules.yml
      - ./recording_rules.yml:/etc/prometheus/recording_rules.yml
      - prometheus_data:/prometheus
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      - '--web.console.libraries=/etc/prometheus/console_libraries'
      - '--web.console.templates=/etc/prometheus/consoles'
      - '--storage.tsdb.retention.time=200h'
      - '--web.enable-lifecycle'

  alertmanager:
    image: prom/alertmanager:v0.26.0
    container_name: alertmanager
    ports:
      - "9093:9093"
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
      - alertmanager_data:/alertmanager
    command:
      - '--config.file=/etc/alertmanager/alertmanager.yml'
      - '--storage.path=/alertmanager'

  grafana:
    image: grafana/grafana:10.1.0
    container_name: grafana
    ports:
      - "3000:3000"
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    volumes:
      - grafana_data:/var/lib/grafana
      - ./grafana/provisioning:/etc/grafana/provisioning

  node-exporter:
    image: prom/node-exporter:v1.6.0
    container_name: node-exporter
    ports:
      - "9100:9100"
    command:
      - '--path.rootfs=/host'
    volumes:
      - '/:/host:ro,rslave'

volumes:
  prometheus_data:
  alertmanager_data:
  grafana_data:
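The compose file mounts ./grafana/provisioning into the Grafana container, so Grafana can pick up Prometheus as a data source automatically on first start. A minimal sketch of such a provisioning file (the path and file name below are hypothetical, but follow Grafana's datasource provisioning format):

# Hypothetical file: ./grafana/provisioning/datasources/prometheus.yml
apiVersion: 1
datasources:
  - name: Prometheus
    type: prometheus
    access: proxy
    url: http://prometheus:9090   # service name on the compose network
    isDefault: true

After docker compose up -d, scrape targets appear at http://localhost:9090/targets, and because --web.enable-lifecycle is set, configuration changes can be applied with a POST to http://localhost:9090/-/reload instead of a restart.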
Step 6: Custom Application Metrics
Instrument the Node.js application with Prometheus metrics using the prom-client library:
const client = require('prom-client');

// Create a Registry
const register = new client.Registry();

// Add default metrics (CPU, memory, event loop lag, etc.)
client.collectDefaultMetrics({ register });

// Custom metrics
const httpRequestDuration = new client.Histogram({
  name: 'http_request_duration_seconds',
  help: 'Duration of HTTP requests in seconds',
  labelNames: ['method', 'route', 'status'],
  buckets: [0.1, 0.5, 1, 2, 5, 10],
  registers: [register],
});

const httpRequestTotal = new client.Counter({
  name: 'http_requests_total',
  help: 'Total number of HTTP requests',
  labelNames: ['method', 'route', 'status'],
  registers: [register],
});

const activeConnections = new client.Gauge({
  name: 'active_connections',
  help: 'Number of active connections',
  registers: [register],
});

// Business metrics
const ordersTotal = new client.Counter({
  name: 'orders_total',
  help: 'Total number of orders processed',
  labelNames: ['status'],
  registers: [register],
});

const revenueTotal = new client.Counter({
  name: 'revenue_total',
  help: 'Total revenue generated',
  registers: [register],
});

// Express middleware that records duration and count for every finished request
function metricsMiddleware(req, res, next) {
  const start = Date.now();
  res.on('finish', () => {
    const duration = (Date.now() - start) / 1000;
    const route = req.route?.path || req.path;
    httpRequestDuration
      .labels(req.method, route, res.statusCode)
      .observe(duration);
    httpRequestTotal
      .labels(req.method, route, res.statusCode)
      .inc();
  });
  next();
}

module.exports = {
  register,
  metricsMiddleware,
  httpRequestDuration,
  httpRequestTotal,
  activeConnections,
  ordersTotal,
  revenueTotal,
};
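To expose these metrics to the app scrape job, mount the middleware and a /metrics endpoint in the Express application. A minimal sketch, assuming the module above is saved as metrics.js; the port and the /orders route are illustrative:

const express = require('express');
const { register, metricsMiddleware, ordersTotal } = require('./metrics');

const app = express();
app.use(metricsMiddleware); // record duration/count for every request

// Endpoint scraped by Prometheus (matches the prometheus.io/path annotation)
app.get('/metrics', async (req, res) => {
  res.set('Content-Type', register.contentType);
  res.end(await register.metrics());
});

// Illustrative business endpoint that feeds the orders_total counter
app.post('/orders', (req, res) => {
  ordersTotal.labels('created').inc();
  res.status(201).json({ status: 'created' });
});

app.listen(8080, () => console.log('app listening on :8080'));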
Summary
Prometheus and AlertManager provide comprehensive monitoring and intelligent alerting for production systems. Use recording rules to precompute expensive queries, alert rules to detect problems before users do, and AlertManager routing to deliver each class of alert to the team that owns it.