Our monitoring stack provides real-time visibility into the health and performance of all homelab services, with beautiful dashboards and intelligent alerting.
┌─────────────────┐ ┌──────────────┐ ┌─────────────┐
│ Exporters │────▶│ Prometheus │────▶│ Grafana │
│ │ │ │ │ │
│ - Node Exporter │ │ - Scraping │ │ - Dashboards│
│ - cAdvisor │ │ - Storage │ │ - Alerts │
│ - Blackbox │ │ - Rules │ │ - Reports │
│ - Custom │ │ │ │ │
└─────────────────┘ └──────────────┘ └─────────────┘
│
▼
┌──────────────┐
│ AlertManager │
│ │
│ - Routing │
│ - Grouping │
│ - Silencing │
└──────────────┘
Dashboard ID: system-overview
Key Metrics:
Important Panels:
Dashboard ID: docker-containers
Key Metrics:
Useful Views:
Dashboard ID: service-health
Key Metrics:
Critical Monitors:
Dashboard ID: network-stats
Key Metrics:
Dashboard ID: storage-metrics
Key Metrics:
# Fire when average CPU usage across all cores exceeds 80% for 10 minutes.
- alert: HighCPUUsage
  expr: 100 - (avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
  for: 10m
  labels:
    severity: warning
  annotations:
    summary: "High CPU usage on {{ $labels.instance }}"
    description: "CPU usage is above 80% (current value: {{ $value }}%)"
# Fire when a real filesystem has less than 20% free space for 5 minutes.
# tmpfs/overlay pseudo-filesystems are excluded — they routinely report low
# free space and only generate noise.
- alert: DiskSpaceWarning
  expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 20
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "Low disk space on {{ $labels.instance }}"
    description: "Disk {{ $labels.mountpoint }} has less than 20% free space"
# Fire when any scrape target has been down for 5 minutes.
# Note: the matcher {job=~".*"} is redundant (it matches every series), so a
# bare `up == 0` is equivalent and clearer.
- alert: ServiceDown
  expr: up == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "Service {{ $labels.job }} is down"
    description: "{{ $labels.instance }} has been down for more than 5 minutes"
# Fire when a blackbox-probed TLS certificate expires within 30 days.
# probe_ssl_earliest_cert_expiry is a Unix timestamp, so the expression
# yields seconds-until-expiry.
- alert: SSLCertExpiringSoon
  expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
  for: 1h
  labels:
    severity: warning
  annotations:
    summary: "SSL certificate expiring soon for {{ $labels.instance }}"
    description: "Certificate expires in {{ $value | humanizeDuration }}"
## alertmanager.yml
route:
  # Alerts sharing these labels are batched into a single notification.
  group_by: ['alertname', 'severity']
  group_wait: 30s       # wait for more alerts to join a new group
  group_interval: 5m    # minimum gap between notifications for one group
  repeat_interval: 4h   # re-notify interval for still-firing alerts
  receiver: 'default'
  routes:
    # NOTE: 'match' is deprecated since Alertmanager 0.22 in favour of
    # 'matchers'; kept here for compatibility with older releases.
    - match:
        severity: critical
      receiver: 'critical-alerts'
      continue: true    # keep evaluating the sibling routes below
    - match:
        severity: warning
      receiver: 'warning-alerts'

receivers:
  # Catch-all referenced by the top-level route. Alertmanager refuses to
  # start if the root receiver is undefined; leave it config-less to drop
  # alerts that match no sub-route, or add notification configs as needed.
  - name: 'default'
  - name: 'critical-alerts'
    email_configs:
      - to: 'admin@speicher.family'
        subject: 'CRITICAL: {{ .GroupLabels.alertname }}'
    # discord_configs requires Alertmanager >= 0.25.
    discord_configs:
      - webhook_url: 'DISCORD_WEBHOOK_URL'
        title: 'Critical Alert'
  - name: 'warning-alerts'
    webhook_configs:
      - url: 'http://n8n:5678/webhook/alerts'
## prometheus.yml
scrape_configs:
  - job_name: 'new-service'
    static_configs:
      - targets: ['new-service:9100']
    # Applied after the scrape, before storage.
    metric_relabel_configs:
      # Drop Go runtime metrics we never dashboard, to cut series cardinality.
      - source_labels: [__name__]
        regex: 'go_.*'
        action: drop
## docker-compose.yml
new-service-exporter:
  image: prom/node-exporter:latest
  container_name: new-service-exporter
  # Share the host PID namespace so process metrics reflect the host,
  # per the node_exporter Docker deployment docs.
  pid: host
  ports:
    - "9100:9100"
  command:
    # Read host filesystems/proc/sys through the /host bind mount below.
    - '--path.rootfs=/host'
    - '--path.procfs=/host/proc'
    - '--path.sysfs=/host/sys'
  volumes:
    - /:/host:ro,rslave
## Create alert rule
# Fire when the new-service scrape target has been unreachable for 5 minutes.
- alert: NewServiceDown
  expr: up{job="new-service"} == 0
  for: 5m
  labels:
    severity: critical
  annotations:
    summary: "New Service is down"
    description: "{{ $labels.instance }} has been unreachable for more than 5 minutes"
## Python example with prometheus_client
from prometheus_client import Counter, Histogram, start_http_server

## Define metrics
request_count = Counter('app_requests_total', 'Total requests')
request_duration = Histogram('app_request_duration_seconds', 'Request duration')

## Expose metrics on http://localhost:8000/metrics — without this call
## Prometheus has nothing to scrape.
start_http_server(8000)

## Use in application
@request_duration.time()  # observes wall-clock duration of each call
def process_request():
    request_count.inc()
    ## Process request
## Record custom metrics
groups:
  - name: business_metrics
    rules:
      # Distinct users with any recorded activity over the last 24h.
      - record: daily_active_users
        expr: count(count by (user) (rate(user_activity_total[1d])))
      # Payment throughput averaged over the last hour.
      - record: revenue_per_hour
        expr: sum(rate(payment_amount_total[1h]))
sum(rate(container_cpu_usage_seconds_total[5m])) by (name) * 100
(1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100
rate(node_disk_read_bytes_total[5m]) + rate(node_disk_written_bytes_total[5m])
sum(rate(http_requests_total{status=~"2.."}[5m])) / sum(rate(http_requests_total[5m])) * 100
topk(5, sum(rate(container_cpu_usage_seconds_total[5m])) by (name))
Grafana Folders:
├── System/
│ ├── Host Metrics
│ ├── Network Stats
│ └── Storage Overview
├── Services/
│ ├── Web Services
│ ├── Databases
│ └── Message Queues
├── Business/
│ ├── User Analytics
│ └── Performance KPIs
└── Alerts/
├── Alert Overview
└── Silence Manager
-- Host selector
SHOW TAG VALUES FROM "cpu" WITH KEY = "host"
-- Time range presets
$__timeFilter(time_column)
-- Service selector
label_values(up, job)
Deployment Markers:
curl -X POST http://lucille4:3000/api/annotations \
-H "Authorization: Bearer $GRAFANA_API_KEY" \
-H "Content-Type: application/json" \
-d '{
"dashboardId": 1,
"text": "Deployed version 2.0",
"tags": ["deployment", "production"]
}'
## prometheus.yml
global:
  scrape_interval: 15s      # how often targets are scraped
  evaluation_interval: 15s  # how often rules are evaluated
  external_labels:
    monitor: 'homelab'      # attached to all series when federating/remote-writing

## Storage configuration
# NOTE: retention is NOT a prometheus.yml setting — pass it as command-line
# flags when starting Prometheus:
#   --storage.tsdb.retention.time=90d
#   --storage.tsdb.retention.size=50GB
## Check TSDB symbol table size in MiB (a rough proxy for label cardinality)
prometheus_tsdb_symbol_table_size_bytes / 1024 / 1024
## Find high cardinality metrics
topk(10, count by (__name__)({__name__=~".+"}))
Use Grafana's $__interval variable in range selectors for dynamic, zoom-aware sampling.
## Clean tombstones left by series deletions (requires Prometheus started with --web.enable-admin-api)
curl -X POST http://lucille4:9090/api/v1/admin/tsdb/clean_tombstones
## Check data size
du -sh /prometheus/data/
-- Clean old dashboard versions
DELETE FROM dashboard_version WHERE created < NOW() - INTERVAL '30 days';
-- Remove orphaned annotations
DELETE FROM annotation WHERE dashboard_id NOT IN (SELECT id FROM dashboard);
## Snapshot current data (requires --web.enable-admin-api)
curl -X POST http://lucille4:9090/api/v1/admin/tsdb/snapshot
## Copy snapshot
rsync -av /prometheus/snapshots/ /backup/prometheus/
## Export all dashboards (requires a Grafana API key / service-account token)
# Uses dashboard UIDs — the slug-based /api/dashboards/db/:slug endpoint is
# deprecated in modern Grafana.
for uid in $(curl -s -H "Authorization: Bearer $GRAFANA_API_KEY" \
    "http://lucille4:3000/api/search?type=dash-db" | jq -r '.[].uid'); do
  curl -s -H "Authorization: Bearer $GRAFANA_API_KEY" \
    "http://lucille4:3000/api/dashboards/uid/$uid" | jq . > "backup/$uid.json"
done
## Backup database
pg_dump -h lucille4 -U grafana grafana > grafana_backup.sql
## Check target status
curl http://lucille4:9090/api/v1/targets
## Verify exporter is running
curl http://target-host:9100/metrics
## Check Prometheus logs
docker logs prometheus --tail 100
## Prometheus metrics
curl http://lucille4:9090/metrics
## Check configuration
curl http://lucille4:9090/api/v1/status/config
## Runtime information
curl http://lucille4:9090/api/v1/status/runtimeinfo
## Grafana health
curl http://lucille4:3000/api/health
## Prometheus exporter config (Home Assistant configuration.yaml)
# The exporter is the top-level 'prometheus:' integration — it is not a
# sensor platform, so it must not be nested under 'sensor:'.
prometheus:
  namespace: homeassistant
  component_config_glob:
    sensor.*:
      override_metric: temperature_celsius
// Go service example
import "github.com/prometheus/client_golang/prometheus"
var (
requestDuration = prometheus.NewHistogramVec(
prometheus.HistogramOpts{
Name: "http_request_duration_seconds",
Help: "HTTP request duration",
},
[]string{"method", "endpoint"},
)
)
func init() {
prometheus.MustRegister(requestDuration)
}