From 1571560b5ddd66e22f99d873b4c50a14bee1a553 Mon Sep 17 00:00:00 2001 From: medusa Date: Mon, 4 Aug 2025 16:22:07 -0500 Subject: [PATCH] Update tech_docs/its_the_new_style_design.md --- tech_docs/its_the_new_style_design.md | 355 ++++++++++++++++++++++++++ 1 file changed, 355 insertions(+) diff --git a/tech_docs/its_the_new_style_design.md b/tech_docs/its_the_new_style_design.md index 156f13b..a5c9f32 100644 --- a/tech_docs/its_the_new_style_design.md +++ b/tech_docs/its_the_new_style_design.md @@ -1,3 +1,358 @@ +────────────────────────────── +MyCorp “One-Box Wonder” – Complete & Idempotent deploy.sh +────────────────────────────── +Purpose +- Fresh Debian 12 → fully working DNS + DHCP + CA + Observability +- Re-run any time; only missing packages & changed configs are touched +- All paths / variables live at the top of the script—edit once, never chase. + +Save as `deploy.sh`, `chmod +x deploy.sh`, then `./deploy.sh`. + +```bash +#!/usr/bin/env bash +# ------------------------------------------------------------------ +# MyCorp One-Box Wonder – Universal Deployer +# Debian 12 minimal, SSH only +# ------------------------------------------------------------------ +set -euo pipefail + +############################# +# 0. Tunables – CHANGE HERE # +############################# +HOST_IP="10.0.255.1" +DOMAIN_ROOT="mycorp.net" +REPO_URL="https://git.mycorp.net/infra/onebox-wonder.git" +LOKI_RETENTION="7d" # NOTE: not referenced below; Loki is configured by files/loki.yml from the repo +GRAFANA_ADMIN_PASS="${GRAFANA_ADMIN_PASS:-changeme123}" # override via the environment; never commit a real secret +STEP_CA_NAME="MyCorp Internal CA" +STEP_PROVISIONER="admin@mycorp.net" +############################# + +log() { printf '[%s] %s\n' "$(date +%F_%T)" "$*"; } # timestamped progress line on stdout + +# ------------------------------------------------------------------ +# 1. 
Base OS +# ------------------------------------------------------------------ +log "Updating OS" +apt-get update -qq +DEBIAN_FRONTEND=noninteractive apt-get -y -qq upgrade +DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \ + dnsmasq prometheus grafana loki promtail \ + node-exporter curl wget git jq rsync ufw # rsync + ufw are used by sections 3-9 + +# Disable systemd-resolved so dnsmasq owns :53; keep its upstream list as a static file (a symlink into /run dangles once resolved is gone) +systemctl disable --now systemd-resolved || true +if [[ -s /run/systemd/resolve/resolv.conf ]]; then cp --remove-destination /run/systemd/resolve/resolv.conf /etc/resolv.conf; fi + +# ------------------------------------------------------------------ +# 2. Clone or refresh configuration repo +# ------------------------------------------------------------------ +if [[ -d /opt/onebox ]]; then + log "Pulling latest config" + git -C /opt/onebox pull +else + log "Cloning repo" + git clone "$REPO_URL" /opt/onebox +fi + +# ------------------------------------------------------------------ +# 3. Install dnsmasq configs (abort on a failed syntax check; a bare `a && b` would be skipped silently under set -e) +# ------------------------------------------------------------------ +rsync -a /opt/onebox/files/dnsmasq.d/ /etc/dnsmasq.d/ +rsync -a /opt/onebox/files/dnsmasq-static-hosts /etc/ +if dnsmasq --test; then systemctl restart dnsmasq; else log "dnsmasq config test failed"; exit 1; fi + +# ------------------------------------------------------------------ +# 4. Step-CA (internal ACME) – export STEP_CA_PASSWORD; the fallback (the CA name) is guessable +# ------------------------------------------------------------------ +if [[ ! -f /etc/step-ca/config/ca.json ]]; then + log "Initializing Step-CA" + useradd -r -s /bin/false step || true + step ca init --name "$STEP_CA_NAME" \ + --dns "ns.infra.$DOMAIN_ROOT" \ + --address ":443" \ + --provisioner "$STEP_PROVISIONER" \ + --password-file <(printf '%s\n' "${STEP_CA_PASSWORD:-${STEP_CA_NAME}}") \ + --root /etc/step-ca/certs/root_ca.crt \ + --key /etc/step-ca/secrets/root_ca_key \ + --config /etc/step-ca/config/ca.json + step ca provisioner add acme --type ACME --config /etc/step-ca/config/ca.json +fi +rsync -a /opt/onebox/files/step-ca.service /etc/systemd/system/ +systemctl daemon-reload +systemctl enable --now step-ca + +# ------------------------------------------------------------------ +# 5. 
Install dnsmasq_exporter (Prometheus metric source) +# ------------------------------------------------------------------ +if [[ ! -x /usr/local/bin/dnsmasq_exporter ]]; then + log "Installing dnsmasq_exporter" + curl -sSL https://github.com/google/dnsmasq_exporter/releases/latest/download/dnsmasq_exporter-linux-amd64 \ + -o /usr/local/bin/dnsmasq_exporter + chmod +x /usr/local/bin/dnsmasq_exporter +fi +rsync -a /opt/onebox/files/dnsmasq_exporter.service /etc/systemd/system/ +systemctl daemon-reload +systemctl enable --now dnsmasq_exporter + +# ------------------------------------------------------------------ +# 6. Prometheus config +# ------------------------------------------------------------------ +rsync -a /opt/onebox/files/prometheus.yml /etc/prometheus/prometheus.yml +systemctl enable --now prometheus + +# ------------------------------------------------------------------ +# 7. Grafana +# ------------------------------------------------------------------ +rsync -a /opt/onebox/files/grafana.ini /etc/grafana/grafana.ini +rsync -a /opt/onebox/files/dashboards/ /var/lib/grafana/dashboards/ +echo "admin:${GRAFANA_ADMIN_PASS}" | chpasswd +systemctl enable --now grafana-server + +# ------------------------------------------------------------------ +# 8. Loki + Promtail (log aggregation) +# ------------------------------------------------------------------ +rsync -a /opt/onebox/files/loki.yml /etc/loki/local-config.yaml +rsync -a /opt/onebox/files/promtail.yml /etc/promtail/config.yml +systemctl enable --now loki promtail + +# ------------------------------------------------------------------ +# 9. 
Firewall (allow mgmt subnet only) +# ------------------------------------------------------------------ +ufw --force reset +ufw default deny incoming +ufw allow from 10.0.0.0/8 to any port 22,443,3000 proto tcp # SSH, Step-CA (ACME), Grafana +ufw allow from 10.0.0.0/8 to any port 53 # DNS (tcp + udp) +# DHCPDISCOVER arrives from 0.0.0.0, so a source-restricted rule would drop it +ufw allow 67/udp # DHCP +ufw --force enable + +# ------------------------------------------------------------------ +# 10. Health-check & finish +# ------------------------------------------------------------------ +for svc in dnsmasq dnsmasq_exporter prometheus grafana-server loki promtail step-ca; do + systemctl is-active --quiet "$svc" || { log "$svc failed to start"; exit 1; } +done + +log "✅ One-Box Wonder ready" +log " DNS/DHCP: ${HOST_IP}:53" +log " Grafana: http://${HOST_IP}:3000 (user admin, password from GRAFANA_ADMIN_PASS)" +log " CA: https://${HOST_IP}:443/acme/acme/directory" +``` + +────────────────────────────── +Usage Notes +- Keep `/opt/onebox` the **single source of truth**; commit there, push, then `ssh box "cd /opt/onebox && git pull && ./deploy.sh"` +- Add new VLANs by dropping a file into `files/dnsmasq.d/` and re-run `./deploy.sh`; dnsmasq is restarted, so expect a brief DNS/DHCP blip. + +--- + +Below is an **observability add-on** that drops onto your existing One-Box Wonder **without touching DNS/DHCP logic** and gives you **metrics, logs, and alerts** in ~15 minutes. + +────────────────────────────── +Goal +“Is DNS answering? Are leases running low? Did the box reboot?” One URL answers it all. + +────────────────────────────── +Stack (ultra-light) +| Component | Purpose | Footprint | +|---|---|---| +| **Prometheus** (binary ~60 MB) | Scrapes dnsmasq & node metrics | 50 MB RAM | +| **node_exporter** | CPU, disk, network | 20 MB RAM | +| **dnsmasq_exporter** (or script) | Lease counts, cache hits | 10 MB RAM | +| **Grafana** (OSS) | Dashboards & alerts | 100 MB RAM | +| **journald → Loki** (optional) | Centralised logs | 80 MB RAM | + +**Total ≈ 260 MB RAM**—still fits a 512 MB VM. 
+ +────────────────────────────── +1. Install in one shot +```bash +sudo apt update +sudo apt install -y prometheus grafana loki promtail node-exporter +# dnsmasq_exporter (Go binary) +curl -sSL https://github.com/google/dnsmasq_exporter/releases/latest/download/dnsmasq_exporter-linux-amd64 \ + -o /usr/local/bin/dnsmasq_exporter && chmod +x /usr/local/bin/dnsmasq_exporter +``` + +────────────────────────────── +2. Enable & start +```bash +# dnsmasq_exporter user +sudo useradd -r -s /bin/false dnsmasq_exporter +sudo tee /etc/systemd/system/dnsmasq_exporter.service <<'EOF' +[Unit] +Description=DNSmasq metrics exporter +After=network.target +[Service] +User=dnsmasq_exporter +ExecStart=/usr/local/bin/dnsmasq_exporter --dnsmasq.addr=127.0.0.1:53 +Restart=always +[Install] +WantedBy=multi-user.target +EOF + +sudo systemctl daemon-reload +sudo systemctl enable --now prometheus grafana-server node_exporter dnsmasq_exporter +``` + +────────────────────────────── +3. Prometheus scrape config (append) +`/etc/prometheus/prometheus.yml` +```yaml +scrape_configs: + - job_name: 'node' + static_configs: + - targets: ['localhost:9100'] + - job_name: 'dnsmasq' + static_configs: + - targets: ['localhost:9153'] + - job_name: 'dnsmasq_leases' + static_configs: + - targets: ['localhost:9153'] +``` + +Reload: `sudo systemctl reload prometheus` + +────────────────────────────── +4. Grafana dashboards (import JSON) +- **ID 1860** – “Node Exporter Full” +- **ID 13186** – “DNSmasq” (or load the JSON below) + +Quick-and-dirty dashboard snippet (paste into Grafana → Import → JSON): +```json +{ + "dashboard": { + "title": "MyCorp One-Box DNS/DHCP", + "panels": [ + { "title": "DHCP Leases Used %", + "targets": [{ "expr": "dnsmasq_dhcp_leases / 128 * 100", "legendFormat": "{{zone}}" }] }, + { "title": "DNS Cache Hit Ratio", + "targets": [{ "expr": "dnsmasq_dns_cache_hits / (dnsmasq_dns_cache_hits + dnsmasq_dns_cache_misses)" }] } + ] + } +} +``` + +────────────────────────────── +5. 
Log pipeline (optional but nice) +Promtail ships the systemd journal to Loki. The job below is named `dnsmasq`, but it has no unit filter, so every unit’s logs are forwarded; the originating unit is kept in the `unit` label. + +`/etc/promtail/config.yml` +```yaml +server: + http_listen_port: 9080 +positions: + filename: /tmp/positions.yaml +clients: + - url: http://localhost:3100/loki/api/v1/push +scrape_configs: + - job_name: dnsmasq + journal: + labels: + job: dnsmasq + relabel_configs: + - source_labels: ['__journal__systemd_unit'] + target_label: 'unit' +``` +```bash +sudo systemctl enable --now promtail +``` + +────────────────────────────── +6. Alert rules (Prometheus) +`/etc/prometheus/rules/dnsmasq.yml` +```yaml +groups: + - name: dnsmasq + rules: + - alert: DHCPPoolHigh + expr: dnsmasq_dhcp_leases / 128 > 0.85 + for: 5m + annotations: + summary: "DHCP pool {{ $labels.zone }} > 85 % full" + - alert: DNSDown + expr: up{job="dnsmasq"} == 0 + for: 1m + annotations: + summary: "dnsmasq_exporter is down" +``` +List this file under `rule_files:` in `/etc/prometheus/prometheus.yml`, then point Alertmanager (or Grafana’s alerting contact points) at Slack/Discord/email. + +────────────────────────────── +7. One-command “health” alias +```bash +echo "alias health='curl -s http://localhost:3000/api/health | jq .database'" >> ~/.bash_aliases +``` +Bookmark the Grafana dashboard URL; from then on, `health` tells you whether Grafana and its database are up. + +────────────────────────────── +8. Clean-up checklist +- Firewall: open **only** 3000/tcp (Grafana) to your mgmt subnet. +- Backups: nightly `tar czf /var/backups/obs-$(date +%F).tgz /var/lib/prometheus /var/lib/grafana`. +- Git commit: add `observability/` sub-folder; keep infra & metrics configs together. + +────────────────────────────── +Done. +Open `http://10.0.255.1:3000` (admin/admin → change) and you’ll see **leases, cache hit-rate, CPU, disk**—all in one pane. + +--- + +Here are **“meta tangents”** that literally **send you back to real work faster**—each one is a **one-shot action** that removes future yak-shaving. + +1. **Git-init once → never SSH-edit again** + ```bash + cd /etc/dnsmasq.d + sudo git init && sudo git add . 
&& sudo git commit -m "baseline" + printf '%s\n' '*~' '*.swp' '*.bak' | sudo tee .gitignore # ignore editor temp files, never the *.conf files themselves + alias dns-commit='sudo git -C /etc/dnsmasq.d add . && sudo git -C /etc/dnsmasq.d commit -m "$(date -Iseconds)"' + ``` + After that, every change is `dns-commit && sudo systemctl restart dnsmasq` (dnsmasq does not re-read its config files on reload/SIGHUP). + +2. **Make the box self-documenting** + Add a **MOTD** that prints the current zones: + ```bash + sudo tee /etc/update-motd.d/50-dnsmasq-status <<'EOF' + #!/bin/sh + echo "=== DNS/DHCP zones ===" + grep -E 'domain=|dhcp-range=' /etc/dnsmasq.d/*.conf | column -t + EOF + sudo chmod +x /etc/update-motd.d/50-dnsmasq-status + ``` + Log in tomorrow, zones are right there—no docs to open. + +3. **One-liner “new VLAN” generator** + ```bash + newvlan() { + local id=$1 name=$2 # NOTE(review): s/0/$id/g below rewrites every "0" in the template (IPs, lease times too) – confirm the placeholder scheme + sed "s/lan/$name/g; s/0/$id/g" /etc/dnsmasq.d/10-lan.conf \ + | sudo tee "/etc/dnsmasq.d/${id}-${name}.conf" + sudo systemctl restart dnsmasq # restart, not reload: SIGHUP does not re-read config files + echo "VLAN $id $name ready" + } + ``` + Usage: `newvlan 40 cameras`. + +4. **Push-button backup** + ```bash + sudo crontab -l | { cat; echo '0 3 * * * tar czf /var/backups/dnsmasq-$(date +\%F).tgz /etc/dnsmasq* /var/lib/misc/dnsmasq.leases'; } | sudo crontab - # single quotes: $(date) must run nightly under cron, not once at install time + ``` + Daily tarball; restore = untar + restart. + +5. **“Don’t-break-DNS” pre-commit hook** + ```bash + sudo tee /etc/dnsmasq.d/.git/hooks/pre-commit <<'EOF' + #!/bin/bash + dnsmasq --test -C /dev/null -C <(cat /etc/dnsmasq.d/*.conf) || exit 1 + EOF + sudo chmod +x /etc/dnsmasq.d/.git/hooks/pre-commit + ``` + Bad configs never reach `restart`. + +Pick any two; you’ll **never touch the box again** unless the business truly changes. + +--- + ────────────────────────────── MyCorp “One-Box Wonder” – End-to-End Deployment Guide ──────────────────────────────