Update tech_docs/its_the_new_style_design.md
This commit is contained in:
@@ -1,3 +1,358 @@
|
||||
──────────────────────────────
|
||||
MyCorp “One-Box Wonder” – Complete & Idempotent deploy.sh
|
||||
──────────────────────────────
|
||||
Purpose
|
||||
- Fresh Debian 12 → fully working DNS + DHCP + CA + Observability
|
||||
- Re-run any time; only missing packages & changed configs are touched
|
||||
- All paths / variables live at the top of the script—edit once, never chase.
|
||||
|
||||
Save as `deploy.sh`, `chmod +x`, then `./deploy.sh`.
|
||||
|
||||
```bash
|
||||
#!/usr/bin/env bash
|
||||
# ------------------------------------------------------------------
|
||||
# MyCorp One-Box Wonder – Universal Deployer
|
||||
# Debian 12 minimal, SSH only
|
||||
# ------------------------------------------------------------------
|
||||
set -euo pipefail
|
||||
|
||||
#############################
|
||||
# 0. Tunables – CHANGE HERE #
|
||||
#############################
|
||||
HOST_IP="10.0.255.1"
|
||||
DOMAIN_ROOT="mycorp.net"
|
||||
REPO_URL="https://git.mycorp.net/infra/onebox-wonder.git"
|
||||
LOKI_RETENTION="7d"
|
||||
GRAFANA_ADMIN_PASS="changeme123"
|
||||
STEP_CA_NAME="MyCorp Internal CA"
|
||||
STEP_PROVISIONER="admin@mycorp.net"
|
||||
#############################
|
||||
|
||||
# Print a progress message prefixed with a "[YYYY-MM-DD_HH:MM:SS]" timestamp.
# All arguments are joined with single spaces and written to stdout.
log() {
  printf '[%s] %s\n' "$(date +%F_%T)" "$*"
}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 1. Base OS
|
||||
# ------------------------------------------------------------------
|
||||
log "Updating OS"
DEBIAN_FRONTEND=noninteractive apt-get update -qq
DEBIAN_FRONTEND=noninteractive apt-get -y -qq upgrade

# Packages from the Debian archive. rsync and ufw are installed here because
# later sections of this script call them; the Debian package for the node
# exporter is named prometheus-node-exporter. ca-certificates/curl/gnupg are
# needed to register the Grafana repository below.
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq \
  ca-certificates curl gnupg wget git jq rsync ufw \
  dnsmasq prometheus prometheus-node-exporter

# grafana, loki and promtail are published in Grafana's own APT repository,
# so register it (once) before installing them.
GRAFANA_KEYRING=/etc/apt/keyrings/grafana.gpg
install -d -m 0755 /etc/apt/keyrings
if [[ ! -s "$GRAFANA_KEYRING" ]]; then
  curl -fsSL https://apt.grafana.com/gpg.key | gpg --dearmor -o "$GRAFANA_KEYRING"
fi
# Overwrite (not append) so re-runs do not accumulate duplicate source lines.
printf 'deb [signed-by=%s] https://apt.grafana.com stable main\n' "$GRAFANA_KEYRING" \
  > /etc/apt/sources.list.d/grafana.list
DEBIAN_FRONTEND=noninteractive apt-get update -qq
DEBIAN_FRONTEND=noninteractive apt-get install -y -qq grafana loki promtail
|
||||
|
||||
# Disable systemd-resolved so dnsmasq owns :53
|
||||
systemctl disable --now systemd-resolved || true
|
||||
ln -sf /run/systemd/resolve/resolv.conf /etc/resolv.conf
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 2. Clone or refresh configuration repo
|
||||
# ------------------------------------------------------------------
|
||||
if [[ -d /opt/onebox ]]; then
|
||||
log "Pulling latest config"
|
||||
git -C /opt/onebox pull
|
||||
else
|
||||
log "Cloning repo"
|
||||
git clone "$REPO_URL" /opt/onebox
|
||||
fi
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 3. Install dnsmasq configs
|
||||
# ------------------------------------------------------------------
|
||||
rsync -a /opt/onebox/files/dnsmasq.d/ /etc/dnsmasq.d/
rsync -a /opt/onebox/files/dnsmasq-static-hosts /etc/

# `set -e` does not abort on a failing command that sits on the left of `&&`,
# so a broken config would be skipped silently and the final health check would
# still pass against the old, still-running dnsmasq. Fail loudly instead.
if ! dnsmasq --test; then
  log "dnsmasq configuration test failed; not restarting dnsmasq"
  exit 1
fi
systemctl restart dnsmasq
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 4. Step-CA (internal ACME)
|
||||
# ------------------------------------------------------------------
|
||||
if [[ ! -f /etc/step-ca/config/ca.json ]]; then
|
||||
log "Initializing Step-CA"
|
||||
useradd -r -s /bin/false step || true
|
||||
step ca init --name "$STEP_CA_NAME" \
|
||||
--dns "ns.infra.$DOMAIN_ROOT" \
|
||||
--address ":443" \
|
||||
--provisioner "$STEP_PROVISIONER" \
|
||||
--password-file <(echo "${STEP_CA_NAME}") \
|
||||
--root /etc/step-ca/certs/root_ca.crt \
|
||||
--key /etc/step-ca/secrets/root_ca_key \
|
||||
--config /etc/step-ca/config/ca.json
|
||||
step ca provisioner add acme --type ACME --config /etc/step-ca/config/ca.json
|
||||
fi
|
||||
rsync -a /opt/onebox/files/step-ca.service /etc/systemd/system/
|
||||
systemctl daemon-reload
|
||||
systemctl enable --now step-ca
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 5. Install dnsmasq_exporter (Prometheus metric source)
|
||||
# ------------------------------------------------------------------
|
||||
if [[ ! -x /usr/local/bin/dnsmasq_exporter ]]; then
  log "Installing dnsmasq_exporter"
  # Download to a temp file first: without -f curl would save an HTTP error
  # page as the "binary", chmod would make it executable, and the -x guard
  # above would then never retry the download on later runs.
  # NOTE(review): confirm the release really publishes an asset with this name.
  exporter_tmp=$(mktemp)
  if ! curl -fsSL https://github.com/google/dnsmasq_exporter/releases/latest/download/dnsmasq_exporter-linux-amd64 \
      -o "$exporter_tmp"; then
    rm -f -- "$exporter_tmp"
    log "dnsmasq_exporter download failed"
    exit 1
  fi
  install -m 0755 "$exporter_tmp" /usr/local/bin/dnsmasq_exporter
  rm -f -- "$exporter_tmp"
fi
|
||||
rsync -a /opt/onebox/files/dnsmasq_exporter.service /etc/systemd/system/
|
||||
systemctl daemon-reload
|
||||
systemctl enable --now dnsmasq_exporter
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 6. Prometheus config
|
||||
# ------------------------------------------------------------------
|
||||
rsync -a /opt/onebox/files/prometheus.yml /etc/prometheus/prometheus.yml
|
||||
systemctl enable --now prometheus
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 7. Grafana
|
||||
# ------------------------------------------------------------------
|
||||
rsync -a /opt/onebox/files/grafana.ini /etc/grafana/grafana.ini
rsync -a /opt/onebox/files/dashboards/ /var/lib/grafana/dashboards/
systemctl enable --now grafana-server

# chpasswd changes the password of a *Linux* account called "admin" (and aborts
# the script when no such account exists); Grafana keeps its admin user in its
# own database, so the password has to be set through the Grafana CLI.
# NOTE(review): this runs right after the service starts; if Grafana is still
# initialising its database the command may need a retry — confirm on a fresh box.
# NOTE(review): the password is visible in the process list while this runs.
grafana-cli admin reset-admin-password "$GRAFANA_ADMIN_PASS"
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 8. Loki + Promtail (log aggregation)
|
||||
# ------------------------------------------------------------------
|
||||
rsync -a /opt/onebox/files/loki.yml /etc/loki/local-config.yaml
|
||||
rsync -a /opt/onebox/files/promtail.yml /etc/promtail/config.yml
|
||||
systemctl enable --now loki promtail
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 9. Firewall (allow mgmt subnet only)
|
||||
# ------------------------------------------------------------------------
|
||||
ufw --force reset
ufw default deny incoming
ufw allow from 10.0.0.0/8 to any port 22    # SSH
ufw allow from 10.0.0.0/8 to any port 3000  # Grafana
ufw allow from 10.0.0.0/8 to any port 53    # DNS
ufw allow from 10.0.0.0/8 to any port 67    # DHCP (renewals from addressed clients)
# The CA endpoint is advertised on :443 in the final summary, so it must be reachable.
ufw allow from 10.0.0.0/8 to any port 443 proto tcp  # Step-CA / ACME
# A client without a lease sends DHCPDISCOVER from 0.0.0.0:68 to the broadcast
# address, which a rule restricted to 10.0.0.0/8 can never match.
ufw allow proto udp from any port 68 to any port 67  # DHCP (initial discover/request)
ufw --force enable
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# 10. Health-check & finish
|
||||
# ------------------------------------------------------------------
|
||||
for svc in dnsmasq prometheus grafana-server loki promtail step-ca; do
|
||||
systemctl is-active --quiet "$svc" || { log "$svc failed to start"; exit 1; }
|
||||
done
|
||||
|
||||
log "✅ One-Box Wonder ready"
|
||||
log " DNS/DHCP: ${HOST_IP}:53"
|
||||
log " Grafana: http://${HOST_IP}:3000 (admin:${GRAFANA_ADMIN_PASS})"
|
||||
log " CA: https://${HOST_IP}:443/acme/acme/directory"
|
||||
```
|
||||
|
||||
──────────────────────────────
|
||||
Usage Notes
|
||||
- Keep `/opt/onebox` the **single source of truth**; commit there, push, then `ssh box "cd /opt/onebox && git pull && ./deploy.sh"`
|
||||
- Add new VLANs by dropping a file into `files/dnsmasq.d/` and re-running `./deploy.sh`; the only interruption is the brief dnsmasq restart and firewall reset the script performs.
|
||||
|
||||
---
|
||||
|
||||
Below is an **observability add-on** that drops onto your existing One-Box Wonder **without touching DNS/DHCP logic** and gives you **metrics, logs, and alerts** in ~15 minutes.
|
||||
|
||||
──────────────────────────────
|
||||
Goal
|
||||
“Is DNS answering? Are leases running low? Did the box reboot?” One URL answers it all.
|
||||
|
||||
──────────────────────────────
|
||||
Stack (ultra-light)
|
||||
| Component | Purpose | Footprint |
|
||||
|---|---|---|
|
||||
| **Prometheus** (binary ~60 MB) | Scrapes dnsmasq & node metrics | 50 MB RAM |
|
||||
| **node_exporter** | CPU, disk, network | 20 MB RAM |
|
||||
| **dnsmasq_exporter** (or script) | Lease counts, cache hits | 10 MB RAM |
|
||||
| **Grafana** (OSS) | Dashboards & alerts | 100 MB RAM |
|
||||
| **journald → Loki** (optional) | Centralised logs | 80 MB RAM |
|
||||
|
||||
**Total ≈ 260 MB RAM**—still fits a 512 MB VM.
|
||||
|
||||
──────────────────────────────
|
||||
1. Install in one shot
|
||||
```bash
|
||||
sudo apt update
|
||||
sudo apt install -y prometheus grafana loki promtail node-exporter
|
||||
# dnsmasq_exporter (Go binary)
|
||||
curl -sSL https://github.com/google/dnsmasq_exporter/releases/latest/download/dnsmasq_exporter-linux-amd64 \
|
||||
-o /usr/local/bin/dnsmasq_exporter && chmod +x /usr/local/bin/dnsmasq_exporter
|
||||
```
|
||||
|
||||
──────────────────────────────
|
||||
2. Enable & start
|
||||
```bash
|
||||
# dnsmasq_exporter user
|
||||
sudo useradd -r -s /bin/false dnsmasq_exporter
|
||||
sudo tee /etc/systemd/system/dnsmasq_exporter.service <<'EOF'
|
||||
[Unit]
|
||||
Description=DNSmasq metrics exporter
|
||||
After=network.target
|
||||
[Service]
|
||||
User=dnsmasq_exporter
|
||||
ExecStart=/usr/local/bin/dnsmasq_exporter --dnsmasq.addr=127.0.0.1:53
|
||||
Restart=always
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
EOF
|
||||
|
||||
sudo systemctl daemon-reload
|
||||
sudo systemctl enable --now prometheus grafana-server node_exporter dnsmasq_exporter
|
||||
```
|
||||
|
||||
──────────────────────────────
|
||||
3. Prometheus scrape config (append)
|
||||
`/etc/prometheus/prometheus.yml`
|
||||
```yaml
|
||||
scrape_configs:
|
||||
- job_name: 'node'
|
||||
static_configs:
|
||||
- targets: ['localhost:9100']
|
||||
- job_name: 'dnsmasq'
|
||||
static_configs:
|
||||
- targets: ['localhost:9153']
|
||||
- job_name: 'dnsmasq_leases'
|
||||
static_configs:
|
||||
- targets: ['localhost:9153']
|
||||
```
|
||||
|
||||
Reload: `sudo systemctl reload prometheus`
|
||||
|
||||
──────────────────────────────
|
||||
4. Grafana dashboards (import JSON)
|
||||
- **ID 1860** – “Node Exporter Full”
|
||||
- **ID 13186** – “DNSmasq” (or load the JSON below)
|
||||
|
||||
Quick-and-dirty dashboard snippet (paste into Grafana → Import → JSON):
|
||||
```json
|
||||
{
|
||||
"dashboard": {
|
||||
"title": "MyCorp One-Box DNS/DHCP",
|
||||
"panels": [
|
||||
{ "title": "DHCP Leases Used %",
|
||||
"targets": [{ "expr": "dnsmasq_dhcp_leases / 128 * 100", "legendFormat": "{{zone}}" }] },
|
||||
{ "title": "DNS Cache Hit Ratio",
|
||||
"targets": [{ "expr": "dnsmasq_dns_cache_hits / (dnsmasq_dns_cache_hits + dnsmasq_dns_cache_misses)" }] }
|
||||
]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
──────────────────────────────
|
||||
5. Log pipeline (optional but nice)
|
||||
Promtail ships journald to Loki.
|
||||
|
||||
`/etc/promtail/config.yml`
|
||||
```yaml
|
||||
server:
|
||||
http_listen_port: 9080
|
||||
positions:
|
||||
filename: /tmp/positions.yaml
|
||||
clients:
|
||||
- url: http://localhost:3100/loki/api/v1/push
|
||||
scrape_configs:
|
||||
- job_name: dnsmasq
|
||||
journal:
|
||||
labels:
|
||||
job: dnsmasq
|
||||
relabel_configs:
|
||||
- source_labels: ['__journal__systemd_unit']
|
||||
target_label: 'unit'
|
||||
```
|
||||
```bash
|
||||
sudo systemctl enable --now promtail
|
||||
```
|
||||
|
||||
──────────────────────────────
|
||||
6. Alert rules (Prometheus)
|
||||
`/etc/prometheus/rules/dnsmasq.yml`
|
||||
```yaml
|
||||
groups:
|
||||
- name: dnsmasq
|
||||
rules:
|
||||
- alert: DHCPPoolHigh
|
||||
expr: dnsmasq_dhcp_leases / 128 > 0.85
|
||||
for: 5m
|
||||
annotations:
|
||||
summary: "DHCP pool {{ $labels.zone }} > 85 % full"
|
||||
- alert: DNSDown
|
||||
expr: up{job="dnsmasq"} == 0
|
||||
for: 1m
|
||||
annotations:
|
||||
summary: "dnsmasq_exporter is down"
|
||||
```
|
||||
Route these alerts to Slack/Discord/email through Prometheus Alertmanager (or recreate them as Grafana alert rules with contact points).
|
||||
|
||||
──────────────────────────────
|
||||
7. One-command “health” alias
|
||||
```bash
|
||||
echo "alias health='curl -s http://localhost:3000/api/health | jq .database'" >> ~/.bash_aliases
|
||||
```
|
||||
Log in tomorrow → Grafana dashboard URL bookmarked; `health` tells you whether Grafana and its database are up (it does not probe Prometheus, Loki, or dnsmasq).
|
||||
|
||||
──────────────────────────────
|
||||
8. Clean-up checklist
|
||||
- Firewall: open **only** 3000/tcp (Grafana) to your mgmt subnet.
|
||||
- Backups: nightly `tar czf /var/backups/obs-$(date +%F).tgz /var/lib/prometheus /var/lib/grafana`.
|
||||
- Git commit: add `observability/` sub-folder; keep infra & metrics configs together.
|
||||
|
||||
──────────────────────────────
|
||||
Done.
|
||||
Open `http://10.0.255.1:3000` (admin/admin → change) and you’ll see **leases, cache hit-rate, CPU, disk**—all in one pane.
|
||||
|
||||
---
|
||||
|
||||
Here are **“meta tangents”** that literally **send you back to real work faster**—each one is a **one-shot action** that removes future yak-shaving.
|
||||
|
||||
1. **Git-init once → never SSH-edit again**
|
||||
```bash
|
||||
cd /etc/dnsmasq.d
|
||||
sudo git init && sudo git add . && sudo git commit -m "baseline"
|
||||
# Ignore editor/backup leftovers only. Ignoring '*.conf' would keep every newly
# added dnsmasq config out of `git add .`, defeating the point of the repo.
printf '%s\n' '*~' '*.swp' '*.bak' '*.dpkg-*' | sudo tee .gitignore
|
||||
alias dns-commit='sudo git add . && sudo git commit -m "$(date -Iseconds)"'
|
||||
```
|
||||
After that, every change is `dns-commit && sudo systemctl restart dnsmasq` (a reload only sends SIGHUP, and dnsmasq does not re-read its configuration files on SIGHUP).
|
||||
|
||||
2. **Make the box self-documenting**
|
||||
Add a **MOTD** that prints the current zones:
|
||||
```bash
|
||||
sudo tee /etc/update-motd.d/50-dnsmasq-status <<'EOF'
|
||||
#!/bin/sh
|
||||
echo "=== DNS/DHCP zones ==="
|
||||
grep -E 'domain=|dhcp-range=' /etc/dnsmasq.d/*.conf | column -t
|
||||
EOF
|
||||
sudo chmod +x /etc/update-motd.d/50-dnsmasq-status
|
||||
```
|
||||
Log in tomorrow, zones are right there—no docs to open.
|
||||
|
||||
3. **One-liner “new VLAN” generator**
|
||||
```bash
|
||||
# newvlan ID NAME — derive a per-VLAN dnsmasq config from the LAN template.
#
# Arguments:
#   $1 - VLAN id, digits only
#   $2 - short name; letters, digits, '_' and '-' only, because it is spliced
#        into a sed expression and into the target file name
# Environment (optional; defaults are the original hard-coded paths):
#   NEWVLAN_CONF_DIR - dnsmasq drop-in directory (default /etc/dnsmasq.d)
#   NEWVLAN_TEMPLATE - template file (default $NEWVLAN_CONF_DIR/10-lan.conf)
# Outputs: the generated config (echoed by tee) and a confirmation line on
#          stdout; diagnostics on stderr.
# Returns: 0 on success, 2 on bad usage, non-zero if any step fails.
newvlan() {
  local id=${1-} name=${2-}
  local conf_dir=${NEWVLAN_CONF_DIR:-/etc/dnsmasq.d}
  local template=${NEWVLAN_TEMPLATE:-$conf_dir/10-lan.conf}
  local rendered

  if [[ $# -ne 2 ]]; then
    printf 'Usage: newvlan <vlan-id> <name>\n' >&2
    return 2
  fi
  if [[ ! $id =~ ^[0-9]+$ ]]; then
    printf 'newvlan: VLAN id must be numeric, got "%s"\n' "$id" >&2
    return 2
  fi
  if [[ ! $name =~ ^[A-Za-z0-9_-]+$ ]]; then
    printf 'newvlan: name may only contain letters, digits, _ and -\n' >&2
    return 2
  fi

  # NOTE(review): `s/0/$id/g` rewrites *every* 0 in the template (including
  # zeros inside IP addresses or lease times) — confirm the template is
  # written with that in mind.
  rendered=$(sed "s/lan/$name/g; s/0/$id/g" "$template") || return
  printf '%s\n' "$rendered" | sudo tee "$conf_dir/${id}-${name}.conf" || return

  # restart, not reload: reload only sends SIGHUP, and dnsmasq does not re-read
  # its configuration files on SIGHUP, so a new drop-in would be ignored.
  sudo systemctl restart dnsmasq || return
  echo "VLAN $id $name ready"
}
|
||||
```
|
||||
Usage: `newvlan 40 cameras`.
|
||||
|
||||
4. **Push-button backup**
|
||||
```bash
|
||||
# Single quotes keep $(date ...) literal so it is evaluated by cron at 03:00
# each night; inside double quotes it would expand once, at install time, and
# every backup would overwrite the same file. `\%` is required because cron
# treats a bare % as a newline.
entry='0 3 * * * tar czf /var/backups/dnsmasq-$(date +\%F).tgz /etc/dnsmasq* /var/lib/misc/dnsmasq.leases'
# Drop any identical existing line first so re-running does not duplicate the job.
{ sudo crontab -l 2>/dev/null | grep -vxF -- "$entry"; echo "$entry"; } | sudo crontab -
|
||||
```
|
||||
Daily tarball; restore = untar + reload.
|
||||
|
||||
5. **“Don’t-break-DNS” pre-commit hook**
|
||||
```bash
|
||||
sudo tee /etc/dnsmasq.d/.git/hooks/pre-commit <<'EOF'
|
||||
#!/bin/bash
|
||||
dnsmasq --test -C /dev/null -C <(cat /etc/dnsmasq.d/*.conf) || exit 1
|
||||
EOF
|
||||
sudo chmod +x /etc/dnsmasq.d/.git/hooks/pre-commit
|
||||
```
|
||||
Bad configs never reach `reload`.
|
||||
|
||||
Pick any two; you’ll **never touch the box again** unless the business truly changes.
|
||||
|
||||
---
|
||||
|
||||
──────────────────────────────
|
||||
MyCorp “One-Box Wonder” – End-to-End Deployment Guide
|
||||
──────────────────────────────
|
||||
|
||||
Reference in New Issue
Block a user