From 32a7cf5b25308b06e9b0d03f5f19244d99c33fa5 Mon Sep 17 00:00:00 2001
From: "soroush.asadi" <soroush.asadi@aliasaas.com>
Date: Mon, 15 Jun 2026 18:45:07 +0330
Subject: [PATCH] ops: nightly DB backup + self-hosted uptime monitoring
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Backup (production data-loss protection — was none):
- meezi-backup sidecar in docker-compose.yml runs pg_dump nightly at 02:00
  Tehran, gzip, 14-day rotation, atomic .partial→final, into ./backups
  (persists across deploys; rsync off-box per RESTORE.md).
- Wired into the deploy job (up -d --no-deps backup); takes one dump on boot.
- scripts/backup/pg-backup-loop.sh + RESTORE.md (restore + off-box guidance).

Monitoring:
- docker-compose.monitoring.yml: Uptime Kuma stack (own volume), stood up
  once, independent of app deploys.
- Caddyfile status.{$DOMAIN} route; docs/monitoring.md lists the exact
  monitors (incl. /q guest-menu 200 check) + TLS-expiry alerts (catches the
  ~90-day cert breakage early) + alert-channel setup.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .gitea/workflows/ci-cd.yml       |  5 +++
 Caddyfile                        |  8 ++++
 docker-compose.monitoring.yml    | 29 +++++++++++++++
 docker-compose.yml               | 24 ++++++++++++
 docs/monitoring.md               | 47 ++++++++++++++++++++++++
 scripts/backup/RESTORE.md        | 55 ++++++++++++++++++++++++++++
 scripts/backup/pg-backup-loop.sh | 63 ++++++++++++++++++++++++++++++++
 7 files changed, 231 insertions(+)
 create mode 100644 docker-compose.monitoring.yml
 create mode 100644 docs/monitoring.md
 create mode 100644 scripts/backup/RESTORE.md
 create mode 100644 scripts/backup/pg-backup-loop.sh

diff --git a/.gitea/workflows/ci-cd.yml b/.gitea/workflows/ci-cd.yml
index 99d2fd6..2e1f63e 100644
--- a/.gitea/workflows/ci-cd.yml
+++ b/.gitea/workflows/ci-cd.yml
@@ -446,6 +446,11 @@ jobs:
             -f docker-compose.admin.yml \
             up -d --no-deps admin-web
 
+      - name: Start nightly DB backup
+        # Sidecar that pg_dumps meezi-db nightly into ./backups (14-day retention).
+        # --no-deps so it doesn't try to (re)start postgres which isn't compose-managed.
+        run: docker compose up -d --no-deps backup
+
       - name: Show all running containers
         if: always()
         run: docker compose -f docker-compose.yml -f docker-compose.admin.yml ps
diff --git a/Caddyfile b/Caddyfile
index 32f171c..da3ced3 100644
--- a/Caddyfile
+++ b/Caddyfile
@@ -7,6 +7,7 @@
 # Domains needed in DNS (all → same server IP):
 #   meezi.ir, app.meezi.ir, api.meezi.ir,
 #   koja.meezi.ir, admin.meezi.ir, admin-api.meezi.ir
+#   status.meezi.ir  (only if the monitoring stack is running — see docs/monitoring.md)
 
 {
     email {$ACME_EMAIL}
@@ -41,3 +42,10 @@ admin.{$DOMAIN} {
 admin-api.{$DOMAIN} {
     reverse_proxy admin-api:8080
 }
+
+# ── Uptime monitoring (Uptime Kuma) ──────────────────────────────────────────
+# Answers 502 until the monitoring stack is up (docker-compose.monitoring.yml).
+# Caddy resolves the upstream name per request, not at config load, so this block is harmless while the container is absent.
+status.{$DOMAIN} {
+    reverse_proxy uptime-kuma:3001
+}
diff --git a/docker-compose.monitoring.yml b/docker-compose.monitoring.yml
new file mode 100644
index 0000000..5db2781
--- /dev/null
+++ b/docker-compose.monitoring.yml
@@ -0,0 +1,29 @@
+name: meezi
+
+# Self-hosted uptime monitoring for Meezi — Uptime Kuma.
+#
+# One-time stand-up (does NOT need redeploying with every app deploy):
+#   docker compose -f docker-compose.monitoring.yml up -d
+#
+# Then open https://status.meezi.ir (or http://SERVER:3201) and configure the
+# monitors + alert channel as described in docs/monitoring.md.
+#
+# Config + history persist in the uptime_kuma_data volume.
+
+services:
+  uptime-kuma:
+    image: ${UPTIME_KUMA_IMAGE:-mirror.soroushasadi.com/louislam/uptime-kuma:1}
+    container_name: meezi-uptime-kuma
+    restart: unless-stopped
+    volumes:
+      - uptime_kuma_data:/app/data
+    ports:
+      - "${UPTIME_KUMA_PORT:-3201}:3001"
+    healthcheck:
+      test: ["CMD-SHELL", "node extra/healthcheck.js || exit 1"]
+      interval: 60s
+      timeout: 10s
+      retries: 3
+
+volumes:
+  uptime_kuma_data:
diff --git a/docker-compose.yml b/docker-compose.yml
index f4c9e55..0b3de8d 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -177,6 +177,30 @@ services:
     ports:
       - "${KOJA_PORT:-3103}:3000"
 
+  # Nightly Postgres backup — dumps the DB every night, keeps the last 14 days.
+  # Dumps land in the host ./backups dir (bind mount) so they survive a full
+  # container/volume wipe and can be rsync'd off-box. See scripts/backup/RESTORE.md.
+  backup:
+    image: ${POSTGRES_IMAGE:-mirror.soroushasadi.com/postgres:16-alpine}
+    container_name: meezi-backup
+    restart: unless-stopped
+    depends_on:
+      postgres:
+        condition: service_healthy
+    environment:
+      PGHOST: postgres
+      PGPORT: "5432"
+      PGUSER: meezi
+      PGPASSWORD: "${DB_PASSWORD:-meezi_local_pass}"
+      PGDATABASE: meezi
+      RETAIN_DAYS: "${BACKUP_RETAIN_DAYS:-14}"
+      BACKUP_HOUR: "${BACKUP_HOUR:-2}"
+      TZ: Asia/Tehran
+    entrypoint: ["/bin/sh", "/backup/pg-backup-loop.sh"]
+    volumes:
+      - ./scripts/backup:/backup:ro
+      - ${BACKUP_DIR:-./backups}:/backups
+
 volumes:
   postgres_data:
   redis_data:
diff --git a/docs/monitoring.md b/docs/monitoring.md
new file mode 100644
index 0000000..757c96f
--- /dev/null
+++ b/docs/monitoring.md
@@ -0,0 +1,47 @@
+# Meezi uptime monitoring (Uptime Kuma)
+
+Self-hosted uptime + TLS-expiry monitoring with alerting. Runs as a separate
+compose stack so it stays up independently of app deploys.
+
+## Stand it up (one time, on the prod host)
+```bash
+cd /path/to/meezi
+docker compose -f docker-compose.monitoring.yml up -d
+```
+Then either:
+- add a DNS A record `status.meezi.ir → server IP` and reload Caddy
+  (`docker exec meezi-caddy caddy reload` or restart the caddy stack) — the
+  `status.{$DOMAIN}` block is already in the Caddyfile, **or**
+- reach it directly at `http://SERVER:3201` for the initial setup.
+
+First visit creates the admin account — do it right after start-up (until then anyone who can reach the port can claim it) and set a strong password.
+
+## Monitors to add (in the Uptime Kuma UI)
+Add one **HTTP(s)** monitor per public surface, interval 60s, accept 2xx/3xx:
+
+| Name | URL | Notes |
+|------|-----|-------|
+| Website | https://meezi.ir/fa | marketing |
+| Dashboard | https://app.meezi.ir/fa/login | merchant panel |
+| API health | https://api.meezi.ir/api/public/security-config | returns JSON 200 |
+| Koja | https://koja.meezi.ir/fa | public discovery |
+| Admin | https://admin.meezi.ir | internal panel |
+| Guest menu | https://app.meezi.ir/q/healthcheck | should be 200 (not 500) |
+
+For each HTTPS monitor enable **"Certificate Expiry Notification"** — this
+catches the recurring ~90-day Let's Encrypt cert-chain breakages early
+(see the mirror-cert runbook). Set the threshold to 14 days.
+
+## Alerts
+Settings → Notifications → add a channel (Telegram bot or email/SMTP), then
+attach it to every monitor. Telegram is simplest: create a bot via @BotFather,
+get the chat id, paste both into Uptime Kuma.
+
+## What this does NOT replace
+- **Backups** — see `scripts/backup/RESTORE.md`.
+- **Crash auto-recovery** — Docker `restart: unless-stopped` already restarts
+  crashed containers; Uptime Kuma tells you when one is flapping or down.
+
+## Status page (optional)
+Uptime Kuma can publish a public status page (Settings → Status Pages) at
+`status.meezi.ir/status/meezi` if you want customers to see uptime.
diff --git a/scripts/backup/RESTORE.md b/scripts/backup/RESTORE.md
new file mode 100644
index 0000000..00d23c9
--- /dev/null
+++ b/scripts/backup/RESTORE.md
@@ -0,0 +1,55 @@
+# Meezi database backup & restore
+
+## How backups work
+The `meezi-backup` container (in `docker-compose.yml`) runs a nightly `pg_dump`
+of the whole `meezi` database at **02:00 Asia/Tehran**, gzips it, and keeps the
+**last 14 days** in the host `./backups` directory (override with `BACKUP_DIR`).
+Filenames: `meezi_YYYYMMDD_HHMMSS.sql.gz`. One backup is also taken immediately
+when the container first starts.
+
+Check it's running / list backups:
+```bash
+docker logs meezi-backup --tail 20
+ls -lh ./backups
+```
+
+## ⚠️ Copy backups OFF the server
+The bind-mounted `./backups` survives a container/volume wipe, but **not a disk
+failure**. Add an off-box copy (run from the host via cron), e.g.:
+```bash
+# rsync to another host nightly at 03:00 (no --delete, so an emptied ./backups can't wipe the off-box copies; prune old dumps on the backup host)
+0 3 * * * rsync -az /path/to/meezi/backups/ user@backup-host:/srv/meezi-backups/
+```
+or `rclone copy ./backups remote:meezi-backups` to object storage.
+
+## Restore
+1. Pick a dump:
+   ```bash
+   ls -lh ./backups          # choose e.g. meezi_20260615_020000.sql.gz
+   ```
+2. (Recommended) stop the API so nothing writes mid-restore:
+   ```bash
+   docker stop meezi-api
+   ```
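+3. Restore into the running Postgres container (only into an empty `meezi` DB — the plain dump has no DROP statements, so on a populated DB use the drop & recreate variant below):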
+   ```bash
+   gunzip -c ./backups/meezi_20260615_020000.sql.gz \
+     | docker exec -i meezi-db psql -U meezi -d meezi
+   ```
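+   For a clean restore into an empty DB, drop & recreate first (stop every service that connects to `meezi`, or `DROP DATABASE` is refused while sessions are open):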
+   ```bash
+   docker exec -i meezi-db psql -U meezi -d postgres -c "DROP DATABASE meezi;"
+   docker exec -i meezi-db psql -U meezi -d postgres -c "CREATE DATABASE meezi OWNER meezi;"
+   gunzip -c ./backups/<dump>.sql.gz | docker exec -i meezi-db psql -U meezi -d meezi
+   ```
+4. Start the API again (it runs EF migrations on boot, which is a no-op if the
+   dump is current):
+   ```bash
+   docker start meezi-api
+   ```
+
+## Manual one-off backup
+```bash
+docker exec meezi-db pg_dump -U meezi --no-owner --no-privileges meezi \
+  | gzip -9 > ./backups/meezi_manual_$(date +%Y%m%d_%H%M%S).sql.gz
+```
diff --git a/scripts/backup/pg-backup-loop.sh b/scripts/backup/pg-backup-loop.sh
new file mode 100644
index 0000000..b07de77
--- /dev/null
+++ b/scripts/backup/pg-backup-loop.sh
@@ -0,0 +1,63 @@
+#!/bin/sh
+# Nightly Postgres backup loop for Meezi.
+#
+# Runs inside a small postgres-image container (has pg_dump/gzip). Every day at
+# ~02:00 Tehran it dumps the whole database, gzips it, and deletes dumps older
+# than RETAIN_DAYS days from /backups. Designed to be dead-simple and dependency-free:
+# no cron daemon, just sleep-until-next-run so it survives container restarts.
+#
+# Env:
+#   PGHOST, PGUSER, PGPASSWORD, PGDATABASE  — connection (from compose)
+#   RETAIN_DAYS    — delete dumps older than this many days (default 14)
+#   BACKUP_HOUR    — local hour to run (default 2 = 02:00)
+set -eu
+
+RETAIN_DAYS="${RETAIN_DAYS:-14}"
+BACKUP_HOUR="${BACKUP_HOUR:-2}"
+OUT_DIR=/backups
+export TZ="${TZ:-Asia/Tehran}"
+
+log() { echo "[pg-backup $(date '+%Y-%m-%d %H:%M:%S %Z')] $*"; }
+
+run_backup() {
+  ts=$(date '+%Y%m%d_%H%M%S')
+  tmp="$OUT_DIR/.meezi_${ts}.sql.gz.partial"
+  final="$OUT_DIR/meezi_${ts}.sql.gz"
+  log "starting dump → $final"
+  # Dump to a hidden .partial, then rename: a crash never leaves a file that
+  # looks like a good backup. pg_dump compresses itself (gzip format) so `if`
+  # sees pg_dump's own exit status — `pg_dump | gzip` only reports gzip's.
+  if pg_dump --no-owner --no-privileges --compress=9 --file="$tmp" && mv "$tmp" "$final"; then
+    size=$(wc -c < "$final" 2>/dev/null || echo '?')
+    log "done ($size bytes)"
+  else
+    rm -f "$tmp"
+    log "ERROR: dump failed"
+    return 1
+  fi
+  # Rotate: delete dumps older than RETAIN_DAYS days.
+  find "$OUT_DIR" -maxdepth 1 -name 'meezi_*.sql.gz' -mtime "+${RETAIN_DAYS}" -print -delete | while read -r f; do
+    log "rotated out $f"
+  done
+}
+
+seconds_until_next_run() {
+  # Prints seconds until the next BACKUP_HOUR:00:00 local time (1..86400).
+  # h/m/s come from ONE date call: three separate calls can straddle a
+  # minute/hour rollover and land up to an hour off. ${x#0} keeps 08/09 decimal.
+  set -- $(date '+%H %M %S')
+  now=$(( ${1#0} * 3600 + ${2#0} * 60 + ${3#0} ))
+  target=$(( ${BACKUP_HOUR#0} * 3600 ))
+  [ "$now" -lt "$target" ] || target=$(( target + 86400 ))  # passed today → tomorrow
+  echo $(( target - now ))
+}
+
+log "backup loop started (retain ${RETAIN_DAYS}d, daily at ${BACKUP_HOUR}:00 ${TZ})"
+# Dump once right at startup so a fresh container never goes a day without one.
+run_backup || :
+while :; do
+  delay=$(seconds_until_next_run)
+  log "next backup in ${delay}s"
+  sleep "$delay"
+  run_backup || :
+done