Compare commits

..

No commits in common. "19e675fa5a9975bb1310b96b7423c61ca187d2c8" and "293c1f88d8f0778f58b47755db15f93eddf2758c" have entirely different histories.

12 changed files with 44 additions and 170 deletions

View file

@ -1,22 +0,0 @@
# syntax=docker/dockerfile:1
# Custom Caddy image: vanilla Caddy + the Gandi DNS-01 plugin (ADR-024).
#
# WHY: mesh/LAN-only services have no public A-record, so they cannot satisfy ACME
# HTTP-01; they need DNS-01 against Gandi (the M1 *.<domain> wildcard strategy).
# Caddy's official image ships no third-party DNS plugins, so we compile one in.
#
# WHERE to build: on ubongo (the control node) — NOT on askari/Hetzner. Google's Go
# module proxy 403s Hetzner IP ranges, which broke the original on-host build (M4a).
# Build here, push the pinned tag/digest to the Forgejo registry, pull on askari.
#
# Versions pinned (ADR-011/ADR-014). caddy-dns/gandi v1.1.0 -> libdns/gandi v1.1.0,
# which authenticates with a Gandi Personal Access Token via "Authorization: Bearer"
# against https://api.gandi.net/v5/livedns (the legacy Apikey scheme is gone — using
# a PAT in the old Apikey slot 403s, which is what sank the M4a attempt).
# verified: caddy-dns/gandi v1.1.0 sends the PAT as Bearer · WebFetch libdns/gandi
# client.go @master (go.mod requires v1.1.0) · 2026-06-15
# Stage 1 (build): use xcaddy from the matching -builder image to compile a Caddy
# v2.11.4 binary with the caddy-dns/gandi v1.1.0 module compiled in.
FROM caddy:2.11.4-builder AS build
RUN xcaddy build v2.11.4 --with github.com/caddy-dns/gandi@v1.1.0
# Stage 2 (runtime): start from the stock caddy:2.11.4 image and copy the
# plugin-enabled binary from the build stage to /usr/bin/caddy.
# Keep all three 2.11.4 pins (builder tag, xcaddy target, runtime tag) in lockstep
# when bumping versions.
FROM caddy:2.11.4
COPY --from=build /usr/bin/caddy /usr/bin/caddy

View file

@ -19,10 +19,6 @@ TF := terraform
TF_ENV ?= staging TF_ENV ?= staging
MOLECULE_IMAGE := forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest MOLECULE_IMAGE := forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile
# Custom Caddy + Gandi DNS-01 plugin (ADR-024). Build on ubongo, NOT askari/Hetzner
# (the Go module proxy 403s Hetzner IPs); push the pinned tag to the Forgejo registry.
CADDY_IMAGE := forgejo.nyumbani.baobab.band/sjat/caddy-gandi:2.11.4
CADDY_DOCKERFILE := .docker/caddy-gandi/Dockerfile
# For TF_ENV=offsite, source the Hetzner token from the vault into the environment # For TF_ENV=offsite, source the Hetzner token from the vault into the environment
# (rbw must be unlocked). Read in-memory; never written to a tfvars file (CLAUDE.md). # (rbw must be unlocked). Read in-memory; never written to a tfvars file (CLAUDE.md).
@ -37,7 +33,7 @@ endif
.PHONY: help setup collections lint test test-all check deploy encrypt decrypt \ .PHONY: help setup collections lint test test-all check deploy encrypt decrypt \
edit-vault check-vault new-role \ edit-vault check-vault new-role \
tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \ tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \
molecule-image molecule-image-push caddy-image caddy-image-push molecule-image molecule-image-push
help: help:
@echo "" @echo ""
@ -67,8 +63,6 @@ help:
@echo "" @echo ""
@echo " make molecule-image Build the Molecule test image locally" @echo " make molecule-image Build the Molecule test image locally"
@echo " make molecule-image-push Push the test image to the Forgejo registry" @echo " make molecule-image-push Push the test image to the Forgejo registry"
@echo " make caddy-image Build the custom Caddy + Gandi DNS-01 image (run on ubongo)"
@echo " make caddy-image-push Push the Caddy image to the Forgejo registry"
@echo "" @echo ""
# ── Environment setup ───────────────────────────────────────────────────────── # ── Environment setup ─────────────────────────────────────────────────────────
@ -149,16 +143,6 @@ molecule-image:
molecule-image-push: molecule-image molecule-image-push: molecule-image
docker push $(MOLECULE_IMAGE) docker push $(MOLECULE_IMAGE)
# ── Custom Caddy image (Gandi DNS-01 plugin, ADR-024) ─────────────────────────
# DNS-01 (wildcard / mesh-LAN-only certs) needs the caddy-dns/gandi plugin compiled
# in via xcaddy. Build on ubongo — Google's Go module proxy 403s Hetzner IPs.
caddy-image:
docker build -t $(CADDY_IMAGE) -f $(CADDY_DOCKERFILE) .docker/caddy-gandi
caddy-image-push: caddy-image
docker push $(CADDY_IMAGE)
# ── Terraform ───────────────────────────────────────────────────────────────── # ── Terraform ─────────────────────────────────────────────────────────────────
tf-init: tf-init:

View file

@ -32,7 +32,7 @@ _Last reviewed: 2026-06-14._
| `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. | | `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. |
| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **Pending:** NetBird mesh enrollment (so SSH is LAN-only); full `base` hardening (only the `firewall` concern exists, and it is NOT applied here — applying default-deny with no mesh would lock out inbound SSH on the physical NIC); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). | | `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). 
Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **Pending:** NetBird mesh enrollment (so SSH is LAN-only); full `base` hardening (only the `firewall` concern exists, and it is NOT applied here — applying default-deny with no mesh would lock out inbound SSH on the physical NIC); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). |
| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **Pending:** NetBird coordinator (M4b), host firewall + mesh enrollment (M5), offsite tfstate backup (ADR-022). | | `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3).** **Docker + Caddy reverse proxy (M4a):** `docker_host` + `reverse_proxy` (vanilla Caddy, HTTP-01) applied; `https://test.askari.wingu.me` serves a valid Let's Encrypt cert ✓ (firewall opens 80/443/3478). **Pending:** NetBird coordinator (M4b), host firewall + mesh enrollment (M5), offsite tfstate backup (ADR-022). |
| `roles/docker_host/` (Docker engine) + `roles/reverse_proxy/` (Caddy, ADR-024) | **Built + applied** (askari, M4a). `docker_host` installs Docker CE + compose; `reverse_proxy` is boma's standard Caddy proxy (HTTP-01 for public hosts; routes from `reverse_proxy__routes`). **DNS-01 for mesh/LAN-only services is now built + proven (2026-06-15):** custom `caddy-gandi` image (`.docker/caddy-gandi/`, `make caddy-image`, pinned caddy-dns/gandi v1.1.0 → Bearer PAT), enabled per-instance via `reverse_proxy__acme_dns_provider: gandi` + `reverse_proxy__image`. Verified end-to-end — a real wildcard cert issued via LE **staging** + Gandi DNS-01 with `vault.gandi.pat`. M4a's deferral (version skew + Hetzner-IP build) is closed; image **pending registry push** (`make caddy-image-push` needs `docker login`). | | `roles/docker_host/` (Docker engine) + `roles/reverse_proxy/` (Caddy, ADR-024) | **Built + applied** (askari, M4a). `docker_host` installs Docker CE + compose; `reverse_proxy` is boma's standard Caddy proxy (HTTP-01 for public hosts; routes from `reverse_proxy__routes`). DNS-01 for cluster mesh/LAN-only services is deferred to Phase 2 (caddy-dns/gandi unresolved — see FRICTION). |
## Scaffolded but empty — NOT implemented ## Scaffolded but empty — NOT implemented

View file

@ -22,17 +22,6 @@ earning its keep.
_(append new raw signals here; the next kaizen review consumes them)_ _(append new raw signals here; the next kaizen review consumes them)_
- `[friction]` **Image push to the Forgejo registry fails with `no basic auth
credentials`** (2026-06-15): `make caddy-image-push` (and `molecule-image-push`) fail
unless the invoking user on ubongo has already run an interactive `docker login
forgejo.nyumbani.baobab.band` — and those creds are **not in vault** (only
`gandi` + `hetzner` are), so an agent can't complete a push non-interactively. The
build half is fully automatable; the push half implicitly requires a human. → candidate:
document the `docker login` step in `docs/runbooks/claude-code-setup.md`, **or** store
a scoped Forgejo registry token in vault + a `make registry-login` target (login via
`--password-stdin`, `no_log`) so pushes are agent-completable like every other
vault-backed action.
- `[recurring]` **ADRs claim cross-doc reconciliation they didn't actually perform** - `[recurring]` **ADRs claim cross-doc reconciliation they didn't actually perform**
(2026-06-14): ADR-024's Status + Consequences asserted "ADR-017 prose that mentioned (2026-06-14): ADR-024's Status + Consequences asserted "ADR-017 prose that mentioned
Traefik is updated to read Caddy" — but ADR-008/017/019 + CAPABILITIES still said Traefik is updated to read Caddy" — but ADR-008/017/019 + CAPABILITIES still said
@ -63,7 +52,7 @@ which migrate or archive (knowledge is never deleted).
| Brainstorming spec-review gate fires despite the standing agreement (06-10) | CHANGE → mechanical | Extended the same Stop hook with a tight second matcher (review + "the spec" + "before" + "implementation plan", or the literal "spec written and committed"); tested to block the gate and pass meta-discussion. Same external-skill-script-vs-convention family as the execution menu. | | Brainstorming spec-review gate fires despite the standing agreement (06-10) | CHANGE → mechanical | Extended the same Stop hook with a tight second matcher (review + "the spec" + "before" + "implementation plan", or the literal "spec written and committed"); tested to block the gate and pass meta-discussion. Same external-skill-script-vs-convention family as the execution menu. |
| Subagent faithfulness self-reports can be wrong (06-10) | ACCEPTED | The mitigation — independent two-stage review where the reviewer is told "do not trust the report" and reads the actual diff — is now embodied in `superpowers:subagent-driven-development`, used for the `/kaizen` build itself. Revisit if it recurs. | | Subagent faithfulness self-reports can be wrong (06-10) | ACCEPTED | The mitigation — independent two-stage review where the reviewer is told "do not trust the report" and reads the actual diff — is now embodied in `superpowers:subagent-driven-development`, used for the `/kaizen` build itself. Revisit if it recurs. |
| ADR-writing policy unsettled (05-31) | ALREADY-BUILT | ADR-023 (ADR structure & lifecycle) + `docs/decisions/adr-template.md` settle status/sections — both postdate this signal. | | ADR-writing policy unsettled (05-31) | ALREADY-BUILT | ADR-023 (ADR structure & lifecycle) + `docs/decisions/adr-template.md` settle status/sections — both postdate this signal. |
| Hetzner 403 / caddy-dns DNS-01 didn't issue (06-14) | ALREADY-BUILT **RESOLVED 2026-06-15** | 06-14: ADR-024 recorded the HTTP-01 decision + DNS-01 deferral. 06-15: deferral **closed** — root cause was **version skew** (pre-Bearer `libdns/gandi` sent Gandi's deprecated `Apikey` header → 403) plus building on a Hetzner IP. Fix: pin caddy-dns/gandi v1.1.0 (Bearer PAT) + build on ubongo. DNS-01 now built + proven (real wildcard cert via LE staging). See ADR-024 Status + STATUS.md + `roles/reverse_proxy`. | | Hetzner 403 / caddy-dns DNS-01 didn't issue (06-14) | ALREADY-BUILT | ADR-024's revised Status records the HTTP-01 decision, the DNS-01 deferral to Phase 2, and the Hetzner-build + plugin blocks. |
| `apply:{tags}` not propagated by dynamic `include_tasks` (06-14) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "Tags on dynamic `include_tasks` need `apply:`". | | `apply:{tags}` not propagated by dynamic `include_tasks` (06-14) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "Tags on dynamic `include_tasks` need `apply:`". |
| Molecule CAN test tag-propagation, via a tagged converge (06-14) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "Testing concern-tag isolation in Molecule". | | Molecule CAN test tag-propagation, via a tagged converge (06-14) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "Testing concern-tag isolation in Molecule". |
| apply=false Molecule + data-pytest gap for API/templating roles (06-14) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "API / templating roles: render-only tests miss the real call". | | apply=false Molecule + data-pytest gap for API/templating roles (06-14) | SYSTEMATIZE | → `docs/testing/gotchas.md` — "API / templating roles: render-only tests miss the real call". |

View file

@ -112,8 +112,7 @@ active. Full CIS L1/L2, auditd, AppArmor, AIDE remain deferred to Phase 2 (TODO
Built in two phases. **M4a (platform) — ✅ DONE:** Docker on askari + boma's standard Built in two phases. **M4a (platform) — ✅ DONE:** Docker on askari + boma's standard
**Caddy** reverse proxy (ADR-024), proven by `https://test.askari.wingu.me` serving a **Caddy** reverse proxy (ADR-024), proven by `https://test.askari.wingu.me` serving a
valid Let's Encrypt cert (HTTP-01; the Gandi **DNS-01** path is now built + proven — valid Let's Encrypt cert (HTTP-01 — DNS-01 deferred to Phase 2, see ADR-024/FRICTION).
2026-06-15, see ADR-024 — for mesh/LAN-only cluster services).
Firewall opened 80/443/3478. Spec/plan: `…2026-06-14-netbird-coordinator-m4-design.md` / Firewall opened 80/443/3478. Spec/plan: `…2026-06-14-netbird-coordinator-m4-design.md` /
`…2026-06-14-m4a-docker-caddy.md`. **M4b (next):** the `netbird_coordinator` service `…2026-06-14-m4a-docker-caddy.md`. **M4b (next):** the `netbird_coordinator` service
role — read NetBird's current self-host compose then. role — read NetBird's current self-host compose then.

View file

@ -2,31 +2,18 @@
## Status ## Status
Accepted (2026-06-14; DNS-01 path resolved + proven 2026-06-15). Amends the soft Accepted (2026-06-14). Amends the soft Traefik assumption carried by the roadmap
Traefik assumption carried by the roadmap (Phase-2 step 5) and ADR-017 prose; those (Phase-2 step 5) and ADR-017 prose; those are updated to read "Caddy (ADR-024)".
are updated to read "Caddy (ADR-024)".
> **Cert method follows exposure.** The cert *challenge* depends on whether a host is > **Cert method follows exposure (revised 2026-06-14, M4a).** The cert *challenge*
> publicly reachable: **public hosts** (askari) use **HTTP-01** with **vanilla Caddy** — > depends on whether a host is publicly reachable: **public hosts** (askari) use
> simplest, no plugin; **mesh/LAN-only cluster services** (no public A-record) use > **HTTP-01** with **vanilla Caddy** — simplest, no plugin; **mesh/LAN-only cluster
> **DNS-01** via Gandi (the M1 capability), since they can't satisfy HTTP-01. > services** (no public A-record) need **DNS-01** (the M1 Gandi capability), since they
> > can't satisfy HTTP-01. The DNS-01 path is **deferred to Phase 2**: the `caddy-dns/gandi`
> **DNS-01 resolved + proven (2026-06-15) — the M4a deferral is closed.** The original > plugin did not create the ACME TXT records on askari despite a verified-valid token
> failure was diagnosed as **version skew**: the image built at M4a used a pre-Bearer > (and Hetzner IPs are 403'd by Google's Go module infra, blocking the on-host custom
> `libdns/gandi` that sent Gandi's **deprecated `Apikey` header** (→ 403 on a > build) — both to be sorted when the cluster's private services actually need DNS-01.
> verified-valid token), and the `xcaddy` build ran *on a Hetzner IP* (Google's Go > The body below describes the DNS-01 design; askari (M4a) ships on HTTP-01.
> module proxy 403s those ranges). Both have clean, boma-aligned fixes: **pin
> caddy-dns/gandi v1.1.0** (→ `libdns/gandi` v1.1.0, which sends the PAT as
> `Authorization: Bearer` to `https://api.gandi.net/v5/livedns`) and **build the image
> on ubongo, not Hetzner**. Verified end-to-end (2026-06-15): the custom image issues a
> real **wildcard** cert (`*.dns01test.wingu.me`) against Let's Encrypt **staging** via
> Gandi DNS-01 using `vault.gandi.pat`; `caddy validate` accepts `acme_dns gandi` on the
> custom image and rejects it on vanilla `caddy:2`. Build with `make caddy-image`; the
> `reverse_proxy` role enables it per-instance via `reverse_proxy__acme_dns_provider:
> gandi` + `reverse_proxy__image`. **Traefik was reconsidered and rejected again** —
> lego's Gandi provider faces the *same* PAT-vs-Apikey question, so switching would not
> have dodged the issue, and would reverse this ADR for nothing. askari (M4a) stays on
> HTTP-01 (a public host needs no DNS-01).
## Context ## Context
@ -70,32 +57,26 @@ boma's reverse proxy is **Caddy**.
5. `forward_auth` to Authentik is a first-class Caddy directive — the planned 5. `forward_auth` to Authentik is a first-class Caddy directive — the planned
Authentik auth story (ADR-002) is preserved without Traefik as the middleman. Authentik auth story (ADR-002) is preserved without Traefik as the middleman.
### 2. Custom image (DNS-01 path — built) ### 2. Custom image (DNS-01 path only — Phase 2)
> Applies only to the **DNS-01** path. M4a ships **vanilla `caddy:2`** on askari > Applies only to the **DNS-01** path, which is **deferred to Phase 2** (see the Status
> (HTTP-01) — no custom image; only DNS-01 hosts pull the custom one. > note). M4a ships **vanilla `caddy:2`** on askari (HTTP-01) — no custom image.
Caddy's official Docker image does not include third-party DNS plugins. The Caddy's official Docker image does not include third-party DNS plugins. The `caddy-dns/gandi`
`caddy-dns/gandi` plugin must be compiled in via `xcaddy`. boma builds a custom image plugin must be compiled in via `xcaddy`. When the cluster's mesh/LAN-only services need
(`.docker/caddy-gandi/Dockerfile`, `make caddy-image`), **pinned** (ADR-011/ADR-014): DNS-01, boma builds a custom image:
```dockerfile ```
FROM caddy:2.11.4-builder AS build FROM caddy:builder AS builder
RUN xcaddy build v2.11.4 --with github.com/caddy-dns/gandi@v1.1.0 RUN xcaddy build --with github.com/caddy-dns/gandi
FROM caddy:2.11.4 FROM caddy:latest
COPY --from=build /usr/bin/caddy /usr/bin/caddy COPY --from=builder /usr/bin/caddy /usr/bin/caddy
``` ```
Two hard constraints, both learned from the M4a failure: That image would be maintained as a boma artifact (Forgejo registry, pinned digest in the
Compose template) — the cost of the Gandi DNS-01 path. (On askari this approach hit two
1. **Build on ubongo, not Hetzner.** Google's Go module proxy 403s Hetzner IP ranges, so blockers, so DNS-01 is deferred; see the Status note.)
the on-host build on askari failed. ubongo (the control node) builds it in ~1 min,
then it is pushed to the Forgejo registry (`make caddy-image-push`) and pulled by
DNS-01 hosts — the same artifact pattern as the Molecule image.
2. **Pin a Bearer-capable plugin.** caddy-dns/gandi v1.1.0 → libdns/gandi v1.1.0 sends
the PAT as `Authorization: Bearer`. Older versions used the deprecated `Apikey`
header and 403 on a PAT — that was the M4a "valid token but no TXT record" symptom.
### 3. Deployment scope ### 3. Deployment scope
@ -115,11 +96,9 @@ middleware migration is required.
- **Roadmap Phase-2 step 5** is updated from "Authentik + Traefik" to "Authentik + - **Roadmap Phase-2 step 5** is updated from "Authentik + Traefik" to "Authentik +
Caddy (ADR-024)". Caddy (ADR-024)".
- **ADR-017 prose** that mentioned Traefik is updated to read "Caddy (ADR-024)". - **ADR-017 prose** that mentioned Traefik is updated to read "Caddy (ADR-024)".
- M4a (public hosts, HTTP-01) runs **vanilla `caddy:2`** — no custom image. The DNS-01 - M4a (public hosts, HTTP-01) runs **vanilla `caddy:2`** — no custom image. **If/when**
custom Caddy image (`xcaddy` + `caddy-dns/gandi`, `.docker/caddy-gandi/`) is **built and the Phase-2 DNS-01 path lands, a custom Caddy image (`xcaddy` + `caddy-dns/gandi`) must
proven**; it must be pushed to the Forgejo registry (`make caddy-image-push`, needs be built, pushed to the Forgejo registry, and kept current (plugin + base image updates).
`docker login`) and kept current (plugin + base-image version bumps, pinned per
ADR-011/ADR-014) as DNS-01 cluster services come online.
- Caddyfile config is rendered by Ansible from `group_vars` — consistent with ADR-004 - Caddyfile config is rendered by Ansible from `group_vars` — consistent with ADR-004
and easier to review than distributed container labels. and easier to review than distributed container labels.
- `forward_auth` to Authentik is available when Authentik is deployed; no extra - `forward_auth` to Authentik is available when Authentik is deployed; no extra

View file

@ -1,34 +1,18 @@
# reverse_proxy # reverse_proxy
Boma's standard Caddy reverse proxy (ADR-024). Runs on `askari` (the off-site Boma's standard Caddy reverse proxy (ADR-024). Runs on `askari` (the off-site
Hetzner host) and terminates TLS for services. It supports **two ACME challenge Hetzner host) and terminates TLS for all public-facing services via ACME HTTP-01.
types**, chosen per proxy instance by exposure: Uses the official `caddy:2` image — no custom build, no DNS plugin, no token required.
- **HTTP-01 (default)** — public hosts with an A-record (askari). Official `caddy:2`
image; no plugin, no token.
- **DNS-01 via Gandi** — mesh/LAN-only hosts with **no** public A-record (the cluster),
where HTTP-01 is impossible. Needs the custom `caddy-gandi` image and the Gandi PAT.
## How TLS works ## How TLS works
**HTTP-01 (default).** Caddy obtains per-hostname certificates using the ACME HTTP-01 Caddy obtains per-hostname certificates using the ACME HTTP-01 challenge. Port 80
challenge. Port 80 must be reachable from the internet. Each `host` in must be reachable from the internet for the challenge to succeed. Each `host` in
`reverse_proxy__routes` gets its own certificate automatically. `reverse_proxy__routes` gets its own certificate automatically.
**DNS-01 (Gandi).** Set `reverse_proxy__acme_dns_provider: gandi` and point > **DNS-01 (for mesh/LAN-only cluster services) is deferred to Phase 2.** The
`reverse_proxy__image` at the custom Caddy image (`make caddy-image`, built on ubongo > `caddy-dns/gandi` plugin failed to issue certificates during M4a and needs
and pushed to the Forgejo registry — see `.docker/caddy-gandi/`). Caddy then proves > investigation before it can be used.
domain control by writing ACME TXT records through the Gandi LiveDNS API, so it can
issue certs — **including wildcards** — for hosts that are never publicly reachable.
The token (`vault.gandi.pat`) is injected as `GANDI_BEARER_TOKEN` via a host-only
`env` file (mode 0600) and sent as a **Bearer PAT** (the legacy Apikey scheme is gone).
> **Verified (2026-06-15):** the custom image issues a real wildcard cert
> (`*.dns01test.wingu.me`) end-to-end against Let's Encrypt staging via Gandi DNS-01;
> `caddy validate` accepts the `acme_dns gandi` directive on the custom image and
> rejects it on vanilla `caddy:2` (`module not registered: dns.providers.gandi`). The
> original M4a failure was version skew (a pre-Bearer `libdns/gandi` that sent the
> deprecated Apikey header) plus building the image on a Hetzner IP (Go proxy 403).
## Route catalog — `reverse_proxy__routes` ## Route catalog — `reverse_proxy__routes`
@ -62,8 +46,6 @@ Use `upstream` to proxy to a Docker service, or `respond` to return a static str
| `reverse_proxy__base_dir` | `/opt/services/reverse_proxy` | Working directory for Compose project | | `reverse_proxy__base_dir` | `/opt/services/reverse_proxy` | Working directory for Compose project |
| `reverse_proxy__acme_email` | `admin@example.test` | ACME registration email | | `reverse_proxy__acme_email` | `admin@example.test` | ACME registration email |
| `reverse_proxy__routes` | `[]` | List of `{host, upstream}` or `{host, respond}` entries | | `reverse_proxy__routes` | `[]` | List of `{host, upstream}` or `{host, respond}` entries |
| `reverse_proxy__image` | `caddy:2` | Container image. DNS-01 hosts override to the custom `caddy-gandi` image |
| `reverse_proxy__acme_dns_provider` | `""` | `""` = HTTP-01; `"gandi"` = ACME DNS-01 via the Gandi PAT |
| `reverse_proxy__manage` | `true` | Set `false` in Molecule to skip Docker tasks | | `reverse_proxy__manage` | `true` | Set `false` in Molecule to skip Docker tasks |
Production overrides live in Production overrides live in
@ -77,7 +59,4 @@ creation) without a Docker daemon.
## Secrets ## Secrets
- **HTTP-01 (default):** none — the challenge requires no credentials. None. HTTP-01 requires no credentials.
- **DNS-01 (`reverse_proxy__acme_dns_provider: gandi`):** the Gandi PAT
(`vault.gandi.pat`, the same token `public_dns` uses). Rendered host-side into
`{{ reverse_proxy__base_dir }}/env` (mode 0600, `no_log`); never committed.

View file

@ -1,19 +1,10 @@
--- ---
# Caddy reverse proxy (ADR-024). # Caddy reverse proxy (ADR-024). Vanilla Caddy; TLS via ACME HTTP-01 (public hosts).
reverse_proxy__base_dir: /opt/services/reverse_proxy reverse_proxy__base_dir: /opt/services/reverse_proxy
reverse_proxy__acme_email: admin@example.test reverse_proxy__acme_email: admin@example.test
reverse_proxy__routes: [] # each: {host: x, upstream: "svc:port"} OR {host: x, respond: "text"} reverse_proxy__routes: [] # each: {host: x, upstream: "svc:port"} OR {host: x, respond: "text"}
reverse_proxy__manage: true # set false in Molecule to render without Docker reverse_proxy__manage: true # set false in Molecule to render without Docker
# ACME challenge type (ADR-024). Default is HTTP-01 with the vanilla upstream image —
# correct for PUBLIC hosts with an A-record (askari). For mesh/LAN-only hosts with NO
# public A-record (the cluster), HTTP-01 is impossible: set reverse_proxy__acme_dns_provider
# to "gandi" AND point reverse_proxy__image at the custom Caddy+Gandi image to issue certs
# (incl. wildcards) via Gandi DNS-01. The token is vault.gandi.pat (sent as a Bearer PAT;
# the legacy Apikey scheme is gone). Build the image with `make caddy-image` (on ubongo).
reverse_proxy__image: "caddy:2" # DNS-01 hosts override -> the caddy-gandi registry image
reverse_proxy__acme_dns_provider: "" # "" = HTTP-01; "gandi" = ACME DNS-01 via Gandi PAT
# access__*/backup__* are the ADR-021/022 CROSS-ROLE conventions — shared field names that # access__*/backup__* are the ADR-021/022 CROSS-ROLE conventions — shared field names that
# render ACCESS.md/BACKUP.md and drive /check-access · /check-backup. They intentionally do # render ACCESS.md/BACKUP.md and drive /check-access · /check-backup. They intentionally do
# NOT carry the reverse_proxy__ prefix, so each is marked `# noqa: var-naming[no-role-prefix]` # NOT carry the reverse_proxy__ prefix, so each is marked `# noqa: var-naming[no-role-prefix]`
@ -31,7 +22,7 @@ access__api: # noqa: var-naming[no-role-prefix]
reason: "Caddy admin API bound to container localhost :2019; never exposed (ADR-020 catalog owns ports)" reason: "Caddy admin API bound to container localhost :2019; never exposed (ADR-020 catalog owns ports)"
# Backup contract (ADR-022). Stateless: Caddy's /data holds only ACME account keys + # Backup contract (ADR-022). Stateless: Caddy's /data holds only ACME account keys +
# issued certs, which are re-requested automatically on restart via ACME (HTTP-01 or # issued certs, which are re-requested automatically on restart via HTTP-01 (no manual
# DNS-01; no manual steps). Residual risk: Let's Encrypt rate limits on rapid re-issuance. # steps). Residual risk: Let's Encrypt rate limits on rapid repeated re-issuance.
backup__service: reverse_proxy # noqa: var-naming[no-role-prefix] backup__service: reverse_proxy # noqa: var-naming[no-role-prefix]
backup__state: false # noqa: var-naming[no-role-prefix] backup__state: false # noqa: var-naming[no-role-prefix]

View file

@ -14,16 +14,6 @@
notify: reload caddy notify: reload caddy
tags: [config] tags: [config]
- name: Render the Gandi DNS-01 token env file
ansible.builtin.template:
src: env.j2
dest: "{{ reverse_proxy__base_dir }}/env"
mode: "0600"
no_log: true # contains the Gandi PAT
when: reverse_proxy__acme_dns_provider == 'gandi'
notify: reload caddy
tags: [config]
- name: Render the compose file - name: Render the compose file
ansible.builtin.template: ansible.builtin.template:
src: docker-compose.yml.j2 src: docker-compose.yml.j2

View file

@ -1,12 +1,6 @@
# {{ ansible_managed }} # {{ ansible_managed }}
{ {
email {{ reverse_proxy__acme_email }} email {{ reverse_proxy__acme_email }}
{% if reverse_proxy__acme_dns_provider == 'gandi' %}
# ACME DNS-01 via Gandi (mesh/LAN-only hosts, incl. wildcard certs). Token is the
# Gandi PAT, injected from the env file as a Bearer token (ADR-024). Needs the custom
# caddy-gandi image — the upstream caddy:2 has no DNS provider modules.
acme_dns gandi {env.GANDI_BEARER_TOKEN}
{% endif %}
} }
{% for r in reverse_proxy__routes %} {% for r in reverse_proxy__routes %}
{{ r.host }} { {{ r.host }} {

View file

@ -1,16 +1,12 @@
# {{ ansible_managed }} # {{ ansible_managed }}
services: services:
caddy: caddy:
image: {{ reverse_proxy__image }} image: caddy:2
container_name: caddy container_name: caddy
restart: unless-stopped restart: unless-stopped
ports: ports:
- "80:80" - "80:80"
- "443:443" - "443:443"
{% if reverse_proxy__acme_dns_provider == 'gandi' %}
env_file:
- ./env
{% endif %}
volumes: volumes:
- ./Caddyfile:/etc/caddy/Caddyfile:ro - ./Caddyfile:/etc/caddy/Caddyfile:ro
- caddy_data:/data - caddy_data:/data

View file

@ -1,5 +0,0 @@
# {{ ansible_managed }}
# Gandi Personal Access Token for ACME DNS-01 (rendered only when
# reverse_proxy__acme_dns_provider == 'gandi'). Sent by caddy-dns/gandi as a Bearer
# token to https://api.gandi.net/v5/livedns. Host-only, mode 0600 (ADR-024).
GANDI_BEARER_TOKEN={{ vault.gandi.pat }}