Compare commits
No commits in common. "09b0aad342cfafb78f5831d3a45bc8e7984635b2" and "993d7885e4a285666548c1057adc4ccf42134f81" have entirely different histories.
09b0aad342
...
993d7885e4
19 changed files with 27 additions and 832 deletions
26
Makefile
26
Makefile
|
|
@ -13,26 +13,18 @@ MOLECULE := $(VENV)/bin/molecule
|
||||||
VAULT_ARGS :=
|
VAULT_ARGS :=
|
||||||
# Default vault file for edit-vault / check-vault (override with VAULT=<path>).
|
# Default vault file for edit-vault / check-vault (override with VAULT=<path>).
|
||||||
VAULT ?= inventories/production/group_vars/all/vault.yml
|
VAULT ?= inventories/production/group_vars/all/vault.yml
|
||||||
INVENTORY := -i inventories/production/
|
INVENTORY := -i inventories/production/hosts.yml
|
||||||
|
|
||||||
TF := terraform
|
TF := terraform
|
||||||
TF_ENV ?= staging
|
TF_ENV ?= staging
|
||||||
MOLECULE_IMAGE := forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
|
MOLECULE_IMAGE := forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
|
||||||
MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile
|
MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile
|
||||||
|
|
||||||
# For TF_ENV=offsite, source the Hetzner token from the vault into the environment
|
|
||||||
# (rbw must be unlocked). Read in-memory; never written to a tfvars file (CLAUDE.md).
|
|
||||||
ifeq ($(TF_ENV),offsite)
|
|
||||||
TF_TOKEN_ENV := TF_VAR_hcloud_token="$$($(ANSIBLE)-vault view inventories/production/group_vars/all/vault.yml | $(PYTHON) -c 'import sys, yaml; print(yaml.safe_load(sys.stdin)["vault"]["hetzner"]["token"])')"
|
|
||||||
else
|
|
||||||
TF_TOKEN_ENV :=
|
|
||||||
endif
|
|
||||||
|
|
||||||
.DEFAULT_GOAL := help
|
.DEFAULT_GOAL := help
|
||||||
|
|
||||||
.PHONY: help setup collections lint test test-all check deploy encrypt decrypt \
|
.PHONY: help setup collections lint test test-all check deploy encrypt decrypt \
|
||||||
edit-vault check-vault new-role \
|
edit-vault check-vault new-role \
|
||||||
tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \
|
tf-init tf-plan tf-apply tf-output tf-inventory \
|
||||||
molecule-image molecule-image-push
|
molecule-image molecule-image-push
|
||||||
|
|
||||||
help:
|
help:
|
||||||
|
|
@ -57,7 +49,6 @@ help:
|
||||||
@echo " make tf-apply [TF_ENV=staging] Apply Terraform changes"
|
@echo " make tf-apply [TF_ENV=staging] Apply Terraform changes"
|
||||||
@echo " make tf-output [TF_ENV=staging] Print Terraform outputs as JSON"
|
@echo " make tf-output [TF_ENV=staging] Print Terraform outputs as JSON"
|
||||||
@echo " make tf-inventory [TF_ENV=staging] Regenerate Ansible inventory from Terraform outputs"
|
@echo " make tf-inventory [TF_ENV=staging] Regenerate Ansible inventory from Terraform outputs"
|
||||||
@echo " make tf-inventory-offsite Generate offsite_hosts inventory (askari) into inventories/production/"
|
|
||||||
@echo ""
|
@echo ""
|
||||||
@echo " TF_ENV defaults to 'staging'. Use TF_ENV=production for production."
|
@echo " TF_ENV defaults to 'staging'. Use TF_ENV=production for production."
|
||||||
@echo ""
|
@echo ""
|
||||||
|
|
@ -146,16 +137,16 @@ molecule-image-push: molecule-image
|
||||||
# ── Terraform ─────────────────────────────────────────────────────────────────
|
# ── Terraform ─────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
tf-init:
|
tf-init:
|
||||||
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) init
|
$(TF) -chdir=terraform/environments/$(TF_ENV) init
|
||||||
|
|
||||||
tf-plan:
|
tf-plan:
|
||||||
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) plan
|
$(TF) -chdir=terraform/environments/$(TF_ENV) plan
|
||||||
|
|
||||||
tf-apply:
|
tf-apply:
|
||||||
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) apply
|
$(TF) -chdir=terraform/environments/$(TF_ENV) apply
|
||||||
|
|
||||||
tf-output:
|
tf-output:
|
||||||
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) output -json
|
$(TF) -chdir=terraform/environments/$(TF_ENV) output -json
|
||||||
|
|
||||||
tf-inventory:
|
tf-inventory:
|
||||||
ifndef TF_ENV
|
ifndef TF_ENV
|
||||||
|
|
@ -165,11 +156,6 @@ endif
|
||||||
| $(PYTHON) scripts/tf_to_inventory.py > inventories/$(TF_ENV)/hosts.yml
|
| $(PYTHON) scripts/tf_to_inventory.py > inventories/$(TF_ENV)/hosts.yml
|
||||||
@echo "Inventory written to inventories/$(TF_ENV)/hosts.yml"
|
@echo "Inventory written to inventories/$(TF_ENV)/hosts.yml"
|
||||||
|
|
||||||
tf-inventory-offsite:
|
|
||||||
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/offsite output -json \
|
|
||||||
| $(PYTHON) scripts/tf_to_inventory.py > inventories/production/offsite.yml
|
|
||||||
@echo "Offsite inventory written to inventories/production/offsite.yml"
|
|
||||||
|
|
||||||
# ── Role scaffolding ──────────────────────────────────────────────────────────
|
# ── Role scaffolding ──────────────────────────────────────────────────────────
|
||||||
|
|
||||||
new-role:
|
new-role:
|
||||||
|
|
|
||||||
|
|
@ -5,7 +5,7 @@ This repo is partly aspirational: the ADRs in `docs/decisions/` describe the
|
||||||
truth. **Before relying on a role, provider, or pipeline existing, check here.**
|
truth. **Before relying on a role, provider, or pipeline existing, check here.**
|
||||||
If something is listed as "designed, not built", do not assume it works.
|
If something is listed as "designed, not built", do not assume it works.
|
||||||
|
|
||||||
_Last reviewed: 2026-06-14._
|
_Last reviewed: 2026-06-11._
|
||||||
|
|
||||||
## Real and working today
|
## Real and working today
|
||||||
|
|
||||||
|
|
@ -20,7 +20,7 @@ _Last reviewed: 2026-06-14._
|
||||||
| Pre-commit hooks | Configured: lint, gitleaks, vault-encryption guard. Activate with `pre-commit install` after `make setup`. |
|
| Pre-commit hooks | Configured: lint, gitleaks, vault-encryption guard. Activate with `pre-commit install` after `make setup`. |
|
||||||
| Vault password client | `scripts/vault-pass-client.sh` fetches the master password from Vaultwarden via `rbw` (wired as `vault_password_file`). Requires `rbw` installed + `rbw unlock`. |
|
| Vault password client | `scripts/vault-pass-client.sh` fetches the master password from Vaultwarden via `rbw` (wired as `vault_password_file`). Requires `rbw` installed + `rbw unlock`. |
|
||||||
| `/review-repo` | Repo audit: `scripts/repo-scan.py` (Phase 0) + `.claude/commands/review-repo.md`, reports to `docs/reviews/`. On-demand only; cron + email deferred (`docs/TODO.md`). |
|
| `/review-repo` | Repo audit: `scripts/repo-scan.py` (Phase 0) + `.claude/commands/review-repo.md`, reports to `docs/reviews/`. On-demand only; cron + email deferred (`docs/TODO.md`). |
|
||||||
| Terraform HCL (`terraform/`) | Written (proxmox VM module + envs) — but never run; see below. Offsite env also written — see "Designed but not built". |
|
| Terraform HCL (`terraform/`) | Written (proxmox VM module + envs) — but never run; see below |
|
||||||
| `docs/hardware/reference.md` + `scripts/capacity-scan.py` | Present — reference doc (skeleton until real hardware) + stdlib scan; emits capacity JSON |
|
| `docs/hardware/reference.md` + `scripts/capacity-scan.py` | Present — reference doc (skeleton until real hardware) + stdlib scan; emits capacity JSON |
|
||||||
| `/capacity-review` | Works — on-demand capacity evaluation → `docs/hardware/reviews/`. Intent-based (no live usage yet) |
|
| `/capacity-review` | Works — on-demand capacity evaluation → `docs/hardware/reviews/`. Intent-based (no live usage yet) |
|
||||||
| ADR-002 security strategy + `docs/security/{accepted-risks,service-checklist}.md` | Present — threat model, principles, governance frame; checklist + risk register are docs, enforced manually in review |
|
| ADR-002 security strategy + `docs/security/{accepted-risks,service-checklist}.md` | Present — threat model, principles, governance frame; checklist + risk register are docs, enforced manually in review |
|
||||||
|
|
@ -50,8 +50,7 @@ applying `dev_env` via `playbooks/workstation.yml`.)
|
||||||
| Thing | Designed in | Notes |
|
| Thing | Designed in | Notes |
|
||||||
|---|---|---|
|
|---|---|---|
|
||||||
| `dns` role (renders the internal zone) | ADR-007 / ADR-009 | Does not exist. Internal DNS ownership is assigned to it by design. |
|
| `dns` role (renders the internal zone) | ADR-007 / ADR-009 | Does not exist. Internal DNS ownership is assigned to it by design. |
|
||||||
| Terraform actually provisioning (Proxmox) | ADR-006 / ADR-009 | Never `terraform init`ed: no `.terraform.lock.hcl`, no state, no real `local.vms` entries |
|
| Terraform actually provisioning | ADR-006 / ADR-009 | Never `terraform init`ed: no `.terraform.lock.hcl`, no state, no real `local.vms` entries |
|
||||||
| `terraform/{modules/hetzner_vm, environments/offsite}` (askari) | ADR-006 (amended) | **Written, not yet applied.** Terraform owns askari's existence (hcloud provider, CAX11/hel1/debian-13, cloud-init `ansible` user, Hetzner Cloud Firewall SSH-from-ubongo). Makefile token-injection + directory inventory + `tf-inventory-offsite` handoff wired; offsite-handoff pytest green. **Pending:** `terraform init/plan/apply` (run on ubongo — creates a billed VPS) + bootstrap. M2 of the roadmap. |
|
|
||||||
| CI (Forgejo Actions) | ADR-003 / ADR-008 | Pipeline described; not implemented |
|
| CI (Forgejo Actions) | ADR-003 / ADR-008 | Pipeline described; not implemented |
|
||||||
| Level 2 / 3 testing (staging, `askari` smoke) | ADR-008 | Depends on real VMs / `askari`, which don't exist yet |
|
| Level 2 / 3 testing (staging, `askari` smoke) | ADR-008 | Depends on real VMs / `askari`, which don't exist yet |
|
||||||
| Per-service roles | ADR-004 | Model defined; no service roles built |
|
| Per-service roles | ADR-004 | Model defined; no service roles built |
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,5 @@
|
||||||
[defaults]
|
[defaults]
|
||||||
inventory = inventories/production/
|
inventory = inventories/production/hosts.yml
|
||||||
roles_path = roles
|
roles_path = roles
|
||||||
collections_path = .collections
|
collections_path = .collections
|
||||||
vault_password_file = scripts/vault-pass-client.sh
|
vault_password_file = scripts/vault-pass-client.sh
|
||||||
|
|
|
||||||
|
|
@ -8,7 +8,7 @@ Accepted (2026-05-30)
|
||||||
|
|
||||||
Ansible manages host configuration well but has no state model for infrastructure
|
Ansible manages host configuration well but has no state model for infrastructure
|
||||||
existence. Adding Terraform handles the "what exists" layer — creating and destroying
|
existence. Adding Terraform handles the "what exists" layer — creating and destroying
|
||||||
VMs on Proxmox and Hetzner — while Ansible continues to own everything that runs inside them,
|
VMs on Proxmox — while Ansible continues to own everything that runs inside them,
|
||||||
including all internal DNS records.
|
including all internal DNS records.
|
||||||
|
|
||||||
This complements rather than replaces Ansible. The two tools do not overlap. The
|
This complements rather than replaces Ansible. The two tools do not overlap. The
|
||||||
|
|
@ -35,13 +35,8 @@ cadence, making them a poor fit for Terraform state.
|
||||||
### Providers
|
### Providers
|
||||||
|
|
||||||
**`bpg/proxmox` (`~> 0.70`)**: Chosen over `telmate/proxmox` for active maintenance,
|
**`bpg/proxmox` (`~> 0.70`)**: Chosen over `telmate/proxmox` for active maintenance,
|
||||||
full Proxmox 8 API support, and better cloud-init integration. This is the provider
|
full Proxmox 8 API support, and better cloud-init integration. This is the only
|
||||||
for Proxmox VMs.
|
provider.
|
||||||
|
|
||||||
**`hetznercloud/hcloud` (`~> 1.65`)**: owns off-site VM existence (`askari`). ADR-006's
|
|
||||||
scope is now **Proxmox + Hetzner** — "Terraform owns VM existence" generalizes across
|
|
||||||
providers. The `offsite` environment + `hetzner_vm` module live alongside the Proxmox env
|
|
||||||
+ `proxmox_vm` module; each environment has its own local state.
|
|
||||||
|
|
||||||
Terraform does **not** manage DNS. An earlier design used `hashicorp/dns` (RFC 2136)
|
Terraform does **not** manage DNS. An earlier design used `hashicorp/dns` (RFC 2136)
|
||||||
to write A records, but that created a bootstrap cycle — the first DNS server cannot
|
to write A records, but that created a bootstrap cycle — the first DNS server cannot
|
||||||
|
|
@ -76,11 +71,9 @@ integration boundary.
|
||||||
terraform/
|
terraform/
|
||||||
modules/
|
modules/
|
||||||
proxmox_vm/ # reusable VM module — Proxmox only, no DNS
|
proxmox_vm/ # reusable VM module — Proxmox only, no DNS
|
||||||
hetzner_vm/ # reusable VM module — Hetzner Cloud, no DNS
|
|
||||||
environments/
|
environments/
|
||||||
staging/ # staging Proxmox VMs, separate state file
|
staging/ # staging VMs, separate state file
|
||||||
production/ # production Proxmox VMs, separate state file
|
production/ # production VMs, separate state file
|
||||||
offsite/ # off-site Hetzner VMs (askari), separate state file
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Separate environment directories (not Terraform workspaces) for the clearest
|
Separate environment directories (not Terraform workspaces) for the clearest
|
||||||
|
|
@ -128,10 +121,8 @@ handoff)**.
|
||||||
|
|
||||||
Drawn from the "What was ruled out" section and the decisions stated above:
|
Drawn from the "What was ruled out" section and the decisions stated above:
|
||||||
|
|
||||||
- `bpg/proxmox` is the provider for Proxmox VMs; `telmate/proxmox` was ruled out for weaker
|
- `bpg/proxmox` is the only provider; `telmate/proxmox` was ruled out for weaker
|
||||||
maintenance and Proxmox 8 / cloud-init support (Providers; What was ruled out).
|
maintenance and Proxmox 8 / cloud-init support (Providers; What was ruled out).
|
||||||
- `hetznercloud/hcloud` is the provider for off-site VM existence (`askari`); ADR-006's
|
|
||||||
scope now covers Proxmox + Hetzner (Providers).
|
|
||||||
- OPNsense stays entirely in Ansible — no Terraform OPNsense provider — to avoid
|
- OPNsense stays entirely in Ansible — no Terraform OPNsense provider — to avoid
|
||||||
community-provider rot across OPNsense releases (Responsibility split; What was
|
community-provider rot across OPNsense releases (Responsibility split; What was
|
||||||
ruled out).
|
ruled out).
|
||||||
|
|
|
||||||
|
|
@ -195,11 +195,9 @@ the self-hosted NetBird coordinator** (management/signal/relay). It reaches `srv
|
||||||
metrics endpoints and `mgmt` for administration over the mesh, scoped by NetBird
|
metrics endpoints and `mgmt` for administration over the mesh, scoped by NetBird
|
||||||
ACLs — no OPNsense WireGuard tunnel and no `10.99.0.0/24` routing.
|
ACLs — no OPNsense WireGuard tunnel and no `10.99.0.0/24` routing.
|
||||||
|
|
||||||
`askari` is provisioned as **Terraform IaC** (`hetznercloud/hcloud`), managed
|
`askari` is provisioned and managed independently of the Proxmox cluster — it must
|
||||||
independently of the Proxmox cluster (its own provider + local state in
|
be reachable even when the homelab is down (its entire purpose), which is also why
|
||||||
`terraform/environments/offsite/`). It must be reachable even when the homelab is down
|
the mesh coordinator lives here: an off-site control plane survives a homelab outage.
|
||||||
(its entire purpose), which is also why the mesh coordinator lives here: an off-site
|
|
||||||
control plane survives a homelab outage.
|
|
||||||
FQDN: `askari.wingu.me` (off-site tier; record added by `public_dns` when askari exists — M2/M4).
|
FQDN: `askari.wingu.me` (off-site tier; record added by `public_dns` when askari exists — M2/M4).
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
|
||||||
|
|
@ -83,10 +83,10 @@ group against the allowed set and fails loudly on an unknown group.
|
||||||
|
|
||||||
**Valid groups**: `control`, `docker_hosts`, `proxmox_hosts`, `offsite_hosts`.
|
**Valid groups**: `control`, `docker_hosts`, `proxmox_hosts`, `offsite_hosts`.
|
||||||
|
|
||||||
`control` holds `ubongo`, a physical machine not managed by Terraform (see the
|
`control` and `offsite_hosts` are not produced by Terraform — they hold manually
|
||||||
control-node exception below and ADR-015). `offsite_hosts` holds `askari`, which is
|
provisioned hosts (`ubongo` and `askari` respectively) added to the inventory by hand
|
||||||
Terraform-managed via the `hetznercloud/hcloud` provider in the `offsite` environment
|
(see the control-node exception below and ADR-015/ADR-016). They are valid groups so
|
||||||
(see the off-site handoff note below and ADR-016).
|
the generated `hosts.yml` carries their (otherwise empty) sections.
|
||||||
|
|
||||||
The generated `hosts.yml` carries a "do not edit manually" header and is owned by
|
The generated `hosts.yml` carries a "do not edit manually" header and is owned by
|
||||||
the generator. Treat it as a build artifact: the source of truth is `local.vms` in
|
the generator. Treat it as a build artifact: the source of truth is `local.vms` in
|
||||||
|
|
@ -152,27 +152,6 @@ Every other host is Terraform-managed.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
### The off-site handoff (`offsite` environment → `offsite_hosts`)
|
|
||||||
|
|
||||||
`askari` (Hetzner VPS, ADR-016) follows the same handoff pipeline as Proxmox hosts but
|
|
||||||
with its own provider and environment:
|
|
||||||
|
|
||||||
- **Producer** — `terraform/environments/offsite/outputs.tf` emits a `vms` map in the
|
|
||||||
same `{ host: { ip, group } }` shape as Proxmox environments; `askari`'s group is
|
|
||||||
`offsite_hosts`.
|
|
||||||
- **Consumer** — `scripts/tf_to_inventory.py` reads `terraform output -json` from the
|
|
||||||
`offsite` environment and writes `inventories/production/offsite.yml`.
|
|
||||||
- **Makefile target** — `make tf-inventory-offsite` runs the generator for the offsite
|
|
||||||
environment.
|
|
||||||
|
|
||||||
The production inventory is a **directory** (`inventories/production/`) that Ansible
|
|
||||||
merges at runtime: `hosts.yml` (Proxmox-generated) and `offsite.yml`
|
|
||||||
(offsite-generated) together form the full production host list. Each file is a build
|
|
||||||
artifact — never hand-edited; their source of truth is `local.vms` in the respective
|
|
||||||
environment's `main.tf`.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### What was ruled out
|
### What was ruled out
|
||||||
|
|
||||||
| Option | Reason |
|
| Option | Reason |
|
||||||
|
|
@ -199,10 +178,7 @@ Drawn from the boundary, the data contract, and the "What was ruled out" section
|
||||||
owned by Ansible, no chicken-and-egg; What was ruled out).
|
owned by Ansible, no chicken-and-egg; What was ruled out).
|
||||||
- The control node (`ubongo`) is the single documented exception to "Terraform owns
|
- The control node (`ubongo`) is the single documented exception to "Terraform owns
|
||||||
VM existence" — a physical machine provisioned manually and managed by Ansible for
|
VM existence" — a physical machine provisioned manually and managed by Ansible for
|
||||||
baseline config only (The control-node exception).
|
baseline config only; every other host is Terraform-managed (The control-node
|
||||||
- The `offsite` TF environment's `vms` output feeds the `offsite_hosts` group via
|
exception).
|
||||||
`tf_to_inventory.py` (`make tf-inventory-offsite` → `inventories/production/offsite.yml`);
|
|
||||||
the production inventory is a directory that merges `hosts.yml` (Proxmox) and
|
|
||||||
`offsite.yml` (offsite) (The off-site handoff).
|
|
||||||
- The seam is documented in exactly one place (this ADR); ADR-005 and ADR-006 link
|
- The seam is documented in exactly one place (this ADR); ADR-005 and ADR-006 link
|
||||||
here rather than restating it (What was ruled out).
|
here rather than restating it (What was ruled out).
|
||||||
|
|
|
||||||
|
|
@ -81,9 +81,8 @@ allocated for it.
|
||||||
- **Coordinator survival:** off-site on `askari` ⇒ mesh survives a homelab outage.
|
- **Coordinator survival:** off-site on `askari` ⇒ mesh survives a homelab outage.
|
||||||
NetBird's management datastore is backed up encrypted off `askari` (synced to
|
NetBird's management datastore is backed up encrypted off `askari` (synced to
|
||||||
`ubongo`/`mamba`); peers keep last-known config through a brief coordinator outage.
|
`ubongo`/`mamba`); peers keep last-known config through a brief coordinator outage.
|
||||||
- **`askari` is Ansible-managed:** its own inventory group `offsite_hosts` — provisioned
|
- **`askari` is Ansible-managed:** its own inventory group `offsite_hosts` (added
|
||||||
as **Terraform IaC** (`hetznercloud/hcloud`), managed independently of the Proxmox
|
manually like the control node — it is not Terraform-managed), `base` role, plus a
|
||||||
cluster (its own provider + local state). Ansible configuration: `base` role, plus a
|
|
||||||
dedicated `netbird_coordinator` service role (one service = one role, ADR-004; with
|
dedicated `netbird_coordinator` service role (one service = one role, ADR-004; with
|
||||||
`SECURITY.md`). Agent install/enrollment lives in `base`. NetBird server + agents are
|
`SECURITY.md`). Agent install/enrollment lives in `base`. NetBird server + agents are
|
||||||
version-pinned (ADR-011). boma's `dns` role stays authoritative for
|
version-pinned (ADR-011). boma's `dns` role stays authoritative for
|
||||||
|
|
|
||||||
|
|
@ -84,20 +84,6 @@ This was chosen over a single connectivity-model-generates-both (too much machin
|
||||||
tight coupling of two very different rule domains) and over fully independent per-layer
|
tight coupling of two very different rule domains) and over fully independent per-layer
|
||||||
declarations (real drift risk).
|
declarations (real drift risk).
|
||||||
|
|
||||||
### Off-cluster hosts — `askari` (Hetzner)
|
|
||||||
|
|
||||||
`askari` sits outside the Proxmox cluster and has no OPNsense. Its **perimeter** layer
|
|
||||||
is a TF-managed **Hetzner Cloud Firewall** (declared in `terraform/environments/offsite/`)
|
|
||||||
alongside the VM itself. Current rule set (M2): SSH inbound from `ubongo`'s public IP
|
|
||||||
only. NetBird ports (UDP 3478 + TCP 80/443) will be added in M4 when the coordinator
|
|
||||||
role is built.
|
|
||||||
|
|
||||||
The `group_vars` service catalog remains authoritative for `askari`'s **host nftables**
|
|
||||||
layer — the same two-layer model applies, with Hetzner Cloud Firewall substituting for
|
|
||||||
OPNsense at the perimeter.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### OPNsense automation — owned here, mechanism deferred
|
### OPNsense automation — owned here, mechanism deferred
|
||||||
|
|
||||||
OPNsense is Ansible-managed (CLAUDE.md: "OPNsense is entirely Ansible; no Terraform
|
OPNsense is Ansible-managed (CLAUDE.md: "OPNsense is entirely Ansible; no Terraform
|
||||||
|
|
|
||||||
|
|
@ -1,538 +0,0 @@
|
||||||
# askari Provisioning (M2) Implementation Plan
|
|
||||||
|
|
||||||
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
|
|
||||||
|
|
||||||
**Goal:** Provision `askari` (the off-site Hetzner VPS) as Terraform IaC — a `hetzner_vm` module + an `offsite` stack — behind a TF-managed cloud firewall, hand it into the `offsite_hosts` inventory, and bootstrap it.
|
|
||||||
|
|
||||||
**Architecture:** Generalize boma's "Terraform owns VM existence" principle (ADR-006) from Proxmox to Hetzner. A reusable `hetzner_vm` module wraps `hcloud_server` + `hcloud_firewall` + `hcloud_ssh_key`; an `offsite` environment (own local state) declares `askari` (CAX11/ARM, Helsinki, Debian 13). cloud-init creates the `ansible` user with ubongo's key; the firewall allows SSH from ubongo only. Handoff stays ADR-009-shaped: the offsite env outputs `vms`, and `tf_to_inventory.py` (already offsite-aware) generates an inventory file merged via a **directory inventory**.
|
|
||||||
|
|
||||||
**Tech Stack:** Terraform (`hetznercloud/hcloud` provider), Hetzner Cloud, cloud-init, Ansible. Token from `vault.hetzner.token` → `TF_VAR_hcloud_token`.
|
|
||||||
|
|
||||||
**Spec:** `docs/superpowers/specs/2026-06-14-askari-provisioning-design.md`
|
|
||||||
|
|
||||||
**Execution context:** Tasks 1–6 + 9 are authoring + `terraform fmt/validate/plan` (need `terraform` installed + the token, but no resources are created). **Task 7 (`terraform apply`) and Task 8 (bootstrap) create a real, billed VPS** — gated: run only with the user's explicit go-ahead, with `tf-plan` shown first (CLAUDE.md). If `terraform` is absent in the working env, Tasks 6–8 defer to ubongo.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## File Structure
|
|
||||||
|
|
||||||
- `terraform/modules/hetzner_vm/{variables,main,outputs}.tf` (create) — wraps server + firewall + ssh key + cloud-init.
|
|
||||||
- `terraform/environments/offsite/{providers,variables,main,outputs,backend}.tf` + `terraform.tfvars.example` (create) — the askari stack, own local state.
|
|
||||||
- `Makefile` (modify) — inject `TF_VAR_hcloud_token` for `TF_ENV=offsite`; directory inventory; `tf-inventory-offsite` target.
|
|
||||||
- `scripts/tf_to_inventory.py` (no change — already offsite-aware) + `tests/test_tf_to_inventory.py` (create) — lock the offsite handoff.
|
|
||||||
- `docs/decisions/{006,009,020,007,016}-*.md`, `STATUS.md` (modify) — ADR amendments + status.
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 1: Verify the Hetzner provider/image facts (ADR-014)
|
|
||||||
|
|
||||||
**Files:** none (research; pin values used by later tasks).
|
|
||||||
|
|
||||||
- [ ] **Step 1: Verify and record**
|
|
||||||
|
|
||||||
Verify (WebFetch registry.terraform.io / docs.hetzner.com, or `terraform` once init'd):
|
|
||||||
- latest `hetznercloud/hcloud` provider version to pin (expected `~> 1.48`+),
|
|
||||||
- the Debian 13 image slug (expected `debian-13`),
|
|
||||||
- that server type `cax11` exists in location `hel1`.
|
|
||||||
|
|
||||||
Record a stamp in the offsite `providers.tf` comment, e.g.:
|
|
||||||
`# verified: hetznercloud/hcloud <ver> · debian-13 image · cax11@hel1 · <source> · <date>`
|
|
||||||
|
|
||||||
- [ ] **Step 2: No commit** (values land in later tasks).
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 2: The `hetzner_vm` module
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Create: `terraform/modules/hetzner_vm/variables.tf`, `main.tf`, `outputs.tf`
|
|
||||||
|
|
||||||
- [ ] **Step 1: `variables.tf`**
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
variable "name" {
|
|
||||||
description = "Server name (and hostname)"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "server_type" {
|
|
||||||
description = "Hetzner server type, e.g. cax11 (ARM)"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "location" {
|
|
||||||
description = "Hetzner location, e.g. hel1"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "image" {
|
|
||||||
description = "OS image slug, e.g. debian-13"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "ansible_ssh_pubkey" {
|
|
||||||
description = "Public SSH key provisioned for the ansible user via cloud-init"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "ssh_admin_cidrs" {
|
|
||||||
description = "Source CIDRs allowed to reach SSH (e.g. ubongo's address/32)"
|
|
||||||
type = list(string)
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "labels" {
|
|
||||||
description = "Hetzner resource labels (metadata only)"
|
|
||||||
type = map(string)
|
|
||||||
default = {}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 2: `main.tf`**
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
# cloud-init: create the unprivileged `ansible` user with ubongo's key + sudo.
|
|
||||||
# (Mirrors the proxmox_vm module's user_account; Hetzner has no structured field.)
|
|
||||||
locals {
|
|
||||||
user_data = <<-EOT
|
|
||||||
#cloud-config
|
|
||||||
users:
|
|
||||||
- name: ansible
|
|
||||||
groups: [sudo]
|
|
||||||
sudo: "ALL=(ALL) NOPASSWD:ALL"
|
|
||||||
shell: /bin/bash
|
|
||||||
ssh_authorized_keys:
|
|
||||||
- ${var.ansible_ssh_pubkey}
|
|
||||||
package_update: true
|
|
||||||
packages:
|
|
||||||
- python3
|
|
||||||
EOT
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "hcloud_ssh_key" "ansible" {
|
|
||||||
name = "${var.name}-ansible"
|
|
||||||
public_key = var.ansible_ssh_pubkey
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "hcloud_firewall" "this" {
|
|
||||||
name = "${var.name}-fw"
|
|
||||||
|
|
||||||
# SSH from the control node only (NetBird ports are added in M4 when the
|
|
||||||
# coordinator deploys — see ADR-020; the host nftables layer is catalog-driven).
|
|
||||||
rule {
|
|
||||||
direction = "in"
|
|
||||||
protocol = "tcp"
|
|
||||||
port = "22"
|
|
||||||
source_ips = var.ssh_admin_cidrs
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "hcloud_server" "this" {
|
|
||||||
name = var.name
|
|
||||||
server_type = var.server_type
|
|
||||||
location = var.location
|
|
||||||
image = var.image
|
|
||||||
ssh_keys = [hcloud_ssh_key.ansible.id]
|
|
||||||
user_data = local.user_data
|
|
||||||
firewall_ids = [hcloud_firewall.this.id]
|
|
||||||
labels = var.labels
|
|
||||||
|
|
||||||
public_net {
|
|
||||||
ipv4_enabled = true
|
|
||||||
ipv6_enabled = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 3: `outputs.tf`**
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
output "ipv4_address" {
|
|
||||||
description = "Server public IPv4"
|
|
||||||
value = hcloud_server.this.ipv4_address
|
|
||||||
}
|
|
||||||
|
|
||||||
output "name" {
|
|
||||||
description = "Server name"
|
|
||||||
value = hcloud_server.this.name
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 4: Format**
|
|
||||||
|
|
||||||
Run: `terraform fmt terraform/modules/hetzner_vm/`
|
|
||||||
Expected: files formatted (or already formatted).
|
|
||||||
|
|
||||||
- [ ] **Step 5: Commit**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add terraform/modules/hetzner_vm
|
|
||||||
git commit -m "feat(tf): hetzner_vm module (server + firewall + ssh key + cloud-init)"
|
|
||||||
```
|
|
||||||
(append `Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>`)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 3: The `offsite` environment
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Create: `terraform/environments/offsite/{providers,variables,main,outputs,backend}.tf`, `terraform.tfvars.example`
|
|
||||||
|
|
||||||
- [ ] **Step 1: `providers.tf`** (pin the version from Task 1)
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
# verified: hetznercloud/hcloud ~> 1.48 · debian-13 · cax11@hel1 · <source> · <date>
|
|
||||||
terraform {
|
|
||||||
required_version = ">= 1.9"
|
|
||||||
|
|
||||||
required_providers {
|
|
||||||
hcloud = {
|
|
||||||
source = "hetznercloud/hcloud"
|
|
||||||
version = "~> 1.48"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
provider "hcloud" {
|
|
||||||
token = var.hcloud_token
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 2: `variables.tf`**
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
variable "hcloud_token" {
|
|
||||||
description = "Hetzner Cloud API token — set via TF_VAR_hcloud_token (from vault.hetzner.token)"
|
|
||||||
type = string
|
|
||||||
sensitive = true
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "ansible_ssh_pubkey" {
|
|
||||||
description = "ubongo's control SSH public key, provisioned for the ansible user"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "ssh_admin_cidrs" {
|
|
||||||
description = "Source CIDRs allowed to SSH askari (ubongo's address/32)"
|
|
||||||
type = list(string)
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 3: `main.tf`**
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
# offsite/main.tf — off-site Hetzner hosts. Terraform owns VM existence (ADR-006,
|
|
||||||
# generalized to Hetzner). ALWAYS `make tf-plan TF_ENV=offsite` and review before
|
|
||||||
# `make tf-apply TF_ENV=offsite`.
|
|
||||||
|
|
||||||
module "askari" {
|
|
||||||
source = "../../modules/hetzner_vm"
|
|
||||||
|
|
||||||
name = "askari"
|
|
||||||
server_type = "cax11" # ARM, 2 vCPU / 4 GB
|
|
||||||
location = "hel1" # Helsinki
|
|
||||||
image = "debian-13"
|
|
||||||
ansible_ssh_pubkey = var.ansible_ssh_pubkey
|
|
||||||
ssh_admin_cidrs = var.ssh_admin_cidrs
|
|
||||||
labels = {
|
|
||||||
env = "offsite"
|
|
||||||
group = "offsite_hosts"
|
|
||||||
managed-by = "terraform"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 4: `outputs.tf`** (the `tf_to_inventory.py` contract — `vms` map)
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
output "vms" {
|
|
||||||
description = "Hostname → IP and Ansible group — consumed by make tf-inventory-offsite"
|
|
||||||
value = {
|
|
||||||
askari = {
|
|
||||||
ip = module.askari.ipv4_address
|
|
||||||
group = "offsite_hosts"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 5: `backend.tf`**
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
# Terraform state: LOCAL, on the control node (like the Proxmox envs; ADR-006).
|
|
||||||
# askari survives a homelab outage by design, so a lost state is recovered by
|
|
||||||
# `terraform import` of the running server — not a rebuild. Back the state up with
|
|
||||||
# the control node (ADR-022).
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 6: `terraform.tfvars.example`**
|
|
||||||
|
|
||||||
```hcl
|
|
||||||
# offsite environment — non-secret values. Copy to terraform.tfvars and fill in.
|
|
||||||
#
|
|
||||||
# Secret is exported as an env var (never in this file):
|
|
||||||
# export TF_VAR_hcloud_token="$(...from vault.hetzner.token...)" # make handles this
|
|
||||||
#
|
|
||||||
# State is local (see backend.tf).
|
|
||||||
|
|
||||||
ansible_ssh_pubkey = "ssh-ed25519 AAAA... ansible@ubongo"
|
|
||||||
ssh_admin_cidrs = ["203.0.113.10/32"] # placeholder — ubongo's WAN/egress IP, NOT its LAN address (the Hetzner Cloud Firewall filters public traffic; ADR-020)
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 7: Format + commit**
|
|
||||||
|
|
||||||
Run: `terraform fmt terraform/environments/offsite/`
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add terraform/environments/offsite
|
|
||||||
git commit -m "feat(tf): offsite environment — askari (CAX11/hel1/debian-13)"
|
|
||||||
```
|
|
||||||
(Co-Authored-By trailer)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 4: Makefile — token injection, directory inventory, offsite handoff
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `Makefile`
|
|
||||||
|
|
||||||
- [ ] **Step 1: Inject the Hetzner token for `TF_ENV=offsite`**
|
|
||||||
|
|
||||||
The `tf-*` targets need `TF_VAR_hcloud_token` for offsite, sourced from the vault. Add a guarded helper variable near the `TF` definition:
|
|
||||||
|
|
||||||
```makefile
|
|
||||||
# For TF_ENV=offsite, export the Hetzner token from the vault (rbw unlocked).
|
|
||||||
# Reads vault.hetzner.token in-memory; never written to a tfvars file (CLAUDE.md).
|
|
||||||
ifeq ($(TF_ENV),offsite)
|
|
||||||
TF_TOKEN_ENV = TF_VAR_hcloud_token="$$($(VENV)/bin/ansible-vault view inventories/production/group_vars/all/vault.yml | $(VENV)/bin/python -c 'import sys,yaml; print(yaml.safe_load(sys.stdin)["vault"]["hetzner"]["token"])')"
|
|
||||||
else
|
|
||||||
TF_TOKEN_ENV =
|
|
||||||
endif
|
|
||||||
```
|
|
||||||
|
|
||||||
Then prefix the `tf-init`/`tf-plan`/`tf-apply`/`tf-output` recipes with `$(TF_TOKEN_ENV)`, e.g.:
|
|
||||||
|
|
||||||
```makefile
|
|
||||||
tf-plan:
|
|
||||||
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) plan
|
|
||||||
```
|
|
||||||
|
|
||||||
(Apply the same prefix to `tf-init`, `tf-apply`, `tf-output`.)
|
|
||||||
|
|
||||||
- [ ] **Step 2: Directory inventory**
|
|
||||||
|
|
||||||
Change the inventory so multiple TF envs can each generate a file:
|
|
||||||
|
|
||||||
```makefile
|
|
||||||
INVENTORY := -i inventories/production/
|
|
||||||
```
|
|
||||||
|
|
||||||
(Ansible reads every file in the directory as an inventory source and merges them; `group_vars/`/`host_vars/` remain variable dirs. Verify `ansible.cfg` does not also hard-set `inventory=`; if it does, update it to match.)
|
|
||||||
|
|
||||||
- [ ] **Step 3: `tf-inventory-offsite` target**
|
|
||||||
|
|
||||||
Add (writes the offsite hosts into the production inventory dir, beside the Proxmox-generated `hosts.yml`):
|
|
||||||
|
|
||||||
```makefile
|
|
||||||
tf-inventory-offsite:
|
|
||||||
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/offsite output -json \
|
|
||||||
| $(PYTHON) scripts/tf_to_inventory.py > inventories/production/offsite.yml
|
|
||||||
@echo "Offsite inventory written to inventories/production/offsite.yml"
|
|
||||||
```
|
|
||||||
Add `tf-inventory-offsite` to `.PHONY` and a help line.
|
|
||||||
|
|
||||||
- [ ] **Step 4: Verify existing playbooks still resolve under the directory inventory**
|
|
||||||
|
|
||||||
Run: `make check PLAYBOOK=dns 2>&1 | tail -3`
|
|
||||||
Expected: still resolves the `control` host and runs (no inventory errors). If `connection:`/group_vars break, fix before committing.
|
|
||||||
|
|
||||||
- [ ] **Step 5: Commit**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add Makefile
|
|
||||||
git commit -m "feat(make): offsite TF token injection + directory inventory + tf-inventory-offsite"
|
|
||||||
```
|
|
||||||
(Co-Authored-By trailer)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 5: Lock the offsite inventory handoff (TDD)
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Test: `tests/test_tf_to_inventory.py`
|
|
||||||
|
|
||||||
- [ ] **Step 1: Write the locking test** (a characterization test — expected to pass against the existing script)
|
|
||||||
|
|
||||||
```python
|
|
||||||
import json
|
|
||||||
import pathlib
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
|
|
||||||
_SCRIPT = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "tf_to_inventory.py"
|
|
||||||
|
|
||||||
|
|
||||||
def _run(tf_output: dict) -> str:
|
|
||||||
return subprocess.run(
|
|
||||||
[sys.executable, str(_SCRIPT)],
|
|
||||||
input=json.dumps(tf_output), capture_output=True, text=True, check=True,
|
|
||||||
).stdout
|
|
||||||
|
|
||||||
|
|
||||||
def test_offsite_host_lands_in_offsite_hosts():
|
|
||||||
out = _run({"vms": {"value": {"askari": {"ip": "203.0.113.7", "group": "offsite_hosts"}}}})
|
|
||||||
assert "offsite_hosts:" in out
|
|
||||||
assert "askari:" in out
|
|
||||||
assert "ansible_host: 203.0.113.7" in out
|
|
||||||
|
|
||||||
|
|
||||||
def test_unknown_group_rejected():
|
|
||||||
proc = subprocess.run(
|
|
||||||
[sys.executable, str(_SCRIPT)],
|
|
||||||
input=json.dumps({"vms": {"value": {"x": {"ip": "1.2.3.4", "group": "nope"}}}}),
|
|
||||||
capture_output=True, text=True,
|
|
||||||
)
|
|
||||||
assert proc.returncode == 1
|
|
||||||
assert "unknown group" in proc.stderr
|
|
||||||
```
|
|
||||||
|
|
||||||
- [ ] **Step 2: Run it**
|
|
||||||
|
|
||||||
Run: `.venv/bin/python -m pytest tests/test_tf_to_inventory.py -v`
|
|
||||||
Expected: PASS — `tf_to_inventory.py` already supports `offsite_hosts` and rejects unknown groups (this test locks that behaviour for the M2 handoff; no code change needed). If it fails, fix `scripts/tf_to_inventory.py` minimally and report.
|
|
||||||
|
|
||||||
- [ ] **Step 3: Commit**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add tests/test_tf_to_inventory.py
|
|
||||||
git commit -m "test(tf): lock the offsite_hosts inventory handoff"
|
|
||||||
```
|
|
||||||
(Co-Authored-By trailer)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 6: Init, validate, plan (gated — needs terraform + token)
|
|
||||||
|
|
||||||
> Needs `terraform` installed and `rbw` unlocked. Creates **no** resources. If `terraform` is absent, defer Tasks 6–8 to ubongo.
|
|
||||||
|
|
||||||
- [ ] **Step 1: Set tfvars**
|
|
||||||
|
|
||||||
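`cp terraform/environments/offsite/terraform.tfvars.example terraform/environments/offsite/terraform.tfvars` and set `ansible_ssh_pubkey` to ubongo's real control public key and `ssh_admin_cidrs` to ubongo's public WAN/egress IP as a `/32` (find it with `curl -s ifconfig.me` from ubongo) — not its LAN address `10.20.10.151`, because the Hetzner Cloud Firewall filters public traffic. (`terraform.tfvars` is gitignored.)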
|
|
||||||
|
|
||||||
- [ ] **Step 2: Init (tracks the lock file)**
|
|
||||||
|
|
||||||
Run: `make tf-init TF_ENV=offsite`
|
|
||||||
Expected: providers installed; `terraform/environments/offsite/.terraform.lock.hcl` created. `git add` the lock file (tracked per CLAUDE.md).
|
|
||||||
|
|
||||||
- [ ] **Step 3: Validate + plan**
|
|
||||||
|
|
||||||
Run: `terraform -chdir=terraform/environments/offsite validate` → `Success`.
|
|
||||||
Run: `make tf-plan TF_ENV=offsite` → review: **1 server + 1 firewall + 1 ssh key to add**. Confirm CAX11/hel1/debian-13 and the SSH-from-ubongo rule.
|
|
||||||
|
|
||||||
- [ ] **Step 4: Commit the lock file**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add terraform/environments/offsite/.terraform.lock.hcl
|
|
||||||
git commit -m "chore(tf): pin offsite provider lock (hcloud)"
|
|
||||||
```
|
|
||||||
(Co-Authored-By trailer)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 7: Apply — create askari (GATED, real billed VPS)
|
|
||||||
|
|
||||||
> **Explicit user go required.** Run on ubongo. The plan from Task 6 must be reviewed first (CLAUDE.md: never apply without a shown plan).
|
|
||||||
|
|
||||||
- [ ] **Step 1: Apply**
|
|
||||||
|
|
||||||
Run: `make tf-apply TF_ENV=offsite`
|
|
||||||
Expected: `hcloud_ssh_key`, `hcloud_firewall`, `hcloud_server.askari` created; outputs show `askari`'s IPv4.
|
|
||||||
|
|
||||||
- [ ] **Step 2: Generate the offsite inventory**
|
|
||||||
|
|
||||||
Run: `make tf-inventory-offsite`
|
|
||||||
Expected: `inventories/production/offsite.yml` written with `askari` under `offsite_hosts`.
|
|
||||||
|
|
||||||
- [ ] **Step 3: Verify the inventory merges**
|
|
||||||
|
|
||||||
Run: `.venv/bin/ansible-inventory -i inventories/production/ --host askari` (or `--list`)
|
|
||||||
Expected: `askari` present with its `ansible_host`.
|
|
||||||
|
|
||||||
- [ ] **Step 4: Commit the generated inventory**
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add inventories/production/offsite.yml
|
|
||||||
git commit -m "chore(inventory): askari in offsite_hosts (generated)"
|
|
||||||
```
|
|
||||||
(Co-Authored-By trailer)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 8: Bootstrap askari (GATED — needs the live host)
|
|
||||||
|
|
||||||
> Run on ubongo after Task 7. `rbw` unlocked.
|
|
||||||
|
|
||||||
- [ ] **Step 1: Reach it**
|
|
||||||
|
|
||||||
Run: `ssh ansible@<askari-ip>` (cloud-init created the `ansible` user with ubongo's key) — expect a shell. If refused, check the firewall `ssh_admin_cidrs` matches ubongo's egress IP.
|
|
||||||
|
|
||||||
- [ ] **Step 2: Bootstrap**
|
|
||||||
|
|
||||||
Run: `make check PLAYBOOK=bootstrap` (review) then `make deploy PLAYBOOK=bootstrap` — expect the `ansible` user + sudoers confirmed/created on askari (idempotent).
|
|
||||||
|
|
||||||
- [ ] **Step 3: No repo commit** — this configures the host, not the repo. (`base` subset = M3.)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
### Task 9: ADR amendments + STATUS
|
|
||||||
|
|
||||||
**Files:**
|
|
||||||
- Modify: `docs/decisions/006-terraform.md`, `009-provisioning-handoff.md`, `020-firewall.md`, `007-network.md`, `016-mesh-vpn.md`, `STATUS.md`
|
|
||||||
|
|
||||||
For each: **Read the relevant section first**, then apply the change.
|
|
||||||
|
|
||||||
- [ ] **Step 1: ADR-006 — generalize the provider scope**
|
|
||||||
|
|
||||||
In the **Providers** section, the line "`bpg/proxmox` … This is the only provider." → note a second provider:
|
|
||||||
```
|
|
||||||
**`hetznercloud/hcloud`**: owns off-site VM existence (`askari`). ADR-006's scope is
|
|
||||||
**Proxmox + Hetzner** — "Terraform owns VM existence" generalizes across providers; the
|
|
||||||
`offsite` environment + `hetzner_vm` module live alongside the Proxmox env + module.
|
|
||||||
```
|
|
||||||
Also adjust the Context line "creating and destroying VMs on Proxmox" → "on Proxmox and Hetzner".
|
|
||||||
|
|
||||||
- [ ] **Step 2: ADR-009 — offsite handoff**
|
|
||||||
|
|
||||||
Add a note that `offsite` is a TF environment whose `vms` output feeds `offsite_hosts` via `tf_to_inventory.py` (`make tf-inventory-offsite` → `inventories/production/offsite.yml`), and that the production inventory is a **directory** merging the Proxmox + offsite generated files.
|
|
||||||
|
|
||||||
- [ ] **Step 3: ADR-020 — askari's perimeter**
|
|
||||||
|
|
||||||
Note that off-cluster `askari` has no OPNsense; its **perimeter** is a TF-managed Hetzner Cloud Firewall (SSH-from-ubongo now; NetBird ports in M4). The `group_vars` catalog stays authoritative for the host nftables layer.
|
|
||||||
|
|
||||||
- [ ] **Step 4: ADR-007 / ADR-016 — askari is TF-provisioned**
|
|
||||||
|
|
||||||
Replace "provisioned … independently … added manually" wording for askari with "provisioned as Terraform IaC (hcloud), managed independently of the Proxmox cluster (own provider + state)."
|
|
||||||
|
|
||||||
- [ ] **Step 5: STATUS.md**
|
|
||||||
|
|
||||||
Move/realize askari's row per how far Task 7/8 got. If applied: under "Real and working today" — `askari` **Built + applied** (CAX11/hel1/debian-13, cloud firewall SSH-from-ubongo, bootstrapped, in `offsite_hosts`). If only authored (apply deferred): note the TF is written + `tf-plan` clean, apply pending on ubongo.
|
|
||||||
|
|
||||||
- [ ] **Step 6: Lint + commit**
|
|
||||||
|
|
||||||
Run: `make lint` (must pass).
|
|
||||||
|
|
||||||
```bash
|
|
||||||
git add docs/decisions/006-terraform.md docs/decisions/009-provisioning-handoff.md \
|
|
||||||
docs/decisions/020-firewall.md docs/decisions/007-network.md \
|
|
||||||
docs/decisions/016-mesh-vpn.md STATUS.md
|
|
||||||
git commit -m "docs(askari): amend ADR-006/009/020/007/016 for TF-provisioned offsite host; STATUS"
|
|
||||||
```
|
|
||||||
(Co-Authored-By trailer)
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Self-Review (completed)
|
|
||||||
|
|
||||||
- **Spec coverage:** TF owns existence / generalize ADR-006 (Decision 1) → Tasks 2,3,9; CAX11/hel1/debian-13 (Decision 2) → Task 3; TF cloud firewall, SSH-from-ubongo, NetBird ports later (Decision 3) → Task 2 + Task 9 ADR-020; token via `TF_VAR_hcloud_token` from vault (Decision 4) → Task 4; ADR-009 handoff via `tf_to_inventory` (Decision 5) → Tasks 4,5,7; cloud-init `ansible` user + bootstrap → Tasks 2,8; state + DR (import) → Task 3 backend; ADR amendments → Task 9. All covered.
|
|
||||||
- **Placeholder scan:** none — HCL, make, and test content are concrete. `<askari-ip>`/`<source>`/`<date>` are runtime/verification values, not unspecified logic.
|
|
||||||
- **Type/name consistency:** module vars (`name`, `server_type`, `location`, `image`, `ansible_ssh_pubkey`, `ssh_admin_cidrs`, `labels`) match between module + env call; the `vms` output shape (`{ip, group}`) matches `tf_to_inventory.py`'s contract; `TF_VAR_hcloud_token` ↔ `var.hcloud_token`; `vault.hetzner.token` matches the stored key.
|
|
||||||
- **Notes for the implementer:** (a) confirm Ansible merges the directory inventory's two files so `askari` resolves (Task 7 Step 3); (b) verify `hcloud_server` arg names against the pinned provider version (Task 1) — adjust `public_net`/`firewall_ids` if the provider differs; (c) Tasks 7–8 create a billed VPS — gated on explicit go.
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
# Terraform state: LOCAL, on the control node (like the Proxmox envs; ADR-006).
|
|
||||||
# askari survives a homelab outage by design, so a lost state is recovered by
|
|
||||||
# `terraform import` of the running server — not a rebuild. Back the state up with
|
|
||||||
# the control node (ADR-022).
|
|
||||||
|
|
@ -1,19 +0,0 @@
|
||||||
# offsite/main.tf — off-site Hetzner hosts. Terraform owns VM existence (ADR-006,
|
|
||||||
# generalized to Hetzner). ALWAYS `make tf-plan TF_ENV=offsite` and review before
|
|
||||||
# `make tf-apply TF_ENV=offsite`.
|
|
||||||
|
|
||||||
module "askari" {
|
|
||||||
source = "../../modules/hetzner_vm"
|
|
||||||
|
|
||||||
name = "askari"
|
|
||||||
server_type = "cax11" # ARM, 2 vCPU / 4 GB
|
|
||||||
location = "hel1" # Helsinki
|
|
||||||
image = "debian-13"
|
|
||||||
ansible_ssh_pubkey = var.ansible_ssh_pubkey
|
|
||||||
ssh_admin_cidrs = var.ssh_admin_cidrs
|
|
||||||
labels = {
|
|
||||||
env = "offsite"
|
|
||||||
group = "offsite_hosts"
|
|
||||||
managed-by = "terraform"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
output "vms" {
|
|
||||||
description = "Hostname -> IP and Ansible group — consumed by make tf-inventory-offsite"
|
|
||||||
value = {
|
|
||||||
askari = {
|
|
||||||
ip = module.askari.ipv4_address
|
|
||||||
group = "offsite_hosts"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
# verified: hetznercloud/hcloud 1.65.0 · debian-13 image · cax11@hel1 · terraform-registry · 2026-06-14
|
|
||||||
terraform {
|
|
||||||
required_version = ">= 1.9"
|
|
||||||
|
|
||||||
required_providers {
|
|
||||||
hcloud = {
|
|
||||||
source = "hetznercloud/hcloud"
|
|
||||||
version = "~> 1.65"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
provider "hcloud" {
|
|
||||||
token = var.hcloud_token
|
|
||||||
}
|
|
||||||
|
|
@ -1,13 +0,0 @@
|
||||||
# offsite environment — non-secret values. Copy to terraform.tfvars and fill in.
|
|
||||||
#
|
|
||||||
# Secret is exported as an env var (never in this file); the make tf-* targets do this
|
|
||||||
# automatically for TF_ENV=offsite, sourcing vault.hetzner.token:
|
|
||||||
# export TF_VAR_hcloud_token="...from vault.hetzner.token..."
|
|
||||||
#
|
|
||||||
# State is local (see backend.tf).
|
|
||||||
|
|
||||||
ansible_ssh_pubkey = "ssh-ed25519 AAAA... ansible@ubongo"
|
|
||||||
# The Hetzner Cloud Firewall filters PUBLIC traffic, so this is ubongo's WAN/egress
|
|
||||||
# IP (the perimeter analog of OPNsense, ADR-020) — NOT its LAN address. Find it with
|
|
||||||
# `curl -s ifconfig.me` from ubongo. Narrows to the NetBird `wt0` path once M5 lands.
|
|
||||||
ssh_admin_cidrs = ["203.0.113.10/32"] # placeholder — ubongo's WAN/egress IP
|
|
||||||
|
|
@ -1,15 +0,0 @@
|
||||||
variable "hcloud_token" {
|
|
||||||
description = "Hetzner Cloud API token — set via TF_VAR_hcloud_token (from vault.hetzner.token)"
|
|
||||||
type = string
|
|
||||||
sensitive = true
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "ansible_ssh_pubkey" {
|
|
||||||
description = "ubongo's control SSH public key, provisioned for the ansible user"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "ssh_admin_cidrs" {
|
|
||||||
description = "Source CIDRs allowed to SSH askari (ubongo's address/32)"
|
|
||||||
type = list(string)
|
|
||||||
}
|
|
||||||
|
|
@ -1,53 +0,0 @@
|
||||||
# cloud-init: create the unprivileged `ansible` user with ubongo's key + sudo.
|
|
||||||
# (Mirrors the proxmox_vm module's user_account; Hetzner has no structured field.)
|
|
||||||
locals {
|
|
||||||
# Indentation matches the closing EOT (2 spaces) so `<<-` strips to column 0 —
|
|
||||||
# cloud-config requires `#cloud-config` as the first line with no leading space.
|
|
||||||
user_data = <<-EOT
|
|
||||||
#cloud-config
|
|
||||||
users:
|
|
||||||
- name: ansible
|
|
||||||
groups: [sudo]
|
|
||||||
sudo: "ALL=(ALL) NOPASSWD:ALL"
|
|
||||||
shell: /bin/bash
|
|
||||||
ssh_authorized_keys:
|
|
||||||
- ${var.ansible_ssh_pubkey}
|
|
||||||
package_update: true
|
|
||||||
packages:
|
|
||||||
- python3
|
|
||||||
EOT
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "hcloud_ssh_key" "ansible" {
|
|
||||||
name = "${var.name}-ansible"
|
|
||||||
public_key = var.ansible_ssh_pubkey
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "hcloud_firewall" "this" {
|
|
||||||
name = "${var.name}-fw"
|
|
||||||
|
|
||||||
# SSH from the control node only. NetBird ports (UDP 3478, TCP 80/443) are added
|
|
||||||
# in M4 when the coordinator deploys (ADR-020); host nftables stays catalog-driven.
|
|
||||||
rule {
|
|
||||||
direction = "in"
|
|
||||||
protocol = "tcp"
|
|
||||||
port = "22"
|
|
||||||
source_ips = var.ssh_admin_cidrs
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
resource "hcloud_server" "this" {
|
|
||||||
name = var.name
|
|
||||||
server_type = var.server_type
|
|
||||||
location = var.location
|
|
||||||
image = var.image
|
|
||||||
ssh_keys = [hcloud_ssh_key.ansible.id]
|
|
||||||
user_data = local.user_data
|
|
||||||
firewall_ids = [hcloud_firewall.this.id]
|
|
||||||
labels = var.labels
|
|
||||||
|
|
||||||
public_net {
|
|
||||||
ipv4_enabled = true
|
|
||||||
ipv6_enabled = true
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
@ -1,9 +0,0 @@
|
||||||
output "ipv4_address" {
|
|
||||||
description = "Server public IPv4"
|
|
||||||
value = hcloud_server.this.ipv4_address
|
|
||||||
}
|
|
||||||
|
|
||||||
output "name" {
|
|
||||||
description = "Server name"
|
|
||||||
value = hcloud_server.this.name
|
|
||||||
}
|
|
||||||
|
|
@ -1,35 +0,0 @@
|
||||||
variable "name" {
|
|
||||||
description = "Server name (and hostname)"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "server_type" {
|
|
||||||
description = "Hetzner server type, e.g. cax11 (ARM)"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "location" {
|
|
||||||
description = "Hetzner location, e.g. hel1"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "image" {
|
|
||||||
description = "OS image slug, e.g. debian-13"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "ansible_ssh_pubkey" {
|
|
||||||
description = "Public SSH key provisioned for the ansible user via cloud-init"
|
|
||||||
type = string
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "ssh_admin_cidrs" {
|
|
||||||
description = "Source CIDRs allowed to reach SSH (e.g. ubongo's address/32)"
|
|
||||||
type = list(string)
|
|
||||||
}
|
|
||||||
|
|
||||||
variable "labels" {
|
|
||||||
description = "Hetzner resource labels (metadata only)"
|
|
||||||
type = map(string)
|
|
||||||
default = {}
|
|
||||||
}
|
|
||||||
|
|
@ -1,30 +0,0 @@
|
||||||
import json
|
|
||||||
import pathlib
|
|
||||||
import subprocess
|
|
||||||
import sys
|
|
||||||
|
|
||||||
_SCRIPT = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "tf_to_inventory.py"
|
|
||||||
|
|
||||||
|
|
||||||
def _run(tf_output: dict) -> str:
|
|
||||||
return subprocess.run(
|
|
||||||
[sys.executable, str(_SCRIPT)],
|
|
||||||
input=json.dumps(tf_output), capture_output=True, text=True, check=True,
|
|
||||||
).stdout
|
|
||||||
|
|
||||||
|
|
||||||
def test_offsite_host_lands_in_offsite_hosts():
|
|
||||||
out = _run({"vms": {"value": {"askari": {"ip": "203.0.113.7", "group": "offsite_hosts"}}}})
|
|
||||||
assert "offsite_hosts:" in out
|
|
||||||
assert "askari:" in out
|
|
||||||
assert "ansible_host: 203.0.113.7" in out
|
|
||||||
|
|
||||||
|
|
||||||
def test_unknown_group_rejected():
|
|
||||||
proc = subprocess.run(
|
|
||||||
[sys.executable, str(_SCRIPT)],
|
|
||||||
input=json.dumps({"vms": {"value": {"x": {"ip": "1.2.3.4", "group": "nope"}}}}),
|
|
||||||
capture_output=True, text=True,
|
|
||||||
)
|
|
||||||
assert proc.returncode == 1
|
|
||||||
assert "unknown group" in proc.stderr
|
|
||||||
Loading…
Add table
Reference in a new issue