Compare commits

..

No commits in common. "09b0aad342cfafb78f5831d3a45bc8e7984635b2" and "993d7885e4a285666548c1057adc4ccf42134f81" have entirely different histories.

19 changed files with 27 additions and 832 deletions

View file

@ -13,26 +13,18 @@ MOLECULE := $(VENV)/bin/molecule
VAULT_ARGS := VAULT_ARGS :=
# Default vault file for edit-vault / check-vault (override with VAULT=<path>). # Default vault file for edit-vault / check-vault (override with VAULT=<path>).
VAULT ?= inventories/production/group_vars/all/vault.yml VAULT ?= inventories/production/group_vars/all/vault.yml
INVENTORY := -i inventories/production/ INVENTORY := -i inventories/production/hosts.yml
TF := terraform TF := terraform
TF_ENV ?= staging TF_ENV ?= staging
MOLECULE_IMAGE := forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest MOLECULE_IMAGE := forgejo.nyumbani.baobab.band/sjat/molecule-debian13:latest
MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile MOLECULE_DOCKERFILE := .docker/molecule-debian13/Dockerfile
# For TF_ENV=offsite, source the Hetzner token from the vault into the environment
# (rbw must be unlocked). Read in-memory; never written to a tfvars file (CLAUDE.md).
ifeq ($(TF_ENV),offsite)
TF_TOKEN_ENV := TF_VAR_hcloud_token="$$($(ANSIBLE)-vault view inventories/production/group_vars/all/vault.yml | $(PYTHON) -c 'import sys, yaml; print(yaml.safe_load(sys.stdin)["vault"]["hetzner"]["token"])')"
else
TF_TOKEN_ENV :=
endif
.DEFAULT_GOAL := help .DEFAULT_GOAL := help
.PHONY: help setup collections lint test test-all check deploy encrypt decrypt \ .PHONY: help setup collections lint test test-all check deploy encrypt decrypt \
edit-vault check-vault new-role \ edit-vault check-vault new-role \
tf-init tf-plan tf-apply tf-output tf-inventory tf-inventory-offsite \ tf-init tf-plan tf-apply tf-output tf-inventory \
molecule-image molecule-image-push molecule-image molecule-image-push
help: help:
@ -57,7 +49,6 @@ help:
@echo " make tf-apply [TF_ENV=staging] Apply Terraform changes" @echo " make tf-apply [TF_ENV=staging] Apply Terraform changes"
@echo " make tf-output [TF_ENV=staging] Print Terraform outputs as JSON" @echo " make tf-output [TF_ENV=staging] Print Terraform outputs as JSON"
@echo " make tf-inventory [TF_ENV=staging] Regenerate Ansible inventory from Terraform outputs" @echo " make tf-inventory [TF_ENV=staging] Regenerate Ansible inventory from Terraform outputs"
@echo " make tf-inventory-offsite Generate offsite_hosts inventory (askari) into inventories/production/"
@echo "" @echo ""
@echo " TF_ENV defaults to 'staging'. Use TF_ENV=production for production." @echo " TF_ENV defaults to 'staging'. Use TF_ENV=production for production."
@echo "" @echo ""
@ -146,16 +137,16 @@ molecule-image-push: molecule-image
# ── Terraform ───────────────────────────────────────────────────────────────── # ── Terraform ─────────────────────────────────────────────────────────────────
tf-init: tf-init:
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) init $(TF) -chdir=terraform/environments/$(TF_ENV) init
tf-plan: tf-plan:
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) plan $(TF) -chdir=terraform/environments/$(TF_ENV) plan
tf-apply: tf-apply:
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) apply $(TF) -chdir=terraform/environments/$(TF_ENV) apply
tf-output: tf-output:
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) output -json $(TF) -chdir=terraform/environments/$(TF_ENV) output -json
tf-inventory: tf-inventory:
ifndef TF_ENV ifndef TF_ENV
@ -165,11 +156,6 @@ endif
| $(PYTHON) scripts/tf_to_inventory.py > inventories/$(TF_ENV)/hosts.yml | $(PYTHON) scripts/tf_to_inventory.py > inventories/$(TF_ENV)/hosts.yml
@echo "Inventory written to inventories/$(TF_ENV)/hosts.yml" @echo "Inventory written to inventories/$(TF_ENV)/hosts.yml"
tf-inventory-offsite:
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/offsite output -json \
| $(PYTHON) scripts/tf_to_inventory.py > inventories/production/offsite.yml
@echo "Offsite inventory written to inventories/production/offsite.yml"
# ── Role scaffolding ────────────────────────────────────────────────────────── # ── Role scaffolding ──────────────────────────────────────────────────────────
new-role: new-role:

View file

@ -5,7 +5,7 @@ This repo is partly aspirational: the ADRs in `docs/decisions/` describe the
truth. **Before relying on a role, provider, or pipeline existing, check here.** truth. **Before relying on a role, provider, or pipeline existing, check here.**
If something is listed as "designed, not built", do not assume it works. If something is listed as "designed, not built", do not assume it works.
_Last reviewed: 2026-06-14._ _Last reviewed: 2026-06-11._
## Real and working today ## Real and working today
@ -20,7 +20,7 @@ _Last reviewed: 2026-06-14._
| Pre-commit hooks | Configured: lint, gitleaks, vault-encryption guard. Activate with `pre-commit install` after `make setup`. | | Pre-commit hooks | Configured: lint, gitleaks, vault-encryption guard. Activate with `pre-commit install` after `make setup`. |
| Vault password client | `scripts/vault-pass-client.sh` fetches the master password from Vaultwarden via `rbw` (wired as `vault_password_file`). Requires `rbw` installed + `rbw unlock`. | | Vault password client | `scripts/vault-pass-client.sh` fetches the master password from Vaultwarden via `rbw` (wired as `vault_password_file`). Requires `rbw` installed + `rbw unlock`. |
| `/review-repo` | Repo audit: `scripts/repo-scan.py` (Phase 0) + `.claude/commands/review-repo.md`, reports to `docs/reviews/`. On-demand only; cron + email deferred (`docs/TODO.md`). | | `/review-repo` | Repo audit: `scripts/repo-scan.py` (Phase 0) + `.claude/commands/review-repo.md`, reports to `docs/reviews/`. On-demand only; cron + email deferred (`docs/TODO.md`). |
| Terraform HCL (`terraform/`) | Written (proxmox VM module + envs) — but never run; see below. Offsite env also written — see "Designed but not built". | | Terraform HCL (`terraform/`) | Written (proxmox VM module + envs) — but never run; see below |
| `docs/hardware/reference.md` + `scripts/capacity-scan.py` | Present — reference doc (skeleton until real hardware) + stdlib scan; emits capacity JSON | | `docs/hardware/reference.md` + `scripts/capacity-scan.py` | Present — reference doc (skeleton until real hardware) + stdlib scan; emits capacity JSON |
| `/capacity-review` | Works — on-demand capacity evaluation → `docs/hardware/reviews/`. Intent-based (no live usage yet) | | `/capacity-review` | Works — on-demand capacity evaluation → `docs/hardware/reviews/`. Intent-based (no live usage yet) |
| ADR-002 security strategy + `docs/security/{accepted-risks,service-checklist}.md` | Present — threat model, principles, governance frame; checklist + risk register are docs, enforced manually in review | | ADR-002 security strategy + `docs/security/{accepted-risks,service-checklist}.md` | Present — threat model, principles, governance frame; checklist + risk register are docs, enforced manually in review |
@ -50,8 +50,7 @@ applying `dev_env` via `playbooks/workstation.yml`.)
| Thing | Designed in | Notes | | Thing | Designed in | Notes |
|---|---|---| |---|---|---|
| `dns` role (renders the internal zone) | ADR-007 / ADR-009 | Does not exist. Internal DNS ownership is assigned to it by design. | | `dns` role (renders the internal zone) | ADR-007 / ADR-009 | Does not exist. Internal DNS ownership is assigned to it by design. |
| Terraform actually provisioning (Proxmox) | ADR-006 / ADR-009 | Never `terraform init`ed: no `.terraform.lock.hcl`, no state, no real `local.vms` entries | | Terraform actually provisioning | ADR-006 / ADR-009 | Never `terraform init`ed: no `.terraform.lock.hcl`, no state, no real `local.vms` entries |
| `terraform/{modules/hetzner_vm, environments/offsite}` (askari) | ADR-006 (amended) | **Written, not yet applied.** Terraform owns askari's existence (hcloud provider, CAX11/hel1/debian-13, cloud-init `ansible` user, Hetzner Cloud Firewall SSH-from-ubongo). Makefile token-injection + directory inventory + `tf-inventory-offsite` handoff wired; offsite-handoff pytest green. **Pending:** `terraform init/plan/apply` (run on ubongo — creates a billed VPS) + bootstrap. M2 of the roadmap. |
| CI (Forgejo Actions) | ADR-003 / ADR-008 | Pipeline described; not implemented | | CI (Forgejo Actions) | ADR-003 / ADR-008 | Pipeline described; not implemented |
| Level 2 / 3 testing (staging, `askari` smoke) | ADR-008 | Depends on real VMs / `askari`, which don't exist yet | | Level 2 / 3 testing (staging, `askari` smoke) | ADR-008 | Depends on real VMs / `askari`, which don't exist yet |
| Per-service roles | ADR-004 | Model defined; no service roles built | | Per-service roles | ADR-004 | Model defined; no service roles built |

View file

@ -1,5 +1,5 @@
[defaults] [defaults]
inventory = inventories/production/ inventory = inventories/production/hosts.yml
roles_path = roles roles_path = roles
collections_path = .collections collections_path = .collections
vault_password_file = scripts/vault-pass-client.sh vault_password_file = scripts/vault-pass-client.sh

View file

@ -8,7 +8,7 @@ Accepted (2026-05-30)
Ansible manages host configuration well but has no state model for infrastructure Ansible manages host configuration well but has no state model for infrastructure
existence. Adding Terraform handles the "what exists" layer — creating and destroying existence. Adding Terraform handles the "what exists" layer — creating and destroying
VMs on Proxmox and Hetzner — while Ansible continues to own everything that runs inside them, VMs on Proxmox — while Ansible continues to own everything that runs inside them,
including all internal DNS records. including all internal DNS records.
This complements rather than replaces Ansible. The two tools do not overlap. The This complements rather than replaces Ansible. The two tools do not overlap. The
@ -35,13 +35,8 @@ cadence, making them a poor fit for Terraform state.
### Providers ### Providers
**`bpg/proxmox` (`~> 0.70`)**: Chosen over `telmate/proxmox` for active maintenance, **`bpg/proxmox` (`~> 0.70`)**: Chosen over `telmate/proxmox` for active maintenance,
full Proxmox 8 API support, and better cloud-init integration. This is the provider full Proxmox 8 API support, and better cloud-init integration. This is the only
for Proxmox VMs. provider.
**`hetznercloud/hcloud` (`~> 1.65`)**: owns off-site VM existence (`askari`). ADR-006's
scope is now **Proxmox + Hetzner** — "Terraform owns VM existence" generalizes across
providers. The `offsite` environment + `hetzner_vm` module live alongside the Proxmox env
+ `proxmox_vm` module; each environment has its own local state.
Terraform does **not** manage DNS. An earlier design used `hashicorp/dns` (RFC 2136) Terraform does **not** manage DNS. An earlier design used `hashicorp/dns` (RFC 2136)
to write A records, but that created a bootstrap cycle — the first DNS server cannot to write A records, but that created a bootstrap cycle — the first DNS server cannot
@ -76,11 +71,9 @@ integration boundary.
terraform/ terraform/
modules/ modules/
proxmox_vm/ # reusable VM module — Proxmox only, no DNS proxmox_vm/ # reusable VM module — Proxmox only, no DNS
hetzner_vm/ # reusable VM module — Hetzner Cloud, no DNS
environments/ environments/
staging/ # staging Proxmox VMs, separate state file staging/ # staging VMs, separate state file
production/ # production Proxmox VMs, separate state file production/ # production VMs, separate state file
offsite/ # off-site Hetzner VMs (askari), separate state file
``` ```
Separate environment directories (not Terraform workspaces) for the clearest Separate environment directories (not Terraform workspaces) for the clearest
@ -128,10 +121,8 @@ handoff)**.
Drawn from the "What was ruled out" section and the decisions stated above: Drawn from the "What was ruled out" section and the decisions stated above:
- `bpg/proxmox` is the provider for Proxmox VMs; `telmate/proxmox` was ruled out for weaker - `bpg/proxmox` is the only provider; `telmate/proxmox` was ruled out for weaker
maintenance and Proxmox 8 / cloud-init support (Providers; What was ruled out). maintenance and Proxmox 8 / cloud-init support (Providers; What was ruled out).
- `hetznercloud/hcloud` is the provider for off-site VM existence (`askari`); ADR-006's
scope now covers Proxmox + Hetzner (Providers).
- OPNsense stays entirely in Ansible — no Terraform OPNsense provider — to avoid - OPNsense stays entirely in Ansible — no Terraform OPNsense provider — to avoid
community-provider rot across OPNsense releases (Responsibility split; What was community-provider rot across OPNsense releases (Responsibility split; What was
ruled out). ruled out).

View file

@ -195,11 +195,9 @@ the self-hosted NetBird coordinator** (management/signal/relay). It reaches `srv
metrics endpoints and `mgmt` for administration over the mesh, scoped by NetBird metrics endpoints and `mgmt` for administration over the mesh, scoped by NetBird
ACLs — no OPNsense WireGuard tunnel and no `10.99.0.0/24` routing. ACLs — no OPNsense WireGuard tunnel and no `10.99.0.0/24` routing.
`askari` is provisioned as **Terraform IaC** (`hetznercloud/hcloud`), managed `askari` is provisioned and managed independently of the Proxmox cluster — it must
independently of the Proxmox cluster (its own provider + local state in be reachable even when the homelab is down (its entire purpose), which is also why
`terraform/environments/offsite/`). It must be reachable even when the homelab is down the mesh coordinator lives here: an off-site control plane survives a homelab outage.
(its entire purpose), which is also why the mesh coordinator lives here: an off-site
control plane survives a homelab outage.
FQDN: `askari.wingu.me` (off-site tier; record added by `public_dns` when askari exists — M2/M4). FQDN: `askari.wingu.me` (off-site tier; record added by `public_dns` when askari exists — M2/M4).
--- ---

View file

@ -83,10 +83,10 @@ group against the allowed set and fails loudly on an unknown group.
**Valid groups**: `control`, `docker_hosts`, `proxmox_hosts`, `offsite_hosts`. **Valid groups**: `control`, `docker_hosts`, `proxmox_hosts`, `offsite_hosts`.
`control` holds `ubongo`, a physical machine not managed by Terraform (see the `control` and `offsite_hosts` are not produced by Terraform — they hold manually
control-node exception below and ADR-015). `offsite_hosts` holds `askari`, which is provisioned hosts (`ubongo` and `askari` respectively) added to the inventory by hand
Terraform-managed via the `hetznercloud/hcloud` provider in the `offsite` environment (see the control-node exception below and ADR-015/ADR-016). They are valid groups so
(see the off-site handoff note below and ADR-016). the generated `hosts.yml` carries their (otherwise empty) sections.
The generated `hosts.yml` carries a "do not edit manually" header and is owned by The generated `hosts.yml` carries a "do not edit manually" header and is owned by
the generator. Treat it as a build artifact: the source of truth is `local.vms` in the generator. Treat it as a build artifact: the source of truth is `local.vms` in
@ -152,27 +152,6 @@ Every other host is Terraform-managed.
--- ---
### The off-site handoff (`offsite` environment → `offsite_hosts`)
`askari` (Hetzner VPS, ADR-016) follows the same handoff pipeline as Proxmox hosts but
with its own provider and environment:
- **Producer** — `terraform/environments/offsite/outputs.tf` emits a `vms` map in the
same `{ host: { ip, group } }` shape as Proxmox environments; `askari`'s group is
`offsite_hosts`.
- **Consumer** — `scripts/tf_to_inventory.py` reads `terraform output -json` from the
`offsite` environment and writes `inventories/production/offsite.yml`.
- **Makefile target** — `make tf-inventory-offsite` runs the generator for the offsite
environment.
The production inventory is a **directory** (`inventories/production/`) that Ansible
merges at runtime: `hosts.yml` (Proxmox-generated) and `offsite.yml`
(offsite-generated) together form the full production host list. Each file is a build
artifact — never hand-edited; their source of truth is `local.vms` in the respective
environment's `main.tf`.
---
### What was ruled out ### What was ruled out
| Option | Reason | | Option | Reason |
@ -199,10 +178,7 @@ Drawn from the boundary, the data contract, and the "What was ruled out" section
owned by Ansible, no chicken-and-egg; What was ruled out). owned by Ansible, no chicken-and-egg; What was ruled out).
- The control node (`ubongo`) is the single documented exception to "Terraform owns - The control node (`ubongo`) is the single documented exception to "Terraform owns
VM existence" — a physical machine provisioned manually and managed by Ansible for VM existence" — a physical machine provisioned manually and managed by Ansible for
baseline config only (The control-node exception). baseline config only; every other host is Terraform-managed (The control-node
- The `offsite` TF environment's `vms` output feeds the `offsite_hosts` group via exception).
`tf_to_inventory.py` (`make tf-inventory-offsite` → `inventories/production/offsite.yml`);
the production inventory is a directory that merges `hosts.yml` (Proxmox) and
`offsite.yml` (offsite) (The off-site handoff).
- The seam is documented in exactly one place (this ADR); ADR-005 and ADR-006 link - The seam is documented in exactly one place (this ADR); ADR-005 and ADR-006 link
here rather than restating it (What was ruled out). here rather than restating it (What was ruled out).

View file

@ -81,9 +81,8 @@ allocated for it.
- **Coordinator survival:** off-site on `askari` ⇒ mesh survives a homelab outage. - **Coordinator survival:** off-site on `askari` ⇒ mesh survives a homelab outage.
NetBird's management datastore is backed up encrypted off `askari` (synced to NetBird's management datastore is backed up encrypted off `askari` (synced to
`ubongo`/`mamba`); peers keep last-known config through a brief coordinator outage. `ubongo`/`mamba`); peers keep last-known config through a brief coordinator outage.
- **`askari` is Ansible-managed:** its own inventory group `offsite_hosts` — provisioned - **`askari` is Ansible-managed:** its own inventory group `offsite_hosts` (added
as **Terraform IaC** (`hetznercloud/hcloud`), managed independently of the Proxmox manually like the control node — it is not Terraform-managed), `base` role, plus a
cluster (its own provider + local state). Ansible configuration: `base` role, plus a
dedicated `netbird_coordinator` service role (one service = one role, ADR-004; with dedicated `netbird_coordinator` service role (one service = one role, ADR-004; with
`SECURITY.md`). Agent install/enrollment lives in `base`. NetBird server + agents are `SECURITY.md`). Agent install/enrollment lives in `base`. NetBird server + agents are
version-pinned (ADR-011). boma's `dns` role stays authoritative for version-pinned (ADR-011). boma's `dns` role stays authoritative for

View file

@ -84,20 +84,6 @@ This was chosen over a single connectivity-model-generates-both (too much machin
tight coupling of two very different rule domains) and over fully independent per-layer tight coupling of two very different rule domains) and over fully independent per-layer
declarations (real drift risk). declarations (real drift risk).
### Off-cluster hosts — `askari` (Hetzner)
`askari` sits outside the Proxmox cluster and has no OPNsense. Its **perimeter** layer
is a TF-managed **Hetzner Cloud Firewall** (declared in `terraform/environments/offsite/`)
alongside the VM itself. Current rule set (M2): SSH inbound from `ubongo`'s public IP
only. NetBird ports (UDP 3478 + TCP 80/443) will be added in M4 when the coordinator
role is built.
The `group_vars` service catalog remains authoritative for `askari`'s **host nftables**
layer — the same two-layer model applies, with Hetzner Cloud Firewall substituting for
OPNsense at the perimeter.
---
### OPNsense automation — owned here, mechanism deferred ### OPNsense automation — owned here, mechanism deferred
OPNsense is Ansible-managed (CLAUDE.md: "OPNsense is entirely Ansible; no Terraform OPNsense is Ansible-managed (CLAUDE.md: "OPNsense is entirely Ansible; no Terraform

View file

@ -1,538 +0,0 @@
# askari Provisioning (M2) Implementation Plan
> **For agentic workers:** REQUIRED SUB-SKILL: Use superpowers:subagent-driven-development (recommended) or superpowers:executing-plans to implement this plan task-by-task. Steps use checkbox (`- [ ]`) syntax for tracking.
**Goal:** Provision `askari` (the off-site Hetzner VPS) as Terraform IaC — a `hetzner_vm` module + an `offsite` stack — behind a TF-managed cloud firewall, hand it into the `offsite_hosts` inventory, and bootstrap it.
**Architecture:** Generalize boma's "Terraform owns VM existence" principle (ADR-006) from Proxmox to Hetzner. A reusable `hetzner_vm` module wraps `hcloud_server` + `hcloud_firewall` + `hcloud_ssh_key`; an `offsite` environment (own local state) declares `askari` (CAX11/ARM, Helsinki, Debian 13). cloud-init creates the `ansible` user with ubongo's key; the firewall allows SSH from ubongo only. Handoff stays ADR-009-shaped: the offsite env outputs `vms`, and `tf_to_inventory.py` (already offsite-aware) generates an inventory file merged via a **directory inventory**.
**Tech Stack:** Terraform (`hetznercloud/hcloud` provider), Hetzner Cloud, cloud-init, Ansible. Token from `vault.hetzner.token` → `TF_VAR_hcloud_token`.
**Spec:** `docs/superpowers/specs/2026-06-14-askari-provisioning-design.md`
**Execution context:** Tasks 1–6 + 9 are authoring + `terraform fmt/validate/plan` (need `terraform` installed + the token, but no resources are created). **Task 7 (`terraform apply`) and Task 8 (bootstrap) create a real, billed VPS** — gated, run with explicit user go, `tf-plan` shown first (CLAUDE.md). If `terraform` is absent in the working env, Tasks 6–8 defer to ubongo.
---
## File Structure
- `terraform/modules/hetzner_vm/{variables,main,outputs}.tf` (create) — wraps server + firewall + ssh key + cloud-init.
- `terraform/environments/offsite/{providers,variables,main,outputs,backend}.tf` + `terraform.tfvars.example` (create) — the askari stack, own local state.
- `Makefile` (modify) — inject `TF_VAR_hcloud_token` for `TF_ENV=offsite`; directory inventory; `tf-inventory-offsite` target.
- `scripts/tf_to_inventory.py` (no change — already offsite-aware) + `tests/test_tf_to_inventory.py` (create) — lock the offsite handoff.
- `docs/decisions/{006,009,020,007,016}-*.md`, `STATUS.md` (modify) — ADR amendments + status.
---
### Task 1: Verify the Hetzner provider/image facts (ADR-014)
**Files:** none (research; pin values used by later tasks).
- [ ] **Step 1: Verify and record**
Verify (WebFetch registry.terraform.io / docs.hetzner.com, or `terraform` once init'd):
- latest `hetznercloud/hcloud` provider version to pin (expected `~> 1.48`+),
- the Debian 13 image slug (expected `debian-13`),
- that server type `cax11` exists in location `hel1`.
Record a stamp in the offsite `providers.tf` comment, e.g.:
`# verified: hetznercloud/hcloud <ver> · debian-13 image · cax11@hel1 · <source> · <date>`
- [ ] **Step 2: No commit** (values land in later tasks).
---
### Task 2: The `hetzner_vm` module
**Files:**
- Create: `terraform/modules/hetzner_vm/variables.tf`, `main.tf`, `outputs.tf`
- [ ] **Step 1: `variables.tf`**
```hcl
variable "name" {
description = "Server name (and hostname)"
type = string
}
variable "server_type" {
description = "Hetzner server type, e.g. cax11 (ARM)"
type = string
}
variable "location" {
description = "Hetzner location, e.g. hel1"
type = string
}
variable "image" {
description = "OS image slug, e.g. debian-13"
type = string
}
variable "ansible_ssh_pubkey" {
description = "Public SSH key provisioned for the ansible user via cloud-init"
type = string
}
variable "ssh_admin_cidrs" {
description = "Source CIDRs allowed to reach SSH (e.g. ubongo's address/32)"
type = list(string)
}
variable "labels" {
description = "Hetzner resource labels (metadata only)"
type = map(string)
default = {}
}
```
- [ ] **Step 2: `main.tf`**
```hcl
# cloud-init: create the unprivileged `ansible` user with ubongo's key + sudo.
# (Mirrors the proxmox_vm module's user_account; Hetzner has no structured field.)
# The heredoc body is YAML, so its nesting is significant: `<<-` strips only the
# common leading indentation and keeps the relative indents below intact.
locals {
  user_data = <<-EOT
    #cloud-config
    users:
      - name: ansible
        groups: [sudo]
        sudo: "ALL=(ALL) NOPASSWD:ALL"
        shell: /bin/bash
        ssh_authorized_keys:
          - ${var.ansible_ssh_pubkey}
    package_update: true
    packages:
      - python3
  EOT
}

# The same public key is also registered as a Hetzner SSH key and attached to the
# server below via `ssh_keys`.
resource "hcloud_ssh_key" "ansible" {
  name       = "${var.name}-ansible"
  public_key = var.ansible_ssh_pubkey
}

resource "hcloud_firewall" "this" {
  name = "${var.name}-fw"

  # SSH from the control node only (NetBird ports are added in M4 when the
  # coordinator deploys — see ADR-020; the host nftables layer is catalog-driven).
  rule {
    direction  = "in"
    protocol   = "tcp"
    port       = "22"
    source_ips = var.ssh_admin_cidrs
  }
}

# The server itself: attaches the SSH key, the cloud-init payload and the firewall
# defined above; both public IPv4 and IPv6 are enabled.
resource "hcloud_server" "this" {
  name         = var.name
  server_type  = var.server_type
  location     = var.location
  image        = var.image
  ssh_keys     = [hcloud_ssh_key.ansible.id]
  user_data    = local.user_data
  firewall_ids = [hcloud_firewall.this.id]
  labels       = var.labels

  public_net {
    ipv4_enabled = true
    ipv6_enabled = true
  }
}
```
- [ ] **Step 3: `outputs.tf`**
```hcl
output "ipv4_address" {
description = "Server public IPv4"
value = hcloud_server.this.ipv4_address
}
output "name" {
description = "Server name"
value = hcloud_server.this.name
}
```
- [ ] **Step 4: Format**
Run: `terraform fmt terraform/modules/hetzner_vm/`
Expected: files formatted (or already formatted).
- [ ] **Step 5: Commit**
```bash
git add terraform/modules/hetzner_vm
git commit -m "feat(tf): hetzner_vm module (server + firewall + ssh key + cloud-init)"
```
(append `Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>`)
---
### Task 3: The `offsite` environment
**Files:**
- Create: `terraform/environments/offsite/{providers,variables,main,outputs,backend}.tf`, `terraform.tfvars.example`
- [ ] **Step 1: `providers.tf`** (pin the version from Task 1)
```hcl
# verified: hetznercloud/hcloud ~> 1.48 · debian-13 · cax11@hel1 · <source> · <date>
terraform {
required_version = ">= 1.9"
required_providers {
hcloud = {
source = "hetznercloud/hcloud"
version = "~> 1.48"
}
}
}
provider "hcloud" {
token = var.hcloud_token
}
```
- [ ] **Step 2: `variables.tf`**
```hcl
variable "hcloud_token" {
description = "Hetzner Cloud API token — set via TF_VAR_hcloud_token (from vault.hetzner.token)"
type = string
sensitive = true
}
variable "ansible_ssh_pubkey" {
description = "ubongo's control SSH public key, provisioned for the ansible user"
type = string
}
variable "ssh_admin_cidrs" {
description = "Source CIDRs allowed to SSH askari (ubongo's address/32)"
type = list(string)
}
```
- [ ] **Step 3: `main.tf`**
```hcl
# offsite/main.tf — off-site Hetzner hosts. Terraform owns VM existence (ADR-006,
# generalized to Hetzner). ALWAYS `make tf-plan TF_ENV=offsite` and review before
# `make tf-apply TF_ENV=offsite`.
module "askari" {
source = "../../modules/hetzner_vm"
name = "askari"
server_type = "cax11" # ARM, 2 vCPU / 4 GB
location = "hel1" # Helsinki
image = "debian-13"
ansible_ssh_pubkey = var.ansible_ssh_pubkey
ssh_admin_cidrs = var.ssh_admin_cidrs
labels = {
env = "offsite"
group = "offsite_hosts"
managed-by = "terraform"
}
}
```
- [ ] **Step 4: `outputs.tf`** (the `tf_to_inventory.py` contract — `vms` map)
```hcl
output "vms" {
description = "Hostname → IP and Ansible group — consumed by make tf-inventory-offsite"
value = {
askari = {
ip = module.askari.ipv4_address
group = "offsite_hosts"
}
}
}
```
- [ ] **Step 5: `backend.tf`**
```hcl
# Terraform state: LOCAL, on the control node (like the Proxmox envs; ADR-006).
# askari survives a homelab outage by design, so a lost state is recovered by
# `terraform import` of the running server — not a rebuild. Back the state up with
# the control node (ADR-022).
```
- [ ] **Step 6: `terraform.tfvars.example`**
```hcl
# offsite environment — non-secret values. Copy to terraform.tfvars and fill in.
#
# Secret is exported as an env var (never in this file):
# export TF_VAR_hcloud_token="$(...from vault.hetzner.token...)" # make handles this
#
# State is local (see backend.tf).
ansible_ssh_pubkey = "ssh-ed25519 AAAA... ansible@ubongo"
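ssh_admin_cidrs = ["10.20.10.151/32"] # ubongo's LAN address (ADR-021) — NOTE(review): a Hetzner Cloud Firewall matches the public source IP; confirm whether this should be ubongo's public egress address/32 before apply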
```
- [ ] **Step 7: Format + commit**
Run: `terraform fmt terraform/environments/offsite/`
```bash
git add terraform/environments/offsite
git commit -m "feat(tf): offsite environment — askari (CAX11/hel1/debian-13)"
```
(Co-Authored-By trailer)
---
### Task 4: Makefile — token injection, directory inventory, offsite handoff
**Files:**
- Modify: `Makefile`
- [ ] **Step 1: Inject the Hetzner token for `TF_ENV=offsite`**
The `tf-*` targets need `TF_VAR_hcloud_token` for offsite, sourced from the vault. Add a guarded helper variable near the `TF` definition:
```makefile
# For TF_ENV=offsite, export the Hetzner token from the vault (rbw unlocked).
# Reads vault.hetzner.token in-memory; never written to a tfvars file (CLAUDE.md).
# The decrypted vault YAML arrives on the Python one-liner's stdin, so the parser
# must be handed sys.stdin (the stream), not the bare `sys` module.
# `$$(` keeps the command substitution for the recipe's shell instead of Make.
ifeq ($(TF_ENV),offsite)
TF_TOKEN_ENV = TF_VAR_hcloud_token="$$($(VENV)/bin/ansible-vault view inventories/production/group_vars/all/vault.yml | $(VENV)/bin/python -c 'import sys,yaml; print(yaml.safe_load(sys.stdin)["vault"]["hetzner"]["token"])')"
else
TF_TOKEN_ENV =
endif
```
Then prefix the `tf-init`/`tf-plan`/`tf-apply`/`tf-output` recipes with `$(TF_TOKEN_ENV)`, e.g.:
```makefile
tf-plan:
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/$(TF_ENV) plan
```
(Apply the same prefix to `tf-init`, `tf-apply`, `tf-output`.)
- [ ] **Step 2: Directory inventory**
Change the inventory so multiple TF envs can each generate a file:
```makefile
INVENTORY := -i inventories/production/
```
(Ansible reads every file in the directory as an inventory source and merges them; `group_vars/`/`host_vars/` remain variable dirs. Verify `ansible.cfg` does not also hard-set `inventory=`; if it does, update it to match.)
- [ ] **Step 3: `tf-inventory-offsite` target**
Add (writes the offsite hosts into the production inventory dir, beside the Proxmox-generated `hosts.yml`):
```makefile
tf-inventory-offsite:
$(TF_TOKEN_ENV) $(TF) -chdir=terraform/environments/offsite output -json \
| $(PYTHON) scripts/tf_to_inventory.py > inventories/production/offsite.yml
@echo "Offsite inventory written to inventories/production/offsite.yml"
```
Add `tf-inventory-offsite` to `.PHONY` and a help line.
- [ ] **Step 4: Verify existing playbooks still resolve under the directory inventory**
Run: `make check PLAYBOOK=dns 2>&1 | tail -3`
Expected: still resolves the `control` host and runs (no inventory errors). If `connection:`/group_vars break, fix before committing.
- [ ] **Step 5: Commit**
```bash
git add Makefile
git commit -m "feat(make): offsite TF token injection + directory inventory + tf-inventory-offsite"
```
(Co-Authored-By trailer)
---
### Task 5: Lock the offsite inventory handoff (TDD)
**Files:**
- Test: `tests/test_tf_to_inventory.py`
- [ ] **Step 1: Write the failing test**
```python
"""Lock the offsite Terraform-output → Ansible-inventory handoff."""
import json
import pathlib
import subprocess
import sys

# scripts/tf_to_inventory.py, located two levels up from this test file.
_SCRIPT = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "tf_to_inventory.py"


def _run(tf_output: dict) -> str:
    """Pipe ``tf_output`` (serialized as JSON) into the generator and return its stdout.

    Raises:
        subprocess.CalledProcessError: if the script exits non-zero (``check=True``).
    """
    return subprocess.run(
        [sys.executable, str(_SCRIPT)],
        input=json.dumps(tf_output), capture_output=True, text=True, check=True,
    ).stdout


def test_offsite_host_lands_in_offsite_hosts():
    """A host whose group is ``offsite_hosts`` appears under that group with its IP."""
    out = _run({"vms": {"value": {"askari": {"ip": "203.0.113.7", "group": "offsite_hosts"}}}})
    assert "offsite_hosts:" in out
    assert "askari:" in out
    assert "ansible_host: 203.0.113.7" in out


def test_unknown_group_rejected():
    """An unrecognized group makes the script exit 1 and report it on stderr."""
    # No check=True here: the non-zero exit status is the behavior under test.
    proc = subprocess.run(
        [sys.executable, str(_SCRIPT)],
        input=json.dumps({"vms": {"value": {"x": {"ip": "1.2.3.4", "group": "nope"}}}}),
        capture_output=True, text=True,
    )
    assert proc.returncode == 1
    assert "unknown group" in proc.stderr
```
- [ ] **Step 2: Run it**
Run: `.venv/bin/python -m pytest tests/test_tf_to_inventory.py -v`
Expected: PASS — `tf_to_inventory.py` already supports `offsite_hosts` and rejects unknown groups (this test locks that behaviour for the M2 handoff; no code change needed). If it fails, fix `scripts/tf_to_inventory.py` minimally and report.
- [ ] **Step 3: Commit**
```bash
git add tests/test_tf_to_inventory.py
git commit -m "test(tf): lock the offsite_hosts inventory handoff"
```
(Co-Authored-By trailer)
---
### Task 6: Init, validate, plan (gated — needs terraform + token)
> Needs `terraform` installed and `rbw` unlocked. Creates **no** resources. If `terraform` is absent, defer Tasks 6–8 to ubongo.
- [ ] **Step 1: Set tfvars**
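`cp terraform/environments/offsite/terraform.tfvars.example terraform/environments/offsite/terraform.tfvars` and set `ansible_ssh_pubkey` to ubongo's real control public key and `ssh_admin_cidrs` to ubongo's **WAN/egress** IP as a `/32` (find it with `curl -s ifconfig.me` on ubongo) — not its LAN address (`10.20.10.151`), which the Hetzner Cloud Firewall never sees because it filters public traffic. (`terraform.tfvars` is gitignored.)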
- [ ] **Step 2: Init (tracks the lock file)**
Run: `make tf-init TF_ENV=offsite`
Expected: providers installed; `terraform/environments/offsite/.terraform.lock.hcl` created. `git add` the lock file (tracked per CLAUDE.md).
- [ ] **Step 3: Validate + plan**
Run: `terraform -chdir=terraform/environments/offsite validate` → `Success`.
Run: `make tf-plan TF_ENV=offsite` → review: **1 server + 1 firewall + 1 ssh key to add**. Confirm CAX11/hel1/debian-13 and the SSH-from-ubongo rule.
- [ ] **Step 4: Commit the lock file**
```bash
git add terraform/environments/offsite/.terraform.lock.hcl
git commit -m "chore(tf): pin offsite provider lock (hcloud)"
```
(Co-Authored-By trailer)
---
### Task 7: Apply — create askari (GATED, real billed VPS)
> **Explicit user go required.** Run on ubongo. The plan from Task 6 must be reviewed first (CLAUDE.md: never apply without a shown plan).
- [ ] **Step 1: Apply**
Run: `make tf-apply TF_ENV=offsite`
Expected: `hcloud_ssh_key`, `hcloud_firewall`, `hcloud_server.askari` created; outputs show `askari`'s IPv4.
- [ ] **Step 2: Generate the offsite inventory**
Run: `make tf-inventory-offsite`
Expected: `inventories/production/offsite.yml` written with `askari` under `offsite_hosts`.
- [ ] **Step 3: Verify the inventory merges**
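Run: `.venv/bin/ansible-inventory -i inventories/production/ --host askari` (or `--list`). The `-i` argument is the Makefile's `$(INVENTORY)` value spelled out; `$(INVENTORY)` itself is a make variable and would be treated as command substitution if pasted into a shell.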
Expected: `askari` present with its `ansible_host`.
- [ ] **Step 4: Commit the generated inventory**
```bash
git add inventories/production/offsite.yml
git commit -m "chore(inventory): askari in offsite_hosts (generated)"
```
(Co-Authored-By trailer)
---
### Task 8: Bootstrap askari (GATED — needs the live host)
> Run on ubongo after Task 7. `rbw` unlocked.
- [ ] **Step 1: Reach it**
Run: `ssh ansible@<askari-ip>` (cloud-init created the `ansible` user with ubongo's key) — expect a shell. If refused, check the firewall `ssh_admin_cidrs` matches ubongo's egress IP.
- [ ] **Step 2: Bootstrap**
Run: `make check PLAYBOOK=bootstrap` (review) then `make deploy PLAYBOOK=bootstrap` — expect the `ansible` user + sudoers confirmed/created on askari (idempotent).
- [ ] **Step 3: No repo commit** — this configures the host, not the repo. (`base` subset = M3.)
---
### Task 9: ADR amendments + STATUS
**Files:**
- Modify: `docs/decisions/006-terraform.md`, `009-provisioning-handoff.md`, `020-firewall.md`, `007-network.md`, `016-mesh-vpn.md`, `STATUS.md`
For each: **Read the relevant section first**, then apply the change.
- [ ] **Step 1: ADR-006 — generalize the provider scope**
In the **Providers** section, the line "`bpg/proxmox` … This is the only provider." → note a second provider:
```
**`hetznercloud/hcloud`**: owns off-site VM existence (`askari`). ADR-006's scope is
**Proxmox + Hetzner** — "Terraform owns VM existence" generalizes across providers; the
`offsite` environment + `hetzner_vm` module live alongside the Proxmox env + module.
```
Also adjust the Context line "creating and destroying VMs on Proxmox" → "on Proxmox and Hetzner".
- [ ] **Step 2: ADR-009 — offsite handoff**
Add a note that `offsite` is a TF environment whose `vms` output feeds `offsite_hosts` via `tf_to_inventory.py` (`make tf-inventory-offsite` → `inventories/production/offsite.yml`), and that the production inventory is a **directory** merging the Proxmox + offsite generated files.
- [ ] **Step 3: ADR-020 — askari's perimeter**
Note that off-cluster `askari` has no OPNsense; its **perimeter** is a TF-managed Hetzner Cloud Firewall (SSH-from-ubongo now; NetBird ports in M4). The `group_vars` catalog stays authoritative for the host nftables layer.
- [ ] **Step 4: ADR-007 / ADR-016 — askari is TF-provisioned**
Replace "provisioned … independently … added manually" wording for askari with "provisioned as Terraform IaC (hcloud), managed independently of the Proxmox cluster (own provider + state)."
- [ ] **Step 5: STATUS.md**
Move/realize askari's row per how far Task 7/8 got. If applied: under "Real and working today" — `askari` **Built + applied** (CAX11/hel1/debian-13, cloud firewall SSH-from-ubongo, bootstrapped, in `offsite_hosts`). If only authored (apply deferred): note the TF is written + `tf-plan` clean, apply pending on ubongo.
- [ ] **Step 6: Lint + commit**
Run: `make lint` (must pass).
```bash
git add docs/decisions/006-terraform.md docs/decisions/009-provisioning-handoff.md \
docs/decisions/020-firewall.md docs/decisions/007-network.md \
docs/decisions/016-mesh-vpn.md STATUS.md
git commit -m "docs(askari): amend ADR-006/009/020/007/016 for TF-provisioned offsite host; STATUS"
```
(Co-Authored-By trailer)
---
## Self-Review (completed)
- **Spec coverage:** TF owns existence / generalize ADR-006 (Decision 1) → Tasks 2,3,9; CAX11/hel1/debian-13 (Decision 2) → Task 3; TF cloud firewall, SSH-from-ubongo, NetBird ports later (Decision 3) → Task 2 + Task 9 ADR-020; token via `TF_VAR_hcloud_token` from vault (Decision 4) → Task 4; ADR-009 handoff via `tf_to_inventory` (Decision 5) → Tasks 4,5,7; cloud-init `ansible` user + bootstrap → Tasks 2,8; state + DR (import) → Task 3 backend; ADR amendments → Task 9. All covered.
- **Placeholder scan:** none — HCL, make, and test content are concrete. `<askari-ip>`/`<source>`/`<date>` are runtime/verification values, not unspecified logic.
- **Type/name consistency:** module vars (`name`, `server_type`, `location`, `image`, `ansible_ssh_pubkey`, `ssh_admin_cidrs`, `labels`) match between module + env call; the `vms` output shape (`{ip, group}`) matches `tf_to_inventory.py`'s contract; `TF_VAR_hcloud_token` → `var.hcloud_token`; `vault.hetzner.token` matches the stored key.
- **Notes for the implementer:** (a) confirm Ansible merges the directory inventory's two files so `askari` resolves (Task 7 Step 3); (b) verify `hcloud_server` arg names against the pinned provider version (Task 1) — adjust `public_net`/`firewall_ids` if the provider differs; (c) Tasks 7–8 create a billed VPS — gated on explicit go.

View file

@ -1,4 +0,0 @@
# Terraform state: LOCAL, on the control node (like the Proxmox envs; ADR-006).
# askari survives a homelab outage by design, so a lost state is recovered by
# `terraform import` of the running server — not a rebuild. Back the state up with
# the control node (ADR-022).

View file

@ -1,19 +0,0 @@
# offsite/main.tf — off-site Hetzner hosts.
# Terraform owns VM existence (ADR-006, generalized to Hetzner). Always run
# `make tf-plan TF_ENV=offsite` and review the plan before
# `make tf-apply TF_ENV=offsite`.

# askari, built from the shared hetzner_vm module.
module "askari" {
  source = "../../modules/hetzner_vm"

  # Identity and placement.
  name        = "askari"
  image       = "debian-13"
  location    = "hel1"  # Helsinki
  server_type = "cax11" # ARM, 2 vCPU / 4 GB

  # Access inputs are passed straight through from this environment's variables.
  ansible_ssh_pubkey = var.ansible_ssh_pubkey
  ssh_admin_cidrs    = var.ssh_admin_cidrs

  # Hetzner resource labels.
  labels = {
    managed-by = "terraform"
    env        = "offsite"
    group      = "offsite_hosts"
  }
}

View file

@ -1,9 +0,0 @@
# Inventory handoff: one map entry per host, keyed by hostname, carrying the
# address and the Ansible group the host belongs to.
output "vms" {
  description = "Hostname -> IP and Ansible group — consumed by make tf-inventory-offsite"

  value = {
    askari = {
      group = "offsite_hosts"
      ip    = module.askari.ipv4_address
    }
  }
}

View file

@ -1,15 +0,0 @@
# verified: hetznercloud/hcloud 1.65.0 · debian-13 image · cax11@hel1 · terraform-registry · 2026-06-14
# Version pins for this environment: a minimum Terraform CLI version and a
# pessimistic (~>) constraint on the Hetzner Cloud provider.
terraform {
  required_version = ">= 1.9"

  required_providers {
    hcloud = {
      version = "~> 1.65"
      source  = "hetznercloud/hcloud"
    }
  }
}
# Hetzner Cloud provider; the API token is supplied through var.hcloud_token
# rather than being written in this file.
provider "hcloud" {
  token = var.hcloud_token
}

View file

@ -1,13 +0,0 @@
# offsite environment: non-secret inputs only. Copy this file to terraform.tfvars
# and fill in the real values.
#
# The Hetzner API token is never stored here. It reaches Terraform as an
# environment variable, which the make tf-* targets set automatically for
# TF_ENV=offsite from vault.hetzner.token:
#   export TF_VAR_hcloud_token="...from vault.hetzner.token..."
#
# State is local (see backend.tf).

ansible_ssh_pubkey = "ssh-ed25519 AAAA... ansible@ubongo"

# Use ubongo's WAN/egress IP here, NOT its LAN address: the Hetzner Cloud Firewall
# filters PUBLIC traffic (it is the perimeter analog of OPNsense, ADR-020). Run
# `curl -s ifconfig.me` on ubongo to find the address. Narrows to the NetBird
# `wt0` path once M5 lands.
ssh_admin_cidrs = ["203.0.113.10/32"] # placeholder — ubongo's WAN/egress IP

View file

@ -1,15 +0,0 @@
# Inputs of the offsite environment. Only hcloud_token is secret; it arrives via
# the TF_VAR_hcloud_token environment variable and is marked sensitive so
# Terraform redacts it in plan/apply output.
variable "hcloud_token" {
  description = "Hetzner Cloud API token — set via TF_VAR_hcloud_token (from vault.hetzner.token)"
  type        = string
  sensitive   = true
}

variable "ansible_ssh_pubkey" {
  description = "ubongo's control SSH public key, provisioned for the ansible user"
  type        = string
}

# The description names the WAN/egress address explicitly: the Hetzner Cloud
# Firewall filters public traffic, so a LAN address would never match.
variable "ssh_admin_cidrs" {
  description = "Source CIDRs allowed to SSH askari (ubongo's WAN/egress IP as a /32 — not its LAN address)"
  type        = list(string)
}

View file

@ -1,53 +0,0 @@
# cloud-init: create the unprivileged `ansible` user with ubongo's key + sudo.
# (Mirrors the proxmox_vm module's user_account; Hetzner has no structured field.)
locals {
  # Every heredoc line, including the closing EOT, carries the same 2-space base
  # indent, so `<<-` strips exactly that and `#cloud-config` lands on column 0 —
  # cloud-init requires it as the first line with no leading space.
  # Indentation beyond the base is YAML structure and must be kept: the user's
  # attributes nest under the `- name:` list item, and list items nest under
  # their key.
  user_data = <<-EOT
  #cloud-config
  users:
    - name: ansible
      groups: [sudo]
      sudo: "ALL=(ALL) NOPASSWD:ALL"
      shell: /bin/bash
      ssh_authorized_keys:
        - ${var.ansible_ssh_pubkey}
  package_update: true
  packages:
    - python3
  EOT
}
# The control node's public key as a Hetzner SSH-key resource, named after the
# server; the server resource references it through ssh_keys.
resource "hcloud_ssh_key" "ansible" {
  public_key = var.ansible_ssh_pubkey
  name       = "${var.name}-ansible"
}
# Cloud firewall for the server, named after it. The only rule admits inbound
# SSH from the control node's CIDRs. NetBird ports (UDP 3478, TCP 80/443) are
# added in M4 when the coordinator deploys (ADR-020); host nftables stays
# catalog-driven.
resource "hcloud_firewall" "this" {
  name = "${var.name}-fw"

  rule {
    direction  = "in"
    source_ips = var.ssh_admin_cidrs
    protocol   = "tcp"
    port       = "22"
  }
}
# The server itself: placement and image come from the module inputs; it is
# created with the ansible SSH key, the cloud-init user_data built in locals,
# and the firewall defined in this module attached.
# NOTE(review): the environment plans to recover lost state with
# `terraform import`; ssh_keys and user_data look like create-time inputs, so
# confirm against the pinned provider that a post-import plan does not propose
# replacing the server (a lifecycle ignore_changes may be needed) — TODO verify.
resource "hcloud_server" "this" {
  name        = var.name
  image       = var.image
  location    = var.location
  server_type = var.server_type
  labels      = var.labels

  # Access and first-boot configuration.
  ssh_keys     = [hcloud_ssh_key.ansible.id]
  firewall_ids = [hcloud_firewall.this.id]
  user_data    = local.user_data

  # Both public address families are enabled.
  public_net {
    ipv6_enabled = true
    ipv4_enabled = true
  }
}

View file

@ -1,9 +0,0 @@
# Module outputs: the created server's public IPv4 address and its name, both
# read back from the hcloud_server resource.
output "ipv4_address" {
  value       = hcloud_server.this.ipv4_address
  description = "Server public IPv4"
}

output "name" {
  value       = hcloud_server.this.name
  description = "Server name"
}

View file

@ -1,35 +0,0 @@
# Inputs of the hetzner_vm module. Everything except `labels` is required.

variable "name" {
  description = "Server name (and hostname)"
  type        = string
}

variable "server_type" {
  description = "Hetzner server type, e.g. cax11 (ARM)"
  type        = string
}

variable "location" {
  description = "Hetzner location, e.g. hel1"
  type        = string
}

variable "image" {
  description = "OS image slug, e.g. debian-13"
  type        = string
}

variable "ansible_ssh_pubkey" {
  description = "Public SSH key provisioned for the ansible user via cloud-init"
  type        = string
}

# These feed the cloud firewall's inbound SSH rule. The firewall filters public
# traffic, so the entries must be public (WAN/egress) addresses, not LAN ones.
variable "ssh_admin_cidrs" {
  description = "Source CIDRs allowed to reach SSH through the cloud firewall (public addresses, e.g. ubongo's WAN/egress IP as a /32)"
  type        = list(string)

  # Reject malformed entries at plan time instead of failing at apply time:
  # cidrhost() errors on anything that is not a valid CIDR block, and can()
  # turns that error into false.
  validation {
    condition     = alltrue([for cidr in var.ssh_admin_cidrs : can(cidrhost(cidr, 0))])
    error_message = "Each ssh_admin_cidrs entry must be a valid CIDR block, e.g. \"203.0.113.10/32\"."
  }
}

variable "labels" {
  description = "Hetzner resource labels (metadata only)"
  type        = map(string)
  default     = {}
}

View file

@ -1,30 +0,0 @@
import json
import pathlib
import subprocess
import sys
_SCRIPT = pathlib.Path(__file__).resolve().parent.parent / "scripts" / "tf_to_inventory.py"
def _run(tf_output: dict) -> str:
    """Feed ``tf_output`` (serialized as JSON) to the converter on stdin; return its stdout.

    ``check=True`` turns a non-zero exit into ``CalledProcessError``.
    """
    completed = subprocess.run(
        [sys.executable, str(_SCRIPT)],
        input=json.dumps(tf_output),
        capture_output=True,
        text=True,
        check=True,
    )
    return completed.stdout
def test_offsite_host_lands_in_offsite_hosts():
    """A VM in group ``offsite_hosts`` appears under that group with its ``ansible_host``."""
    tf_output = {"vms": {"value": {"askari": {"ip": "203.0.113.7", "group": "offsite_hosts"}}}}
    rendered = _run(tf_output)
    for fragment in ("offsite_hosts:", "askari:", "ansible_host: 203.0.113.7"):
        assert fragment in rendered
def test_unknown_group_rejected():
    """A VM with an unrecognized group makes the converter exit 1 and explain on stderr."""
    payload = json.dumps({"vms": {"value": {"x": {"ip": "1.2.3.4", "group": "nope"}}}})
    # No check=True here: the non-zero exit is the behaviour under test.
    result = subprocess.run(
        [sys.executable, str(_SCRIPT)],
        input=payload,
        capture_output=True,
        text=True,
    )
    assert result.returncode == 1
    assert "unknown group" in result.stderr