Compare commits

..

No commits in common. "181a02fd3a85d23128a2da2b3f3113a5f5eee461" and "cff368ece2ba318be447aae3e8c2878a00b50700" have entirely different histories.

13 changed files with 17 additions and 161 deletions

View file

@@ -44,8 +44,8 @@ help:
@echo " make lint Run yamllint + ansible-lint" @echo " make lint Run yamllint + ansible-lint"
@echo " make test ROLE=<name> Run Molecule tests for a role" @echo " make test ROLE=<name> Run Molecule tests for a role"
@echo " make test-all Run Molecule tests for all roles" @echo " make test-all Run Molecule tests for all roles"
@echo " make check PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] Dry-run a playbook (check mode)" @echo " make check PLAYBOOK=<name> Dry-run a playbook (check mode)"
@echo " make deploy PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] Run a playbook against production" @echo " make deploy PLAYBOOK=<name> Run a playbook against production"
@echo " make edit-vault [VAULT=<path>] Edit the vault in nvim (auto re-encrypts + checks)" @echo " make edit-vault [VAULT=<path>] Edit the vault in nvim (auto re-encrypts + checks)"
@echo " make check-vault [VAULT=<path>] Validate vault structure (values masked)" @echo " make check-vault [VAULT=<path>] Validate vault structure (values masked)"
@echo " make encrypt FILE=<path> Encrypt a vault file" @echo " make encrypt FILE=<path> Encrypt a vault file"
@@ -103,13 +103,13 @@ check:
ifndef PLAYBOOK ifndef PLAYBOOK
$(error PLAYBOOK is required: make check PLAYBOOK=<name>) $(error PLAYBOOK is required: make check PLAYBOOK=<name>)
endif endif
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) --check --diff playbooks/$(PLAYBOOK).yml $(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) --check --diff playbooks/$(PLAYBOOK).yml
deploy: deploy:
ifndef PLAYBOOK ifndef PLAYBOOK
$(error PLAYBOOK is required: make deploy PLAYBOOK=<name>) $(error PLAYBOOK is required: make deploy PLAYBOOK=<name>)
endif endif
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit $(LIMIT)) $(if $(TAGS),--tags $(TAGS)) playbooks/$(PLAYBOOK).yml $(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) playbooks/$(PLAYBOOK).yml
# ── Vault ───────────────────────────────────────────────────────────────────── # ── Vault ─────────────────────────────────────────────────────────────────────

View file

@@ -30,13 +30,13 @@ _Last reviewed: 2026-06-14._
| `make check` / `make deploy PLAYBOOK=<name>` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). | | `make check` / `make deploy PLAYBOOK=<name>` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). |
| `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. | | `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. |
| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **Pending:** NetBird mesh enrollment (so SSH is LAN-only); full `base` hardening (only the `firewall` concern exists, and it is NOT applied here — applying default-deny with no mesh would lock out inbound SSH on the physical NIC); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). | | `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). 
Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **Pending:** NetBird mesh enrollment (so SSH is LAN-only); full `base` hardening (only the `firewall` concern exists, and it is NOT applied here — applying default-deny with no mesh would lock out inbound SSH on the physical NIC); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). |
| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3 `hardening` concern applied).** **Pending:** NetBird coordinator (M4), host firewall + mesh enrollment (M5), offsite tfstate backup (ADR-022). | | `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **Pending:** `base` hardening (M3), NetBird coordinator (M4), offsite tfstate backup (ADR-022). |
## Scaffolded but empty — NOT implemented ## Scaffolded but empty — NOT implemented
| Thing | State | | Thing | State |
|---|---| |---|---|
| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail 5/1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is built but **not yet applied** to any host (mesh-gated to avoid lockout — M5). Not built: auditd, packages, users (Phase 2 / TODO 15). | | `roles/base/` | **Partially built.** The `firewall` concern is implemented (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) with pytest + Molecule render/syntax tests. Other concerns (SSH hardening, fail2ban, auditd, packages, users) are **not** built yet, so `make deploy PLAYBOOK=site` has no real content to apply (the make target itself now works — see "Real and working today"). |
| `roles/docker_host/` | **Scaffolded, no tasks.** In git (meta/README/molecule filled), wired into `playbooks/site.yml` so the standard state is expressed end-to-end and `make lint` covers it, but it has no tasks yet — applying it is a no-op. Planned scope (Docker engine + Compose, daemon hardening, `nftables.d` container rules) in ADR-004/ADR-020. | | `roles/docker_host/` | **Scaffolded, no tasks.** In git (meta/README/molecule filled), wired into `playbooks/site.yml` so the standard state is expressed end-to-end and `make lint` covers it, but it has no tasks yet — applying it is a no-op. Planned scope (Docker engine + Compose, daemon hardening, `nftables.d` container rules) in ADR-004/ADR-020. |
| `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts | | `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts |
| `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs | | `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs |

View file

@@ -21,20 +21,6 @@ earning its keep.
_(append new raw signals here; the next kaizen review consumes them)_ _(append new raw signals here; the next kaizen review consumes them)_
- `[gotcha]` **A tag on `include_tasks` does NOT reach the included tasks — need
`apply: {tags:}`** (2026-06-14): M3's `base/tasks/main.yml` tagged the ssh/fail2ban
`include_tasks` with `hardening`, but `make deploy … TAGS=hardening` ran *nothing*
(`ok=3 changed=0`) — a tag on a dynamic include selects the include, not its contents.
Fix: `include_tasks: {file: x.yml, apply: {tags: [hardening]}}`. The same latent bug sat
in the firewall include (never hit — firewall was only ever run untagged). Also the
check-mode artifact: a `service`/handler for a not-yet-installed package fails in a
first-run `--check` → guard with `when: not ansible_check_mode`. Both caught only by the
**live `make check`/`deploy` on askari** — Molecule converges *untagged*, so it can't
catch tag-propagation. 3rd reinforcement (after M1 `item.values`, M2 TF
`required_providers`) that live execution catches what review + container tests miss.
→ when a role uses tags to apply concern-subsets, `apply:` is mandatory on its includes;
consider an ansible-lint/CI check that `make deploy … TAGS=<concern>` actually changes things.
- `[gotcha]` **Terraform child modules need their own `required_providers` for - `[gotcha]` **Terraform child modules need their own `required_providers` for
non-hashicorp providers** (2026-06-14): `terraform init` for the `offsite` env failed — non-hashicorp providers** (2026-06-14): `terraform init` for the `offsite` env failed —
the `hetzner_vm` module used `hcloud_*` resources with no `required_providers` block, so the `hetzner_vm` module used `hcloud_*` resources with no `required_providers` block, so

View file

@@ -92,20 +92,17 @@ it. Design: `docs/superpowers/specs/2026-06-14-askari-provisioning-design.md`.
- **Amends:** ADR-006 (TF scope), ADR-009 (offsite handoff), ADR-020 (Hetzner Cloud - **Amends:** ADR-006 (TF scope), ADR-009 (offsite handoff), ADR-020 (Hetzner Cloud
Firewall = perimeter), ADR-007/016 (`askari` TF-provisioned, not "added manually"). Firewall = perimeter), ADR-007/016 (`askari` TF-provisioned, not "added manually").
### M3 · `base` matured to a "remote-access-sufficient" subset — ✅ DONE ### M3 · `base` matured to a "remote-access-sufficient" subset
Added the `hardening` concern to `base` (sshd drop-in key-only + `PermitRootLogin no`; Today `base` is firewall-only. Add the subset a real, internet-facing host needs:
fail2ban sshd jail 5/1h; ADR-002) and **applied it to askari** by tag **SSH hardening + fail2ban + the NetBird agent task**. Full CIS L1/L2, auditd, AppArmor,
(`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`) — SSH still works, fail2ban AIDE are deferred to Phase 2.
active. Full CIS L1/L2, auditd, AppArmor, AIDE remain deferred to Phase 2 (TODO 15).
- **NetBird agent → M4** (deferred from M3: it enrolls against the coordinator, which - **Why a subset:** `askari` is public (Hetzner) — it must be SSH-hardened and firewalled
doesn't exist until M4 — ADR-016's coordinator-first bootstrap order). *with* exposure, but the full hardening standard is not on the critical path to mobile
- **Host firewall on askari + ubongo hardening → M5** (applying default-deny pre-mesh access.
would lock out SSH; the Hetzner Cloud Firewall is askari's perimeter until then). - **Maps to:** ADR-002 (security baseline), ADR-016 (agent enrollment lives in `base`),
- **Spec/plan:** `docs/superpowers/{specs,plans}/2026-06-14-base-ssh-fail2ban-m3*`. ADR-020 (firewall — already built), TODO 15 (the rest of hardening → Phase 2).
- **Maps to:** ADR-002 (security baseline), ADR-020 (firewall — built, not yet applied),
TODO 15 (the rest of hardening → Phase 2).
### M4 · NetBird control plane on `askari` — first real service role ### M4 · NetBird control plane on `askari` — first real service role

View file

@@ -8,8 +8,7 @@ ansible_python_interpreter: /usr/bin/python3
# SSH authorised keys — add one entry per person # SSH authorised keys — add one entry per person
# Format: "ssh-ed25519 AAAA... user@host" # Format: "ssh-ed25519 AAAA... user@host"
base__ssh_authorised_keys: base__ssh_authorised_keys: []
- "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKSx1TFLJ9H8vCe5ZJSu7MYmAiH0/OC8evloQjGR0Bqw claude@ubongo"
# Timezone # Timezone
base__timezone: Europe/Copenhagen base__timezone: Europe/Copenhagen

View file

@@ -11,12 +11,3 @@ base__firewall_rollback_timeout: 45 # seconds before the auto-revert fires on a
base__firewall_confirm_timeout: 20 # seconds to re-establish a fresh connection post-apply base__firewall_confirm_timeout: 20 # seconds to re-establish a fresh connection post-apply
base__firewall_dropin_dir: /etc/nftables.d base__firewall_dropin_dir: /etc/nftables.d
base__firewall_apply: true # set false to render+validate without applying (CI/Molecule) base__firewall_apply: true # set false to render+validate without applying (CI/Molecule)
# SSH hardening + fail2ban (ADR-002) — `hardening` concern.
base__ssh_password_authentication: "no"
base__ssh_permit_root_login: "no"
base__fail2ban_maxretry: 5
base__fail2ban_bantime: 1h
base__fail2ban_findtime: 10m
# base__ssh_authorised_keys lives in group_vars/all/vars.yml (per-person control keys).
base__ssh_authorised_keys: []

View file

@@ -1,13 +1 @@
--- ---
- name: Reload sshd
listen: reload sshd
ansible.builtin.service:
name: ssh
state: reloaded
- name: Restart fail2ban
listen: restart fail2ban
ansible.builtin.service:
name: fail2ban
state: restarted
when: not ansible_check_mode # fail2ban isn't installed during a first-run --check

View file

@@ -47,18 +47,3 @@
- name: Syntax-check the rendered ruleset (no apply) - name: Syntax-check the rendered ruleset (no apply)
ansible.builtin.command: nft -c -f /etc/nftables.conf ansible.builtin.command: nft -c -f /etc/nftables.conf
changed_when: false changed_when: false
- name: Sshd drop-in present and config valid
ansible.builtin.command: sshd -t
changed_when: false
tags: [verify]
- name: PasswordAuthentication is disabled
ansible.builtin.command: grep -q '^PasswordAuthentication no' /etc/ssh/sshd_config.d/10-boma.conf
changed_when: false
tags: [verify]
- name: Fail2ban sshd jail configured
ansible.builtin.command: grep -q '^\[sshd\]' /etc/fail2ban/jail.d/sshd.local
changed_when: false
tags: [verify]

View file

@@ -1,25 +0,0 @@
---
- name: Install fail2ban
ansible.builtin.apt:
name: fail2ban
state: present
update_cache: true
- name: Configure the sshd jail
ansible.builtin.template:
src: fail2ban_sshd.local.j2
dest: /etc/fail2ban/jail.d/sshd.local
owner: root
group: root
mode: "0644"
notify: restart fail2ban
- name: Enable and start fail2ban
ansible.builtin.service:
name: fail2ban
enabled: true
state: started
# In --check on a host without fail2ban yet, the package isn't really installed, so the
# service lookup fails. Skip the start in check mode (the install + jail are still
# previewed); a real deploy installs then starts it.
when: not ansible_check_mode

View file

@@ -1,24 +1,4 @@
--- ---
# `apply: tags:` propagates the concern tag to the INCLUDED tasks — without it a tag on
# a dynamic include_tasks only selects the include itself, not its contents, so
# `--tags <concern>` would run nothing (Ansible gotcha).
- name: Configure host firewall (nftables) - name: Configure host firewall (nftables)
ansible.builtin.include_tasks: ansible.builtin.include_tasks: firewall.yml
file: firewall.yml
apply:
tags: [firewall]
tags: [firewall] tags: [firewall]
- name: SSH hardening
ansible.builtin.include_tasks:
file: ssh.yml
apply:
tags: [hardening]
tags: [hardening]
- name: Fail2ban intrusion deterrence
ansible.builtin.include_tasks:
file: fail2ban.yml
apply:
tags: [hardening]
tags: [hardening]

View file

@@ -1,34 +0,0 @@
---
- name: Ensure openssh-server is installed
ansible.builtin.apt:
name: openssh-server
state: present
update_cache: true
- name: Render hardened sshd drop-in
ansible.builtin.template:
src: sshd_hardening.conf.j2
dest: /etc/ssh/sshd_config.d/10-boma.conf
owner: root
group: root
mode: "0644"
notify: reload sshd
- name: Ensure sshd privilege-separation directory exists (required for sshd -t)
ansible.builtin.file:
path: /run/sshd
state: directory
owner: root
group: root
mode: "0755"
- name: Validate the full sshd config (drop-in included)
ansible.builtin.command: sshd -t
changed_when: false
- name: Authorise control SSH keys for the ansible user
ansible.posix.authorized_key:
user: "{{ ansible_user | default('ansible') }}"
key: "{{ base__ssh_authorised_keys | join('\n') }}"
exclusive: true
when: base__ssh_authorised_keys | length > 0

View file

@@ -1,6 +0,0 @@
# Managed by Ansible (base role, ADR-002).
[sshd]
enabled = true
maxretry = {{ base__fail2ban_maxretry }}
bantime = {{ base__fail2ban_bantime }}
findtime = {{ base__fail2ban_findtime }}

View file

@@ -1,5 +0,0 @@
# Managed by Ansible (base role, ADR-002). Do not edit on the host.
PasswordAuthentication {{ base__ssh_password_authentication }}
PermitRootLogin {{ base__ssh_permit_root_login }}
PubkeyAuthentication yes
KbdInteractiveAuthentication no