Compare commits
6 commits
cff368ece2
...
181a02fd3a
| Author | SHA1 | Date | |
|---|---|---|---|
| 181a02fd3a | |||
| 9d787a4f53 | |||
| db1e5db138 | |||
| a111a20cc8 | |||
| deec75de0f | |||
| 22021210c4 |
13 changed files with 161 additions and 17 deletions
8
Makefile
8
Makefile
|
|
@ -44,8 +44,8 @@ help:
|
|||
@echo " make lint Run yamllint + ansible-lint"
|
||||
@echo " make test ROLE=<name> Run Molecule tests for a role"
|
||||
@echo " make test-all Run Molecule tests for all roles"
|
||||
@echo " make check PLAYBOOK=<name> Dry-run a playbook (check mode)"
|
||||
@echo " make deploy PLAYBOOK=<name> Run a playbook against production"
|
||||
@echo " make check PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] Dry-run a playbook (check mode)"
|
||||
@echo " make deploy PLAYBOOK=<name> [LIMIT=<host>] [TAGS=<tags>] Run a playbook against production"
|
||||
@echo " make edit-vault [VAULT=<path>] Edit the vault in nvim (auto re-encrypts + checks)"
|
||||
@echo " make check-vault [VAULT=<path>] Validate vault structure (values masked)"
|
||||
@echo " make encrypt FILE=<path> Encrypt a vault file"
|
||||
|
|
@ -103,13 +103,13 @@ check:
|
|||
ifndef PLAYBOOK
|
||||
$(error PLAYBOOK is required: make check PLAYBOOK=<name>)
|
||||
endif
|
||||
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) --check --diff playbooks/$(PLAYBOOK).yml
|
||||
# Dry-run recipe for `make check`. LIMIT and TAGS are optional; when set they are
# single-quoted so shell metacharacters that are valid in Ansible host patterns
# (`web:&staging`, `all:!db`, `app*`, `host[1:3]`) reach ansible-playbook unmodified
# instead of being interpreted by the shell.
	$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit '$(LIMIT)') $(if $(TAGS),--tags '$(TAGS)') --check --diff playbooks/$(PLAYBOOK).yml
|
||||
|
||||
deploy:
|
||||
ifndef PLAYBOOK
|
||||
$(error PLAYBOOK is required: make deploy PLAYBOOK=<name>)
|
||||
endif
|
||||
$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) playbooks/$(PLAYBOOK).yml
|
||||
# Real-run recipe for `make deploy`. LIMIT and TAGS are optional; when set they are
# single-quoted so shell metacharacters that are valid in Ansible host patterns
# (`web:&staging`, `all:!db`, `app*`, `host[1:3]`) reach ansible-playbook unmodified
# instead of being interpreted by the shell.
	$(PLAYBOOK_BIN) $(INVENTORY) $(VAULT_ARGS) $(if $(LIMIT),--limit '$(LIMIT)') $(if $(TAGS),--tags '$(TAGS)') playbooks/$(PLAYBOOK).yml
|
||||
|
||||
# ── Vault ─────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
|
|
|||
|
|
@ -30,13 +30,13 @@ _Last reviewed: 2026-06-14._
|
|||
| `make check` / `make deploy PLAYBOOK=<name>` | **Works.** First end-to-end run (applying `dev_env`) surfaced + fixed latent bugs: Makefile `PLAYBOOK` var collision (binary path vs playbook-name arg) meant the targets never ran; `ansible.cfg` referenced uninstalled community.general callbacks (now built-in `default` + `ansible.posix.profile_tasks`); `acl` package added so Ansible can `become_user` an unprivileged user. The make targets now function — though `site`/`base`/`docker_host` content is still incomplete (see below). |
|
||||
| `roles/public_dns/` + `playbooks/dns.yml` | **Built + applied.** Manages wingu.me at Gandi LiveDNS as code (`community.general.gandi_livedns`, PAT from `vault.gandi.pat`); record data, anti-spoof baseline (SPF `-all` + DMARC reject), and the Gandi-defaults purge are defined + unit-tested (`tests/test_public_dns.py`). **Applied to wingu.me (2026-06-14):** purged Gandi's 13 seeded defaults; zone now holds only the SPF + DMARC TXT records; idempotent re-run clean. No null-MX (Gandi rejects `0 .`) — the MX is removed, so no MX + no apex A = no mail. M1 of the roadmap. |
|
||||
| `ubongo` — physical control / AI-worker host (ADR-015) | **Built (partial).** Debian 13.5 on a Lenovo M70q (i3-10100T, 16 GB, 256 GB SSD; no disk encryption — accepted risk). Full toolchain installed + pinned to `fisi` (Docker 29.5.3, rbw 1.15.0, Claude Code 2.1.173, ansible-core 2.17.14 + molecule via `make setup`/`make collections`). Repo cloned under a dedicated `claude` user (docker group, no sudo). Vault works via rbw (offline-cache decryption verified). SSH key-only (password + root login disabled). In the production inventory `control` group at 10.20.10.151. **`dev_env` now applied here** (zsh/tmux/nvim for `sjat` + `claude`, via `playbooks/workstation.yml`). Managed as the operator account `sjat` (`group_vars/control` sets `ansible_user: sjat`), not the `ansible` service user `group_vars/all` assumes — ubongo has no bootstrapped `ansible` user. **Pending:** NetBird mesh enrollment (so SSH is LAN-only); full `base` hardening (only the `firewall` concern exists, and it is NOT applied here — applying default-deny with no mesh would lock out inbound SSH on the physical NIC); proper `ansible`-user bootstrap (currently managed as `sjat`); OPNsense DHCP reservation for 10.20.10.151 (MAC `88:a4:c2:e0:ee:da`); Terraform state backup (now relevant — the offsite tfstate exists). |
|
||||
| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **Pending:** `base` hardening (M3), NetBird coordinator (M4), offsite tfstate backup (ADR-022). |
|
||||
| `askari` — off-site Hetzner VPS (ADR-007/016, M2) | **Built + applied.** Provisioned by Terraform (`environments/offsite`, `hetznercloud/hcloud`) as **cx23 / hel1 / Debian 13.5** (CAX11/ARM was out of stock EU-wide on 2026-06-14 → cx23 is same-spec x86, cheaper). cloud-init created the `ansible` user + passwordless sudo; a TF-managed Hetzner Cloud Firewall allows SSH only from ubongo's WAN (`91.226.145.80`). Reachable from ubongo (`ansible offsite_hosts -m ping` ✓), in the `offsite_hosts` inventory (generated `offsite.yml`), published at `askari.wingu.me` → `77.42.120.136`. **SSH-hardened + fail2ban (M3 `hardening` concern applied).** **Pending:** NetBird coordinator (M4), host firewall + mesh enrollment (M5), offsite tfstate backup (ADR-022). |
|
||||
|
||||
## Scaffolded but empty — NOT implemented
|
||||
|
||||
| Thing | State |
|
||||
|---|---|
|
||||
| `roles/base/` | **Partially built.** The `firewall` concern is implemented (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) with pytest + Molecule render/syntax tests. Other concerns (SSH hardening, fail2ban, auditd, packages, users) are **not** built yet, so `make deploy PLAYBOOK=site` has no real content to apply (the make target itself now works — see "Real and working today"). |
|
||||
| `roles/base/` | **Partially built.** Concerns built: `firewall` (nftables: catalog-driven default-deny + east-west allowlist + auto-rollback apply; ADR-020) and **`hardening`** (M3: sshd drop-in key-only + `PermitRootLogin no`, fail2ban sshd jail with role defaults `maxretry` 5 / `bantime` 1h; ADR-002) — both pytest/Molecule-tested. The **`hardening`** concern is **applied to askari** (`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`). The `firewall` concern is built but **not yet applied** to any host (mesh-gated to avoid lockout — M5). Not built: auditd, packages, users (Phase 2 / TODO 15). |
|
||||
| `roles/docker_host/` | **Scaffolded, no tasks.** In git (meta/README/molecule filled), wired into `playbooks/site.yml` so the standard state is expressed end-to-end and `make lint` covers it, but it has no tasks yet — applying it is a no-op. Planned scope (Docker engine + Compose, daemon hardening, `nftables.d` container rules) in ADR-004/ADR-020. |
|
||||
| `inventories/*/hosts.yml` | Structured stubs with empty host maps (`hosts: {}`); regenerated by `make tf-inventory` once Terraform has hosts |
|
||||
| `inventories/production/group_vars/{docker_hosts,proxmox_hosts}/` | Empty dirs |
|
||||
|
|
|
|||
|
|
@ -21,6 +21,20 @@ earning its keep.
|
|||
|
||||
_(append new raw signals here; the next kaizen review consumes them)_
|
||||
|
||||
- `[gotcha]` **A tag on `include_tasks` does NOT reach the included tasks — need
|
||||
`apply: {tags:}`** (2026-06-14): M3's `base/tasks/main.yml` tagged the ssh/fail2ban
|
||||
`include_tasks` with `hardening`, but `make deploy … TAGS=hardening` ran *nothing*
|
||||
(`ok=3 changed=0`) — a tag on a dynamic include selects the include, not its contents.
|
||||
Fix: `include_tasks: {file: x.yml, apply: {tags: [hardening]}}`. The same latent bug sat
|
||||
in the firewall include (never hit — firewall was only ever run untagged). Also the
|
||||
check-mode artifact: a `service`/handler for a not-yet-installed package fails in a
|
||||
first-run `--check` → guard with `when: not ansible_check_mode`. Both caught only by the
|
||||
**live `make check`/`deploy` on askari** — Molecule converges *untagged*, so it can't
|
||||
catch tag-propagation. 3rd reinforcement (after M1 `item.values`, M2 TF
|
||||
`required_providers`) that live execution catches what review + container tests miss.
|
||||
→ when a role uses tags to apply concern-subsets, `apply:` is mandatory on its includes;
|
||||
consider an ansible-lint/CI check that `make deploy … TAGS=<concern>` actually changes things.
|
||||
|
||||
- `[gotcha]` **Terraform child modules need their own `required_providers` for
|
||||
non-hashicorp providers** (2026-06-14): `terraform init` for the `offsite` env failed —
|
||||
the `hetzner_vm` module used `hcloud_*` resources with no `required_providers` block, so
|
||||
|
|
|
|||
|
|
@ -92,17 +92,20 @@ it. Design: `docs/superpowers/specs/2026-06-14-askari-provisioning-design.md`.
|
|||
- **Amends:** ADR-006 (TF scope), ADR-009 (offsite handoff), ADR-020 (Hetzner Cloud
|
||||
Firewall = perimeter), ADR-007/016 (`askari` TF-provisioned, not "added manually").
|
||||
|
||||
### M3 · `base` matured to a "remote-access-sufficient" subset
|
||||
### M3 · `base` matured to a "remote-access-sufficient" subset — ✅ DONE
|
||||
|
||||
Today `base` is firewall-only. Add the subset a real, internet-facing host needs:
|
||||
**SSH hardening + fail2ban + the NetBird agent task**. Full CIS L1/L2, auditd, AppArmor,
|
||||
AIDE are deferred to Phase 2.
|
||||
Added the `hardening` concern to `base` (sshd drop-in key-only + `PermitRootLogin no`;
|
||||
fail2ban sshd jail, role defaults `maxretry` 5 / `bantime` 1h; ADR-002) and **applied it to askari** by tag
|
||||
(`make deploy PLAYBOOK=site LIMIT=askari TAGS=hardening`) — SSH still works, fail2ban
|
||||
active. Full CIS L1/L2, auditd, AppArmor, AIDE remain deferred to Phase 2 (TODO 15).
|
||||
|
||||
- **Why a subset:** `askari` is public (Hetzner) — it must be SSH-hardened and firewalled
|
||||
*with* exposure, but the full hardening standard is not on the critical path to mobile
|
||||
access.
|
||||
- **Maps to:** ADR-002 (security baseline), ADR-016 (agent enrollment lives in `base`),
|
||||
ADR-020 (firewall — already built), TODO 15 (the rest of hardening → Phase 2).
|
||||
- **NetBird agent → M4** (deferred from M3: it enrolls against the coordinator, which
|
||||
doesn't exist until M4 — ADR-016's coordinator-first bootstrap order).
|
||||
- **Host firewall on askari + ubongo hardening → M5** (applying default-deny pre-mesh
|
||||
would lock out SSH; the Hetzner Cloud Firewall is askari's perimeter until then).
|
||||
- **Spec/plan:** `docs/superpowers/{specs,plans}/2026-06-14-base-ssh-fail2ban-m3*`.
|
||||
- **Maps to:** ADR-002 (security baseline), ADR-020 (firewall — built, not yet applied),
|
||||
TODO 15 (the rest of hardening → Phase 2).
|
||||
|
||||
### M4 · NetBird control plane on `askari` — first real service role
|
||||
|
||||
|
|
|
|||
|
|
@ -8,7 +8,8 @@ ansible_python_interpreter: /usr/bin/python3
|
|||
|
||||
# SSH authorised keys — add one entry per person
|
||||
# Format: "ssh-ed25519 AAAA... user@host"
|
||||
base__ssh_authorised_keys: []
|
||||
base__ssh_authorised_keys:
|
||||
- "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIKSx1TFLJ9H8vCe5ZJSu7MYmAiH0/OC8evloQjGR0Bqw claude@ubongo"
|
||||
|
||||
# Timezone
|
||||
base__timezone: Europe/Copenhagen
|
||||
|
|
|
|||
|
|
@ -11,3 +11,12 @@ base__firewall_rollback_timeout: 45 # seconds before the auto-revert fires on a
|
|||
base__firewall_confirm_timeout: 20 # seconds to re-establish a fresh connection post-apply
|
||||
base__firewall_dropin_dir: /etc/nftables.d
|
||||
base__firewall_apply: true # set false to render+validate without applying (CI/Molecule)
|
||||
|
||||
# SSH hardening + fail2ban (ADR-002) — `hardening` concern.
|
||||
base__ssh_password_authentication: "no"
|
||||
base__ssh_permit_root_login: "no"
|
||||
base__fail2ban_maxretry: 5
|
||||
base__fail2ban_bantime: 1h
|
||||
base__fail2ban_findtime: 10m
|
||||
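# The real base__ssh_authorised_keys value is set in group_vars/all/vars.yml (per-person
# control keys). The empty default below only keeps the role self-contained: with an
# empty list the authorized_key task is skipped.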
|
||||
base__ssh_authorised_keys: []
|
||||
|
|
|
|||
|
|
@ -1 +1,13 @@
|
|||
---
# Handlers for the base role's `hardening` concern. Tasks trigger them through the
# lowercase `listen` topics (`reload sshd`, `restart fail2ban`).

- name: Reload sshd
  ansible.builtin.service:
    state: reloaded
    name: ssh
  listen: reload sshd

- name: Restart fail2ban
  # Skipped under --check: on a first run the fail2ban package is not actually
  # installed yet, so there is no service to restart.
  when: not ansible_check_mode
  ansible.builtin.service:
    state: restarted
    name: fail2ban
  listen: restart fail2ban
|
||||
|
|
|
|||
|
|
@ -47,3 +47,18 @@
|
|||
- name: Syntax-check the rendered ruleset (no apply)
|
||||
ansible.builtin.command: nft -c -f /etc/nftables.conf
|
||||
changed_when: false
|
||||
|
||||
# Hardening verification: every check is a read-only command (never reports
# "changed") and fails the verify run through a non-zero exit code.

- name: Sshd drop-in present and config valid
  tags: [verify]
  ansible.builtin.command:
    argv:
      - sshd
      - "-t"
  changed_when: false

- name: PasswordAuthentication is disabled
  tags: [verify]
  ansible.builtin.command:
    argv:
      - grep
      - "-q"
      - '^PasswordAuthentication no'
      - /etc/ssh/sshd_config.d/10-boma.conf
  changed_when: false

- name: Fail2ban sshd jail configured
  tags: [verify]
  ansible.builtin.command:
    argv:
      - grep
      - "-q"
      - '^\[sshd\]'
      - /etc/fail2ban/jail.d/sshd.local
  changed_when: false
|
||||
|
|
|
|||
25
roles/base/tasks/fail2ban.yml
Normal file
25
roles/base/tasks/fail2ban.yml
Normal file
|
|
@ -0,0 +1,25 @@
|
|||
---
# fail2ban for the `hardening` concern: install the package, render the sshd jail,
# then make sure the service is enabled and running.

- name: Install fail2ban
  ansible.builtin.apt:
    update_cache: true
    state: present
    name: fail2ban

- name: Configure the sshd jail
  notify: restart fail2ban
  ansible.builtin.template:
    dest: /etc/fail2ban/jail.d/sshd.local
    src: fail2ban_sshd.local.j2
    mode: "0644"
    group: root
    owner: root

# Check-mode guard: during --check on a host that does not have fail2ban yet, the
# install above is only simulated, so looking up the service would fail. The install
# and the jail template are still previewed; a real deploy installs, then starts it.
- name: Enable and start fail2ban
  when: not ansible_check_mode
  ansible.builtin.service:
    name: fail2ban
    state: started
    enabled: true
|
||||
|
|
@ -1,4 +1,24 @@
|
|||
---
|
||||
# `apply: tags:` propagates the concern tag to the INCLUDED tasks — without it a tag on
|
||||
# a dynamic include_tasks only selects the include itself, not its contents, so
|
||||
# `--tags <concern>` would run nothing (Ansible gotcha).
|
||||
- name: Configure host firewall (nftables)
|
||||
ansible.builtin.include_tasks: firewall.yml
|
||||
ansible.builtin.include_tasks:
|
||||
file: firewall.yml
|
||||
apply:
|
||||
tags: [firewall]
|
||||
tags: [firewall]
|
||||
|
||||
- name: SSH hardening
|
||||
ansible.builtin.include_tasks:
|
||||
file: ssh.yml
|
||||
apply:
|
||||
tags: [hardening]
|
||||
tags: [hardening]
|
||||
|
||||
- name: Fail2ban intrusion deterrence
|
||||
ansible.builtin.include_tasks:
|
||||
file: fail2ban.yml
|
||||
apply:
|
||||
tags: [hardening]
|
||||
tags: [hardening]
|
||||
|
|
|
|||
34
roles/base/tasks/ssh.yml
Normal file
34
roles/base/tasks/ssh.yml
Normal file
|
|
@ -0,0 +1,34 @@
|
|||
---
# SSH hardening for the `hardening` concern of the base role (ADR-002).
#
# Safety ordering: the drop-in is syntax-checked BEFORE it replaces the file on disk
# (template `validate:`), so a bad variable value can never leave an sshd config that
# refuses to load on the next restart. The full-config check afterwards still catches
# conflicts between the drop-in and the rest of sshd_config. sshd itself is only
# reloaded by the `reload sshd` handler.

- name: Ensure openssh-server is installed
  ansible.builtin.apt:
    name: openssh-server
    state: present
    update_cache: true

# Must exist before any `sshd -t` run (including the template validation below);
# minimal/container images may not have it until the service has started once.
- name: Ensure sshd privilege-separation directory exists (required for sshd -t)
  ansible.builtin.file:
    path: /run/sshd
    state: directory
    owner: root
    group: root
    mode: "0755"

- name: Render hardened sshd drop-in
  ansible.builtin.template:
    src: sshd_hardening.conf.j2
    dest: /etc/ssh/sshd_config.d/10-boma.conf
    owner: root
    group: root
    mode: "0644"
    # %s is the rendered temp file; the destination is only replaced if this passes.
    validate: /usr/sbin/sshd -t -f %s
  notify: reload sshd

- name: Validate the full sshd config (drop-in included)
  ansible.builtin.command: sshd -t
  changed_when: false

# `exclusive: true` removes any key not listed in base__ssh_authorised_keys, so the
# task is skipped entirely when the list is empty rather than wiping all keys.
- name: Authorise control SSH keys for the ansible user
  ansible.posix.authorized_key:
    user: "{{ ansible_user | default('ansible') }}"
    key: "{{ base__ssh_authorised_keys | join('\n') }}"
    exclusive: true
  when: base__ssh_authorised_keys | length > 0
|
||||
6
roles/base/templates/fail2ban_sshd.local.j2
Normal file
6
roles/base/templates/fail2ban_sshd.local.j2
Normal file
|
|
@ -0,0 +1,6 @@
|
|||
# Managed by Ansible (base role, ADR-002).
|
||||
[sshd]
|
||||
enabled = true
|
||||
maxretry = {{ base__fail2ban_maxretry }}
|
||||
bantime = {{ base__fail2ban_bantime }}
|
||||
findtime = {{ base__fail2ban_findtime }}
|
||||
5
roles/base/templates/sshd_hardening.conf.j2
Normal file
5
roles/base/templates/sshd_hardening.conf.j2
Normal file
|
|
@ -0,0 +1,5 @@
|
|||
# Managed by Ansible (base role, ADR-002). Do not edit on the host.
|
||||
PasswordAuthentication {{ base__ssh_password_authentication }}
|
||||
PermitRootLogin {{ base__ssh_permit_root_login }}
|
||||
PubkeyAuthentication yes
|
||||
KbdInteractiveAuthentication no
|
||||
Loading…
Add table
Reference in a new issue