Compare commits
3 commits
9be4366ac3
...
4116286ed0
| Author | SHA1 | Date | |
|---|---|---|---|
| 4116286ed0 | |||
| 91713127cb | |||
| 2dbcac11a0 |
8 changed files with 154 additions and 132 deletions
46
.claude/hooks/guard-execution-mode-menu.sh
Executable file
46
.claude/hooks/guard-execution-mode-menu.sh
Executable file
|
|
@ -0,0 +1,46 @@
|
||||||
|
#!/usr/bin/env bash
#
# Stop guard: block ending the turn when the assistant's final message presents the
# execution-mode menu. The writing-plans / subagent-driven-development skills script a
# "Subagent-Driven vs Inline Execution — which approach?" menu at the plan→execution
# handoff. boma's standing preference (docs/FRICTION.md + the
# always-subagent-driven-execution memory) is to NEVER present it and proceed
# subagent-driven. Prose reminders failed four times (06-05/06/09/10); this is the
# mechanical guard recorded by the 2026-06-10 kaizen review.
#
# Input  (stdin):  the Stop-hook JSON payload; only .stop_hook_active and
#                  .transcript_path are read.
# Output (stdout): a {"decision":"block",...} object when the menu is detected,
#                  otherwise nothing.
# Exit status:     always 0.
#
# Fails OPEN: any parse/read problem → allow the stop. Respects stop_hook_active so a
# block can never loop. The match signature is deliberately tight ("inline execution"
# AND "which approach"/"two execution options") so it fires on the actual menu, not on
# meta-discussion of it.
#
# Portability: lower-casing is done inside jq (ascii_downcase) rather than with the
# bash-4-only ${var,,} expansion, so the script also runs on bash 3.2 instead of
# aborting with "bad substitution".
#
set -uo pipefail

input=$(cat 2>/dev/null) || exit 0

# Loop guard: if we already blocked once for this stop, let it through.
active=$(printf '%s' "$input" | jq -r '.stop_hook_active // false' 2>/dev/null) || exit 0
if [ "$active" = "true" ]; then
  exit 0
fi

# No transcript path, or an unreadable transcript → nothing to inspect; allow the stop.
transcript=$(printf '%s' "$input" | jq -r '.transcript_path // empty' 2>/dev/null) || exit 0
if [ -z "$transcript" ] || [ ! -r "$transcript" ]; then
  exit 0
fi

# Last assistant message's text blocks, joined and lower-cased.
# -s slurps all transcript entries into one array so `last` can pick the final
# assistant entry. The type=="object" guards skip stray non-object values instead of
# letting them abort jq (which would silently disable the guard).
text=$(jq -rs '
  ([ .[] | select(type == "object" and .type == "assistant") ] | last) as $a
  | ($a.message.content // [])
  | if type == "array" then
      [ .[] | select(type == "object" and .type == "text") | .text ] | join("\n")
    elif type == "string" then .
    else "" end
  | ascii_downcase
' "$transcript" 2>/dev/null) || exit 0

# Menu signature: "inline execution" together with one of the two question phrasings.
if [[ "$text" == *"inline execution"* \
      && ( "$text" == *"which approach"* || "$text" == *"two execution options"* ) ]]; then
  cat <<'JSON'
{"decision":"block","reason":"Execution-mode menu detected in your final message. boma standing preference (docs/FRICTION.md + always-subagent-driven-execution memory): never present the subagent-driven-vs-inline menu. Drop the menu and proceed with subagent-driven execution directly (superpowers:subagent-driven-development)."}
JSON
fi

exit 0
|
||||||
|
|
@ -56,6 +56,18 @@
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
],
|
||||||
|
"Stop": [
|
||||||
|
{
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "bash \"${CLAUDE_PROJECT_DIR:-.}/.claude/hooks/guard-execution-mode-menu.sh\"",
|
||||||
|
"timeout": 10,
|
||||||
|
"statusMessage": "Checking for execution-mode menu"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -19,6 +19,15 @@ repos:
|
||||||
rev: v24.12.2 # keep in sync with requirements.txt
|
rev: v24.12.2 # keep in sync with requirements.txt
|
||||||
hooks:
|
hooks:
|
||||||
- id: ansible-lint
|
- id: ansible-lint
|
||||||
|
# Only run on Ansible content. ansible-lint loads the play context, which
|
||||||
|
# auto-decrypts inventories/*/group_vars/all/vault.yml via the wired
|
||||||
|
# vault_password_file (→ rbw) — so it needs `rbw unlock`. The upstream hook is
|
||||||
|
# always_run+pass_filenames:false (lints the whole project, every commit); we
|
||||||
|
# override always_run:false and add a files filter so docs-/config-only commits
|
||||||
|
# skip it (no vault needed). pass_filenames stays false → still a project lint
|
||||||
|
# when any Ansible file is staged.
|
||||||
|
always_run: false
|
||||||
|
files: ^(roles|playbooks|inventories)/.*\.ya?ml$
|
||||||
additional_dependencies:
|
additional_dependencies:
|
||||||
- ansible-core==2.17.* # pin (not >=) — keep in sync with requirements.txt
|
- ansible-core==2.17.* # pin (not >=) — keep in sync with requirements.txt
|
||||||
|
|
||||||
|
|
|
||||||
4
Makefile
4
Makefile
|
|
@ -75,12 +75,12 @@ test:
|
||||||
ifndef ROLE
|
ifndef ROLE
|
||||||
$(error ROLE is required: make test ROLE=<rolename>)
|
$(error ROLE is required: make test ROLE=<rolename>)
|
||||||
endif
|
endif
|
||||||
cd roles/$(ROLE) && ../../$(MOLECULE) test
|
cd roles/$(ROLE) && PATH="$(CURDIR)/$(VENV)/bin:$$PATH" molecule test
|
||||||
|
|
||||||
# Run `molecule test` for every role under roles/, with the venv's bin directory
# prepended to PATH (molecule shells out to ansible-config/ansible-playbook by bare name).
# Each role runs in a subshell, so a failed `cd` or test cannot leave the loop in the
# wrong directory. Every role is attempted; the target exits non-zero and lists the
# failing roles if any of them failed, instead of always reporting success.
test-all:
	@failed=""; \
	for role in roles/*/; do \
		echo "── Testing $$role ──"; \
		( cd "$$role" && PATH="$(CURDIR)/$(VENV)/bin:$$PATH" molecule test ) \
			|| failed="$$failed $$role"; \
	done; \
	if [ -n "$$failed" ]; then \
		echo "Molecule failed for:$$failed" >&2; \
		exit 1; \
	fi
|
||||||
|
|
||||||
# ── Playbook execution ────────────────────────────────────────────────────────
|
# ── Playbook execution ────────────────────────────────────────────────────────
|
||||||
|
|
|
||||||
159
docs/FRICTION.md
159
docs/FRICTION.md
|
|
@ -4,10 +4,11 @@ Raw signals for the periodic **kaizen review** (the methodology retrospective; s
|
||||||
`docs/TODO.md`). This is the input that keeps our tooling and conventions sharpening
|
`docs/TODO.md`). This is the input that keeps our tooling and conventions sharpening
|
||||||
over time instead of only accreting.
|
over time instead of only accreting.
|
||||||
|
|
||||||
**How to use:** append freely _during_ work — don't curate, don't fix here. Capture
|
**How to use:** append freely _during_ work under **Open signals** — don't curate,
|
||||||
friction, surprises, fixes that keep recurring, and tooling that isn't earning its
|
don't fix there. Capture friction, surprises, fixes that keep recurring, and tooling
|
||||||
keep. The kaizen review reads this, then proposes **add / change / remove** (biased
|
that isn't earning its keep. The kaizen review reads this, then proposes
|
||||||
toward _remove_) and records the decisions as ADRs.
|
**add / change / remove** (biased toward _remove_), migrates durable knowledge into the
|
||||||
|
right docs, and moves consumed signals into the **decisions ledger** below.
|
||||||
|
|
||||||
**Entry format:** `date — [tag] observation — (optional) → systematization idea`
|
**Entry format:** `date — [tag] observation — (optional) → systematization idea`
|
||||||
Tags: `[friction]` recurring annoyance · `[gotcha]` surprising behaviour ·
|
Tags: `[friction]` recurring annoyance · `[gotcha]` surprising behaviour ·
|
||||||
|
|
@ -16,137 +17,35 @@ earning its keep.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 2026-05-30 — initial seed (from the Claude-Code setup session)
|
## Open signals
|
||||||
|
|
||||||
- `[recurring]` Every `git commit` needs `rbw` unlocked (the pre-commit ansible-lint
|
_(append new raw signals here; the next kaizen review consumes them)_
|
||||||
hook decrypts `vault.yml` for its syntax-check). Mitigated with a 5h lock timeout
|
|
||||||
and an `rbw unlocked` pre-flight convention. → _Open:_ could ansible-lint skip vault
|
|
||||||
decryption for syntax-check, so committing doesn't need the vault at all?
|
|
||||||
- `[gotcha]` pre-commit stashes _unstaged_ changes before running hooks, so a partial
|
|
||||||
commit reverted an interdependent file (`ansible.cfg`) and failed. → Commit
|
|
||||||
interdependent changes together, or stage the config change first.
|
|
||||||
- `[gotcha]` `make new-role` had never worked on this host: `mkdir {a,b,c}` brace
|
|
||||||
expansion fails under `/bin/sh` (dash). Fixed with explicit paths. → A real run
|
|
||||||
catches what static review can't; consider smoke-testing scaffold commands.
|
|
||||||
- `[gotcha]` `rbw sync` is required after adding a Vaultwarden item before `rbw get`
|
|
||||||
finds it (stale local cache).
|
|
||||||
- `[gotcha]` This shell is zsh — unquoted `$VAR` does not word-split, so a variable
|
|
||||||
holding a file list was passed as a single argument. → Use explicit args/arrays.
|
|
||||||
- `[friction]` Long sessions: I make a batch of edits but can't commit until you
|
|
||||||
`rbw unlock`. The 5h timeout + pre-flight check address the symptom; watch whether
|
|
||||||
it still bites.
|
|
||||||
- `[gotcha]` Hooks (or any new `.claude/settings.json`) added mid-session don't
|
|
||||||
activate until a Claude Code **restart** — the settings watcher only tracks settings
|
|
||||||
files that existed at session start. Opening `/hooks` and dismissing did _not_ load
|
|
||||||
them. → Fresh sessions load them normally; restart after adding hooks.
|
|
||||||
|
|
||||||
## 2026-05-31
|
- `[friction]` **ADR-writing policy is unsettled** (2026-05-31): drafting an ADR, I
|
||||||
|
invented a Status header ("Proposed") on the fly because there's no documented
|
||||||
|
convention for how we write ADRs (status lifecycle, required sections). → TODO 10.2 —
|
||||||
|
decide a minimal ADR template / status convention.
|
||||||
|
|
||||||
- I asked to draft an ADR and got: No formal status-header convention, but since this is a draft for discussion I'll mark it Proposed so it isn't mistaken for an
|
---
|
||||||
accepted decision. Here's the draft.
|
|
||||||
|
|
||||||
## 2026-06-01
|
## Kaizen reviews — decisions ledger
|
||||||
|
|
||||||
- `[friction]` The `finishing-a-development-branch` flow (and generic AI/dev tooling)
|
Consumed signals and where their resolution now lives. Newest first.
|
||||||
offers "push and open a Pull Request," but our Forgejo `origin` is trunk-based with
|
|
||||||
no merge-request / approval gate (CLAUDE.md git conventions). That option doesn't
|
|
||||||
apply — the real path is local fast-forward merge to `main`, then push. → Skills and
|
|
||||||
conventions that assume a GitHub-style PR workflow need a homelab-aware variant;
|
|
||||||
encode that here "finishing a branch" means merge-locally-then-push, not open-a-PR.
|
|
||||||
|
|
||||||
## 2026-06-05
|
### 2026-06-10
|
||||||
|
|
||||||
- `[recurring]` The `writing-plans` skill ends by asking "subagent-driven vs inline
|
| Signal (first seen) | Verdict | Resolution / where it lives now |
|
||||||
execution?" — always answer subagent-driven here. Don't ask; default straight to
|
|---|---|---|
|
||||||
subagent-driven (fresh subagent per task + review between tasks). → Standing
|
| Execution-mode menu asked at plan handoff — 4× (06-05/06/09/10) | CHANGE → mechanical | Stop hook (`.claude/hooks/guard-execution-mode-menu.sh`, registered under `Stop` in `.claude/settings.json`) blocks the turn if the menu appears and tells me to proceed subagent-driven. Prose reminders (CLAUDE.md, memory, 3 FRICTION entries) had failed four times — the lesson is that a behaviour conflicting with an external skill's script needs a *mechanical* guard, not another note. |
|
||||||
preference; skip the execution-mode prompt.
|
| Every `git commit` needs `rbw` unlock — recurring (05-30) | CHANGE | Root cause was **not** the vault syntax-check (`.ansible-lint` already excludes `vault.yml`); it was ansible-lint auto-loading + decrypting `inventories/production/group_vars/all/vault.yml` via the wired `vault_password_file`. Scoped the pre-commit `ansible-lint` hook (`always_run: false` + `files:` ansible content) so **docs-/config-only commits skip it and need no vault**. Ansible-content commits still need `rbw` (intrinsic to linting vault-backed plays; accepted). |
|
||||||
- `[recurring]` When a **deferred** decision later resolves, docs that referenced the
|
| `make test` fails when run non-activated — `ansible-config` not found (06-06) | CHANGE | `Makefile` `test`/`test-all` now prepend `$(CURDIR)/.venv/bin` to `PATH`. |
|
||||||
deferral go stale and a plan's file-map can miss them (e.g. resolving the mesh-VPN
|
| Molecule image missing from the Forgejo registry (06-06) | already built | `make molecule-image-push` target exists. |
|
||||||
choice left `new-host.md` still saying "mesh VPN (choice deferred)"; the ubongo work
|
| Deferred decision goes stale across docs — 3× (06-05) | already built | `scripts/repo-scan.py` `open-deferred-item` / `stale-deferred` checks, run by `/review-repo`. |
|
||||||
similarly left a contradiction in CLAUDE.md). A _broadened_ final grep sweep caught
|
| `make new-role` brace-expansion fails under dash (05-30) | fixed | Explicit paths in the Makefile target. |
|
||||||
both. → On resolving a deferred decision, grep all canonical docs for the deferral
|
| nft `iif` vs `iifname`, Molecule `ansible_host`, apply-path coverage blind spot, render-`nft -c` pattern (06-06) | MIGRATE | → `docs/testing/gotchas.md` (pointer from ADR-008). |
|
||||||
language ("choice deferred", "pending", "TBD", the placeholder's name) and reconcile
|
| hooks-need-restart, pre-commit stashes unstaged, `rbw sync` stale cache, zsh word-split (05-30) | MIGRATE | → `docs/runbooks/claude-code-setup.md` "Environment gotchas". |
|
||||||
every hit — don't rely on the plan's file-map alone. Worth a `/review-repo` check for
|
| `finishing-a-development-branch` offers open-a-PR vs our trunk-based merge (06-01) | accepted | Same root cause as the menu ask (external skill script vs boma convention). CLAUDE.md already mandates trunk-based merge-to-main. No mechanical guard yet: the execution-mode Stop hook does not match this prompt, so this one relies on awareness for now. Revisit if it recurs — the fix would be a sibling Stop guard. |
|
||||||
lingering "deferred/pending/TBD" references whose ADR has since resolved.
|
|
||||||
- **Recurred a 3rd time (same day):** ADR-017 resolved the browser-E2E harness but
|
|
||||||
left ADR-015's own "Deferred" list item #2 still reading as open — not caught by the
|
|
||||||
ADR-017 plan's sweep (which only checked for _its own_ placeholder language), only
|
|
||||||
by a later STATUS pass. Lesson sharpened: the stale reference often lives in the
|
|
||||||
**originating ADR's Deferred section**, which the resolving ADR's plan won't think
|
|
||||||
to grep. → When an ADR resolves another ADR's deferred item, edit that **source
|
|
||||||
ADR's Deferred list** in the same change. Three hits now — promote from "worth a
|
|
||||||
check" to **build it**: a `/review-repo` rule flagging any ADR "Deferred/Open" entry
|
|
||||||
whose subject is named as RESOLVED/DECIDED elsewhere.
|
|
||||||
|
|
||||||
## 2026-06-06
|
**Process note:** the `/retro` tool (TODO 11) still isn't built, so this review was
|
||||||
|
manual. Curating by hand (migrate durable knowledge → docs, archive consumed signals →
|
||||||
- `[recurring]` **Asked the execution-mode question AGAIN** ("subagent-driven vs inline —
|
this ledger) worked well; fold that curation step into `/retro` when it's built.
|
||||||
which approach?") at the end of `writing-plans`, despite the 2026-06-05 standing
|
|
||||||
preference _and_ the `always-subagent-driven-execution` memory both saying don't ask.
|
|
||||||
Root cause: the `writing-plans` skill's "Execution Handoff" step scripts the menu, and
|
|
||||||
I followed the skill text over the user's standing override. Second occurrence →
|
|
||||||
escalate from "skip the prompt" to a **hard rule**: never present the execution-mode
|
|
||||||
menu; finishing a plan means defaulting straight to subagent-driven.
|
|
||||||
- `[friction]` **Don't pause for approval between writing a plan and implementing it.**
|
|
||||||
The user has standing pre-approval to carry straight through plan → implementation. The
|
|
||||||
brainstorming/plan flow already has explicit approval gates (design approval, spec
|
|
||||||
review); adding another "shall I proceed to implement?" gate after the plan is written
|
|
||||||
is redundant friction. → After `writing-plans` finishes, begin subagent-driven
|
|
||||||
implementation directly. The only reason to stop is a genuine blocker or ambiguity, not
|
|
||||||
a routine checkpoint.
|
|
||||||
|
|
||||||
### Host nftables firewall build (`base` role)
|
|
||||||
|
|
||||||
- `[gotcha]` **`nft -c` rejects `iif "<name>"` when the interface is absent** (it resolves
|
|
||||||
to an interface _index_ at load time). The render+syntax-check Molecule step caught
|
|
||||||
`iif "wt0"` failing in the container — and it would fail identically on any real host
|
|
||||||
before NetBird brings up `wt0`. Use **`iifname "<name>"`** (string match, no existence
|
|
||||||
requirement, survives the interface coming/going) for any interface that may be absent.
|
|
||||||
- `[gotcha]` **Molecule's `community.docker` connection uses `ansible_host` as the
|
|
||||||
container name** (`remote_addr`). Setting `ansible_host` as _data_ in a scenario's
|
|
||||||
`host_vars` (e.g. to give a resolver a fake IP) breaks the connection → `UNREACHABLE`,
|
|
||||||
"Failed to create temporary directory". Don't override `ansible_host` in molecule; feed
|
|
||||||
fixture IPs another way (or keep fixtures to zone sources and unit-test IP resolution).
|
|
||||||
- `[recurring]` **`make test ROLE=<r>` needs the venv on PATH.** Run non-activated (as
|
|
||||||
agents do), molecule dies with `FileNotFoundError: 'ansible-config'` — it shells out to
|
|
||||||
`ansible-config`/`ansible-playbook` by bare name. Workaround: `PATH="$PWD/.venv/bin:$PATH"
|
|
||||||
.venv/bin/molecule test`. Also the molecule image wasn't in the Forgejo registry (pull →
|
|
||||||
"not found"); had to `make molecule-image` to build it locally. → Consider (a) the
|
|
||||||
Makefile `test` target prepending `.venv/bin` to PATH, and (b) `make molecule-image-push`
|
|
||||||
so a fresh checkout can pull it.
|
|
||||||
- `[gotcha]` **Apply-only task paths have no Level-1 coverage**, so safety bugs hide there.
|
|
||||||
The `nft` auto-rollback snapshot used a bare `nft list ruleset` (no leading `flush
|
|
||||||
ruleset`) → the revert was a silent no-op on first apply and errored on later ones; the
|
|
||||||
whole safety net was dead. Molecule never runs the apply (gated off), so only adversarial
|
|
||||||
review + an isolated-netns round-trip test caught it. → For apply/safety paths molecule
|
|
||||||
can't exercise, validate out-of-band (a throwaway `--privileged` container with its own
|
|
||||||
netns) and treat a final adversarial review as mandatory, not optional.
|
|
||||||
- `[note]` The render-and-`nft -c` (no-apply) Molecule approach **earned its keep** —
|
|
||||||
caught the `iif`/`iifname` bug deterministically without touching the host kernel. Good
|
|
||||||
pattern to reuse for other config-rendering roles.
|
|
||||||
|
|
||||||
## 2026-06-09
|
|
||||||
|
|
||||||
- `[recurring]` **Asked the execution-mode question AGAIN** — presented the
|
|
||||||
"subagent-driven vs inline" menu at the `writing-plans` → execution handoff, even
|
|
||||||
though the standing 2026-06-05 preference and the `always-subagent-driven-execution`
|
|
||||||
memory both say to default to subagent-driven without asking. Third occurrence; the
|
|
||||||
earlier "hard rule" escalation didn't hold because both `writing-plans` and
|
|
||||||
`subagent-driven-development` script the menu and I followed the skill text over the
|
|
||||||
user's standing override. → The standing preference outranks skill scripts: when a
|
|
||||||
skill's handoff offers the execution-mode menu, skip it and proceed subagent-driven;
|
|
||||||
only ask if the user signals otherwise this session.
|
|
||||||
|
|
||||||
## 2026-06-10
|
|
||||||
|
|
||||||
- `[recurring]` **Asked the execution-mode question AGAIN** — presented the
|
|
||||||
"subagent-driven (recommended) vs inline" menu at the `writing-plans` → execution
|
|
||||||
handoff (backup-strategy plan), despite the 2026-06-05 standing preference, the
|
|
||||||
`always-subagent-driven-execution` memory, and two prior FRICTION entries (06-06,
|
|
||||||
06-09) all saying don't ask. **Fourth occurrence.** Doc/memory escalations are not
|
|
||||||
holding: each session I re-read the skill's scripted menu and follow it over the
|
|
||||||
standing override. → Prose reminders have demonstrably failed four times; the fix is
|
|
||||||
no longer "try harder to remember" but **mechanical** — a hook or a `writing-plans`
|
|
||||||
local override that suppresses the handoff menu (cf. `update-config`: standing
|
|
||||||
automated behaviours need a hook, not memory). Flag as the top systematization
|
|
||||||
candidate for the next kaizen review.
|
|
||||||
|
|
|
||||||
|
|
@ -1,5 +1,8 @@
|
||||||
# ADR-008 — Testing methodology
|
# ADR-008 — Testing methodology
|
||||||
|
|
||||||
|
> Practical point-of-use pitfalls (nft render checks, Molecule `community.docker`,
|
||||||
|
> apply-path coverage blind spots) live in `docs/testing/gotchas.md`.
|
||||||
|
|
||||||
## Context
|
## Context
|
||||||
|
|
||||||
Ansible roles must be idempotent and correct before they touch production hosts.
|
Ansible roles must be idempotent and correct before they touch production hosts.
|
||||||
|
|
|
||||||
|
|
@ -58,6 +58,23 @@ The dangerous-mode permission prompt (`skipDangerousModePermissionPrompt`) is a
|
||||||
"operator/agent error" threat, prefer leaving that prompt **on** unless you
|
"operator/agent error" threat, prefer leaving that prompt **on** unless you
|
||||||
deliberately rely on bypass mode.
|
deliberately rely on bypass mode.
|
||||||
|
|
||||||
|
## Environment gotchas
|
||||||
|
|
||||||
|
Migrated from `docs/FRICTION.md` by the 2026-06-10 kaizen review — surprises that bite
|
||||||
|
on this kind of host/toolchain:
|
||||||
|
|
||||||
|
- **Hooks (and any new `.claude/settings.json`) added mid-session don't activate until a
|
||||||
|
Claude Code restart.** The settings watcher only tracks settings files that existed at
|
||||||
|
session start; opening `/hooks` and dismissing does *not* load them. Fresh sessions
|
||||||
|
load them normally — restart after adding a hook.
|
||||||
|
- **pre-commit stashes *unstaged* changes before running hooks**, so a partial commit of
|
||||||
|
interdependent files can revert one and fail (e.g. an `ansible.cfg` change left
|
||||||
|
unstaged). Commit interdependent changes together, or stage the config change first.
|
||||||
|
- **`rbw sync` is required after adding a Vaultwarden item before `rbw get` finds it**
|
||||||
|
(the local cache is stale otherwise).
|
||||||
|
- **This shell is zsh** — unquoted `$VAR` does *not* word-split, so a variable holding a
|
||||||
|
file list is passed as a single argument. Use explicit args/arrays.
|
||||||
|
|
||||||
## Verifying
|
## Verifying
|
||||||
|
|
||||||
After setup, a quick check: the project commands (`/review-repo`, `/capacity-review`,
|
After setup, a quick check: the project commands (`/review-repo`, `/capacity-review`,
|
||||||
|
|
|
||||||
36
docs/testing/gotchas.md
Normal file
36
docs/testing/gotchas.md
Normal file
|
|
@ -0,0 +1,36 @@
|
||||||
|
# Testing & Molecule gotchas
|
||||||
|
|
||||||
|
Durable, point-of-use knowledge for writing and running role tests (ADR-008).
|
||||||
|
Migrated from `docs/FRICTION.md` by the 2026-06-10 kaizen review. Append here when a
|
||||||
|
testing surprise is worth remembering past the session that hit it.
|
||||||
|
|
||||||
|
## nftables / `nft -c` render checks
|
||||||
|
|
||||||
|
- **`nft -c` rejects `iif "<name>"` when the interface is absent** — `iif` resolves to
|
||||||
|
an interface *index* at load time, so it fails in the Molecule container and would
|
||||||
|
fail identically on any real host before the interface exists (e.g. `wt0` before
|
||||||
|
NetBird is up). Use **`iifname "<name>"`** (string match, no existence requirement,
|
||||||
|
survives the interface coming and going) for any interface that may be absent.
|
||||||
|
- **The render-and-`nft -c` (no-apply) Molecule approach earns its keep** — it caught
|
||||||
|
the `iif`/`iifname` bug deterministically without touching the host kernel. Reuse
|
||||||
|
this pattern (render template → static-check, never apply) for other config-rendering
|
||||||
|
roles.
|
||||||
|
|
||||||
|
## Molecule (`community.docker`)
|
||||||
|
|
||||||
|
- **Molecule's `community.docker` connection uses `ansible_host` as the container name**
|
||||||
|
(`remote_addr`). Setting `ansible_host` as *data* in a scenario's `host_vars` (e.g. to
|
||||||
|
give a resolver a fake IP) breaks the connection → `UNREACHABLE` / "Failed to create
|
||||||
|
temporary directory". Don't override `ansible_host` in Molecule; feed fixture IPs
|
||||||
|
another way (keep fixtures to zone sources and unit-test IP resolution).
|
||||||
|
|
||||||
|
## Coverage blind spot: apply-only task paths
|
||||||
|
|
||||||
|
- **Apply-only task paths have no Level-1 coverage**, so safety bugs hide there. Example:
|
||||||
|
an `nft` auto-rollback snapshot used a bare `nft list ruleset` (no leading
|
||||||
|
`flush ruleset`), so the revert was a silent no-op on first apply and errored on later
|
||||||
|
ones — the whole safety net was dead. Molecule never runs the apply (gated off), so
|
||||||
|
only adversarial review + an isolated-netns round-trip test caught it. → For
|
||||||
|
apply/safety paths Molecule can't exercise, validate out-of-band (a throwaway
|
||||||
|
`--privileged` container with its own netns) and treat a final adversarial review as
|
||||||
|
**mandatory, not optional**.
|
||||||
Loading…
Add table
Reference in a new issue