From 16b0d9eb772ef5707dd93da6f121b3b63d0b060d Mon Sep 17 00:00:00 2001 From: YeonGyu-Kim Date: Sat, 14 Mar 2026 13:47:40 +0900 Subject: [PATCH] fix(atlas): gate final-wave approval on real plan state Ignore nested plan checkboxes and track parallel final-wave approvals so Atlas only pauses for user approval when the real top-level review wave is complete. Ultraworked with [Sisyphus](https://github.com/code-yeongyu/oh-my-openagent) Co-authored-by: Sisyphus --- src/agents/atlas/atlas-prompt.test.ts | 13 ++ src/agents/atlas/default.ts | 6 +- src/agents/atlas/gemini.ts | 6 +- src/agents/atlas/gpt.ts | 6 +- ...inal-wave-approval-gate-regression.test.ts | 205 ++++++++++++++++++ src/hooks/atlas/final-wave-approval-gate.ts | 64 +++--- src/hooks/atlas/final-wave-plan-state.ts | 60 +++++ src/hooks/atlas/tool-execute-after.ts | 15 +- src/hooks/atlas/types.ts | 2 + src/hooks/atlas/verification-reminders.ts | 5 +- 10 files changed, 336 insertions(+), 46 deletions(-) create mode 100644 src/hooks/atlas/final-wave-approval-gate-regression.test.ts create mode 100644 src/hooks/atlas/final-wave-plan-state.ts diff --git a/src/agents/atlas/atlas-prompt.test.ts b/src/agents/atlas/atlas-prompt.test.ts index e536cff98..f92417955 100644 --- a/src/agents/atlas/atlas-prompt.test.ts +++ b/src/agents/atlas/atlas-prompt.test.ts @@ -130,4 +130,17 @@ describe("Atlas prompts plan path consistency", () => { expect(prompt).toMatch(/read[\s\S]*?\.sisyphus\/plans\//) } }) + + test("all variants should distinguish top-level plan tasks from nested checkboxes", () => { + // given + const prompts = [ATLAS_SYSTEM_PROMPT, ATLAS_GPT_SYSTEM_PROMPT, ATLAS_GEMINI_SYSTEM_PROMPT] + + // when / then + for (const prompt of prompts) { + const lowerPrompt = prompt.toLowerCase() + expect(lowerPrompt).toMatch(/top-level.*checkbox/) + expect(lowerPrompt).toMatch(/ignore nested.*checkbox/) + expect(lowerPrompt).toMatch(/final verification wave/) + } + }) }) diff --git a/src/agents/atlas/default.ts b/src/agents/atlas/default.ts index c5aafc80b..0470c771d 100644 --- a/src/agents/atlas/default.ts +++ b/src/agents/atlas/default.ts @@ -140,7 +140,8 @@ TodoWrite([ ## Step 1: Analyze Plan 1. Read the todo list file -2. Parse incomplete checkboxes \`- [ ]\` +2. Parse actionable **top-level** task checkboxes in \`## TODOs\` and \`## Final Verification Wave\` + - Ignore nested checkboxes under Acceptance Criteria, Evidence, Definition of Done, and Final Checklist sections. 3. Extract parallelizability info from each task 4. Build parallelization map: - Which tasks can run simultaneously? @@ -242,7 +243,7 @@ After verification, READ the plan file directly — every time, no exceptions: \`\`\` Read(".sisyphus/plans/{plan-name}.md") \`\`\` -Count remaining \`- [ ]\` tasks. This is your ground truth for what comes next. +Count remaining **top-level task** checkboxes. Ignore nested verification/evidence checkboxes. This is your ground truth for what comes next. **Checklist (ALL must be checked):** \`\`\` @@ -296,6 +297,7 @@ Repeat Step 3 until all implementation tasks complete. Then proceed to Step 4. The plan's Final Wave tasks (F1-F4) are APPROVAL GATES — not regular tasks. Each reviewer produces a VERDICT: APPROVE or REJECT. +Final-wave reviewers can finish in parallel before you update the plan file, so do NOT rely on raw unchecked-count alone. 1. Execute all Final Wave tasks in parallel 2. If ANY verdict is REJECT: diff --git a/src/agents/atlas/gemini.ts b/src/agents/atlas/gemini.ts index 78ac92976..26f64d876 100644 --- a/src/agents/atlas/gemini.ts +++ b/src/agents/atlas/gemini.ts @@ -157,7 +157,8 @@ TodoWrite([ ## Step 1: Analyze Plan 1. Read the todo list file -2. Parse incomplete checkboxes \`- [ ]\` +2. Parse actionable **top-level** task checkboxes in \`## TODOs\` and \`## Final Verification Wave\` + - Ignore nested checkboxes under Acceptance Criteria, Evidence, Definition of Done, and Final Checklist sections. 3. Build parallelization map Output format: @@ -263,7 +264,7 @@ ALL three must be YES. "Probably" = NO. "I think so" = NO. \`\`\` Read(".sisyphus/plans/{plan-name}.md") \`\`\` -Count remaining \`- [ ]\` tasks. +Count remaining **top-level task** checkboxes. Ignore nested verification/evidence checkboxes. ### 3.5 Handle Failures @@ -284,6 +285,7 @@ Repeat Step 3 until all implementation tasks complete. Then proceed to Step 4. The plan's Final Wave tasks (F1-F4) are APPROVAL GATES — not regular tasks. Each reviewer produces a VERDICT: APPROVE or REJECT. +Final-wave reviewers can finish in parallel before you update the plan file, so do NOT rely on raw unchecked-count alone. 1. Execute all Final Wave tasks in parallel 2. If ANY verdict is REJECT: diff --git a/src/agents/atlas/gpt.ts b/src/agents/atlas/gpt.ts index a06f45c22..a747a12a3 100644 --- a/src/agents/atlas/gpt.ts +++ b/src/agents/atlas/gpt.ts @@ -167,7 +167,8 @@ TodoWrite([ ## Step 1: Analyze Plan 1. Read the todo list file -2. Parse incomplete checkboxes \`- [ ]\` +2. Parse actionable **top-level** task checkboxes in \`## TODOs\` and \`## Final Verification Wave\` + - Ignore nested checkboxes under Acceptance Criteria, Evidence, Definition of Done, and Final Checklist sections. 3. Build parallelization map Output format: @@ -268,7 +269,7 @@ Before moving to the next task, answer these THREE questions honestly: \`\`\` Read(".sisyphus/plans/{plan-name}.md") \`\`\` -Count remaining \`- [ ]\` tasks. This is your ground truth. +Count remaining **top-level task** checkboxes. Ignore nested verification/evidence checkboxes. This is your ground truth. ### 3.5 Handle Failures @@ -289,6 +290,7 @@ Repeat Step 3 until all implementation tasks complete. Then proceed to Step 4. The plan's Final Wave tasks (F1-F4) are APPROVAL GATES — not regular tasks. Each reviewer produces a VERDICT: APPROVE or REJECT. +Final-wave reviewers can finish in parallel before you update the plan file, so do NOT rely on raw unchecked-count alone. 1. Execute all Final Wave tasks in parallel 2. If ANY verdict is REJECT: diff --git a/src/hooks/atlas/final-wave-approval-gate-regression.test.ts b/src/hooks/atlas/final-wave-approval-gate-regression.test.ts new file mode 100644 index 000000000..b653c66d4 --- /dev/null +++ b/src/hooks/atlas/final-wave-approval-gate-regression.test.ts @@ -0,0 +1,205 @@ +import { afterEach, beforeEach, describe, expect, mock, test } from "bun:test" +import { randomUUID } from "node:crypto" +import { existsSync, mkdirSync, rmSync, writeFileSync } from "node:fs" +import { tmpdir } from "node:os" +import { join } from "node:path" +import { createOpencodeClient } from "@opencode-ai/sdk" +import type { AssistantMessage } from "@opencode-ai/sdk" +import type { BoulderState } from "../../features/boulder-state" +import { clearBoulderState, writeBoulderState } from "../../features/boulder-state" + +const TEST_STORAGE_ROOT = join(tmpdir(), `atlas-final-wave-regression-storage-${randomUUID()}`) +const TEST_MESSAGE_STORAGE = join(TEST_STORAGE_ROOT, "message") +const TEST_PART_STORAGE = join(TEST_STORAGE_ROOT, "part") + +mock.module("../../features/hook-message-injector/constants", () => ({ + OPENCODE_STORAGE: TEST_STORAGE_ROOT, + MESSAGE_STORAGE: TEST_MESSAGE_STORAGE, + PART_STORAGE: TEST_PART_STORAGE, +})) + +mock.module("../../shared/opencode-message-dir", () => ({ + getMessageDir: (sessionID: string) => { + const directoryPath = join(TEST_MESSAGE_STORAGE, sessionID) + return existsSync(directoryPath) ? directoryPath : null + }, +})) + +mock.module("../../shared/opencode-storage-detection", () => ({ + isSqliteBackend: () => false, +})) + +const { createAtlasHook } = await import("./index") +const { MESSAGE_STORAGE } = await import("../../features/hook-message-injector") + +type AtlasHookContext = Parameters[0] + +describe("Atlas final-wave approval gate regressions", () => { + let testDirectory = "" + + function createMockPluginInput(): AtlasHookContext { + const client = createOpencodeClient({ baseUrl: "http://localhost" }) + + Reflect.set(client.session, "prompt", async () => ({ + data: { info: {} as AssistantMessage, parts: [] }, + request: new Request("http://localhost/session/prompt"), + response: new Response(), + })) + + Reflect.set(client.session, "promptAsync", async () => ({ + data: undefined, + request: new Request("http://localhost/session/prompt_async"), + response: new Response(), + })) + + return { + directory: testDirectory, + project: {} as AtlasHookContext["project"], + worktree: testDirectory, + serverUrl: new URL("http://localhost"), + $: {} as AtlasHookContext["$"], + client, + } + } + + function setupMessageStorage(sessionID: string): void { + const messageDirectory = join(MESSAGE_STORAGE, sessionID) + if (!existsSync(messageDirectory)) { + mkdirSync(messageDirectory, { recursive: true }) + } + + writeFileSync( + join(messageDirectory, "msg_test001.json"), + JSON.stringify({ + agent: "atlas", + model: { providerID: "anthropic", modelID: "claude-opus-4-6" }, + }), + ) + } + + function writePlanState(sessionID: string, planName: string, planContent: string): void { + const planPath = join(testDirectory, `${planName}.md`) + writeFileSync(planPath, planContent) + + const state: BoulderState = { + active_plan: planPath, + started_at: "2026-01-02T10:00:00Z", + session_ids: [sessionID], + plan_name: planName, + agent: "atlas", + } + + writeBoulderState(testDirectory, state) + } + + beforeEach(() => { + testDirectory = join(tmpdir(), `atlas-final-wave-regression-${randomUUID()}`) + mkdirSync(join(testDirectory, ".sisyphus"), { recursive: true }) + clearBoulderState(testDirectory) + }) + + afterEach(() => { + clearBoulderState(testDirectory) + if (existsSync(testDirectory)) { + rmSync(testDirectory, { recursive: true, force: true }) + } + }) + + test("waits for approval when nested plan checkboxes remain but the only pending top-level task is final-wave", async () => { + // given + const sessionID = "atlas-nested-final-wave-session" + setupMessageStorage(sessionID) + writePlanState(sessionID, "nested-final-wave-plan", `# Plan + +## TODOs +- [x] 1. Implement feature + + **Acceptance Criteria**: + - [ ] bun test src/feature.test.ts -> PASS + + **Evidence to Capture**: + - [ ] Each evidence file named: task-1-happy-path.txt + +## Final Verification Wave (MANDATORY - after ALL implementation tasks) +- [x] F1. **Plan Compliance Audit** - \`oracle\` +- [x] F2. **Code Quality Review** - \`unspecified-high\` +- [x] F3. **Real Manual QA** - \`unspecified-high\` +- [ ] F4. **Scope Fidelity Check** - \`deep\` + +## Final Checklist +- [ ] All tests pass +`) + + const hook = createAtlasHook(createMockPluginInput()) + const toolOutput = { + title: "Sisyphus Task", + output: `Tasks [1/1 compliant] | Contamination [CLEAN] | Unaccounted [CLEAN] | VERDICT: APPROVE + + +session_id: ses_nested_scope_review +`, + metadata: {}, + } + + // when + await hook["tool.execute.after"]({ tool: "task", sessionID }, toolOutput) + + // then + expect(toolOutput.output).toContain("FINAL WAVE APPROVAL GATE") + expect(toolOutput.output).toContain("explicit user approval") + expect(toolOutput.output).not.toContain("STEP 8: PROCEED TO NEXT TASK") + }) + + test("waits for approval after the final parallel reviewer approves before plan checkboxes are updated", async () => { + // given + const sessionID = "atlas-parallel-final-wave-session" + setupMessageStorage(sessionID) + writePlanState(sessionID, "parallel-final-wave-plan", `# Plan + +## TODOs +- [x] 1. Ship implementation +- [x] 2. Verify implementation + +## Final Verification Wave (MANDATORY - after ALL implementation tasks) +- [ ] F1. **Plan Compliance Audit** - \`oracle\` +- [ ] F2. **Code Quality Review** - \`unspecified-high\` +- [ ] F3. **Real Manual QA** - \`unspecified-high\` +- [ ] F4. **Scope Fidelity Check** - \`deep\` +`) + + const hook = createAtlasHook(createMockPluginInput()) + const firstThreeOutputs = [1, 2, 3].map((index) => ({ + title: `Final review ${index}`, + output: `Reviewer ${index} | VERDICT: APPROVE + + +session_id: ses_parallel_review_${index} +`, + metadata: {}, + })) + const lastOutput = { + title: "Final review 4", + output: `Reviewer 4 | VERDICT: APPROVE + + +session_id: ses_parallel_review_4 +`, + metadata: {}, + } + + // when + for (const toolOutput of firstThreeOutputs) { + await hook["tool.execute.after"]({ tool: "task", sessionID }, toolOutput) + } + await hook["tool.execute.after"]({ tool: "task", sessionID }, lastOutput) + + // then + for (const toolOutput of firstThreeOutputs) { + expect(toolOutput.output).toContain("STEP 8: PROCEED TO NEXT TASK") + expect(toolOutput.output).not.toContain("FINAL WAVE APPROVAL GATE") + } + expect(lastOutput.output).toContain("FINAL WAVE APPROVAL GATE") + expect(lastOutput.output).toContain("explicit user approval") + expect(lastOutput.output).not.toContain("STEP 8: PROCEED TO NEXT TASK") + }) +}) diff --git a/src/hooks/atlas/final-wave-approval-gate.ts b/src/hooks/atlas/final-wave-approval-gate.ts index 9928bbe16..0aeb894ff 100644 --- a/src/hooks/atlas/final-wave-approval-gate.ts +++ b/src/hooks/atlas/final-wave-approval-gate.ts @@ -1,47 +1,47 @@ -import { existsSync, readFileSync } from "node:fs" +import type { SessionState } from "./types" +import { readFinalWavePlanState } from "./final-wave-plan-state" const APPROVE_VERDICT_PATTERN = /\bVERDICT:\s*APPROVE\b/i -const FINAL_VERIFICATION_HEADING_PATTERN = /^##\s+Final Verification Wave\b/i -const UNCHECKED_TASK_PATTERN = /^\s*[-*]\s*\[\s*\]\s*(.+)$/ -const FINAL_WAVE_TASK_PATTERN = /^F\d+\./i + +function clearFinalWaveApprovalTracking(sessionState: SessionState): void { + sessionState.pendingFinalWaveTaskCount = undefined + sessionState.approvedFinalWaveTaskCount = undefined +} export function shouldPauseForFinalWaveApproval(input: { planPath: string taskOutput: string + sessionState: SessionState }): boolean { + const planState = readFinalWavePlanState(input.planPath) + if (!planState) { + return false + } + + if (planState.pendingImplementationTaskCount > 0 || planState.pendingFinalWaveTaskCount === 0) { + clearFinalWaveApprovalTracking(input.sessionState) + return false + } + if (!APPROVE_VERDICT_PATTERN.test(input.taskOutput)) { return false } - if (!existsSync(input.planPath)) { - return false + if (planState.pendingFinalWaveTaskCount === 1) { + clearFinalWaveApprovalTracking(input.sessionState) + return true } - try { - const content = readFileSync(input.planPath, "utf-8") - const lines = content.split(/\r?\n/) - let inFinalVerificationWave = false - let uncheckedTaskCount = 0 - let uncheckedFinalWaveTaskCount = 0 - - for (const line of lines) { - if (/^##\s+/.test(line)) { - inFinalVerificationWave = FINAL_VERIFICATION_HEADING_PATTERN.test(line) - } - - const uncheckedTaskMatch = line.match(UNCHECKED_TASK_PATTERN) - if (!uncheckedTaskMatch) { - continue - } - - uncheckedTaskCount += 1 - if (inFinalVerificationWave && FINAL_WAVE_TASK_PATTERN.test(uncheckedTaskMatch[1].trim())) { - uncheckedFinalWaveTaskCount += 1 - } - } - - return uncheckedTaskCount === 1 && uncheckedFinalWaveTaskCount === 1 - } catch { - return false + if (input.sessionState.pendingFinalWaveTaskCount !== planState.pendingFinalWaveTaskCount) { + input.sessionState.pendingFinalWaveTaskCount = planState.pendingFinalWaveTaskCount + input.sessionState.approvedFinalWaveTaskCount = 0 } + + input.sessionState.approvedFinalWaveTaskCount = (input.sessionState.approvedFinalWaveTaskCount ?? 0) + 1 + const shouldPause = input.sessionState.approvedFinalWaveTaskCount >= planState.pendingFinalWaveTaskCount + if (shouldPause) { + clearFinalWaveApprovalTracking(input.sessionState) + } + + return shouldPause } diff --git a/src/hooks/atlas/final-wave-plan-state.ts b/src/hooks/atlas/final-wave-plan-state.ts new file mode 100644 index 000000000..0fd9b7971 --- /dev/null +++ b/src/hooks/atlas/final-wave-plan-state.ts @@ -0,0 +1,60 @@ +import { existsSync, readFileSync } from "node:fs" + +const TODO_HEADING_PATTERN = /^##\s+TODOs\b/i +const FINAL_VERIFICATION_HEADING_PATTERN = /^##\s+Final Verification Wave\b/i +const SECOND_LEVEL_HEADING_PATTERN = /^##\s+/ +const UNCHECKED_CHECKBOX_PATTERN = /^\s*[-*]\s*\[\s*\]\s*(.+)$/ +const TODO_TASK_PATTERN = /^\d+\./ +const FINAL_WAVE_TASK_PATTERN = /^F\d+\./i + +type PlanSection = "todo" | "final-wave" | "other" + +export type FinalWavePlanState = { + pendingImplementationTaskCount: number + pendingFinalWaveTaskCount: number +} + +export function readFinalWavePlanState(planPath: string): FinalWavePlanState | null { + if (!existsSync(planPath)) { + return null + } + + try { + const content = readFileSync(planPath, "utf-8") + const lines = content.split(/\r?\n/) + let section: PlanSection = "other" + let pendingImplementationTaskCount = 0 + let pendingFinalWaveTaskCount = 0 + + for (const line of lines) { + if (SECOND_LEVEL_HEADING_PATTERN.test(line)) { + section = TODO_HEADING_PATTERN.test(line) + ? "todo" + : FINAL_VERIFICATION_HEADING_PATTERN.test(line) + ? "final-wave" + : "other" + } + + const uncheckedTaskMatch = line.match(UNCHECKED_CHECKBOX_PATTERN) + if (!uncheckedTaskMatch) { + continue + } + + const taskLabel = uncheckedTaskMatch[1].trim() + if (section === "todo" && TODO_TASK_PATTERN.test(taskLabel)) { + pendingImplementationTaskCount += 1 + } + + if (section === "final-wave" && FINAL_WAVE_TASK_PATTERN.test(taskLabel)) { + pendingFinalWaveTaskCount += 1 + } + } + + return { + pendingImplementationTaskCount, + pendingFinalWaveTaskCount, + } + } catch { + return null + } +} diff --git a/src/hooks/atlas/tool-execute-after.ts b/src/hooks/atlas/tool-execute-after.ts index 9c4e82eae..55fb8ddd6 100644 --- a/src/hooks/atlas/tool-execute-after.ts +++ b/src/hooks/atlas/tool-execute-after.ts @@ -72,6 +72,7 @@ export function createToolExecuteAfterHandler(input: { const boulderState = readBoulderState(ctx.directory) if (boulderState) { const progress = getPlanProgress(boulderState.active_plan) + const sessionState = toolInput.sessionID ? getState(toolInput.sessionID) : undefined if (toolInput.sessionID && !boulderState.session_ids?.includes(toolInput.sessionID)) { appendSessionId(ctx.directory, toolInput.sessionID) @@ -83,13 +84,15 @@ export function createToolExecuteAfterHandler(input: { // Preserve original subagent response - critical for debugging failed tasks const originalResponse = toolOutput.output - const shouldPauseForApproval = shouldPauseForFinalWaveApproval({ - planPath: boulderState.active_plan, - taskOutput: originalResponse, - }) + const shouldPauseForApproval = sessionState + ? shouldPauseForFinalWaveApproval({ + planPath: boulderState.active_plan, + taskOutput: originalResponse, + sessionState, + }) + : false - if (toolInput.sessionID) { - const sessionState = getState(toolInput.sessionID) + if (sessionState) { sessionState.waitingForFinalWaveApproval = shouldPauseForApproval if (shouldPauseForApproval && sessionState.pendingRetryTimer) { diff --git a/src/hooks/atlas/types.ts b/src/hooks/atlas/types.ts index bbc83a149..c3aa9bbc7 100644 --- a/src/hooks/atlas/types.ts +++ b/src/hooks/atlas/types.ts @@ -32,4 +32,6 @@ export interface SessionState { lastFailureAt?: number pendingRetryTimer?: ReturnType waitingForFinalWaveApproval?: boolean + pendingFinalWaveTaskCount?: number + approvedFinalWaveTaskCount?: number } diff --git a/src/hooks/atlas/verification-reminders.ts b/src/hooks/atlas/verification-reminders.ts index 33d7a47fe..cb988984e 100644 --- a/src/hooks/atlas/verification-reminders.ts +++ b/src/hooks/atlas/verification-reminders.ts @@ -131,10 +131,11 @@ The last Final Verification Wave result just passed. This is the ONLY point where approval-style user interaction is required. 1. Read \ -\`.sisyphus/plans/${planName}.md\` again and confirm the remaining unchecked item is the last final-wave task. +\`.sisyphus/plans/${planName}.md\` again and confirm every remaining unchecked **top-level** task belongs to F1-F4. + Ignore nested checkboxes under Acceptance Criteria, Evidence, or Final Checklist sections. 2. Consolidate the F1-F4 verdicts into a short summary for the user. 3. Tell the user all final reviewers approved. -4. Ask for explicit user approval before editing the last final-wave checkbox or marking the plan complete. +4. Ask for explicit user approval before editing any remaining final-wave checkboxes or marking the plan complete. 5. Wait for the user's explicit approval. Do NOT auto-continue. Do NOT call \ \`task()\` again unless the user rejects and requests fixes.