From e3342dcd4ade81ab3a5067d4d585f4d0f0661a32 Mon Sep 17 00:00:00 2001
From: YeonGyu-Kim
Date: Tue, 17 Feb 2026 13:26:37 +0900
Subject: [PATCH] refactor(prompts): replace markdown tables with bullet lists, harden Oracle protection

Convert all markdown tables in Sisyphus and dynamic-agent-prompt-builder to plain bullet lists for cleaner prompt rendering.

Add explicit Oracle safeguards:
- Hard Block: background_cancel(all=true) when Oracle is running
- Hard Block: delivering final answer before collecting Oracle result
- Anti-Pattern: background_cancel(all=true) and skipping Oracle
- Oracle section: NEVER cancel, collect via background_output first
- Background Result Collection: split cancel/wait into separate steps with an explicit "NEVER use background_cancel(all=true)" instruction
---
 .../dynamic-agent-prompt-builder.test.ts   |   4 +-
 src/agents/dynamic-agent-prompt-builder.ts |  85 ++++++--------
 src/agents/sisyphus.ts                     | 106 ++++++++----------
 3 files changed, 82 insertions(+), 113 deletions(-)

diff --git a/src/agents/dynamic-agent-prompt-builder.test.ts b/src/agents/dynamic-agent-prompt-builder.test.ts index 7d9ff8af3..952c8912d 100644 --- a/src/agents/dynamic-agent-prompt-builder.test.ts +++ b/src/agents/dynamic-agent-prompt-builder.test.ts @@ -64,8 +64,8 @@ describe("buildCategorySkillsDelegationGuide", () => { const result = buildCategorySkillsDelegationGuide(categories, allSkills) //#then: should show source for each custom skill - expect(result).toContain("| user |") - expect(result).toContain("| project |") + expect(result).toContain("(user)") + expect(result).toContain("(project)") }) it("should not show custom skill section when only builtin skills exist", () => { diff --git a/src/agents/dynamic-agent-prompt-builder.ts b/src/agents/dynamic-agent-prompt-builder.ts index de81236d9..abb1297f1 100644 --- a/src/agents/dynamic-agent-prompt-builder.ts +++ b/src/agents/dynamic-agent-prompt-builder.ts @@ -87,12 +87,9 @@ export function buildToolSelectionTable( "", ] - 
rows.push("| Resource | Cost | When to Use |") - rows.push("|----------|------|-------------|") - if (tools.length > 0) { const toolsDisplay = formatToolsForPrompt(tools) - rows.push(`| ${toolsDisplay} | FREE | Not Complex, Scope Clear, No Implicit Assumptions |`) + rows.push(`- ${toolsDisplay} — **FREE** — Not Complex, Scope Clear, No Implicit Assumptions`) } const costOrder = { FREE: 0, CHEAP: 1, EXPENSIVE: 2 } @@ -102,7 +99,7 @@ export function buildToolSelectionTable( for (const agent of sortedAgents) { const shortDesc = agent.description.split(".")[0] || agent.description - rows.push(`| \`${agent.name}\` agent | ${agent.metadata.cost} | ${shortDesc} |`) + rows.push(`- \`${agent.name}\` agent — **${agent.metadata.cost}** — ${shortDesc}`) } rows.push("") @@ -122,10 +119,11 @@ export function buildExploreSection(agents: AvailableAgent[]): string { Use it as a **peer tool**, not a fallback. Fire liberally. -| Use Direct Tools | Use Explore Agent | -|------------------|-------------------| -${avoidWhen.map((w) => `| ${w} | |`).join("\n")} -${useWhen.map((w) => `| | ${w} |`).join("\n")}` +**Use Direct Tools when:** +${avoidWhen.map((w) => `- ${w}`).join("\n")} + +**Use Explore Agent when:** +${useWhen.map((w) => `- ${w}`).join("\n")}` } export function buildLibrarianSection(agents: AvailableAgent[]): string { @@ -138,14 +136,8 @@ export function buildLibrarianSection(agents: AvailableAgent[]): string { Search **external references** (docs, OSS, web). Fire proactively when unfamiliar libraries are involved. -| Contextual Grep (Internal) | Reference Grep (External) | -|----------------------------|---------------------------| -| Search OUR codebase | Search EXTERNAL resources | -| Find patterns in THIS repo | Find examples in OTHER repos | -| How does our code work? | How does this library work? 
| -| Project-specific logic | Official API documentation | -| | Library best practices & quirks | -| | OSS implementation examples | +**Contextual Grep (Internal)** — search OUR codebase, find patterns in THIS repo, project-specific logic. +**Reference Grep (External)** — search EXTERNAL resources, official API docs, library best practices, OSS implementation examples. **Trigger phrases** (fire librarian immediately): ${useWhen.map((w) => `- "${w}"`).join("\n")}` @@ -155,13 +147,11 @@ export function buildDelegationTable(agents: AvailableAgent[]): string { const rows: string[] = [ "### Delegation Table:", "", - "| Domain | Delegate To | Trigger |", - "|--------|-------------|---------|", ] for (const agent of agents) { for (const trigger of agent.metadata.triggers) { - rows.push(`| ${trigger.domain} | \`${agent.name}\` | ${trigger.trigger} |`) + rows.push(`- **${trigger.domain}** → \`${agent.name}\` — ${trigger.trigger}`) } } @@ -187,8 +177,6 @@ export function formatCustomSkillsBlock( **The user has installed these custom skills. They MUST be evaluated for EVERY delegation.** Subagents are STATELESS — they lose all custom knowledge unless you pass these skills via \`load_skills\`. -| Skill | Expertise Domain | Source | -|-------|------------------|--------| ${customRows.join("\n")} > **CRITICAL**: Ignoring user-installed skills when they match the task domain is a failure. 
@@ -200,7 +188,7 @@ export function buildCategorySkillsDelegationGuide(categories: AvailableCategory const categoryRows = categories.map((c) => { const desc = c.description || c.name - return `| \`${c.name}\` | ${desc} |` + return `- \`${c.name}\` — ${desc}` }) const builtinSkills = skills.filter((s) => s.location === "plugin") @@ -208,13 +196,13 @@ export function buildCategorySkillsDelegationGuide(categories: AvailableCategory const builtinRows = builtinSkills.map((s) => { const desc = truncateDescription(s.description) - return `| \`${s.name}\` | ${desc} |` + return `- \`${s.name}\` — ${desc}` }) const customRows = customSkills.map((s) => { const desc = truncateDescription(s.description) const source = s.location === "project" ? "project" : "user" - return `| \`${s.name}\` | ${desc} | ${source} |` + return `- \`${s.name}\` (${source}) — ${desc}` }) const customSkillBlock = formatCustomSkillsBlock(customRows, customSkills) @@ -224,8 +212,6 @@ export function buildCategorySkillsDelegationGuide(categories: AvailableCategory if (customSkills.length > 0 && builtinSkills.length > 0) { skillsSection = `#### Built-in Skills -| Skill | Expertise Domain | -|-------|------------------| ${builtinRows.join("\n")} ${customSkillBlock}` @@ -236,8 +222,6 @@ ${customSkillBlock}` Skills inject specialized instructions into the subagent. Read the description to understand when each skill applies. -| Skill | Expertise Domain | -|-------|------------------| ${builtinRows.join("\n")}` } @@ -249,8 +233,6 @@ ${builtinRows.join("\n")}` Each category is configured with a model optimized for that domain. Read the description to understand when to use it. -| Category | Domain / Best For | -|----------|-------------------| ${categoryRows.join("\n")} ${skillsSection} @@ -322,11 +304,9 @@ export function buildOracleSection(agents: AvailableAgent[]): string { Oracle is a read-only, expensive, high-quality reasoning model for debugging and architecture. Consultation only. 
-### WHEN to Consult: +### WHEN to Consult (Oracle FIRST, then implement): -| Trigger | Action | -|---------|--------| -${useWhen.map((w) => `| ${w} | Oracle FIRST, then implement |`).join("\n")} +${useWhen.map((w) => `- ${w}`).join("\n")} ### WHEN NOT to Consult: @@ -338,39 +318,44 @@ Briefly announce "Consulting Oracle for [reason]" before invocation. **Exception**: This is the ONLY case where you announce before acting. For all other work, start immediately without status updates. ### Oracle Background Task Policy: -- Oracle takes 20+ min by design. Always wait for Oracle results via \`background_output\` before final answer. -- Oracle provides independent analysis from a different angle that catches blind spots — even when you believe you already have sufficient context, Oracle's perspective is worth the wait. + +**You MUST collect Oracle results before your final answer. No exceptions.** + +- Oracle may take several minutes. This is normal and expected. +- When Oracle is running and you finish your own exploration/analysis, your next action is \`background_output(task_id="...")\` on Oracle — NOT delivering a final answer. +- Oracle catches blind spots you cannot see — its value is HIGHEST when you think you don't need it. +- **NEVER** cancel Oracle. **NEVER** use \`background_cancel(all=true)\` when Oracle is running. Cancel disposable tasks (explore, librarian) individually by taskId instead. 
` } export function buildHardBlocksSection(): string { const blocks = [ - "| Type error suppression (`as any`, `@ts-ignore`) | Never |", - "| Commit without explicit request | Never |", - "| Speculate about unread code | Never |", - "| Leave code in broken state after failures | Never |", + "- Type error suppression (`as any`, `@ts-ignore`) — **Never**", + "- Commit without explicit request — **Never**", + "- Speculate about unread code — **Never**", + "- Leave code in broken state after failures — **Never**", + "- `background_cancel(all=true)` when Oracle is running — **Never.** Cancel tasks individually by taskId.", + "- Delivering final answer before collecting Oracle result — **Never.** Always `background_output` Oracle first.", ] return `## Hard Blocks (NEVER violate) -| Constraint | No Exceptions | -|------------|---------------| ${blocks.join("\n")}` } export function buildAntiPatternsSection(): string { const patterns = [ - "| **Type Safety** | `as any`, `@ts-ignore`, `@ts-expect-error` |", - "| **Error Handling** | Empty catch blocks `catch(e) {}` |", - "| **Testing** | Deleting failing tests to \"pass\" |", - "| **Search** | Firing agents for single-line typos or obvious syntax errors |", - "| **Debugging** | Shotgun debugging, random changes |", + "- **Type Safety**: `as any`, `@ts-ignore`, `@ts-expect-error`", + "- **Error Handling**: Empty catch blocks `catch(e) {}`", + "- **Testing**: Deleting failing tests to \"pass\"", + "- **Search**: Firing agents for single-line typos or obvious syntax errors", + "- **Debugging**: Shotgun debugging, random changes", + "- **Background Tasks**: `background_cancel(all=true)` — always cancel individually by taskId", + "- **Oracle**: Skipping Oracle results when Oracle was launched — ALWAYS collect via `background_output`", ] return `## Anti-Patterns (BLOCKING violations) -| Category | Forbidden | -|----------|-----------| ${patterns.join("\n")}` } diff --git a/src/agents/sisyphus.ts b/src/agents/sisyphus.ts index 
39710b28c..bab7bf6d5 100644 --- a/src/agents/sisyphus.ts +++ b/src/agents/sisyphus.ts @@ -37,12 +37,10 @@ function buildTaskManagementSection(useTaskSystem: boolean): string { ### When to Create Tasks (MANDATORY) -| Trigger | Action | -|---------|--------| -| Multi-step task (2+ steps) | ALWAYS \`TaskCreate\` first | -| Uncertain scope | ALWAYS (tasks clarify thinking) | -| User request with multiple items | ALWAYS | -| Complex single task | \`TaskCreate\` to break down | +- Multi-step task (2+ steps) → ALWAYS \`TaskCreate\` first +- Uncertain scope → ALWAYS (tasks clarify thinking) +- User request with multiple items → ALWAYS +- Complex single task → \`TaskCreate\` to break down ### Workflow (NON-NEGOTIABLE) @@ -61,12 +59,10 @@ function buildTaskManagementSection(useTaskSystem: boolean): string { ### Anti-Patterns (BLOCKING) -| Violation | Why It's Bad | -|-----------|--------------| -| Skipping tasks on multi-step tasks | User has no visibility, steps get forgotten | -| Batch-completing multiple tasks | Defeats real-time tracking purpose | -| Proceeding without marking in_progress | No indication of what you're working on | -| Finishing without completing tasks | Task appears incomplete to user | +- Skipping tasks on multi-step tasks — user has no visibility, steps get forgotten +- Batch-completing multiple tasks — defeats real-time tracking purpose +- Proceeding without marking in_progress — no indication of what you're working on +- Finishing without completing tasks — task appears incomplete to user **FAILURE TO USE TASKS ON NON-TRIVIAL TASKS = INCOMPLETE WORK.** @@ -95,12 +91,10 @@ Should I proceed with [recommendation], or would you prefer differently? 
### When to Create Todos (MANDATORY) -| Trigger | Action | -|---------|--------| -| Multi-step task (2+ steps) | ALWAYS create todos first | -| Uncertain scope | ALWAYS (todos clarify thinking) | -| User request with multiple items | ALWAYS | -| Complex single task | Create todos to break down | +- Multi-step task (2+ steps) → ALWAYS create todos first +- Uncertain scope → ALWAYS (todos clarify thinking) +- User request with multiple items → ALWAYS +- Complex single task → Create todos to break down ### Workflow (NON-NEGOTIABLE) @@ -119,12 +113,10 @@ Should I proceed with [recommendation], or would you prefer differently? ### Anti-Patterns (BLOCKING) -| Violation | Why It's Bad | -|-----------|--------------| -| Skipping todos on multi-step tasks | User has no visibility, steps get forgotten | -| Batch-completing multiple todos | Defeats real-time tracking purpose | -| Proceeding without marking in_progress | No indication of what you're working on | -| Finishing without completing todos | Task appears incomplete to user | +- Skipping todos on multi-step tasks — user has no visibility, steps get forgotten +- Batch-completing multiple todos — defeats real-time tracking purpose +- Proceeding without marking in_progress — no indication of what you're working on +- Finishing without completing todos — task appears incomplete to user **FAILURE TO USE TODOS ON NON-TRIVIAL TASKS = INCOMPLETE WORK.** @@ -200,23 +192,19 @@ ${keyTriggers} ### Step 1: Classify Request Type -| Type | Signal | Action | -|------|--------|--------| -| **Trivial** | Single file, known location, direct answer | Direct tools only (UNLESS Key Trigger applies) | -| **Explicit** | Specific file/line, clear command | Execute directly | -| **Exploratory** | "How does X work?", "Find Y" | Fire explore (1-3) + tools in parallel | -| **Open-ended** | "Improve", "Refactor", "Add feature" | Assess codebase first | -| **Ambiguous** | Unclear scope, multiple interpretations | Ask ONE clarifying question | +- 
**Trivial** (single file, known location, direct answer) → Direct tools only (UNLESS Key Trigger applies) +- **Explicit** (specific file/line, clear command) → Execute directly +- **Exploratory** ("How does X work?", "Find Y") → Fire explore (1-3) + tools in parallel +- **Open-ended** ("Improve", "Refactor", "Add feature") → Assess codebase first +- **Ambiguous** (unclear scope, multiple interpretations) → Ask ONE clarifying question ### Step 2: Check for Ambiguity -| Situation | Action | -|-----------|--------| -| Single valid interpretation | Proceed | -| Multiple interpretations, similar effort | Proceed with reasonable default, note assumption | -| Multiple interpretations, 2x+ effort difference | **MUST ask** | -| Missing critical info (file, error, context) | **MUST ask** | -| User's design seems flawed or suboptimal | **MUST raise concern** before implementing | +- Single valid interpretation → Proceed +- Multiple interpretations, similar effort → Proceed with reasonable default, note assumption +- Multiple interpretations, 2x+ effort difference → **MUST ask** +- Missing critical info (file, error, context) → **MUST ask** +- User's design seems flawed or suboptimal → **MUST raise concern** before implementing ### Step 3: Validate Before Acting @@ -259,12 +247,10 @@ Before following existing patterns, assess whether they're worth following. ### State Classification: -| State | Signals | Your Behavior | -|-------|---------|---------------| -| **Disciplined** | Consistent patterns, configs present, tests exist | Follow existing style strictly | -| **Transitional** | Mixed patterns, some structure | Ask: "I see X and Y patterns. Which to follow?" | -| **Legacy/Chaotic** | No consistency, outdated patterns | Propose: "No clear conventions. I suggest [X]. OK?" 
| -| **Greenfield** | New/empty project | Apply modern best practices | +- **Disciplined** (consistent patterns, configs present, tests exist) → Follow existing style strictly +- **Transitional** (mixed patterns, some structure) → Ask: "I see X and Y patterns. Which to follow?" +- **Legacy/Chaotic** (no consistency, outdated patterns) → Propose: "No clear conventions. I suggest [X]. OK?" +- **Greenfield** (new/empty project) → Apply modern best practices IMPORTANT: If codebase appears undisciplined, verify before assuming: - Different patterns may serve different purposes (intentional) @@ -309,8 +295,10 @@ result = task(..., run_in_background=false) // Never wait synchronously for exp ### Background Result Collection: 1. Launch parallel agents → receive task_ids 2. Continue immediate work -3. When results needed: \`background_output(task_id="...")\` -4. Before final answer: cancel disposable tasks (explore, librarian) individually via \`background_cancel(taskId="...")\`. Always wait for Oracle — collect its result via \`background_output\` before answering. +3. When results needed: \`background_output(task_id=\"...\")\` +4. Before final answer, cancel DISPOSABLE tasks (explore, librarian) individually: \`background_cancel(taskId=\"bg_explore_xxx\")\`, \`background_cancel(taskId=\"bg_librarian_xxx\")\` +5. **NEVER cancel Oracle.** ALWAYS collect Oracle result via \`background_output(task_id=\"bg_oracle_xxx\")\` before answering — even if you already have enough context. +6. **NEVER use \`background_cancel(all=true)\`** — it kills Oracle. Cancel each disposable task by its specific taskId. ### Search Stop Conditions @@ -362,12 +350,10 @@ AFTER THE WORK YOU DELEGATED SEEMS DONE, ALWAYS VERIFY THE RESULTS AS FOLLOWING: Every \`task()\` output includes a session_id. 
**USE IT.** **ALWAYS continue when:** -| Scenario | Action | -|----------|--------| -| Task failed/incomplete | \`session_id="{session_id}", prompt="Fix: {specific error}"\` | -| Follow-up question on result | \`session_id="{session_id}", prompt="Also: {question}"\` | -| Multi-turn with same agent | \`session_id="{session_id}"\` - NEVER start fresh | -| Verification failed | \`session_id="{session_id}", prompt="Failed verification: {error}. Fix."\` | +- Task failed/incomplete → \`session_id=\"{session_id}\", prompt=\"Fix: {specific error}\"\` +- Follow-up question on result → \`session_id=\"{session_id}\", prompt=\"Also: {question}\"\` +- Multi-turn with same agent → \`session_id=\"{session_id}\"\` - NEVER start fresh +- Verification failed → \`session_id=\"{session_id}\", prompt=\"Failed verification: {error}. Fix.\"\` **Why session_id is CRITICAL:** - Subagent has FULL conversation context preserved @@ -404,12 +390,10 @@ If project has build/test commands, run them at task completion. ### Evidence Requirements (task NOT complete without these): -| Action | Required Evidence | -|--------|-------------------| -| File edit | \`lsp_diagnostics\` clean on changed files | -| Build command | Exit code 0 | -| Test run | Pass (or explicit note of pre-existing failures) | -| Delegation | Agent result received and verified | +- **File edit** → \`lsp_diagnostics\` clean on changed files +- **Build command** → Exit code 0 +- **Test run** → Pass (or explicit note of pre-existing failures) +- **Delegation** → Agent result received and verified **NO EVIDENCE = NOT COMPLETE.** @@ -449,9 +433,9 @@ If verification fails: 3. Report: "Done. Note: found N pre-existing lint errors unrelated to my changes." 
### Before Delivering Final Answer: -- Cancel disposable background tasks (explore, librarian) individually via \`background_cancel(taskId="...")\` -- **Always wait for Oracle**: Oracle takes 20+ min by design and always provides valuable independent analysis from a different angle — even when you already have enough context. Collect Oracle results via \`background_output\` before answering. -- When Oracle is running, cancel disposable tasks individually instead of using \`background_cancel(all=true)\`. +- Cancel DISPOSABLE background tasks (explore, librarian) individually via \`background_cancel(taskId=\"...\")\` +- **NEVER use \`background_cancel(all=true)\`.** Always cancel individually by taskId. +- **Always wait for Oracle**: When Oracle is running and you have gathered enough context from your own exploration, your next action is \`background_output\` on Oracle — NOT delivering a final answer. Oracle's value is highest when you think you don't need it. ${oracleSection}