fix(delegate-task): wait for result when forcing unstable agents to background

Previously, unstable agents (gemini models, is_unstable_agent=true) were forced to background mode but returned immediately with task ID. This caused callers to lose visibility into results. Now: launch as background for monitoring stability, but poll and wait for completion, returning actual task output like sync mode. 🤖 Generated with assistance of [OhMyOpenCode](https://github.com/code-yeongyu/oh-my-opencode)
2026-01-21 00:06:05 +09:00
parent c00f210922
commit 516edb445c
2 changed files with 463 additions and 10 deletions
--- a/src/tools/delegate-task/tools.test.ts
+++ b/src/tools/delegate-task/tools.test.ts
@@ -958,6 +958,389 @@ describe("sisyphus-task", () => {
    }, { timeout: 20000 })
  })

+  describe("unstable agent forced background mode", () => {
+    test("gemini model with run_in_background=false should force background but wait for result", async () => {
+      // #given - category using gemini model with run_in_background=false
+      const { createDelegateTask } = require("./tools")
+      let launchCalled = false
+      
+      const mockManager = {
+        launch: async () => {
+          launchCalled = true
+          return {
+            id: "task-unstable",
+            sessionID: "ses_unstable_gemini",
+            description: "Unstable gemini task",
+            agent: "Sisyphus-Junior",
+            status: "running",
+          }
+        },
+      }
+      
+      const mockClient = {
+        app: { agents: async () => ({ data: [] }) },
+        config: { get: async () => ({ data: { model: SYSTEM_DEFAULT_MODEL } }) },
+        session: {
+          get: async () => ({ data: { directory: "/project" } }),
+          create: async () => ({ data: { id: "ses_unstable_gemini" } }),
+          prompt: async () => ({ data: {} }),
+          messages: async () => ({
+            data: [
+              { info: { role: "assistant", time: { created: Date.now() } }, parts: [{ type: "text", text: "Gemini task completed successfully" }] }
+            ]
+          }),
+          status: async () => ({ data: { "ses_unstable_gemini": { type: "idle" } } }),
+        },
+      }
+      
+      const tool = createDelegateTask({
+        manager: mockManager,
+        client: mockClient,
+      })
+      
+      const toolContext = {
+        sessionID: "parent-session",
+        messageID: "parent-message",
+        agent: "Sisyphus",
+        abort: new AbortController().signal,
+      }
+      
+      // #when - using visual-engineering (gemini model) with run_in_background=false
+      const result = await tool.execute(
+        {
+          description: "Test gemini forced background",
+          prompt: "Do something visual",
+          category: "visual-engineering",
+          run_in_background: false,
+          skills: [],
+        },
+        toolContext
+      )
+      
+      // #then - should launch as background BUT wait for and return actual result
+      expect(launchCalled).toBe(true)
+      expect(result).toContain("UNSTABLE AGENT")
+      expect(result).toContain("Gemini task completed successfully")
+    }, { timeout: 20000 })
+
+    test("gemini model with run_in_background=true should not show unstable message (normal background)", async () => {
+      // #given - category using gemini model with run_in_background=true (normal background flow)
+      const { createDelegateTask } = require("./tools")
+      let launchCalled = false
+      
+      const mockManager = {
+        launch: async () => {
+          launchCalled = true
+          return {
+            id: "task-normal-bg",
+            sessionID: "ses_normal_bg",
+            description: "Normal background task",
+            agent: "Sisyphus-Junior",
+            status: "running",
+          }
+        },
+      }
+      
+      const mockClient = {
+        app: { agents: async () => ({ data: [] }) },
+        config: { get: async () => ({ data: { model: SYSTEM_DEFAULT_MODEL } }) },
+        session: {
+          create: async () => ({ data: { id: "test-session" } }),
+          prompt: async () => ({ data: {} }),
+          messages: async () => ({ data: [] }),
+        },
+      }
+      
+      const tool = createDelegateTask({
+        manager: mockManager,
+        client: mockClient,
+      })
+      
+      const toolContext = {
+        sessionID: "parent-session",
+        messageID: "parent-message",
+        agent: "Sisyphus",
+        abort: new AbortController().signal,
+      }
+      
+      // #when - using visual-engineering with run_in_background=true (normal background)
+      const result = await tool.execute(
+        {
+          description: "Test normal background",
+          prompt: "Do something visual",
+          category: "visual-engineering",
+          run_in_background: true,  // User explicitly says true - normal background
+          skills: [],
+        },
+        toolContext
+      )
+      
+      // #then - should NOT show unstable message (it's normal background flow)
+      expect(launchCalled).toBe(true)
+      expect(result).not.toContain("UNSTABLE AGENT MODE")
+      expect(result).toContain("task-normal-bg")
+    })
+
+    test("non-gemini model with run_in_background=false should run sync (not forced to background)", async () => {
+      // #given - category using non-gemini model with run_in_background=false
+      const { createDelegateTask } = require("./tools")
+      let launchCalled = false
+      let promptCalled = false
+      
+      const mockManager = {
+        launch: async () => {
+          launchCalled = true
+          return { id: "should-not-be-called", sessionID: "x", description: "x", agent: "x", status: "running" }
+        },
+      }
+      
+      const mockClient = {
+        app: { agents: async () => ({ data: [] }) },
+        config: { get: async () => ({ data: { model: SYSTEM_DEFAULT_MODEL } }) },
+        session: {
+          get: async () => ({ data: { directory: "/project" } }),
+          create: async () => ({ data: { id: "ses_sync_non_gemini" } }),
+          prompt: async () => {
+            promptCalled = true
+            return { data: {} }
+          },
+          messages: async () => ({
+            data: [{ info: { role: "assistant" }, parts: [{ type: "text", text: "Done sync" }] }]
+          }),
+          status: async () => ({ data: { "ses_sync_non_gemini": { type: "idle" } } }),
+        },
+      }
+      
+      // Use ultrabrain which uses gpt-5.2 (non-gemini)
+      const tool = createDelegateTask({
+        manager: mockManager,
+        client: mockClient,
+      })
+      
+      const toolContext = {
+        sessionID: "parent-session",
+        messageID: "parent-message",
+        agent: "Sisyphus",
+        abort: new AbortController().signal,
+      }
+      
+      // #when - using ultrabrain (gpt model) with run_in_background=false
+      const result = await tool.execute(
+        {
+          description: "Test non-gemini sync",
+          prompt: "Do something smart",
+          category: "ultrabrain",
+          run_in_background: false,
+          skills: [],
+        },
+        toolContext
+      )
+      
+      // #then - should run sync, NOT forced to background
+      expect(launchCalled).toBe(false)  // manager.launch should NOT be called
+      expect(promptCalled).toBe(true)   // sync mode uses session.prompt
+      expect(result).not.toContain("UNSTABLE AGENT MODE")
+    }, { timeout: 20000 })
+
+    test("artistry category (gemini) with run_in_background=false should force background but wait for result", async () => {
+      // #given - artistry also uses gemini model
+      const { createDelegateTask } = require("./tools")
+      let launchCalled = false
+      
+      const mockManager = {
+        launch: async () => {
+          launchCalled = true
+          return {
+            id: "task-artistry",
+            sessionID: "ses_artistry_gemini",
+            description: "Artistry gemini task",
+            agent: "Sisyphus-Junior",
+            status: "running",
+          }
+        },
+      }
+      
+      const mockClient = {
+        app: { agents: async () => ({ data: [] }) },
+        config: { get: async () => ({ data: { model: SYSTEM_DEFAULT_MODEL } }) },
+        session: {
+          get: async () => ({ data: { directory: "/project" } }),
+          create: async () => ({ data: { id: "ses_artistry_gemini" } }),
+          prompt: async () => ({ data: {} }),
+          messages: async () => ({
+            data: [
+              { info: { role: "assistant", time: { created: Date.now() } }, parts: [{ type: "text", text: "Artistry result here" }] }
+            ]
+          }),
+          status: async () => ({ data: { "ses_artistry_gemini": { type: "idle" } } }),
+        },
+      }
+      
+      const tool = createDelegateTask({
+        manager: mockManager,
+        client: mockClient,
+      })
+      
+      const toolContext = {
+        sessionID: "parent-session",
+        messageID: "parent-message",
+        agent: "Sisyphus",
+        abort: new AbortController().signal,
+      }
+      
+      // #when - artistry category (gemini-3-pro-preview with max variant)
+      const result = await tool.execute(
+        {
+          description: "Test artistry forced background",
+          prompt: "Do something artistic",
+          category: "artistry",
+          run_in_background: false,
+          skills: [],
+        },
+        toolContext
+      )
+      
+      // #then - should launch as background BUT wait for and return actual result
+      expect(launchCalled).toBe(true)
+      expect(result).toContain("UNSTABLE AGENT")
+      expect(result).toContain("Artistry result here")
+    }, { timeout: 20000 })
+
+    test("writing category (gemini-flash) with run_in_background=false should force background but wait for result", async () => {
+      // #given - writing uses gemini-3-flash-preview
+      const { createDelegateTask } = require("./tools")
+      let launchCalled = false
+      
+      const mockManager = {
+        launch: async () => {
+          launchCalled = true
+          return {
+            id: "task-writing",
+            sessionID: "ses_writing_gemini",
+            description: "Writing gemini task",
+            agent: "Sisyphus-Junior",
+            status: "running",
+          }
+        },
+      }
+      
+      const mockClient = {
+        app: { agents: async () => ({ data: [] }) },
+        config: { get: async () => ({ data: { model: SYSTEM_DEFAULT_MODEL } }) },
+        session: {
+          get: async () => ({ data: { directory: "/project" } }),
+          create: async () => ({ data: { id: "ses_writing_gemini" } }),
+          prompt: async () => ({ data: {} }),
+          messages: async () => ({
+            data: [
+              { info: { role: "assistant", time: { created: Date.now() } }, parts: [{ type: "text", text: "Writing result here" }] }
+            ]
+          }),
+          status: async () => ({ data: { "ses_writing_gemini": { type: "idle" } } }),
+        },
+      }
+      
+      const tool = createDelegateTask({
+        manager: mockManager,
+        client: mockClient,
+      })
+      
+      const toolContext = {
+        sessionID: "parent-session",
+        messageID: "parent-message",
+        agent: "Sisyphus",
+        abort: new AbortController().signal,
+      }
+      
+      // #when - writing category (gemini-3-flash-preview)
+      const result = await tool.execute(
+        {
+          description: "Test writing forced background",
+          prompt: "Write something",
+          category: "writing",
+          run_in_background: false,
+          skills: [],
+        },
+        toolContext
+      )
+      
+      // #then - should launch as background BUT wait for and return actual result
+      expect(launchCalled).toBe(true)
+      expect(result).toContain("UNSTABLE AGENT")
+      expect(result).toContain("Writing result here")
+    }, { timeout: 20000 })
+
+    test("is_unstable_agent=true should force background but wait for result", async () => {
+      // #given - custom category with is_unstable_agent=true but non-gemini model
+      const { createDelegateTask } = require("./tools")
+      let launchCalled = false
+      
+      const mockManager = {
+        launch: async () => {
+          launchCalled = true
+          return {
+            id: "task-custom-unstable",
+            sessionID: "ses_custom_unstable",
+            description: "Custom unstable task",
+            agent: "Sisyphus-Junior",
+            status: "running",
+          }
+        },
+      }
+      
+      const mockClient = {
+        app: { agents: async () => ({ data: [] }) },
+        config: { get: async () => ({ data: { model: SYSTEM_DEFAULT_MODEL } }) },
+        session: {
+          get: async () => ({ data: { directory: "/project" } }),
+          create: async () => ({ data: { id: "ses_custom_unstable" } }),
+          prompt: async () => ({ data: {} }),
+          messages: async () => ({
+            data: [
+              { info: { role: "assistant", time: { created: Date.now() } }, parts: [{ type: "text", text: "Custom unstable result" }] }
+            ]
+          }),
+          status: async () => ({ data: { "ses_custom_unstable": { type: "idle" } } }),
+        },
+      }
+      
+      const tool = createDelegateTask({
+        manager: mockManager,
+        client: mockClient,
+        userCategories: {
+          "my-unstable-cat": {
+            model: "openai/gpt-5.2",
+            is_unstable_agent: true,
+          },
+        },
+      })
+      
+      const toolContext = {
+        sessionID: "parent-session",
+        messageID: "parent-message",
+        agent: "Sisyphus",
+        abort: new AbortController().signal,
+      }
+      
+      // #when - using custom unstable category with run_in_background=false
+      const result = await tool.execute(
+        {
+          description: "Test custom unstable",
+          prompt: "Do something",
+          category: "my-unstable-cat",
+          run_in_background: false,
+          skills: [],
+        },
+        toolContext
+      )
+      
+      // #then - should launch as background BUT wait for and return actual result
+      expect(launchCalled).toBe(true)
+      expect(result).toContain("UNSTABLE AGENT")
+      expect(result).toContain("Custom unstable result")
+    }, { timeout: 20000 })
+  })
+
  describe("buildSystemContent", () => {
    test("returns undefined when no skills and no category promptAppend", () => {
      // #given
--- a/src/tools/delegate-task/tools.ts
+++ b/src/tools/delegate-task/tools.ts
@@ -483,10 +483,9 @@ ${textContent || "(No text output)"}`
          : undefined
        categoryPromptAppend = resolved.promptAppend || undefined

-        // Unstable agent detection - force background mode for monitoring
+        // Unstable agent detection - launch as background for monitoring but wait for result
        const isUnstableAgent = resolved.config.is_unstable_agent === true || actualModel.toLowerCase().includes("gemini")
        if (isUnstableAgent && args.run_in_background === false) {
-          // Force background mode for unstable agents
          const systemContent = buildSystemContent({ skillContent, categoryPromptAppend })

          try {
@@ -503,21 +502,92 @@ ${textContent || "(No text output)"}`
              skillContent: systemContent,
            })

+            const sessionID = task.sessionID
+            if (!sessionID) {
+              return formatDetailedError(new Error("Background task launched but no sessionID returned"), {
+                operation: "Launch background task (unstable agent)",
+                args,
+                agent: agentToUse,
+                category: args.category,
+              })
+            }
+
            ctx.metadata?.({
              title: args.description,
-              metadata: { sessionId: task.sessionID, category: args.category },
+              metadata: { sessionId: sessionID, category: args.category },
            })

-            return `[UNSTABLE AGENT MODE]
+            const startTime = new Date()

-This category uses an unstable/experimental model (${actualModel}).
-Forced to background mode for monitoring stability.
+            // Poll for completion (same logic as sync mode)
+            const POLL_INTERVAL_MS = 500
+            const MAX_POLL_TIME_MS = 10 * 60 * 1000
+            const MIN_STABILITY_TIME_MS = 10000
+            const STABILITY_POLLS_REQUIRED = 3
+            const pollStart = Date.now()
+            let lastMsgCount = 0
+            let stablePolls = 0

-Task ID: ${task.id}
-Session ID: ${task.sessionID}
+            while (Date.now() - pollStart < MAX_POLL_TIME_MS) {
+              if (ctx.abort?.aborted) {
+                return `[UNSTABLE AGENT] Task aborted.\n\nSession ID: ${sessionID}`
+              }

-Monitor progress: Use \`background_output\` with task_id="${task.id}"
-Or watch the session directly for real-time updates.`
+              await new Promise(resolve => setTimeout(resolve, POLL_INTERVAL_MS))
+
+              const statusResult = await client.session.status()
+              const allStatuses = (statusResult.data ?? {}) as Record<string, { type: string }>
+              const sessionStatus = allStatuses[sessionID]
+
+              if (sessionStatus && sessionStatus.type !== "idle") {
+                stablePolls = 0
+                lastMsgCount = 0
+                continue
+              }
+
+              if (Date.now() - pollStart < MIN_STABILITY_TIME_MS) continue
+
+              const messagesCheck = await client.session.messages({ path: { id: sessionID } })
+              const msgs = ((messagesCheck as { data?: unknown }).data ?? messagesCheck) as Array<unknown>
+              const currentMsgCount = msgs.length
+
+              if (currentMsgCount === lastMsgCount) {
+                stablePolls++
+                if (stablePolls >= STABILITY_POLLS_REQUIRED) break
+              } else {
+                stablePolls = 0
+                lastMsgCount = currentMsgCount
+              }
+            }
+
+            const messagesResult = await client.session.messages({ path: { id: sessionID } })
+            const messages = ((messagesResult as { data?: unknown }).data ?? messagesResult) as Array<{
+              info?: { role?: string; time?: { created?: number } }
+              parts?: Array<{ type?: string; text?: string }>
+            }>
+
+            const assistantMessages = messages
+              .filter((m) => m.info?.role === "assistant")
+              .sort((a, b) => (b.info?.time?.created ?? 0) - (a.info?.time?.created ?? 0))
+            const lastMessage = assistantMessages[0]
+
+            if (!lastMessage) {
+              return `[UNSTABLE AGENT] No assistant response found.\n\nSession ID: ${sessionID}`
+            }
+
+            const textParts = lastMessage?.parts?.filter((p) => p.type === "text" || p.type === "reasoning") ?? []
+            const textContent = textParts.map((p) => p.text ?? "").filter(Boolean).join("\n")
+            const duration = formatDuration(startTime)
+
+            return `[UNSTABLE AGENT] Task completed in ${duration}.
+
+Model: ${actualModel} (unstable/experimental - launched via background for monitoring)
+Agent: ${agentToUse}${args.category ? ` (category: ${args.category})` : ""}
+Session ID: ${sessionID}
+
+---
+
+${textContent || "(No text output)"}`
          } catch (error) {
            return formatDetailedError(error, {
              operation: "Launch background task (unstable agent)",