Merge pull request #1471 from high726/fix/look-at-clipboard-image-support

feat(look_at): add image_data parameter for clipboard/pasted image support
2026-02-04 15:55:29 +09:00
parent 819c5b5d29 d099b0255f
commit 8d0fa97b72
3 changed files with 231 additions and 21 deletions
--- a/src/tools/look-at/tools.test.ts
+++ b/src/tools/look-at/tools.test.ts
@@ -31,25 +31,52 @@ describe("look-at tool", () => {
      const normalized = normalizeArgs(args as any)
      expect(normalized.file_path).toBe("/preferred.png")
    })
+
+    // given image_data provided
+    // when called with base64 image data
+    // then preserve image_data in normalized args
+    test("preserves image_data when provided", () => {
+      const args = { image_data: "data:image/png;base64,iVBORw0KGgo=", goal: "analyze" }
+      const normalized = normalizeArgs(args as any)
+      expect(normalized.image_data).toBe("data:image/png;base64,iVBORw0KGgo=")
+      expect(normalized.file_path).toBeUndefined()
+    })
  })

  describe("validateArgs", () => {
-    // given valid arguments
+    // given valid arguments with file_path
    // when validated
    // then return null (no error)
-    test("returns null for valid args", () => {
+    test("returns null for valid args with file_path", () => {
      const args = { file_path: "/valid/path.png", goal: "analyze" }
      expect(validateArgs(args)).toBeNull()
    })

-    // given file_path missing
+    // given valid arguments with image_data
+    // when validated
+    // then return null (no error)
+    test("returns null for valid args with image_data", () => {
+      const args = { image_data: "data:image/png;base64,iVBORw0KGgo=", goal: "analyze" }
+      expect(validateArgs(args)).toBeNull()
+    })
+
+    // given neither file_path nor image_data
    // when validated
    // then clear error message
-    test("returns error when file_path is missing", () => {
+    test("returns error when neither file_path nor image_data provided", () => {
      const args = { goal: "analyze" } as any
      const error = validateArgs(args)
      expect(error).toContain("file_path")
-      expect(error).toContain("required")
+      expect(error).toContain("image_data")
+    })
+
+    // given both file_path and image_data
+    // when validated
+    // then return error (mutually exclusive)
+    test("returns error when both file_path and image_data provided", () => {
+      const args = { file_path: "/path.png", image_data: "base64data", goal: "analyze" }
+      const error = validateArgs(args)
+      expect(error).toContain("only one")
    })

    // given goal missing
@@ -69,6 +96,17 @@ describe("look-at tool", () => {
      const args = { file_path: "", goal: "analyze" }
      const error = validateArgs(args)
      expect(error).toContain("file_path")
+      expect(error).toContain("image_data")
+    })
+
+    // given image_data is empty string
+    // when validated
+    // then return error
+    test("returns error when image_data is empty string", () => {
+      const args = { image_data: "", goal: "analyze" }
+      const error = validateArgs(args)
+      expect(error).toContain("file_path")
+      expect(error).toContain("image_data")
    })
  })

@@ -109,7 +147,7 @@ describe("look-at tool", () => {
        toolContext
      )

-      expect(result).toContain("Error: Failed to analyze file")
+      expect(result).toContain("Error: Failed to analyze")
      expect(result).toContain("malformed response")
      expect(result).toContain("multimodal-looker")
      expect(result).toContain("image/png")
@@ -217,4 +255,111 @@ describe("look-at tool", () => {
      })
    })
  })
+
+  describe("createLookAt with image_data", () => {
+    // given base64 image data is provided
+    // when LookAt tool executed
+    // then should send data URL to session.prompt
+    test("sends data URL when image_data provided", async () => {
+      let promptBody: any
+
+      const mockClient = {
+        app: {
+          agents: async () => ({ data: [] }),
+        },
+        session: {
+          get: async () => ({ data: { directory: "/project" } }),
+          create: async () => ({ data: { id: "ses_image_data_test" } }),
+          prompt: async (input: any) => {
+            promptBody = input.body
+            return { data: {} }
+          },
+          messages: async () => ({
+            data: [
+              { info: { role: "assistant", time: { created: 1 } }, parts: [{ type: "text", text: "analyzed" }] },
+            ],
+          }),
+        },
+      }
+
+      const tool = createLookAt({
+        client: mockClient,
+        directory: "/project",
+      } as any)
+
+      const toolContext: ToolContext = {
+        sessionID: "parent-session",
+        messageID: "parent-message",
+        agent: "sisyphus",
+        directory: "/project",
+        worktree: "/project",
+        abort: new AbortController().signal,
+        metadata: () => {},
+        ask: async () => {},
+      }
+
+      await tool.execute(
+        { image_data: "data:image/png;base64,iVBORw0KGgo=", goal: "describe this image" },
+        toolContext
+      )
+
+      const filePart = promptBody.parts.find((p: any) => p.type === "file")
+      expect(filePart).toBeDefined()
+      expect(filePart.url).toContain("data:image/png;base64")
+      expect(filePart.mime).toBe("image/png")
+      expect(filePart.filename).toContain("clipboard-image")
+    })
+
+    // given raw base64 without data URI prefix
+    // when LookAt tool executed
+    // then should detect mime type and create proper data URL
+    test("handles raw base64 without data URI prefix", async () => {
+      let promptBody: any
+
+      const mockClient = {
+        app: {
+          agents: async () => ({ data: [] }),
+        },
+        session: {
+          get: async () => ({ data: { directory: "/project" } }),
+          create: async () => ({ data: { id: "ses_raw_base64_test" } }),
+          prompt: async (input: any) => {
+            promptBody = input.body
+            return { data: {} }
+          },
+          messages: async () => ({
+            data: [
+              { info: { role: "assistant", time: { created: 1 } }, parts: [{ type: "text", text: "analyzed" }] },
+            ],
+          }),
+        },
+      }
+
+      const tool = createLookAt({
+        client: mockClient,
+        directory: "/project",
+      } as any)
+
+      const toolContext: ToolContext = {
+        sessionID: "parent-session",
+        messageID: "parent-message",
+        agent: "sisyphus",
+        directory: "/project",
+        worktree: "/project",
+        abort: new AbortController().signal,
+        metadata: () => {},
+        ask: async () => {},
+      }
+
+      await tool.execute(
+        { image_data: "iVBORw0KGgo=", goal: "analyze" },
+        toolContext
+      )
+
+      const filePart = promptBody.parts.find((p: any) => p.type === "file")
+      expect(filePart).toBeDefined()
+      expect(filePart.url).toContain("data:")
+      expect(filePart.url).toContain("base64")
+    })
+  })
 })
--- a/src/tools/look-at/tools.ts
+++ b/src/tools/look-at/tools.ts
@@ -11,14 +11,23 @@ interface LookAtArgsWithAlias extends LookAtArgs {

 export function normalizeArgs(args: LookAtArgsWithAlias): LookAtArgs {
  return {
-    file_path: args.file_path ?? args.path ?? "",
+    file_path: args.file_path ?? args.path, // `path` alias is used only when file_path is null/undefined; result stays undefined if both are absent
+    image_data: args.image_data, // copied through unchanged (may be undefined)
    goal: args.goal ?? "",
  }
 }

 export function validateArgs(args: LookAtArgs): string | null {
-  if (!args.file_path) {
-    return `Error: Missing required parameter 'file_path'. Usage: look_at(file_path="/path/to/file", goal="what to extract")`
+  const hasFilePath = args.file_path && args.file_path.length > 0
+  const hasImageData = args.image_data && args.image_data.length > 0
+  
+  if (!hasFilePath && !hasImageData) {
+    return `Error: Must provide either 'file_path' or 'image_data'. Usage:
+- look_at(file_path="/path/to/file", goal="what to extract")
+- look_at(image_data="base64_encoded_data", goal="what to extract")`
+  }
+  if (hasFilePath && hasImageData) {
+    return `Error: Provide only one of 'file_path' or 'image_data', not both.`
  }
  if (!args.goal) {
    return `Error: Missing required parameter 'goal'. Usage: look_at(file_path="/path/to/file", goal="what to extract")`
@@ -26,6 +35,28 @@ export function validateArgs(args: LookAtArgs): string | null {
  return null
 }

+function inferMimeTypeFromBase64(base64Data: string): string { // Picks a MIME type for base64 input: the type declared in a data URI if present, else a magic-byte match, else "image/png"
+  if (base64Data.startsWith("data:")) {
+    const match = base64Data.match(/^data:([^;]+);/) // captures the text between "data:" and the first ";"
+    if (match) return match[1] // a declared type wins; the payload itself is not inspected
+  }
+  // No declared type was returned above: decode the start of the payload and compare it with known file signatures
+  try {
+    const cleanData = base64Data.replace(/^data:[^;]+;base64,/, "") // NOTE(review): appears to be a no-op when reached - any input this pattern matches would already have returned above; confirm
+    const header = atob(cleanData.slice(0, 16)) // 16 base64 characters decode to at most 12 bytes
+    // Signature checks run in order; the first one that matches returns
+    if (header.startsWith("\x89PNG")) return "image/png"
+    if (header.startsWith("\xFF\xD8\xFF")) return "image/jpeg"
+    if (header.startsWith("GIF8")) return "image/gif"
+    if (header.startsWith("RIFF") && header.includes("WEBP")) return "image/webp" // needs the "RIFF" prefix plus "WEBP" somewhere in the decoded header
+    if (header.startsWith("%PDF")) return "application/pdf"
+  } catch {
+    // Invalid base64 - fall through to default
+  }
+  // NOTE(review): unrecognized or undecodable data is labelled PNG - confirm this default is intended
+  return "image/png"
+}
+
 function inferMimeType(filePath: string): string {
  const ext = extname(filePath).toLowerCase()
  const mimeTypes: Record<string, string> = {
@@ -64,11 +95,22 @@ function inferMimeType(filePath: string): string {
  return mimeTypes[ext] || "application/octet-stream"
 }

+function extractBase64Data(imageData: string): string { // Returns the text after the first comma of a "data:" string; any other input is returned unchanged
+  if (imageData.startsWith("data:")) {
+    const commaIndex = imageData.indexOf(",") // everything after the first comma is treated as the payload
+    if (commaIndex !== -1) {
+      return imageData.slice(commaIndex + 1) // NOTE(review): a ";base64" marker is not checked here - confirm callers only pass base64 data URIs
+    }
+  }
+  return imageData // input without a "data:" prefix, or a "data:" string with no comma, passes through as-is
+}
+
 export function createLookAt(ctx: PluginInput): ToolDefinition {
  return tool({
    description: LOOK_AT_DESCRIPTION,
    args: {
-      file_path: tool.schema.string().describe("Absolute path to the file to analyze"),
+      file_path: tool.schema.string().optional().describe("Absolute path to the file to analyze"),
+      image_data: tool.schema.string().optional().describe("Base64 encoded image data (for clipboard/pasted images)"),
      goal: tool.schema.string().describe("What specific information to extract from the file"),
    },
    async execute(rawArgs: LookAtArgs, toolContext) {
@@ -79,12 +121,34 @@ export function createLookAt(ctx: PluginInput): ToolDefinition {
        return validationError
      }

-      log(`[look_at] Analyzing file: ${args.file_path}, goal: ${args.goal}`)
+      const isBase64Input = Boolean(args.image_data)
+      const sourceDescription = isBase64Input ? "clipboard/pasted image" : args.file_path
+      log(`[look_at] Analyzing ${sourceDescription}, goal: ${args.goal}`)

-      const mimeType = inferMimeType(args.file_path)
-      const filename = basename(args.file_path)
+      let mimeType: string
+      let filePart: { type: "file"; mime: string; url: string; filename: string }

-      const prompt = `Analyze this file and extract the requested information.
+      if (isBase64Input) {
+        mimeType = inferMimeTypeFromBase64(args.image_data!)
+        const base64Content = extractBase64Data(args.image_data!)
+        const dataUrl = `data:${mimeType};base64,${base64Content}`
+        filePart = {
+          type: "file",
+          mime: mimeType,
+          url: dataUrl,
+          filename: `clipboard-image.${mimeType.split("/")[1] || "png"}`,
+        }
+      } else {
+        mimeType = inferMimeType(args.file_path!)
+        filePart = {
+          type: "file",
+          mime: mimeType,
+          url: pathToFileURL(args.file_path!).href,
+          filename: basename(args.file_path!),
+        }
+      }
+
+      const prompt = `Analyze this ${isBase64Input ? "image" : "file"} and extract the requested information.

 Goal: ${args.goal}

@@ -157,7 +221,7 @@ Original error: ${createResult.error}`
        log("[look_at] Failed to resolve multimodal-looker model info", error)
      }

-      log(`[look_at] Sending prompt with file passthrough to session ${sessionID}`)
+      log(`[look_at] Sending prompt with ${isBase64Input ? "base64 image" : "file"} to session ${sessionID}`)
      try {
        await promptWithModelSuggestionRetry(ctx.client, {
          path: { id: sessionID },
@@ -171,7 +235,7 @@ Original error: ${createResult.error}`
            },
            parts: [
              { type: "text", text: prompt },
-              { type: "file", mime: mimeType, url: pathToFileURL(args.file_path).href, filename },
+              filePart,
            ],
            ...(agentModel ? { model: { providerID: agentModel.providerID, modelID: agentModel.modelID } } : {}),
            ...(agentVariant ? { variant: agentVariant } : {}),
@@ -183,20 +247,20 @@ Original error: ${createResult.error}`

        const isJsonParseError = errorMessage.includes("JSON") && (errorMessage.includes("EOF") || errorMessage.includes("parse"))
        if (isJsonParseError) {
-          return `Error: Failed to analyze file - received malformed response from multimodal-looker agent.
+          return `Error: Failed to analyze ${isBase64Input ? "image" : "file"} - received malformed response from multimodal-looker agent.

 This typically occurs when:
 1. The multimodal-looker model is not available or not connected
-2. The model does not support this file type (${mimeType})
+2. The model does not support this ${isBase64Input ? "image format" : `file type (${mimeType})`}
 3. The API returned an empty or truncated response

-File: ${args.file_path}
+${isBase64Input ? "Source: clipboard/pasted image" : `File: ${args.file_path}`}
 MIME type: ${mimeType}

 Try:
 - Ensure a vision-capable model (e.g., gemini-3-flash, gpt-5.2) is available
 - Check provider connections in opencode settings
- For text files like .md, .txt, use the Read tool instead
+${!isBase64Input ? "- For text files like .md, .txt, use the Read tool instead" : ""}

 Original error: ${errorMessage}`
        }
--- a/src/tools/look-at/types.ts
+++ b/src/tools/look-at/types.ts
@@ -1,4 +1,5 @@
 export interface LookAtArgs {
-  file_path: string
+  file_path?: string // optional at the type level; validateArgs rejects calls that supply neither or both of file_path and image_data
+  image_data?: string  // base64 encoded image data (for clipboard images)
  goal: string
 }