diff --git a/src/tools/look-at/multimodal-agent-metadata.test.ts b/src/tools/look-at/multimodal-agent-metadata.test.ts index d47a377e5..a92b33c4b 100644 --- a/src/tools/look-at/multimodal-agent-metadata.test.ts +++ b/src/tools/look-at/multimodal-agent-metadata.test.ts @@ -65,6 +65,35 @@ describe("resolveMultimodalLookerAgentMetadata", () => { }) }) + test("preserves hardcoded fallback variant when the registered model matches a cache-derived entry", async () => { + // given + setVisionCapableModelsCache(new Map([ + [ + "openai/gpt-5.4", + { providerID: "openai", modelID: "gpt-5.4" }, + ], + ])) + spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue( + new Set(["openai/gpt-5.4"]), + ) + spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["openai"]) + const ctx = createPluginInput([ + { + name: "multimodal-looker", + model: { providerID: "openai", modelID: "gpt-5.4" }, + }, + ]) + + // when + const result = await resolveMultimodalLookerAgentMetadata(ctx) + + // then + expect(result).toEqual({ + agentModel: { providerID: "openai", modelID: "gpt-5.4" }, + agentVariant: "medium", + }) + }) + test("prefers connected vision-capable provider models before the hardcoded fallback chain", async () => { // given setVisionCapableModelsCache(new Map([ @@ -97,6 +126,12 @@ describe("resolveMultimodalLookerAgentMetadata", () => { test("falls back to the hardcoded multimodal chain when no dynamic vision model exists", async () => { // given + setVisionCapableModelsCache(new Map([ + [ + "google/gemini-3-flash", + { providerID: "google", modelID: "gemini-3-flash" }, + ], + ])) spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue( new Set(["google/gemini-3-flash"]), ) @@ -112,4 +147,24 @@ describe("resolveMultimodalLookerAgentMetadata", () => { agentVariant: undefined, }) }) + + test("does not return a registered model when no vision-capable model is available", async () => { + // given + spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue( + new Set(["openai/gpt-5.4"]), + ) + spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["openai"]) + const ctx = createPluginInput([ + { + name: "multimodal-looker", + model: { providerID: "openai", modelID: "gpt-5.4" }, + }, + ]) + + // when + const result = await resolveMultimodalLookerAgentMetadata(ctx) + + // then + expect(result).toEqual({}) + }) }) diff --git a/src/tools/look-at/multimodal-agent-metadata.ts b/src/tools/look-at/multimodal-agent-metadata.ts index a96f9471e..c2c249b41 100644 --- a/src/tools/look-at/multimodal-agent-metadata.ts +++ b/src/tools/look-at/multimodal-agent-metadata.ts @@ -28,6 +28,19 @@ function getFullModelKey(model: AgentModel): string { return `${model.providerID}/${model.modelID}` } +function isVisionCapableAgentModel( + agentModel: AgentModel | undefined, + visionCapableModels: Array, +): agentModel is AgentModel { + if (!agentModel) { + return false + } + + return visionCapableModels.some((visionCapableModel) => + getFullModelKey(visionCapableModel) === getFullModelKey(agentModel), + ) +} + function parseAgentModel(model: string): AgentModel | undefined { const [providerID, ...modelIDParts] = model.split("/") const modelID = modelIDParts.join("/") @@ -90,6 +103,10 @@ async function resolveDynamicAgentMetadata( }) const agentModel = resolution ? parseAgentModel(resolution.model) : undefined + if (!isVisionCapableAgentModel(agentModel, visionCapableModels)) { + return {} + } + return { agentModel, agentVariant: resolution?.variant, @@ -113,22 +130,32 @@ export async function resolveMultimodalLookerAgentMetadata( try { const registeredMetadata = await resolveRegisteredAgentMetadata(ctx) const visionCapableModels = readVisionCapableModelsCache() - - if (registeredMetadata.agentModel && visionCapableModels.length === 0) { - return registeredMetadata - } + const registeredModelIsVisionCapable = isVisionCapableAgentModel( + registeredMetadata.agentModel, + visionCapableModels, + ) const dynamicMetadata = await resolveDynamicAgentMetadata(ctx, visionCapableModels) - if (isConfiguredVisionModel(registeredMetadata.agentModel, dynamicMetadata.agentModel)) { - return registeredMetadata + if ( + registeredModelIsVisionCapable && + isConfiguredVisionModel(registeredMetadata.agentModel, dynamicMetadata.agentModel) + ) { + return { + agentModel: registeredMetadata.agentModel, + agentVariant: registeredMetadata.agentVariant ?? dynamicMetadata.agentVariant, + } } if (dynamicMetadata.agentModel) { return dynamicMetadata } - return registeredMetadata + if (registeredModelIsVisionCapable) { + return registeredMetadata + } + + return {} } catch (error) { log("[look_at] Failed to resolve multimodal-looker model info", error) return {} diff --git a/src/tools/look-at/multimodal-fallback-chain.test.ts b/src/tools/look-at/multimodal-fallback-chain.test.ts index d37c5b6ec..4d614d070 100644 --- a/src/tools/look-at/multimodal-fallback-chain.test.ts +++ b/src/tools/look-at/multimodal-fallback-chain.test.ts @@ -1,3 +1,5 @@ +import { describe, expect, it } from "bun:test" + describe("buildMultimodalLookerFallbackChain", () => { it("builds fallback chain from vision-capable models", async () => { // given @@ -28,4 +30,20 @@ describe("buildMultimodalLookerFallbackChain", () => { expect(result[0].model).toBe("gpt-5.4") expect(result[0].providers).toContain("openai") }) + + it("preserves hardcoded variant metadata for cache-derived entries", async () => { + // given + const { buildMultimodalLookerFallbackChain } = await import("./multimodal-fallback-chain") + const visionCapableModels = [{ providerID: "openai", modelID: "gpt-5.4" }] + + // when + const result = buildMultimodalLookerFallbackChain(visionCapableModels) + + // then + expect(result[0]).toEqual({ + providers: ["openai"], + model: "gpt-5.4", + variant: "medium", + }) + }) }) diff --git a/src/tools/look-at/multimodal-fallback-chain.ts b/src/tools/look-at/multimodal-fallback-chain.ts index 2e0f65de1..81a3fb580 100644 --- a/src/tools/look-at/multimodal-fallback-chain.ts +++ b/src/tools/look-at/multimodal-fallback-chain.ts @@ -8,6 +8,15 @@ function getFullModelKey(providerID: string, modelID: string): string { return `${providerID}/${modelID}` } +function findHardcodedFallbackEntry( + providerID: string, + modelID: string, +): FallbackEntry | undefined { + return MULTIMODAL_LOOKER_REQUIREMENT.fallbackChain.find((entry) => + entry.model === modelID && entry.providers.includes(providerID), + ) +} + export function isHardcodedMultimodalFallbackModel(model: VisionCapableModel): boolean { return MULTIMODAL_LOOKER_REQUIREMENT.fallbackChain.some((entry) => entry.providers.some((providerID) => @@ -26,10 +35,16 @@ export function buildMultimodalLookerFallbackChain( const key = getFullModelKey(visionCapableModel.providerID, visionCapableModel.modelID) if (seen.has(key)) continue + const hardcodedEntry = findHardcodedFallbackEntry( + visionCapableModel.providerID, + visionCapableModel.modelID, + ) + seen.add(key) fallbackChain.push({ providers: [visionCapableModel.providerID], model: visionCapableModel.modelID, + ...(hardcodedEntry?.variant ? { variant: hardcodedEntry.variant } : {}), }) } @@ -41,7 +56,9 @@ export function buildMultimodalLookerFallbackChain( continue } - providerModelKeys.forEach((key) => seen.add(key)) + providerModelKeys.forEach((key) => { + seen.add(key) + }) fallbackChain.push(entry) } diff --git a/src/tools/look-at/tools.ts b/src/tools/look-at/tools.ts index c363fe77c..3fe5e4e11 100644 --- a/src/tools/look-at/tools.ts +++ b/src/tools/look-at/tools.ts @@ -4,6 +4,7 @@ import { tool, type PluginInput, type ToolDefinition } from "@opencode-ai/plugin import { LOOK_AT_DESCRIPTION, MULTIMODAL_LOOKER_AGENT } from "./constants" import type { LookAtArgs } from "./types" import { log, promptSyncWithModelSuggestionRetry } from "../../shared" +import { readVisionCapableModelsCache } from "../../shared/vision-capable-models-cache" import { extractLatestAssistantText } from "./assistant-message-extractor" import type { LookAtArgsWithAlias } from "./look-at-arguments" import { normalizeArgs, validateArgs } from "./look-at-arguments" @@ -38,6 +39,16 @@ function getTemporaryConversionPath(error: unknown): string | null { return null } +function isVisionCapableResolvedModel(model: { + providerID: string + modelID: string +}): boolean { + return readVisionCapableModelsCache().some((visionCapableModel) => + visionCapableModel.providerID === model.providerID && + visionCapableModel.modelID === model.modelID, + ) +} + export { normalizeArgs, validateArgs } from "./look-at-arguments" export function createLookAt(ctx: PluginInput): ToolDefinition { @@ -136,6 +147,14 @@ Provide ONLY the extracted information that matches the goal. Be thorough on what was requested, concise on everything else. If the requested information is not found, clearly state what is missing.` + const { agentModel, agentVariant } = await resolveMultimodalLookerAgentMetadata(ctx) + if (!agentModel || !isVisionCapableResolvedModel(agentModel)) { + log("[look_at] No vision-capable multimodal-looker model resolved", { + resolvedModel: agentModel, + }) + return "Error: No vision-capable multimodal-looker model available" + } + log(`[look_at] Creating session with parent: ${toolContext.sessionID}`) const parentSession = await ctx.client.session.get({ path: { id: toolContext.sessionID }, @@ -169,8 +188,6 @@ Original error: ${createResult.error}` const sessionID = createResult.data.id log(`[look_at] Created session: ${sessionID}`) - const { agentModel, agentVariant } = await resolveMultimodalLookerAgentMetadata(ctx) - log(`[look_at] Sending prompt with ${isBase64Input ? "base64 image" : "file"} to session ${sessionID}`) try { await promptSyncWithModelSuggestionRetry(ctx.client, { @@ -187,7 +204,7 @@ Original error: ${createResult.error}` { type: "text", text: prompt }, filePart, ], - ...(agentModel ? { model: { providerID: agentModel.providerID, modelID: agentModel.modelID } } : {}), + model: { providerID: agentModel.providerID, modelID: agentModel.modelID }, ...(agentVariant ? { variant: agentVariant } : {}), }, })