Merge pull request #2469 from code-yeongyu/fix/multimodal-variant-metadata

fix(look-at): preserve variant metadata and block non-vision models
This commit is contained in:
YeonGyu-Kim
2026-03-11 21:58:51 +09:00
committed by GitHub
6 changed files with 147 additions and 10 deletions

View File

@@ -65,6 +65,35 @@ describe("resolveMultimodalLookerAgentMetadata", () => {
})
})
test("preserves hardcoded fallback variant when the registered model matches a cache-derived entry", async () => {
  // given: the vision cache holds the same provider/model the agent registered
  setVisionCapableModelsCache(
    new Map([["openai/gpt-5.4", { providerID: "openai", modelID: "gpt-5.4" }]]),
  )
  spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(new Set(["openai/gpt-5.4"]))
  spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["openai"])
  const pluginInput = createPluginInput([
    { name: "multimodal-looker", model: { providerID: "openai", modelID: "gpt-5.4" } },
  ])
  // when
  const metadata = await resolveMultimodalLookerAgentMetadata(pluginInput)
  // then: the "medium" variant from the hardcoded chain is carried over
  expect(metadata).toEqual({
    agentModel: { providerID: "openai", modelID: "gpt-5.4" },
    agentVariant: "medium",
  })
})
test("prefers connected vision-capable provider models before the hardcoded fallback chain", async () => {
// given
setVisionCapableModelsCache(new Map([
@@ -97,6 +126,12 @@ describe("resolveMultimodalLookerAgentMetadata", () => {
test("falls back to the hardcoded multimodal chain when no dynamic vision model exists", async () => {
// given
setVisionCapableModelsCache(new Map([
[
"google/gemini-3-flash",
{ providerID: "google", modelID: "gemini-3-flash" },
],
]))
spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(
new Set(["google/gemini-3-flash"]),
)
@@ -112,4 +147,24 @@ describe("resolveMultimodalLookerAgentMetadata", () => {
agentVariant: undefined,
})
})
test("does not return a registered model when no vision-capable model is available", async () => {
  // given: availability/provider caches answer, but no vision-capable model is set up here
  spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(new Set(["openai/gpt-5.4"]))
  spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["openai"])
  const pluginInput = createPluginInput([
    { name: "multimodal-looker", model: { providerID: "openai", modelID: "gpt-5.4" } },
  ])
  // when
  const metadata = await resolveMultimodalLookerAgentMetadata(pluginInput)
  // then: nothing is resolved
  expect(metadata).toEqual({})
})
})

View File

@@ -28,6 +28,19 @@ function getFullModelKey(model: AgentModel): string {
return `${model.providerID}/${model.modelID}`
}
/**
 * Type guard: true when `agentModel` is defined and its provider/model key
 * matches one of the entries in `visionCapableModels`.
 */
function isVisionCapableAgentModel(
  agentModel: AgentModel | undefined,
  visionCapableModels: Array<AgentModel>,
): agentModel is AgentModel {
  if (!agentModel) {
    return false
  }
  // Compute the target key once, then scan for a match.
  const targetKey = getFullModelKey(agentModel)
  for (const candidate of visionCapableModels) {
    if (getFullModelKey(candidate) === targetKey) {
      return true
    }
  }
  return false
}
function parseAgentModel(model: string): AgentModel | undefined {
const [providerID, ...modelIDParts] = model.split("/")
const modelID = modelIDParts.join("/")
@@ -90,6 +103,10 @@ async function resolveDynamicAgentMetadata(
})
const agentModel = resolution ? parseAgentModel(resolution.model) : undefined
if (!isVisionCapableAgentModel(agentModel, visionCapableModels)) {
return {}
}
return {
agentModel,
agentVariant: resolution?.variant,
@@ -113,22 +130,32 @@ export async function resolveMultimodalLookerAgentMetadata(
try {
const registeredMetadata = await resolveRegisteredAgentMetadata(ctx)
const visionCapableModels = readVisionCapableModelsCache()
if (registeredMetadata.agentModel && visionCapableModels.length === 0) {
return registeredMetadata
}
const registeredModelIsVisionCapable = isVisionCapableAgentModel(
registeredMetadata.agentModel,
visionCapableModels,
)
const dynamicMetadata = await resolveDynamicAgentMetadata(ctx, visionCapableModels)
if (isConfiguredVisionModel(registeredMetadata.agentModel, dynamicMetadata.agentModel)) {
return registeredMetadata
if (
registeredModelIsVisionCapable &&
isConfiguredVisionModel(registeredMetadata.agentModel, dynamicMetadata.agentModel)
) {
return {
agentModel: registeredMetadata.agentModel,
agentVariant: registeredMetadata.agentVariant ?? dynamicMetadata.agentVariant,
}
}
if (dynamicMetadata.agentModel) {
return dynamicMetadata
}
return registeredMetadata
if (registeredModelIsVisionCapable) {
return registeredMetadata
}
return {}
} catch (error) {
log("[look_at] Failed to resolve multimodal-looker model info", error)
return {}

View File

@@ -1,3 +1,5 @@
import { describe, expect, it } from "bun:test"
describe("buildMultimodalLookerFallbackChain", () => {
it("builds fallback chain from vision-capable models", async () => {
// given
@@ -28,4 +30,20 @@ describe("buildMultimodalLookerFallbackChain", () => {
expect(result[0].model).toBe("gpt-5.4")
expect(result[0].providers).toContain("openai")
})
it("preserves hardcoded variant metadata for cache-derived entries", async () => {
  // given
  const { buildMultimodalLookerFallbackChain } = await import("./multimodal-fallback-chain")
  const cachedModels = [{ providerID: "openai", modelID: "gpt-5.4" }]
  // when
  const chain = buildMultimodalLookerFallbackChain(cachedModels)
  // then: the derived entry keeps the hardcoded "medium" variant
  expect(chain[0]).toEqual({ providers: ["openai"], model: "gpt-5.4", variant: "medium" })
})
})

View File

@@ -8,6 +8,15 @@ function getFullModelKey(providerID: string, modelID: string): string {
return `${providerID}/${modelID}`
}
/**
 * Returns the hardcoded fallback-chain entry whose model matches `modelID`
 * and whose provider list contains `providerID`, or undefined when none does.
 */
function findHardcodedFallbackEntry(
  providerID: string,
  modelID: string,
): FallbackEntry | undefined {
  for (const entry of MULTIMODAL_LOOKER_REQUIREMENT.fallbackChain) {
    if (entry.model === modelID && entry.providers.includes(providerID)) {
      return entry
    }
  }
  return undefined
}
export function isHardcodedMultimodalFallbackModel(model: VisionCapableModel): boolean {
return MULTIMODAL_LOOKER_REQUIREMENT.fallbackChain.some((entry) =>
entry.providers.some((providerID) =>
@@ -26,10 +35,16 @@ export function buildMultimodalLookerFallbackChain(
const key = getFullModelKey(visionCapableModel.providerID, visionCapableModel.modelID)
if (seen.has(key)) continue
const hardcodedEntry = findHardcodedFallbackEntry(
visionCapableModel.providerID,
visionCapableModel.modelID,
)
seen.add(key)
fallbackChain.push({
providers: [visionCapableModel.providerID],
model: visionCapableModel.modelID,
...(hardcodedEntry?.variant ? { variant: hardcodedEntry.variant } : {}),
})
}
@@ -41,7 +56,9 @@ export function buildMultimodalLookerFallbackChain(
continue
}
providerModelKeys.forEach((key) => seen.add(key))
providerModelKeys.forEach((key) => {
seen.add(key)
})
fallbackChain.push(entry)
}

View File

@@ -1,5 +1,6 @@
import { describe, expect, test, mock } from "bun:test"
import type { ToolContext } from "@opencode-ai/plugin/tool"
import { setVisionCapableModelsCache } from "../../shared/vision-capable-models-cache"
import { normalizeArgs, validateArgs, createLookAt } from "./tools"
describe("look-at tool", () => {
@@ -255,6 +256,8 @@ describe("look-at tool", () => {
// when LookAt tool executed
// then model info should be passed to sync prompt
test("passes multimodal-looker model to sync prompt when available", async () => {
setVisionCapableModelsCache(new Map([["google/gemini-3-flash", { providerID: "google", modelID: "gemini-3-flash" }]]))
let promptBody: any
const mockClient = {

View File

@@ -4,6 +4,7 @@ import { tool, type PluginInput, type ToolDefinition } from "@opencode-ai/plugin
import { LOOK_AT_DESCRIPTION, MULTIMODAL_LOOKER_AGENT } from "./constants"
import type { LookAtArgs } from "./types"
import { log, promptSyncWithModelSuggestionRetry } from "../../shared"
import { readVisionCapableModelsCache } from "../../shared/vision-capable-models-cache"
import { extractLatestAssistantText } from "./assistant-message-extractor"
import type { LookAtArgsWithAlias } from "./look-at-arguments"
import { normalizeArgs, validateArgs } from "./look-at-arguments"
@@ -38,6 +39,16 @@ function getTemporaryConversionPath(error: unknown): string | null {
return null
}
/**
 * Returns true when the resolved model is present in the vision-capable
 * models cache, matched on both providerID and modelID.
 */
function isVisionCapableResolvedModel(model: {
  providerID: string
  modelID: string
}): boolean {
  const { providerID, modelID } = model
  for (const cached of readVisionCapableModelsCache()) {
    if (cached.providerID === providerID && cached.modelID === modelID) {
      return true
    }
  }
  return false
}
export { normalizeArgs, validateArgs } from "./look-at-arguments"
export function createLookAt(ctx: PluginInput): ToolDefinition {
@@ -136,6 +147,14 @@ Provide ONLY the extracted information that matches the goal.
Be thorough on what was requested, concise on everything else.
If the requested information is not found, clearly state what is missing.`
const { agentModel, agentVariant } = await resolveMultimodalLookerAgentMetadata(ctx)
if (agentModel && !isVisionCapableResolvedModel(agentModel)) {
log("[look_at] Resolved model is not vision-capable, blocking", {
resolvedModel: agentModel,
})
return "Error: Resolved multimodal-looker model is not vision-capable"
}
log(`[look_at] Creating session with parent: ${toolContext.sessionID}`)
const parentSession = await ctx.client.session.get({
path: { id: toolContext.sessionID },
@@ -169,8 +188,6 @@ Original error: ${createResult.error}`
const sessionID = createResult.data.id
log(`[look_at] Created session: ${sessionID}`)
const { agentModel, agentVariant } = await resolveMultimodalLookerAgentMetadata(ctx)
log(`[look_at] Sending prompt with ${isBase64Input ? "base64 image" : "file"} to session ${sessionID}`)
try {
await promptSyncWithModelSuggestionRetry(ctx.client, {