fix(look-at): preserve variant metadata in fallback chain and block non-vision models

- fallback-chain.ts: cache-derived entries inherit variant from matching hardcoded entries
- agent-metadata.ts: new isVisionCapableAgentModel() guard blocks non-vision registered models
- tools.ts: early vision-capability check before session creation
- Added regression tests for variant preservation and non-vision model rejection
This commit is contained in:
YeonGyu-Kim
2026-03-11 21:45:48 +09:00
parent d4232c9eac
commit 85151f7dfd
5 changed files with 145 additions and 11 deletions

View File

@@ -65,6 +65,35 @@ describe("resolveMultimodalLookerAgentMetadata", () => {
})
})
// Regression test: when the user-registered model happens to match an entry
// that the cache-derived fallback chain produced, the variant declared on the
// matching hardcoded fallback entry ("medium") must survive resolution instead
// of being dropped.
test("preserves hardcoded fallback variant when the registered model matches a cache-derived entry", async () => {
// given
// Seed the vision-capable cache with the same provider/model pair the user registered.
setVisionCapableModelsCache(new Map([
[
"openai/gpt-5.4",
{ providerID: "openai", modelID: "gpt-5.4" },
],
]))
spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(
new Set(["openai/gpt-5.4"]),
)
spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["openai"])
const ctx = createPluginInput([
{
name: "multimodal-looker",
model: { providerID: "openai", modelID: "gpt-5.4" },
},
])
// when
const result = await resolveMultimodalLookerAgentMetadata(ctx)
// then
// The registered model wins, and the hardcoded entry's variant is carried along.
expect(result).toEqual({
agentModel: { providerID: "openai", modelID: "gpt-5.4" },
agentVariant: "medium",
})
})
test("prefers connected vision-capable provider models before the hardcoded fallback chain", async () => {
// given
setVisionCapableModelsCache(new Map([
@@ -97,6 +126,12 @@ describe("resolveMultimodalLookerAgentMetadata", () => {
test("falls back to the hardcoded multimodal chain when no dynamic vision model exists", async () => {
// given
setVisionCapableModelsCache(new Map([
[
"google/gemini-3-flash",
{ providerID: "google", modelID: "gemini-3-flash" },
],
]))
spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(
new Set(["google/gemini-3-flash"]),
)
@@ -112,4 +147,24 @@ describe("resolveMultimodalLookerAgentMetadata", () => {
agentVariant: undefined,
})
})
// Regression test: with an EMPTY vision-capable cache (never seeded here), a
// registered model must be rejected rather than handed to the looker agent —
// a non-vision model cannot service look_at requests.
test("does not return a registered model when no vision-capable model is available", async () => {
// given
// Note: setVisionCapableModelsCache is intentionally NOT called, so the
// cache holds no vision-capable entries.
spyOn(modelAvailability, "fetchAvailableModels").mockResolvedValue(
new Set(["openai/gpt-5.4"]),
)
spyOn(connectedProvidersCache, "readConnectedProvidersCache").mockReturnValue(["openai"])
const ctx = createPluginInput([
{
name: "multimodal-looker",
model: { providerID: "openai", modelID: "gpt-5.4" },
},
])
// when
const result = await resolveMultimodalLookerAgentMetadata(ctx)
// then
// Empty metadata signals "no usable model" to the caller.
expect(result).toEqual({})
})
})

View File

@@ -28,6 +28,19 @@ function getFullModelKey(model: AgentModel): string {
return `${model.providerID}/${model.modelID}`
}
/**
 * Type guard: true when `agentModel` is defined and its provider/model key
 * matches one of the known vision-capable models.
 *
 * @param agentModel - candidate model (possibly undefined when resolution failed)
 * @param visionCapableModels - models known to accept image input
 * @returns narrowed truthy result usable as `agentModel is AgentModel`
 */
function isVisionCapableAgentModel(
agentModel: AgentModel | undefined,
visionCapableModels: Array<AgentModel>,
): agentModel is AgentModel {
if (agentModel == null) {
return false
}
// Hoist the candidate's key so it is computed once, not per comparison.
const candidateKey = getFullModelKey(agentModel)
for (const visionModel of visionCapableModels) {
if (getFullModelKey(visionModel) === candidateKey) {
return true
}
}
return false
}
function parseAgentModel(model: string): AgentModel | undefined {
const [providerID, ...modelIDParts] = model.split("/")
const modelID = modelIDParts.join("/")
@@ -90,6 +103,10 @@ async function resolveDynamicAgentMetadata(
})
const agentModel = resolution ? parseAgentModel(resolution.model) : undefined
if (!isVisionCapableAgentModel(agentModel, visionCapableModels)) {
return {}
}
return {
agentModel,
agentVariant: resolution?.variant,
@@ -113,22 +130,32 @@ export async function resolveMultimodalLookerAgentMetadata(
try {
const registeredMetadata = await resolveRegisteredAgentMetadata(ctx)
const visionCapableModels = readVisionCapableModelsCache()
if (registeredMetadata.agentModel && visionCapableModels.length === 0) {
return registeredMetadata
}
const registeredModelIsVisionCapable = isVisionCapableAgentModel(
registeredMetadata.agentModel,
visionCapableModels,
)
const dynamicMetadata = await resolveDynamicAgentMetadata(ctx, visionCapableModels)
if (isConfiguredVisionModel(registeredMetadata.agentModel, dynamicMetadata.agentModel)) {
return registeredMetadata
if (
registeredModelIsVisionCapable &&
isConfiguredVisionModel(registeredMetadata.agentModel, dynamicMetadata.agentModel)
) {
return {
agentModel: registeredMetadata.agentModel,
agentVariant: registeredMetadata.agentVariant ?? dynamicMetadata.agentVariant,
}
}
if (dynamicMetadata.agentModel) {
return dynamicMetadata
}
return registeredMetadata
if (registeredModelIsVisionCapable) {
return registeredMetadata
}
return {}
} catch (error) {
log("[look_at] Failed to resolve multimodal-looker model info", error)
return {}

View File

@@ -1,3 +1,5 @@
import { describe, expect, it } from "bun:test"
describe("buildMultimodalLookerFallbackChain", () => {
it("builds fallback chain from vision-capable models", async () => {
// given
@@ -28,4 +30,20 @@ describe("buildMultimodalLookerFallbackChain", () => {
expect(result[0].model).toBe("gpt-5.4")
expect(result[0].providers).toContain("openai")
})
// Regression test: a fallback-chain entry derived from the vision-capable
// cache must inherit the `variant` declared on the matching hardcoded entry
// (here "medium" for openai/gpt-5.4) instead of omitting it.
it("preserves hardcoded variant metadata for cache-derived entries", async () => {
// given
// Dynamic import keeps module-level caches fresh per test.
const { buildMultimodalLookerFallbackChain } = await import("./multimodal-fallback-chain")
const visionCapableModels = [{ providerID: "openai", modelID: "gpt-5.4" }]
// when
const result = buildMultimodalLookerFallbackChain(visionCapableModels)
// then
// The cache-derived entry carries the hardcoded entry's variant.
expect(result[0]).toEqual({
providers: ["openai"],
model: "gpt-5.4",
variant: "medium",
})
})
})

View File

@@ -8,6 +8,15 @@ function getFullModelKey(providerID: string, modelID: string): string {
return `${providerID}/${modelID}`
}
/**
 * Looks up the hardcoded fallback-chain entry for a provider/model pair.
 *
 * @param providerID - provider portion of the model key
 * @param modelID - model portion of the model key
 * @returns the matching entry, or undefined when the pair is not hardcoded
 */
function findHardcodedFallbackEntry(
providerID: string,
modelID: string,
): FallbackEntry | undefined {
for (const entry of MULTIMODAL_LOOKER_REQUIREMENT.fallbackChain) {
// An entry matches when the model ID is equal AND the provider is listed.
if (entry.model === modelID && entry.providers.includes(providerID)) {
return entry
}
}
return undefined
}
export function isHardcodedMultimodalFallbackModel(model: VisionCapableModel): boolean {
return MULTIMODAL_LOOKER_REQUIREMENT.fallbackChain.some((entry) =>
entry.providers.some((providerID) =>
@@ -26,10 +35,16 @@ export function buildMultimodalLookerFallbackChain(
const key = getFullModelKey(visionCapableModel.providerID, visionCapableModel.modelID)
if (seen.has(key)) continue
const hardcodedEntry = findHardcodedFallbackEntry(
visionCapableModel.providerID,
visionCapableModel.modelID,
)
seen.add(key)
fallbackChain.push({
providers: [visionCapableModel.providerID],
model: visionCapableModel.modelID,
...(hardcodedEntry?.variant ? { variant: hardcodedEntry.variant } : {}),
})
}
@@ -41,7 +56,9 @@ export function buildMultimodalLookerFallbackChain(
continue
}
providerModelKeys.forEach((key) => seen.add(key))
providerModelKeys.forEach((key) => {
seen.add(key)
})
fallbackChain.push(entry)
}

View File

@@ -4,6 +4,7 @@ import { tool, type PluginInput, type ToolDefinition } from "@opencode-ai/plugin
import { LOOK_AT_DESCRIPTION, MULTIMODAL_LOOKER_AGENT } from "./constants"
import type { LookAtArgs } from "./types"
import { log, promptSyncWithModelSuggestionRetry } from "../../shared"
import { readVisionCapableModelsCache } from "../../shared/vision-capable-models-cache"
import { extractLatestAssistantText } from "./assistant-message-extractor"
import type { LookAtArgsWithAlias } from "./look-at-arguments"
import { normalizeArgs, validateArgs } from "./look-at-arguments"
@@ -38,6 +39,16 @@ function getTemporaryConversionPath(error: unknown): string | null {
return null
}
/**
 * Returns true when the resolved model's provider/model pair is present in
 * the vision-capable models cache.
 *
 * @param model - resolved provider/model pair to check
 */
function isVisionCapableResolvedModel(model: {
providerID: string
modelID: string
}): boolean {
const cachedVisionModels = readVisionCapableModelsCache()
return cachedVisionModels.some(
(cached) =>
cached.providerID === model.providerID && cached.modelID === model.modelID,
)
}
export { normalizeArgs, validateArgs } from "./look-at-arguments"
export function createLookAt(ctx: PluginInput): ToolDefinition {
@@ -136,6 +147,14 @@ Provide ONLY the extracted information that matches the goal.
Be thorough on what was requested, concise on everything else.
If the requested information is not found, clearly state what is missing.`
const { agentModel, agentVariant } = await resolveMultimodalLookerAgentMetadata(ctx)
if (!agentModel || !isVisionCapableResolvedModel(agentModel)) {
log("[look_at] No vision-capable multimodal-looker model resolved", {
resolvedModel: agentModel,
})
return "Error: No vision-capable multimodal-looker model available"
}
log(`[look_at] Creating session with parent: ${toolContext.sessionID}`)
const parentSession = await ctx.client.session.get({
path: { id: toolContext.sessionID },
@@ -169,8 +188,6 @@ Original error: ${createResult.error}`
const sessionID = createResult.data.id
log(`[look_at] Created session: ${sessionID}`)
const { agentModel, agentVariant } = await resolveMultimodalLookerAgentMetadata(ctx)
log(`[look_at] Sending prompt with ${isBase64Input ? "base64 image" : "file"} to session ${sessionID}`)
try {
await promptSyncWithModelSuggestionRetry(ctx.client, {
@@ -187,7 +204,7 @@ Original error: ${createResult.error}`
{ type: "text", text: prompt },
filePart,
],
...(agentModel ? { model: { providerID: agentModel.providerID, modelID: agentModel.modelID } } : {}),
model: { providerID: agentModel.providerID, modelID: agentModel.modelID },
...(agentVariant ? { variant: agentVariant } : {}),
},
})