fix(runtime-fallback): harden fallback progression and success detection

This commit is contained in:
Youngbin Kim
2026-02-12 13:10:47 -05:00
committed by YeonGyu-Kim
parent 5a406cab9e
commit ff230df47c
2 changed files with 1426 additions and 6 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -65,9 +65,91 @@ function extractStatusCode(error: unknown): number | undefined {
return undefined
}
function extractErrorName(error: unknown): string | undefined {
if (!error || typeof error !== "object") return undefined
const errorObj = error as Record<string, unknown>
const directName = errorObj.name
if (typeof directName === "string" && directName.length > 0) {
return directName
}
const nestedError = errorObj.error as Record<string, unknown> | undefined
const nestedName = nestedError?.name
if (typeof nestedName === "string" && nestedName.length > 0) {
return nestedName
}
const dataError = (errorObj.data as Record<string, unknown> | undefined)?.error as Record<string, unknown> | undefined
const dataErrorName = dataError?.name
if (typeof dataErrorName === "string" && dataErrorName.length > 0) {
return dataErrorName
}
return undefined
}
function classifyErrorType(error: unknown): string | undefined {
const message = getErrorMessage(error)
const errorName = extractErrorName(error)?.toLowerCase()
if (
errorName?.includes("loadapi") ||
(/api.?key.?is.?missing/i.test(message) && /environment variable/i.test(message))
) {
return "missing_api_key"
}
if (/api.?key/i.test(message) && /must be a string/i.test(message)) {
return "invalid_api_key"
}
if (errorName?.includes("unknownerror") && /model\s+not\s+found/i.test(message)) {
return "model_not_found"
}
return undefined
}
function extractCopilotAutoRetrySignal(info: Record<string, unknown> | undefined): string | undefined {
if (!info) return undefined
const candidates: string[] = []
const directStatus = info.status
if (typeof directStatus === "string") candidates.push(directStatus)
const summary = info.summary
if (typeof summary === "string") candidates.push(summary)
const message = info.message
if (typeof message === "string") candidates.push(message)
const details = info.details
if (typeof details === "string") candidates.push(details)
const combined = candidates.join("\n")
if (!combined) return undefined
if (/too.?many.?requests/i.test(combined) && /quota.?exceeded/i.test(combined) && /retrying\s+in/i.test(combined)) {
return combined
}
return undefined
}
function isRetryableError(error: unknown, retryOnErrors: number[]): boolean {
const statusCode = extractStatusCode(error)
const message = getErrorMessage(error)
const errorType = classifyErrorType(error)
if (errorType === "missing_api_key") {
return true
}
if (errorType === "model_not_found") {
return true
}
if (statusCode && retryOnErrors.includes(statusCode)) {
return true
@@ -265,14 +347,78 @@ export function createRuntimeFallbackHook(
retry_on_errors: options?.config?.retry_on_errors ?? DEFAULT_CONFIG.retry_on_errors,
max_fallback_attempts: options?.config?.max_fallback_attempts ?? DEFAULT_CONFIG.max_fallback_attempts,
cooldown_seconds: options?.config?.cooldown_seconds ?? DEFAULT_CONFIG.cooldown_seconds,
timeout_seconds: options?.config?.timeout_seconds ?? DEFAULT_CONFIG.timeout_seconds,
notify_on_fallback: options?.config?.notify_on_fallback ?? DEFAULT_CONFIG.notify_on_fallback,
}
const sessionStates = new Map<string, FallbackState>()
const sessionLastAccess = new Map<string, number>()
const sessionRetryInFlight = new Set<string>()
const sessionAwaitingFallbackResult = new Set<string>()
const sessionFallbackTimeouts = new Map<string, ReturnType<typeof setTimeout>>()
const SESSION_TTL_MS = 30 * 60 * 1000 // 30 minutes TTL for stale sessions
const abortSessionRequest = async (sessionID: string, source: string): Promise<void> => {
try {
await ctx.client.session.abort({ path: { id: sessionID } })
log(`[${HOOK_NAME}] Aborted in-flight session request (${source})`, { sessionID })
} catch (error) {
log(`[${HOOK_NAME}] Failed to abort in-flight session request (${source})`, {
sessionID,
error: String(error),
})
}
}
const clearSessionFallbackTimeout = (sessionID: string) => {
const timer = sessionFallbackTimeouts.get(sessionID)
if (timer) {
clearTimeout(timer)
sessionFallbackTimeouts.delete(sessionID)
}
}
const scheduleSessionFallbackTimeout = (sessionID: string, resolvedAgent?: string) => {
clearSessionFallbackTimeout(sessionID)
const timeoutMs = options?.session_timeout_ms ?? config.timeout_seconds * 1000
if (timeoutMs <= 0) return
const timer = setTimeout(async () => {
sessionFallbackTimeouts.delete(sessionID)
const state = sessionStates.get(sessionID)
if (!state) return
if (sessionRetryInFlight.has(sessionID)) {
log(`[${HOOK_NAME}] Overriding in-flight retry due to session timeout`, { sessionID })
}
await abortSessionRequest(sessionID, "session.timeout")
sessionRetryInFlight.delete(sessionID)
if (state.pendingFallbackModel) {
state.pendingFallbackModel = undefined
}
const fallbackModels = getFallbackModelsForSession(sessionID, resolvedAgent, pluginConfig)
if (fallbackModels.length === 0) return
log(`[${HOOK_NAME}] Session fallback timeout reached`, {
sessionID,
timeoutSeconds: config.timeout_seconds,
currentModel: state.currentModel,
})
const result = prepareFallback(sessionID, state, fallbackModels, config)
if (result.success && result.newModel) {
await autoRetryWithFallback(sessionID, result.newModel, resolvedAgent, "session.timeout")
}
}, timeoutMs)
sessionFallbackTimeouts.set(sessionID, timer)
}
// Periodic cleanup of stale session states to prevent memory leaks
const cleanupStaleSessions = () => {
const now = Date.now()
@@ -282,6 +428,8 @@ export function createRuntimeFallbackHook(
sessionStates.delete(sessionID)
sessionLastAccess.delete(sessionID)
sessionRetryInFlight.delete(sessionID)
sessionAwaitingFallbackResult.delete(sessionID)
clearSessionFallbackTimeout(sessionID)
SessionCategoryRegistry.remove(sessionID)
cleanedCount++
}
@@ -354,6 +502,9 @@ export function createRuntimeFallbackHook(
if (retryParts.length > 0) {
const retryAgent = resolvedAgent ?? getSessionAgent(sessionID)
sessionAwaitingFallbackResult.add(sessionID)
scheduleSessionFallbackTimeout(sessionID, retryAgent)
await ctx.client.session.promptAsync({
path: { id: sessionID },
body: {
@@ -370,6 +521,10 @@ export function createRuntimeFallbackHook(
} catch (retryError) {
log(`[${HOOK_NAME}] Auto-retry failed (${source})`, { sessionID, error: String(retryError) })
} finally {
const state = sessionStates.get(sessionID)
if (state?.pendingFallbackModel === newModel) {
state.pendingFallbackModel = undefined
}
sessionRetryInFlight.delete(sessionID)
}
}
@@ -404,6 +559,47 @@ export function createRuntimeFallbackHook(
return undefined
}
const hasVisibleAssistantResponse = async (
sessionID: string,
_info: Record<string, unknown> | undefined,
): Promise<boolean> => {
try {
const messagesResp = await ctx.client.session.messages({
path: { id: sessionID },
query: { directory: ctx.directory },
})
const msgs = (messagesResp as {
data?: Array<{
info?: Record<string, unknown>
parts?: Array<{ type?: string; text?: string }>
}>
}).data
if (!msgs || msgs.length === 0) return false
const lastAssistant = [...msgs].reverse().find((m) => m.info?.role === "assistant")
if (!lastAssistant) return false
if (lastAssistant.info?.error) return false
const parts = lastAssistant.parts ??
(lastAssistant.info?.parts as Array<{ type?: string; text?: string }> | undefined)
const textFromParts = (parts ?? [])
.filter((p) => p.type === "text" && typeof p.text === "string")
.map((p) => p.text!.trim())
.filter((text) => text.length > 0)
.join("\n")
if (!textFromParts) return false
if (extractCopilotAutoRetrySignal({ message: textFromParts })) return false
return true
} catch {
return false
}
}
const eventHandler = async ({ event }: { event: { type: string; properties?: unknown } }) => {
if (!config.enabled) return
@@ -431,11 +627,59 @@ export function createRuntimeFallbackHook(
sessionStates.delete(sessionID)
sessionLastAccess.delete(sessionID)
sessionRetryInFlight.delete(sessionID)
sessionAwaitingFallbackResult.delete(sessionID)
clearSessionFallbackTimeout(sessionID)
SessionCategoryRegistry.remove(sessionID)
}
return
}
if (event.type === "session.stop") {
const sessionID = props?.sessionID as string | undefined
if (!sessionID) return
clearSessionFallbackTimeout(sessionID)
if (sessionRetryInFlight.has(sessionID)) {
await abortSessionRequest(sessionID, "session.stop")
}
sessionRetryInFlight.delete(sessionID)
sessionAwaitingFallbackResult.delete(sessionID)
const state = sessionStates.get(sessionID)
if (state?.pendingFallbackModel) {
state.pendingFallbackModel = undefined
}
log(`[${HOOK_NAME}] Cleared fallback retry state on session.stop`, { sessionID })
return
}
if (event.type === "session.idle") {
const sessionID = props?.sessionID as string | undefined
if (!sessionID) return
if (sessionAwaitingFallbackResult.has(sessionID)) {
log(`[${HOOK_NAME}] session.idle while awaiting fallback result; keeping timeout armed`, { sessionID })
return
}
const hadTimeout = sessionFallbackTimeouts.has(sessionID)
clearSessionFallbackTimeout(sessionID)
sessionRetryInFlight.delete(sessionID)
const state = sessionStates.get(sessionID)
if (state?.pendingFallbackModel) {
state.pendingFallbackModel = undefined
}
if (hadTimeout) {
log(`[${HOOK_NAME}] Cleared fallback timeout after session completion`, { sessionID })
}
return
}
if (event.type === "session.error") {
const sessionID = props?.sessionID as string | undefined
const error = props?.error
@@ -447,16 +691,27 @@ export function createRuntimeFallbackHook(
}
const resolvedAgent = await resolveAgentForSessionFromContext(sessionID, agent)
sessionAwaitingFallbackResult.delete(sessionID)
clearSessionFallbackTimeout(sessionID)
log(`[${HOOK_NAME}] session.error received`, {
sessionID,
agent,
resolvedAgent,
statusCode: extractStatusCode(error),
errorName: extractErrorName(error),
errorType: classifyErrorType(error),
})
if (!isRetryableError(error, config.retry_on_errors)) {
log(`[${HOOK_NAME}] Error not retryable, skipping fallback`, { sessionID })
log(`[${HOOK_NAME}] Error not retryable, skipping fallback`, {
sessionID,
retryable: false,
statusCode: extractStatusCode(error),
errorName: extractErrorName(error),
errorType: classifyErrorType(error),
})
return
}
@@ -524,14 +779,74 @@ export function createRuntimeFallbackHook(
if (event.type === "message.updated") {
const info = props?.info as Record<string, unknown> | undefined
const sessionID = info?.sessionID as string | undefined
const error = info?.error
const retrySignal = extractCopilotAutoRetrySignal(info)
const error = info?.error ?? (retrySignal ? { name: "ProviderRateLimitError", message: retrySignal } : undefined)
const role = info?.role as string | undefined
const model = info?.model as string | undefined
if (sessionID && role === "assistant" && error && model) {
log(`[${HOOK_NAME}] message.updated with assistant error`, { sessionID, model })
if (sessionID && role === "assistant" && !error) {
if (!sessionAwaitingFallbackResult.has(sessionID)) {
return
}
const hasVisibleResponse = await hasVisibleAssistantResponse(sessionID, info)
if (!hasVisibleResponse) {
log(`[${HOOK_NAME}] Assistant update observed without visible final response; keeping fallback timeout`, {
sessionID,
model,
})
return
}
sessionAwaitingFallbackResult.delete(sessionID)
clearSessionFallbackTimeout(sessionID)
const state = sessionStates.get(sessionID)
if (state?.pendingFallbackModel) {
state.pendingFallbackModel = undefined
}
log(`[${HOOK_NAME}] Assistant response observed; cleared fallback timeout`, { sessionID, model })
return
}
if (sessionID && role === "assistant" && error) {
sessionAwaitingFallbackResult.delete(sessionID)
if (sessionRetryInFlight.has(sessionID) && !retrySignal) {
log(`[${HOOK_NAME}] message.updated fallback skipped (retry in flight)`, { sessionID })
return
}
if (retrySignal && sessionRetryInFlight.has(sessionID)) {
log(`[${HOOK_NAME}] Overriding in-flight retry due to Copilot auto-retry signal`, {
sessionID,
model,
})
await abortSessionRequest(sessionID, "message.updated.retry-signal")
sessionRetryInFlight.delete(sessionID)
}
if (retrySignal) {
log(`[${HOOK_NAME}] Detected Copilot auto-retry signal`, { sessionID, model })
}
if (!retrySignal) {
clearSessionFallbackTimeout(sessionID)
}
log(`[${HOOK_NAME}] message.updated with assistant error`, {
sessionID,
model,
statusCode: extractStatusCode(error),
errorName: extractErrorName(error),
errorType: classifyErrorType(error),
})
if (!isRetryableError(error, config.retry_on_errors)) {
log(`[${HOOK_NAME}] message.updated error not retryable, skipping fallback`, {
sessionID,
statusCode: extractStatusCode(error),
errorName: extractErrorName(error),
errorType: classifyErrorType(error),
})
return
}
@@ -545,11 +860,53 @@ export function createRuntimeFallbackHook(
}
if (!state) {
state = createFallbackState(model)
let initialModel = model
if (!initialModel) {
const detectedAgent = resolvedAgent
const agentConfig = detectedAgent
? pluginConfig?.agents?.[detectedAgent as keyof typeof pluginConfig.agents]
: undefined
const agentModel = agentConfig?.model as string | undefined
if (agentModel) {
log(`[${HOOK_NAME}] Derived model from agent config for message.updated`, {
sessionID,
agent: detectedAgent,
model: agentModel,
})
initialModel = agentModel
}
}
if (!initialModel) {
log(`[${HOOK_NAME}] message.updated missing model info, cannot fallback`, {
sessionID,
errorName: extractErrorName(error),
errorType: classifyErrorType(error),
})
return
}
state = createFallbackState(initialModel)
sessionStates.set(sessionID, state)
sessionLastAccess.set(sessionID, Date.now())
} else {
sessionLastAccess.set(sessionID, Date.now())
if (state.pendingFallbackModel) {
if (retrySignal) {
log(`[${HOOK_NAME}] Clearing pending fallback due to Copilot auto-retry signal`, {
sessionID,
pendingFallbackModel: state.pendingFallbackModel,
})
state.pendingFallbackModel = undefined
} else {
log(`[${HOOK_NAME}] message.updated fallback skipped (pending fallback in progress)`, {
sessionID,
pendingFallbackModel: state.pendingFallbackModel,
})
return
}
}
}
const result = prepareFallback(sessionID, state, fallbackModels, config)
@@ -591,6 +948,12 @@ export function createRuntimeFallbackHook(
: undefined
if (requestedModel && requestedModel !== state.currentModel) {
if (state.pendingFallbackModel && state.pendingFallbackModel === requestedModel) {
state.pendingFallbackModel = undefined
sessionLastAccess.set(sessionID, Date.now())
return
}
log(`[${HOOK_NAME}] Detected manual model change, resetting fallback state`, {
sessionID,
from: state.currentModel,