feat(skill-mcp): add auto-reconnect retry on "Not connected" errors

- Added withOperationRetry<T>() helper method that retries operations up to 3 times
- Catches "Not connected" errors (case-insensitive)
- Cleans up stale client before retry
- Modified callTool, readResource, getPrompt to use retry logic
- Added tests for retry behavior (3 new test cases)

Co-authored-by: Sisyphus <clio-agent@sisyphuslabs.ai>
This commit is contained in:
justsisyphus
2026-01-17 18:14:48 +09:00
parent dec35d28a7
commit c433e7397e
2 changed files with 161 additions and 9 deletions

View File

@@ -502,4 +502,110 @@ describe("SkillMcpManager", () => {
)
})
})
describe("operation retry logic", () => {
it("should retry operation when 'Not connected' error occurs", async () => {
// #given
const info: SkillMcpClientInfo = {
serverName: "retry-server",
skillName: "retry-skill",
sessionID: "session-retry-1",
}
const context: SkillMcpServerContext = {
config: {
url: "https://example.com/mcp",
},
skillName: "retry-skill",
}
// Mock client that fails first time with "Not connected", then succeeds
let callCount = 0
const mockClient = {
callTool: mock(async () => {
callCount++
if (callCount === 1) {
throw new Error("Not connected")
}
return { content: [{ type: "text", text: "success" }] }
}),
close: mock(() => Promise.resolve()),
}
// Spy on getOrCreateClientWithRetry to inject mock client
const getOrCreateSpy = spyOn(manager as any, "getOrCreateClientWithRetry")
getOrCreateSpy.mockResolvedValue(mockClient)
// #when
const result = await manager.callTool(info, context, "test-tool", {})
// #then
expect(callCount).toBe(2) // First call fails, second succeeds
expect(result).toEqual([{ type: "text", text: "success" }])
expect(getOrCreateSpy).toHaveBeenCalledTimes(2) // Called twice due to retry
})
it("should fail after 3 retry attempts", async () => {
// #given
const info: SkillMcpClientInfo = {
serverName: "fail-server",
skillName: "fail-skill",
sessionID: "session-fail-1",
}
const context: SkillMcpServerContext = {
config: {
url: "https://example.com/mcp",
},
skillName: "fail-skill",
}
// Mock client that always fails with "Not connected"
const mockClient = {
callTool: mock(async () => {
throw new Error("Not connected")
}),
close: mock(() => Promise.resolve()),
}
const getOrCreateSpy = spyOn(manager as any, "getOrCreateClientWithRetry")
getOrCreateSpy.mockResolvedValue(mockClient)
// #when / #then
await expect(manager.callTool(info, context, "test-tool", {})).rejects.toThrow(
/Failed after 3 reconnection attempts/
)
expect(getOrCreateSpy).toHaveBeenCalledTimes(3) // Initial + 2 retries
})
it("should not retry on non-connection errors", async () => {
// #given
const info: SkillMcpClientInfo = {
serverName: "error-server",
skillName: "error-skill",
sessionID: "session-error-1",
}
const context: SkillMcpServerContext = {
config: {
url: "https://example.com/mcp",
},
skillName: "error-skill",
}
// Mock client that fails with non-connection error
const mockClient = {
callTool: mock(async () => {
throw new Error("Tool not found")
}),
close: mock(() => Promise.resolve()),
}
const getOrCreateSpy = spyOn(manager as any, "getOrCreateClientWithRetry")
getOrCreateSpy.mockResolvedValue(mockClient)
// #when / #then
await expect(manager.callTool(info, context, "test-tool", {})).rejects.toThrow(
"Tool not found"
)
expect(getOrCreateSpy).toHaveBeenCalledTimes(1) // No retry
})
})
})

View File

@@ -415,9 +415,10 @@ export class SkillMcpManager {
name: string,
args: Record<string, unknown>
): Promise<unknown> {
const client = await this.getOrCreateClientWithRetry(info, context.config)
const result = await client.callTool({ name, arguments: args })
return result.content
return this.withOperationRetry(info, context.config, async (client) => {
const result = await client.callTool({ name, arguments: args })
return result.content
})
}
async readResource(
@@ -425,9 +426,10 @@ export class SkillMcpManager {
context: SkillMcpServerContext,
uri: string
): Promise<unknown> {
const client = await this.getOrCreateClientWithRetry(info, context.config)
const result = await client.readResource({ uri })
return result.contents
return this.withOperationRetry(info, context.config, async (client) => {
const result = await client.readResource({ uri })
return result.contents
})
}
async getPrompt(
@@ -436,9 +438,53 @@ export class SkillMcpManager {
name: string,
args: Record<string, string>
): Promise<unknown> {
const client = await this.getOrCreateClientWithRetry(info, context.config)
const result = await client.getPrompt({ name, arguments: args })
return result.messages
return this.withOperationRetry(info, context.config, async (client) => {
const result = await client.getPrompt({ name, arguments: args })
return result.messages
})
}
private async withOperationRetry<T>(
info: SkillMcpClientInfo,
config: ClaudeCodeMcpServer,
operation: (client: Client) => Promise<T>
): Promise<T> {
const maxRetries = 3
let lastError: Error | null = null
for (let attempt = 1; attempt <= maxRetries; attempt++) {
try {
const client = await this.getOrCreateClientWithRetry(info, config)
return await operation(client)
} catch (error) {
lastError = error instanceof Error ? error : new Error(String(error))
const errorMessage = lastError.message.toLowerCase()
if (!errorMessage.includes("not connected")) {
throw lastError
}
if (attempt === maxRetries) {
throw new Error(
`Failed after ${maxRetries} reconnection attempts: ${lastError.message}`
)
}
const key = this.getClientKey(info)
const existing = this.clients.get(key)
if (existing) {
this.clients.delete(key)
try {
await existing.client.close()
} catch { /* process may already be terminated */ }
try {
await existing.transport.close()
} catch { /* transport may already be terminated */ }
}
}
}
throw lastError || new Error("Operation failed with unknown error")
}
private async getOrCreateClientWithRetry(