mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-17 13:55:50 +00:00
feat: 整合功能恢复与技能学习闭环(含 ECC v2.1 parity + Opus 4.7 接入 + prompt 工程优化)
主要变更: - Skill Learning 闭环系统 (9/9 AC) - Opus 4.7 模型层接入 + adaptive thinking - Prompt 工程优化 (64 审计测试) - Agent Teams 简化门控 (默认启用) - Windows Terminal 后端修复 (EncodedCommand/WT_SESSION) - TF-IDF 技能搜索精准化 (字段加权/CJK 优化) - Autonomy 系统 (/autonomy 命令) - ACP 协议完整实现 - mock.module 泄漏修复 (CI 全绿) - 152+ lint/type 修复
This commit is contained in:
406
scripts/verify-skill-learning-e2e.ts
Normal file
406
scripts/verify-skill-learning-e2e.ts
Normal file
@@ -0,0 +1,406 @@
|
||||
/**
 * End-to-end verification probe for the skill-learning pipeline.
 *
 * Exercises the real public API (not mocks, not unit test harness) so we
 * can confirm each pipeline stage actually produces the expected on-disk
 * artefacts under a clean CLAUDE_SKILL_LEARNING_HOME.
 *
 * Run with:
 *   bun run scripts/verify-skill-learning-e2e.ts
 *
 * Sections:
 *   1. Fake transcript -> ingest -> observations on disk
 *   2. Heuristic observer -> instinct candidates -> persisted instincts
 *   3. Evolution -> skill / command / agent candidates
 *   4. Write learned skill -> verify skill file exists
 *   5. Cross-project promotion -> global instinct written
 *   6. Observer backend env switch probe
 *   7. Gap state machine walk-through
 *   8. Tool event observer wrapper invocation
 */
|
||||
|
||||
import { mkdtempSync, writeFileSync, existsSync, rmSync, readdirSync } from 'node:fs'
|
||||
import { readFile } from 'node:fs/promises'
|
||||
import { tmpdir } from 'node:os'
|
||||
import { join } from 'node:path'
|
||||
import { execSync } from 'node:child_process'
|
||||
|
||||
/** Outcome of a single probe step, collected for the final summary. */
type Result = { step: string; ok: boolean; detail: string }

/** Every step outcome recorded so far, in the order the steps ran. */
const results: Result[] = []

/**
 * Records the outcome of one probe step and echoes it to stdout.
 *
 * @param step   Short human-readable name of the step being verified.
 * @param ok     Whether the step met its expectation.
 * @param detail Free-form context printed next to the PASS/FAIL tag.
 */
function record(step: string, ok: boolean, detail: string): void {
  results.push({ step, ok, detail })
  const tag = ok ? 'PASS' : 'FAIL'
  console.log(`[${tag}] ${step} — ${detail}`)
}
|
||||
|
||||
/**
 * Runs the skill-learning probe end to end and exits the process.
 *
 * Creates throw-away storage and two git-initialised project directories,
 * strips environment variables that could route work outside the probe,
 * stubs `fetch` so any network attempt is counted and rejected, then walks
 * through the eight sections listed in the file header, recording a
 * PASS/FAIL line for each check via `record`.
 *
 * All setup happens inside the `try` so that a setup failure (for example a
 * failing `git` invocation) is reported as a failed step and the temp
 * directories created so far are still removed.
 *
 * Exits with status 1 when at least one recorded step failed, 0 otherwise.
 */
async function main(): Promise<void> {
  // Every temp directory is registered the moment it is created so the
  // `finally` block can remove it even when a later setup step throws.
  const tempDirs: string[] = []
  const makeTempDir = (prefix: string): string => {
    const dir = mkdtempSync(join(tmpdir(), prefix))
    tempDirs.push(dir)
    return dir
  }

  const originalFetch = globalThis.fetch
  let networkCalls = 0

  try {
    const storage = makeTempDir('skill-learning-e2e-')
    const projectA = makeTempDir('project-a-')
    const projectB = makeTempDir('project-b-')

    // Real git repos so resolveProjectContext derives distinct project IDs
    // (the default `global` fallback for non-git dirs would make A and B
    // share the same storage and defeat the cross-project probe).
    const initGitRepo = (dir: string, remoteUrl: string): void => {
      execSync(`git init -q "${dir}"`, { stdio: 'ignore' })
      execSync(`git -C "${dir}" remote add origin ${remoteUrl}`, { stdio: 'ignore' })
    }
    initGitRepo(projectA, 'https://example.test/project-a.git')
    initGitRepo(projectB, 'https://example.test/project-b.git')

    // === ECC / plugin isolation ===
    // The probe must exercise only the project's own skill-learning code, not
    // the user-level ECC plugin, auto-loaded ECC skill, or any external LLM.
    // Strip every env that could route observations or observer calls outside
    // this probe's temp storage.
    for (const key of [
      'ANTHROPIC_API_KEY',
      'ANTHROPIC_AUTH_TOKEN',
      'OPENAI_API_KEY',
      'GEMINI_API_KEY',
      'GROK_API_KEY',
      'CLAUDE_CODE_PLUGINS_DIR',
      'CLAUDE_PLUGINS_DIR',
      'CLAUDE_PLUGIN_MARKETPLACE',
      'ECC_PLUGIN_ROOT',
      'ECC_ENABLED',
    ]) {
      delete process.env[key]
    }
    process.env.CLAUDE_SKILL_LEARNING_HOME = storage
    process.env.SKILL_LEARNING_ENABLED = '1'
    process.env.SKILL_SEARCH_ENABLED = '1'
    // Force heuristic backend — no LLM round-trips allowed in clean-room probe.
    process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'heuristic'
    process.env.CLAUDE_SKILL_LEARNING_DISABLE = ''

    // Instrument global fetch so any stray network call from the skill-learning
    // path (unexpected LLM fallback, plugin webhook, etc.) aborts the probe
    // with a visible error rather than hiding behind a try/catch. The counter
    // is asserted in `finally`, so a swallowed throw still fails the probe.
    globalThis.fetch = ((...args: unknown[]) => {
      networkCalls += 1
      throw new Error(
        `clean-room probe must not make network calls, attempted: ${String(args[0])}`,
      )
    }) as typeof globalThis.fetch

    console.log(`storage=${storage}`)
    console.log(`ecc-isolation: API_KEY env vars cleared, fetch stubbed, observer=heuristic`)

    const skillLearning = await import('../src/services/skillLearning/index.js')
    const projectCtx = await import('../src/services/skillLearning/projectContext.js')

    // ----------------------------------------------------------------------
    // 1. Ingest a synthetic transcript and verify observations land on disk
    // ----------------------------------------------------------------------
    const transcriptPath = join(storage, 'session.jsonl')
    const transcriptLines = [
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '请重构 loader.ts 的错误处理' } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Grep', input: { pattern: 'throw new Error', path: 'src' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Grep', content: 'src/loader.ts:42', is_error: false }] } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Read', input: { file_path: 'src/loader.ts' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Read', content: 'export function load() { ... }', is_error: false }] } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Edit', input: { file_path: 'src/loader.ts', old_string: 'throw new Error', new_string: 'throw new LoaderError' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Edit', content: 'diff', is_error: false }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '不要直接 mock,用 testing-library' } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '必须用 testing-library 不要 mock' } },
    ]
    // Explicit arrow: passing JSON.stringify directly would hand map's index
    // and array arguments to its `replacer` / `space` parameters.
    writeFileSync(transcriptPath, transcriptLines.map(line => JSON.stringify(line)).join('\n'))

    const projectAContext = projectCtx.resolveProjectContext(projectA)
    const observations = await skillLearning.ingestTranscript(transcriptPath, { project: projectAContext })
    record(
      'ingest transcript',
      observations.length > 0,
      `${observations.length} observations written under project ${projectAContext.projectId}`,
    )

    const reread = await skillLearning.readObservations({ project: projectAContext })
    record(
      'observations persist on disk',
      reread.length === observations.length,
      `disk has ${reread.length} observations (expected ${observations.length})`,
    )

    // ----------------------------------------------------------------------
    // 2. Heuristic observer -> instinct candidates -> store
    // ----------------------------------------------------------------------
    skillLearning.setActiveObserverBackend('heuristic')
    const candidates = await skillLearning.analyzeWithActiveBackend(observations, { project: projectAContext })
    record(
      'heuristic backend produces candidates',
      candidates.length > 0,
      `${candidates.length} candidates; first trigger=${candidates[0]?.trigger ?? '?'}`,
    )

    for (const c of candidates) {
      await skillLearning.upsertInstinct(skillLearning.createInstinct(c), { project: projectAContext })
    }
    const persistedInstincts = await skillLearning.loadInstincts({ project: projectAContext })
    record(
      'instincts persisted',
      persistedInstincts.length > 0,
      `${persistedInstincts.length} instincts on disk for project A`,
    )

    // Contradiction probe — push a contradicting instinct to verify conflict-hold
    const first = persistedInstincts[0]
    if (first) {
      const contradictor = skillLearning.createInstinct({
        trigger: first.trigger,
        action: first.action.includes('avoid')
          ? first.action.replace('avoid', 'prefer')
          : first.action.replace(/^/, 'avoid '),
        confidence: 0.5,
        domain: first.domain,
        source: 'session-observation',
        scope: first.scope,
        projectId: projectAContext.projectId,
        projectName: projectAContext.projectName,
        evidence: ['contradiction probe'],
        observationIds: [],
      })
      await skillLearning.upsertInstinct(contradictor, { project: projectAContext })
      const after = await skillLearning.loadInstincts({ project: projectAContext })
      const merged = after.find(i => i.id === first.id) ?? after.find(i => i.trigger === first.trigger)
      record(
        'contradiction lowers confidence',
        !!merged && merged.confidence < first.confidence,
        `before=${first.confidence.toFixed(2)} after=${merged?.confidence.toFixed(2) ?? 'n/a'}`,
      )
    }

    // ----------------------------------------------------------------------
    // 3. Evolution candidates
    //
    // clusterInstincts requires EITHER 2+ instincts in the same
    // (domain, normalized-trigger) bucket OR a single instinct with
    // confidence >= 0.8. Inject a high-confidence skill instinct + a
    // 4-instinct agent cluster + a "command"-flavoured instinct so each
    // of the three evolution paths actually has candidates to emit.
    // ----------------------------------------------------------------------
    const highConfidenceSkill = skillLearning.createInstinct({
      trigger: 'When editing TypeScript error handling',
      action: 'prefer throwing domain-specific Error subclasses',
      confidence: 0.9,
      domain: 'code-style',
      source: 'session-observation',
      scope: 'project',
      projectId: projectAContext.projectId,
      projectName: projectAContext.projectName,
      evidence: ['observed 2x in session'],
      observationIds: [],
    })
    await skillLearning.upsertInstinct(highConfidenceSkill, { project: projectAContext })

    const commandSeed = skillLearning.createInstinct({
      trigger: 'User asks to run the full test suite',
      action: 'run bun test after every multi-file edit',
      confidence: 0.9,
      domain: 'workflow',
      source: 'session-observation',
      scope: 'project',
      projectId: projectAContext.projectId,
      projectName: projectAContext.projectName,
      evidence: ['user explicitly requested bun test'],
      observationIds: [],
    })
    await skillLearning.upsertInstinct(commandSeed, { project: projectAContext })

    for (let i = 0; i < 4; i += 1) {
      const agentSeed = skillLearning.createInstinct({
        trigger: 'When debugging multi-step investigate flow',
        action: `step ${i + 1}: research root cause and verify`,
        confidence: 0.85,
        domain: 'debugging',
        source: 'session-observation',
        scope: 'project',
        projectId: projectAContext.projectId,
        projectName: projectAContext.projectName,
        evidence: [`debug step ${i + 1}`],
        observationIds: [],
      })
      await skillLearning.upsertInstinct(agentSeed, { project: projectAContext })
    }

    const allInstincts = await skillLearning.loadInstincts({ project: projectAContext })
    const skillCandidates = skillLearning.generateSkillCandidates(allInstincts, { cwd: projectA })
    const commandCandidates = skillLearning.generateCommandCandidates(allInstincts, { cwd: projectA })
    const agentCandidates = skillLearning.generateAgentCandidates(allInstincts, { cwd: projectA })
    record(
      'evolution skill path emits candidate (single high-conf instinct)',
      skillCandidates.length >= 1,
      `skillCandidates=${skillCandidates.length}`,
    )
    record(
      'evolution command path emits candidate (trigger matches user-asks heuristic)',
      commandCandidates.length >= 1,
      `commandCandidates=${commandCandidates.length}`,
    )
    record(
      'evolution agent path emits candidate (4+ debugging instincts)',
      agentCandidates.length >= 1,
      `agentCandidates=${agentCandidates.length}`,
    )

    // ----------------------------------------------------------------------
    // 4. Write learned skill + verify file on disk
    // ----------------------------------------------------------------------
    const firstDraft = skillCandidates[0]
    if (firstDraft) {
      const activePath = await skillLearning.writeLearnedSkill(firstDraft)
      // writeLearnedSkill returns the full SKILL.md path (not the directory).
      const exists = existsSync(activePath)
      record(
        'writeLearnedSkill produces SKILL.md',
        exists,
        `path=${activePath} exists=${exists}`,
      )
    } else {
      record('writeLearnedSkill produces SKILL.md', false, 'no skill candidate to write')
    }

    // ----------------------------------------------------------------------
    // 5. Cross-project promotion
    // ----------------------------------------------------------------------
    const projectBContext = projectCtx.resolveProjectContext(projectB)
    // Duplicate one high-confidence instinct into project B so promotion threshold
    // (>= 2 projects, avg conf >= 0.8) is met. We seeded a 0.9-confidence skill
    // instinct above, so this lookup succeeds deterministically.
    const pickable = allInstincts.find(i => i.confidence >= 0.8)
    if (pickable) {
      const projectBCopy = { ...pickable, projectId: projectBContext.projectId, projectName: projectBContext.projectName }
      await skillLearning.saveInstinct(projectBCopy, { project: projectBContext, scope: 'project' })
      // findPromotionCandidates groups by instinct id + distinct projectId
      // count; give it the real merged array seen across both project stores.
      const fromA = await skillLearning.loadInstincts({ project: projectAContext })
      const fromB = await skillLearning.loadInstincts({ project: projectBContext })
      const candidatesPre = skillLearning.findPromotionCandidates([
        ...fromA,
        ...fromB,
      ])
      record(
        'cross-project candidate visible',
        candidatesPre.length > 0,
        `${candidatesPre.length} promotable instincts across projects (A=${fromA.length} B=${fromB.length})`,
      )

      await skillLearning.checkPromotion({ project: projectAContext })
      const globalRoot = { scope: 'global' as const, rootDir: storage }
      const globalInstincts = await skillLearning.loadInstincts(globalRoot)
      const promoted = globalInstincts.some(i => i.id === pickable.id)
      record(
        'checkPromotion writes global instinct',
        promoted,
        `global scope has ${globalInstincts.length} instincts; target id ${pickable.id} present=${promoted}`,
      )
    } else {
      record('cross-project promotion', false, 'no instinct with confidence >= 0.8 to promote')
    }

    // ----------------------------------------------------------------------
    // 6. Observer backend env switch probe
    // ----------------------------------------------------------------------
    const originalBackendEnv = process.env.SKILL_LEARNING_OBSERVER_BACKEND
    try {
      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'llm'
      skillLearning.resolveDefaultObserverBackend()
      const active = skillLearning.getActiveObserverBackend().name
      record('env switch llm activates', active === 'llm', `active backend=${active}`)

      // An unrecognised value is expected to leave the previously active
      // backend ('llm') in place rather than throw.
      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'unknown-typo'
      skillLearning.resolveDefaultObserverBackend()
      const stillActive = skillLearning.getActiveObserverBackend().name
      record('typo env does not crash', stillActive === 'llm', `active after typo=${stillActive}`)

      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'heuristic'
      skillLearning.resolveDefaultObserverBackend()
      const restored = skillLearning.getActiveObserverBackend().name
      record('env switch back to heuristic', restored === 'heuristic', `active=${restored}`)
    } finally {
      // Put the env var back exactly as it was before this section ran.
      if (originalBackendEnv === undefined) delete process.env.SKILL_LEARNING_OBSERVER_BACKEND
      else process.env.SKILL_LEARNING_OBSERVER_BACKEND = originalBackendEnv
    }

    // ----------------------------------------------------------------------
    // 7. Gap state machine walk-through
    // ----------------------------------------------------------------------
    const prompt = 'auto-generate e2e verify script skeleton'
    const firstGap = await skillLearning.recordSkillGap({
      prompt,
      cwd: projectA,
      sessionId: 'e2e-a',
      project: projectAContext,
      rootDir: storage,
    })
    record('first gap is pending (no draft)', firstGap.status === 'pending' && !firstGap.draft, `status=${firstGap.status} draft=${!!firstGap.draft}`)

    const secondGap = await skillLearning.recordSkillGap({
      prompt,
      cwd: projectA,
      sessionId: 'e2e-a',
      project: projectAContext,
      rootDir: storage,
    })
    record('second occurrence promotes to draft', secondGap.status === 'draft' && !!secondGap.draft, `status=${secondGap.status} draftPath=${secondGap.draft?.skillPath ?? 'n/a'}`)

    // ----------------------------------------------------------------------
    // 8. Tool event observer wrapper invocation
    // ----------------------------------------------------------------------
    let wrappedRan = false
    const wrappedResult = await skillLearning.runToolCallWithSkillLearningHooks(
      'VerifyProbeTool',
      { sample: 'input' },
      { sessionId: skillLearning.RUNTIME_SESSION_ID, turn: 1 },
      async () => {
        wrappedRan = true
        return { data: { ok: true, payload: 42 } }
      },
    )
    record(
      'runToolCallWithSkillLearningHooks invokes inner fn',
      wrappedRan && (wrappedResult as { data?: { ok?: boolean } })?.data?.ok === true,
      `inner ran=${wrappedRan} result=${JSON.stringify(wrappedResult)}`,
    )

    // Observations produced by the wrapper are written under the project
    // context derived from process.cwd() (the test runner repo, not our
    // ephemeral projectA). Read from BOTH project scopes to catch either.
    const repoProject = projectCtx.resolveProjectContext(process.cwd())
    const [obsInProjectA, obsInRepo] = await Promise.all([
      skillLearning.readObservations({ project: projectAContext }),
      skillLearning.readObservations({ project: repoProject }),
    ])
    const toolHookRecords = [...obsInProjectA, ...obsInRepo].filter(
      o => o.source === 'tool-hook' && o.toolName === 'VerifyProbeTool',
    )
    record(
      'wrapper writes tool-hook observations',
      toolHookRecords.length > 0,
      `${toolHookRecords.length} tool-hook records on disk (projectA=${obsInProjectA.length} repo=${obsInRepo.length})`,
    )
  } catch (error) {
    record('uncaught exception', false, String(error))
  } finally {
    // Assert clean-room isolation held for the whole probe.
    record(
      'clean-room isolation: zero network calls',
      networkCalls === 0,
      `${networkCalls} network calls attempted`,
    )
    globalThis.fetch = originalFetch
    // Best-effort cleanup: a directory that cannot be removed is reported but
    // must not prevent the summary and exit code below from being produced.
    for (const dir of tempDirs) {
      try {
        rmSync(dir, { recursive: true, force: true, maxRetries: 5, retryDelay: 100 })
      } catch (cleanupError) {
        console.warn(`cleanup: could not remove ${dir}: ${String(cleanupError)}`)
      }
    }
  }

  const passed = results.filter(r => r.ok).length
  const failed = results.filter(r => !r.ok).length
  console.log(`\n=== SUMMARY ===\n${passed} pass, ${failed} fail, ${results.length} total`)
  process.exit(failed > 0 ? 1 : 0)
}
|
||||
|
||||
void main()
|
||||
Reference in New Issue
Block a user