Files
claude-code/scripts/verify-skill-learning-e2e.ts
unraid 95fece4b51 feat: 整合功能恢复与技能学习闭环(含 ECC v2.1 parity + Opus 4.7 接入 + prompt 工程优化)
主要变更:
- Skill Learning 闭环系统 (9/9 AC)
- Opus 4.7 模型层接入 + adaptive thinking
- Prompt 工程优化 (64 审计测试)
- Agent Teams 简化门控 (默认启用)
- Windows Terminal 后端修复 (EncodedCommand/WT_SESSION)
- TF-IDF 技能搜索精准化 (字段加权/CJK 优化)
- Autonomy 系统 (/autonomy 命令)
- ACP 协议完整实现
- mock.module 泄漏修复 (CI 全绿)
- 152+ lint/type 修复
2026-04-22 16:07:42 +08:00

407 lines
19 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
/**
* End-to-end verification probe for the skill-learning pipeline.
*
* Exercises the real public API (not mocks, not unit test harness) so we
* can confirm each pipeline stage actually produces the expected on-disk
* artefacts under a clean CLAUDE_SKILL_LEARNING_HOME.
*
* Run with:
* bun run scripts/verify-skill-learning-e2e.ts
*
* Sections:
* 1. Fake transcript -> ingest -> observations on disk
* 2. Heuristic observer -> instinct candidates -> persisted instincts
* 3. Evolution -> skill / command / agent candidates
* 4. Write learned skill -> verify skill file exists
* 5. Cross-project promotion -> global instinct written
* 6. Observer backend env switch probe
* 7. Gap state machine walk-through
* 8. Tool event observer wrapper invocation
*/
import { mkdtempSync, writeFileSync, existsSync, rmSync, readdirSync } from 'node:fs'
import { readFile } from 'node:fs/promises'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { execSync } from 'node:child_process'
/** Outcome of a single probe step. */
type Result = { step: string; ok: boolean; detail: string }

/** Every step outcome recorded so far, in the order the steps were recorded. */
const results: Result[] = []

/**
 * Records the outcome of one probe step and echoes it to stdout.
 *
 * @param step - Short name of the step being checked.
 * @param ok - Whether the step passed.
 * @param detail - Human-readable context printed after the step name.
 */
function record(step: string, ok: boolean, detail: string): void {
  results.push({ step, ok, detail })
  const tag = ok ? 'PASS' : 'FAIL'
  // Keep an explicit delimiter between step and detail; without it the two
  // run together in the log (e.g. "ingest transcript9 observations ...").
  console.log(`[${tag}] ${step} - ${detail}`)
}
/**
 * Runs the whole end-to-end probe.
 *
 * Creates three temp directories (one storage root, two project dirs),
 * initialises the project dirs as git repos, strips provider/plugin env vars,
 * replaces `globalThis.fetch` with a throwing stub, and then walks sections
 * 1-8 against the dynamically imported skill-learning modules, reporting each
 * check through `record()`.
 *
 * Any exception thrown inside the `try` is recorded as a failed
 * "uncaught exception" step and skips the remaining sections. The `finally`
 * block records the network-isolation check, restores `fetch` and removes the
 * temp directories. The process then exits with code 1 if any step failed,
 * otherwise 0.
 */
async function main(): Promise<void> {
  const storage = mkdtempSync(join(tmpdir(), 'skill-learning-e2e-'))
  const projectA = mkdtempSync(join(tmpdir(), 'project-a-'))
  const projectB = mkdtempSync(join(tmpdir(), 'project-b-'))
  // Captured before the try so the finally block can always restore fetch and
  // report the call count, even when setup fails early.
  const originalFetch = globalThis.fetch
  let networkCalls = 0

  try {
    // Real git repos so resolveProjectContext derives distinct project IDs
    // (the default `global` fallback for non-git dirs would make A and B
    // share the same storage and defeat the cross-project probe).
    // Done inside the try so a git failure is recorded and the temp
    // directories are still cleaned up.
    execSync(`git init -q "${projectA}"`, { stdio: 'ignore' })
    execSync(
      `git -C "${projectA}" remote add origin https://example.test/project-a.git`,
      { stdio: 'ignore' },
    )
    execSync(`git init -q "${projectB}"`, { stdio: 'ignore' })
    execSync(
      `git -C "${projectB}" remote add origin https://example.test/project-b.git`,
      { stdio: 'ignore' },
    )

    // === ECC / plugin isolation ===
    // The probe must exercise only the project's own skill-learning code, not
    // the user-level ECC plugin, auto-loaded ECC skill, or any external LLM.
    // Strip every env that could route observations or observer calls outside
    // this probe's temp storage.
    for (const key of [
      'ANTHROPIC_API_KEY',
      'ANTHROPIC_AUTH_TOKEN',
      'OPENAI_API_KEY',
      'GEMINI_API_KEY',
      'GROK_API_KEY',
      'CLAUDE_CODE_PLUGINS_DIR',
      'CLAUDE_PLUGINS_DIR',
      'CLAUDE_PLUGIN_MARKETPLACE',
      'ECC_PLUGIN_ROOT',
      'ECC_ENABLED',
    ]) {
      delete process.env[key]
    }
    process.env.CLAUDE_SKILL_LEARNING_HOME = storage
    process.env.SKILL_LEARNING_ENABLED = '1'
    process.env.SKILL_SEARCH_ENABLED = '1'
    // Force heuristic backend — no LLM round-trips allowed in clean-room probe.
    process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'heuristic'
    process.env.CLAUDE_SKILL_LEARNING_DISABLE = ''

    // Instrument global fetch so any stray network call from the skill-learning
    // path (unexpected LLM fallback, plugin webhook, etc.) throws. The call is
    // counted before throwing, so the isolation check in `finally` still fails
    // even if a callee swallows the error in its own try/catch.
    globalThis.fetch = ((...args: unknown[]) => {
      networkCalls += 1
      throw new Error(
        `clean-room probe must not make network calls, attempted: ${String(args[0])}`,
      )
    }) as typeof globalThis.fetch

    console.log(`storage=${storage}`)
    console.log(`ecc-isolation: API_KEY env vars cleared, fetch stubbed, observer=heuristic`)

    const skillLearning = await import('../src/services/skillLearning/index.js')
    const projectCtx = await import('../src/services/skillLearning/projectContext.js')

    // ----------------------------------------------------------------------
    // 1. Ingest a synthetic transcript and verify observations land on disk
    // ----------------------------------------------------------------------
    const transcriptPath = join(storage, 'session.jsonl')
    const transcriptLines = [
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '请重构 loader.ts 的错误处理' } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Grep', input: { pattern: 'throw new Error', path: 'src' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Grep', content: 'src/loader.ts:42', is_error: false }] } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Read', input: { file_path: 'src/loader.ts' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Read', content: 'export function load() { ... }', is_error: false }] } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Edit', input: { file_path: 'src/loader.ts', old_string: 'throw new Error', new_string: 'throw new LoaderError' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Edit', content: 'diff', is_error: false }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '不要直接 mock用 testing-library' } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '必须用 testing-library 不要 mock' } },
    ]
    // Explicit arrow: passing JSON.stringify directly to map() would feed the
    // element index and the array in as its `replacer` and `space` arguments.
    writeFileSync(transcriptPath, transcriptLines.map(line => JSON.stringify(line)).join('\n'))
    const projectAContext = projectCtx.resolveProjectContext(projectA)
    const observations = await skillLearning.ingestTranscript(transcriptPath, { project: projectAContext })
    record(
      'ingest transcript',
      observations.length > 0,
      `${observations.length} observations written under project ${projectAContext.projectId}`,
    )
    const reread = await skillLearning.readObservations({ project: projectAContext })
    record(
      'observations persist on disk',
      reread.length === observations.length,
      `disk has ${reread.length} observations (expected ${observations.length})`,
    )

    // ----------------------------------------------------------------------
    // 2. Heuristic observer -> instinct candidates -> store
    // ----------------------------------------------------------------------
    skillLearning.setActiveObserverBackend('heuristic')
    const candidates = await skillLearning.analyzeWithActiveBackend(observations, { project: projectAContext })
    record(
      'heuristic backend produces candidates',
      candidates.length > 0,
      `${candidates.length} candidates; first trigger=${candidates[0]?.trigger ?? '?'}`,
    )
    for (const c of candidates) {
      await skillLearning.upsertInstinct(skillLearning.createInstinct(c), { project: projectAContext })
    }
    const persistedInstincts = await skillLearning.loadInstincts({ project: projectAContext })
    record(
      'instincts persisted',
      persistedInstincts.length > 0,
      `${persistedInstincts.length} instincts on disk for project A`,
    )

    // Contradiction probe — push a contradicting instinct to verify conflict-hold
    const first = persistedInstincts[0]
    if (first) {
      const contradictor = skillLearning.createInstinct({
        trigger: first.trigger,
        // Flip the polarity of the action: swap "avoid" for "prefer" when
        // present, otherwise prefix the action with "avoid ".
        action: first.action.includes('avoid')
          ? first.action.replace('avoid', 'prefer')
          : first.action.replace(/^/, 'avoid '),
        confidence: 0.5,
        domain: first.domain,
        source: 'session-observation',
        scope: first.scope,
        projectId: projectAContext.projectId,
        projectName: projectAContext.projectName,
        evidence: ['contradiction probe'],
        observationIds: [],
      })
      await skillLearning.upsertInstinct(contradictor, { project: projectAContext })
      const after = await skillLearning.loadInstincts({ project: projectAContext })
      const merged = after.find(i => i.id === first.id) ?? after.find(i => i.trigger === first.trigger)
      record(
        'contradiction lowers confidence',
        !!merged && merged.confidence < first.confidence,
        `before=${first.confidence.toFixed(2)} after=${merged?.confidence.toFixed(2) ?? 'n/a'}`,
      )
    } else {
      // Report the skipped check explicitly, matching sections 4 and 5,
      // instead of silently dropping it from the summary.
      record('contradiction lowers confidence', false, 'no persisted instinct to contradict')
    }

    // ----------------------------------------------------------------------
    // 3. Evolution candidates
    //
    // clusterInstincts requires EITHER 2+ instincts in the same
    // (domain, normalized-trigger) bucket OR a single instinct with
    // confidence >= 0.8. Inject a high-confidence skill instinct + a
    // 4-instinct agent cluster + a "command"-flavoured instinct so each
    // of the three evolution paths actually has candidates to emit.
    // ----------------------------------------------------------------------
    const highConfidenceSkill = skillLearning.createInstinct({
      trigger: 'When editing TypeScript error handling',
      action: 'prefer throwing domain-specific Error subclasses',
      confidence: 0.9,
      domain: 'code-style',
      source: 'session-observation',
      scope: 'project',
      projectId: projectAContext.projectId,
      projectName: projectAContext.projectName,
      evidence: ['observed 2x in session'],
      observationIds: [],
    })
    await skillLearning.upsertInstinct(highConfidenceSkill, { project: projectAContext })
    const commandSeed = skillLearning.createInstinct({
      trigger: 'User asks to run the full test suite',
      action: 'run bun test after every multi-file edit',
      confidence: 0.9,
      domain: 'workflow',
      source: 'session-observation',
      scope: 'project',
      projectId: projectAContext.projectId,
      projectName: projectAContext.projectName,
      evidence: ['user explicitly requested bun test'],
      observationIds: [],
    })
    await skillLearning.upsertInstinct(commandSeed, { project: projectAContext })
    for (let i = 0; i < 4; i += 1) {
      const agentSeed = skillLearning.createInstinct({
        trigger: 'When debugging multi-step investigate flow',
        action: `step ${i + 1}: research root cause and verify`,
        confidence: 0.85,
        domain: 'debugging',
        source: 'session-observation',
        scope: 'project',
        projectId: projectAContext.projectId,
        projectName: projectAContext.projectName,
        evidence: [`debug step ${i + 1}`],
        observationIds: [],
      })
      await skillLearning.upsertInstinct(agentSeed, { project: projectAContext })
    }
    const allInstincts = await skillLearning.loadInstincts({ project: projectAContext })
    const skillCandidates = skillLearning.generateSkillCandidates(allInstincts, { cwd: projectA })
    const commandCandidates = skillLearning.generateCommandCandidates(allInstincts, { cwd: projectA })
    const agentCandidates = skillLearning.generateAgentCandidates(allInstincts, { cwd: projectA })
    record(
      'evolution skill path emits candidate (single high-conf instinct)',
      skillCandidates.length >= 1,
      `skillCandidates=${skillCandidates.length}`,
    )
    record(
      'evolution command path emits candidate (trigger matches user-asks heuristic)',
      commandCandidates.length >= 1,
      `commandCandidates=${commandCandidates.length}`,
    )
    record(
      'evolution agent path emits candidate (4+ debugging instincts)',
      agentCandidates.length >= 1,
      `agentCandidates=${agentCandidates.length}`,
    )

    // ----------------------------------------------------------------------
    // 4. Write learned skill + verify file on disk
    // ----------------------------------------------------------------------
    const firstDraft = skillCandidates[0]
    if (firstDraft) {
      const activePath = await skillLearning.writeLearnedSkill(firstDraft)
      // writeLearnedSkill returns the full SKILL.md path (not the directory).
      const exists = existsSync(activePath)
      record(
        'writeLearnedSkill produces SKILL.md',
        exists,
        `path=${activePath} exists=${exists}`,
      )
    } else {
      record('writeLearnedSkill produces SKILL.md', false, 'no skill candidate to write')
    }

    // ----------------------------------------------------------------------
    // 5. Cross-project promotion
    // ----------------------------------------------------------------------
    const projectBContext = projectCtx.resolveProjectContext(projectB)
    // Duplicate one high-confidence instinct into project B so promotion threshold
    // (>= 2 projects, avg conf >= 0.8) is met. We seeded a 0.9-confidence skill
    // instinct above, so this lookup succeeds deterministically.
    const pickable = allInstincts.find(i => i.confidence >= 0.8)
    if (pickable) {
      const projectBCopy = { ...pickable, projectId: projectBContext.projectId, projectName: projectBContext.projectName }
      await skillLearning.saveInstinct(projectBCopy, { project: projectBContext, scope: 'project' })
      // findPromotionCandidates groups by instinct id + distinct projectId
      // count; give it the real merged array seen across both project stores.
      const fromA = await skillLearning.loadInstincts({ project: projectAContext })
      const fromB = await skillLearning.loadInstincts({ project: projectBContext })
      const candidatesPre = skillLearning.findPromotionCandidates([
        ...fromA,
        ...fromB,
      ])
      record(
        'cross-project candidate visible',
        candidatesPre.length > 0,
        `${candidatesPre.length} promotable instincts across projects (A=${fromA.length} B=${fromB.length})`,
      )
      await skillLearning.checkPromotion({ project: projectAContext })
      const globalRoot = { scope: 'global' as const, rootDir: storage }
      const globalInstincts = await skillLearning.loadInstincts(globalRoot)
      record(
        'checkPromotion writes global instinct',
        globalInstincts.some(i => i.id === pickable.id),
        `global scope has ${globalInstincts.length} instincts; target id ${pickable.id} present=${globalInstincts.some(i => i.id === pickable.id)}`,
      )
    } else {
      record('cross-project promotion', false, 'no instinct with confidence >= 0.8 to promote')
    }

    // ----------------------------------------------------------------------
    // 6. Observer backend env switch probe
    // ----------------------------------------------------------------------
    // Snapshot the env value so the inner finally can put it back regardless
    // of how the switch checks turn out.
    const originalBackendEnv = process.env.SKILL_LEARNING_OBSERVER_BACKEND
    try {
      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'llm'
      skillLearning.resolveDefaultObserverBackend()
      const active = skillLearning.getActiveObserverBackend().name
      record('env switch llm activates', active === 'llm', `active backend=${active}`)
      // The check below expects an unrecognised value to leave the previously
      // active backend ('llm') in place.
      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'unknown-typo'
      skillLearning.resolveDefaultObserverBackend()
      const stillActive = skillLearning.getActiveObserverBackend().name
      record('typo env does not crash', stillActive === 'llm', `active after typo=${stillActive}`)
      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'heuristic'
      skillLearning.resolveDefaultObserverBackend()
      record('env switch back to heuristic', skillLearning.getActiveObserverBackend().name === 'heuristic', `active=${skillLearning.getActiveObserverBackend().name}`)
    } finally {
      if (originalBackendEnv === undefined) delete process.env.SKILL_LEARNING_OBSERVER_BACKEND
      else process.env.SKILL_LEARNING_OBSERVER_BACKEND = originalBackendEnv
    }

    // ----------------------------------------------------------------------
    // 7. Gap state machine walk-through
    // ----------------------------------------------------------------------
    // The same prompt is recorded twice: the first occurrence is expected to
    // stay 'pending' without a draft, the second to become a 'draft'.
    const prompt = 'auto-generate e2e verify script skeleton'
    const firstGap = await skillLearning.recordSkillGap({
      prompt,
      cwd: projectA,
      sessionId: 'e2e-a',
      project: projectAContext,
      rootDir: storage,
    })
    record('first gap is pending (no draft)', firstGap.status === 'pending' && !firstGap.draft, `status=${firstGap.status} draft=${!!firstGap.draft}`)
    const secondGap = await skillLearning.recordSkillGap({
      prompt,
      cwd: projectA,
      sessionId: 'e2e-a',
      project: projectAContext,
      rootDir: storage,
    })
    record('second occurrence promotes to draft', secondGap.status === 'draft' && !!secondGap.draft, `status=${secondGap.status} draftPath=${secondGap.draft?.skillPath ?? 'n/a'}`)

    // ----------------------------------------------------------------------
    // 8. Tool event observer wrapper invocation
    // ----------------------------------------------------------------------
    let wrappedRan = false
    const wrappedResult = await skillLearning.runToolCallWithSkillLearningHooks(
      'VerifyProbeTool',
      { sample: 'input' },
      { sessionId: skillLearning.RUNTIME_SESSION_ID, turn: 1 },
      async () => {
        wrappedRan = true
        return { data: { ok: true, payload: 42 } }
      },
    )
    record(
      'runToolCallWithSkillLearningHooks invokes inner fn',
      wrappedRan && (wrappedResult as { data?: { ok?: boolean } })?.data?.ok === true,
      `inner ran=${wrappedRan} result=${JSON.stringify(wrappedResult)}`,
    )
    // Observations produced by the wrapper are written under the project
    // context derived from process.cwd() (the test runner repo, not our
    // ephemeral projectA). Read from BOTH project scopes to catch either.
    const repoProject = projectCtx.resolveProjectContext(process.cwd())
    const [obsInProjectA, obsInRepo] = await Promise.all([
      skillLearning.readObservations({ project: projectAContext }),
      skillLearning.readObservations({ project: repoProject }),
    ])
    const toolHookRecords = [...obsInProjectA, ...obsInRepo].filter(
      o => o.source === 'tool-hook' && o.toolName === 'VerifyProbeTool',
    )
    record(
      'wrapper writes tool-hook observations',
      toolHookRecords.length > 0,
      `${toolHookRecords.length} tool-hook records on disk (projectA=${obsInProjectA.length} repo=${obsInRepo.length})`,
    )
  } catch (error) {
    record('uncaught exception', false, String(error))
  } finally {
    // Assert clean-room isolation held for the whole probe.
    record(
      'clean-room isolation: zero network calls',
      networkCalls === 0,
      `${networkCalls} network calls attempted`,
    )
    globalThis.fetch = originalFetch
    // Best-effort cleanup: a directory that cannot be removed must not stop
    // the remaining directories from being cleaned, nor prevent the summary
    // and exit code below from being reported.
    for (const dir of [storage, projectA, projectB]) {
      try {
        rmSync(dir, { recursive: true, force: true, maxRetries: 5, retryDelay: 100 })
      } catch (cleanupError) {
        console.warn(`cleanup failed for ${dir}: ${String(cleanupError)}`)
      }
    }
  }

  const passed = results.filter(r => r.ok).length
  const failed = results.filter(r => !r.ok).length
  console.log(`\n=== SUMMARY ===\n${passed} pass, ${failed} fail, ${results.length} total`)
  process.exit(failed > 0 ? 1 : 0)
}
// Entry point. main() normally terminates the process itself via
// process.exit(); this handler covers a rejection that escapes it (e.g. a
// setup failure before its try block) by logging the error and exiting
// non-zero rather than relying on the runtime's unhandled-rejection behaviour.
void main().catch((error: unknown) => {
  console.error(error)
  process.exit(1)
})