mirror of
https://github.com/claude-code-best/claude-code.git
synced 2026-06-15 21:05:51 +00:00
主要变更: - Skill Learning 闭环系统 (9/9 AC) - Opus 4.7 模型层接入 + adaptive thinking - Prompt 工程优化 (64 审计测试) - Agent Teams 简化门控 (默认启用) - Windows Terminal 后端修复 (EncodedCommand/WT_SESSION) - TF-IDF 技能搜索精准化 (字段加权/CJK 优化) - Autonomy 系统 (/autonomy 命令) - ACP 协议完整实现 - mock.module 泄漏修复 (CI 全绿) - 152+ lint/type 修复
407 lines
19 KiB
TypeScript
407 lines
19 KiB
TypeScript
/**
 * End-to-end verification probe for the skill-learning pipeline.
 *
 * Exercises the real public API (not mocks, not unit test harness) so we
 * can confirm each pipeline stage actually produces the expected on-disk
 * artefacts under a clean CLAUDE_SKILL_LEARNING_HOME.
 *
 * Run with:
 *   bun run scripts/verify-skill-learning-e2e.ts
 *
 * Sections:
 *   1. Fake transcript -> ingest -> observations on disk
 *   2. Heuristic observer -> instinct candidates -> persisted instincts
 *   3. Evolution -> skill / command / agent candidates
 *   4. Write learned skill -> verify skill file exists
 *   5. Cross-project promotion -> global instinct written
 *   6. Observer backend env switch probe
 *   7. Gap state machine walk-through
 *   8. Tool event observer wrapper invocation
 */
|
||
|
||
import { mkdtempSync, writeFileSync, existsSync, rmSync, readdirSync } from 'node:fs'
|
||
import { readFile } from 'node:fs/promises'
|
||
import { tmpdir } from 'node:os'
|
||
import { join } from 'node:path'
|
||
import { execSync } from 'node:child_process'
|
||
|
||
/** Outcome of a single probe check. */
type Result = { step: string; ok: boolean; detail: string }

/** Every check recorded so far, in the order the checks ran. */
const results: Result[] = []

/**
 * Stores the outcome of one check and prints it as a `[PASS]`/`[FAIL]` line.
 *
 * @param step   Short name of the check.
 * @param ok     Whether the check passed.
 * @param detail Human-readable context printed after the step name.
 */
function record(step: string, ok: boolean, detail: string): void {
  results.push({ step, ok, detail })
  const tag = ok ? 'PASS' : 'FAIL'
  console.log(`[${tag}] ${step} — ${detail}`)
}
|
||
|
||
/**
 * Runs the eight-section skill-learning probe against throwaway temp
 * directories, prints one PASS/FAIL line per check followed by a summary,
 * and terminates the process with exit status 1 if any check failed
 * (0 otherwise).
 *
 * All work that can throw — temp-dir creation, `git init`, the dynamic
 * imports and every pipeline call — happens inside one try block. A failure
 * is therefore recorded as an `uncaught exception` FAIL result, and the
 * finally block still restores `globalThis.fetch`, removes whatever temp
 * directories were created, and lets the summary print.
 *
 * Side effects: deletes API-key / plugin variables from `process.env` and
 * sets skill-learning variables (most are not restored; the function ends
 * with `process.exit`), temporarily replaces `globalThis.fetch` with a stub
 * that throws, and spawns `git`.
 */
async function main(): Promise<void> {
  // Each temp dir is registered the moment it is created so the finally block
  // can remove exactly what exists, even if setup fails part-way through.
  const tempDirs: string[] = []
  const makeTempDir = (prefix: string): string => {
    const dir = mkdtempSync(join(tmpdir(), prefix))
    tempDirs.push(dir)
    return dir
  }

  const originalFetch = globalThis.fetch
  let networkCalls = 0

  try {
    const storage = makeTempDir('skill-learning-e2e-')
    const projectA = makeTempDir('project-a-')
    const projectB = makeTempDir('project-b-')

    // Real git repos so resolveProjectContext derives distinct project IDs
    // (the default `global` fallback for non-git dirs would make A and B
    // share the same storage and defeat the cross-project probe).
    execSync(`git init -q "${projectA}"`, { stdio: 'ignore' })
    execSync(
      `git -C "${projectA}" remote add origin https://example.test/project-a.git`,
      { stdio: 'ignore' },
    )
    execSync(`git init -q "${projectB}"`, { stdio: 'ignore' })
    execSync(
      `git -C "${projectB}" remote add origin https://example.test/project-b.git`,
      { stdio: 'ignore' },
    )

    // === ECC / plugin isolation ===
    // The probe must exercise only the project's own skill-learning code, not
    // the user-level ECC plugin, auto-loaded ECC skill, or any external LLM.
    // Strip every env that could route observations or observer calls outside
    // this probe's temp storage.
    for (const key of [
      'ANTHROPIC_API_KEY',
      'ANTHROPIC_AUTH_TOKEN',
      'OPENAI_API_KEY',
      'GEMINI_API_KEY',
      'GROK_API_KEY',
      'CLAUDE_CODE_PLUGINS_DIR',
      'CLAUDE_PLUGINS_DIR',
      'CLAUDE_PLUGIN_MARKETPLACE',
      'ECC_PLUGIN_ROOT',
      'ECC_ENABLED',
    ]) {
      delete process.env[key]
    }
    process.env.CLAUDE_SKILL_LEARNING_HOME = storage
    process.env.SKILL_LEARNING_ENABLED = '1'
    process.env.SKILL_SEARCH_ENABLED = '1'
    // Force heuristic backend — no LLM round-trips allowed in clean-room probe.
    process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'heuristic'
    process.env.CLAUDE_SKILL_LEARNING_DISABLE = ''

    // Instrument global fetch so any stray network call from the skill-learning
    // path (unexpected LLM fallback, plugin webhook, etc.) aborts the probe
    // with a visible error rather than hiding behind a try/catch.
    globalThis.fetch = ((...args: unknown[]) => {
      networkCalls += 1
      throw new Error(
        `clean-room probe must not make network calls, attempted: ${String(args[0])}`,
      )
    }) as typeof globalThis.fetch

    console.log(`storage=${storage}`)
    console.log(`ecc-isolation: API_KEY env vars cleared, fetch stubbed, observer=heuristic`)

    const skillLearning = await import('../src/services/skillLearning/index.js')
    const projectCtx = await import('../src/services/skillLearning/projectContext.js')

    // ----------------------------------------------------------------------
    // 1. Ingest a synthetic transcript and verify observations land on disk
    // ----------------------------------------------------------------------
    const transcriptPath = join(storage, 'session.jsonl')
    const transcriptLines = [
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '请重构 loader.ts 的错误处理' } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Grep', input: { pattern: 'throw new Error', path: 'src' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Grep', content: 'src/loader.ts:42', is_error: false }] } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Read', input: { file_path: 'src/loader.ts' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Read', content: 'export function load() { ... }', is_error: false }] } },
      { type: 'assistant', sessionId: 's-e2e', cwd: projectA, message: { role: 'assistant', content: [{ type: 'tool_use', name: 'Edit', input: { file_path: 'src/loader.ts', old_string: 'throw new Error', new_string: 'throw new LoaderError' } }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: [{ type: 'tool_result', name: 'Edit', content: 'diff', is_error: false }] } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '不要直接 mock,用 testing-library' } },
      { type: 'user', sessionId: 's-e2e', cwd: projectA, message: { role: 'user', content: '必须用 testing-library 不要 mock' } },
    ]
    // Explicit arrow: passing JSON.stringify directly would hand map's index
    // and array to its `replacer` and `space` parameters.
    writeFileSync(transcriptPath, transcriptLines.map(line => JSON.stringify(line)).join('\n'))

    const projectAContext = projectCtx.resolveProjectContext(projectA)
    const observations = await skillLearning.ingestTranscript(transcriptPath, { project: projectAContext })
    record(
      'ingest transcript',
      observations.length > 0,
      `${observations.length} observations written under project ${projectAContext.projectId}`,
    )

    const reread = await skillLearning.readObservations({ project: projectAContext })
    record(
      'observations persist on disk',
      reread.length === observations.length,
      `disk has ${reread.length} observations (expected ${observations.length})`,
    )

    // ----------------------------------------------------------------------
    // 2. Heuristic observer -> instinct candidates -> store
    // ----------------------------------------------------------------------
    skillLearning.setActiveObserverBackend('heuristic')
    const candidates = await skillLearning.analyzeWithActiveBackend(observations, { project: projectAContext })
    record(
      'heuristic backend produces candidates',
      candidates.length > 0,
      `${candidates.length} candidates; first trigger=${candidates[0]?.trigger ?? '?'}`,
    )

    for (const c of candidates) {
      await skillLearning.upsertInstinct(skillLearning.createInstinct(c), { project: projectAContext })
    }
    const persistedInstincts = await skillLearning.loadInstincts({ project: projectAContext })
    record(
      'instincts persisted',
      persistedInstincts.length > 0,
      `${persistedInstincts.length} instincts on disk for project A`,
    )

    // Contradiction probe — push a contradicting instinct to verify conflict-hold
    const first = persistedInstincts[0]
    if (first) {
      const contradictor = skillLearning.createInstinct({
        trigger: first.trigger,
        // Flip the polarity of the action: swap 'avoid' for 'prefer', or
        // prepend 'avoid ' when the action has no 'avoid' in it.
        action: first.action.includes('avoid')
          ? first.action.replace('avoid', 'prefer')
          : first.action.replace(/^/, 'avoid '),
        confidence: 0.5,
        domain: first.domain,
        source: 'session-observation',
        scope: first.scope,
        projectId: projectAContext.projectId,
        projectName: projectAContext.projectName,
        evidence: ['contradiction probe'],
        observationIds: [],
      })
      await skillLearning.upsertInstinct(contradictor, { project: projectAContext })
      const after = await skillLearning.loadInstincts({ project: projectAContext })
      // Look the instinct up by id first, falling back to its trigger.
      const merged = after.find(i => i.id === first.id) ?? after.find(i => i.trigger === first.trigger)
      record(
        'contradiction lowers confidence',
        !!merged && merged.confidence < first.confidence,
        `before=${first.confidence.toFixed(2)} after=${merged?.confidence.toFixed(2) ?? 'n/a'}`,
      )
    }

    // ----------------------------------------------------------------------
    // 3. Evolution candidates
    //
    // clusterInstincts requires EITHER 2+ instincts in the same
    // (domain, normalized-trigger) bucket OR a single instinct with
    // confidence >= 0.8. Inject a high-confidence skill instinct + a
    // 4-instinct agent cluster + a "command"-flavoured instinct so each
    // of the three evolution paths actually has candidates to emit.
    // ----------------------------------------------------------------------
    const highConfidenceSkill = skillLearning.createInstinct({
      trigger: 'When editing TypeScript error handling',
      action: 'prefer throwing domain-specific Error subclasses',
      confidence: 0.9,
      domain: 'code-style',
      source: 'session-observation',
      scope: 'project',
      projectId: projectAContext.projectId,
      projectName: projectAContext.projectName,
      evidence: ['observed 2x in session'],
      observationIds: [],
    })
    await skillLearning.upsertInstinct(highConfidenceSkill, { project: projectAContext })

    const commandSeed = skillLearning.createInstinct({
      trigger: 'User asks to run the full test suite',
      action: 'run bun test after every multi-file edit',
      confidence: 0.9,
      domain: 'workflow',
      source: 'session-observation',
      scope: 'project',
      projectId: projectAContext.projectId,
      projectName: projectAContext.projectName,
      evidence: ['user explicitly requested bun test'],
      observationIds: [],
    })
    await skillLearning.upsertInstinct(commandSeed, { project: projectAContext })

    for (let i = 0; i < 4; i += 1) {
      const agentSeed = skillLearning.createInstinct({
        trigger: 'When debugging multi-step investigate flow',
        action: `step ${i + 1}: research root cause and verify`,
        confidence: 0.85,
        domain: 'debugging',
        source: 'session-observation',
        scope: 'project',
        projectId: projectAContext.projectId,
        projectName: projectAContext.projectName,
        evidence: [`debug step ${i + 1}`],
        observationIds: [],
      })
      await skillLearning.upsertInstinct(agentSeed, { project: projectAContext })
    }

    const allInstincts = await skillLearning.loadInstincts({ project: projectAContext })
    const skillCandidates = skillLearning.generateSkillCandidates(allInstincts, { cwd: projectA })
    const commandCandidates = skillLearning.generateCommandCandidates(allInstincts, { cwd: projectA })
    const agentCandidates = skillLearning.generateAgentCandidates(allInstincts, { cwd: projectA })
    record(
      'evolution skill path emits candidate (single high-conf instinct)',
      skillCandidates.length >= 1,
      `skillCandidates=${skillCandidates.length}`,
    )
    record(
      'evolution command path emits candidate (trigger matches user-asks heuristic)',
      commandCandidates.length >= 1,
      `commandCandidates=${commandCandidates.length}`,
    )
    record(
      'evolution agent path emits candidate (4+ debugging instincts)',
      agentCandidates.length >= 1,
      `agentCandidates=${agentCandidates.length}`,
    )

    // ----------------------------------------------------------------------
    // 4. Write learned skill + verify file on disk
    // ----------------------------------------------------------------------
    const firstDraft = skillCandidates[0]
    if (firstDraft) {
      const activePath = await skillLearning.writeLearnedSkill(firstDraft)
      // writeLearnedSkill returns the full SKILL.md path (not the directory).
      const exists = existsSync(activePath)
      record(
        'writeLearnedSkill produces SKILL.md',
        exists,
        `path=${activePath} exists=${exists}`,
      )
    } else {
      record('writeLearnedSkill produces SKILL.md', false, 'no skill candidate to write')
    }

    // ----------------------------------------------------------------------
    // 5. Cross-project promotion
    // ----------------------------------------------------------------------
    const projectBContext = projectCtx.resolveProjectContext(projectB)
    // Duplicate one high-confidence instinct into project B so promotion threshold
    // (>= 2 projects, avg conf >= 0.8) is met. We seeded a 0.9-confidence skill
    // instinct above, so this lookup succeeds deterministically.
    const pickable = allInstincts.find(i => i.confidence >= 0.8)
    if (pickable) {
      const projectBCopy = { ...pickable, projectId: projectBContext.projectId, projectName: projectBContext.projectName }
      await skillLearning.saveInstinct(projectBCopy, { project: projectBContext, scope: 'project' })
      // findPromotionCandidates groups by instinct id + distinct projectId
      // count; give it the real merged array seen across both project stores.
      const fromA = await skillLearning.loadInstincts({ project: projectAContext })
      const fromB = await skillLearning.loadInstincts({ project: projectBContext })
      const candidatesPre = skillLearning.findPromotionCandidates([
        ...fromA,
        ...fromB,
      ])
      record(
        'cross-project candidate visible',
        candidatesPre.length > 0,
        `${candidatesPre.length} promotable instincts across projects (A=${fromA.length} B=${fromB.length})`,
      )

      await skillLearning.checkPromotion({ project: projectAContext })
      const globalRoot = { scope: 'global' as const, rootDir: storage }
      const globalInstincts = await skillLearning.loadInstincts(globalRoot)
      record(
        'checkPromotion writes global instinct',
        globalInstincts.some(i => i.id === pickable.id),
        `global scope has ${globalInstincts.length} instincts; target id ${pickable.id} present=${globalInstincts.some(i => i.id === pickable.id)}`,
      )
    } else {
      record('cross-project promotion', false, 'no instinct with confidence >= 0.8 to promote')
    }

    // ----------------------------------------------------------------------
    // 6. Observer backend env switch probe
    // ----------------------------------------------------------------------
    const originalBackendEnv = process.env.SKILL_LEARNING_OBSERVER_BACKEND
    try {
      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'llm'
      skillLearning.resolveDefaultObserverBackend()
      const active = skillLearning.getActiveObserverBackend().name
      record('env switch llm activates', active === 'llm', `active backend=${active}`)

      // An unrecognised value is expected to leave the previously active
      // backend ('llm') in place.
      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'unknown-typo'
      skillLearning.resolveDefaultObserverBackend()
      const stillActive = skillLearning.getActiveObserverBackend().name
      record('typo env does not crash', stillActive === 'llm', `active after typo=${stillActive}`)

      process.env.SKILL_LEARNING_OBSERVER_BACKEND = 'heuristic'
      skillLearning.resolveDefaultObserverBackend()
      record('env switch back to heuristic', skillLearning.getActiveObserverBackend().name === 'heuristic', `active=${skillLearning.getActiveObserverBackend().name}`)
    } finally {
      // Put the env var back exactly as it was before this section ran.
      if (originalBackendEnv === undefined) delete process.env.SKILL_LEARNING_OBSERVER_BACKEND
      else process.env.SKILL_LEARNING_OBSERVER_BACKEND = originalBackendEnv
    }

    // ----------------------------------------------------------------------
    // 7. Gap state machine walk-through
    // ----------------------------------------------------------------------
    const prompt = 'auto-generate e2e verify script skeleton'
    const firstGap = await skillLearning.recordSkillGap({
      prompt,
      cwd: projectA,
      sessionId: 'e2e-a',
      project: projectAContext,
      rootDir: storage,
    })
    record('first gap is pending (no draft)', firstGap.status === 'pending' && !firstGap.draft, `status=${firstGap.status} draft=${!!firstGap.draft}`)

    // Same prompt, same session: the second report is expected to produce a draft.
    const secondGap = await skillLearning.recordSkillGap({
      prompt,
      cwd: projectA,
      sessionId: 'e2e-a',
      project: projectAContext,
      rootDir: storage,
    })
    record('second occurrence promotes to draft', secondGap.status === 'draft' && !!secondGap.draft, `status=${secondGap.status} draftPath=${secondGap.draft?.skillPath ?? 'n/a'}`)

    // ----------------------------------------------------------------------
    // 8. Tool event observer wrapper invocation
    // ----------------------------------------------------------------------
    let wrappedRan = false
    const wrappedResult = await skillLearning.runToolCallWithSkillLearningHooks(
      'VerifyProbeTool',
      { sample: 'input' },
      { sessionId: skillLearning.RUNTIME_SESSION_ID, turn: 1 },
      async () => {
        wrappedRan = true
        return { data: { ok: true, payload: 42 } }
      },
    )
    record(
      'runToolCallWithSkillLearningHooks invokes inner fn',
      wrappedRan && (wrappedResult as { data?: { ok?: boolean } })?.data?.ok === true,
      `inner ran=${wrappedRan} result=${JSON.stringify(wrappedResult)}`,
    )

    // Observations produced by the wrapper are written under the project
    // context derived from process.cwd() (the test runner repo, not our
    // ephemeral projectA). Read from BOTH project scopes to catch either.
    const repoProject = projectCtx.resolveProjectContext(process.cwd())
    const [obsInProjectA, obsInRepo] = await Promise.all([
      skillLearning.readObservations({ project: projectAContext }),
      skillLearning.readObservations({ project: repoProject }),
    ])
    const toolHookRecords = [...obsInProjectA, ...obsInRepo].filter(
      o => o.source === 'tool-hook' && o.toolName === 'VerifyProbeTool',
    )
    record(
      'wrapper writes tool-hook observations',
      toolHookRecords.length > 0,
      `${toolHookRecords.length} tool-hook records on disk (projectA=${obsInProjectA.length} repo=${obsInRepo.length})`,
    )
  } catch (error: unknown) {
    record('uncaught exception', false, String(error))
  } finally {
    // Assert clean-room isolation held for the whole probe.
    record(
      'clean-room isolation: zero network calls',
      networkCalls === 0,
      `${networkCalls} network calls attempted`,
    )
    globalThis.fetch = originalFetch
    // Remove each temp dir independently: a failure on one must not skip the
    // others, nor prevent the summary and exit code below.
    for (const dir of tempDirs) {
      try {
        rmSync(dir, { recursive: true, force: true, maxRetries: 5, retryDelay: 100 })
      } catch (cleanupError: unknown) {
        console.warn(`failed to remove temp dir ${dir}: ${String(cleanupError)}`)
      }
    }
  }

  const passed = results.filter(r => r.ok).length
  const failed = results.filter(r => !r.ok).length
  console.log(`\n=== SUMMARY ===\n${passed} pass, ${failed} fail, ${results.length} total`)
  process.exit(failed > 0 ? 1 : 0)
}
|
||
|
||
// Entry point. `main` normally ends the process itself via process.exit; the
// catch handles a rejection that escapes it, printing the error and exiting
// non-zero instead of leaving an unhandled promise rejection.
void main().catch((error: unknown) => {
  console.error(error)
  process.exit(1)
})
|