refactor: 简化/复用/防御 — 清理 PR #386 审计发现

简化 (S1, S2):
- src/cli/print.ts: 抽出 dispatchHeadlessCronCommand 本地 helper,把
  cron 三个入口(onFire / onFireTask agent / onFireTask 非-agent)共享的
  「dedup-claim → input-close-recheck → onSuccess」管线集中到一处,
  避免三个分支在「claim 与 dispatch 之间发生 inputClosed」的处理上漂移。
  enqueueAndRun 再抽出来,使两个非-agent 分支共用一个 onSuccess 回调。
  约 -55 行重复模板。
- src/utils/autonomyPersistence.ts: 新增 retainActiveFirst<T> 泛型
  helper —— active 记录无条件保留(不参与 cap),inactive 按 timestamp
  desc 填满剩余预算;统一 selectPersistedAutonomyRuns / Flows 的两阶段
  排序语义。
- src/utils/autonomyRuns.ts、autonomyFlows.ts: 改用 retainActiveFirst,
  删掉重复的内联两阶段排序逻辑。

复用 (R1, review #8):
- tests/mocks/file-system.ts: 新增 readTempFile / tempPathExists 两个
  Bun.file 包装,补齐 Node fs.readFileSync / existsSync 在测试里的
  Bun-only 等价物。
- src/utils/__tests__/autonomyRuns.test.ts: 把全部 Node fs/path 导入
  (existsSync, readFileSync, mkdir, writeFile, path.join/resolve)替换为
  tests/mocks/file-system 的共享 helper + node:path(带 node: 前缀)。
  不再有 6 处 mkdir + writeFile 模板,统一用 writeTempFile(自带 mkdir-p)。
  解决 review #8 (Major) 的 Bun-only 运行时契约违反。

防御 (D1, OOM 早期信号):
- src/services/compact/postCompactCleanup.ts: 在 void import().then() 末尾
  补 .catch(logError)。当前 attributionHooks 是 stub,但当真实现被恢复
  且 sweepFileContentCache 抛错时,这个 .catch 阻止它变成 unhandled
  rejection(函数返回值是 void,调用者无从观察异步失败)。
- src/utils/autonomyRuns.ts: 给 active runs 加 100 条软上限 + 一次性
  warn。selectPersistedAutonomyRuns 仍然永不淘汰 active 记录,但跨过
  阈值时 logError 一次,作为 finalize-leak 早期信号——避免 active 无限
  增长悄悄使 AUTONOMY_RUNS_MAX 失效。
This commit is contained in:
Claude
2026-04-29 13:23:41 +00:00
parent 6b7cfda9b1
commit 7a6e65caf7
7 changed files with 190 additions and 143 deletions

View File

@@ -1,7 +1,5 @@
import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
import { existsSync, readFileSync } from 'fs'
import { mkdir, writeFile } from 'fs/promises'
import { join, resolve as resolvePath } from 'path'
import { join, resolve as resolvePath } from 'node:path'
import {
resetStateForTests,
setCwdState,
@@ -42,11 +40,14 @@ import {
cleanupTempDir,
createTempDir,
createTempSubdir,
readTempFile,
tempPathExists,
writeTempFile,
} from '../../../tests/mocks/file-system'
const AGENTS_REL = join(AUTONOMY_DIR, 'AGENTS.md')
const HEARTBEAT_REL = join(AUTONOMY_DIR, 'HEARTBEAT.md')
const RUNS_REL = join(AUTONOMY_DIR, 'runs.json')
let tempDir = ''
@@ -349,9 +350,9 @@ describe('autonomyRuns', () => {
)
// Seed an active queued run for cron-1 so the next dedup attempt skips.
await mkdir(join(tempDir, AUTONOMY_DIR), { recursive: true })
await writeFile(
resolveAutonomyRunsPath(tempDir),
await writeTempFile(
tempDir,
RUNS_REL,
`${JSON.stringify(
{
runs: [
@@ -373,7 +374,6 @@ describe('autonomyRuns', () => {
null,
2,
)}\n`,
'utf-8',
)
const skipped = await createAutonomyQueuedPromptIfNoActiveSource({
@@ -400,9 +400,9 @@ describe('autonomyRuns', () => {
})
test('createAutonomyQueuedPromptIfNoActiveSource recovers stale active runs from dead owner processes', async () => {
await mkdir(join(tempDir, AUTONOMY_DIR), { recursive: true })
await writeFile(
resolveAutonomyRunsPath(tempDir),
await writeTempFile(
tempDir,
RUNS_REL,
`${JSON.stringify(
{
runs: [
@@ -426,7 +426,6 @@ describe('autonomyRuns', () => {
null,
2,
)}\n`,
'utf-8',
)
await expect(
@@ -483,7 +482,7 @@ describe('autonomyRuns', () => {
await markAutonomyRunRunning(runId, tempDir, 100)
const runsPath = resolveAutonomyRunsPath(tempDir)
const file = JSON.parse(readFileSync(runsPath, 'utf-8')) as {
const file = JSON.parse(await readTempFile(runsPath)) as {
runs: Array<Record<string, unknown>>
}
file.runs = file.runs.map(run =>
@@ -491,7 +490,7 @@ describe('autonomyRuns', () => {
? { ...run, ownerProcessId: 2_147_483_647 }
: run,
)
await writeFile(runsPath, `${JSON.stringify(file, null, 2)}\n`, 'utf-8')
await writeTempFile(tempDir, RUNS_REL, `${JSON.stringify(file, null, 2)}\n`)
const replacement = await createAutonomyQueuedPromptIfNoActiveSource({
basePrompt: 'replacement prompt',
@@ -615,11 +614,10 @@ describe('autonomyRuns', () => {
endedAt: 2_000 + index,
})),
]
await mkdir(join(tempDir, AUTONOMY_DIR), { recursive: true })
await writeFile(
resolveAutonomyRunsPath(tempDir),
await writeTempFile(
tempDir,
RUNS_REL,
`${JSON.stringify({ runs }, null, 2)}\n`,
'utf-8',
)
await createAutonomyRun({
@@ -637,10 +635,9 @@ describe('autonomyRuns', () => {
})
test('listAutonomyRuns keeps older persisted records by normalizing missing runtime and owner metadata', async () => {
const runsPath = resolveAutonomyRunsPath(tempDir)
await mkdir(join(tempDir, '.claude', 'autonomy'), { recursive: true })
await writeFile(
runsPath,
await writeTempFile(
tempDir,
RUNS_REL,
`${JSON.stringify(
{
runs: [
@@ -657,7 +654,6 @@ describe('autonomyRuns', () => {
null,
2,
)}\n`,
'utf-8',
)
const [legacy] = await listAutonomyRuns(tempDir)
@@ -832,7 +828,7 @@ describe('autonomyRuns', () => {
expect(recovered!.autonomy?.flowId).toBe(flow!.flowId)
})
test('STALE_ACTIVE_RUN_ERROR_PREFIX stays in sync with HEARTBEAT.md stale-recovery-health task', () => {
test('STALE_ACTIVE_RUN_ERROR_PREFIX stays in sync with HEARTBEAT.md stale-recovery-health task', async () => {
// The HEARTBEAT.md stale-recovery-health task prompt embeds this prefix
// as a literal string. Changing the constant without updating the
// heartbeat prompt would silently break the monitor — this test fails
@@ -846,12 +842,12 @@ describe('autonomyRuns', () => {
'autonomy',
'HEARTBEAT.md',
)
if (!existsSync(heartbeatPath)) {
if (!(await tempPathExists(heartbeatPath))) {
// .claude/ may be absent in some checkout layouts (e.g., shallow clone
// for npm pack). Skip rather than fail in that case.
return
}
const content = readFileSync(heartbeatPath, 'utf8')
const content = await readTempFile(heartbeatPath)
expect(content).toContain(STALE_ACTIVE_RUN_ERROR_PREFIX)
})
})

View File

@@ -3,7 +3,10 @@ import { mkdir, writeFile } from 'fs/promises'
import { dirname, join, resolve } from 'path'
import { getProjectRoot } from '../bootstrap/state.js'
import { AUTONOMY_DIR, type AutonomyTriggerKind } from './autonomyAuthority.js'
import { withAutonomyPersistenceLock } from './autonomyPersistence.js'
import {
retainActiveFirst,
withAutonomyPersistenceLock,
} from './autonomyPersistence.js'
import { getFsImplementation } from './fsOperations.js'
const AUTONOMY_FLOWS_MAX = 100
@@ -170,26 +173,12 @@ function isManagedFlowStatusActive(status: AutonomyFlowStatus): boolean {
function selectPersistedAutonomyFlows(
flows: AutonomyFlowRecord[],
): AutonomyFlowRecord[] {
// Two-phase sort. Phase 1: priority sort (active flows first, then by
// updatedAt desc) selects the AUTONOMY_FLOWS_MAX most-relevant records to
// retain — active flows are guaranteed a slot before any inactive flow is
// considered. Phase 2: re-sort the retained slice by updatedAt desc only,
// so the persisted file is in plain reverse-chronological order regardless
// of activity status. Listings/UI consume the persisted order directly.
const retained = flows
.slice()
.map(cloneFlowRecord)
.sort((left, right) => {
const leftActive = isManagedFlowStatusActive(left.status)
const rightActive = isManagedFlowStatusActive(right.status)
if (leftActive !== rightActive) {
return leftActive ? -1 : 1
}
return right.updatedAt - left.updatedAt
})
.slice(0, AUTONOMY_FLOWS_MAX)
return retained.sort((left, right) => right.updatedAt - left.updatedAt)
return retainActiveFirst(
flows.map(cloneFlowRecord),
flow => isManagedFlowStatusActive(flow.status),
flow => flow.updatedAt,
AUTONOMY_FLOWS_MAX,
)
}
function defaultFlowSource(params: {

View File

@@ -4,6 +4,33 @@ import { lock } from './lockfile.js'
const persistenceLocks = new Map<string, Promise<void>>()
/**
* Two-phase persistence retention. Active records (queued/running, etc.) are
* always kept — capping them risks evicting in-flight work; that responsibility
* lives in caller-side leak detection. Inactive (terminal) records are ranked
* by `getTimestamp` desc and capped to fill the remaining budget below `max`.
*
* Returned list is sorted by `getTimestamp` desc regardless of activity, so
* the persisted file is plain reverse-chronological order — listings/UI can
* consume it directly without re-sorting.
*/
export function retainActiveFirst<T>(
records: readonly T[],
isActive: (record: T) => boolean,
getTimestamp: (record: T) => number,
max: number,
): T[] {
const sortDesc = (left: T, right: T) =>
getTimestamp(right) - getTimestamp(left)
const active = records.filter(isActive).slice().sort(sortDesc)
const history = records
.filter(record => !isActive(record))
.slice()
.sort(sortDesc)
.slice(0, Math.max(0, max - active.length))
return [...active, ...history].sort(sortDesc)
}
export function getAutonomyPersistenceLockCountForTests(): number {
if (process.env.NODE_ENV !== 'test') {
throw new Error(

View File

@@ -27,12 +27,22 @@ import {
type AutonomyFlowSyncMode,
type ManagedAutonomyFlowStepDefinition,
} from './autonomyFlows.js'
import { withAutonomyPersistenceLock } from './autonomyPersistence.js'
import {
retainActiveFirst,
withAutonomyPersistenceLock,
} from './autonomyPersistence.js'
import { getFsImplementation } from './fsOperations.js'
import { isProcessRunning } from './genericProcessUtils.js'
import { logError } from './log.js'
const AUTONOMY_RUNS_MAX = 200
// Diagnostic threshold for active (queued/running) runs. Active records are
// deliberately exempt from AUTONOMY_RUNS_MAX so a leak in finalization cannot
// silently evict in-flight work; that exemption only makes sense if a leak is
// loud when it appears. Crossing this threshold warns once per process so
// operators see the divergence in logs before runs.json grows pathologically.
const AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD = 100
let warnedActiveRunsThresholdCrossed = false
const AUTONOMY_RUNS_RELATIVE_PATH = join(AUTONOMY_DIR, 'runs.json')
// Sentinel string surfaced to operators via runs.json error fields and
// referenced literally by the HEARTBEAT.md `stale-recovery-health` task.
@@ -130,17 +140,24 @@ function isAutonomyRunActive(run: AutonomyRunRecord): boolean {
function selectPersistedAutonomyRuns(
runs: AutonomyRunRecord[],
): AutonomyRunRecord[] {
const cloned = runs.slice().map(cloneRunRecord)
const active = cloned
.filter(isAutonomyRunActive)
.sort((left, right) => right.createdAt - left.createdAt)
const history = cloned
.filter(run => !isAutonomyRunActive(run))
.sort((left, right) => right.createdAt - left.createdAt)
.slice(0, Math.max(0, AUTONOMY_RUNS_MAX - active.length))
return [...active, ...history].sort(
(left, right) => right.createdAt - left.createdAt,
const cloned = runs.map(cloneRunRecord)
const activeCount = cloned.filter(isAutonomyRunActive).length
if (
!warnedActiveRunsThresholdCrossed &&
activeCount >= AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD
) {
warnedActiveRunsThresholdCrossed = true
logError(
new Error(
`autonomy: ${activeCount} active runs exceed warn threshold ${AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD}; check for finalize leaks`,
),
)
}
return retainActiveFirst(
cloned,
isAutonomyRunActive,
run => run.createdAt,
AUTONOMY_RUNS_MAX,
)
}