refactor: 简化/复用/防御 — 清理 PR #386 审计发现

简化 (S1, S2): - src/cli/print.ts: 抽出 dispatchHeadlessCronCommand 本地 helper，把 cron 三个入口（onFire / onFireTask agent / onFireTask 非-agent）共享的「dedup-claim → input-close-recheck → onSuccess」管线集中到一处，避免三个分支在「claim 与 dispatch 之间发生 inputClosed」的处理上漂移。 enqueueAndRun 再抽出来，使两个非-agent 分支共用一个 onSuccess 回调。约 -55 行重复模板。 - src/utils/autonomyPersistence.ts: 新增 retainActiveFirst<T> 泛型 helper —— active 记录无条件保留（不参与 cap），inactive 按 timestamp desc 填满剩余预算；统一 selectPersistedAutonomyRuns / Flows 的两阶段排序语义。 - src/utils/autonomyRuns.ts、autonomyFlows.ts: 改用 retainActiveFirst，删掉重复的内联两阶段排序逻辑。复用 (R1, review #8): - tests/mocks/file-system.ts: 新增 readTempFile / tempPathExists 两个 Bun.file 包装，补齐 Node fs.readFileSync / existsSync 在测试里的 Bun-only 等价物。 - src/utils/__tests__/autonomyRuns.test.ts: 把全部 Node fs/path 导入（existsSync, readFileSync, mkdir, writeFile, path.join/resolve）替换为 tests/mocks/file-system 的共享 helper + node:path（带 node: 前缀）。不再有 6 处 mkdir + writeFile 模板，统一用 writeTempFile（自带 mkdir-p）。解决 review #8 (Major) 的 Bun-only 运行时契约违反。防御 (D1, OOM 早期信号): - src/services/compact/postCompactCleanup.ts: 在 void import().then() 末尾补 .catch(logError)。当前 attributionHooks 是 stub，但当真实现被恢复且 sweepFileContentCache 抛错时，这个 .catch 阻止它变成 unhandled rejection（函数返回值是 void，调用者无从观察异步失败）。 - src/utils/autonomyRuns.ts: 给 active runs 加 100 条软上限 + 一次性 warn。selectPersistedAutonomyRuns 仍然永不淘汰 active 记录，但跨过阈值时 logError 一次，作为 finalize-leak 早期信号——避免 active 无限增长悄悄使 AUTONOMY_RUNS_MAX 失效。
2026-06-17 22:05:50 +00:00 · 2026-04-29 13:23:41 +00:00
parent 6b7cfda9b1
commit 7a6e65caf7
7 changed files with 190 additions and 143 deletions
--- a/src/utils/tests/autonomyRuns.test.ts
+++ b/src/utils/tests/autonomyRuns.test.ts
@@ -1,7 +1,5 @@
 import { afterEach, beforeEach, describe, expect, test } from 'bun:test'
-import { existsSync, readFileSync } from 'fs'
-import { mkdir, writeFile } from 'fs/promises'
-import { join, resolve as resolvePath } from 'path'
+import { join, resolve as resolvePath } from 'node:path'
 import {
  resetStateForTests,
  setCwdState,
@@ -42,11 +40,14 @@ import {
  cleanupTempDir,
  createTempDir,
  createTempSubdir,
+  readTempFile,
+  tempPathExists,
  writeTempFile,
 } from '../../../tests/mocks/file-system'

 const AGENTS_REL = join(AUTONOMY_DIR, 'AGENTS.md')
 const HEARTBEAT_REL = join(AUTONOMY_DIR, 'HEARTBEAT.md')
+const RUNS_REL = join(AUTONOMY_DIR, 'runs.json')

 let tempDir = ''

@@ -349,9 +350,9 @@ describe('autonomyRuns', () => {
    )

    // Seed an active queued run for cron-1 so the next dedup attempt skips.
-    await mkdir(join(tempDir, AUTONOMY_DIR), { recursive: true })
-    await writeFile(
-      resolveAutonomyRunsPath(tempDir),
+    await writeTempFile(
+      tempDir,
+      RUNS_REL,
      `${JSON.stringify(
        {
          runs: [
@@ -373,7 +374,6 @@ describe('autonomyRuns', () => {
        null,
        2,
      )}\n`,
-      'utf-8',
    )

    const skipped = await createAutonomyQueuedPromptIfNoActiveSource({
@@ -400,9 +400,9 @@ describe('autonomyRuns', () => {
  })

  test('createAutonomyQueuedPromptIfNoActiveSource recovers stale active runs from dead owner processes', async () => {
-    await mkdir(join(tempDir, AUTONOMY_DIR), { recursive: true })
-    await writeFile(
-      resolveAutonomyRunsPath(tempDir),
+    await writeTempFile(
+      tempDir,
+      RUNS_REL,
      `${JSON.stringify(
        {
          runs: [
@@ -426,7 +426,6 @@ describe('autonomyRuns', () => {
        null,
        2,
      )}\n`,
-      'utf-8',
    )

    await expect(
@@ -483,7 +482,7 @@ describe('autonomyRuns', () => {
    await markAutonomyRunRunning(runId, tempDir, 100)

    const runsPath = resolveAutonomyRunsPath(tempDir)
-    const file = JSON.parse(readFileSync(runsPath, 'utf-8')) as {
+    const file = JSON.parse(await readTempFile(runsPath)) as {
      runs: Array<Record<string, unknown>>
    }
    file.runs = file.runs.map(run =>
@@ -491,7 +490,7 @@ describe('autonomyRuns', () => {
        ? { ...run, ownerProcessId: 2_147_483_647 }
        : run,
    )
-    await writeFile(runsPath, `${JSON.stringify(file, null, 2)}\n`, 'utf-8')
+    await writeTempFile(tempDir, RUNS_REL, `${JSON.stringify(file, null, 2)}\n`)

    const replacement = await createAutonomyQueuedPromptIfNoActiveSource({
      basePrompt: 'replacement prompt',
@@ -615,11 +614,10 @@ describe('autonomyRuns', () => {
        endedAt: 2_000 + index,
      })),
    ]
-    await mkdir(join(tempDir, AUTONOMY_DIR), { recursive: true })
-    await writeFile(
-      resolveAutonomyRunsPath(tempDir),
+    await writeTempFile(
+      tempDir,
+      RUNS_REL,
      `${JSON.stringify({ runs }, null, 2)}\n`,
-      'utf-8',
    )

    await createAutonomyRun({
@@ -637,10 +635,9 @@ describe('autonomyRuns', () => {
  })

  test('listAutonomyRuns keeps older persisted records by normalizing missing runtime and owner metadata', async () => {
-    const runsPath = resolveAutonomyRunsPath(tempDir)
-    await mkdir(join(tempDir, '.claude', 'autonomy'), { recursive: true })
-    await writeFile(
-      runsPath,
+    await writeTempFile(
+      tempDir,
+      RUNS_REL,
      `${JSON.stringify(
        {
          runs: [
@@ -657,7 +654,6 @@ describe('autonomyRuns', () => {
        null,
        2,
      )}\n`,
-      'utf-8',
    )

    const [legacy] = await listAutonomyRuns(tempDir)
@@ -832,7 +828,7 @@ describe('autonomyRuns', () => {
    expect(recovered!.autonomy?.flowId).toBe(flow!.flowId)
  })

-  test('STALE_ACTIVE_RUN_ERROR_PREFIX stays in sync with HEARTBEAT.md stale-recovery-health task', () => {
+  test('STALE_ACTIVE_RUN_ERROR_PREFIX stays in sync with HEARTBEAT.md stale-recovery-health task', async () => {
    // The HEARTBEAT.md stale-recovery-health task prompt embeds this prefix
    // as a literal string. Changing the constant without updating the
    // heartbeat prompt would silently break the monitor — this test fails
@@ -846,12 +842,12 @@ describe('autonomyRuns', () => {
      'autonomy',
      'HEARTBEAT.md',
    )
-    if (!existsSync(heartbeatPath)) {
+    if (!(await tempPathExists(heartbeatPath))) {
      // .claude/ may be absent in some checkout layouts (e.g., shallow clone
      // for npm pack). Skip rather than fail in that case.
      return
    }
-    const content = readFileSync(heartbeatPath, 'utf8')
+    const content = await readTempFile(heartbeatPath)
    expect(content).toContain(STALE_ACTIVE_RUN_ERROR_PREFIX)
  })
 })
--- a/src/utils/autonomyFlows.ts
+++ b/src/utils/autonomyFlows.ts
@@ -3,7 +3,10 @@ import { mkdir, writeFile } from 'fs/promises'
 import { dirname, join, resolve } from 'path'
 import { getProjectRoot } from '../bootstrap/state.js'
 import { AUTONOMY_DIR, type AutonomyTriggerKind } from './autonomyAuthority.js'
-import { withAutonomyPersistenceLock } from './autonomyPersistence.js'
+import {
+  retainActiveFirst,
+  withAutonomyPersistenceLock,
+} from './autonomyPersistence.js'
 import { getFsImplementation } from './fsOperations.js'

 const AUTONOMY_FLOWS_MAX = 100
@@ -170,26 +173,12 @@ function isManagedFlowStatusActive(status: AutonomyFlowStatus): boolean {
 function selectPersistedAutonomyFlows(
  flows: AutonomyFlowRecord[],
 ): AutonomyFlowRecord[] {
-  // Two-phase sort. Phase 1: priority sort (active flows first, then by
-  // updatedAt desc) selects the AUTONOMY_FLOWS_MAX most-relevant records to
-  // retain — active flows are guaranteed a slot before any inactive flow is
-  // considered. Phase 2: re-sort the retained slice by updatedAt desc only,
-  // so the persisted file is in plain reverse-chronological order regardless
-  // of activity status. Listings/UI consume the persisted order directly.
-  const retained = flows
-    .slice()
-    .map(cloneFlowRecord)
-    .sort((left, right) => {
-      const leftActive = isManagedFlowStatusActive(left.status)
-      const rightActive = isManagedFlowStatusActive(right.status)
-      if (leftActive !== rightActive) {
-        return leftActive ? -1 : 1
-      }
-      return right.updatedAt - left.updatedAt
-    })
-    .slice(0, AUTONOMY_FLOWS_MAX)
-
-  return retained.sort((left, right) => right.updatedAt - left.updatedAt)
+  return retainActiveFirst(
+    flows.map(cloneFlowRecord),
+    flow => isManagedFlowStatusActive(flow.status),
+    flow => flow.updatedAt,
+    AUTONOMY_FLOWS_MAX,
+  )
 }

 function defaultFlowSource(params: {
--- a/src/utils/autonomyPersistence.ts
+++ b/src/utils/autonomyPersistence.ts
@@ -4,6 +4,33 @@ import { lock } from './lockfile.js'

 const persistenceLocks = new Map<string, Promise<void>>()

+/**
+ * Two-phase persistence retention. Active records (queued/running, etc.) are
+ * always kept — capping them risks evicting in-flight work; that responsibility
+ * lives in caller-side leak detection. Inactive (terminal) records are ranked
+ * by `getTimestamp` desc and capped to fill the remaining budget below `max`.
+ *
+ * Returned list is sorted by `getTimestamp` desc regardless of activity, so
+ * the persisted file is plain reverse-chronological order — listings/UI can
+ * consume it directly without re-sorting.
+ */
+export function retainActiveFirst<T>(
+  records: readonly T[],
+  isActive: (record: T) => boolean,
+  getTimestamp: (record: T) => number,
+  max: number,
+): T[] {
+  const sortDesc = (left: T, right: T) =>
+    getTimestamp(right) - getTimestamp(left)
+  const active = records.filter(isActive).slice().sort(sortDesc)
+  const history = records
+    .filter(record => !isActive(record))
+    .slice()
+    .sort(sortDesc)
+    .slice(0, Math.max(0, max - active.length))
+  return [...active, ...history].sort(sortDesc)
+}
+
 export function getAutonomyPersistenceLockCountForTests(): number {
  if (process.env.NODE_ENV !== 'test') {
    throw new Error(
--- a/src/utils/autonomyRuns.ts
+++ b/src/utils/autonomyRuns.ts
@@ -27,12 +27,22 @@ import {
  type AutonomyFlowSyncMode,
  type ManagedAutonomyFlowStepDefinition,
 } from './autonomyFlows.js'
-import { withAutonomyPersistenceLock } from './autonomyPersistence.js'
+import {
+  retainActiveFirst,
+  withAutonomyPersistenceLock,
+} from './autonomyPersistence.js'
 import { getFsImplementation } from './fsOperations.js'
 import { isProcessRunning } from './genericProcessUtils.js'
 import { logError } from './log.js'

 const AUTONOMY_RUNS_MAX = 200
+// Diagnostic threshold for active (queued/running) runs. Active records are
+// deliberately exempt from AUTONOMY_RUNS_MAX so a leak in finalization cannot
+// silently evict in-flight work; that exemption only makes sense if a leak is
+// loud when it appears. Crossing this threshold warns once per process so
+// operators see the divergence in logs before runs.json grows pathologically.
+const AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD = 100
+let warnedActiveRunsThresholdCrossed = false
 const AUTONOMY_RUNS_RELATIVE_PATH = join(AUTONOMY_DIR, 'runs.json')
 // Sentinel string surfaced to operators via runs.json error fields and
 // referenced literally by the HEARTBEAT.md `stale-recovery-health` task.
@@ -130,17 +140,24 @@ function isAutonomyRunActive(run: AutonomyRunRecord): boolean {
 function selectPersistedAutonomyRuns(
  runs: AutonomyRunRecord[],
 ): AutonomyRunRecord[] {
-  const cloned = runs.slice().map(cloneRunRecord)
-  const active = cloned
-    .filter(isAutonomyRunActive)
-    .sort((left, right) => right.createdAt - left.createdAt)
-  const history = cloned
-    .filter(run => !isAutonomyRunActive(run))
-    .sort((left, right) => right.createdAt - left.createdAt)
-    .slice(0, Math.max(0, AUTONOMY_RUNS_MAX - active.length))
-
-  return [...active, ...history].sort(
-    (left, right) => right.createdAt - left.createdAt,
+  const cloned = runs.map(cloneRunRecord)
+  const activeCount = cloned.filter(isAutonomyRunActive).length
+  if (
+    !warnedActiveRunsThresholdCrossed &&
+    activeCount >= AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD
+  ) {
+    warnedActiveRunsThresholdCrossed = true
+    logError(
+      new Error(
+        `autonomy: ${activeCount} active runs exceed warn threshold ${AUTONOMY_ACTIVE_RUNS_WARN_THRESHOLD}; check for finalize leaks`,
+      ),
+    )
+  }
+  return retainActiveFirst(
+    cloned,
+    isAutonomyRunActive,
+    run => run.createdAt,
+    AUTONOMY_RUNS_MAX,
  )
 }