Skip to content

Commit bd43b15

Browse files
committed
feat(compare): model comparison with planning rate fix
5-section compare view: Performance (one-shot, retry, self-correction), Efficiency (cost/call, cost/edit, output/call, cache hit), Category Head-to-Head bar charts, Working Style, and Context. Planning rate now detects TaskCreate/TaskUpdate/TodoWrite instead of only EnterPlanMode (which was never used, showing 0% for all models). Validated against raw JSONL with zero false positives. Responsive side-by-side layout at 90+ cols. Self-correction scanner with compact file skipping and model+timestamp dedup. 274 tests.
1 parent fb24eea commit bd43b15

4 files changed

Lines changed: 533 additions & 70 deletions

File tree

.gitignore

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,3 +36,6 @@ npm-debug.log*
3636

3737
# Local Discord brand / promo assets not yet ready to publish
3838
assets/discord-*.png
39+
40+
# Desktop app experiments
41+
desktop/

src/compare-stats.ts

Lines changed: 155 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@ import { join } from 'path'
33

44
import type { ProjectSummary } from './types.js'
55

6+
// Tool names that count as planning activity. computeWorkingStyle marks a turn
// as a planning turn when any of its assistant calls used one of these tools.
const PLANNING_TOOLS = new Set(['TaskCreate', 'TaskUpdate', 'TodoWrite', 'EnterPlanMode', 'ExitPlanMode'])
7+
68
export type ModelStats = {
79
model: string
810
calls: number
@@ -16,6 +18,7 @@ export type ModelStats = {
1618
oneShotTurns: number
1719
retries: number
1820
selfCorrections: number
21+
editCost: number
1922
firstSeen: string
2023
lastSeen: string
2124
}
@@ -26,7 +29,7 @@ export function aggregateModelStats(projects: ProjectSummary[]): ModelStats[] {
2629
const ensure = (model: string): ModelStats => {
2730
let s = byModel.get(model)
2831
if (!s) {
29-
s = { model, calls: 0, cost: 0, outputTokens: 0, inputTokens: 0, cacheReadTokens: 0, cacheWriteTokens: 0, totalTurns: 0, editTurns: 0, oneShotTurns: 0, retries: 0, selfCorrections: 0, firstSeen: '', lastSeen: '' }
32+
s = { model, calls: 0, cost: 0, outputTokens: 0, inputTokens: 0, cacheReadTokens: 0, cacheWriteTokens: 0, totalTurns: 0, editTurns: 0, oneShotTurns: 0, retries: 0, selfCorrections: 0, editCost: 0, firstSeen: '', lastSeen: '' }
3033
byModel.set(model, s)
3134
}
3235
return s
@@ -41,8 +44,13 @@ export function aggregateModelStats(projects: ProjectSummary[]): ModelStats[] {
4144

4245
const ms = ensure(primaryModel)
4346
ms.totalTurns++
44-
if (turn.hasEdits) ms.editTurns++
45-
if (turn.hasEdits && turn.retries === 0) ms.oneShotTurns++
47+
if (turn.hasEdits) {
48+
ms.editTurns++
49+
if (turn.retries === 0) ms.oneShotTurns++
50+
for (const c of turn.assistantCalls) {
51+
if (c.model !== '<synthetic>') ms.editCost += c.costUSD
52+
}
53+
}
4654
ms.retries += turn.retries
4755

4856
for (const call of turn.assistantCalls) {
@@ -66,14 +74,34 @@ export function aggregateModelStats(projects: ProjectSummary[]): ModelStats[] {
6674
}
6775

6876
/**
 * One metric row of a two-model comparison.
 *
 * - `section`: heading the metric is grouped under (the METRICS table uses
 *   values such as 'Performance' and 'Efficiency').
 * - `label`: display name of the metric.
 * - `valueA` / `valueB`: the metric computed for each model, or `null` when
 *   the metric's compute function yields no value for that model.
 * - `formatFn`: names the formatting style for the two values.
 * - `winner`: which side is ahead. NOTE(review): decided by pickWinner, whose
 *   body is not visible here; confirm the exact 'tie' / 'none' semantics.
 */
export type ComparisonRow = {
77+
section: string
6978
label: string
7079
valueA: number | null
7180
valueB: number | null
7281
formatFn: 'cost' | 'number' | 'percent' | 'decimal'
7382
winner: 'a' | 'b' | 'tie' | 'none'
7483
}
7584

85+
/**
 * Per-category head-to-head row produced by computeCategoryComparison.
 *
 * - `category`: the `turn.category` value the row aggregates.
 * - `turnsA` / `turnsB`: number of turns in this category whose first
 *   assistant call used model A / model B (0 when the model has none).
 * - `editTurnsA` / `editTurnsB`: how many of those turns had edits.
 * - `oneShotRateA` / `oneShotRateB`: percentage (0-100) of edit turns that
 *   needed zero retries; `null` when the model has no edit turns here.
 * - `winner`: result of pickWinner on the two one-shot rates with
 *   higher-is-better.
 */
export type CategoryComparison = {
86+
category: string
87+
turnsA: number
88+
editTurnsA: number
89+
oneShotRateA: number | null
90+
turnsB: number
91+
editTurnsB: number
92+
oneShotRateB: number | null
93+
winner: 'a' | 'b' | 'tie' | 'none'
94+
}
95+
96+
/**
 * One row of the "Working Style" comparison produced by computeWorkingStyle.
 *
 * - `label`: display name of the style metric.
 * - `valueA` / `valueB`: the metric for each model, or `null` when that model
 *   has no counted turns.
 * - `formatFn`: formatting style, sharing ComparisonRow's set of format kinds.
 *
 * Unlike ComparisonRow there is no `winner` field on these rows.
 */
export type WorkingStyleRow = {
97+
label: string
98+
valueA: number | null
99+
valueB: number | null
100+
formatFn: ComparisonRow['formatFn']
101+
}
102+
76103
type MetricDef = {
104+
section: string
77105
label: string
78106
formatFn: ComparisonRow['formatFn']
79107
higherIsBetter: boolean
@@ -82,18 +110,49 @@ type MetricDef = {
82110

83111
const METRICS: MetricDef[] = [
84112
{
113+
section: 'Performance',
114+
label: 'One-shot rate',
115+
formatFn: 'percent',
116+
higherIsBetter: true,
117+
compute: s => s.editTurns > 0 ? (s.oneShotTurns / s.editTurns) * 100 : null,
118+
},
119+
{
120+
section: 'Performance',
121+
label: 'Retry rate',
122+
formatFn: 'decimal',
123+
higherIsBetter: false,
124+
compute: s => s.editTurns > 0 ? s.retries / s.editTurns : null,
125+
},
126+
{
127+
section: 'Performance',
128+
label: 'Self-correction',
129+
formatFn: 'percent',
130+
higherIsBetter: false,
131+
compute: s => s.totalTurns > 0 ? (s.selfCorrections / s.totalTurns) * 100 : null,
132+
},
133+
{
134+
section: 'Efficiency',
85135
label: 'Cost / call',
86136
formatFn: 'cost',
87137
higherIsBetter: false,
88138
compute: s => s.calls > 0 ? s.cost / s.calls : null,
89139
},
90140
{
141+
section: 'Efficiency',
142+
label: 'Cost / edit',
143+
formatFn: 'cost',
144+
higherIsBetter: false,
145+
compute: s => s.editTurns > 0 ? s.editCost / s.editTurns : null,
146+
},
147+
{
148+
section: 'Efficiency',
91149
label: 'Output tok / call',
92150
formatFn: 'number',
93151
higherIsBetter: false,
94152
compute: s => s.calls > 0 ? Math.round(s.outputTokens / s.calls) : null,
95153
},
96154
{
155+
section: 'Efficiency',
97156
label: 'Cache hit rate',
98157
formatFn: 'percent',
99158
higherIsBetter: true,
@@ -102,24 +161,6 @@ const METRICS: MetricDef[] = [
102161
return total > 0 ? (s.cacheReadTokens / total) * 100 : null
103162
},
104163
},
105-
{
106-
label: 'One-shot rate',
107-
formatFn: 'percent',
108-
higherIsBetter: true,
109-
compute: s => s.editTurns > 0 ? (s.oneShotTurns / s.editTurns) * 100 : null,
110-
},
111-
{
112-
label: 'Retry rate',
113-
formatFn: 'decimal',
114-
higherIsBetter: false,
115-
compute: s => s.editTurns > 0 ? s.retries / s.editTurns : null,
116-
},
117-
{
118-
label: 'Self-correction',
119-
formatFn: 'percent',
120-
higherIsBetter: false,
121-
compute: s => s.totalTurns > 0 ? (s.selfCorrections / s.totalTurns) * 100 : null,
122-
},
123164
]
124165

125166
function pickWinner(valueA: number | null, valueB: number | null, higherIsBetter: boolean): ComparisonRow['winner'] {
@@ -134,6 +175,7 @@ export function computeComparison(a: ModelStats, b: ModelStats): ComparisonRow[]
134175
const valueA = m.compute(a)
135176
const valueB = m.compute(b)
136177
return {
178+
section: m.section,
137179
label: m.label,
138180
valueA,
139181
valueB,
@@ -143,6 +185,98 @@ export function computeComparison(a: ModelStats, b: ModelStats): ComparisonRow[]
143185
})
144186
}
145187

188+
/**
 * Compares two models' one-shot edit rates category by category.
 *
 * A turn is attributed to a model by the model of its first assistant call;
 * turns with no assistant calls, or whose first call used neither model, are
 * ignored. When `modelA` and `modelB` are the same string, every matching turn
 * is counted on the A side.
 *
 * @param projects Project summaries whose sessions and turns are scanned.
 * @param modelA Model name counted on the "A" side.
 * @param modelB Model name counted on the "B" side.
 * @returns One row per category in which at least one of the models has an
 *   edit turn, ordered by combined turn count (A + B), largest first.
 */
export function computeCategoryComparison(projects: ProjectSummary[], modelA: string, modelB: string): CategoryComparison[] {
  type Tally = { turns: number; editTurns: number; oneShotTurns: number }
  const talliesA = new Map<string, Tally>()
  const talliesB = new Map<string, Tally>()

  // Fetch the tally for a category, creating a zeroed one on first sight.
  const tallyFor = (tallies: Map<string, Tally>, category: string): Tally => {
    const existing = tallies.get(category)
    if (existing) return existing
    const fresh: Tally = { turns: 0, editTurns: 0, oneShotTurns: 0 }
    tallies.set(category, fresh)
    return fresh
  }

  // Percentage of edit turns that needed no retries; null without edit turns.
  const oneShotRate = (tally: Tally | undefined): number | null => {
    if (!tally || !(tally.editTurns > 0)) return null
    return (tally.oneShotTurns / tally.editTurns) * 100
  }

  for (const { sessions } of projects) {
    for (const { turns } of sessions) {
      for (const turn of turns) {
        const calls = turn.assistantCalls
        if (calls.length === 0) continue

        const primaryModel = calls[0]!.model
        const belongsToA = primaryModel === modelA
        if (!belongsToA && primaryModel !== modelB) continue

        const tally = tallyFor(belongsToA ? talliesA : talliesB, turn.category)
        tally.turns += 1
        if (!turn.hasEdits) continue
        tally.editTurns += 1
        if (turn.retries === 0) tally.oneShotTurns += 1
      }
    }
  }

  // Union of categories: A's categories first, then any seen only for B.
  const categories = new Set<string>(talliesA.keys())
  for (const category of talliesB.keys()) categories.add(category)

  const rows: CategoryComparison[] = []
  for (const category of categories) {
    const tallyA = talliesA.get(category)
    const tallyB = talliesB.get(category)
    const rateA = oneShotRate(tallyA)
    const rateB = oneShotRate(tallyB)

    // Both rates are null exactly when neither model has an edit turn here.
    if (rateA === null && rateB === null) continue

    rows.push({
      category,
      turnsA: tallyA?.turns ?? 0,
      editTurnsA: tallyA?.editTurns ?? 0,
      oneShotRateA: rateA,
      turnsB: tallyB?.turns ?? 0,
      editTurnsB: tallyB?.editTurns ?? 0,
      oneShotRateB: rateB,
      winner: pickWinner(rateA, rateB, true),
    })
  }

  // Busiest categories (by combined turn count) come first.
  rows.sort((x, y) => (y.turnsA + y.turnsB) - (x.turnsA + x.turnsB))
  return rows
}
241+
242+
/**
 * Summarises how two models work: delegation, planning, tool use and fast mode.
 *
 * A turn is attributed to a model by the model of its first assistant call;
 * turns with no assistant calls, or whose first call used neither model, are
 * ignored. When `modelA` and `modelB` are the same string, every matching turn
 * is counted on the A side.
 *
 * Every metric is divided by the model's turn count. Planning is counted at
 * most once per turn, whereas agent spawns and fast-mode calls are counted
 * once per assistant call, so "Delegation rate" and "Fast mode usage" can
 * exceed 100 for turns with several such calls.
 *
 * @param projects Project summaries whose sessions and turns are scanned.
 * @param modelA Model name counted on the "A" side.
 * @param modelB Model name counted on the "B" side.
 * @returns Four rows (delegation, planning, tools per turn, fast mode); a
 *   value is `null` when that model has no counted turns.
 */
export function computeWorkingStyle(projects: ProjectSummary[], modelA: string, modelB: string): WorkingStyleRow[] {
  type StyleTally = { totalTurns: number; agentSpawns: number; planModeUses: number; totalToolCalls: number; fastModeCalls: number }
  const emptyTally = (): StyleTally => ({ totalTurns: 0, agentSpawns: 0, planModeUses: 0, totalToolCalls: 0, fastModeCalls: 0 })
  const tallyA = emptyTally()
  const tallyB = emptyTally()

  for (const { sessions } of projects) {
    for (const { turns } of sessions) {
      for (const turn of turns) {
        const calls = turn.assistantCalls
        if (calls.length === 0) continue

        const primaryModel = calls[0]!.model
        let tally: StyleTally
        if (primaryModel === modelA) tally = tallyA
        else if (primaryModel === modelB) tally = tallyB
        else continue

        tally.totalTurns += 1

        // A turn counts as planning once, however many calls planned.
        let planned = false
        for (const call of calls) {
          if (!planned && (call.hasPlanMode || call.tools.some(tool => PLANNING_TOOLS.has(tool)))) {
            planned = true
          }
          tally.totalToolCalls += call.tools.length
          if (call.hasAgentSpawn) tally.agentSpawns += 1
          if (call.speed === 'fast') tally.fastModeCalls += 1
        }
        if (planned) tally.planModeUses += 1
      }
    }
  }

  // Both projections yield null when the model has no counted turns.
  const asPercent = (count: number, turnCount: number): number | null =>
    turnCount > 0 ? (count / turnCount) * 100 : null
  const asAverage = (count: number, turnCount: number): number | null =>
    turnCount > 0 ? count / turnCount : null

  // Build one row by applying the same per-turn projection to both tallies.
  const row = (
    label: string,
    field: keyof StyleTally,
    project: (count: number, turnCount: number) => number | null,
    formatFn: WorkingStyleRow['formatFn'],
  ): WorkingStyleRow => ({
    label,
    valueA: project(tallyA[field], tallyA.totalTurns),
    valueB: project(tallyB[field], tallyB.totalTurns),
    formatFn,
  })

  return [
    row('Delegation rate', 'agentSpawns', asPercent, 'percent'),
    row('Planning rate', 'planModeUses', asPercent, 'percent'),
    row('Avg tools / turn', 'totalToolCalls', asAverage, 'decimal'),
    row('Fast mode usage', 'fastModeCalls', asPercent, 'percent'),
  ]
}
279+
146280
const SELF_CORRECTION_PATTERNS = [
147281
/\bmy mistake\b/i,
148282
/\bmy bad\b/i,

0 commit comments

Comments
 (0)