@@ -3,6 +3,8 @@ import { join } from 'path'
33
44import type { ProjectSummary } from './types.js'
55
// Tool names that count as planning activity: computeWorkingStyle marks a turn as a
// planning turn when any of its assistant calls used one of these tools.
6+ const PLANNING_TOOLS = new Set ( [ 'TaskCreate' , 'TaskUpdate' , 'TodoWrite' , 'EnterPlanMode' , 'ExitPlanMode' ] )
7+
68export type ModelStats = {
79 model : string
810 calls : number
@@ -16,6 +18,7 @@ export type ModelStats = {
1618 oneShotTurns : number
1719 retries : number
1820 selfCorrections : number
21+ editCost : number
1922 firstSeen : string
2023 lastSeen : string
2124}
@@ -26,7 +29,7 @@ export function aggregateModelStats(projects: ProjectSummary[]): ModelStats[] {
2629 const ensure = ( model : string ) : ModelStats => {
2730 let s = byModel . get ( model )
2831 if ( ! s ) {
29- s = { model, calls : 0 , cost : 0 , outputTokens : 0 , inputTokens : 0 , cacheReadTokens : 0 , cacheWriteTokens : 0 , totalTurns : 0 , editTurns : 0 , oneShotTurns : 0 , retries : 0 , selfCorrections : 0 , firstSeen : '' , lastSeen : '' }
32+ s = { model, calls : 0 , cost : 0 , outputTokens : 0 , inputTokens : 0 , cacheReadTokens : 0 , cacheWriteTokens : 0 , totalTurns : 0 , editTurns : 0 , oneShotTurns : 0 , retries : 0 , selfCorrections : 0 , editCost : 0 , firstSeen : '' , lastSeen : '' }
3033 byModel . set ( model , s )
3134 }
3235 return s
@@ -41,8 +44,13 @@ export function aggregateModelStats(projects: ProjectSummary[]): ModelStats[] {
4144
4245 const ms = ensure ( primaryModel )
4346 ms . totalTurns ++
44- if ( turn . hasEdits ) ms . editTurns ++
45- if ( turn . hasEdits && turn . retries === 0 ) ms . oneShotTurns ++
47+ if ( turn . hasEdits ) {
48+ ms . editTurns ++
49+ if ( turn . retries === 0 ) ms . oneShotTurns ++
50+ for ( const c of turn . assistantCalls ) {
51+ if ( c . model !== '<synthetic>' ) ms . editCost += c . costUSD
52+ }
53+ }
4654 ms . retries += turn . retries
4755
4856 for ( const call of turn . assistantCalls ) {
@@ -66,14 +74,34 @@ export function aggregateModelStats(projects: ProjectSummary[]): ModelStats[] {
6674}
6775
/**
 * One metric compared across two models, as returned by computeComparison.
 * A value is null when the metric could not be computed for that model
 * (the metric definitions return null when their denominator is zero).
 */
6876export type ComparisonRow = {
// Group heading the metric belongs to (the metric definitions use 'Performance' and 'Efficiency').
77+ section : string
6978 label : string
7079 valueA : number | null
7180 valueB : number | null
// Selects how the values should be rendered — presumably consumed by the display layer; confirm against the renderer.
7281 formatFn : 'cost' | 'number' | 'percent' | 'decimal'
7382 winner : 'a' | 'b' | 'tie' | 'none'
7483}
7584
/**
 * Per-category one-shot comparison between two models, as built by
 * computeCategoryComparison. The "A"/"B" fields describe turns whose first
 * assistant call used model A / model B respectively.
 */
85+ export type CategoryComparison = {
86+ category : string
// All turns attributed to model A in this category.
87+ turnsA : number
// Subset of those turns that contained edits.
88+ editTurnsA : number
// Percentage (0-100) of edit turns with zero retries; null when there were no edit turns.
89+ oneShotRateA : number | null
// Same three measurements for model B.
90+ turnsB : number
91+ editTurnsB : number
92+ oneShotRateB : number | null
// Result of comparing the two one-shot rates, where a higher rate is better.
93+ winner : 'a' | 'b' | 'tie' | 'none'
94+ }
95+
/**
 * One working-style measurement for two models, as returned by
 * computeWorkingStyle. Unlike ComparisonRow it carries no winner field.
 */
96+ export type WorkingStyleRow = {
97+ label : string
// null when the corresponding model had no attributed turns (the denominator would be zero).
98+ valueA : number | null
99+ valueB : number | null
// Reuses ComparisonRow's format selector so both row kinds share the same set of formats.
100+ formatFn : ComparisonRow [ 'formatFn' ]
101+ }
102+
76103type MetricDef = {
104+ section : string
77105 label : string
78106 formatFn : ComparisonRow [ 'formatFn' ]
79107 higherIsBetter : boolean
@@ -82,18 +110,49 @@ type MetricDef = {
82110
83111const METRICS : MetricDef [ ] = [
84112 {
113+ section : 'Performance' ,
114+ label : 'One-shot rate' ,
115+ formatFn : 'percent' ,
116+ higherIsBetter : true ,
117+ compute : s => s . editTurns > 0 ? ( s . oneShotTurns / s . editTurns ) * 100 : null ,
118+ } ,
119+ {
120+ section : 'Performance' ,
121+ label : 'Retry rate' ,
122+ formatFn : 'decimal' ,
123+ higherIsBetter : false ,
124+ compute : s => s . editTurns > 0 ? s . retries / s . editTurns : null ,
125+ } ,
126+ {
127+ section : 'Performance' ,
128+ label : 'Self-correction' ,
129+ formatFn : 'percent' ,
130+ higherIsBetter : false ,
131+ compute : s => s . totalTurns > 0 ? ( s . selfCorrections / s . totalTurns ) * 100 : null ,
132+ } ,
133+ {
134+ section : 'Efficiency' ,
85135 label : 'Cost / call' ,
86136 formatFn : 'cost' ,
87137 higherIsBetter : false ,
88138 compute : s => s . calls > 0 ? s . cost / s . calls : null ,
89139 } ,
90140 {
141+ section : 'Efficiency' ,
142+ label : 'Cost / edit' ,
143+ formatFn : 'cost' ,
144+ higherIsBetter : false ,
145+ compute : s => s . editTurns > 0 ? s . editCost / s . editTurns : null ,
146+ } ,
147+ {
148+ section : 'Efficiency' ,
91149 label : 'Output tok / call' ,
92150 formatFn : 'number' ,
93151 higherIsBetter : false ,
94152 compute : s => s . calls > 0 ? Math . round ( s . outputTokens / s . calls ) : null ,
95153 } ,
96154 {
155+ section : 'Efficiency' ,
97156 label : 'Cache hit rate' ,
98157 formatFn : 'percent' ,
99158 higherIsBetter : true ,
@@ -102,24 +161,6 @@ const METRICS: MetricDef[] = [
102161 return total > 0 ? ( s . cacheReadTokens / total ) * 100 : null
103162 } ,
104163 } ,
105- {
106- label : 'One-shot rate' ,
107- formatFn : 'percent' ,
108- higherIsBetter : true ,
109- compute : s => s . editTurns > 0 ? ( s . oneShotTurns / s . editTurns ) * 100 : null ,
110- } ,
111- {
112- label : 'Retry rate' ,
113- formatFn : 'decimal' ,
114- higherIsBetter : false ,
115- compute : s => s . editTurns > 0 ? s . retries / s . editTurns : null ,
116- } ,
117- {
118- label : 'Self-correction' ,
119- formatFn : 'percent' ,
120- higherIsBetter : false ,
121- compute : s => s . totalTurns > 0 ? ( s . selfCorrections / s . totalTurns ) * 100 : null ,
122- } ,
123164]
124165
125166function pickWinner ( valueA : number | null , valueB : number | null , higherIsBetter : boolean ) : ComparisonRow [ 'winner' ] {
@@ -134,6 +175,7 @@ export function computeComparison(a: ModelStats, b: ModelStats): ComparisonRow[]
134175 const valueA = m . compute ( a )
135176 const valueB = m . compute ( b )
136177 return {
178+ section : m . section ,
137179 label : m . label ,
138180 valueA,
139181 valueB,
@@ -143,6 +185,98 @@ export function computeComparison(a: ModelStats, b: ModelStats): ComparisonRow[]
143185 } )
144186}
145187
/**
 * Compares the one-shot rate of two models category by category.
 *
 * A turn is attributed to a model when its first assistant call used that
 * model; turns without assistant calls, or led by any other model, are
 * ignored. Categories in which neither model produced an edit turn are
 * omitted from the result.
 *
 * @param projects - Project summaries whose sessions and turns are scanned.
 * @param modelA - Model name reported in the "A" fields.
 * @param modelB - Model name reported in the "B" fields.
 * @returns One row per category, ordered by combined turn count, busiest first.
 */
export function computeCategoryComparison(projects: ProjectSummary[], modelA: string, modelB: string): CategoryComparison[] {
  type Tally = { turns: number; editTurns: number; oneShotTurns: number }
  const talliesA = new Map<string, Tally>()
  const talliesB = new Map<string, Tally>()

  // Fetch the tally for a category, creating a zeroed one on first sight.
  const tallyFor = (tallies: Map<string, Tally>, category: string): Tally => {
    const existing = tallies.get(category)
    if (existing) return existing
    const fresh: Tally = { turns: 0, editTurns: 0, oneShotTurns: 0 }
    tallies.set(category, fresh)
    return fresh
  }

  // One-shot percentage, or null when there is nothing to divide by.
  const oneShotRate = (tally: Tally | undefined): number | null => {
    if (!tally || tally.editTurns <= 0) return null
    return (tally.oneShotTurns / tally.editTurns) * 100
  }

  for (const project of projects) {
    for (const session of project.sessions) {
      for (const turn of session.turns) {
        if (turn.assistantCalls.length === 0) continue
        const leadModel = turn.assistantCalls[0]!.model

        // Model A takes precedence if both names happen to be equal.
        let target: Map<string, Tally>
        if (leadModel === modelA) target = talliesA
        else if (leadModel === modelB) target = talliesB
        else continue

        const tally = tallyFor(target, turn.category)
        tally.turns += 1
        if (!turn.hasEdits) continue
        tally.editTurns += 1
        if (turn.retries === 0) tally.oneShotTurns += 1
      }
    }
  }

  // Model A's categories first, then any that only model B touched.
  const categories = new Set<string>(talliesA.keys())
  for (const category of talliesB.keys()) categories.add(category)

  const rows: CategoryComparison[] = []
  categories.forEach(category => {
    const tallyA = talliesA.get(category)
    const tallyB = talliesB.get(category)
    const oneShotRateA = oneShotRate(tallyA)
    const oneShotRateB = oneShotRate(tallyB)
    // Both rates are null exactly when neither model has an edit turn here.
    if (oneShotRateA === null && oneShotRateB === null) return

    rows.push({
      category,
      turnsA: tallyA ? tallyA.turns : 0,
      editTurnsA: tallyA ? tallyA.editTurns : 0,
      oneShotRateA,
      turnsB: tallyB ? tallyB.turns : 0,
      editTurnsB: tallyB ? tallyB.editTurns : 0,
      oneShotRateB,
      winner: pickWinner(oneShotRateA, oneShotRateB, true),
    })
  })

  rows.sort((left, right) => (right.turnsA + right.turnsB) - (left.turnsA + left.turnsB))
  return rows
}
241+
/**
 * Summarises how two models tend to work: how often they delegate to
 * sub-agents, plan, use fast mode, and how many tools they call per turn.
 *
 * A turn is attributed to a model when its first assistant call used that
 * model; turns without assistant calls, or led by any other model, are
 * ignored.
 *
 * Every percentage is "share of attributed turns in which the behaviour
 * occurred at least once". Delegation and fast-mode usage are therefore
 * counted once per turn (like planning) rather than once per assistant call;
 * counting calls against a turn denominator could yield values above 100%.
 *
 * @param projects - Project summaries whose sessions and turns are scanned.
 * @param modelA - Model name reported in `valueA`.
 * @param modelB - Model name reported in `valueB`.
 * @returns Four rows (delegation, planning, tools per turn, fast mode); a
 *   value is null when the model had no attributed turns.
 */
export function computeWorkingStyle(projects: ProjectSummary[], modelA: string, modelB: string): WorkingStyleRow[] {
  type StyleAccum = { totalTurns: number; agentSpawns: number; planModeUses: number; totalToolCalls: number; fastModeCalls: number }
  const emptyAccum = (): StyleAccum => ({ totalTurns: 0, agentSpawns: 0, planModeUses: 0, totalToolCalls: 0, fastModeCalls: 0 })
  const sA = emptyAccum()
  const sB = emptyAccum()

  for (const project of projects) {
    for (const session of project.sessions) {
      for (const turn of session.turns) {
        const calls = turn.assistantCalls
        if (calls.length === 0) continue
        const primary = calls[0]!.model
        if (primary !== modelA && primary !== modelB) continue

        const s = primary === modelA ? sA : sB
        s.totalTurns++

        // Each flag below contributes at most one count per turn so that the
        // resulting percentages stay within 0-100.
        if (calls.some(c => c.hasPlanMode || c.tools.some(t => PLANNING_TOOLS.has(t)))) s.planModeUses++
        if (calls.some(c => c.hasAgentSpawn)) s.agentSpawns++
        if (calls.some(c => c.speed === 'fast')) s.fastModeCalls++

        // Tool usage is an average, not a share, so every call's tools count.
        for (const call of calls) {
          s.totalToolCalls += call.tools.length
        }
      }
    }
  }

  const pct = (num: number, den: number): number | null => den > 0 ? (num / den) * 100 : null
  const avg = (num: number, den: number): number | null => den > 0 ? num / den : null

  return [
    { label: 'Delegation rate', valueA: pct(sA.agentSpawns, sA.totalTurns), valueB: pct(sB.agentSpawns, sB.totalTurns), formatFn: 'percent' },
    { label: 'Planning rate', valueA: pct(sA.planModeUses, sA.totalTurns), valueB: pct(sB.planModeUses, sB.totalTurns), formatFn: 'percent' },
    { label: 'Avg tools / turn', valueA: avg(sA.totalToolCalls, sA.totalTurns), valueB: avg(sB.totalToolCalls, sB.totalTurns), formatFn: 'decimal' },
    { label: 'Fast mode usage', valueA: pct(sA.fastModeCalls, sA.totalTurns), valueB: pct(sB.fastModeCalls, sB.totalTurns), formatFn: 'percent' },
  ]
}
279+
146280const SELF_CORRECTION_PATTERNS = [
147281 / \b m y m i s t a k e \b / i,
148282 / \b m y b a d \b / i,
0 commit comments