@@ -494,12 +494,11 @@ private static void ScoreWaitStats(PlanStatement stmt)
494494 var isParallel = stmt . DegreeOfParallelism > 1 && stmt . RootNode != null ;
495495
496496 // Collect all operators with per-thread stats for parallel benefit calculation
497- List < OperatorWaitProfile > ? operatorProfiles = null ;
498- if ( isParallel )
499- {
500- operatorProfiles = new List < OperatorWaitProfile > ( ) ;
501- CollectOperatorWaitProfiles ( stmt . RootNode ! , operatorProfiles ) ;
502- }
497+ // Collect operator profiles even for serial plans — the external-wait formula
498+ // uses sum-of-max-thread-cpu across operators and works for both.
499+ var operatorProfiles = new List < OperatorWaitProfile > ( ) ;
500+ if ( stmt . RootNode != null )
501+ CollectOperatorWaitProfiles ( stmt . RootNode , operatorProfiles ) ;
503502
504503 foreach ( var wait in stmt . WaitStats )
505504 {
@@ -508,7 +507,15 @@ private static void ScoreWaitStats(PlanStatement stmt)
508507 var category = ClassifyWaitType ( wait . WaitType ) ;
509508 double benefitPct ;
510509
511- if ( category == "Parallelism" && isParallel )
510+ if ( IsExternalWait ( wait . WaitType ) && operatorProfiles . Count > 0 )
511+ {
512+ // External / preemptive waits (MEMORY_ALLOCATION_*, PREEMPTIVE_*): the worker
513+ // is CPU-busy in kernel, so operator elapsed ≈ operator cpu and the wait
514+ // barely shows in the per-thread (elapsed - cpu) calculation. Joe's formula:
515+ // benefit = (wait_ms / total_cpu_ms) * Σ max_thread_cpu_per_operator / elapsed
516+ benefitPct = CalculateExternalWaitBenefit ( wait , operatorProfiles , stmt . QueryTimeStats ! . CpuTimeMs , elapsedMs ) ;
517+ }
518+ else if ( category == "Parallelism" && isParallel )
512519 {
513520 // CXPACKET/CXCONSUMER/CXSYNC: benefit is the parallelism efficiency gap,
514521 // not the raw wait time. Threads waiting for other threads is a symptom
@@ -525,7 +532,7 @@ private static void ScoreWaitStats(PlanStatement stmt)
525532 benefitPct = ( double ) wait . WaitTimeMs / elapsedMs * 100 ;
526533 }
527534 }
528- else if ( ! isParallel || operatorProfiles == null || operatorProfiles . Count == 0 )
535+ else if ( ! isParallel || operatorProfiles . Count == 0 )
529536 {
530537 // Serial plan or no operator data: simple ratio
531538 benefitPct = ( double ) wait . WaitTimeMs / elapsedMs * 100 ;
@@ -585,6 +592,48 @@ private static double CalculateParallelWaitBenefit(
585592 return benefitMs / stmtElapsedMs * 100 ;
586593 }
587594
/// <summary>
/// Estimates the benefit of removing an external / preemptive wait
/// (MEMORY_ALLOCATION_*, PREEMPTIVE_*) as a percentage of statement elapsed time.
/// For these waits the worker stays CPU-busy in kernel, so elapsed ≈ cpu per thread
/// and the usual (elapsed - cpu) wait accounting does not see them. Joe's formula
/// attributes the wait by its share of total CPU instead:
///   wait_cpu_share = wait_ms / total_cpu_ms
///   sum_max_cpu    = Σ MaxThreadCpuMs over all operator profiles
///   benefit_ms     = wait_cpu_share * sum_max_cpu
///   result         = benefit_ms / elapsed_ms * 100
/// </summary>
/// <param name="wait">Wait statistic whose <c>WaitTimeMs</c> is being scored.</param>
/// <param name="profiles">Operator profiles supplying <c>MaxThreadCpuMs</c> per operator.</param>
/// <param name="stmtCpuMs">Total statement CPU time in milliseconds.</param>
/// <param name="stmtElapsedMs">Statement elapsed time in milliseconds.</param>
/// <returns>
/// Benefit as a percentage of statement elapsed time. When the statement totals or the
/// summed per-operator CPU are not positive, the plain wait / elapsed ratio is returned.
/// </returns>
private static double CalculateExternalWaitBenefit(
    WaitStatInfo wait, List<OperatorWaitProfile> profiles,
    long stmtCpuMs, long stmtElapsedMs)
{
    // Statement totals unusable: fall back to the raw ratio, clamping the
    // denominator to at least 1 so the division stays defined.
    if (stmtCpuMs <= 0 || stmtElapsedMs <= 0)
    {
        return (double)wait.WaitTimeMs / Math.Max(1, stmtElapsedMs) * 100;
    }

    // Sum each operator's max per-thread CPU.
    long criticalPathCpuMs = 0;
    foreach (var profile in profiles)
    {
        criticalPathCpuMs += profile.MaxThreadCpuMs;
    }

    // No per-operator CPU recorded: the share model has nothing to scale, use the raw ratio.
    if (criticalPathCpuMs <= 0)
    {
        return (double)wait.WaitTimeMs / stmtElapsedMs * 100;
    }

    var cpuShareOfWait = (double)wait.WaitTimeMs / stmtCpuMs;
    return cpuShareOfWait * criticalPathCpuMs / stmtElapsedMs * 100;
}
623+
/// <summary>
/// Reports whether a wait type is an external / preemptive wait, where the worker is
/// CPU-busy in kernel rather than descheduled. Their wait time counts toward the
/// query's CPU time, so the usual (elapsed - cpu) per-thread wait math misses them.
/// </summary>
/// <param name="waitType">Wait type name; may be null or empty.</param>
/// <returns>
/// <c>true</c> when the name contains "MEMORY_ALLOCATION" or starts with "PREEMPTIVE_";
/// otherwise <c>false</c>, including for null or empty input.
/// </returns>
public static bool IsExternalWait(string waitType)
{
    if (string.IsNullOrEmpty(waitType))
    {
        return false;
    }

    // Wait type names are machine identifiers, so compare ordinally and ignore case.
    // The culture-sensitive StartsWith(string) overload depends on the current culture
    // and can skip ignorable characters; it also forced an upper-cased copy of the input.
    return waitType.Contains("MEMORY_ALLOCATION", StringComparison.OrdinalIgnoreCase)
        || waitType.StartsWith("PREEMPTIVE_", StringComparison.OrdinalIgnoreCase);
}
636+
588637 /// <summary>
589638 /// Determines if an operator is relevant for a given wait category.
590639 /// </summary>
@@ -624,13 +673,18 @@ private static void CollectOperatorWaitProfiles(PlanNode node, List<OperatorWait
624673 maxThreadWait = threadWait ;
625674 }
626675
627- if ( totalWait > 0 || maxThreadWait > 0 )
676+ // Max per-thread SELF CPU (non-cumulative) — critical-path CPU contribution
677+ // from this operator. Used by the external-wait formula.
678+ var maxThreadCpu = PlanAnalyzer . GetOperatorMaxThreadOwnCpuMs ( node ) ;
679+
680+ if ( totalWait > 0 || maxThreadWait > 0 || maxThreadCpu > 0 )
628681 {
629682 profiles . Add ( new OperatorWaitProfile
630683 {
631684 Node = node ,
632685 MaxThreadWaitMs = maxThreadWait ,
633686 TotalWaitMs = totalWait ,
687+ MaxThreadCpuMs = maxThreadCpu ,
634688 HasPhysicalReads = node . ActualPhysicalReads > 0 ,
635689 HasCpuWork = node . ActualCPUMs > 0 ,
636690 IsExchange = node . PhysicalOp == "Parallelism" ,
@@ -681,6 +735,8 @@ private sealed class OperatorWaitProfile
681735 public PlanNode Node { get ; init ; } = null ! ;
682736 public long MaxThreadWaitMs { get ; init ; }
683737 public long TotalWaitMs { get ; init ; }
738+ /// <summary>Max CPU time among this operator's threads (critical-path CPU for external-wait formula).</summary>
739+ public long MaxThreadCpuMs { get ; init ; }
684740 public bool HasPhysicalReads { get ; init ; }
685741 public bool HasCpuWork { get ; init ; }
686742 public bool IsExchange { get ; init ; }
0 commit comments