From 6b61d791d4d004664ab76bf95b3230573e1f006e Mon Sep 17 00:00:00 2001 From: Erik Darling <2136037+erikdarlingdata@users.noreply.github.com> Date: Wed, 22 Apr 2026 12:02:38 -0400 Subject: [PATCH] External/preemptive wait benefit + CPU:Elapsed external-wait adjustment (#215 C6, C7) (#259) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit C6 — external-wait benefit formula: The existing parallel-wait formula uses per-thread wait = max(0, elapsed - cpu). External / preemptive waits (MEMORY_ALLOCATION_EXT, RESERVED_MEMORY_ALLOCATION_EXT, PREEMPTIVE_*) are CPU-busy in kernel, so elapsed ≈ cpu for those threads and the wait barely shows in that math. Joe's formula: wait_cpu_share = wait_ms / total_cpu_ms sum_max_cpu = Σ max-thread-self-cpu across operators benefit_ms = wait_cpu_share * sum_max_cpu Verified against Joe's worked example on MwX36LEHJS: (26307 / 45075) * (11341 + 42) / 13748 = 48.3% ✓ Implementation: - New IsExternalWait classifier (public so mapping can see it) covering MEMORY_ALLOCATION* and PREEMPTIVE_* prefixes. - OperatorWaitProfile grows MaxThreadCpuMs — per-thread self-CPU (non-cumulative), taken from new PlanAnalyzer.GetOperatorMaxThreadOwnCpuMs which mirrors GetOperatorOwnElapsedMs. Uses self-CPU so summing across operators in a serial plan doesn't double-count cumulative CPU, and sums to approximate critical-path CPU in parallel plans. - ScoreWaitStats routes external waits to CalculateExternalWaitBenefit. - Serial plans with external waits: sum-of-self-cpu ≈ statement cpu, so the formula collapses to wait/elapsed — same as before, no regression. C7 — CPU:Elapsed numerator: External waits inflate the CPU number without representing real query CPU, so the CPU:Elapsed ratio (used in web runtime card, HTML export runtime card, and Avalonia DOP efficiency) subtracts Σ external-wait ms from CPU before dividing by elapsed. - AnalysisResult.QueryTimeResult gains ExternalWaitMs, populated in ResultMapper by summing wait stats matching IsExternalWait. - HtmlExporter.WriteRuntimeCard, Index.razor runtime card, and PlanViewerControl DOP-efficiency calc all use (cpu - externalWaitMs). Version bump 1.7.3 -> 1.7.4. Co-authored-by: Claude Opus 4.7 (1M context) --- .../Controls/PlanViewerControl.axaml.cs | 11 ++- src/PlanViewer.App/PlanViewer.App.csproj | 2 +- src/PlanViewer.Core/Output/AnalysisResult.cs | 8 ++ src/PlanViewer.Core/Output/HtmlExporter.cs | 3 +- src/PlanViewer.Core/Output/ResultMapper.cs | 11 ++- src/PlanViewer.Core/Services/BenefitScorer.cs | 74 ++++++++++++++++--- src/PlanViewer.Core/Services/PlanAnalyzer.cs | 60 +++++++++++++++ src/PlanViewer.Web/Pages/Index.razor | 3 +- 8 files changed, 157 insertions(+), 15 deletions(-) diff --git a/src/PlanViewer.App/Controls/PlanViewerControl.axaml.cs b/src/PlanViewer.App/Controls/PlanViewerControl.axaml.cs index bd45cc4..836a377 100644 --- a/src/PlanViewer.App/Controls/PlanViewerControl.axaml.cs +++ b/src/PlanViewer.App/Controls/PlanViewerControl.axaml.cs @@ -2839,8 +2839,15 @@ static string EfficiencyColor(double pct) => pct >= 80 ? "#E4E6EB" statement.QueryTimeStats.CpuTimeMs > 0 && statement.DegreeOfParallelism > 1) { - // Speedup ratio: CPU/elapsed = 1.0 means serial, = DOP means perfect parallelism - var speedup = (double)statement.QueryTimeStats.CpuTimeMs / statement.QueryTimeStats.ElapsedTimeMs; + // Speedup ratio: CPU/elapsed = 1.0 means serial, = DOP means perfect parallelism. + // Subtract external/preemptive wait time from CPU — those waits are CPU-busy + // in kernel and inflate the ratio without representing real query work. + long externalWaitMs = 0; + foreach (var w in statement.WaitStats) + if (BenefitScorer.IsExternalWait(w.WaitType)) + externalWaitMs += w.WaitTimeMs; + var effectiveCpu = Math.Max(0, statement.QueryTimeStats.CpuTimeMs - externalWaitMs); + var speedup = (double)effectiveCpu / statement.QueryTimeStats.ElapsedTimeMs; var efficiency = Math.Min(100.0, (speedup - 1.0) / (statement.DegreeOfParallelism - 1.0) * 100.0); efficiency = Math.Max(0.0, efficiency); dopText += $" ({efficiency:N0}% efficient)"; diff --git a/src/PlanViewer.App/PlanViewer.App.csproj b/src/PlanViewer.App/PlanViewer.App.csproj index d3bb39f..815d970 100644 --- a/src/PlanViewer.App/PlanViewer.App.csproj +++ b/src/PlanViewer.App/PlanViewer.App.csproj @@ -6,7 +6,7 @@ app.manifest EDD.ico true - 1.7.3 + 1.7.4 Erik Darling Darling Data LLC Performance Studio diff --git a/src/PlanViewer.Core/Output/AnalysisResult.cs b/src/PlanViewer.Core/Output/AnalysisResult.cs index 21da4fc..20f83d7 100644 --- a/src/PlanViewer.Core/Output/AnalysisResult.cs +++ b/src/PlanViewer.Core/Output/AnalysisResult.cs @@ -176,6 +176,14 @@ public class QueryTimeResult [JsonPropertyName("elapsed_time_ms")] public long ElapsedTimeMs { get; set; } + + /// + /// Sum of external/preemptive wait time (MEMORY_ALLOCATION_*, PREEMPTIVE_*) — + /// these waits are CPU-busy in kernel and inflate CpuTimeMs vs real query CPU. + /// Subtract from CpuTimeMs for a truer CPU:Elapsed ratio. + /// + [JsonPropertyName("external_wait_ms")] + public long ExternalWaitMs { get; set; } } public class ParameterResult diff --git a/src/PlanViewer.Core/Output/HtmlExporter.cs b/src/PlanViewer.Core/Output/HtmlExporter.cs index 7877137..fc9f54e 100644 --- a/src/PlanViewer.Core/Output/HtmlExporter.cs +++ b/src/PlanViewer.Core/Output/HtmlExporter.cs @@ -303,7 +303,8 @@ private static void WriteRuntimeCard(StringBuilder sb, StatementResult stmt) WriteRow(sb, "CPU", $"{stmt.QueryTime.CpuTimeMs:N0} ms"); if (stmt.QueryTime.ElapsedTimeMs > 0) { - var ratio = (double)stmt.QueryTime.CpuTimeMs / stmt.QueryTime.ElapsedTimeMs; + var effectiveCpu = Math.Max(0, stmt.QueryTime.CpuTimeMs - stmt.QueryTime.ExternalWaitMs); + var ratio = (double)effectiveCpu / stmt.QueryTime.ElapsedTimeMs; WriteRow(sb, "CPU:Elapsed", ratio.ToString("N2")); } } diff --git a/src/PlanViewer.Core/Output/ResultMapper.cs b/src/PlanViewer.Core/Output/ResultMapper.cs index fde71c7..35b0fca 100644 --- a/src/PlanViewer.Core/Output/ResultMapper.cs +++ b/src/PlanViewer.Core/Output/ResultMapper.cs @@ -1,4 +1,5 @@ using PlanViewer.Core.Models; +using PlanViewer.Core.Services; namespace PlanViewer.Core.Output; @@ -111,10 +112,18 @@ private static StatementResult MapStatement(PlanStatement stmt) // Query time (actual plans) if (stmt.QueryTimeStats != null) { + long externalWaitMs = 0; + foreach (var w in stmt.WaitStats) + { + if (BenefitScorer.IsExternalWait(w.WaitType)) + externalWaitMs += w.WaitTimeMs; + } + result.QueryTime = new QueryTimeResult { CpuTimeMs = stmt.QueryTimeStats.CpuTimeMs, - ElapsedTimeMs = stmt.QueryTimeStats.ElapsedTimeMs + ElapsedTimeMs = stmt.QueryTimeStats.ElapsedTimeMs, + ExternalWaitMs = externalWaitMs }; } diff --git a/src/PlanViewer.Core/Services/BenefitScorer.cs b/src/PlanViewer.Core/Services/BenefitScorer.cs index c9ff3a7..c6cc842 100644 --- a/src/PlanViewer.Core/Services/BenefitScorer.cs +++ b/src/PlanViewer.Core/Services/BenefitScorer.cs @@ -494,12 +494,11 @@ private static void ScoreWaitStats(PlanStatement stmt) var isParallel = stmt.DegreeOfParallelism > 1 && stmt.RootNode != null; // Collect all operators with per-thread stats for parallel benefit calculation - List? operatorProfiles = null; - if (isParallel) - { - operatorProfiles = new List(); - CollectOperatorWaitProfiles(stmt.RootNode!, operatorProfiles); - } + // Collect operator profiles even for serial plans — the external-wait formula + // uses sum-of-max-thread-cpu across operators and works for both. + var operatorProfiles = new List(); + if (stmt.RootNode != null) + CollectOperatorWaitProfiles(stmt.RootNode, operatorProfiles); foreach (var wait in stmt.WaitStats) { @@ -508,7 +507,15 @@ private static void ScoreWaitStats(PlanStatement stmt) var category = ClassifyWaitType(wait.WaitType); double benefitPct; - if (category == "Parallelism" && isParallel) + if (IsExternalWait(wait.WaitType) && operatorProfiles.Count > 0) + { + // External / preemptive waits (MEMORY_ALLOCATION_*, PREEMPTIVE_*): the worker + // is CPU-busy in kernel, so operator elapsed ≈ operator cpu and the wait + // barely shows in the per-thread (elapsed - cpu) calculation. Joe's formula: + // benefit = (wait_ms / total_cpu_ms) * Σ max_thread_cpu_per_operator / elapsed + benefitPct = CalculateExternalWaitBenefit(wait, operatorProfiles, stmt.QueryTimeStats!.CpuTimeMs, elapsedMs); + } + else if (category == "Parallelism" && isParallel) { // CXPACKET/CXCONSUMER/CXSYNC: benefit is the parallelism efficiency gap, // not the raw wait time. Threads waiting for other threads is a symptom @@ -525,7 +532,7 @@ private static void ScoreWaitStats(PlanStatement stmt) benefitPct = (double)wait.WaitTimeMs / elapsedMs * 100; } } - else if (!isParallel || operatorProfiles == null || operatorProfiles.Count == 0) + else if (!isParallel || operatorProfiles.Count == 0) { // Serial plan or no operator data: simple ratio benefitPct = (double)wait.WaitTimeMs / elapsedMs * 100; @@ -585,6 +592,48 @@ private static double CalculateParallelWaitBenefit( return benefitMs / stmtElapsedMs * 100; } + /// + /// Joe's formula for external/preemptive waits where the worker is CPU-busy in kernel + /// (MEMORY_ALLOCATION_*, PREEMPTIVE_*). The standard (elapsed-cpu) per-thread + /// wait accounting misses these because elapsed ≈ cpu for those threads. Use the + /// wait's share of total CPU, scaled by the plan's critical-path CPU. + /// wait_cpu_share = wait_ms / total_cpu_ms + /// sum_max_cpu = Σ max_thread_cpu across operators + /// benefit_ms = wait_cpu_share * sum_max_cpu + /// Then convert to % of statement elapsed. + /// + private static double CalculateExternalWaitBenefit( + WaitStatInfo wait, List profiles, + long stmtCpuMs, long stmtElapsedMs) + { + if (stmtCpuMs <= 0 || stmtElapsedMs <= 0) + return (double)wait.WaitTimeMs / Math.Max(1, stmtElapsedMs) * 100; + + long sumMaxCpu = 0; + foreach (var p in profiles) + sumMaxCpu += p.MaxThreadCpuMs; + + if (sumMaxCpu <= 0) + return (double)wait.WaitTimeMs / stmtElapsedMs * 100; + + var waitCpuShare = (double)wait.WaitTimeMs / stmtCpuMs; + var benefitMs = waitCpuShare * sumMaxCpu; + return benefitMs / stmtElapsedMs * 100; + } + + /// + /// External / preemptive waits where the worker is CPU-busy in kernel rather than + /// descheduled. Their wait time counts toward the query's CPU time, so the usual + /// (elapsed - cpu) per-thread wait math misses them entirely. + /// + public static bool IsExternalWait(string waitType) + { + if (string.IsNullOrEmpty(waitType)) return false; + var wt = waitType.ToUpperInvariant(); + return wt.Contains("MEMORY_ALLOCATION") + || wt.StartsWith("PREEMPTIVE_"); + } + /// /// Determines if an operator is relevant for a given wait category. /// @@ -624,13 +673,18 @@ private static void CollectOperatorWaitProfiles(PlanNode node, List 0 || maxThreadWait > 0) + // Max per-thread SELF CPU (non-cumulative) — critical-path CPU contribution + // from this operator. Used by the external-wait formula. + var maxThreadCpu = PlanAnalyzer.GetOperatorMaxThreadOwnCpuMs(node); + + if (totalWait > 0 || maxThreadWait > 0 || maxThreadCpu > 0) { profiles.Add(new OperatorWaitProfile { Node = node, MaxThreadWaitMs = maxThreadWait, TotalWaitMs = totalWait, + MaxThreadCpuMs = maxThreadCpu, HasPhysicalReads = node.ActualPhysicalReads > 0, HasCpuWork = node.ActualCPUMs > 0, IsExchange = node.PhysicalOp == "Parallelism", @@ -681,6 +735,8 @@ private sealed class OperatorWaitProfile public PlanNode Node { get; init; } = null!; public long MaxThreadWaitMs { get; init; } public long TotalWaitMs { get; init; } + /// Max CPU time among this operator's threads (critical-path CPU for external-wait formula). + public long MaxThreadCpuMs { get; init; } public bool HasPhysicalReads { get; init; } public bool HasCpuWork { get; init; } public bool IsExchange { get; init; } diff --git a/src/PlanViewer.Core/Services/PlanAnalyzer.cs b/src/PlanViewer.Core/Services/PlanAnalyzer.cs index a66cf67..8969af5 100644 --- a/src/PlanViewer.Core/Services/PlanAnalyzer.cs +++ b/src/PlanViewer.Core/Services/PlanAnalyzer.cs @@ -1592,6 +1592,66 @@ private static long GetPerThreadOwnElapsed(PlanNode node) return maxSelf; } + /// + /// Max per-thread self-CPU for this operator. + /// Parallel: for each thread, self_cpu = thread_cpu - Σ same-thread child cpu; take max. + /// Serial / single-thread: operator_cpu - Σ effective child cpu. + /// Needed for external-wait benefit scoring (Joe's formula). + /// + internal static long GetOperatorMaxThreadOwnCpuMs(PlanNode node) + { + if (!node.HasActualStats || node.ActualCPUMs <= 0) return 0; + + if (node.PerThreadStats.Count > 1) + { + var parentByThread = new Dictionary(); + foreach (var ts in node.PerThreadStats) + parentByThread[ts.ThreadId] = ts.ActualCPUMs; + + var childSumByThread = new Dictionary(); + foreach (var child in node.Children) + { + var childNode = child; + if (child.PhysicalOp == "Parallelism" && child.Children.Count > 0) + childNode = child.Children.OrderByDescending(c => c.ActualCPUMs).First(); + foreach (var ts in childNode.PerThreadStats) + { + childSumByThread.TryGetValue(ts.ThreadId, out var existing); + childSumByThread[ts.ThreadId] = existing + ts.ActualCPUMs; + } + } + + var maxSelf = 0L; + foreach (var (threadId, parentCpu) in parentByThread) + { + childSumByThread.TryGetValue(threadId, out var childCpu); + var self = Math.Max(0, parentCpu - childCpu); + if (self > maxSelf) maxSelf = self; + } + return maxSelf; + } + + // Serial: operator_cpu - Σ effective child cpu + var totalChildCpu = 0L; + foreach (var child in node.Children) + totalChildCpu += GetEffectiveChildCpuMs(child); + return Math.Max(0, node.ActualCPUMs - totalChildCpu); + } + + private static long GetEffectiveChildCpuMs(PlanNode child) + { + if (child.PhysicalOp == "Parallelism" && child.Children.Count > 0) + return child.Children.Max(GetEffectiveChildCpuMs); + if (child.ActualCPUMs > 0) + return child.ActualCPUMs; + if (child.Children.Count == 0) + return 0; + var sum = 0L; + foreach (var grandchild in child.Children) + sum += GetEffectiveChildCpuMs(grandchild); + return sum; + } + /// /// Serial row mode self-time: subtract all direct children's effective elapsed. /// Pass-through operators (Compute Scalar, etc.) don't carry runtime stats — diff --git a/src/PlanViewer.Web/Pages/Index.razor b/src/PlanViewer.Web/Pages/Index.razor index eabbcc2..6d885d8 100644 --- a/src/PlanViewer.Web/Pages/Index.razor +++ b/src/PlanViewer.Web/Pages/Index.razor @@ -163,7 +163,8 @@ else @if (ActiveStmt!.QueryTime.ElapsedTimeMs > 0) { - var ratio = (double)ActiveStmt!.QueryTime.CpuTimeMs / ActiveStmt!.QueryTime.ElapsedTimeMs; + var effectiveCpu = Math.Max(0L, ActiveStmt!.QueryTime.CpuTimeMs - ActiveStmt!.QueryTime.ExternalWaitMs); + var ratio = (double)effectiveCpu / ActiveStmt!.QueryTime.ElapsedTimeMs;
CPU:Elapsed @ratio.ToString("N2")