From fa58798dc350e961af383a9dc02d2bb81ffa2434 Mon Sep 17 00:00:00 2001 From: Reuben Bond Date: Tue, 31 Mar 2026 12:16:43 -0700 Subject: [PATCH] Detect local GC pauses during direct probes --- .../MembershipService/SiloHealthMonitor.cs | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/src/Orleans.Runtime/MembershipService/SiloHealthMonitor.cs b/src/Orleans.Runtime/MembershipService/SiloHealthMonitor.cs index 1fae8c7e390..fb4b855e52a 100644 --- a/src/Orleans.Runtime/MembershipService/SiloHealthMonitor.cs +++ b/src/Orleans.Runtime/MembershipService/SiloHealthMonitor.cs @@ -190,7 +190,7 @@ private async Task Run() if (isDirectProbe) { // Probe the silo directly. - probeResult = await this.ProbeDirectly(cancellation.Token).ConfigureAwait(false); + probeResult = await this.ProbeDirectly(cancellation.Token, timeout).ConfigureAwait(false); } else { @@ -255,12 +255,14 @@ TimeSpan GetTimeout(bool isDirectProbe) /// Probes the remote silo. /// /// A token to cancel and fail the probe attempt. + /// The timeout used for this probe, for GC pause evaluation. /// The number of failed probes since the last successful probe. - private async Task ProbeDirectly(CancellationToken cancellation) + private async Task ProbeDirectly(CancellationToken cancellation, TimeSpan probeTimeout) { var id = ++_nextProbeId; LogTraceGoingToSendPing(_log, id, TargetSiloAddress); + var gcPauseBefore = GC.GetTotalPauseDuration(); var roundTripTimer = ValueStopwatch.StartNew(); ProbeResult probeResult; Exception? failureException; @@ -295,12 +297,24 @@ private async Task ProbeDirectly(CancellationToken cancellation) } else { - MessagingInstruments.OnPingReplyMissed(TargetSiloAddress); + // Check if a GC pause consumed a significant portion of the probe timeout. + // If so, the local silo may have been unable to process the response in time, + // so we treat this as an inconclusive result rather than a failure. + var gcPauseDuring = GC.GetTotalPauseDuration() - gcPauseBefore; + if (gcPauseDuring > probeTimeout.Multiply(0.25)) + { + LogWarningProbeFailureDuringGcPause(_log, id, TargetSiloAddress, roundTripTimer.Elapsed, gcPauseDuring, _failedProbes); + probeResult = ProbeResult.CreateDirect(_failedProbes, ProbeResultStatus.Unknown); + } + else + { + MessagingInstruments.OnPingReplyMissed(TargetSiloAddress); - var failedProbes = ++_failedProbes; - LogWarningDidNotGetResponseForProbe(_log, failureException, id, TargetSiloAddress, roundTripTimer.Elapsed, failedProbes); + var failedProbes = ++_failedProbes; + LogWarningDidNotGetResponseForProbe(_log, failureException, id, TargetSiloAddress, roundTripTimer.Elapsed, failedProbes); - probeResult = ProbeResult.CreateDirect(failedProbes, ProbeResultStatus.Failed); + probeResult = ProbeResult.CreateDirect(failedProbes, ProbeResultStatus.Failed); + } } return probeResult; @@ -470,5 +484,11 @@ public enum ProbeResultStatus Message = "Exception monitoring silo {SiloAddress}" )] private static partial void LogErrorExceptionMonitoringSilo(ILogger logger, Exception exception, SiloAddress siloAddress); + + [LoggerMessage( + Level = LogLevel.Warning, + Message = "Probe #{Id} to silo {SiloAddress} failed after {Elapsed}, but a GC pause of {GcPauseDuration} was detected during the probe. Treating as inconclusive. Consecutive failed probes remains at {FailedProbeCount}." + )] + private static partial void LogWarningProbeFailureDuringGcPause(ILogger logger, int id, SiloAddress siloAddress, TimeSpan elapsed, TimeSpan gcPauseDuration, int failedProbeCount); } }