From 6c78d7544f18a66c4dd21207c8b267b4565a10cd Mon Sep 17 00:00:00 2001 From: Jon Chambers Date: Tue, 3 Nov 2020 12:05:06 -0500 Subject: [PATCH] Capture a thread dump when Redis commands time out. --- .../redis/FaultTolerantPubSubConnection.java | 22 ++++++++++----- .../redis/FaultTolerantRedisCluster.java | 20 ++++++++++---- .../textsecuregcm/util/ThreadDumpUtil.java | 27 +++++++++++++++++++ 3 files changed, 58 insertions(+), 11 deletions(-) create mode 100644 service/src/main/java/org/whispersystems/textsecuregcm/util/ThreadDumpUtil.java diff --git a/service/src/main/java/org/whispersystems/textsecuregcm/redis/FaultTolerantPubSubConnection.java b/service/src/main/java/org/whispersystems/textsecuregcm/redis/FaultTolerantPubSubConnection.java index 8066eb1fd..984938295 100644 --- a/service/src/main/java/org/whispersystems/textsecuregcm/redis/FaultTolerantPubSubConnection.java +++ b/service/src/main/java/org/whispersystems/textsecuregcm/redis/FaultTolerantPubSubConnection.java @@ -16,7 +16,9 @@ import io.lettuce.core.cluster.pubsub.StatefulRedisClusterPubSubConnection; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.whispersystems.textsecuregcm.util.Constants; +import org.whispersystems.textsecuregcm.util.ThreadDumpUtil; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; import java.util.function.Function; @@ -31,8 +33,9 @@ public class FaultTolerantPubSubConnection { private final CircuitBreaker circuitBreaker; private final Retry retry; - private final Timer executeTimer; - private final Meter commandTimeoutMeter; + private final Timer executeTimer; + private final Meter commandTimeoutMeter; + private final AtomicBoolean wroteThreadDump = new AtomicBoolean(false); private static final Logger log = LoggerFactory.getLogger(FaultTolerantPubSubConnection.class); @@ -56,8 +59,7 @@ public class FaultTolerantPubSubConnection { try (final Timer.Context ignored = executeTimer.time()) { consumer.accept(pubSubConnection); } catch (final RedisCommandTimeoutException e) { - commandTimeoutMeter.mark(); - log.warn("Command timeout exception ({}-pubsub)", this.name, e); + recordCommandTimeout(e); throw e; } })); @@ -78,8 +80,7 @@ public class FaultTolerantPubSubConnection { try (final Timer.Context ignored = executeTimer.time()) { return function.apply(pubSubConnection); } catch (final RedisCommandTimeoutException e) { - commandTimeoutMeter.mark(); - log.warn("Command timeout exception ({}-pubsub)", this.name, e); + recordCommandTimeout(e); throw e; } })); @@ -93,4 +94,13 @@ public class FaultTolerantPubSubConnection { } } } + + private void recordCommandTimeout(final RedisCommandTimeoutException e) { + commandTimeoutMeter.mark(); + log.warn("Command timeout exception ({}-pubsub)", this.name, e); + + if (wroteThreadDump.compareAndSet(false, true)) { + ThreadDumpUtil.writeThreadDump(); + } + } } diff --git a/service/src/main/java/org/whispersystems/textsecuregcm/redis/FaultTolerantRedisCluster.java b/service/src/main/java/org/whispersystems/textsecuregcm/redis/FaultTolerantRedisCluster.java index 00e5fdae2..7e872acaa 100644 --- a/service/src/main/java/org/whispersystems/textsecuregcm/redis/FaultTolerantRedisCluster.java +++ b/service/src/main/java/org/whispersystems/textsecuregcm/redis/FaultTolerantRedisCluster.java @@ -25,10 +25,12 @@ import org.whispersystems.textsecuregcm.configuration.RedisClusterConfiguration; import org.whispersystems.textsecuregcm.configuration.RetryConfiguration; import org.whispersystems.textsecuregcm.util.CircuitBreakerUtil; import org.whispersystems.textsecuregcm.util.Constants; +import org.whispersystems.textsecuregcm.util.ThreadDumpUtil; import java.time.Duration; import java.util.ArrayList; import java.util.List; +import java.util.concurrent.atomic.AtomicBoolean; import java.util.function.Consumer; import java.util.function.Function; import java.util.stream.Collectors; @@ -53,7 +55,8 @@ public class FaultTolerantRedisCluster { private final CircuitBreaker circuitBreaker; private final Retry retry; - private final Meter commandTimeoutMeter; + private final Meter commandTimeoutMeter; + private final AtomicBoolean wroteThreadDump = new AtomicBoolean(false); private static final Logger log = LoggerFactory.getLogger(FaultTolerantRedisCluster.class); @@ -122,8 +125,7 @@ public class FaultTolerantRedisCluster { try { consumer.accept(connection); } catch (final RedisCommandTimeoutException e) { - commandTimeoutMeter.mark(); - log.warn("Command timeout exception ({})", this.name, e); + recordCommandTimeout(e); throw e; } })); @@ -144,8 +146,7 @@ public class FaultTolerantRedisCluster { try { return function.apply(connection); } catch (final RedisCommandTimeoutException e) { - commandTimeoutMeter.mark(); - log.warn("Command timeout exception ({})", this.name, e); + recordCommandTimeout(e); throw e; } })); @@ -160,6 +161,15 @@ public class FaultTolerantRedisCluster { } } + private void recordCommandTimeout(final RedisCommandTimeoutException e) { + commandTimeoutMeter.mark(); + log.warn("Command timeout exception ({})", this.name, e); + + if (wroteThreadDump.compareAndSet(false, true)) { + ThreadDumpUtil.writeThreadDump(); + } + } + public FaultTolerantPubSubConnection createPubSubConnection() { final StatefulRedisClusterPubSubConnection pubSubConnection = clusterClient.connectPubSub(); pubSubConnections.add(pubSubConnection); diff --git a/service/src/main/java/org/whispersystems/textsecuregcm/util/ThreadDumpUtil.java b/service/src/main/java/org/whispersystems/textsecuregcm/util/ThreadDumpUtil.java new file mode 100644 index 000000000..0fdf3bdab --- /dev/null +++ b/service/src/main/java/org/whispersystems/textsecuregcm/util/ThreadDumpUtil.java @@ -0,0 +1,27 @@ +package org.whispersystems.textsecuregcm.util; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.lang.management.ManagementFactory; +import java.lang.management.ThreadInfo; + +public class ThreadDumpUtil { + + private static final Logger log = LoggerFactory.getLogger(ThreadDumpUtil.class); + + public static void writeThreadDump() { + try { + try (final PrintWriter out = new PrintWriter(File.createTempFile("thread_dump_", ".txt"))) { + for (ThreadInfo info : ManagementFactory.getThreadMXBean().dumpAllThreads(true, true)) { + out.print(info); + } + } + } catch (final IOException e) { + log.warn("Failed to write thread dump", e); + } + } +}