From 240e743288bd2062f64db402b732a02736c8cd61 Mon Sep 17 00:00:00 2001 From: Brett Boston Date: Tue, 26 Nov 2024 16:03:55 -0800 Subject: [PATCH] Extend nomination monte carlo simulations to analyze timeouts Closes #4561 This change adds a couple more tests to the nomination tests for the following scenarios: 1. Asymmetric quorums 2. Unresponsive validators It also removes some protocol gating in the test suite that's no longer necessary now that P22 is the current protocol in the build. --- src/herder/test/HerderTests.cpp | 688 ++++++++++++++++++++++++++------ 1 file changed, 558 insertions(+), 130 deletions(-) diff --git a/src/herder/test/HerderTests.cpp b/src/herder/test/HerderTests.cpp index fcc07519f4..27247b4e6c 100644 --- a/src/herder/test/HerderTests.cpp +++ b/src/herder/test/HerderTests.cpp @@ -5806,53 +5806,48 @@ testWeights(std::vector const& validators) VirtualClock clock; Application::pointer app = createTestApplication(clock, cfg); - for_versions_from( - static_cast( - APPLICATION_SPECIFIC_NOMINATION_LEADER_ELECTION_PROTOCOL_VERSION), - *app, [&]() { - // Collect info about orgs - ValidatorQuality maxQuality; - std::unordered_map orgQualities; - std::unordered_map orgSizes; - std::unordered_map orgQualityCounts; - collectOrgInfo(maxQuality, orgQualities, orgSizes, orgQualityCounts, - validators); - - // Check per-validator weights - HerderImpl& herder = dynamic_cast(app->getHerder()); - std::unordered_map normalizedOrgWeights; - for (ValidatorEntry const& validator : validators) - { - uint64_t weight = herder.getHerderSCPDriver().getNodeWeight( - validator.mKey, cfg.QUORUM_SET, false); - double normalizedWeight = - static_cast(weight) / UINT64_MAX; - normalizedOrgWeights[validator.mHomeDomain] += normalizedWeight; - - std::string const& org = validator.mHomeDomain; - REQUIRE_THAT(normalizedWeight, - Catch::Matchers::WithinAbs( - expectedNormalizedWeight( - orgQualityCounts, maxQuality, - orgQualities.at(org), orgSizes.at(org)), - 0.0001)); - } - // Check per-org weights - for (auto const& [org, weight] : normalizedOrgWeights) - { - REQUIRE_THAT(weight, Catch::Matchers::WithinAbs( - expectedOrgNormalizedWeight( - orgQualityCounts, maxQuality, - orgQualities.at(org)), - 0.0001)); - } - }); + // Collect info about orgs + ValidatorQuality maxQuality; + std::unordered_map orgQualities; + std::unordered_map orgSizes; + std::unordered_map orgQualityCounts; + collectOrgInfo(maxQuality, orgQualities, orgSizes, orgQualityCounts, + validators); + + // Check per-validator weights + HerderImpl& herder = dynamic_cast(app->getHerder()); + std::unordered_map normalizedOrgWeights; + for (ValidatorEntry const& validator : validators) + { + uint64_t weight = herder.getHerderSCPDriver().getNodeWeight( + validator.mKey, cfg.QUORUM_SET, false); + double normalizedWeight = static_cast(weight) / UINT64_MAX; + normalizedOrgWeights[validator.mHomeDomain] += normalizedWeight; + + std::string const& org = validator.mHomeDomain; + REQUIRE_THAT(normalizedWeight, + Catch::Matchers::WithinAbs( + expectedNormalizedWeight(orgQualityCounts, maxQuality, + orgQualities.at(org), + orgSizes.at(org)), + 0.0001)); + } + + // Check per-org weights + for (auto const& [org, weight] : normalizedOrgWeights) + { + REQUIRE_THAT( + weight, Catch::Matchers::WithinAbs( + expectedOrgNormalizedWeight( + orgQualityCounts, maxQuality, orgQualities.at(org)), + 0.0001)); + } } // Test that HerderSCPDriver::getNodeWeight produces weights that result in a // fair distribution of nomination wins. -TEST_CASE_VERSIONS("getNodeWeight", "[herder]") +TEST_CASE("getNodeWeight", "[herder]") { SECTION("3 tier 1 validators, 1 org") { @@ -5895,9 +5890,10 @@ class TestNominationProtocol : public NominationProtocol } std::set const& - updateRoundLeadersForTesting() + updateRoundLeadersForTesting( + std::optional const& previousValue = std::nullopt) { - mPreviousValue = getRandomValue(); + mPreviousValue = previousValue.value_or(getRandomValue()); updateRoundLeaders(); return getLeaders(); } @@ -5938,102 +5934,95 @@ testWinProbabilities(std::vector const& sks, VirtualClock clock; Application::pointer app = createTestApplication(clock, cfg); - for_versions_from( - static_cast( - APPLICATION_SPECIFIC_NOMINATION_LEADER_ELECTION_PROTOCOL_VERSION), - *app, [&]() { - // Run for `numLedgers` slots, recording the number of times each - // node wins nomination - UnorderedMap publishCounts; - HerderImpl& herder = dynamic_cast(app->getHerder()); - SCP& scp = herder.getSCP(); - int fastTimeouts = 0; - for (int i = 0; i < numLedgers; ++i) - { - auto s = std::make_shared(i, scp); - TestNominationProtocol np(*s); + // Run for `numLedgers` slots, recording the number of times each + // node wins nomination + UnorderedMap publishCounts; + HerderImpl& herder = dynamic_cast(app->getHerder()); + SCP& scp = herder.getSCP(); + int fastTimeouts = 0; + for (int i = 0; i < numLedgers; ++i) + { + auto s = std::make_shared(i, scp); + TestNominationProtocol np(*s); - std::set const& leaders = - np.updateRoundLeadersForTesting(); - REQUIRE(leaders.size() == 1); - for (NodeID const& leader : leaders) - { - ++publishCounts[leader]; - } + std::set const& leaders = np.updateRoundLeadersForTesting(); + REQUIRE(leaders.size() == 1); + for (NodeID const& leader : leaders) + { + ++publishCounts[leader]; + } - if (np.fastTimedOut()) - { - ++fastTimeouts; - } - } + if (np.fastTimedOut()) + { + ++fastTimeouts; + } + } - CLOG_INFO(Herder, "Fast Timeouts: {} ({}%)", fastTimeouts, - fastTimeouts * 100.0 / numLedgers); + CLOG_INFO(Herder, "Fast Timeouts: {} ({}%)", fastTimeouts, + fastTimeouts * 100.0 / numLedgers); - // Compute total expected normalized weight across all nodes - double totalNormalizedWeight = 0.0; - for (ValidatorEntry const& validator : validators) - { - totalNormalizedWeight += expectedNormalizedWeight( - orgQualityCounts, maxQuality, - orgQualities.at(validator.mHomeDomain), - orgSizes.at(validator.mHomeDomain)); - } + // Compute total expected normalized weight across all nodes + double totalNormalizedWeight = 0.0; + for (ValidatorEntry const& validator : validators) + { + totalNormalizedWeight += + expectedNormalizedWeight(orgQualityCounts, maxQuality, + orgQualities.at(validator.mHomeDomain), + orgSizes.at(validator.mHomeDomain)); + } - // Check validator win rates - std::map orgPublishCounts; - for (ValidatorEntry const& validator : validators) - { - NodeID const& nodeID = validator.mKey; - int publishCount = publishCounts[nodeID]; - - // Compute and report node's win rate - double winRate = static_cast(publishCount) / numLedgers; - CLOG_INFO(Herder, "Node {} win rate: {} (published {} ledgers)", - cfg.toShortString(nodeID), winRate, publishCount); - - // Expected win rate is `weight / total weight` - double expectedWinRate = - expectedNormalizedWeight( - orgQualityCounts, maxQuality, - orgQualities.at(validator.mHomeDomain), - orgSizes.at(validator.mHomeDomain)) / - totalNormalizedWeight; - - // Check that actual win rate is within .05 of expected win - // rate. - REQUIRE_THAT(winRate, - Catch::Matchers::WithinAbs(expectedWinRate, 0.05)); - - // Record org publish counts for the next set of checks - orgPublishCounts[validator.mHomeDomain] += publishCount; - } + // Check validator win rates + std::map orgPublishCounts; + for (ValidatorEntry const& validator : validators) + { + NodeID const& nodeID = validator.mKey; + int publishCount = publishCounts[nodeID]; - // Check org win rates - for (auto const& [org, count] : orgPublishCounts) - { - // Compute and report org's win rate - double winRate = static_cast(count) / numLedgers; - CLOG_INFO(Herder, "Org {} win rate: {} (published {} ledgers)", - org, winRate, count); - - // Expected win rate is `weight / total weight` - double expectedWinRate = - expectedOrgNormalizedWeight(orgQualityCounts, maxQuality, - orgQualities.at(org)) / - totalNormalizedWeight; - - // Check that actual win rate is within .05 of expected win - // rate. - REQUIRE_THAT(winRate, - Catch::Matchers::WithinAbs(expectedWinRate, 0.05)); - } - }); + // Compute and report node's win rate + double winRate = static_cast(publishCount) / numLedgers; + CLOG_INFO(Herder, "Node {} win rate: {} (published {} ledgers)", + cfg.toShortString(nodeID), winRate, publishCount); + + // Expected win rate is `weight / total weight` + double expectedWinRate = + expectedNormalizedWeight(orgQualityCounts, maxQuality, + orgQualities.at(validator.mHomeDomain), + orgSizes.at(validator.mHomeDomain)) / + totalNormalizedWeight; + + // Check that actual win rate is within .05 of expected win + // rate. + REQUIRE_THAT(winRate, + Catch::Matchers::WithinAbs(expectedWinRate, 0.05)); + + // Record org publish counts for the next set of checks + orgPublishCounts[validator.mHomeDomain] += publishCount; + } + + // Check org win rates + for (auto const& [org, count] : orgPublishCounts) + { + // Compute and report org's win rate + double winRate = static_cast(count) / numLedgers; + CLOG_INFO(Herder, "Org {} win rate: {} (published {} ledgers)", org, + winRate, count); + + // Expected win rate is `weight / total weight` + double expectedWinRate = + expectedOrgNormalizedWeight(orgQualityCounts, maxQuality, + orgQualities.at(org)) / + totalNormalizedWeight; + + // Check that actual win rate is within .05 of expected win + // rate. + REQUIRE_THAT(winRate, + Catch::Matchers::WithinAbs(expectedWinRate, 0.05)); + } } // Test that the nomination algorithm produces a fair distribution of ledger // publishers. -TEST_CASE_VERSIONS("Fair nomination win rates", "[herder]") +TEST_CASE("Fair nomination win rates", "[herder]") { SECTION("3 tier 1 validators, 1 org") { @@ -6062,3 +6051,442 @@ TEST_CASE_VERSIONS("Fair nomination win rates", "[herder]") } } } + +// Returns a new `Topology` with the last org in `t` replaced with a new org +// with 3 validators. Requires that the last org in `t` have 3 validators and be +// contiguous at the back of the validators vecto. +static Topology +replaceOneOrg(Topology const& t) +{ + Topology t2(t); // Copy the topology + auto& [sks, validators] = t2; + REQUIRE(sks.size() == validators.size()); + + // Give the org a unique name + std::string const orgName = "org-replaced"; + + // Double check that the new org name is unique + for (ValidatorEntry const& v : validators) + { + REQUIRE(v.mHomeDomain != orgName); + } + + // Remove the last org + constexpr int validatorsPerOrg = 3; + sks.resize(sks.size() - validatorsPerOrg); + validators.resize(validators.size() - validatorsPerOrg); + + // Add new org with 3 validators + int constexpr numValidators = 3; + for (int j = 0; j < numValidators; ++j) + { + SecretKey const& key = sks.emplace_back(SecretKey::random()); + ValidatorEntry& entry = validators.emplace_back(); + entry.mName = fmt::format("validator-replaced-{}", j); + entry.mHomeDomain = orgName; + entry.mQuality = ValidatorQuality::VALIDATOR_HIGH_QUALITY; + entry.mKey = key.getPublicKey(); + entry.mHasHistory = false; + } + + return {sks, validators}; +} + +// Add `orgsToAdd` new orgs to the topology `t`. Each org will have 3 +// validators. +static Topology +addOrgs(int orgsToAdd, Topology const& t) +{ + Topology t2(t); // Copy the topology + auto& [sks, validators] = t2; + REQUIRE(sks.size() == validators.size()); + + // Generate new orgs + for (int i = 0; i < orgsToAdd; ++i) + { + std::string const org = fmt::format("new-org-{}", i); + int constexpr numValidators = 3; + for (int j = 0; j < numValidators; ++j) + { + SecretKey const& key = sks.emplace_back(SecretKey::random()); + ValidatorEntry& entry = validators.emplace_back(); + entry.mName = fmt::format("new-validator-{}-{}", i, j); + entry.mHomeDomain = org; + entry.mQuality = ValidatorQuality::VALIDATOR_HIGH_QUALITY; + entry.mKey = key.getPublicKey(); + entry.mHasHistory = false; + } + } + return t2; +} + +// Returns `true` if the set intersection of `leaders1` and `leaders2` is not +// empty. +bool +leadersIntersect(std::set const& leaders1, + std::set const& leaders2) +{ + std::vector intersection; + std::set_intersection(leaders1.begin(), leaders1.end(), leaders2.begin(), + leaders2.end(), std::back_inserter(intersection)); + return !intersection.empty(); +} + +// Given two quorum sets consisting of validators in `validators1` and +// `validators2`, this function returns the probability that the two quorum sets +// will agree on a leader in the first round of nomination. +double +computeExpectedFirstRoundAgreementProbability( + std::vector const& validators1, + std::vector const& validators2) +{ + // Gather orgs + std::set orgs1; + std::transform(validators1.begin(), validators1.end(), + std::inserter(orgs1, orgs1.end()), + [](ValidatorEntry const& v) { return v.mHomeDomain; }); + std::set orgs2; + std::transform(validators2.begin(), validators2.end(), + std::inserter(orgs2, orgs2.end()), + [](ValidatorEntry const& v) { return v.mHomeDomain; }); + + // Compute overlap + std::vector sharedOrgs; + std::set_intersection(orgs1.begin(), orgs1.end(), orgs2.begin(), + orgs2.end(), std::back_inserter(sharedOrgs)); + + // Probability of agreement in first round is (orgs overlapping / orgs1) * + // (orgs overlapping / orgs2). That's the probability that the two sides + // will pick any overlapping org. The algorithm guarantees that if they pick + // overlapping validator, they'll pick the same validator. + double overlap = static_cast(sharedOrgs.size()); + return overlap / orgs1.size() * overlap / orgs2.size(); +} + +// Test that the nomination algorithm behaves as expected when the two quorum +// sets `qs1` and `qs2` are not equivalent. This function requires that both +// quorum sets overlap, and contain only a single quality level of validators. +// Runs simulation for `numLedgers` slots. +// NOTE: This test counts any failure to agree on a leader as a timeout. In +// practice, it's possible that one side of the split is large enough to proceed +// without the other side. In this case, the larger side might not experience a +// timeout and "drag" the other side through consensus with it. However, this +// test aims to analyze the worst case scenario where the two sides are fairly +// balanced and real-world networking conditions are in place (some nodes +// lagging, etc), such that disagreement always results in a timeout. +void +testAsymmetricTimeouts(Topology const& qs1, Topology const& qs2, + int const numLedgers) +{ + auto const& [sks1, validators1] = qs1; + auto const& [sks2, validators2] = qs2; + + REQUIRE(sks1.size() == validators1.size()); + REQUIRE(sks2.size() == validators2.size()); + + // Generate configs and nodes representing one validator with each quorum + // set + std::vector clocks(2); + std::vector apps; + for (int i = 0; i < 2; ++i) + { + Config cfg = getTestConfig(i); + cfg.ARTIFICIALLY_ACCELERATE_TIME_FOR_TESTING = true; + cfg.generateQuorumSetForTesting(i == 0 ? validators1 : validators2); + cfg.NODE_SEED = i == 0 ? sks1.back() : sks2.back(); + + auto app = apps.emplace_back(createTestApplication(clocks.at(i), cfg)); + } + + // Run the nomination algorithm for `numLedgers` slots. Simulate timeouts by + // re-running slots that don't agree on a leader until their leader + // elections overlap. Record the number of timeouts it takes for the two + // quorum sets to agree on a leader in `timeouts`, which is effectively a + // mapping from number of timeouts to the number of ledgers that experienced + // that many timeouts. + std::vector timeouts(std::max(validators1.size(), validators2.size())); + for (int i = 0; i < numLedgers; ++i) + { + Value const v = getRandomValue(); + SCP& scp1 = dynamic_cast(apps.at(0)->getHerder()).getSCP(); + SCP& scp2 = dynamic_cast(apps.at(1)->getHerder()).getSCP(); + auto s1 = std::make_shared(i, scp1); + auto s2 = std::make_shared(i, scp2); + + TestNominationProtocol np1(*s1); + TestNominationProtocol np2(*s2); + + for (int j = 0; j < timeouts.size(); ++j) + { + std::set const& leaders1 = + np1.updateRoundLeadersForTesting(v); + std::set const& leaders2 = + np2.updateRoundLeadersForTesting(v); + REQUIRE(leaders1.size() == j + 1); + REQUIRE(leaders2.size() == j + 1); + + if (leadersIntersect(leaders1, leaders2)) + { + // Agreed on a leader! Record the number of timeouts resulted. + ++timeouts.at(j); + break; + } + } + + // If leaders don't intersect after running through the loop then the + // two quorum sets have no overlap and the test is broken. + REQUIRE(leadersIntersect(np1.getLeaders(), np2.getLeaders())); + } + + // For the first round, we can easily compute the expected agreement + // probability. For subsequent rounds, we check only that the success rate + // increases over time (modulo some small epsilon). + double expectedSuccessRate = + computeExpectedFirstRoundAgreementProbability(validators1, validators2); + + // Allow for some small decrease in success rate from the theoretical value. + // We're working with probabilistic simulation here so we can't be too + // strict or the test will be flaky. + double constexpr epsilon = 0.1; + + // There's not enough data in the tail of the distribution to allow us to + // assert that the success rate is what's expected. To avoid sporadic test + // failures, we cut off `tailCutoffPoint` of the tail of the distribution + // for the purposes of asserting test values. However, the test will still + // log those success rates for manual examination. + double constexpr tailCutoffPoint = 0.05; + + int numLedgersRemaining = numLedgers; + for (int i = 0; i < timeouts.size(); ++i) + { + int const numTimeouts = timeouts.at(i); + if (numTimeouts == 0) + { + // Avoid cluttering output + continue; + } + + CLOG_INFO(Herder, "Ledgers with {} timeouts: {} ({}%)", i, numTimeouts, + static_cast(numTimeouts) * 100 / numLedgers); + + if (numLedgersRemaining > numLedgers * tailCutoffPoint) + { + // Check that success rate increases over time. Allow some epsilon + // decrease because this is a probabilistic simulation. Also stop + // checking when we're at the last `tailCutoffPoint` timeouts as the + // data is too sparse to be useful. + double successRate = + static_cast(timeouts.at(i)) / numLedgersRemaining; + REQUIRE(successRate > expectedSuccessRate - epsilon); + + // Take max of success rate and previous success rate to avoid + // accidentally accepting a declining success rate due to episilon. + expectedSuccessRate = std::max(successRate, expectedSuccessRate); + numLedgersRemaining -= numTimeouts; + } + } +} + +// Test timeouts with asymmetric quorums. This test serves two purposes: +// 1. It contains assertions checking for moderate (10%) deviations from the +// expected behavior of the nomination algorithm. These should detect any +// major issues/regressions with the algorithm. +// 2. It logs the distributions of timeouts for manual inspection. This is +// useful for understanding the behavior of the algorithm and for testing +// specific scenarios one might be interested in (e.g., if tier 1 disagrees +// on one org's presence in tier 1, what is the impact on nomination +// timeouts?). +// NOTE: This provides a worst-case analysis of timeouts. See the NOTE on +// `testAsymmetricTimeouts` for more details. +TEST_CASE("Asymmetric quorum timeouts", "[herder]") +{ + // Number of slots to run for + int constexpr numLedgers = 20000; + + SECTION("Tier 1-like topology with replaced org") + { + auto t = teir1Like(); + testAsymmetricTimeouts(t, replaceOneOrg(t), numLedgers); + } + + SECTION("Tier 1-like topology with 1 added org") + { + auto t = teir1Like(); + testAsymmetricTimeouts(t, addOrgs(1, t), numLedgers); + } + + SECTION("Tier 1-like topology with 3 added orgs") + { + auto t = teir1Like(); + testAsymmetricTimeouts(t, addOrgs(3, t), numLedgers); + } +} + +// Test that the nomination algorithm behaves as expected when a random +// `numUnresponsive` set of nodes in `qs` are unresponsive. Runs simulation for +// `numLedgers` slots. +void +testUnresponsiveTimeouts(Topology const& qs, int numUnresponsive, + int const numLedgers) +{ + auto const& [sks, validators] = qs; + REQUIRE(sks.size() == validators.size()); + REQUIRE(numUnresponsive < validators.size()); + + // extract and shuffle node ids. Choose `numUnresponsive` nodes to be the + // unresponsive nodes. + std::vector nodeIDs; + std::transform(validators.begin(), validators.end(), + std::back_inserter(nodeIDs), + [](ValidatorEntry const& v) { return v.mKey; }); + stellar::shuffle(nodeIDs.begin(), nodeIDs.end(), gRandomEngine); + std::set unresponsive(nodeIDs.begin(), + nodeIDs.begin() + numUnresponsive); + + // Collect info about orgs + ValidatorQuality maxQuality; + std::unordered_map orgQualities; + std::unordered_map orgSizes; + std::unordered_map orgQualityCounts; + collectOrgInfo(maxQuality, orgQualities, orgSizes, orgQualityCounts, + validators); + + // Compute total weight of all validators, as well as the total weight of + // unresponsive validators + double totalWeight = 0.0; + double unresponsiveWeight = 0.0; + for (ValidatorEntry const& validator : validators) + { + double normalizedWeight = + expectedNormalizedWeight(orgQualityCounts, maxQuality, + orgQualities.at(validator.mHomeDomain), + orgSizes.at(validator.mHomeDomain)); + totalWeight += normalizedWeight; + if (unresponsive.count(validator.mKey)) + { + unresponsiveWeight += normalizedWeight; + } + } + + // Compute the average weight of an unresponsive node + double avgUnresponsiveWeight = unresponsiveWeight / numUnresponsive; + + // Compute expected number of ledgers experiencing `n` timeouts where `n` is + // the index of the `timeouts` vector. This vector is a mapping from number + // of timeouts to expected number of ledgers experiencing that number of + // timeouts. + std::vector expectedTimeouts(numUnresponsive + 1); + double remainingWeight = totalWeight; + int remainingUnresponsive = numUnresponsive; + int remainingLedgers = numLedgers; + for (int i = 0; i < expectedTimeouts.size(); ++i) + { + double timeoutProb = + (avgUnresponsiveWeight * remainingUnresponsive) / remainingWeight; + // To get expected number of ledgers experiencing `i` timeouts, we take + // the probability a timeout does not occur and multiply it by the + // number of remaining ledgers. + int expectedLedgers = (1 - timeoutProb) * remainingLedgers; + expectedTimeouts.at(i) = expectedLedgers; + + // Remaining ledgers decreases by expected number of ledgers + // experiencing `i` timeouts + remainingLedgers -= expectedLedgers; + + // For `i+1` timeouts to occur, an unresponsive node must be chosen. + // Therefore, deduct the average weight of an unresponsive node from the + // total weight left in the network. + remainingWeight -= avgUnresponsiveWeight; + --remainingUnresponsive; + } + + // Generate a config + Config cfg = getTestConfig(); + cfg.ARTIFICIALLY_ACCELERATE_TIME_FOR_TESTING = true; + cfg.generateQuorumSetForTesting(validators); + cfg.NODE_SEED = sks.front(); + + // Create an application + VirtualClock clock; + Application::pointer app = createTestApplication(clock, cfg); + + // Run for `numLedgers` slots, recording the number of times each slot timed + // out due to unresponsive nodes before successfully electing a responsive + // leader. + SCP& scp = dynamic_cast(app->getHerder()).getSCP(); + std::vector timeouts(numUnresponsive + 1); + for (int i = 0; i < numLedgers; ++i) + { + Value const v = getRandomValue(); + auto s = std::make_shared(i, scp); + + TestNominationProtocol np(*s); + for (int i = 0; i < timeouts.size(); ++i) + { + std::set const& leaders = + np.updateRoundLeadersForTesting(v); + // If leaders is a subset of unresponsive, then a timeout occurs. + if (!std::includes(unresponsive.begin(), unresponsive.end(), + leaders.begin(), leaders.end())) + { + ++timeouts.at(i); + break; + } + } + } + + // Allow for some small multiplicative increase in timeouts from the + // theoretical value. We're working with probabilistic simulation here so + // we can't be too strict or the test will be flaky. + double constexpr epsilon = 1.1; + + // There's not enough data in the tail of the distribution to allow us to + // assert that the timeout values are what's expected. To avoid sporadic + // test failures, we cut off `tailCutoffPoint` of the tail of the + // distribution for the purposes of asserting test values. However, the test + // will still log those values for manual examination. + double constexpr tailCutoffPoint = 0.05; + + // Analyze timeouts + int numLedgersRemaining = numLedgers; + for (int i = 0; i < timeouts.size(); ++i) + { + int const numTimeouts = timeouts.at(i); + int const expectedNumTimeouts = expectedTimeouts.at(i); + + if (numLedgersRemaining > numLedgers * tailCutoffPoint) + { + // Check that timeouts are less than epsilon times the expected + // value. Also stop checking when we're at the last + // `tailCutoffPoint` timeouts as the data is too sparse to be + // useful. + REQUIRE(numTimeouts < expectedNumTimeouts * epsilon); + } + CLOG_INFO(Herder, "Ledgers with {} timeouts: {} ({}%)", i, numTimeouts, + numTimeouts * 100.0 / numLedgers); + numLedgersRemaining -= numTimeouts; + } +} + +// Test timeouts for a tier 1-like topology with 1-5 unresponsive nodes. This +// test serves two purposes: +// 1. It contains assertions checking for moderate (10%) deviations from the +// expected behavior of the nomination algorithm. These should detect any +// major issues/regressions with the algorithm. +// 2. It logs the distributions of timeouts for manual inspection. This is +// useful for understanding the behavior of the algorithm and for testing +// specific scenarios one might be interested in (e.g., if 3 tier 1 nodes +// are heavily lagging, what is the impact on nomination timeouts?). +TEST_CASE("Unresponsive quorum timeouts", "[herder]") +{ + // Number of slots to run for + int constexpr numLedgers = 20000; + + auto t = teir1Like(); + for (int i = 1; i <= 5; ++i) + { + CLOG_INFO(Herder, "Simulating nomination with {} unresponsive nodes", + i); + testUnresponsiveTimeouts(t, i, numLedgers); + } +} \ No newline at end of file