Skip to content

Commit 688fd31

Browse files
Calle Wilunddenesb
Calle Wilund
authored andcommitted
commitlog: Add counters for actual pending allocations + segment wait
Fixes scylladb#9367 The CL counters pending_allocations and requests_blocked_memory are exposed in graphana (etc) and often referred to as metrics on whether we are blocking on commit log. But they don't really show this, as they only measure whether or not we are blocked on the memory bandwidth semaphore that provides rate back pressure (fixed num bytes/s - sortof). However, actual tasks in allocation or segment wait is not exposed, so if we are blocked on disk IO or waiting for segments to become available, we have no visible metrics. While the "old" counters certainly are valid, I have yet to ever see them be non-zero in modern life. Closes scylladb#9368
1 parent e22364d commit 688fd31

File tree

3 files changed

+49
-3
lines changed

3 files changed

+49
-3
lines changed

db/commitlog/commitlog.cc

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -342,6 +342,21 @@ class db::commitlog::segment_manager : public ::enable_shared_from_this<segment_
342342
// size allocated on disk - i.e. files created (new, reserve, recycled)
343343
uint64_t total_size_on_disk = 0;
344344
uint64_t requests_blocked_memory = 0;
345+
uint64_t blocked_on_new_segment = 0;
346+
uint64_t active_allocations = 0;
347+
};
348+
349+
class scope_increment_counter {
350+
uint64_t& _dst;
351+
public:
352+
scope_increment_counter(uint64_t& dst)
353+
: _dst(dst)
354+
{
355+
++_dst;
356+
}
357+
~scope_increment_counter() {
358+
--_dst;
359+
}
345360
};
346361

347362
stats totals;
@@ -1258,6 +1273,8 @@ future<R> db::commitlog::segment_manager::allocate_when_possible(T writer, db::t
12581273
totals.requests_blocked_memory++;
12591274
}
12601275

1276+
scope_increment_counter allocating(totals.active_allocations);
1277+
12611278
auto permit = co_await std::move(fut);
12621279
sseg_ptr s;
12631280

@@ -1493,6 +1510,12 @@ void db::commitlog::segment_manager::create_counters(const sstring& metrics_cate
14931510

14941511
sm::make_gauge("memory_buffer_bytes", totals.buffer_list_bytes,
14951512
sm::description("Holds the total number of bytes in internal memory buffers.")),
1513+
1514+
sm::make_gauge("blocked_on_new_segment", totals.blocked_on_new_segment,
1515+
sm::description("Number of allocations blocked on acquiring new segment.")),
1516+
1517+
sm::make_gauge("active_allocations", totals.active_allocations,
1518+
sm::description("Current number of active allocations.")),
14961519
});
14971520
}
14981521

@@ -1739,6 +1762,8 @@ future<db::commitlog::segment_manager::sseg_ptr> db::commitlog::segment_manager:
17391762
co_return _segments.back();
17401763
}
17411764

1765+
scope_increment_counter blocked_on_new(totals.blocked_on_new_segment);
1766+
17421767
// #9896 - we don't want to issue a new_segment call until
17431768
// the old one has terminated with either result or exception.
17441769
// Do all waiting through the shared_future
@@ -2790,6 +2815,14 @@ uint64_t db::commitlog::get_num_active_segments() const {
27902815
return _segment_manager->get_num_active_segments();
27912816
}
27922817

2818+
uint64_t db::commitlog::get_num_blocked_on_new_segment() const {
2819+
return _segment_manager->totals.blocked_on_new_segment;
2820+
}
2821+
2822+
uint64_t db::commitlog::get_num_active_allocations() const {
2823+
return _segment_manager->totals.active_allocations;
2824+
}
2825+
27932826
future<std::vector<db::commitlog::descriptor>> db::commitlog::list_existing_descriptors() const {
27942827
return list_existing_descriptors(active_config().commit_log_location);
27952828
}

db/commitlog/commitlog.hh

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,10 @@ public:
290290
uint64_t get_flush_limit_exceeded_count() const;
291291
uint64_t get_num_segments_created() const;
292292
uint64_t get_num_segments_destroyed() const;
293+
uint64_t get_num_blocked_on_new_segment() const;
294+
uint64_t get_num_active_allocations() const;
295+
296+
293297
/**
294298
* Get number of inactive (finished), segments lingering
295299
* due to still being dirty

test/boost/commitlog_test.cc

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -782,19 +782,25 @@ SEASTAR_TEST_CASE(test_commitlog_deadlock_in_recycle) {
782782
};
783783
});
784784

785+
uint64_t num_active_allocations = 0, num_blocked_on_new_segment = 0;
786+
785787
// add a flush handler that delays releasing things until disk threshold is reached.
786788
auto r = log.add_flush_handler([&](cf_id_type, replay_position pos) {
787789
auto old = std::exchange(rps, rp_set{});
788790
queue.emplace_back(std::move(old));
789-
if (log.disk_footprint() >= log.disk_limit() && !t.armed()) {
790-
t.arm(5s);
791+
if (log.disk_footprint() >= log.disk_limit()) {
792+
num_active_allocations += log.get_num_active_allocations();
793+
num_blocked_on_new_segment += log.get_num_blocked_on_new_segment();
794+
if (!t.armed()) {
795+
t.arm(5s);
796+
}
791797
}
792798
});
793799

794800
bool release = true;
795801

796802
try {
797-
while (n < 10) {
803+
while (n < 10 || !num_active_allocations || !num_blocked_on_new_segment) {
798804
auto now = timeout_clock::now();
799805
rp_handle h = co_await with_timeout(now + 30s, log.add_mutation(uuid, size, db::commitlog::force_sync::no, [&](db::commitlog::output& dst) {
800806
dst.fill('1', size);
@@ -810,6 +816,9 @@ SEASTAR_TEST_CASE(test_commitlog_deadlock_in_recycle) {
810816
co_await log.shutdown();
811817
co_await log.clear();
812818
}
819+
820+
BOOST_REQUIRE_GT(num_active_allocations, 0);
821+
BOOST_REQUIRE_GT(num_blocked_on_new_segment, 0);
813822
}
814823

815824
// Test for #8438 - ensure we can shut down (in orderly fashion)

0 commit comments

Comments
 (0)