Skip to content

Commit 4a92375

Browse files
grwilsonChristopher Siden
authored and
Christopher Siden
committed
3642 dsl_scan_active() should not issue I/O to determine if async destroying is active
3643 txg_delay should not hold the tc_lock Reviewed by: Matthew Ahrens <[email protected]> Reviewed by: Adam Leventhal <[email protected]> Approved by: Gordon Ross <[email protected]>
1 parent 5918f98 commit 4a92375

File tree

5 files changed

+75
-15
lines changed

5 files changed

+75
-15
lines changed

usr/src/uts/common/fs/zfs/dsl_destroy.c

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,12 +753,16 @@ dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx)
753753
zil_destroy_sync(dmu_objset_zil(os), tx);
754754

755755
if (!spa_feature_is_active(dp->dp_spa, async_destroy)) {
756+
dsl_scan_t *scn = dp->dp_scan;
757+
756758
spa_feature_incr(dp->dp_spa, async_destroy, tx);
757759
dp->dp_bptree_obj = bptree_alloc(mos, tx);
758760
VERIFY0(zap_add(mos,
759761
DMU_POOL_DIRECTORY_OBJECT,
760762
DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1,
761763
&dp->dp_bptree_obj, tx));
764+
ASSERT(!scn->scn_async_destroying);
765+
scn->scn_async_destroying = B_TRUE;
762766
}
763767

764768
used = ds->ds_dir->dd_phys->dd_used_bytes;

usr/src/uts/common/fs/zfs/dsl_scan.c

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,15 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg)
9595
scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP);
9696
scn->scn_dp = dp;
9797

98+
/*
99+
* It's possible that we're resuming a scan after a reboot so
100+
* make sure that the scan_async_destroying flag is initialized
101+
* appropriately.
102+
*/
103+
ASSERT(!scn->scn_async_destroying);
104+
scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa,
105+
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY]);
106+
98107
err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
99108
"scrub_func", sizeof (uint64_t), 1, &f);
100109
if (err == 0) {
@@ -1344,13 +1353,10 @@ dsl_scan_active(dsl_scan_t *scn)
13441353
if (spa_shutting_down(spa))
13451354
return (B_FALSE);
13461355

1347-
if (scn->scn_phys.scn_state == DSS_SCANNING)
1356+
if (scn->scn_phys.scn_state == DSS_SCANNING ||
1357+
scn->scn_async_destroying)
13481358
return (B_TRUE);
13491359

1350-
if (spa_feature_is_active(spa,
1351-
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1352-
return (B_TRUE);
1353-
}
13541360
if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) {
13551361
(void) bpobj_space(&scn->scn_dp->dp_free_bpobj,
13561362
&used, &comp, &uncomp);
@@ -1406,6 +1412,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
14061412

14071413
if (err == 0 && spa_feature_is_active(spa,
14081414
&spa_feature_table[SPA_FEATURE_ASYNC_DESTROY])) {
1415+
ASSERT(scn->scn_async_destroying);
14091416
scn->scn_is_bptree = B_TRUE;
14101417
scn->scn_zio_root = zio_root(dp->dp_spa, NULL,
14111418
NULL, ZIO_FLAG_MUSTSUCCEED);
@@ -1426,6 +1433,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx)
14261433
VERIFY0(bptree_free(dp->dp_meta_objset,
14271434
dp->dp_bptree_obj, tx));
14281435
dp->dp_bptree_obj = 0;
1436+
scn->scn_async_destroying = B_FALSE;
14291437
}
14301438
}
14311439
if (scn->scn_visited_this_txg) {

usr/src/uts/common/fs/zfs/sys/dsl_scan.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
*/
2121
/*
2222
* Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
23-
* Copyright (c) 2012 by Delphix. All rights reserved.
23+
* Copyright (c) 2013 by Delphix. All rights reserved.
2424
*/
2525

2626
#ifndef _SYS_DSL_SCAN_H
@@ -82,6 +82,7 @@ typedef struct dsl_scan {
8282

8383
/* for freeing blocks */
8484
boolean_t scn_is_bptree;
85+
boolean_t scn_async_destroying;
8586

8687
/* for debugging / information */
8788
uint64_t scn_visited_this_txg;

usr/src/uts/common/fs/zfs/sys/txg_impl.h

Lines changed: 44 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
*/
2525

2626
/*
27-
* Copyright (c) 2012 by Delphix. All rights reserved.
27+
* Copyright (c) 2013 by Delphix. All rights reserved.
2828
*/
2929

3030
#ifndef _SYS_TXG_IMPL_H
@@ -37,14 +37,55 @@
3737
extern "C" {
3838
#endif
3939

40+
/*
41+
* The tx_cpu structure is a per-cpu structure that is used to track
42+
* the number of active transaction holds (tc_count). As transactions
43+
* are assigned into a transaction group the appropriate tc_count is
44+
* incremented to indicate that there are pending changes that have yet
45+
* to quiesce. Consumers eventually call txg_rele_to_sync() to decrement
46+
* the tc_count. A transaction group is not considered quiesced until all
47+
* tx_cpu structures have reached a tc_count of zero.
48+
*
49+
* This structure is a per-cpu structure by design. Updates to this structure
50+
* are frequent and concurrent. Having a single structure would result in
51+
* heavy lock contention so a per-cpu design was implemented. With the fanned
52+
* out mutex design, consumers only need to lock the mutex associated with
53+
* the thread's cpu.
54+
*
55+
* The tx_cpu contains two locks, the tc_lock and tc_open_lock.
56+
* The tc_lock is used to protect all members of the tx_cpu structure with
57+
* the exception of the tc_open_lock. This lock should only be held for a
58+
* short period of time, typically when updating the value of tc_count.
59+
*
60+
* The tc_open_lock protects the tx_open_txg member of the tx_state structure.
61+
* This lock is used to ensure that transactions are only assigned into
62+
* the current open transaction group. In order to move the current open
63+
* transaction group to the quiesce phase, the txg_quiesce thread must
64+
* grab all tc_open_locks, increment the tx_open_txg, and drop the locks.
65+
* The tc_open_lock is held until the transaction is assigned into the
66+
* transaction group. Typically, this is a short operation but if throttling
67+
* is occurring it may be held for longer periods of time.
68+
*/
4069
struct tx_cpu {
41-
kmutex_t tc_lock;
70+
kmutex_t tc_open_lock; /* protects tx_open_txg */
71+
kmutex_t tc_lock; /* protects the rest of this struct */
4272
kcondvar_t tc_cv[TXG_SIZE];
4373
uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
4474
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
45-
char tc_pad[16]; /* pad to fill 3 cache lines */
75+
char tc_pad[8]; /* pad to fill 3 cache lines */
4676
};
4777

78+
/*
79+
* The tx_state structure maintains the state information about the different
80+
* stages of the pool's transaction groups. A per pool tx_state structure
81+
* is used to track this information. The tx_state structure also points to
82+
* an array of tx_cpu structures (described above). Although the tx_sync_lock
83+
* is used to protect the members of this structure, it is not used to
84+
* protect the tx_open_txg. Instead a special lock in the tx_cpu structure
85+
* is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock.
86+
* Any thread wishing to update tx_open_txg must grab the tc_open_lock on
87+
* every cpu (see txg_quiesce()).
88+
*/
4889
typedef struct tx_state {
4990
tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */
5091
kmutex_t tx_sync_lock; /* protects the rest of this struct */

usr/src/uts/common/fs/zfs/txg.c

Lines changed: 12 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,8 @@ txg_init(dsl_pool_t *dp, uint64_t txg)
126126
int i;
127127

128128
mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL);
129+
mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_DEFAULT,
130+
NULL);
129131
for (i = 0; i < TXG_SIZE; i++) {
130132
cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT,
131133
NULL);
@@ -168,6 +170,7 @@ txg_fini(dsl_pool_t *dp)
168170
for (c = 0; c < max_ncpus; c++) {
169171
int i;
170172

173+
mutex_destroy(&tx->tx_cpu[c].tc_open_lock);
171174
mutex_destroy(&tx->tx_cpu[c].tc_lock);
172175
for (i = 0; i < TXG_SIZE; i++) {
173176
cv_destroy(&tx->tx_cpu[c].tc_cv[i]);
@@ -292,10 +295,12 @@ txg_hold_open(dsl_pool_t *dp, txg_handle_t *th)
292295
tx_cpu_t *tc = &tx->tx_cpu[CPU_SEQID];
293296
uint64_t txg;
294297

295-
mutex_enter(&tc->tc_lock);
296-
298+
mutex_enter(&tc->tc_open_lock);
297299
txg = tx->tx_open_txg;
300+
301+
mutex_enter(&tc->tc_lock);
298302
tc->tc_count[txg & TXG_MASK]++;
303+
mutex_exit(&tc->tc_lock);
299304

300305
th->th_cpu = tc;
301306
th->th_txg = txg;
@@ -308,7 +313,8 @@ txg_rele_to_quiesce(txg_handle_t *th)
308313
{
309314
tx_cpu_t *tc = th->th_cpu;
310315

311-
mutex_exit(&tc->tc_lock);
316+
ASSERT(!MUTEX_HELD(&tc->tc_lock));
317+
mutex_exit(&tc->tc_open_lock);
312318
}
313319

314320
void
@@ -345,10 +351,10 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
345351
int c;
346352

347353
/*
348-
* Grab all tx_cpu locks so nobody else can get into this txg.
354+
* Grab all tc_open_locks so nobody else can get into this txg.
349355
*/
350356
for (c = 0; c < max_ncpus; c++)
351-
mutex_enter(&tx->tx_cpu[c].tc_lock);
357+
mutex_enter(&tx->tx_cpu[c].tc_open_lock);
352358

353359
ASSERT(txg == tx->tx_open_txg);
354360
tx->tx_open_txg++;
@@ -361,7 +367,7 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg)
361367
* enter the next transaction group.
362368
*/
363369
for (c = 0; c < max_ncpus; c++)
364-
mutex_exit(&tx->tx_cpu[c].tc_lock);
370+
mutex_exit(&tx->tx_cpu[c].tc_open_lock);
365371

366372
/*
367373
* Quiesce the transaction group by waiting for everyone to txg_exit().

0 commit comments

Comments
 (0)