Skip to content

Commit 6ab608f

Browse files
committed
Merge tag 'for-6.3-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux
Pull btrfs fixes from David Sterba:

 - scan block devices in non-exclusive mode to avoid temporary mkfs failures

 - fix race between quota disable and quota assign ioctls

 - fix deadlock when aborting transaction during relocation with scrub

 - ignore fiemap path cache when there are multiple paths for a node

* tag 'for-6.3-rc4-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux:
  btrfs: ignore fiemap path cache when there are multiple paths for a node
  btrfs: fix deadlock when aborting transaction during relocation with scrub
  btrfs: scan device in non-exclusive mode
  btrfs: fix race between quota disable and quota assign ioctls
2 parents f95b8ea + 2280d42 commit 6ab608f

File tree

5 files changed

+107
-26
lines changed

5 files changed

+107
-26
lines changed

fs/btrfs/backref.c

+63-22
Original file line number | Diff line number | Diff line change
@@ -1921,8 +1921,7 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
19211921
level = -1;
19221922
ULIST_ITER_INIT(&uiter);
19231923
while (1) {
1924-
bool is_shared;
1925-
bool cached;
1924+
const unsigned long prev_ref_count = ctx->refs.nnodes;
19261925

19271926
walk_ctx.bytenr = bytenr;
19281927
ret = find_parent_nodes(&walk_ctx, &shared);
@@ -1940,21 +1939,36 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
19401939
ret = 0;
19411940

19421941
/*
1943-
* If our data extent was not directly shared (without multiple
1944-
* reference items), than it might have a single reference item
1945-
* with a count > 1 for the same offset, which means there are 2
1946-
* (or more) file extent items that point to the data extent -
1947-
* this happens when a file extent item needs to be split and
1948-
* then one item gets moved to another leaf due to a b+tree leaf
1949-
* split when inserting some item. In this case the file extent
1950-
* items may be located in different leaves and therefore some
1951-
* of the leaves may be referenced through shared subtrees while
1952-
* others are not. Since our extent buffer cache only works for
1953-
* a single path (by far the most common case and simpler to
1954-
* deal with), we can not use it if we have multiple leaves
1955-
* (which implies multiple paths).
1942+
* More than one extent buffer (bytenr) may have been added to
1943+
* the ctx->refs ulist, in which case we have to check multiple
1944+
* tree paths in case the first one is not shared, so we can not
1945+
* use the path cache which is made for a single path. Multiple
1946+
* extent buffers at the current level happen when:
1947+
*
1948+
* 1) level -1, the data extent: If our data extent was not
1949+
* directly shared (without multiple reference items), then
1950+
* it might have a single reference item with a count > 1 for
1951+
* the same offset, which means there are 2 (or more) file
1952+
* extent items that point to the data extent - this happens
1953+
* when a file extent item needs to be split and then one
1954+
* item gets moved to another leaf due to a b+tree leaf split
1955+
* when inserting some item. In this case the file extent
1956+
* items may be located in different leaves and therefore
1957+
* some of the leaves may be referenced through shared
1958+
* subtrees while others are not. Since our extent buffer
1959+
* cache only works for a single path (by far the most common
1960+
* case and simpler to deal with), we can not use it if we
1961+
* have multiple leaves (which implies multiple paths).
1962+
*
1963+
* 2) level >= 0, a tree node/leaf: We can have a mix of direct
1964+
* and indirect references on a b+tree node/leaf, so we have
1965+
* to check multiple paths, and the extent buffer (the
1966+
* current bytenr) may be shared or not. One example is
1967+
* during relocation as we may get a shared tree block ref
1968+
* (direct ref) and a non-shared tree block ref (indirect
1969+
* ref) for the same node/leaf.
19561970
*/
1957-
if (level == -1 && ctx->refs.nnodes > 1)
1971+
if ((ctx->refs.nnodes - prev_ref_count) > 1)
19581972
ctx->use_path_cache = false;
19591973

19601974
if (level >= 0)
@@ -1964,18 +1978,45 @@ int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
19641978
if (!node)
19651979
break;
19661980
bytenr = node->val;
1967-
level++;
1968-
cached = lookup_backref_shared_cache(ctx, root, bytenr, level,
1969-
&is_shared);
1970-
if (cached) {
1971-
ret = (is_shared ? 1 : 0);
1972-
break;
1981+
if (ctx->use_path_cache) {
1982+
bool is_shared;
1983+
bool cached;
1984+
1985+
level++;
1986+
cached = lookup_backref_shared_cache(ctx, root, bytenr,
1987+
level, &is_shared);
1988+
if (cached) {
1989+
ret = (is_shared ? 1 : 0);
1990+
break;
1991+
}
19731992
}
19741993
shared.share_count = 0;
19751994
shared.have_delayed_delete_refs = false;
19761995
cond_resched();
19771996
}
19781997

1998+
/*
1999+
* If the path cache is disabled, then it means at some tree level we
2000+
* got multiple parents due to a mix of direct and indirect backrefs or
2001+
* multiple leaves with file extent items pointing to the same data
2002+
* extent. We have to invalidate the cache and cache only the sharedness
2003+
* result for the levels where we got only one node/reference.
2004+
*/
2005+
if (!ctx->use_path_cache) {
2006+
int i = 0;
2007+
2008+
level--;
2009+
if (ret >= 0 && level >= 0) {
2010+
bytenr = ctx->path_cache_entries[level].bytenr;
2011+
ctx->use_path_cache = true;
2012+
store_backref_shared_cache(ctx, root, bytenr, level, ret);
2013+
i = level + 1;
2014+
}
2015+
2016+
for ( ; i < BTRFS_MAX_LEVEL; i++)
2017+
ctx->path_cache_entries[i].bytenr = 0;
2018+
}
2019+
19792020
/*
19802021
* Cache the sharedness result for the data extent if we know our inode
19812022
* has more than 1 file extent item that refers to the data extent.

fs/btrfs/ioctl.c

+2
Original file line number | Diff line number | Diff line change
@@ -3732,7 +3732,9 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
37323732
}
37333733

37343734
/* update qgroup status and info */
3735+
mutex_lock(&fs_info->qgroup_ioctl_lock);
37353736
err = btrfs_run_qgroups(trans);
3737+
mutex_unlock(&fs_info->qgroup_ioctl_lock);
37363738
if (err < 0)
37373739
btrfs_handle_fs_error(fs_info, err,
37383740
"failed to update qgroup status and info");

fs/btrfs/qgroup.c

+10-1
Original file line number | Diff line number | Diff line change
@@ -2828,13 +2828,22 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
28282828
}
28292829

28302830
/*
2831-
* called from commit_transaction. Writes all changed qgroups to disk.
2831+
* Writes all changed qgroups to disk.
2832+
* Called by the transaction commit path and the qgroup assign ioctl.
28322833
*/
28332834
int btrfs_run_qgroups(struct btrfs_trans_handle *trans)
28342835
{
28352836
struct btrfs_fs_info *fs_info = trans->fs_info;
28362837
int ret = 0;
28372838

2839+
/*
2840+
* In case we are called from the qgroup assign ioctl, assert that we
2841+
* are holding the qgroup_ioctl_lock, otherwise we can race with a quota
2842+
* disable operation (ioctl) and access a freed quota root.
2843+
*/
2844+
if (trans->transaction->state != TRANS_STATE_COMMIT_DOING)
2845+
lockdep_assert_held(&fs_info->qgroup_ioctl_lock);
2846+
28382847
if (!fs_info->quota_root)
28392848
return ret;
28402849

fs/btrfs/transaction.c

+14-1
Original file line number | Diff line number | Diff line change
@@ -2035,7 +2035,20 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
20352035

20362036
if (current->journal_info == trans)
20372037
current->journal_info = NULL;
2038-
btrfs_scrub_cancel(fs_info);
2038+
2039+
/*
2040+
* If relocation is running, we can't cancel scrub because that will
2041+
* result in a deadlock. Before relocating a block group, relocation
2042+
* pauses scrub, then starts and commits a transaction before unpausing
2043+
* scrub. If the transaction commit is being done by the relocation
2044+
* task or triggered by another task and the relocation task is waiting
2045+
* for the commit, and we end up here due to an error in the commit
2046+
* path, then calling btrfs_scrub_cancel() will deadlock, as we are
2047+
* asking for scrub to stop while having it asked to be paused higher
2048+
* above in relocation code.
2049+
*/
2050+
if (!test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
2051+
btrfs_scrub_cancel(fs_info);
20392052

20402053
kmem_cache_free(btrfs_trans_handle_cachep, trans);
20412054
}

fs/btrfs/volumes.c

+18-2
Original file line number | Diff line number | Diff line change
@@ -1366,8 +1366,17 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
13661366
* So, we need to add a special mount option to scan for
13671367
* later supers, using BTRFS_SUPER_MIRROR_MAX instead
13681368
*/
1369-
flags |= FMODE_EXCL;
13701369

1370+
/*
1371+
* Avoid using flag |= FMODE_EXCL here, as the systemd-udev may
1372+
* initiate the device scan which may race with the user's mount
1373+
* or mkfs command, resulting in failure.
1374+
* Since the device scan is solely for reading purposes, there is
1375+
* no need for FMODE_EXCL. Additionally, the devices are read again
1376+
* during the mount process. It is ok to get some inconsistent
1377+
* values temporarily, as the device paths of the fsid are the only
1378+
* required information for assembling the volume.
1379+
*/
13711380
bdev = blkdev_get_by_path(path, flags, holder);
13721381
if (IS_ERR(bdev))
13731382
return ERR_CAST(bdev);
@@ -3266,8 +3275,15 @@ int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
32663275
btrfs_scrub_pause(fs_info);
32673276
ret = btrfs_relocate_block_group(fs_info, chunk_offset);
32683277
btrfs_scrub_continue(fs_info);
3269-
if (ret)
3278+
if (ret) {
3279+
/*
3280+
* If we had a transaction abort, stop all running scrubs.
3281+
* See transaction.c:cleanup_transaction() why we do it here.
3282+
*/
3283+
if (BTRFS_FS_ERROR(fs_info))
3284+
btrfs_scrub_cancel(fs_info);
32703285
return ret;
3286+
}
32713287

32723288
block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
32733289
if (!block_group)

0 commit comments

Comments
 (0)