From: Anand Jain <anand.jain@oracle.com>
To: Zorro Lang <zlang@redhat.com>, Boris Burkov <boris@bur.io>
Cc: linux-btrfs@vger.kernel.org, kernel-team@fb.com, fstests@vger.kernel.org
Subject: Re: [PATCH v4] btrfs: new test for devt change between mounts
Date: Tue, 9 Apr 2024 18:29:05 +0800 [thread overview]
Message-ID: <1939cd62-770b-429a-a261-c130f78bb7ea@oracle.com> (raw)
In-Reply-To: <20240403112049.awm3kvsl2zeukjvr@dell-per750-06-vm-08.rhts.eng.pek2.redhat.com>
On 4/3/24 19:20, Zorro Lang wrote:
> On Mon, Mar 11, 2024 at 12:13:44PM -0700, Boris Burkov wrote:
>> It is possible to confuse the btrfs device cache (fs_devices) by
>> starting with a multi-device filesystem, then removing and re-adding a
>> device in a way which changes its dev_t while the filesystem is
>> unmounted. After this procedure, if we remount, then we are in a funny
>> state where struct btrfs_device's "devt" field does not match the bd_dev
>> of the "bdev" field. I would say this is bad enough, as we have violated
>> a pretty clear invariant.
>>
>> But for style points, we can then remove the extra device from the fs,
>> making it a single device fs, which enables the "temp_fsid" feature,
>> which permits multiple separate mounts of different devices with the
>> same fsid. Since btrfs is confused and *thinks* there are different
>> devices (based on device->devt), it allows a second redundant mount of
>> the same device (not a bind mount!). This then allows us to corrupt the
>> original mount by doing stuff to the one that should be a bind mount.
>>
>> Reviewed-by: Filipe Manana <fdmanana@suse.com>
>> Signed-off-by: Boris Burkov <boris@bur.io>
>> ---
>> Changelog:
>> v4:
>> - add tempfsid group
>> - remove fail check on mount command
>> - btrfs/311 -> btrfs/318
>> - add fixed_by annotation
>> v3:
>> - fstests convention improvements (helpers, output, comments, etc...)
>> v2:
>> - fix numerous fundamental issues, v1 wasn't really ready
>>
>> common/config | 1 +
>> tests/btrfs/318 | 108 ++++++++++++++++++++++++++++++++++++++++++++
>> tests/btrfs/318.out | 2 +
>> 3 files changed, 111 insertions(+)
>> create mode 100755 tests/btrfs/318
>> create mode 100644 tests/btrfs/318.out
>>
>> diff --git a/common/config b/common/config
>> index 2a1434bb1..d8a4508f4 100644
>> --- a/common/config
>> +++ b/common/config
>> @@ -235,6 +235,7 @@ export BLKZONE_PROG="$(type -P blkzone)"
>> export GZIP_PROG="$(type -P gzip)"
>> export BTRFS_IMAGE_PROG="$(type -P btrfs-image)"
>> export BTRFS_MAP_LOGICAL_PROG=$(type -P btrfs-map-logical)
>> +export PARTED_PROG="$(type -P parted)"
>>
>> # use 'udevadm settle' or 'udevsettle' to wait for lv to be settled.
>> # newer systems have udevadm command but older systems like RHEL5 don't.
>> diff --git a/tests/btrfs/318 b/tests/btrfs/318
>> new file mode 100755
>> index 000000000..796f09d13
>> --- /dev/null
>> +++ b/tests/btrfs/318
>> @@ -0,0 +1,108 @@
>> +#! /bin/bash
>> +# SPDX-License-Identifier: GPL-2.0
>> +# Copyright (C) 2024 Meta, Inc. All Rights Reserved.
>> +#
>> +# FS QA Test 318
>> +#
>> +# Test an edge case of multi device volume management in btrfs.
>> +# If a device changes devt between mounts of a multi device fs, we can trick
>> +# btrfs into mounting the same device twice fully (not as a bind mount). From
>> +# there, it is trivial to induce corruption.
>> +#
>> +. ./common/preamble
>> +_begin_fstest auto quick volume scrub tempfsid
>> +
>> +_fixed_by_kernel_commit XXXXXXXXXXXX \
>> + "btrfs: validate device maj:min during open"
>> +
>> +# real QA test starts here
>> +_supported_fs btrfs
>> +_require_test
>> +_require_command "$PARTED_PROG" parted
>> +_require_batched_discard "$TEST_DIR"
>> +
>> +_cleanup() {
>> + cd /
>> + $UMOUNT_PROG $MNT
>> + $UMOUNT_PROG $BIND
>> + losetup -d $DEV0
>> + losetup -d $DEV1
>> + losetup -d $DEV2
>> + rm $IMG0
>> + rm $IMG1
>> + rm $IMG2
>
> losetup -d $DEV0 $DEV1 $DEV2
> rm -f $IMG0 $IMG1 $IMG2
>
>> +}
>> +
>> +IMG0=$TEST_DIR/$$.img0
>> +IMG1=$TEST_DIR/$$.img1
>> +IMG2=$TEST_DIR/$$.img2
>> +truncate -s 1G $IMG0
>> +truncate -s 1G $IMG1
>> +truncate -s 1G $IMG2
>> +DEV0=$(losetup -f $IMG0 --show)
>> +DEV1=$(losetup -f $IMG1 --show)
>> +DEV2=$(losetup -f $IMG2 --show)
>
> _create_loop_device ?
>
>
>
>> +D0P1=$DEV0"p1"
>> +D1P1=$DEV1"p1"
>> +MNT=$TEST_DIR/mnt
>> +BIND=$TEST_DIR/bind
>
> Not sure if these two directories will be taken, better to remove
> them at first. And use "$seq" (or others) to be a prefix or suffix,
> e.g.
>
> $TEST_DIR/mnt-${seq}
>
> Others look good to me.
>
I have made these changes locally and included them for the PR.
Thanks.
--------
diff --git a/tests/btrfs/312 b/tests/btrfs/312
index d30e312f8689..89b548a0ad63 100755
--- a/tests/btrfs/312
+++ b/tests/btrfs/312
@@ -25,12 +25,9 @@ _cleanup() {
cd /
$UMOUNT_PROG $MNT
$UMOUNT_PROG $BIND
- losetup -d $DEV0
- losetup -d $DEV1
- losetup -d $DEV2
- rm $IMG0
- rm $IMG1
- rm $IMG2
+ losetup -d $DEV0 $DEV1 $DEV2
+ rm -f $IMG0 $IMG1 $IMG2
+ rm -rf $MNT $BIND
}
IMG0=$TEST_DIR/$$.img0
@@ -39,13 +36,13 @@ IMG2=$TEST_DIR/$$.img2
truncate -s 1G $IMG0
truncate -s 1G $IMG1
truncate -s 1G $IMG2
-DEV0=$(losetup -f $IMG0 --show)
-DEV1=$(losetup -f $IMG1 --show)
-DEV2=$(losetup -f $IMG2 --show)
+DEV0=$(_create_loop_device $IMG0)
+DEV1=$(_create_loop_device $IMG1)
+DEV2=$(_create_loop_device $IMG2)
D0P1=$DEV0"p1"
D1P1=$DEV1"p1"
-MNT=$TEST_DIR/mnt
-BIND=$TEST_DIR/bind
+MNT=$TEST_DIR/mnt-${seq}
+BIND=$TEST_DIR/bind-${seq}
# Setup partition table with one partition on each device.
$PARTED_PROG $DEV0 'mktable gpt' --script
@@ -59,6 +56,7 @@ $MKFS_BTRFS_PROG -f -msingle -dsingle $D0P1 $DEV2
>>$seqres.full 2>&1 || _fail "
# Cycle mount the two device fs to populate both devices into the
# stale device cache.
+rm -rf $MNT
mkdir -p $MNT
_mount $D0P1 $MNT
$UMOUNT_PROG $MNT
@@ -76,6 +74,7 @@ _mount $D0P1 $MNT
$BTRFS_UTIL_PROG device remove $DEV2 $MNT
# Create the would be bind mount.
+rm -rf $BIND
mkdir -p $BIND
_mount $D0P1 $BIND
mount_show=$($BTRFS_UTIL_PROG filesystem show $MNT)
> Thanks,
> Zorro
>
>
>> +
>> +# Setup partition table with one partition on each device.
>> +$PARTED_PROG $DEV0 'mktable gpt' --script
>> +$PARTED_PROG $DEV1 'mktable gpt' --script
>> +$PARTED_PROG $DEV0 'mkpart mypart 1M 100%' --script
>> +$PARTED_PROG $DEV1 'mkpart mypart 1M 100%' --script
>> +
>> +# mkfs with two devices to avoid clearing devices on close
>> +# single raid to allow removing DEV2.
>> +$MKFS_BTRFS_PROG -f -msingle -dsingle $D0P1 $DEV2 >>$seqres.full 2>&1 || _fail "failed to mkfs.btrfs"
>> +
>> +# Cycle mount the two device fs to populate both devices into the
>> +# stale device cache.
>> +mkdir -p $MNT
>> +_mount $D0P1 $MNT
>> +$UMOUNT_PROG $MNT
>> +
>> +# Swap the partition dev_ts. This leaves the dev_t in the cache out of date.
>> +$PARTED_PROG $DEV0 'rm 1' --script
>> +$PARTED_PROG $DEV1 'rm 1' --script
>> +$PARTED_PROG $DEV1 'mkpart mypart 1M 100%' --script
>> +$PARTED_PROG $DEV0 'mkpart mypart 1M 100%' --script
>> +
>> +# Mount with mismatched dev_t!
>> +_mount $D0P1 $MNT
>> +
>> +# Remove the extra device to bring temp-fsid back in the fray.
>> +$BTRFS_UTIL_PROG device remove $DEV2 $MNT
>> +
>> +# Create the would be bind mount.
>> +mkdir -p $BIND
>> +_mount $D0P1 $BIND
>> +mount_show=$($BTRFS_UTIL_PROG filesystem show $MNT)
>> +bind_show=$($BTRFS_UTIL_PROG filesystem show $BIND)
>> +# If they're different, we are in trouble.
>> +[ "$mount_show" = "$bind_show" ] || echo "$mount_show != $bind_show"
>> +
>> +# Now really prove it by corrupting the first mount with the second.
>> +for i in $(seq 20); do
>> + $XFS_IO_PROG -f -c "pwrite 0 50M" $MNT/foo.$i >>$seqres.full 2>&1
>> +done
>> +for i in $(seq 20); do
>> + $XFS_IO_PROG -f -c "pwrite 0 50M" $BIND/foo.$i >>$seqres.full 2>&1
>> +done
>> +
>> +# sync so that we really write the large file data out to the shared device
>> +sync
>> +
>> +# now delete from one view of the shared device
>> +find $BIND -type f -delete
>> +# sync so that fstrim definitely has deleted data to trim
>> +sync
>> +# This should blow up both mounts, if the writes somehow didn't overlap at all.
>> +$FSTRIM_PROG $BIND
>> +# drop caches to improve the odds we read from the corrupted device while scrubbing.
>> +echo 3 > /proc/sys/vm/drop_caches
>> +$BTRFS_UTIL_PROG scrub start -B $MNT | grep "Error summary:"
>> +
>> +status=0
>> +exit
>> diff --git a/tests/btrfs/318.out b/tests/btrfs/318.out
>> new file mode 100644
>> index 000000000..2798c4ba7
>> --- /dev/null
>> +++ b/tests/btrfs/318.out
>> @@ -0,0 +1,2 @@
>> +QA output created by 318
>> +Error summary: no errors found
>> --
>> 2.43.0
>>
>>
>
prev parent reply other threads:[~2024-04-09 10:29 UTC|newest]
Thread overview: 3+ messages / expand[flat|nested] mbox.gz Atom feed top
2024-03-11 19:13 [PATCH v4] btrfs: new test for devt change between mounts Boris Burkov
2024-04-03 11:20 ` Zorro Lang
2024-04-09 10:29 ` Anand Jain [this message]
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=1939cd62-770b-429a-a261-c130f78bb7ea@oracle.com \
--to=anand.jain@oracle.com \
--cc=boris@bur.io \
--cc=fstests@vger.kernel.org \
--cc=kernel-team@fb.com \
--cc=linux-btrfs@vger.kernel.org \
--cc=zlang@redhat.com \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for read-only IMAP folder(s) and NNTP newsgroup(s).