Re: [RFC PATCH 03/11] rust: block: introduce `kernel::block::mq` module

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



Hi Benno,

Benno Lossin <benno.lossin@xxxxxxxxx> writes:

<...>

>> diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs
>> new file mode 100644
>> index 000000000000..50496af15bbf
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/gen_disk.rs
>> @@ -0,0 +1,133 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! GenDisk abstraction
>> +//!
>> +//! C header: [`include/linux/blkdev.h`](../../include/linux/blkdev.h)
>> +//! C header: [`include/linux/blk_mq.h`](../../include/linux/blk_mq.h)
>> +
>> +use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet};
>> +use crate::{
>> +    bindings, error::from_err_ptr, error::Result, sync::Arc, types::ForeignOwnable,
>> +    types::ScopeGuard,
>> +};
>> +use core::fmt::{self, Write};
>> +
>> +/// A generic block device
>> +///
>> +/// # Invariants
>> +///
>> +///  - `gendisk` must always point to an initialized and valid `struct gendisk`.
>> +pub struct GenDisk<T: Operations> {
>> +    _tagset: Arc<TagSet<T>>,
>> +    gendisk: *mut bindings::gendisk,
>
> Why are these two fields not embedded? Shouldn't the user decide where
> to allocate?

The `TagSet` can be shared between multiple `GenDisk`. Using an `Arc`
seems resonable?

For the `gendisk` field, the allocation is done by C and the address
must be stable. We are owning the pointee and must drop it when it goes out
of scope. I could do this:

#[repr(transparent)]
struct GenDisk(Opaque<bindings::gendisk>);

struct UniqueGenDiskRef {
    _tagset: Arc<TagSet<T>>,
    gendisk: Pin<&'static mut GenDisk>,

}

but it seems pointless. `struct GenDisk` would not be pub in that case. What do you think?

>
>> +}
>> +
>> +// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to a
>> +// `TagSet` It is safe to send this to other threads as long as T is Send.
>> +unsafe impl<T: Operations + Send> Send for GenDisk<T> {}
>> +
>> +impl<T: Operations> GenDisk<T> {
>> +    /// Try to create a new `GenDisk`
>> +    pub fn try_new(tagset: Arc<TagSet<T>>, queue_data: T::QueueData) -> Result<Self> {
>> +        let data = queue_data.into_foreign();
>> +        let recover_data = ScopeGuard::new(|| {
>> +            // SAFETY: T::QueueData was created by the call to `into_foreign()` above
>> +            unsafe { T::QueueData::from_foreign(data) };
>> +        });
>> +
>> +        let lock_class_key = crate::sync::LockClassKey::new();
>> +
>> +        // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set
>> +        let gendisk = from_err_ptr(unsafe {
>> +            bindings::__blk_mq_alloc_disk(tagset.raw_tag_set(), data as _, lock_class_key.as_ptr())
>
> Avoid `as _` casts.

👍

>
>> +        })?;
>> +
>> +        const TABLE: bindings::block_device_operations = bindings::block_device_operations {
>> +            submit_bio: None,
>> +            open: None,
>> +            release: None,
>> +            ioctl: None,
>> +            compat_ioctl: None,
>> +            check_events: None,
>> +            unlock_native_capacity: None,
>> +            getgeo: None,
>> +            set_read_only: None,
>> +            swap_slot_free_notify: None,
>> +            report_zones: None,
>> +            devnode: None,
>> +            alternative_gpt_sector: None,
>> +            get_unique_id: None,
>> +            owner: core::ptr::null_mut(),
>> +            pr_ops: core::ptr::null_mut(),
>> +            free_disk: None,
>> +            poll_bio: None,
>> +        };
>> +
>> +        // SAFETY: gendisk is a valid pointer as we initialized it above
>> +        unsafe { (*gendisk).fops = &TABLE };
>> +
>> +        recover_data.dismiss();
>> +        Ok(Self {
>> +            _tagset: tagset,
>> +            gendisk,
>> +        })
>> +    }
>> +
>> +    /// Set the name of the device
>> +    pub fn set_name(&self, args: fmt::Arguments<'_>) -> Result {
>> +        let mut raw_writer = RawWriter::from_array(unsafe { &mut (*self.gendisk).disk_name });
>
> Missing `SAFETY` also see below.

Yes, I have a few of those. Will add for next version.

>
>> +        raw_writer.write_fmt(args)?;
>> +        raw_writer.write_char('\0')?;
>> +        Ok(())
>> +    }
>> +
>> +    /// Register the device with the kernel. When this function return, the
>> +    /// device is accessible from VFS. The kernel may issue reads to the device
>> +    /// during registration to discover partition infomation.
>> +    pub fn add(&self) -> Result {
>> +        crate::error::to_result(unsafe {
>> +            bindings::device_add_disk(core::ptr::null_mut(), self.gendisk, core::ptr::null_mut())
>> +        })
>> +    }
>> +
>> +    /// Call to tell the block layer the capcacity of the device
>> +    pub fn set_capacity(&self, sectors: u64) {
>> +        unsafe { bindings::set_capacity(self.gendisk, sectors) };
>> +    }
>> +
>> +    /// Set the logical block size of the device
>> +    pub fn set_queue_logical_block_size(&self, size: u32) {
>> +        unsafe { bindings::blk_queue_logical_block_size((*self.gendisk).queue, size) };
>> +    }
>> +
>> +    /// Set the physical block size of the device
>
> What does this *do*? I do not think the doc string gives any meaningful
> information not present in the function name (this might just be,
> because I have no idea of what this is and anyone with just a little
> more knowledge would know, but I still wanted to mention it).

I'll add some more context.

>
>> +    pub fn set_queue_physical_block_size(&self, size: u32) {
>> +        unsafe { bindings::blk_queue_physical_block_size((*self.gendisk).queue, size) };
>> +    }
>> +
>> +    /// Set the rotational media attribute for the device
>> +    pub fn set_rotational(&self, rotational: bool) {
>> +        if !rotational {
>> +            unsafe {
>> +                bindings::blk_queue_flag_set(bindings::QUEUE_FLAG_NONROT, (*self.gendisk).queue)
>> +            };
>> +        } else {
>> +            unsafe {
>> +                bindings::blk_queue_flag_clear(bindings::QUEUE_FLAG_NONROT, (*self.gendisk).queue)
>> +            };
>> +        }
>> +    }
>> +}
>> +
>> +impl<T: Operations> Drop for GenDisk<T> {
>> +    fn drop(&mut self) {
>> +        let queue_data = unsafe { (*(*self.gendisk).queue).queuedata };
>> +
>> +        unsafe { bindings::del_gendisk(self.gendisk) };
>> +
>> +        // SAFETY: `queue.queuedata` was created by `GenDisk::try_new()` with a
>> +        // call to `ForeignOwnable::into_pointer()` to create `queuedata`.
>> +        // `ForeignOwnable::from_foreign()` is only called here.
>> +        let _queue_data = unsafe { T::QueueData::from_foreign(queue_data) };
>> +    }
>> +}
>> diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs
>> new file mode 100644
>> index 000000000000..fb1ab707d1f0
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/operations.rs
>> @@ -0,0 +1,260 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This module provides an interface for blk-mq drivers to implement.
>> +//!
>> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h)
>> +
>> +use crate::{
>> +    bindings,
>> +    block::mq::{tag_set::TagSetRef, Request},
>> +    error::{from_result, Result},
>> +    types::ForeignOwnable,
>> +};
>> +use core::{marker::PhantomData, pin::Pin};
>> +
>> +/// Implement this trait to interface blk-mq as block devices
>> +#[macros::vtable]
>> +pub trait Operations: Sized {
>
> Is this trait really safe? Are there **no** requirements for e.g.
> `QueueData`? So could I use `Box<()>`?

Yes, it is intended to be safe. `ForeignOwnable` covers safety
requirements for these associated data types.

>
>> +    /// Data associated with a request. This data is located next to the request
>> +    /// structure.
>> +    type RequestData;
>> +
>> +    /// Data associated with the `struct request_queue` that is allocated for
>> +    /// the `GenDisk` associated with this `Operations` implementation.
>> +    type QueueData: ForeignOwnable;
>> +
>> +    /// Data associated with a dispatch queue. This is stored as a pointer in
>> +    /// `struct blk_mq_hw_ctx`.
>> +    type HwData: ForeignOwnable;
>> +
>> +    /// Data associated with a tag set. This is stored as a pointer in `struct
>> +    /// blk_mq_tag_set`.
>> +    type TagSetData: ForeignOwnable;
>> +
>> +    /// Called by the kernel to allocate a new `RequestData`. The structure will
>> +    /// eventually be pinned, so defer initialization to `init_request_data()`
>> +    fn new_request_data(
>> +        _tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>,
>> +    ) -> Result<Self::RequestData>;
>> +
>> +    /// Called by the kernel to initialize a previously allocated `RequestData`
>> +    fn init_request_data(
>> +        _tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>,
>> +        _data: Pin<&mut Self::RequestData>,
>> +    ) -> Result {
>> +        Ok(())
>> +    }
>> +
>> +    /// Called by the kernel to queue a request with the driver. If `is_last` is
>> +    /// `false`, the driver is allowed to defer commiting the request.
>> +    fn queue_rq(
>> +        hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>,
>> +        queue_data: <Self::QueueData as ForeignOwnable>::Borrowed<'_>,
>> +        rq: &Request<Self>,
>> +        is_last: bool,
>> +    ) -> Result;
>> +
>> +    /// Called by the kernel to indicate that queued requests should be submitted
>> +    fn commit_rqs(
>> +        hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>,
>> +        queue_data: <Self::QueueData as ForeignOwnable>::Borrowed<'_>,
>> +    );
>> +
>> +    /// Called by the kernel when the request is completed
>> +    fn complete(_rq: &Request<Self>);
>> +
>> +    /// Called by the kernel to allocate and initialize a driver specific hardware context data
>> +    fn init_hctx(
>> +        tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>,
>> +        hctx_idx: u32,
>> +    ) -> Result<Self::HwData>;
>> +
>> +    /// Called by the kernel to poll the device for completed requests. Only used for poll queues.
>> +    fn poll(_hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>) -> i32 {
>> +        unreachable!()
>
> Why are these implemented this way? Should this really panic? Maybe
> return an error? Why `i32` as the return type? If it can error it should
> be `Result<u32>`.

I will update in accordance with the new documentation for `#[vtable]`.

Return type should be `bool`, I will change it. It inherited the int
from `core::ffi::c_int`.

>
>> +    }
>> +
>> +    /// Called by the kernel to map submission queues to CPU cores.
>> +    fn map_queues(_tag_set: &TagSetRef) {
>> +        unreachable!()
>> +    }
>> +
>> +    // There is no need for exit_request() because `drop` will be called.
>> +}
>> +
>> +pub(crate) struct OperationsVtable<T: Operations>(PhantomData<T>);
>> +
>> +impl<T: Operations> OperationsVtable<T> {
>> +    // # Safety
>> +    //
>> +    // The caller of this function must ensure that `hctx` and `bd` are valid
>> +    // and initialized. The pointees must outlive this function. Further
>> +    // `hctx->driver_data` must be a pointer created by a call to
>> +    // `Self::init_hctx_callback()` and the pointee must outlive this function.
>> +    // This function must not be called with a `hctx` for which
>> +    // `Self::exit_hctx_callback()` has been called.
>> +    unsafe extern "C" fn queue_rq_callback(
>> +        hctx: *mut bindings::blk_mq_hw_ctx,
>> +        bd: *const bindings::blk_mq_queue_data,
>> +    ) -> bindings::blk_status_t {
>> +        // SAFETY: `bd` is valid as required by the safety requirement for this function.
>> +        let rq = unsafe { (*bd).rq };
>> +
>> +        // SAFETY: The safety requirement for this function ensure that
>> +        // `(*hctx).driver_data` was returned by a call to
>> +        // `Self::init_hctx_callback()`. That function uses
>> +        // `PointerWrapper::into_pointer()` to create `driver_data`. Further,
>> +        // the returned value does not outlive this function and
>> +        // `from_foreign()` is not called until `Self::exit_hctx_callback()` is
>> +        // called. By the safety requirement of this function and contract with
>> +        // the `blk-mq` API, `queue_rq_callback()` will not be called after that
>> +        // point.
>
> This safety section and the others here are rather long and mostly
> repeat themselves. Is it possible to put this in its own module and
> explain the safety invariants in that module and then in these safety
> sections just refer to some labels from that section?
>
> I think we should discuss this in our next meeting.

Not sure about the best way to do this. Lets talk.

<...>

>> +
>> +    pub(crate) const unsafe fn build() -> &'static bindings::blk_mq_ops {
>> +        &Self::VTABLE
>> +    }
>
> Why is this function `unsafe`?

I don't think it needs to be unsafe, thanks.

>
>> +}
>
> Some `# Safety` and `SAFETY` missing in this hunk.
>
>> diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs
>> new file mode 100644
>> index 000000000000..25c16ee0b1f7
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/raw_writer.rs
>> @@ -0,0 +1,30 @@
>> +use core::fmt::{self, Write};
>> +
>> +pub(crate) struct RawWriter {
>> +    ptr: *mut u8,
>> +    len: usize,
>> +}
>> +
>> +impl RawWriter {
>> +    unsafe fn new(ptr: *mut u8, len: usize) -> Self {
>> +        Self { ptr, len }
>> +    }
>> +
>> +    pub(crate) fn from_array<const N: usize>(a: &mut [core::ffi::c_char; N]) -> Self {
>> +        unsafe { Self::new(&mut a[0] as *mut _ as _, N) }
>> +    }
>
> This function needs to be `unsafe`, because it never captures the
> lifetime of `a`. I can write:
>     let mut a = Box::new([0; 10]);
>     let mut writer = RawWriter::from_array(&mut *a);
>     drop(a);
>     writer.write_str("Abc"); // UAF
> Alternatively add a lifetime to `RawWriter`.

Yes, a lifetime is missing in RawWriter, thanks.

>
>> +}
>> +
>> +impl Write for RawWriter {
>> +    fn write_str(&mut self, s: &str) -> fmt::Result {
>> +        let bytes = s.as_bytes();
>> +        let len = bytes.len();
>> +        if len > self.len {
>> +            return Err(fmt::Error);
>> +        }
>> +        unsafe { core::ptr::copy_nonoverlapping(&bytes[0], self.ptr, len) };
>> +        self.ptr = unsafe { self.ptr.add(len) };
>> +        self.len -= len;
>> +        Ok(())
>> +    }
>> +}
>> diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs
>> new file mode 100644
>> index 000000000000..e95ae3fd71ad
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/request.rs
>> @@ -0,0 +1,71 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This module provides a wrapper for the C `struct request` type.
>> +//!
>> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h)
>> +
>> +use crate::{
>> +    bindings,
>> +    block::mq::Operations,
>> +    error::{Error, Result},
>> +};
>> +use core::marker::PhantomData;
>> +
>> +/// A wrapper around a blk-mq `struct request`. This represents an IO request.
>> +pub struct Request<T: Operations> {
>> +    ptr: *mut bindings::request,
>
> Why is this not embedded?

I have changed it to `struct Request(Opaque<bindings::request>)` for
next version 👍

>
>> +    _p: PhantomData<T>,
>> +}
>> +
>> +impl<T: Operations> Request<T> {
>> +    pub(crate) unsafe fn from_ptr(ptr: *mut bindings::request) -> Self {
>> +        Self {
>> +            ptr,
>> +            _p: PhantomData,
>> +        }
>> +    }
>> +
>> +    /// Get the command identifier for the request
>> +    pub fn command(&self) -> u32 {
>> +        unsafe { (*self.ptr).cmd_flags & ((1 << bindings::REQ_OP_BITS) - 1) }
>> +    }
>> +
>> +    /// Call this to indicate to the kernel that the request has been issued by the driver
>> +    pub fn start(&self) {
>> +        unsafe { bindings::blk_mq_start_request(self.ptr) };
>> +    }
>> +
>> +    /// Call this to indicate to the kernel that the request has been completed without errors
>> +    // TODO: Consume rq so that we can't use it after ending it?
>> +    pub fn end_ok(&self) {
>> +        unsafe { bindings::blk_mq_end_request(self.ptr, bindings::BLK_STS_OK as _) };
>> +    }
>> +
>> +    /// Call this to indicate to the kernel that the request completed with an error
>> +    pub fn end_err(&self, err: Error) {
>> +        unsafe { bindings::blk_mq_end_request(self.ptr, err.to_blk_status()) };
>> +    }
>> +
>> +    /// Call this to indicate that the request completed with the status indicated by `status`
>> +    pub fn end(&self, status: Result) {
>> +        if let Err(e) = status {
>> +            self.end_err(e);
>> +        } else {
>> +            self.end_ok();
>> +        }
>> +    }
>> +
>> +    /// Call this to schedule defered completion of the request
>> +    // TODO: Consume rq so that we can't use it after completing it?
>> +    pub fn complete(&self) {
>> +        if !unsafe { bindings::blk_mq_complete_request_remote(self.ptr) } {
>> +            T::complete(&unsafe { Self::from_ptr(self.ptr) });
>> +        }
>> +    }
>> +
>> +    /// Get the target sector for the request
>> +    #[inline(always)]
>
> Why is this `inline(always)`?

Compiler would not inline from kernel crate to modules without this. I
will check if this is still the case.

>
>> +    pub fn sector(&self) -> usize {
>> +        unsafe { (*self.ptr).__sector as usize }
>> +    }
>> +}
>> diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs
>> new file mode 100644
>> index 000000000000..d122db7f6d0e
>> --- /dev/null
>> +++ b/rust/kernel/block/mq/tag_set.rs
>> @@ -0,0 +1,92 @@
>> +// SPDX-License-Identifier: GPL-2.0
>> +
>> +//! This module provides the `TagSet` struct to wrap the C `struct blk_mq_tag_set`.
>> +//!
>> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h)
>> +
>> +use crate::{
>> +    bindings,
>> +    block::mq::{operations::OperationsVtable, Operations},
>> +    error::{Error, Result},
>> +    sync::Arc,
>> +    types::ForeignOwnable,
>> +};
>> +use core::{cell::UnsafeCell, convert::TryInto, marker::PhantomData};
>> +
>> +/// A wrapper for the C `struct blk_mq_tag_set`
>> +pub struct TagSet<T: Operations> {
>> +    inner: UnsafeCell<bindings::blk_mq_tag_set>,
>> +    _p: PhantomData<T>,
>> +}
>> +
>> +impl<T: Operations> TagSet<T> {
>> +    /// Try to create a new tag set
>> +    pub fn try_new(
>> +        nr_hw_queues: u32,
>> +        tagset_data: T::TagSetData,
>> +        num_tags: u32,
>> +        num_maps: u32,
>> +    ) -> Result<Arc<Self>> {
>
> Why force the users to use `Arc`?

Changed to return a `PinInit<TagSet>` for next version.

>
>> +        let tagset = Arc::try_new(Self {
>> +            inner: UnsafeCell::new(bindings::blk_mq_tag_set::default()),
>> +            _p: PhantomData,
>> +        })?;
>> +
>> +        // SAFETY: We just allocated `tagset`, we know this is the only reference to it.
>> +        let inner = unsafe { &mut *tagset.inner.get() };
>> +
>> +        inner.ops = unsafe { OperationsVtable::<T>::build() };
>> +        inner.nr_hw_queues = nr_hw_queues;
>> +        inner.timeout = 0; // 0 means default which is 30 * HZ in C
>> +        inner.numa_node = bindings::NUMA_NO_NODE;
>> +        inner.queue_depth = num_tags;
>> +        inner.cmd_size = core::mem::size_of::<T::RequestData>().try_into()?;
>> +        inner.flags = bindings::BLK_MQ_F_SHOULD_MERGE;
>> +        inner.driver_data = tagset_data.into_foreign() as _;
>> +        inner.nr_maps = num_maps;
>> +
>> +        // SAFETY: `inner` points to valid and initialised memory.
>> +        let ret = unsafe { bindings::blk_mq_alloc_tag_set(inner) };
>> +        if ret < 0 {
>> +            // SAFETY: We created `driver_data` above with `into_foreign`
>> +            unsafe { T::TagSetData::from_foreign(inner.driver_data) };
>> +            return Err(Error::from_errno(ret));
>> +        }
>> +
>> +        Ok(tagset)
>> +    }
>> +
>> +    /// Return the pointer to the wrapped `struct blk_mq_tag_set`
>> +    pub(crate) fn raw_tag_set(&self) -> *mut bindings::blk_mq_tag_set {
>> +        self.inner.get()
>> +    }
>> +}
>> +
>> +impl<T: Operations> Drop for TagSet<T> {
>> +    fn drop(&mut self) {
>> +        let tagset_data = unsafe { (*self.inner.get()).driver_data };
>> +
>> +        // SAFETY: `inner` is valid and has been properly initialised during construction.
>> +        unsafe { bindings::blk_mq_free_tag_set(self.inner.get()) };
>> +
>> +        // SAFETY: `tagset_data` was created by a call to
>> +        // `ForeignOwnable::into_foreign` in `TagSet::try_new()`
>> +        unsafe { T::TagSetData::from_foreign(tagset_data) };
>> +    }
>> +}
>> +
>> +/// A tag set reference. Used to control lifetime and prevent drop of TagSet references passed to
>> +/// `Operations::map_queues()`
>> +pub struct TagSetRef {
>> +    ptr: *mut bindings::blk_mq_tag_set,
>> +}
>> +
>> +impl TagSetRef {
>> +    pub(crate) unsafe fn from_ptr(tagset: *mut bindings::blk_mq_tag_set) -> Self {
>> +        Self { ptr: tagset }
>> +    }
>> +
>> +    pub fn ptr(&self) -> *mut bindings::blk_mq_tag_set {
>> +        self.ptr
>> +    }
>> +}
>
> This is a **very** thin abstraction, why is it needed?

It is not. I changed it to `&TagSet`, thanks.


Thanks for the comments!

Best regards,
Andreas





[Index of Archives]     [Linux RAID]     [Linux SCSI]     [Linux ATA RAID]     [IDE]     [Linux Wireless]     [Linux Kernel]     [ATH6KL]     [Linux Bluetooth]     [Linux Netdev]     [Kernel Newbies]     [Security]     [Git]     [Netfilter]     [Bugtraq]     [Yosemite News]     [MIPS Linux]     [ARM Linux]     [Linux Security]     [Device Mapper]

  Powered by Linux