Hi Benno, Benno Lossin <benno.lossin@xxxxxxxxx> writes: <...> >> diff --git a/rust/kernel/block/mq/gen_disk.rs b/rust/kernel/block/mq/gen_disk.rs >> new file mode 100644 >> index 000000000000..50496af15bbf >> --- /dev/null >> +++ b/rust/kernel/block/mq/gen_disk.rs >> @@ -0,0 +1,133 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> + >> +//! GenDisk abstraction >> +//! >> +//! C header: [`include/linux/blkdev.h`](../../include/linux/blkdev.h) >> +//! C header: [`include/linux/blk_mq.h`](../../include/linux/blk_mq.h) >> + >> +use crate::block::mq::{raw_writer::RawWriter, Operations, TagSet}; >> +use crate::{ >> + bindings, error::from_err_ptr, error::Result, sync::Arc, types::ForeignOwnable, >> + types::ScopeGuard, >> +}; >> +use core::fmt::{self, Write}; >> + >> +/// A generic block device >> +/// >> +/// # Invariants >> +/// >> +/// - `gendisk` must always point to an initialized and valid `struct gendisk`. >> +pub struct GenDisk<T: Operations> { >> + _tagset: Arc<TagSet<T>>, >> + gendisk: *mut bindings::gendisk, > > Why are these two fields not embedded? Shouldn't the user decide where > to allocate? The `TagSet` can be shared between multiple `GenDisk`. Using an `Arc` seems resonable? For the `gendisk` field, the allocation is done by C and the address must be stable. We are owning the pointee and must drop it when it goes out of scope. I could do this: #[repr(transparent)] struct GenDisk(Opaque<bindings::gendisk>); struct UniqueGenDiskRef { _tagset: Arc<TagSet<T>>, gendisk: Pin<&'static mut GenDisk>, } but it seems pointless. `struct GenDisk` would not be pub in that case. What do you think? > >> +} >> + >> +// SAFETY: `GenDisk` is an owned pointer to a `struct gendisk` and an `Arc` to a >> +// `TagSet` It is safe to send this to other threads as long as T is Send. >> +unsafe impl<T: Operations + Send> Send for GenDisk<T> {} >> + >> +impl<T: Operations> GenDisk<T> { >> + /// Try to create a new `GenDisk` >> + pub fn try_new(tagset: Arc<TagSet<T>>, queue_data: T::QueueData) -> Result<Self> { >> + let data = queue_data.into_foreign(); >> + let recover_data = ScopeGuard::new(|| { >> + // SAFETY: T::QueueData was created by the call to `into_foreign()` above >> + unsafe { T::QueueData::from_foreign(data) }; >> + }); >> + >> + let lock_class_key = crate::sync::LockClassKey::new(); >> + >> + // SAFETY: `tagset.raw_tag_set()` points to a valid and initialized tag set >> + let gendisk = from_err_ptr(unsafe { >> + bindings::__blk_mq_alloc_disk(tagset.raw_tag_set(), data as _, lock_class_key.as_ptr()) > > Avoid `as _` casts. 👍 > >> + })?; >> + >> + const TABLE: bindings::block_device_operations = bindings::block_device_operations { >> + submit_bio: None, >> + open: None, >> + release: None, >> + ioctl: None, >> + compat_ioctl: None, >> + check_events: None, >> + unlock_native_capacity: None, >> + getgeo: None, >> + set_read_only: None, >> + swap_slot_free_notify: None, >> + report_zones: None, >> + devnode: None, >> + alternative_gpt_sector: None, >> + get_unique_id: None, >> + owner: core::ptr::null_mut(), >> + pr_ops: core::ptr::null_mut(), >> + free_disk: None, >> + poll_bio: None, >> + }; >> + >> + // SAFETY: gendisk is a valid pointer as we initialized it above >> + unsafe { (*gendisk).fops = &TABLE }; >> + >> + recover_data.dismiss(); >> + Ok(Self { >> + _tagset: tagset, >> + gendisk, >> + }) >> + } >> + >> + /// Set the name of the device >> + pub fn set_name(&self, args: fmt::Arguments<'_>) -> Result { >> + let mut raw_writer = RawWriter::from_array(unsafe { &mut (*self.gendisk).disk_name }); > > Missing `SAFETY` also see below. Yes, I have a few of those. Will add for next version. > >> + raw_writer.write_fmt(args)?; >> + raw_writer.write_char('\0')?; >> + Ok(()) >> + } >> + >> + /// Register the device with the kernel. When this function return, the >> + /// device is accessible from VFS. The kernel may issue reads to the device >> + /// during registration to discover partition infomation. >> + pub fn add(&self) -> Result { >> + crate::error::to_result(unsafe { >> + bindings::device_add_disk(core::ptr::null_mut(), self.gendisk, core::ptr::null_mut()) >> + }) >> + } >> + >> + /// Call to tell the block layer the capcacity of the device >> + pub fn set_capacity(&self, sectors: u64) { >> + unsafe { bindings::set_capacity(self.gendisk, sectors) }; >> + } >> + >> + /// Set the logical block size of the device >> + pub fn set_queue_logical_block_size(&self, size: u32) { >> + unsafe { bindings::blk_queue_logical_block_size((*self.gendisk).queue, size) }; >> + } >> + >> + /// Set the physical block size of the device > > What does this *do*? I do not think the doc string gives any meaningful > information not present in the function name (this might just be, > because I have no idea of what this is and anyone with just a little > more knowledge would know, but I still wanted to mention it). I'll add some more context. > >> + pub fn set_queue_physical_block_size(&self, size: u32) { >> + unsafe { bindings::blk_queue_physical_block_size((*self.gendisk).queue, size) }; >> + } >> + >> + /// Set the rotational media attribute for the device >> + pub fn set_rotational(&self, rotational: bool) { >> + if !rotational { >> + unsafe { >> + bindings::blk_queue_flag_set(bindings::QUEUE_FLAG_NONROT, (*self.gendisk).queue) >> + }; >> + } else { >> + unsafe { >> + bindings::blk_queue_flag_clear(bindings::QUEUE_FLAG_NONROT, (*self.gendisk).queue) >> + }; >> + } >> + } >> +} >> + >> +impl<T: Operations> Drop for GenDisk<T> { >> + fn drop(&mut self) { >> + let queue_data = unsafe { (*(*self.gendisk).queue).queuedata }; >> + >> + unsafe { bindings::del_gendisk(self.gendisk) }; >> + >> + // SAFETY: `queue.queuedata` was created by `GenDisk::try_new()` with a >> + // call to `ForeignOwnable::into_pointer()` to create `queuedata`. >> + // `ForeignOwnable::from_foreign()` is only called here. >> + let _queue_data = unsafe { T::QueueData::from_foreign(queue_data) }; >> + } >> +} >> diff --git a/rust/kernel/block/mq/operations.rs b/rust/kernel/block/mq/operations.rs >> new file mode 100644 >> index 000000000000..fb1ab707d1f0 >> --- /dev/null >> +++ b/rust/kernel/block/mq/operations.rs >> @@ -0,0 +1,260 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> + >> +//! This module provides an interface for blk-mq drivers to implement. >> +//! >> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h) >> + >> +use crate::{ >> + bindings, >> + block::mq::{tag_set::TagSetRef, Request}, >> + error::{from_result, Result}, >> + types::ForeignOwnable, >> +}; >> +use core::{marker::PhantomData, pin::Pin}; >> + >> +/// Implement this trait to interface blk-mq as block devices >> +#[macros::vtable] >> +pub trait Operations: Sized { > > Is this trait really safe? Are there **no** requirements for e.g. > `QueueData`? So could I use `Box<()>`? Yes, it is intended to be safe. `ForeignOwnable` covers safety requirements for these associated data types. > >> + /// Data associated with a request. This data is located next to the request >> + /// structure. >> + type RequestData; >> + >> + /// Data associated with the `struct request_queue` that is allocated for >> + /// the `GenDisk` associated with this `Operations` implementation. >> + type QueueData: ForeignOwnable; >> + >> + /// Data associated with a dispatch queue. This is stored as a pointer in >> + /// `struct blk_mq_hw_ctx`. >> + type HwData: ForeignOwnable; >> + >> + /// Data associated with a tag set. This is stored as a pointer in `struct >> + /// blk_mq_tag_set`. >> + type TagSetData: ForeignOwnable; >> + >> + /// Called by the kernel to allocate a new `RequestData`. The structure will >> + /// eventually be pinned, so defer initialization to `init_request_data()` >> + fn new_request_data( >> + _tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>, >> + ) -> Result<Self::RequestData>; >> + >> + /// Called by the kernel to initialize a previously allocated `RequestData` >> + fn init_request_data( >> + _tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>, >> + _data: Pin<&mut Self::RequestData>, >> + ) -> Result { >> + Ok(()) >> + } >> + >> + /// Called by the kernel to queue a request with the driver. If `is_last` is >> + /// `false`, the driver is allowed to defer commiting the request. >> + fn queue_rq( >> + hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>, >> + queue_data: <Self::QueueData as ForeignOwnable>::Borrowed<'_>, >> + rq: &Request<Self>, >> + is_last: bool, >> + ) -> Result; >> + >> + /// Called by the kernel to indicate that queued requests should be submitted >> + fn commit_rqs( >> + hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>, >> + queue_data: <Self::QueueData as ForeignOwnable>::Borrowed<'_>, >> + ); >> + >> + /// Called by the kernel when the request is completed >> + fn complete(_rq: &Request<Self>); >> + >> + /// Called by the kernel to allocate and initialize a driver specific hardware context data >> + fn init_hctx( >> + tagset_data: <Self::TagSetData as ForeignOwnable>::Borrowed<'_>, >> + hctx_idx: u32, >> + ) -> Result<Self::HwData>; >> + >> + /// Called by the kernel to poll the device for completed requests. Only used for poll queues. >> + fn poll(_hw_data: <Self::HwData as ForeignOwnable>::Borrowed<'_>) -> i32 { >> + unreachable!() > > Why are these implemented this way? Should this really panic? Maybe > return an error? Why `i32` as the return type? If it can error it should > be `Result<u32>`. I will update in accordance with the new documentation for `#[vtable]`. Return type should be `bool`, I will change it. It inherited the int from `core::ffi::c_int`. > >> + } >> + >> + /// Called by the kernel to map submission queues to CPU cores. >> + fn map_queues(_tag_set: &TagSetRef) { >> + unreachable!() >> + } >> + >> + // There is no need for exit_request() because `drop` will be called. >> +} >> + >> +pub(crate) struct OperationsVtable<T: Operations>(PhantomData<T>); >> + >> +impl<T: Operations> OperationsVtable<T> { >> + // # Safety >> + // >> + // The caller of this function must ensure that `hctx` and `bd` are valid >> + // and initialized. The pointees must outlive this function. Further >> + // `hctx->driver_data` must be a pointer created by a call to >> + // `Self::init_hctx_callback()` and the pointee must outlive this function. >> + // This function must not be called with a `hctx` for which >> + // `Self::exit_hctx_callback()` has been called. >> + unsafe extern "C" fn queue_rq_callback( >> + hctx: *mut bindings::blk_mq_hw_ctx, >> + bd: *const bindings::blk_mq_queue_data, >> + ) -> bindings::blk_status_t { >> + // SAFETY: `bd` is valid as required by the safety requirement for this function. >> + let rq = unsafe { (*bd).rq }; >> + >> + // SAFETY: The safety requirement for this function ensure that >> + // `(*hctx).driver_data` was returned by a call to >> + // `Self::init_hctx_callback()`. That function uses >> + // `PointerWrapper::into_pointer()` to create `driver_data`. Further, >> + // the returned value does not outlive this function and >> + // `from_foreign()` is not called until `Self::exit_hctx_callback()` is >> + // called. By the safety requirement of this function and contract with >> + // the `blk-mq` API, `queue_rq_callback()` will not be called after that >> + // point. > > This safety section and the others here are rather long and mostly > repeat themselves. Is it possible to put this in its own module and > explain the safety invariants in that module and then in these safety > sections just refer to some labels from that section? > > I think we should discuss this in our next meeting. Not sure about the best way to do this. Lets talk. <...> >> + >> + pub(crate) const unsafe fn build() -> &'static bindings::blk_mq_ops { >> + &Self::VTABLE >> + } > > Why is this function `unsafe`? I don't think it needs to be unsafe, thanks. > >> +} > > Some `# Safety` and `SAFETY` missing in this hunk. > >> diff --git a/rust/kernel/block/mq/raw_writer.rs b/rust/kernel/block/mq/raw_writer.rs >> new file mode 100644 >> index 000000000000..25c16ee0b1f7 >> --- /dev/null >> +++ b/rust/kernel/block/mq/raw_writer.rs >> @@ -0,0 +1,30 @@ >> +use core::fmt::{self, Write}; >> + >> +pub(crate) struct RawWriter { >> + ptr: *mut u8, >> + len: usize, >> +} >> + >> +impl RawWriter { >> + unsafe fn new(ptr: *mut u8, len: usize) -> Self { >> + Self { ptr, len } >> + } >> + >> + pub(crate) fn from_array<const N: usize>(a: &mut [core::ffi::c_char; N]) -> Self { >> + unsafe { Self::new(&mut a[0] as *mut _ as _, N) } >> + } > > This function needs to be `unsafe`, because it never captures the > lifetime of `a`. I can write: > let mut a = Box::new([0; 10]); > let mut writer = RawWriter::from_array(&mut *a); > drop(a); > writer.write_str("Abc"); // UAF > Alternatively add a lifetime to `RawWriter`. Yes, a lifetime is missing in RawWriter, thanks. > >> +} >> + >> +impl Write for RawWriter { >> + fn write_str(&mut self, s: &str) -> fmt::Result { >> + let bytes = s.as_bytes(); >> + let len = bytes.len(); >> + if len > self.len { >> + return Err(fmt::Error); >> + } >> + unsafe { core::ptr::copy_nonoverlapping(&bytes[0], self.ptr, len) }; >> + self.ptr = unsafe { self.ptr.add(len) }; >> + self.len -= len; >> + Ok(()) >> + } >> +} >> diff --git a/rust/kernel/block/mq/request.rs b/rust/kernel/block/mq/request.rs >> new file mode 100644 >> index 000000000000..e95ae3fd71ad >> --- /dev/null >> +++ b/rust/kernel/block/mq/request.rs >> @@ -0,0 +1,71 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> + >> +//! This module provides a wrapper for the C `struct request` type. >> +//! >> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h) >> + >> +use crate::{ >> + bindings, >> + block::mq::Operations, >> + error::{Error, Result}, >> +}; >> +use core::marker::PhantomData; >> + >> +/// A wrapper around a blk-mq `struct request`. This represents an IO request. >> +pub struct Request<T: Operations> { >> + ptr: *mut bindings::request, > > Why is this not embedded? I have changed it to `struct Request(Opaque<bindings::request>)` for next version 👍 > >> + _p: PhantomData<T>, >> +} >> + >> +impl<T: Operations> Request<T> { >> + pub(crate) unsafe fn from_ptr(ptr: *mut bindings::request) -> Self { >> + Self { >> + ptr, >> + _p: PhantomData, >> + } >> + } >> + >> + /// Get the command identifier for the request >> + pub fn command(&self) -> u32 { >> + unsafe { (*self.ptr).cmd_flags & ((1 << bindings::REQ_OP_BITS) - 1) } >> + } >> + >> + /// Call this to indicate to the kernel that the request has been issued by the driver >> + pub fn start(&self) { >> + unsafe { bindings::blk_mq_start_request(self.ptr) }; >> + } >> + >> + /// Call this to indicate to the kernel that the request has been completed without errors >> + // TODO: Consume rq so that we can't use it after ending it? >> + pub fn end_ok(&self) { >> + unsafe { bindings::blk_mq_end_request(self.ptr, bindings::BLK_STS_OK as _) }; >> + } >> + >> + /// Call this to indicate to the kernel that the request completed with an error >> + pub fn end_err(&self, err: Error) { >> + unsafe { bindings::blk_mq_end_request(self.ptr, err.to_blk_status()) }; >> + } >> + >> + /// Call this to indicate that the request completed with the status indicated by `status` >> + pub fn end(&self, status: Result) { >> + if let Err(e) = status { >> + self.end_err(e); >> + } else { >> + self.end_ok(); >> + } >> + } >> + >> + /// Call this to schedule defered completion of the request >> + // TODO: Consume rq so that we can't use it after completing it? >> + pub fn complete(&self) { >> + if !unsafe { bindings::blk_mq_complete_request_remote(self.ptr) } { >> + T::complete(&unsafe { Self::from_ptr(self.ptr) }); >> + } >> + } >> + >> + /// Get the target sector for the request >> + #[inline(always)] > > Why is this `inline(always)`? Compiler would not inline from kernel crate to modules without this. I will check if this is still the case. > >> + pub fn sector(&self) -> usize { >> + unsafe { (*self.ptr).__sector as usize } >> + } >> +} >> diff --git a/rust/kernel/block/mq/tag_set.rs b/rust/kernel/block/mq/tag_set.rs >> new file mode 100644 >> index 000000000000..d122db7f6d0e >> --- /dev/null >> +++ b/rust/kernel/block/mq/tag_set.rs >> @@ -0,0 +1,92 @@ >> +// SPDX-License-Identifier: GPL-2.0 >> + >> +//! This module provides the `TagSet` struct to wrap the C `struct blk_mq_tag_set`. >> +//! >> +//! C header: [`include/linux/blk-mq.h`](../../include/linux/blk-mq.h) >> + >> +use crate::{ >> + bindings, >> + block::mq::{operations::OperationsVtable, Operations}, >> + error::{Error, Result}, >> + sync::Arc, >> + types::ForeignOwnable, >> +}; >> +use core::{cell::UnsafeCell, convert::TryInto, marker::PhantomData}; >> + >> +/// A wrapper for the C `struct blk_mq_tag_set` >> +pub struct TagSet<T: Operations> { >> + inner: UnsafeCell<bindings::blk_mq_tag_set>, >> + _p: PhantomData<T>, >> +} >> + >> +impl<T: Operations> TagSet<T> { >> + /// Try to create a new tag set >> + pub fn try_new( >> + nr_hw_queues: u32, >> + tagset_data: T::TagSetData, >> + num_tags: u32, >> + num_maps: u32, >> + ) -> Result<Arc<Self>> { > > Why force the users to use `Arc`? Changed to return a `PinInit<TagSet>` for next version. > >> + let tagset = Arc::try_new(Self { >> + inner: UnsafeCell::new(bindings::blk_mq_tag_set::default()), >> + _p: PhantomData, >> + })?; >> + >> + // SAFETY: We just allocated `tagset`, we know this is the only reference to it. >> + let inner = unsafe { &mut *tagset.inner.get() }; >> + >> + inner.ops = unsafe { OperationsVtable::<T>::build() }; >> + inner.nr_hw_queues = nr_hw_queues; >> + inner.timeout = 0; // 0 means default which is 30 * HZ in C >> + inner.numa_node = bindings::NUMA_NO_NODE; >> + inner.queue_depth = num_tags; >> + inner.cmd_size = core::mem::size_of::<T::RequestData>().try_into()?; >> + inner.flags = bindings::BLK_MQ_F_SHOULD_MERGE; >> + inner.driver_data = tagset_data.into_foreign() as _; >> + inner.nr_maps = num_maps; >> + >> + // SAFETY: `inner` points to valid and initialised memory. >> + let ret = unsafe { bindings::blk_mq_alloc_tag_set(inner) }; >> + if ret < 0 { >> + // SAFETY: We created `driver_data` above with `into_foreign` >> + unsafe { T::TagSetData::from_foreign(inner.driver_data) }; >> + return Err(Error::from_errno(ret)); >> + } >> + >> + Ok(tagset) >> + } >> + >> + /// Return the pointer to the wrapped `struct blk_mq_tag_set` >> + pub(crate) fn raw_tag_set(&self) -> *mut bindings::blk_mq_tag_set { >> + self.inner.get() >> + } >> +} >> + >> +impl<T: Operations> Drop for TagSet<T> { >> + fn drop(&mut self) { >> + let tagset_data = unsafe { (*self.inner.get()).driver_data }; >> + >> + // SAFETY: `inner` is valid and has been properly initialised during construction. >> + unsafe { bindings::blk_mq_free_tag_set(self.inner.get()) }; >> + >> + // SAFETY: `tagset_data` was created by a call to >> + // `ForeignOwnable::into_foreign` in `TagSet::try_new()` >> + unsafe { T::TagSetData::from_foreign(tagset_data) }; >> + } >> +} >> + >> +/// A tag set reference. Used to control lifetime and prevent drop of TagSet references passed to >> +/// `Operations::map_queues()` >> +pub struct TagSetRef { >> + ptr: *mut bindings::blk_mq_tag_set, >> +} >> + >> +impl TagSetRef { >> + pub(crate) unsafe fn from_ptr(tagset: *mut bindings::blk_mq_tag_set) -> Self { >> + Self { ptr: tagset } >> + } >> + >> + pub fn ptr(&self) -> *mut bindings::blk_mq_tag_set { >> + self.ptr >> + } >> +} > > This is a **very** thin abstraction, why is it needed? It is not. I changed it to `&TagSet`, thanks. Thanks for the comments! Best regards, Andreas