On 5/30/22 15:48, David Hildenbrand wrote:
+
struct virtio_balloon {
struct virtio_device *vdev;
struct virtqueue *inflate_vq, *deflate_vq, *stats_vq, *free_page_vq;
@@ -126,6 +133,16 @@ struct virtio_balloon {
/* Free page reporting device */
struct virtqueue *reporting_vq;
struct page_reporting_dev_info pr_dev_info;
+
+ /* Memory recover VQ - VIRTIO_BALLOON_F_RECOVER */
+ struct virtqueue *recover_vq;
+ spinlock_t recover_vq_lock;
+ struct notifier_block memory_failure_nb;
+ struct list_head corrupted_page_list;
+ struct list_head recovered_page_list;
+ spinlock_t recover_page_list_lock;
+ struct __virtio_balloon_recover in_vbr;
+ struct work_struct unpoison_memory_work;
I assume we want all that only with CONFIG_MEMORY_FAILURE.
Sorry, I missed this.
};
static const struct virtio_device_id id_table[] = {
@@ -494,6 +511,198 @@ static void update_balloon_size_func(struct work_struct *work)
queue_work(system_freezable_wq, work);
}
+/*
+ * virtballoon_memory_failure - notified by memory failure, try to fix the
+ * corrupted page.
+ * The memory failure notifier is designed to call back when the kernel handled
+ * successfully only, WARN_ON_ONCE on the unlikely condition to find out any
+ * error(memory error handling is a best effort, not 100% coverd).
+ */
+static int virtballoon_memory_failure(struct notifier_block *notifier,
+ unsigned long pfn, void *parm)
+{
+ struct virtio_balloon *vb = container_of(notifier, struct virtio_balloon,
+ memory_failure_nb);
+ struct page *page;
+ struct __virtio_balloon_recover *out_vbr;
+ struct scatterlist sg;
+ unsigned long flags;
+ int err;
+
+ page = pfn_to_online_page(pfn);
+ if (WARN_ON_ONCE(!page))
+ return NOTIFY_DONE;
+
+ if (PageHuge(page))
+ return NOTIFY_DONE;
+
+ if (WARN_ON_ONCE(!PageHWPoison(page)))
+ return NOTIFY_DONE;
+
+ if (WARN_ON_ONCE(page_count(page) != 1))
+ return NOTIFY_DONE;
Relying on the page_count to be 1 for correctness is usually a bit
shaky, for example, when racing against isolate_movable_page() that
might temporarily bump upo the refcount.
The memory notifier is designed to call the chain if a page gets result
MF_RECOVERED only:
if (result == MF_RECOVERED)
blocking_notifier_call_chain(&mf_notifier_list, pfn, NULL);
+
+ get_page(page); /* balloon reference */
+
+ out_vbr = kzalloc(sizeof(*out_vbr), GFP_KERNEL);
Are we always guaranteed to be able to use GFP_KERNEL out of MCE
context? (IOW, are we never atomic?)
+ if (WARN_ON_ONCE(!out_vbr))
+ return NOTIFY_BAD;
+
+ spin_lock(&vb->recover_page_list_lock);
+ balloon_page_push(&vb->corrupted_page_list, page);
+ spin_unlock(&vb->recover_page_list_lock);
+
+ out_vbr->vbr.cmd = VIRTIO_BALLOON_R_CMD_RECOVER;
This makes me wonder if we should have a more generic guest->host
request queue, similar to what e.g., virtio-mem uses, instead of adding
a separate VIRTIO_BALLOON_VQ_RECOVER vq.
I'm OK with either one, I'll follow your decision! :D
+ set_page_pfns(vb, out_vbr->pfns, page);
+ sg_init_one(&sg, out_vbr, sizeof(*out_vbr));
+
+ spin_lock_irqsave(&vb->recover_vq_lock, flags);
+ err = virtqueue_add_outbuf(vb->recover_vq, &sg, 1, out_vbr, GFP_KERNEL);
+ if (unlikely(err)) {
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+ return NOTIFY_DONE;
+ }
+ virtqueue_kick(vb->recover_vq);
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+
+ return NOTIFY_OK;
+}
+
+static int recover_vq_get_response(struct virtio_balloon *vb)
+{
+ struct __virtio_balloon_recover *in_vbr;
+ struct scatterlist sg;
+ unsigned long flags;
+ int err;
+
+ spin_lock_irqsave(&vb->recover_vq_lock, flags);
+ in_vbr = &vb->in_vbr;
+ memset(in_vbr, 0x00, sizeof(*in_vbr));
+ sg_init_one(&sg, in_vbr, sizeof(*in_vbr));
+ err = virtqueue_add_inbuf(vb->recover_vq, &sg, 1, in_vbr, GFP_KERNEL);
+ if (unlikely(err)) {
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+ return err;
+ }
+
+ virtqueue_kick(vb->recover_vq);
+ spin_unlock_irqrestore(&vb->recover_vq_lock, flags);
+
+ return 0;
+}
+
+static void recover_vq_handle_response(struct virtio_balloon *vb, unsigned int len)
+{
+ struct __virtio_balloon_recover *in_vbr;
+ struct virtio_balloon_recover *vbr;
+ struct page *page;
+ unsigned int pfns;
+ u32 pfn0, pfn1;
+ __u8 status;
+
+ /* the response is not expected */
+ if (unlikely(len != sizeof(struct __virtio_balloon_recover)))
+ return;
+
+ in_vbr = &vb->in_vbr;
+ vbr = &in_vbr->vbr;
+ if (unlikely(vbr->cmd != VIRTIO_BALLOON_R_CMD_RESPONSE))
+ return;
+
+ /* to make sure the contiguous balloon PFNs */
+ for (pfns = 1; pfns < VIRTIO_BALLOON_PAGES_PER_PAGE; pfns++) {
+ pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns - 1]);
+ pfn1 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[pfns]);
+ if (pfn1 - pfn0 != 1)
+ return;
Yeah, we really shouldn't be dealing with (legacy) 4k PFNs here, but
instead, proper ranges I guess.
MST also pointed out this, I explained in this link:
https://lkml.org/lkml/2022/5/26/942
Rather than page reporting style, virtio-mem style should be fine. Ex,
struct virtio_memory_recover {
__virtio64 addr;
__virtio32 length;
__virtio16 padding[2];
};
+ }
+
+ pfn0 = virtio32_to_cpu(vb->vdev, in_vbr->pfns[0]);
+ if (!pfn_valid(pfn0))
+ return;
+
+ pfn1 = -1;
+ spin_lock(&vb->recover_page_list_lock);
+ list_for_each_entry(page, &vb->corrupted_page_list, lru) {
+ pfn1 = page_to_pfn(page);
+ if (pfn1 == pfn0)
+ break;
+ }
+ spin_unlock(&vb->recover_page_list_lock);
+
+ status = vbr->status;
+ switch (status) {
+ case VIRTIO_BALLOON_R_STATUS_RECOVERED:
+ if (pfn1 == pfn0) {
+ spin_lock(&vb->recover_page_list_lock);
+ list_del(&page->lru);
+ balloon_page_push(&vb->recovered_page_list, page);
We rather not reuse actual balloon functions in !balloon context. Just
move the page to the proper list directly.
OK.
+ spin_unlock(&vb->recover_page_list_lock);
+ queue_work(system_freezable_wq, &vb->unpoison_memory_work);
+ dev_info_ratelimited(&vb->vdev->dev, "recovered pfn 0x%x", pfn0);
Well, not yet. Shouldn't this go into unpoison_memory_func() ?
OK.
[...]
+out_unregister_reporting:
+ if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_REPORTING))
+ page_reporting_unregister(&vb->pr_dev_info);
out_unregister_oom:
if (virtio_has_feature(vb->vdev, VIRTIO_BALLOON_F_DEFLATE_ON_OOM))
unregister_oom_notifier(&vb->oom_nb);
@@ -1082,6 +1319,11 @@ static void virtballoon_remove(struct virtio_device *vdev)
destroy_workqueue(vb->balloon_wq);
}
+ if (virtio_has_feature(vdev, VIRTIO_BALLOON_F_RECOVER)) {
Could the notifier already have been triggered and we might be using the
device before already fully initialized from the notifier and might end
up leaking memory here that we allocated?
+ unregister_memory_failure_notifier(&vb->memory_failure_nb);
+ cancel_work_sync(&vb->unpoison_memory_work);
+ }
+
Could we be leaking memory from the virtballoon_remove() path?
Yes, I'll fix the possible memory leak here.
Thanks a lot.
--
zhenwei pi