Re: [PATCH] virtio_ring: Fix the stale index in available ring

[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]

 



On 3/18/24 02:50, Michael S. Tsirkin wrote:
On Fri, Mar 15, 2024 at 09:24:36PM +1000, Gavin Shan wrote:

On 3/15/24 21:05, Michael S. Tsirkin wrote:
On Fri, Mar 15, 2024 at 08:45:10PM +1000, Gavin Shan wrote:
Yes, I guess smp_wmb() ('dmb') is buggy on NVidia's grace-hopper platform. I tried
to reproduce it with my own driver where one thread writes to the shared buffer
and another thread reads from the buffer. I don't hit the out-of-order issue so
far.

Make sure the 2 areas you are accessing are in different cache lines.


Yes, I already put those 2 areas to separate cache lines.


My driver may be not correct somewhere and I will update if I can reproduce
the issue with my driver in the future.

Then maybe your change is just making virtio slower and masks the bug
that is actually elsewhere?

You don't really need a driver. Here's a simple test: without barriers
assertion will fail. With barriers it will not.
(Warning: didn't bother testing too much, could be buggy.

---

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#define FIRST values[0]
#define SECOND values[64]

volatile int values[100] = {};

void* writer_thread(void* arg) {
	while (1) {
	FIRST++;
	// NEED smp_wmb here
         __asm__ volatile("dmb ishst" : : : "memory");
	SECOND++;
	}
}

void* reader_thread(void* arg) {
      while (1) {
	int first = FIRST;
	// NEED smp_rmb here
         __asm__ volatile("dmb ishld" : : : "memory");
	int second = SECOND;
	assert(first - second == 1 || first - second == 0);
      }
}

int main() {
      pthread_t writer, reader;

      pthread_create(&writer, NULL, writer_thread, NULL);
      pthread_create(&reader, NULL, reader_thread, NULL);

      pthread_join(writer, NULL);
      pthread_join(reader, NULL);

      return 0;
}


Had a quick test on NVidia's grace-hopper and Ampere's CPUs. I hit
the assert on both of them. After replacing 'dmb' with 'dsb', I can
hit assert on both of them too. I need to look at the code closely.

[root@virt-mtcollins-02 test]# ./a
a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed.
Aborted (core dumped)

[root@nvidia-grace-hopper-05 test]# ./a
a: a.c:26: reader_thread: Assertion `first - second == 1 || first - second == 0' failed.
Aborted (core dumped)

Thanks,
Gavin


Actually this test is broken. No need for ordering it's a simple race.
The following works on x86 though (x86 does not need barriers
though).


#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>

#if 0
#define x86_rmb()  asm volatile("lfence":::"memory")
#define x86_mb()  asm volatile("mfence":::"memory")
#define x86_smb()  asm volatile("sfence":::"memory")
#else
#define x86_rmb()  asm volatile("":::"memory")
#define x86_mb()  asm volatile("":::"memory")
#define x86_smb()  asm volatile("":::"memory")
#endif

#define FIRST values[0]
#define SECOND values[640]
#define FLAG values[1280]

volatile unsigned values[2000] = {};

void* writer_thread(void* arg) {
	while (1) {
	/* Now synchronize with reader */
	while(FLAG);
	FIRST++;
	x86_smb();
	SECOND++;
	x86_smb();
	FLAG = 1;
	}
}

void* reader_thread(void* arg) {
     while (1) {
	/* Now synchronize with writer */
	while(!FLAG);
	x86_rmb();
	unsigned first = FIRST;
	x86_rmb();
	unsigned second = SECOND;
	assert(first - second == 1 || first - second == 0);
	FLAG = 0;

	if (!(first %1000000))
		printf("%d\n", first);
    }
}

int main() {
     pthread_t writer, reader;

     pthread_create(&writer, NULL, writer_thread, NULL);
     pthread_create(&reader, NULL, reader_thread, NULL);

     pthread_join(writer, NULL);
     pthread_join(reader, NULL);

     return 0;
}


I tried it on host and VM of NVidia's grace-hopper. Without the barriers, I
can hit assert. With the barriers, it's working fine without hitting the
assert.

I also had some code to mimic virtio vring last weekend, and it's just
working well. Back to our original issue, __smb_wmb() is issued by guest
while __smb_rmb() is executed on host. The VM and host are running at
different exception level: EL2 vs EL1. I'm not sure it's the cause. I
need to modify my code so that __smb_wmb() and __smb_rmb() can be executed
from guest and host.

[gshan@gshan code]$ cat test.h
#ifndef __TEST_H
#define __TEST_H

struct vring_desc {
	uint64_t	addr;
	uint32_t	len;
	uint16_t	flags;
	uint16_t	next;
} __attribute__((aligned(4)));

struct vring_avail {
	uint16_t	flags;
	uint16_t	idx;
	uint16_t	ring[];
} __attribute__((aligned(4)));

struct vring_used_elem {
	uint32_t	id;
	uint32_t	len;
} __attribute__((aligned(4)));

struct vring_used {
	uint16_t 		flags;
        uint16_t 		idx;
        struct vring_used_elem	ring[];
} __attribute__((aligned(4)));

struct vring {
	struct vring_desc	*desc;
	struct vring_avail	*avail;
	struct vring_used	*used;
	uint8_t			pad0[64];

	/* Writer */
	uint32_t		num;
	uint32_t		w_num_free;
	uint32_t		w_free_head;
	uint16_t		w_avail_idx;
	uint16_t		w_last_used_idx;
	uint16_t		*w_extra_data;
	uint16_t		*w_extra_next;
	uint8_t			pad1[64];

	/* Reader */
	uint16_t		r_avail_idx;
	uint16_t		r_last_avail_idx;
	uint16_t		r_last_used_idx;
	uint8_t			pad2[64];
};

static inline unsigned int vring_size(unsigned int num, unsigned long align)
{
	return ((sizeof(struct vring_desc) * num +
		 sizeof(uint16_t) * (3 + num) + (align - 1)) & ~(align - 1)) +
	       sizeof(uint16_t) * 3 + sizeof(struct vring_used_elem) * num;
		
}

static inline void __smp_rmb(void)
{
#ifdef WEAK_BARRIER
	__asm__ volatile("dmb ishld" : : : "memory");
#else
	__asm__ volatile("dsb sy"    : : : "memory");
#endif
}

static inline void __smp_wmb(void)
{
#ifdef WEAK_BARRIER
	__asm__ volatile("dmb ishst" : : : "memory");
#else
	__asm__ volatile("dsb sy"    : : : "memory");
#endif
}

#endif /* __TEST_H */


[gshan@gshan code]$ cat test.c
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <errno.h>
#include <stdbool.h>
#include <stdint.h>
#include <sys/types.h>
#include <assert.h>
#include <sched.h>
#include <pthread.h>

#include "test.h"

static struct vring *vring;

static int bind_cpu(int cpuid)
{
	cpu_set_t cpuset;

	CPU_ZERO(&cpuset);
	CPU_SET(cpuid, &cpuset);

	return pthread_setaffinity_np(pthread_self(), sizeof(cpuset), &cpuset);
}

static void write_free_used_desc(void)
{
	uint16_t last_used;
	uint32_t idx;

	if ((uint16_t)(vring->used->idx - vring->w_last_used_idx) < 64)
		return;

	while (true) {
		if (vring->w_last_used_idx == vring->used->idx)
			return;

		__smp_rmb();

		/* Retrieve the head */
		last_used = vring->w_last_used_idx & (vring->num - 1);
		idx = vring->used->ring[last_used].id;
		assert(idx < vring->num);
		assert(vring->w_extra_data[idx]);

		/* Reclaim the descriptor */
		vring->w_extra_data[idx] = 0;
		vring->w_extra_next[idx] = vring->w_free_head;
		vring->w_free_head = idx;

		/* Update statistics */
		vring->w_num_free++;
		vring->w_last_used_idx++;
	}
}

static void write_push_desc(void)
{
	uint32_t head = vring->w_free_head;
	uint32_t avail_idx;

	if (vring->w_num_free < 1)
		return;

	/*
	 * The data in the descriptor doesn't matter. The idea here
	 * is to dirty the cache line.
	 */
	vring->desc[head].flags = 1;
	vring->desc[head].addr  = 0xffffffffffffffff;
	vring->desc[head].len   = 0xffffffff;
	vring->desc[head].next  = vring->w_extra_next[head];
	vring->desc[head].flags = 0;

	vring->w_num_free--;
	vring->w_free_head = vring->w_extra_next[head];
	vring->w_extra_data[head] = 1;

	avail_idx = vring->w_avail_idx & (vring->num - 1);
	vring->avail->ring[avail_idx] = head;

	__smp_wmb();

	vring->w_avail_idx++;
	vring->avail->idx = vring->w_avail_idx;
}

static void *write_worker(void *arg)
{
	assert(!bind_cpu(10));

	while (true) {
		write_free_used_desc();
		write_push_desc();
	}

	return NULL;
}

static void read_pull_desc(void)
{
	uint16_t avail_idx, last_avail_idx;
	uint32_t head;

	last_avail_idx = vring->r_last_avail_idx;
	if (vring->r_avail_idx == vring->r_last_avail_idx) {
		vring->r_avail_idx = vring->avail->idx;
		if (vring->r_avail_idx == last_avail_idx)
			return;

		__smp_rmb();
	}

	head = vring->avail->ring[last_avail_idx & (vring->num - 1)];
	assert(head < vring->num);
	vring->r_last_avail_idx++;

	vring->used->ring[vring->r_last_used_idx & (vring->num - 1)].id  = head;
	vring->used->ring[vring->r_last_used_idx & (vring->num - 1)].len = 0;
	vring->r_last_used_idx++;

	__smp_wmb();

	vring->used->idx = vring->r_last_used_idx;
}

static void *read_worker(void *arg)
{
	assert(!bind_cpu(60));

	while (true) {
		read_pull_desc();
	}

	return NULL;
}

static void init_vring(unsigned int num, unsigned long align)
{
	unsigned int size, i;

	/* vring */
	vring = malloc(sizeof(*vring));
	assert(vring);
	memset(vring, 0, sizeof(*vring));

	/* Descriptors */
	size = vring_size(num, align);
	vring->desc = (struct vring_desc *)malloc(size);
	assert(vring->desc);
	memset(vring->desc, 0, size);
	vring->avail = (struct vring_avail *)((void *)vring->desc +
					      num * sizeof(struct vring_desc));
	vring->used = (struct vring_used *)(((unsigned long)&vring->avail->ring[num] +
					    sizeof(uint16_t) + (align - 1)) & ~(align - 1));

	/* Writer's extra data */
	vring->w_extra_data = malloc(sizeof(uint16_t) * num);
	assert(vring->w_extra_data);
	memset(vring->w_extra_data, 0, sizeof(uint16_t) * num);
	vring->w_extra_next = malloc(sizeof(uint16_t) * num);
	assert(vring->w_extra_next);
	memset(vring->w_extra_next, 0, sizeof(uint16_t) * num);
	for (i = 0; i < num - 1; i++)
		vring->w_extra_next[i] = i + 1;

	/* Statistics */
	vring->num = num;
	vring->w_num_free = num;
	vring->w_free_head = 0;
	vring->w_avail_idx = 0;
	vring->w_last_used_idx = 0;
	vring->r_avail_idx = 0;
	vring->r_last_avail_idx = 0;
	vring->r_last_used_idx = 0;
}

int main(int argc, char **argv)
{
	pthread_t w_tid, r_tid;
	int ret;

	assert(!bind_cpu(0));

	init_vring(256, 64);

	ret = pthread_create(&w_tid, NULL, write_worker, NULL);
	assert(!ret);
	ret = pthread_create(&r_tid, NULL, read_worker, NULL);
	assert(!ret);
	ret = pthread_join(w_tid, NULL);
	assert(!ret);
	ret = pthread_join(r_tid, NULL);
	assert(!ret);

	while (true) {
		sleep(1);
	}

	return 0;
}

Thanks,
Gavin








[Index of Archives]     [KVM Development]     [Libvirt Development]     [Libvirt Users]     [CentOS Virtualization]     [Netdev]     [Ethernet Bridging]     [Linux Wireless]     [Kernel Newbies]     [Security]     [Linux for Hams]     [Netfilter]     [Bugtraq]     [Yosemite Forum]     [MIPS Linux]     [ARM Linux]     [Linux RAID]     [Linux Admin]     [Samba]

  Powered by Linux