Hi, This is a debugging patch I'm sending to ask for help for a bug I'm currently facing when enabling memfd for client playback audio. In this patch, I simply transform the client playback mempool (context->mempool) to be backed by memfd shared memory instead of posix SHM one. Using this setup, and when I run a gstreamer client along with a client using the pa simple APIs, a certain misbehavior occurs in our pstreams do_read() code. For background, memfd blocks transfer over pstreams work now like this: - The pstream descriptor is filled with necessary fields like: # descriptor flags (PA_PSTREAM_DESCRIPTOR_FLAGS) implying that we're marshalling a memfd block through (PA_FLAG_MEMFD_FD) # descriptor payload length (PA_PSTREAM_DESCRIPTOR_LENGTH), which is the `memfd_info' array length - The file-descriptor backing the memfd block is then sent using unix socket ancilary data (SCM_RIGHTS). Two fields are sent: nfd, number of fds to be sent (1), and the fd array itself (fds[0], containing memfd fd). All of this is done using diwic's quite nice fd-passing abstractions When srbchannels were converted to use memfds, the above sequence worked perfectly. when doing so for the playback mempool, sometimes pstreams code reach do_read() with a flag PA_FLAG_MEMFD_FD (implying that this is indeed a marshalled memfd-block frame), but with a reset ancillary data field (nfd = 0). After a lot of debugging, using below patch as an example, I found that _even in the error case_ of nfd=0, the memfd-block frame was received correctly: - descriptor flags field denotes a marshalled memfd block - descriptor payload length indeed shows memfd_info array length - ancillary data nfd = 1, with fds[0] containing our kernel-passed fd But by the time the full frame is complete, [*] someone has reset the pstreams ancillary data behind my back. Re-examining all the descriptor fields, and descriptor payload fields (memfd_info), shows no signs of data corruption at all. So, who's changing the `pstream->read_ancillary_data' behind our back? Is it the memblock shm revoke/release callbacks? This issue does not happen all the time, but is 100%-reproducible when a gstreamer client runs in parallel with a simple API client. Any advice on how to debug this issue further? Thanks a lot :-) [*] (re->index >= ntohl(re->descriptor[PA_PSTREAM_DESCRIPTOR_LENGTH]) + PA_PSTREAM_DESCRIPTOR_SIZE) Debugging patch: --- diff --git a/src/pulse/context.c b/src/pulse/context.c index f272cd6..14af5e8 100644 --- a/src/pulse/context.c +++ b/src/pulse/context.c @@ -62,6 +62,7 @@ #include <pulsecore/creds.h> #include <pulsecore/macro.h> #include <pulsecore/proplist-util.h> +#include <pulsecore/memfd.h> #include "internal.h" #include "context.h" @@ -125,7 +126,7 @@ static void reset_callbacks(pa_context *c) { pa_context *pa_context_new_with_proplist(pa_mainloop_api *mainloop, const char *name, pa_proplist *p) { pa_context *c; - pa_mem_type_t type; + pa_mem_type_t mem_type; pa_assert(mainloop); @@ -171,9 +172,14 @@ pa_context *pa_context_new_with_proplist(pa_mainloop_api *mainloop, const char * c->srb_template.readfd = -1; c->srb_template.writefd = -1; - type = !c->conf->disable_shm ? PA_MEMORY_SHARED_POSIX : PA_MEMORY_PRIVATE; - if (!(c->mempool = pa_mempool_new(type, c->conf->shm_size))) { + if (c->conf->disable_shm) + mem_type = PA_MEMORY_PRIVATE; + else if (pa_memfd_is_supported()) + mem_type = PA_MEMORY_SHARED_MEMFD; + else + mem_type = PA_MEMORY_SHARED_POSIX; + if (!(c->mempool = pa_mempool_new(mem_type, c->conf->shm_size))) { if (!c->conf->disable_shm) { pa_log_warn("Failed to allocate shared memory pool. Falling back to a normal private one."); c->mempool = pa_mempool_new(PA_MEMORY_PRIVATE, c->conf->shm_size); diff --git a/src/pulsecore/pstream.c b/src/pulsecore/pstream.c index a65b0a6..6e12221 100644 --- a/src/pulsecore/pstream.c +++ b/src/pulsecore/pstream.c @@ -58,6 +58,7 @@ enum { PA_PSTREAM_DESCRIPTOR_OFFSET_HI, PA_PSTREAM_DESCRIPTOR_OFFSET_LO, PA_PSTREAM_DESCRIPTOR_FLAGS, + PA_PSTREAM_DESCRIPTOR_MAGIC, PA_PSTREAM_DESCRIPTOR_MAX }; @@ -524,11 +525,14 @@ static void prepare_next_write_item(pa_pstream *p) { p->write.minibuf_validsize = 0; pa_memchunk_reset(&p->write.memchunk); + static pa_atomic_t magic = PA_ATOMIC_INIT(100); + p->write.descriptor[PA_PSTREAM_DESCRIPTOR_LENGTH] = 0; p->write.descriptor[PA_PSTREAM_DESCRIPTOR_CHANNEL] = htonl((uint32_t) -1); p->write.descriptor[PA_PSTREAM_DESCRIPTOR_OFFSET_HI] = 0; p->write.descriptor[PA_PSTREAM_DESCRIPTOR_OFFSET_LO] = 0; p->write.descriptor[PA_PSTREAM_DESCRIPTOR_FLAGS] = 0; + p->write.descriptor[PA_PSTREAM_DESCRIPTOR_MAGIC] = htonl(pa_atomic_inc(&magic)); if (p->write.current->type == PA_PSTREAM_ITEM_PACKET) { size_t plen; @@ -826,6 +830,25 @@ static int do_read(pa_pstream *p, struct pstream_read *re) { if ((r = pa_iochannel_read_with_ancil_data(p->io, d, l, &b)) <= 0) goto fail; + /* This implies that we get the pstream descriptor out of the first read() system call. + * This is usually the case, so no need for making our debugging code more complicated. */ + if (re->index == 0 && r == PA_PSTREAM_DESCRIPTOR_SIZE) { + int flags = ntohl(re->descriptor[PA_PSTREAM_DESCRIPTOR_FLAGS]); + if (flags & PA_FLAG_MEMFD_FD) { + pa_log("@@@@@@@@"); + pa_log("Just out of the oven memfd stream descriptor!"); + pa_log("descriptor: MAGIC = %u", ntohl(re->descriptor[PA_PSTREAM_DESCRIPTOR_MAGIC])); + pa_log("descriptor: size = %lu", PA_PSTREAM_DESCRIPTOR_SIZE); + pa_log("descriptor: flags = 0x%x", ntohl(re->descriptor[PA_PSTREAM_DESCRIPTOR_FLAGS])); + pa_log("descriptor: paylod (memfd_info) size = %u", ntohl(re->descriptor[PA_PSTREAM_DESCRIPTOR_LENGTH])); + pa_log("ancil: creds_valid = %s", pa_yes_no(b.creds_valid)); + pa_log("ancil: creds uid = %u, gid = %u", b.creds.uid, b.creds.gid); + pa_log("ancil: nfd = %d", b.nfd); + pa_log("ancil: fds[0] = %d", b.fds[0]); + pa_log("@@@@@@@@"); + } + } + if (b.creds_valid) { p->read_ancil_data.creds_valid = true; p->read_ancil_data.creds = b.creds; @@ -974,7 +997,28 @@ static int do_read(pa_pstream *p, struct pstream_read *re) { !!(flags & PA_FLAG_SHMWRITABLE)); } else if ((flags & PA_FLAG_SHMMASK) == PA_FLAG_MEMFD_FD) { int memfd_fd; - pa_assert(p->read_ancil_data.nfd == 1); + + if (p->read_ancil_data.nfd != 1) { + pa_log_warn("error; found nfd = %d", p->read_ancil_data.nfd); + + pa_log_warn("++++++++"); + pa_log_error("descriptor: MAGIC = %u", ntohl(re->descriptor[PA_PSTREAM_DESCRIPTOR_MAGIC])); + pa_log_warn("descriptor: size = %lu", PA_PSTREAM_DESCRIPTOR_SIZE); + pa_log_warn("descriptor: flags = 0x%x", ntohl(re->descriptor[PA_PSTREAM_DESCRIPTOR_FLAGS])); + pa_log_warn("descriptor: paylod (memfd_info) size = %u", ntohl(re->descriptor[PA_PSTREAM_DESCRIPTOR_LENGTH])); + pa_log_warn("------"); + pa_log_warn("payload: memfd blk id = %u", ntohl(re->per_type.memfd_info[PA_PSTREAM_MEMFD_BLOCKID])); + pa_log_warn("payload: memfd index = %u", ntohl(re->per_type.memfd_info[PA_PSTREAM_MEMFD_INDEX])); + pa_log_warn("payload: memfd shared region length = %u", ntohl(re->per_type.memfd_info[PA_PSTREAM_MEMFD_LENGTH])); + pa_log_warn("------"); + pa_log_warn("ancil: creds_valid = %s", pa_yes_no(p->read_ancil_data.creds_valid)); + pa_log_warn("ancil: creds uid = %u, gid = %u", p->read_ancil_data.creds.uid, p->read_ancil_data.creds.gid); + pa_log_warn("ancil: nfd = %d", p->read_ancil_data.nfd); + pa_log_warn("ancil: fds[0] = %d", p->read_ancil_data.fds[0]); + pa_log_warn("++++++++"); + + pa_assert_not_reached(); + } memfd_fd = p->read_ancil_data.fds[0]; pa_assert(memfd_fd >= 0);