[PATCH v4] Make module loopback honor requested latency

georg@xxxxxxxx (Georg Chini) · Sat, 31 Jan 2015 23:43:42 +0100

This is the final version of my patch for module-loopback. It is on top of the
patch I sent about an hour ago and contains a lot more changes than the previous
versions:

- Honor the specified latency if possible; if not, adjust to the lowest possible value
- Smooth switching from fixed latency to dynamic latency source or sink and vice versa
- good rate and latency stability, no rate oscillation
- adjusts latency as well as your setup allows
- fast regulation of latency offsets, adjusts a 100 ms offset within 22 seconds
  (adjust_time=1) to 60 seconds (adjust_time=10)
- usable latency range 4 - 30000 ms
- Avoid rewinds and "cannot peek into queue" messages during startup and switching
- works with rates between 200 and 190000 Hz
- maximum latency offset after a source/sink switch or at startup is around 200 ms

I also introduced a new parameter, buffer_latency_msec, which can be used together
with latency_msec. If buffer_latency_msec is specified, the resulting latency
will be latency_msec + buffer_latency_msec. In that case latency_msec refers only
to the source/sink latency, while buffer_latency_msec specifies the buffer part.
This can be used to save a lot of CPU at low latencies: running 10 ms latency
with latency_msec=6 buffer_latency_msec=4 gives 8% CPU on my system, compared to
12% when I only specify latency_msec=10.
Additionally, you can go beyond the built-in safeguard limits: you can access
the 1 - 3 ms range or lower the buffer latency for fixed latency devices.
Some of my USB devices run fine at a buffer latency of fragment size + 4 ms
instead of the default fragment size + 20 ms.

I tested it all with Intel HDA, USB and bluetooth sound devices. I would like to
see some test results from other people.

---
 src/modules/module-loopback.c | 539 +++++++++++++++++++++++++++++++-----------
 1 file changed, 397 insertions(+), 142 deletions(-)

diff --git a/src/modules/module-loopback.c b/src/modules/module-loopback.c
index 7e2b92a..8def03e 100644
--- a/src/modules/module-loopback.c
+++ b/src/modules/module-loopback.c
@@ -47,6 +47,7 @@ PA_MODULE_USAGE(
         "sink=<sink to connect to> "
         "adjust_time=<how often to readjust rates in s> "
         "latency_msec=<latency in ms> "
+        "buffer_latency_msec=<buffer latency in ms> "
         "format=<sample format> "
         "rate=<sample rate> "
         "channels=<number of channels> "
@@ -59,7 +60,9 @@ PA_MODULE_USAGE(
 
 #define DEFAULT_LATENCY_MSEC 200
 
-#define MEMBLOCKQ_MAXLENGTH (1024*1024*16)
+#define MEMBLOCKQ_MAXLENGTH (1024*1024*32)
+
+#define DEFAULT_BUFFER_MARGIN_MSEC 20
 
 #define DEFAULT_ADJUST_TIME_USEC (10*PA_USEC_PER_SEC)
 
@@ -80,12 +83,26 @@ struct userdata {
 
     int64_t recv_counter;
     int64_t send_counter;
+    uint32_t sink_adjust_counter;
+    uint32_t source_adjust_counter;
+    uint32_t underruns;
+    uint32_t no_peeks;
 
-    size_t skip;
     pa_usec_t latency;
+    pa_usec_t buffer_latency;
+    pa_usec_t initial_buffer_latency;
+    pa_usec_t configured_sink_latency;
+    pa_usec_t configured_source_latency;
+
+    pa_usec_t source_latency_sum;
+    pa_usec_t sink_latency_sum;
+    pa_usec_t next_latency;
+    double latency_error;
 
     bool in_pop;
-    size_t min_memblockq_length;
+    bool pop_called;
+    bool source_sink_changed;
+    bool buffer_latency_set;
 
     struct {
         int64_t send_counter;
@@ -95,9 +112,6 @@ struct userdata {
         int64_t recv_counter;
         size_t sink_input_buffer;
         pa_usec_t sink_latency;
-
-        size_t min_memblockq_length;
-        size_t max_request;
     } latency_snapshot;
 };
 
@@ -106,6 +120,7 @@ static const char* const valid_modargs[] = {
     "sink",
     "adjust_time",
     "latency_msec",
+    "buffer_latency_msec",
     "format",
     "rate",
     "channels",
@@ -121,8 +136,7 @@ static const char* const valid_modargs[] = {
 enum {
     SINK_INPUT_MESSAGE_POST = PA_SINK_INPUT_MESSAGE_MAX,
     SINK_INPUT_MESSAGE_REWIND,
-    SINK_INPUT_MESSAGE_LATENCY_SNAPSHOT,
-    SINK_INPUT_MESSAGE_MAX_REQUEST_CHANGED
+    SINK_INPUT_MESSAGE_LATENCY_SNAPSHOT
 };
 
 enum {
@@ -168,84 +182,134 @@ static void teardown(struct userdata *u) {
 }
 
 /* Called from main context */
-static void adjust_rates(struct userdata *u) {
-    size_t buffer, fs;
-    uint32_t old_rate, base_rate, new_rate;
-    pa_usec_t buffer_latency;
+static void adjust_rates(struct userdata *u, pa_usec_t snapshot_delay) {
+    size_t buffer;
+    uint32_t old_rate, base_rate, new_rate, hours, cut_off_frequency;
+    pa_usec_t final_latency, source_sink_latency, current_buffer_latency, current_latency, corrected_latency;
+    double min_cycles;
+    int32_t latency_difference;
 
     pa_assert(u);
     pa_assert_ctl_context();
 
-    pa_asyncmsgq_send(u->source_output->source->asyncmsgq, PA_MSGOBJECT(u->source_output), SOURCE_OUTPUT_MESSAGE_LATENCY_SNAPSHOT, NULL, 0, NULL);
-    pa_asyncmsgq_send(u->sink_input->sink->asyncmsgq, PA_MSGOBJECT(u->sink_input), SINK_INPUT_MESSAGE_LATENCY_SNAPSHOT, NULL, 0, NULL);
+    /* Runtime and counters since last change of source or sink */
+    hours = PA_MIN(u->sink_adjust_counter, u->source_adjust_counter) * u->adjust_time / PA_USEC_PER_SEC / 3600;
+    u->sink_adjust_counter +=1;
+    u->source_adjust_counter +=1;
+
+    /* Latency sums */
+    u->source_latency_sum += u->latency_snapshot.source_latency;
+    u->sink_latency_sum += u->latency_snapshot.sink_latency;
+
+    /* If we are seeing underruns or cannot peek into the queue during pop then the latency is too small */
+    if (u->underruns > 2 || u->no_peeks > 30) {
+       u->buffer_latency += 5 * PA_USEC_PER_MSEC;
+       pa_log_warn("Too many underruns, increasing buffer latency to %0.2f ms", (double) u->buffer_latency / PA_USEC_PER_MSEC);
+       u->underruns = 0;
+       u->no_peeks = 0;
+    }
 
-    buffer =
-        u->latency_snapshot.sink_input_buffer +
-        u->latency_snapshot.source_output_buffer;
+    /* Allow one underrun and 15 no peek messages per hour */
+    if (PA_MIN(u->sink_adjust_counter, u->source_adjust_counter) * u->adjust_time / PA_USEC_PER_SEC / 3600 > hours) {
+       pa_log_info("Underrun counters: %u, %u", u->underruns, u->no_peeks);
+       u->underruns = PA_CLIP_SUB(u->underruns, 1u);
+       u->no_peeks = PA_CLIP_SUB(u->no_peeks, 15u);
+    }
+
+    /* Rates and latencies*/
+    old_rate = u->sink_input->sample_spec.rate;
+    base_rate = u->source_output->sample_spec.rate;
 
+    buffer = u->latency_snapshot.sink_input_buffer + u->latency_snapshot.source_output_buffer;
     if (u->latency_snapshot.recv_counter <= u->latency_snapshot.send_counter)
         buffer += (size_t) (u->latency_snapshot.send_counter - u->latency_snapshot.recv_counter);
     else
-        buffer += PA_CLIP_SUB(buffer, (size_t) (u->latency_snapshot.recv_counter - u->latency_snapshot.send_counter));
+        buffer = PA_CLIP_SUB(buffer, (size_t) (u->latency_snapshot.recv_counter - u->latency_snapshot.send_counter));
+    current_buffer_latency = pa_bytes_to_usec(buffer, &u->sink_input->sample_spec);
 
-    buffer_latency = pa_bytes_to_usec(buffer, &u->sink_input->sample_spec);
+    current_latency = u->latency_snapshot.sink_latency + current_buffer_latency + u->latency_snapshot.source_latency - snapshot_delay;
 
-    pa_log_debug("Loopback overall latency is %0.2f ms + %0.2f ms + %0.2f ms = %0.2f ms",
+    source_sink_latency = u->sink_latency_sum / u->sink_adjust_counter +
+                          u->source_latency_sum / u->source_adjust_counter;
+    final_latency = u->latency;
+    if (u->buffer_latency_set)
+       final_latency += u->initial_buffer_latency;
+    final_latency = PA_MAX(final_latency, source_sink_latency + u->buffer_latency);
+
+    pa_log_debug("Loopback overall latency is %0.2f ms + %0.2f ms + %0.2f ms = %0.2f ms, latency difference: %0.2f ms, rate difference: %i Hz",
                 (double) u->latency_snapshot.sink_latency / PA_USEC_PER_MSEC,
-                (double) buffer_latency / PA_USEC_PER_MSEC,
+                (double) current_buffer_latency / PA_USEC_PER_MSEC,
                 (double) u->latency_snapshot.source_latency / PA_USEC_PER_MSEC,
-                ((double) u->latency_snapshot.sink_latency + buffer_latency + u->latency_snapshot.source_latency) / PA_USEC_PER_MSEC);
-
-    pa_log_debug("Should buffer %zu bytes, buffered at minimum %zu bytes",
-                u->latency_snapshot.max_request*2,
-                u->latency_snapshot.min_memblockq_length);
-
-    fs = pa_frame_size(&u->sink_input->sample_spec);
-    old_rate = u->sink_input->sample_spec.rate;
-    base_rate = u->source_output->sample_spec.rate;
-
-    if (u->latency_snapshot.min_memblockq_length < u->latency_snapshot.max_request*2)
-        new_rate = base_rate - (((u->latency_snapshot.max_request*2 - u->latency_snapshot.min_memblockq_length) / fs) *PA_USEC_PER_SEC)/u->adjust_time;
-    else
-        new_rate = base_rate + (((u->latency_snapshot.min_memblockq_length - u->latency_snapshot.max_request*2) / fs) *PA_USEC_PER_SEC)/u->adjust_time;
-
-    if (new_rate < (uint32_t) (base_rate*0.8) || new_rate > (uint32_t) (base_rate*1.25)) {
-        pa_log_warn("Sample rates too different, not adjusting (%u vs. %u).", base_rate, new_rate);
-        new_rate = base_rate;
-    } else {
-        if (base_rate < new_rate + 20 && new_rate < base_rate + 20)
-          new_rate = base_rate;
-        /* Do the adjustment in small steps; 2‰ can be considered inaudible */
-        if (new_rate < (uint32_t) (old_rate*0.998) || new_rate > (uint32_t) (old_rate*1.002)) {
-            pa_log_info("New rate of %u Hz not within 2â?° of %u Hz, forcing smaller adjustment", new_rate, old_rate);
-            new_rate = PA_CLAMP(new_rate, (uint32_t) (old_rate*0.998), (uint32_t) (old_rate*1.002));
-        }
+                (double) current_latency / PA_USEC_PER_MSEC,
+                (double) (int32_t)(current_latency - final_latency) / PA_USEC_PER_MSEC,
+                (int32_t)(old_rate - base_rate));
+
+   /* Low pass filtered difference between expectation value and observed latency */
+   if (!u->source_sink_changed)
+      u->latency_error = (4 * u->latency_error + (double)abs((int32_t)(current_latency - u->next_latency)) / final_latency) / 5;
+   else
+      u->source_sink_changed = false;
+
+    /* Latency and latency difference at base rate */
+    corrected_latency = u->latency_snapshot.source_latency + (u->latency_snapshot.sink_latency + current_buffer_latency) * old_rate / base_rate - snapshot_delay;
+    latency_difference = (int32_t)(corrected_latency - final_latency);
+
+    /* Minimum number of adjust times + 1 needed to adjust at 0.75% deviation from base rate */
+    min_cycles = (double)abs(latency_difference) / u->adjust_time / 0.0075 + 1;
+
+    /* Rate calculation, maximum deviation from base rate will be less than 0.75% due to min_cycles */
+    new_rate = base_rate * (1.0 + latency_difference / min_cycles / u->adjust_time) + 0.5;
+
+    /* Adjust as good as physics allows (with some safety margin) */
+    if (abs(latency_difference) <= 2.5 * u->latency_error * final_latency + u->adjust_time / 2 / base_rate + 100)
+       new_rate = base_rate;
+
+    /* Do the adjustment in small steps; 2‰ of base rate can be considered inaudible, use 1 Hz below 500 Hz base rate */
+    cut_off_frequency = PA_MAX(1, 0.002 * base_rate);
+    if (new_rate < old_rate - cut_off_frequency || new_rate > old_rate + cut_off_frequency) {
+        pa_log_info("New rate of %u Hz not within 2â?° of %u Hz, forcing smaller adjustment", new_rate, old_rate);
+        new_rate = PA_CLAMP(new_rate, old_rate - cut_off_frequency, old_rate + cut_off_frequency);
     }
 
+    /* Predictor */
+    u->next_latency = (corrected_latency * base_rate + (int32_t)(base_rate - new_rate) * (int64_t)u->adjust_time) / new_rate;
+
+    /* Set rate */
     pa_sink_input_set_rate(u->sink_input, new_rate);
     pa_log_debug("[%s] Updated sampling rate to %lu Hz.", u->sink_input->sink->name, (unsigned long) new_rate);
-
-    pa_core_rttime_restart(u->core, u->time_event, pa_rtclock_now() + u->adjust_time);
 }
 
 /* Called from main context */
 static void time_callback(pa_mainloop_api *a, pa_time_event *e, const struct timeval *t, void *userdata) {
     struct userdata *u = userdata;
+    pa_usec_t timestamp;
 
     pa_assert(u);
     pa_assert(a);
     pa_assert(u->time_event == e);
 
-    adjust_rates(u);
+    /* Restart timer right away */
+    pa_core_rttime_restart(u->core, u->time_event, pa_rtclock_now() + u->adjust_time);
+
+    /* Get sink and source latency snapshot. The resulting latency must be corrected by the delay between sink and
+     * source snapshot which can sometimes be quite large (>>10 ms) */
+    pa_asyncmsgq_send(u->sink_input->sink->asyncmsgq, PA_MSGOBJECT(u->sink_input), SINK_INPUT_MESSAGE_LATENCY_SNAPSHOT, NULL, 0, NULL);
+    timestamp = pa_rtclock_now();
+    pa_asyncmsgq_send(u->source_output->source->asyncmsgq, PA_MSGOBJECT(u->source_output), SOURCE_OUTPUT_MESSAGE_LATENCY_SNAPSHOT, NULL, 0, NULL);
+
+    adjust_rates(u, pa_rtclock_now() - timestamp);
 }
 
-/* Called from main context */
+/* Called from main context
+ * When source or sink changes, give it a third of a second to settle down, then call adjust_rates for the first time */
 static void enable_adjust_timer(struct userdata *u, bool enable) {
     if (enable) {
-        if (u->time_event || u->adjust_time <= 0)
+        if (!u->adjust_time)
             return;
+        if (u->time_event)
+            u->core->mainloop->time_free(u->time_event);
 
-        u->time_event = pa_core_rttime_new(u->module->core, pa_rtclock_now() + u->adjust_time, time_callback, u);
+        u->time_event = pa_core_rttime_new(u->module->core, pa_rtclock_now() + 333 * PA_USEC_PER_MSEC, time_callback, u);
     } else {
         if (!u->time_event)
             return;
@@ -263,29 +327,62 @@ static void update_adjust_timer(struct userdata *u) {
         enable_adjust_timer(u, true);
 }
 
+static pa_usec_t get_requested_latency(struct userdata *u) {
+    pa_usec_t requested_latency;
+
+    requested_latency = u->latency;
+    if(u->buffer_latency_set)
+      requested_latency += u->buffer_latency;
+    return PA_MAX(u->configured_sink_latency + u->buffer_latency, requested_latency);
+}
+
+/* Called from all contexts */
+static void memblockq_adjust(struct userdata *u, int32_t offset, bool allow_push) {
+    size_t memblock_bytes, requested_buffer_bytes;
+    pa_usec_t requested_buffer_latency;
+    size_t buffer_offset;
+    pa_memchunk silence;
+
+    requested_buffer_latency = get_requested_latency(u);
+    if (offset > 0)
+       requested_buffer_latency = PA_CLIP_SUB(requested_buffer_latency, (pa_usec_t)offset);
+    else
+       requested_buffer_latency = requested_buffer_latency - offset;
+
+    requested_buffer_bytes = pa_usec_to_bytes(requested_buffer_latency, &u->sink_input->sample_spec);
+    memblock_bytes = pa_memblockq_get_length(u->memblockq);
+
+    /* Drop audio from queue */
+    if ((int32_t)(memblock_bytes - requested_buffer_bytes) > 0) {
+       buffer_offset = memblock_bytes - requested_buffer_bytes;
+       pa_log_info("Dropping %zd bytes from queue", buffer_offset);
+       pa_memblockq_drop(u->memblockq, buffer_offset);
+    }
+    /* Add silence to queue, will never happen from IO-thread */
+    else if ((int32_t)(memblock_bytes - requested_buffer_bytes) < 0 && allow_push) {
+       requested_buffer_bytes = requested_buffer_bytes - memblock_bytes;
+       pa_log_info("Adding %zd bytes of silence to queue", requested_buffer_bytes);
+       pa_sink_input_get_silence(u->sink_input, &silence);
+       while (requested_buffer_bytes >= silence.length) {
+          pa_memblockq_push_align(u->memblockq, &silence);
+          requested_buffer_bytes -= silence.length;
+       }
+       if (requested_buffer_bytes > 0) {
+          silence.length = requested_buffer_bytes;
+          pa_memblockq_push_align(u->memblockq, &silence);
+       }
+       pa_memblock_unref(silence.memblock);
+    }
+}
+
 /* Called from input thread context */
 static void source_output_push_cb(pa_source_output *o, const pa_memchunk *chunk) {
     struct userdata *u;
-    pa_memchunk copy;
 
     pa_source_output_assert_ref(o);
     pa_source_output_assert_io_context(o);
     pa_assert_se(u = o->userdata);
 
-    if (u->skip >= chunk->length) {
-        u->skip -= chunk->length;
-        return;
-    }
-
-    if (u->skip > 0) {
-        copy = *chunk;
-        copy.index += u->skip;
-        copy.length -= u->skip;
-        u->skip = 0;
-
-        chunk = &copy;
-    }
-
     pa_asyncmsgq_post(u->asyncmsgq, PA_MSGOBJECT(u->sink_input), SINK_INPUT_MESSAGE_POST, NULL, 0, chunk, NULL);
     u->send_counter += (int64_t) chunk->length;
 }
@@ -324,6 +421,36 @@ static int source_output_process_msg_cb(pa_msgobject *obj, int code, void *data,
     return pa_source_output_process_msg(obj, code, data, offset, chunk);
 }
 
+static void set_source_output_latency(struct userdata *u, pa_source *source) {
+     pa_usec_t min_latency, max_latency, buffer_msec, latency;
+
+    /* Set lower limit of source latency to 2.333 ms, if buffer
+     * latency is specified, use latency_msec as source latency */
+    latency = PA_MAX(u->latency / 3, 2.333 * PA_USEC_PER_MSEC);
+    if (u->buffer_latency_set)
+       latency = u->latency;
+
+    if(source->flags & PA_SOURCE_DYNAMIC_LATENCY) {
+       pa_source_get_latency_range(source, &min_latency, &max_latency);
+       if (min_latency > latency && !u->buffer_latency_set) {
+          u->buffer_latency = PA_MAX(u->buffer_latency, (pa_usec_t)(min_latency * 0.75));
+          pa_log_warn("Cannot set requested source latency, adjusting buffer to %0.2f ms", (double)u->buffer_latency / PA_USEC_PER_MSEC);
+       }
+       latency = PA_CLAMP(latency, min_latency, max_latency);
+    }
+    else {
+       latency = pa_source_get_latency(source);
+       if (latency == 0)
+          latency = pa_source_get_fixed_latency(source);
+       buffer_msec = u->core->default_fragment_size_msec + DEFAULT_BUFFER_MARGIN_MSEC;
+       if (!u->buffer_latency_set && u->buffer_latency < buffer_msec * PA_USEC_PER_MSEC) {
+          pa_log_warn("Fixed latency device, setting buffer latency to %zd.00 ms", buffer_msec);
+          u->buffer_latency = buffer_msec * PA_USEC_PER_MSEC;
+       }
+    }
+    u->configured_source_latency = pa_source_output_set_requested_latency(u->source_output, latency);
+}
+
 /* Called from output thread context */
 static void source_output_attach_cb(pa_source_output *o) {
     struct userdata *u;
@@ -350,24 +477,12 @@ static void source_output_detach_cb(pa_source_output *o) {
         pa_rtpoll_item_free(u->rtpoll_item_write);
         u->rtpoll_item_write = NULL;
     }
-}
-
-/* Called from output thread context */
-static void source_output_state_change_cb(pa_source_output *o, pa_source_output_state_t state) {
-    struct userdata *u;
-
-    pa_source_output_assert_ref(o);
-    pa_source_output_assert_io_context(o);
-    pa_assert_se(u = o->userdata);
-
-    if (PA_SOURCE_OUTPUT_IS_LINKED(state) && o->thread_info.state == PA_SOURCE_OUTPUT_INIT) {
-
-        u->skip = pa_usec_to_bytes(PA_CLIP_SUB(pa_source_get_latency_within_thread(o->source),
-                                               u->latency),
-                                   &o->sample_spec);
-
-        pa_log_info("Skipping %lu bytes", (unsigned long) u->skip);
-    }
+   u->source_sink_changed = true;
+   u->source_latency_sum = 0;
+   u->source_adjust_counter = 0;
+   u->underruns = 0;
+   u->no_peeks = 0;
+   u->buffer_latency = u->initial_buffer_latency;
 }
 
 /* Called from main thread */
@@ -393,7 +508,12 @@ static bool source_output_may_move_to_cb(pa_source_output *o, pa_source *dest) {
     if (!u->sink_input || !u->sink_input->sink)
         return true;
 
-    return dest != u->sink_input->sink->monitor_source;
+    /* We may still be adjusting, so reset rate to default before moving the source */
+    if (dest != u->sink_input->sink->monitor_source) {
+       pa_sink_input_set_rate(u->sink_input, u->source_output->sample_spec.rate);
+       return true;
+    }
+    return false;
 }
 
 /* Called from main thread */
@@ -401,6 +521,7 @@ static void source_output_moving_cb(pa_source_output *o, pa_source *dest) {
     pa_proplist *p;
     const char *n;
     struct userdata *u;
+    pa_usec_t sink_latency;
 
     if (!dest)
         return;
@@ -418,6 +539,26 @@ static void source_output_moving_cb(pa_source_output *o, pa_source *dest) {
     pa_sink_input_update_proplist(u->sink_input, PA_UPDATE_REPLACE, p);
     pa_proplist_free(p);
 
+    /* Set latency and calculate necessary buffer length */
+    set_source_output_latency(u, dest);
+    if (!u->buffer_latency_set) {
+       if (u->sink_input->sink->flags & PA_SINK_DYNAMIC_LATENCY)
+          u->buffer_latency = PA_MAX(u->buffer_latency, (pa_usec_t)(u->configured_sink_latency * 0.75));
+       else
+          u->buffer_latency = PA_MAX(u->buffer_latency, (u->core->default_fragment_size_msec + DEFAULT_BUFFER_MARGIN_MSEC) * PA_USEC_PER_MSEC);
+    }
+
+    pa_sink_input_get_latency(u->sink_input, &sink_latency);
+    if (u->send_counter > u->recv_counter)
+       sink_latency += pa_bytes_to_usec(u->send_counter - u->recv_counter, &u->sink_input->sample_spec);
+    if (dest->flags & PA_SOURCE_DYNAMIC_LATENCY)
+       sink_latency += pa_source_get_latency(dest);
+    else
+       sink_latency = PA_CLIP_SUB(sink_latency, pa_source_get_latency(dest));
+    memblockq_adjust(u, sink_latency, true);
+
+    u->latency_error = 400.0 / get_requested_latency(u);
+
     if (pa_source_get_state(dest) == PA_SOURCE_SUSPENDED)
         pa_sink_input_cork(u->sink_input, true);
     else
@@ -439,18 +580,29 @@ static void source_output_suspend_cb(pa_source_output *o, bool suspended) {
     update_adjust_timer(u);
 }
 
-/* Called from output thread context */
-static void update_min_memblockq_length(struct userdata *u) {
-    size_t length;
-
-    pa_assert(u);
-    pa_sink_input_assert_io_context(u->sink_input);
+/* Called from input thread context */
+static void update_source_requested_latency_cb(pa_source_output *i) {
+    struct userdata *u;
+    pa_usec_t source_latency;
 
-    length = pa_memblockq_get_length(u->memblockq);
+    pa_source_output_assert_ref(i);
+    pa_source_output_assert_io_context(i);
+    pa_assert_se(u = i->userdata);
 
-    if (u->min_memblockq_length == (size_t) -1 ||
-        length < u->min_memblockq_length)
-        u->min_memblockq_length = length;
+    /* Source latency may have changed */
+    source_latency = pa_source_get_requested_latency_within_thread(u->source_output->source);
+    if (source_latency > u->configured_source_latency) {
+       pa_log_warn("Source latency increased to %0.2f ms", (double)source_latency / PA_USEC_PER_MSEC);
+       u->configured_source_latency = source_latency;
+       if (!u->buffer_latency_set && u->buffer_latency < source_latency * 0.75)
+          u->buffer_latency = source_latency * 0.75;
+       if (!u->source_sink_changed) {
+          u->source_adjust_counter = 0;
+          u->source_latency_sum = 0;
+          u->underruns = 0;
+          u->no_peeks = 0;
+       }
+    }
 }
 
 /* Called from output thread context */
@@ -462,21 +614,23 @@ static int sink_input_pop_cb(pa_sink_input *i, size_t nbytes, pa_memchunk *chunk
     pa_assert_se(u = i->userdata);
     pa_assert(chunk);
 
+    u->pop_called = true;
     u->in_pop = true;
     while (pa_asyncmsgq_process_one(u->asyncmsgq) > 0)
         ;
     u->in_pop = false;
 
     if (pa_memblockq_peek(u->memblockq, chunk) < 0) {
-        pa_log_info("Could not peek into queue");
+        if (!u->source_sink_changed) {
+           u->no_peeks +=1;
+           pa_log_info("Could not peek into queue");
+        }
         return -1;
     }
 
     chunk->length = PA_MIN(chunk->length, nbytes);
     pa_memblockq_drop(u->memblockq, chunk->length);
 
-    update_min_memblockq_length(u);
-
     return 0;
 }
 
@@ -513,12 +667,13 @@ static int sink_input_process_msg_cb(pa_msgobject *obj, int code, void *data, in
 
             pa_sink_input_assert_io_context(u->sink_input);
 
-            if (PA_SINK_IS_OPENED(u->sink_input->sink->thread_info.state))
-                pa_memblockq_push_align(u->memblockq, chunk);
-            else
-                pa_memblockq_flush_write(u->memblockq, true);
+            pa_memblockq_push_align(u->memblockq, chunk);
+            u->recv_counter += (int64_t) chunk->length;
 
-            update_min_memblockq_length(u);
+            if (!PA_SINK_IS_OPENED(u->sink_input->sink->thread_info.state) || !u->pop_called) {
+                memblockq_adjust(u, (int32_t)(-u->configured_sink_latency / 4), false);
+                return 0;
+            }
 
             /* Is this the end of an underrun? Then let's start things
              * right-away */
@@ -527,67 +682,76 @@ static int sink_input_process_msg_cb(pa_msgobject *obj, int code, void *data, in
                 pa_memblockq_is_readable(u->memblockq)) {
 
                 pa_log_debug("Requesting rewind due to end of underrun.");
+                if (!u->source_sink_changed)
+                   u->underruns +=1;
                 pa_sink_input_request_rewind(u->sink_input,
                                              (size_t) (u->sink_input->thread_info.underrun_for == (size_t) -1 ? 0 : u->sink_input->thread_info.underrun_for),
                                              false, true, false);
             }
 
-            u->recv_counter += (int64_t) chunk->length;
-
             return 0;
 
         case SINK_INPUT_MESSAGE_REWIND:
 
             pa_sink_input_assert_io_context(u->sink_input);
 
-            if (PA_SINK_IS_OPENED(u->sink_input->sink->thread_info.state))
+            if (PA_SINK_IS_OPENED(u->sink_input->sink->thread_info.state) && u->pop_called)
                 pa_memblockq_seek(u->memblockq, -offset, PA_SEEK_RELATIVE, true);
             else
-                pa_memblockq_flush_write(u->memblockq, true);
+                memblockq_adjust(u, (int32_t)(-u->configured_sink_latency / 4), false);
 
             u->recv_counter -= offset;
 
-            update_min_memblockq_length(u);
-
             return 0;
 
         case SINK_INPUT_MESSAGE_LATENCY_SNAPSHOT: {
             size_t length;
 
-            update_min_memblockq_length(u);
-
             length = pa_memblockq_get_length(u->sink_input->thread_info.render_memblockq);
 
             u->latency_snapshot.recv_counter = u->recv_counter;
-            u->latency_snapshot.sink_input_buffer =
-                pa_memblockq_get_length(u->memblockq) +
-                (u->sink_input->thread_info.resampler ? pa_resampler_request(u->sink_input->thread_info.resampler, length) : length);
+            u->latency_snapshot.sink_input_buffer = pa_memblockq_get_length(u->memblockq) +
+                                                    (u->sink_input->thread_info.resampler ? pa_resampler_request(u->sink_input->thread_info.resampler, length) : length);
             u->latency_snapshot.sink_latency = pa_sink_get_latency_within_thread(u->sink_input->sink);
 
-            u->latency_snapshot.max_request = pa_sink_input_get_max_request(u->sink_input);
-
-            u->latency_snapshot.min_memblockq_length = u->min_memblockq_length;
-            u->min_memblockq_length = (size_t) -1;
-
             return 0;
         }
 
-        case SINK_INPUT_MESSAGE_MAX_REQUEST_CHANGED: {
-            /* This message is sent from the IO thread to the main
-             * thread! So don't be confused. All the user cases above
-             * are executed in thread context, but this one is not! */
-
-            pa_assert_ctl_context();
-
-            if (u->time_event)
-                adjust_rates(u);
-            return 0;
-        }
     }
 
     return pa_sink_input_process_msg(obj, code, data, offset, chunk);
 }
 
+static void set_sink_input_latency(struct userdata *u, pa_sink *sink) {
+     pa_usec_t min_latency, max_latency, buffer_msec, latency;
+
+    /* Set lower limit of sink latency to 2.333 ms, if buffer
+     * latency is specified, use latency_msec as sink latency */
+    latency = PA_MAX(u->latency / 3, 2.333 * PA_USEC_PER_MSEC);
+    if (u->buffer_latency_set)
+       latency = u->latency;
+
+    if(sink->flags & PA_SINK_DYNAMIC_LATENCY) {
+       pa_sink_get_latency_range(sink, &min_latency, &max_latency);
+       if (min_latency > latency && !u->buffer_latency_set) {
+          u->buffer_latency = PA_MAX(u->buffer_latency, (pa_usec_t)(min_latency * 0.75));
+          pa_log_warn("Cannot set requested sink latency, adjusting buffer to %0.2f ms", (double)u->buffer_latency / PA_USEC_PER_MSEC);
+       }
+       latency = PA_CLAMP(latency, min_latency, max_latency);
+    }
+    else {
+       latency = pa_sink_get_latency(sink);
+       if (latency == 0)
+          latency = pa_sink_get_fixed_latency(sink);
+       buffer_msec = u->core->default_fragment_size_msec + DEFAULT_BUFFER_MARGIN_MSEC;
+       if (!u->buffer_latency_set && u->buffer_latency < buffer_msec * PA_USEC_PER_MSEC) {
+          pa_log_warn("Fixed latency device, setting buffer latency to %zd.00 ms", buffer_msec);
+          u->buffer_latency = buffer_msec * PA_USEC_PER_MSEC;
+       }
+    }
+    u->configured_sink_latency = pa_sink_input_set_requested_latency(u->sink_input, latency);
+}
+
 /* Called from output thread context */
 static void sink_input_attach_cb(pa_sink_input *i) {
     struct userdata *u;
@@ -603,8 +767,6 @@ static void sink_input_attach_cb(pa_sink_input *i) {
 
     pa_memblockq_set_prebuf(u->memblockq, pa_sink_input_get_max_request(i)*2);
     pa_memblockq_set_maxrewind(u->memblockq, pa_sink_input_get_max_rewind(i));
-
-    u->min_memblockq_length = (size_t) -1;
 }
 
 /* Called from output thread context */
@@ -619,6 +781,13 @@ static void sink_input_detach_cb(pa_sink_input *i) {
         pa_rtpoll_item_free(u->rtpoll_item_read);
         u->rtpoll_item_read = NULL;
     }
+    u->source_sink_changed = true;
+    u->pop_called = false;
+    u->sink_latency_sum = 0;
+    u->sink_adjust_counter = 0;
+    u->underruns = 0;
+    u->no_peeks = 0;
+    u->buffer_latency = u->initial_buffer_latency;
 }
 
 /* Called from output thread context */
@@ -642,7 +811,6 @@ static void sink_input_update_max_request_cb(pa_sink_input *i, size_t nbytes) {
 
     pa_memblockq_set_prebuf(u->memblockq, nbytes*2);
     pa_log_info("Max request changed");
-    pa_asyncmsgq_post(pa_thread_mq_get()->outq, PA_MSGOBJECT(u->sink_input), SINK_INPUT_MESSAGE_MAX_REQUEST_CHANGED, NULL, 0, NULL, NULL);
 }
 
 /* Called from main thread */
@@ -673,6 +841,7 @@ static void sink_input_moving_cb(pa_sink_input *i, pa_sink *dest) {
     struct userdata *u;
     pa_proplist *p;
     const char *n;
+    pa_usec_t source_latency;
 
     if (!dest)
         return;
@@ -690,6 +859,22 @@ static void sink_input_moving_cb(pa_sink_input *i, pa_sink *dest) {
     pa_source_output_update_proplist(u->source_output, PA_UPDATE_REPLACE, p);
     pa_proplist_free(p);
 
+    /* Set latency and calculate necessary buffer length */
+    set_sink_input_latency(u, dest);
+    if (!u->buffer_latency_set) {
+       if (u->source_output->source->flags & PA_SOURCE_DYNAMIC_LATENCY)
+          u->buffer_latency = PA_MAX(u->buffer_latency, (pa_usec_t)(u->configured_source_latency * 0.75));
+       else
+          u->buffer_latency = PA_MAX(u->buffer_latency, (u->core->default_fragment_size_msec + DEFAULT_BUFFER_MARGIN_MSEC) * PA_USEC_PER_MSEC);
+    }
+
+    pa_source_output_get_latency(u->source_output, &source_latency);
+    if (u->send_counter > u->recv_counter)
+       source_latency += pa_bytes_to_usec(u->send_counter - u->recv_counter, &u->sink_input->sample_spec);
+    memblockq_adjust(u, source_latency, true);
+
+    u->latency_error = 400.0 / get_requested_latency(u);
+
     if (pa_sink_get_state(dest) == PA_SINK_SUSPENDED)
         pa_source_output_cork(u->source_output, true);
     else
@@ -709,7 +894,37 @@ static bool sink_input_may_move_to_cb(pa_sink_input *i, pa_sink *dest) {
     if (!u->source_output || !u->source_output->source)
         return true;
 
-    return dest != u->source_output->source->monitor_of;
+    /* We may still be adjusting, so reset rate to default before moving the sink */
+    if (dest != u->source_output->source->monitor_of) {
+       pa_sink_input_set_rate(u->sink_input, u->source_output->sample_spec.rate);
+       return true;
+    }
+    return false;
+}
+
+/* Called from output thread context. Reacts to a change of the sink's requested latency; only an increase beyond u->configured_sink_latency has any effect. */
+static void update_sink_requested_latency_cb(pa_sink_input *i) {
+    struct userdata *u;
+    pa_usec_t sink_latency;
+
+    pa_sink_input_assert_ref(i);
+    pa_sink_input_assert_io_context(i);
+    pa_assert_se(u = i->userdata);
+
+    /* Sink latency may have changed, so re-read the sink's requested latency and compare it with the stored value */
+    sink_latency = pa_sink_get_requested_latency_within_thread(u->sink_input->sink);
+    if (sink_latency > u->configured_sink_latency) {
+       pa_log_warn("Sink latency increased to %0.2f ms", (double)sink_latency / PA_USEC_PER_MSEC);
+       u->configured_sink_latency = sink_latency; /* track the new, larger value */
+       if (!u->buffer_latency_set && u->buffer_latency < sink_latency * 0.75) /* leave the buffer latency alone when buffer_latency_set is true */
+          u->buffer_latency = sink_latency * 0.75; /* raise the buffer latency to 75% of the new sink latency */
+       if (!u->source_sink_changed) { /* NOTE(review): presumably skipped because these counters are already zeroed wherever source_sink_changed is raised -- confirm */
+          u->sink_adjust_counter = 0; /* zero the running sink counter, latency sum, underrun and no-peek counts */
+          u->sink_latency_sum = 0;
+          u->underruns = 0;
+          u->no_peeks = 0;
+       }
+    }
 }
 
 /* Called from main thread */
@@ -734,12 +949,13 @@ int pa__init(pa_module *m) {
     pa_source *source = NULL;
     pa_source_output_new_data source_output_data;
     bool source_dont_move;
-    uint32_t latency_msec;
+    uint32_t latency_msec, buffer_latency_msec;
     pa_sample_spec ss;
     pa_channel_map map;
     bool format_set = false;
     bool rate_set = false;
     bool channels_set = false;
+    bool buffer_latency_set = false;
     pa_memchunk silence;
     uint32_t adjust_time_sec;
     const char *n;
@@ -799,6 +1015,11 @@ int pa__init(pa_module *m) {
         goto fail;
     }
 
+    if (ss.rate < 20 || ss.rate > 190500) {
+        pa_log("Invalid rate specification, valid range is 20 Hz to 190500 Hz");
+        goto fail;
+    }
+
     if (pa_modargs_get_value(ma, "format", NULL))
         format_set = true;
 
@@ -808,8 +1029,18 @@ int pa__init(pa_module *m) {
     if (pa_modargs_get_value(ma, "channels", NULL) || pa_modargs_get_value(ma, "channel_map", NULL))
         channels_set = true;
 
-    latency_msec = DEFAULT_LATENCY_MSEC;
-    if (pa_modargs_get_value_u32(ma, "latency_msec", &latency_msec) < 0 || latency_msec < 1 || latency_msec > 30000) {
+    buffer_latency_msec = 0;
+    if (pa_modargs_get_value_u32(ma, "buffer_latency_msec", &buffer_latency_msec) < 0 || buffer_latency_msec > 30000) {
+        pa_log_info("Invalid buffer latency specification");
+        goto fail;
+    }
+    else if (buffer_latency_msec > 0)
+       buffer_latency_set = true;
+
+    latency_msec = 0;
+    if (!buffer_latency_set)
+       latency_msec = DEFAULT_LATENCY_MSEC;
+    if (pa_modargs_get_value_u32(ma, "latency_msec", &latency_msec) < 0 || latency_msec > 30000) {
         pa_log("Invalid latency specification");
         goto fail;
     }
@@ -818,6 +1049,18 @@ int pa__init(pa_module *m) {
     u->core = m->core;
     u->module = m;
     u->latency = (pa_usec_t) latency_msec * PA_USEC_PER_MSEC;
+    if (buffer_latency_set)
+       u->initial_buffer_latency = (pa_usec_t) buffer_latency_msec * PA_USEC_PER_MSEC;
+    else
+       u->initial_buffer_latency = PA_MAX(u->latency / 4, 1.667 * PA_USEC_PER_MSEC);
+    u->buffer_latency = u->initial_buffer_latency;
+    u->buffer_latency_set = buffer_latency_set;
+    u->sink_latency_sum = 0;
+    u->source_latency_sum = 0;
+    u->pop_called = false;
+    u->underruns = 0;
+    u->no_peeks = 0;
+    u->source_sink_changed = true;
 
     adjust_time_sec = DEFAULT_ADJUST_TIME_USEC / PA_USEC_PER_SEC;
     if (pa_modargs_get_value_u32(ma, "adjust_time", &adjust_time_sec) < 0) {
@@ -894,9 +1137,13 @@ int pa__init(pa_module *m) {
     u->sink_input->may_move_to = sink_input_may_move_to_cb;
     u->sink_input->moving = sink_input_moving_cb;
     u->sink_input->suspend = sink_input_suspend_cb;
+    u->sink_input->update_sink_requested_latency = update_sink_requested_latency_cb;
     u->sink_input->userdata = u;
 
-    pa_sink_input_set_requested_latency(u->sink_input, u->latency/3);
+    if (u->latency < 4 * PA_USEC_PER_MSEC && !buffer_latency_set)
+       pa_log_warn("Latency limited to 4 ms, try buffer_latency_msec together with latency_msec if you know what you are doing");
+
+    set_sink_input_latency(u, u->sink_input->sink);
 
     pa_source_output_new_data_init(&source_output_data);
     source_output_data.driver = __FILE__;
@@ -941,13 +1188,13 @@ int pa__init(pa_module *m) {
     u->source_output->kill = source_output_kill_cb;
     u->source_output->attach = source_output_attach_cb;
     u->source_output->detach = source_output_detach_cb;
-    u->source_output->state_change = source_output_state_change_cb;
     u->source_output->may_move_to = source_output_may_move_to_cb;
     u->source_output->moving = source_output_moving_cb;
     u->source_output->suspend = source_output_suspend_cb;
+    u->source_output->update_source_requested_latency = update_source_requested_latency_cb;
     u->source_output->userdata = u;
 
-    pa_source_output_set_requested_latency(u->source_output, u->latency/3);
+    set_source_output_latency(u, u->source_output->source);
 
     pa_sink_input_get_silence(u->sink_input, &silence);
     u->memblockq = pa_memblockq_new(
@@ -962,6 +1209,14 @@ int pa__init(pa_module *m) {
             &silence);              /* silence frame */
     pa_memblock_unref(silence.memblock);
 
+    if(u->sink_input->sink->flags & PA_SINK_DYNAMIC_LATENCY)
+        memblockq_adjust(u, 0, true);
+    else
+        memblockq_adjust(u, (int32_t)(-u->configured_sink_latency / 4), true);
+
+    /* Initialize latency error (allow at least 1 ms latency offset) */
+    u->latency_error = 400.0 / get_requested_latency(u);
+
     u->asyncmsgq = pa_asyncmsgq_new(0);
 
     if (!pa_proplist_contains(u->source_output->proplist, PA_PROP_MEDIA_NAME))
-- 
2.1.4