diff --git a/firmware/code/CMakeLists.txt b/firmware/code/CMakeLists.txt
index aae60e4..74d4658 100644
--- a/firmware/code/CMakeLists.txt
+++ b/firmware/code/CMakeLists.txt
@@ -14,7 +14,6 @@ add_executable(ploopy_headphones
     run.c
     ringbuf.c
     i2s.c
-    fix16.c
     bqf.c
     configuration_manager.c
 )
diff --git a/firmware/code/bqf.h b/firmware/code/bqf.h
index dcdc038..8e5933c 100644
--- a/firmware/code/bqf.h
+++ b/firmware/code/bqf.h
@@ -43,7 +43,7 @@ typedef struct _bqf_mem_t {
 
 // In reality we do not have enough CPU resource to run 8 filtering
 // stages without some optimisation.
-#define MAX_FILTER_STAGES 8
+#define MAX_FILTER_STAGES 10
 extern int filter_stages;
 
 extern bqf_coeff_t bqf_filters_left[MAX_FILTER_STAGES];
diff --git a/firmware/code/fix16.h b/firmware/code/fix16.h
index 40acced..9d7887e 100644
--- a/firmware/code/fix16.h
+++ b/firmware/code/fix16.h
@@ -25,13 +25,6 @@
 #include <stdbool.h>
 #include <inttypes.h>
 
-// During development, it can be useful to run with real double values for reference.
-//#define USE_DOUBLE
-#ifdef USE_DOUBLE
-typedef double fix16_t;
-static const fix16_t fix16_zero = 0;
-static const fix16_t fix16_one = 1;
-#else
 
 /// @brief Fixed point math type, in format Q3.28. One sign bit, 3 bits for left-of-decimal
 ///and 28 for right-of-decimal. This arrangment works because we normalize the incoming USB
@@ -46,15 +39,13 @@ static const fix3_28_t fix16_one =    0x10000000;
 /// @brief Represents zero in fixed point world.
 static const fix3_28_t fix16_zero = 0x00000000;
 
-#endif
+static inline fix3_28_t norm_fix3_28_from_s16sample(int16_t);
 
+static inline int16_t norm_fix3_28_to_s16sample(fix3_28_t);
 
-fix3_28_t norm_fix3_28_from_s16sample(int16_t);
+static inline fix3_28_t fix3_28_from_dbl(double);
 
-int16_t norm_fix3_28_to_s16sample(fix3_28_t);
-
-fix3_28_t fix3_28_from_dbl(double);
-
-fix3_28_t fix16_mul(fix3_28_t, fix3_28_t);
+static inline fix3_28_t fix16_mul(fix3_28_t, fix3_28_t);
 
+#include "fix16.inl"
 #endif
\ No newline at end of file
diff --git a/firmware/code/fix16.c b/firmware/code/fix16.inl
similarity index 81%
rename from firmware/code/fix16.c
rename to firmware/code/fix16.inl
index 1695ed6..8ce3f5c 100644
--- a/firmware/code/fix16.c
+++ b/firmware/code/fix16.inl
@@ -25,46 +25,10 @@
 #include <limits.h>
 #include "fix16.h"
 
-#ifdef USE_DOUBLE
-fix16_t fix16_from_s16sample(int16_t a) {
-    return a;
-}
-
-int16_t fix16_to_s16sample(fix16_t a) {
-    // Handle rounding up front, adding one can cause an overflow/underflow
-    if (a < 0) {
-        a -= 0.5;
-    } else {
-        a += 0.5;
-    }
-
-    // Saturate the value if an overflow has occurred
-    if (a < SHRT_MIN) {
-        return SHRT_MIN;
-    }
-    if (a < SHRT_MAX) {
-        return SHRT_MAX;
-    }
-    return a;
-}
-
-fix16_t fix16_from_dbl(double a) {
-    return a;
-}
-
-double fix16_to_dbl(fix16_t a) {
-    return a;
-}
-
-fix16_t fix16_mul(fix16_t inArg0, fix16_t inArg1) {
-    return inArg0 * inArg1;
-}
-#else
-
 /// @brief Produces a fixed point number from a 16-bit signed integer, normalized to ]-1,1[.
 /// @param a Signed 16-bit integer.
 /// @return A fixed point number in Q3.28 format, with input normalized to ]-1,1[.
-fix3_28_t norm_fix3_28_from_s16sample(int16_t a) {
+static inline fix3_28_t norm_fix3_28_from_s16sample(int16_t a) {
     /* So, we're using a Q3.28 fixed point system here, and we want the incoming
        audio signal to be represented as a number between -1 and 1. To do this,
        we need the 16-bit value to map to the 28-bit right-of-decimal field in
@@ -79,7 +43,7 @@ fix3_28_t norm_fix3_28_from_s16sample(int16_t a) {
 ///        calculated sample to one that the DAC can understand.
 /// @param a
 /// @return Signed 16-bit integer.
-int16_t norm_fix3_28_to_s16sample(fix3_28_t a) {
+static inline int16_t norm_fix3_28_to_s16sample(fix3_28_t a) {
     // Handle rounding up front, adding one can cause an overflow/underflow
 
     // It's not clear exactly how this works, so we'll disable it for now.
@@ -110,8 +74,7 @@ int16_t norm_fix3_28_to_s16sample(fix3_28_t a) {
     return (a >> 12);
 }
 
-
-fix3_28_t fix3_28_from_dbl(double a) {
+static inline fix3_28_t fix3_28_from_dbl(double a) {
     double temp = a * fix16_one;
     temp += (double)((temp >= 0) ? 0.5f : -0.5f);
     return (fix3_28_t)temp;
@@ -121,7 +84,7 @@ fix3_28_t fix3_28_from_dbl(double a) {
 /// @param inArg0 Q3.28 format fixed point number.
 /// @param inArg1 Q3.28 format fixed point number.
 /// @return A Q3.28 fixed point number that represents the truncated result of inArg0 x inArg1.
-fix3_28_t fix16_mul(fix3_28_t inArg0, fix3_28_t inArg1) {
+static inline fix3_28_t fix16_mul(fix3_28_t inArg0, fix3_28_t inArg1) {
     const int64_t product = (int64_t)inArg0 * inArg1;
 
     /* Since we're expecting 2 Q3.28 numbers, the multiplication result should be a Q7.56 number.
@@ -143,5 +106,4 @@ fix3_28_t fix16_mul(fix3_28_t inArg0, fix3_28_t inArg1) {
     }
     #endif
     return result;
-}
-#endif
\ No newline at end of file
+}
\ No newline at end of file
diff --git a/firmware/code/i2s.c b/firmware/code/i2s.c
index 89e0b21..73bc12c 100644
--- a/firmware/code/i2s.c
+++ b/firmware/code/i2s.c
@@ -64,7 +64,7 @@ void i2s_write_init(i2s_obj_t *self) {
             self->prog_offset + self->pio_program->length - 1);
     pio_sm_set_config(self->pio, self->sm, &config);
 
-    uint8_t *rbs = malloc(sizeof(uint8_t) * RINGBUF_LEN_IN_BYTES);
+    uint32_t *rbs = malloc(sizeof *rbs * RINGBUF_LEN_IN_BYTES); // one 32-bit slot per ring index passed to ringbuf_init
     ringbuf_init(&self->ring_buffer, rbs, RINGBUF_LEN_IN_BYTES);
 
     irq_set_exclusive_handler(DMA_IRQ_1, dma_irq_write_handler);
@@ -169,27 +169,27 @@ uint8_t *dma_get_buffer(i2s_obj_t *i2s_obj, uint channel) {
 void feed_dma(i2s_obj_t *self, uint8_t *dma_buffer_p) {
     // when data exists, copy samples from ring buffer
     if (ringbuf_available_data(&self->ring_buffer) >= SIZEOF_HALF_DMA_BUFFER_IN_BYTES) {
-        for (uint32_t i = 0; i < SIZEOF_HALF_DMA_BUFFER_IN_BYTES; i++)
-            ringbuf_pop(&self->ring_buffer, &dma_buffer_p[i]);
+        for (uint32_t i = 0; i < SIZEOF_HALF_DMA_BUFFER_IN_BYTES; i+=4)
+            ringbuf_pop(&self->ring_buffer, (uint32_t*)&dma_buffer_p[i]);
     } else {
         // underflow.  clear buffer to transmit "silence" on the I2S bus
         memset(dma_buffer_p, 0, SIZEOF_HALF_DMA_BUFFER_IN_BYTES);
     }
 }
 
-uint i2s_stream_write(i2s_obj_t *self, const uint8_t *buf_out, uint size) {
+uint i2s_stream_write(i2s_obj_t *self, const uint32_t *buf_out, uint size) {
     if (size == 0) {
         //printf("ERROR: buffer can't be length zero");
         exit(1);
     }
 
-    uint32_t num_bytes_written = copy_userbuf_to_ringbuf(self, buf_out, size);
-    return num_bytes_written;
+    uint32_t num_words_written = copy_userbuf_to_ringbuf(self, buf_out, size);
+    return num_words_written;
 }
 
 // TODO maybe we can skip every fourth byte, if we're doing this in 24-bit...
 // could save on some processing power
-uint32_t copy_userbuf_to_ringbuf(i2s_obj_t *self, const uint8_t *buf_out, uint size) {
+uint32_t copy_userbuf_to_ringbuf(i2s_obj_t *self, const uint32_t *buf_out, uint size) {
     uint32_t a_index = 0;
 
     while (a_index < size) {
diff --git a/firmware/code/i2s.h b/firmware/code/i2s.h
index ca30353..faf0ec6 100644
--- a/firmware/code/i2s.h
+++ b/firmware/code/i2s.h
@@ -59,7 +59,7 @@ typedef struct _i2s_obj_t {
 extern i2s_obj_t i2s_write_obj;
 
 void i2s_write_init(i2s_obj_t *);
-uint i2s_stream_write(i2s_obj_t *, const uint8_t *, uint);
+uint i2s_stream_write(i2s_obj_t *, const uint32_t *, uint);
 
 void dma_irq_handler(uint8_t);
 void dma_irq_write_handler(void);
@@ -68,6 +68,6 @@ void dma_configure(i2s_obj_t *);
 uint8_t *dma_get_buffer(i2s_obj_t *, uint);
 void feed_dma(i2s_obj_t *, uint8_t *);
 
-uint32_t copy_userbuf_to_ringbuf(i2s_obj_t *, const uint8_t *, uint);
+uint32_t copy_userbuf_to_ringbuf(i2s_obj_t *, const uint32_t *, uint);
 
 #endif
\ No newline at end of file
diff --git a/firmware/code/ringbuf.c b/firmware/code/ringbuf.c
index b6399f3..ca466fe 100644
--- a/firmware/code/ringbuf.c
+++ b/firmware/code/ringbuf.c
@@ -33,14 +33,14 @@
 // - Sequential atomic operations
 // One byte of capacity is used to detect buffer empty/full
 
-void ringbuf_init(ring_buf_t *rbuf, uint8_t *buffer, size_t size) {
+void ringbuf_init(ring_buf_t *rbuf, uint32_t *buffer, size_t size) {
     rbuf->buffer = buffer;
     rbuf->size = size;
     rbuf->head = 0;
     rbuf->tail = 0;
 }
 
-bool ringbuf_push(ring_buf_t *rbuf, uint8_t data) {
+bool ringbuf_push(ring_buf_t *rbuf, uint32_t data) {
     size_t next_tail = (rbuf->tail + 1) % rbuf->size;
 
     if (next_tail != rbuf->head) {
@@ -53,7 +53,7 @@ bool ringbuf_push(ring_buf_t *rbuf, uint8_t data) {
     return false;
 }
 
-bool ringbuf_pop(ring_buf_t *rbuf, uint8_t *data) {
+bool ringbuf_pop(ring_buf_t *rbuf, uint32_t *data) {
     if (rbuf->head == rbuf->tail) {
         // empty
         return false;
diff --git a/firmware/code/ringbuf.h b/firmware/code/ringbuf.h
index 5e1cbcc..dd83f85 100644
--- a/firmware/code/ringbuf.h
+++ b/firmware/code/ringbuf.h
@@ -28,15 +28,15 @@
 #include "pico/stdlib.h"
 
 typedef struct _ring_buf_t {
-    uint8_t *buffer;
+    uint32_t *buffer;
     size_t head;
     size_t tail;
     size_t size;
 } ring_buf_t;
 
-void ringbuf_init(ring_buf_t *, uint8_t *, size_t);
-bool ringbuf_push(ring_buf_t *, uint8_t );
-bool ringbuf_pop(ring_buf_t *, uint8_t *);
+void ringbuf_init(ring_buf_t *, uint32_t *, size_t);
+bool ringbuf_push(ring_buf_t *, uint32_t );
+bool ringbuf_pop(ring_buf_t *, uint32_t *);
 bool ringbuf_is_empty(ring_buf_t *);
 bool ringbuf_is_full(ring_buf_t *);
 size_t ringbuf_available_data(ring_buf_t *);
diff --git a/firmware/code/run.c b/firmware/code/run.c
index 67649e6..d6f13fd 100644
--- a/firmware/code/run.c
+++ b/firmware/code/run.c
@@ -123,83 +123,60 @@ static void __no_inline_not_in_flash_func(_as_audio_packet)(struct usb_endpoint
     int16_t *in = (int16_t *) usb_buffer->data;
     int32_t *out = (int32_t *) userbuf;
     int samples = usb_buffer->data_len / 2;
- 
-    multicore_fifo_push_blocking(CORE0_READY);
-    multicore_fifo_push_blocking((uintptr_t) in);
+
+    // TODO: For some reason, if we try to process `in` from both cores, the left and right
+    // channels flip back and forth.
+    if (preprocessing.reverse_stereo) {
+        for (int i = 0; i < samples; i+=2) {
+            out[i] = fix16_mul(norm_fix3_28_from_s16sample(in[i+1]), preprocessing.preamp);
+            out[i+1] = fix16_mul(norm_fix3_28_from_s16sample(in[i]), preprocessing.preamp);
+        }
+    }
+    else {
+        for (int i = 0; i < samples; i++)
+            out[i] = fix16_mul(norm_fix3_28_from_s16sample(in[i]), preprocessing.preamp);
+    }
+
     multicore_fifo_push_blocking(samples);
 
-    if (preprocessing.reverse_stereo) {
-        in++;
-    }
     for (int i = 0; i < samples; i += 2) {
-        // Preamp the sample
-        fix3_28_t x_f16 = fix16_mul(norm_fix3_28_from_s16sample((int16_t) in[i]), preprocessing.preamp);
-
-        // Run the filters
         for (int j = 0; j < filter_stages; j++) {
-            x_f16 = bqf_transform(x_f16, &bqf_filters_left[j], &bqf_filters_mem_left[j]);
+            out[i] = bqf_transform(out[i], &bqf_filters_left[j], &bqf_filters_mem_left[j]);
         }
-        // Convert back to sample
-        out[i] = (int32_t) norm_fix3_28_to_s16sample(x_f16);
+        out[i] = (int32_t) norm_fix3_28_to_s16sample(out[i]);
     }
 
     // Signal to core 1 that we have processed our samples, so it can write to I2S
     multicore_fifo_push_blocking(CORE0_READY);
 
+    update_volume();
+    apply_config_changes();
+
     // keep on truckin'
     usb_grow_transfer(ep->current_transfer, 1);
     usb_packet_done(ep);
 }
 
 void __no_inline_not_in_flash_func(core1_entry)() {
-    uint8_t *userbuf = (uint8_t *) multicore_fifo_pop_blocking();
+    uint32_t *userbuf = (uint32_t *) multicore_fifo_pop_blocking();
     int32_t *out = (int32_t *) userbuf;
-    int limit_counter = 100;
 
     // Signal that the thread has started
     multicore_fifo_push_blocking(CORE1_READY);
 
     while (true) {
-        // Block until the userbuf is filled with data
-        uint32_t ready = multicore_fifo_pop_blocking();
-        while (ready != CORE0_READY)
-            ready = multicore_fifo_pop_blocking();
-        
-        int16_t *in = (int16_t *) multicore_fifo_pop_blocking();
         const uint32_t samples = multicore_fifo_pop_blocking();
 
-        if (preprocessing.reverse_stereo) {
-            in--;
-        }
         for (int i = 1; i < samples; i += 2) {
-            // Preamp the sample
-            fix3_28_t x_f16 = fix16_mul(norm_fix3_28_from_s16sample((int16_t) in[i]), preprocessing.preamp);
-
-            // Run the filters
             for (int j = 0; j < filter_stages; j++) {
-                x_f16 = bqf_transform(x_f16, &bqf_filters_right[j],  &bqf_filters_mem_right[j]);
+                out[i] = bqf_transform(out[i], &bqf_filters_right[j], &bqf_filters_mem_right[j]);
             }
-            // Convert back to sample
-            out[i] = (int32_t) norm_fix3_28_to_s16sample(x_f16);
+            out[i] = (int32_t) norm_fix3_28_to_s16sample(out[i]);
         }
 
-        // Update the volume and filter configs if required. We do this from
-        // core1 as core0 is more heavily loaded, doing this from core0 can
-        // lead to audio crackling.
-        // Use of a counter reduces the amount of crackling when changing
-        // volume.
-        if (limit_counter != 0)
-            limit_counter--;
-        else {
-            limit_counter = 100;
-            update_volume();
-            apply_config_changes();
-        }
-
-        // Signal to core 0 that the data has all been transformed
-        multicore_fifo_push_blocking(CORE1_READY);
-
-        i2s_stream_write(&i2s_write_obj, userbuf, samples * 4);
+        // Wait for Core 0 to finish running its filtering before we write the buffer to I2S
+        multicore_fifo_pop_blocking();
+        i2s_stream_write(&i2s_write_obj, userbuf, samples);
     }
 }