Enable oratory's 15 band EQ (#23)

* Attempt at optimizing so the new filtering runs better. * Additional improvements. * Further optimisations. * Seems to work OK with 10 filters. Just noise with 11. * Increase config buffer size, and make the bqf_transform function inline * Remove extra loop and process input evently across both cores. * Enable 15 band EQ. * Shift some load of core1 * Revert buffer size change * Mark USB transfer as done sooner. * Fast multiply. * Fix build failure. * Rollback changes we dont need. * Fix save config to flash * Increase filter stages to 20. We cant quite run that many though. * Replace a few doubles with floats. According to the raspberry-pi-pico-c-sdk manual, doubles are around 3 times slower than floats.
2023-08-23 20:47:01 +01:00 · 2023-08-23 20:47:01 +01:00 · 1e6896f918
parent fd6cbf54d5
commit 1e6896f918
12 changed files with 129 additions and 137 deletions
--- a/firmware/code/CMakeLists.txt
+++ b/firmware/code/CMakeLists.txt
@ -14,7 +14,6 @@ add_executable(ploopy_headphones
    run.c
    ringbuf.c
    i2s.c
-    fix16.c
    bqf.c
    configuration_manager.c
 )
@ -63,6 +62,9 @@ target_compile_definitions(ploopy_headphones PRIVATE
    GIT_HASH="${GIT_HASH}"

    PICO_XOSC_STARTUP_DELAY_MULTIPLIER=64
+
+    # Performance, avoid calls to ____wrap___aeabi_lmul_veneer when doing 64bit multiplies
+    PICO_INT64_OPS_IN_RAM=1
 )

 pico_enable_stdio_usb(ploopy_headphones 0)
--- a/firmware/code/bqf.c
+++ b/firmware/code/bqf.c
@ -467,21 +467,6 @@ void bqf_highshelf_config(double fs, double f0, double dBgain, double Q,
    coefficients->a2 = fix3_28_from_dbl(a2);
 }

-fix3_28_t bqf_transform(fix3_28_t x, bqf_coeff_t *coefficients, bqf_mem_t *memory) {
-    fix3_28_t y = fix16_mul(coefficients->b0, x) -
-            fix16_mul(coefficients->a1, memory->y_1) +
-            fix16_mul(coefficients->b1, memory->x_1) -
-            fix16_mul(coefficients->a2, memory->y_2) +
-            fix16_mul(coefficients->b2, memory->x_2);
-
-    memory->x_2 = memory->x_1;
-    memory->x_1 = x;
-    memory->y_2 = memory->y_1;
-    memory->y_1 = y;
-
-    return y;
-}
-
 void bqf_memreset(bqf_mem_t *memory) {
    memory->x_1 = fix16_zero;
    memory->x_2 = fix16_zero;
--- a/firmware/code/bqf.h
+++ b/firmware/code/bqf.h
@ -41,9 +41,9 @@ typedef struct _bqf_mem_t {
    fix3_28_t y_2;
 } bqf_mem_t;

-// In reality we do not have enough CPU resource to run 8 filtering
-// stages without some optimisation.
-#define MAX_FILTER_STAGES 8
+// More filters should be possible, but the config structure
+// might grow beyond the current 512 byte size.
+#define MAX_FILTER_STAGES 20
 extern int filter_stages;

 extern bqf_coeff_t bqf_filters_left[MAX_FILTER_STAGES];
@ -65,7 +65,8 @@ void bqf_peaking_config(double, double, double, double, bqf_coeff_t *);
 void bqf_lowshelf_config(double, double, double, double, bqf_coeff_t *);
 void bqf_highshelf_config(double, double, double, double, bqf_coeff_t *);

-fix3_28_t bqf_transform(fix3_28_t, bqf_coeff_t *, bqf_mem_t *);
+static inline fix3_28_t bqf_transform(fix3_28_t, bqf_coeff_t *, bqf_mem_t *);
 void bqf_memreset(bqf_mem_t *);

+#include "bqf.inl"
 #endif
--- a/firmware/code/bqf.inl
+++ b/firmware/code/bqf.inl
@ -0,0 +1,36 @@
+/**
+ * Copyright 2022 Colin Lam, Ploopy Corporation
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ *
+ * SPECIAL THANKS TO:
+ * Robert Bristow-Johnson, a.k.a. RBJ
+ * for his exceptional work on biquad formulae as applied to digital
+ * audio filtering, summarised in his pamphlet, "Audio EQ Cookbook".
+ */
+
+static inline fix3_28_t bqf_transform(fix3_28_t x, bqf_coeff_t *coefficients, bqf_mem_t *memory) {
+    fix3_28_t y = fix16_mul(coefficients->b0, x) -
+            fix16_mul(coefficients->a1, memory->y_1) +
+            fix16_mul(coefficients->b1, memory->x_1) -
+            fix16_mul(coefficients->a2, memory->y_2) +
+            fix16_mul(coefficients->b2, memory->x_2);
+
+    memory->x_2 = memory->x_1;
+    memory->x_1 = x;
+    memory->y_2 = memory->y_1;
+    memory->y_1 = y;
+
+    return y;
+}
--- a/firmware/code/configuration_manager.c
+++ b/firmware/code/configuration_manager.c
@ -52,16 +52,23 @@ static const default_configuration default_config = {
    .set_configuration = { SET_CONFIGURATION, sizeof(default_config) },
    .filters = {
        .filter = { FILTER_CONFIGURATION, sizeof(default_config.filters) },
-        .f1 = { PEAKING,    {0},    40,   -20,  1.4 },
-        .f2 = { LOWSHELF,   {0},    105,  2.5,  0.7 },
-        .f3 = { PEAKING,    {0},    450,  7,    1.8 },
-        .f4 = { PEAKING,    {0},    2100, 8,    3.0 },
-        .f5 = { PEAKING,    {0},    3500, -7.5, 2.9 },
-        .f6 = { PEAKING,    {0},    5200, 5.5,  3.0 },
-        .f7 = { PEAKING,    {0},    6400, -19,  4.0 },
-        .f8 = { PEAKING,    {0},    9000, 3.0,  2.0 }
+        .f1  = { PEAKING,    {0},    38.5, -21.0,  1.4  },
+        .f2  = { PEAKING,    {0},    60,    -6.7,  0.5  },
+        .f3  = { LOWSHELF,   {0},    105,    5.5,  0.71 },
+        .f4  = { PEAKING,    {0},    280,   -3.5,  1.1  },
+        .f5  = { PEAKING,    {0},    350,   -1.6,  6.0  },
+        .f6  = { PEAKING,    {0},    425,    7.8,  1.3  },
+        .f7  = { PEAKING,    {0},    500,   -2.0,  7.0  },
+        .f8  = { PEAKING,    {0},    690,   -5.5,  3.0  },
+        .f9  = { PEAKING,    {0},   1000,   -2.2,  5.0  },
+        .f10 = { PEAKING,    {0},   1530,   -4.0,  2.5  },
+        .f11 = { PEAKING,    {0},   2250,    6.0,  2.0  },
+        .f12 = { PEAKING,    {0},   3430,  -12.2,  2.0  },
+        .f13 = { PEAKING,    {0},   4800,    4.0,  2.0  },
+        .f14 = { PEAKING,    {0},   6200,  -15.0,  3.0  },
+        .f15 = { HIGHSHELF,  {0},  12000,   -6.0,  0.71 }
    },
-    .preprocessing = { .header = { PREPROCESSING_CONFIGURATION, sizeof(default_config.preprocessing) }, -0.16f, true, {0} }
+    .preprocessing = { .header = { PREPROCESSING_CONFIGURATION, sizeof(default_config.preprocessing) }, -0.08f, true, {0} }
 };

 // Grab the last 4k page of flash for our configuration strutures.
@ -74,7 +81,7 @@ const uint8_t *user_configuration = (const uint8_t *) (XIP_BASE + USER_CONFIGURA
 * should handle merging configurations where, for example, only a new
 * filter_configuration_tlv was received.
 */
-#define CFG_BUFFER_SIZE 256
+#define CFG_BUFFER_SIZE 512
 static uint8_t working_configuration[2][CFG_BUFFER_SIZE];
 static uint8_t inactive_working_configuration = 0;
 static uint8_t result_buffer[CFG_BUFFER_SIZE] = { U16_TO_U8S_LE(NOK), U16_TO_U8S_LE(0) };
@ -129,7 +136,7 @@ bool validate_filter_configuration(filter_configuration_tlv *filters)
                printf("Error! Not enough data left for filter6 (%d)\n", remaining);
                return false;
            }
-            if (args->a0 == 0.0) {
+            if (args->a0 == 0.0f) {
                printf("Error! The a0 co-efficient of an IIR filter must not be 0.\n");
                return false;
            }
@ -182,7 +189,7 @@ void apply_filter_configuration(filter_configuration_tlv *filters) {
                uint32_t checksum = 0;
                for (int i = 0; i < sizeof(filter6) / 4; i++) checksum ^= ((uint32_t*) args)[i];
                if (checksum != bqf_filter_checksum[filter_stages]) {
-                    bqf_filters_left[filter_stages].a0 = fix3_28_from_dbl(1.0);
+                    bqf_filters_left[filter_stages].a0 = fix16_one;
                    bqf_filters_left[filter_stages].a1 = fix3_28_from_dbl(args->a1/args->a0);
                    bqf_filters_left[filter_stages].a2 = fix3_28_from_dbl(args->a2/args->a0);
                    bqf_filters_left[filter_stages].b0 = fix3_28_from_dbl(args->b0/args->a0);
@ -308,7 +315,7 @@ bool apply_configuration(tlv_header *config) {
 #ifndef TEST_TARGET
            case PREPROCESSING_CONFIGURATION: {
                preprocessing_configuration_tlv* preprocessing_config = (preprocessing_configuration_tlv*) tlv;
-                preprocessing.preamp = fix3_28_from_dbl(1.0 + preprocessing_config->preamp);
+                preprocessing.preamp = fix3_28_from_flt(1.0f + preprocessing_config->preamp);
                preprocessing.reverse_stereo = preprocessing_config->reverse_stereo;
                break;
            }
@ -352,7 +359,7 @@ bool __no_inline_not_in_flash_func(save_configuration)() {

        const size_t config_length = config->length - ((size_t)config->value - (size_t)config);
        // Write data to flash
-        uint8_t flash_buffer[FLASH_PAGE_SIZE];
+        uint8_t flash_buffer[CFG_BUFFER_SIZE];
        flash_header_tlv* flash_header = (flash_header_tlv*) flash_buffer;
        flash_header->header.type = FLASH_HEADER;
        flash_header->header.length = sizeof(flash_header_tlv) + config_length;
@ -362,7 +369,7 @@ bool __no_inline_not_in_flash_func(save_configuration)() {

        uint32_t ints = save_and_disable_interrupts();
        flash_range_erase(USER_CONFIGURATION_OFFSET, FLASH_SECTOR_SIZE);
-        flash_range_program(USER_CONFIGURATION_OFFSET, flash_buffer, FLASH_PAGE_SIZE);
+        flash_range_program(USER_CONFIGURATION_OFFSET, flash_buffer, CFG_BUFFER_SIZE);
        restore_interrupts(ints);

        power_up_dac();
--- a/firmware/code/configuration_types.h
+++ b/firmware/code/configuration_types.h
@ -17,8 +17,8 @@
 #include <stdint.h>

 #define FLASH_MAGIC 0x2E8AFEDD
-#define CONFIG_VERSION 2
-#define MINIMUM_CONFIG_VERSION 1
+#define CONFIG_VERSION 3
+#define MINIMUM_CONFIG_VERSION 3

 enum structure_types {
    // Commands/Responses, these are container TLVs. The Value will be a set of TLV structures.
@ -53,18 +53,20 @@ typedef struct __attribute__((__packed__)) _tlv_header {
 typedef struct __attribute__((__packed__)) _filter2 {
    uint8_t type;
    uint8_t reserved[3];
-    double f0;
-    double Q;
+    float f0;
+    float Q;
 } filter2;

 typedef struct __attribute__((__packed__)) _filter3 {
    uint8_t type;
    uint8_t reserved[3];
-    double f0;
-    double db_gain;
-    double Q;
+    float f0;
+    float db_gain;
+    float Q;
 } filter3;

+// WARNING: We wont be able to support more than 8 of these filters
+// due to the config structure size.
 typedef struct __attribute__((__packed__)) _filter6 {
    uint8_t type;
    uint8_t reserved[3];
@ -98,7 +100,7 @@ typedef struct __attribute__((__packed__)) _flash_header_tlv {

 typedef struct __attribute__((__packed__)) _preprocessing_configuration_tlv {
    tlv_header header;
-    double preamp;
+    float preamp;
    uint8_t reverse_stereo;
    uint8_t reserved[3];
 } preprocessing_configuration_tlv;
@ -137,6 +139,13 @@ typedef struct __attribute__((__packed__)) _default_configuration {
        filter3 f6;
        filter3 f7;
        filter3 f8;
+        filter3 f9;
+        filter3 f10;
+        filter3 f11;
+        filter3 f12;
+        filter3 f13;
+        filter3 f14;
+        filter3 f15;
    } filters;
    preprocessing_configuration_tlv preprocessing;
 } default_configuration;
--- a/firmware/code/fix16.h
+++ b/firmware/code/fix16.h
@ -25,13 +25,6 @@
 #include <stdbool.h>
 #include <inttypes.h>

-// During development, it can be useful to run with real double values for reference.
-//#define USE_DOUBLE
-#ifdef USE_DOUBLE
-typedef double fix16_t;
-static const fix16_t fix16_zero = 0;
-static const fix16_t fix16_one = 1;
-#else

 /// @brief Fixed point math type, in format Q3.28. One sign bit, 3 bits for left-of-decimal
 ///and 28 for right-of-decimal. This arrangment works because we normalize the incoming USB
@ -46,15 +39,15 @@ static const fix3_28_t fix16_one =    0x10000000;
 /// @brief Represents zero in fixed point world.
 static const fix3_28_t fix16_zero = 0x00000000;

-#endif
+static inline fix3_28_t norm_fix3_28_from_s16sample(int16_t);

+static inline int16_t norm_fix3_28_to_s16sample(fix3_28_t);

-fix3_28_t norm_fix3_28_from_s16sample(int16_t);
+static inline fix3_28_t fix3_28_from_flt(float);

-int16_t norm_fix3_28_to_s16sample(fix3_28_t);
+static inline fix3_28_t fix3_28_from_dbl(double);

-fix3_28_t fix3_28_from_dbl(double);
-
-fix3_28_t fix16_mul(fix3_28_t, fix3_28_t);
+static inline fix3_28_t fix16_mul(fix3_28_t, fix3_28_t);

+#include "fix16.inl"
 #endif
--- a/firmware/code/fix16.inl
+++ b/firmware/code/fix16.inl
@ -25,46 +25,10 @@
 #include <limits.h>
 #include "fix16.h"

-#ifdef USE_DOUBLE
-fix16_t fix16_from_s16sample(int16_t a) {
-    return a;
-}
-
-int16_t fix16_to_s16sample(fix16_t a) {
-    // Handle rounding up front, adding one can cause an overflow/underflow
-    if (a < 0) {
-        a -= 0.5;
-    } else {
-        a += 0.5;
-    }
-
-    // Saturate the value if an overflow has occurred
-    if (a < SHRT_MIN) {
-        return SHRT_MIN;
-    }
-    if (a < SHRT_MAX) {
-        return SHRT_MAX;
-    }
-    return a;
-}
-
-fix16_t fix16_from_dbl(double a) {
-    return a;
-}
-
-double fix16_to_dbl(fix16_t a) {
-    return a;
-}
-
-fix16_t fix16_mul(fix16_t inArg0, fix16_t inArg1) {
-    return inArg0 * inArg1;
-}
-#else
-
 /// @brief Produces a fixed point number from a 16-bit signed integer, normalized to ]-1,1[.
 /// @param a Signed 16-bit integer.
 /// @return A fixed point number in Q3.28 format, with input normalized to ]-1,1[.
-fix3_28_t norm_fix3_28_from_s16sample(int16_t a) {
+static inline fix3_28_t norm_fix3_28_from_s16sample(int16_t a) {
    /* So, we're using a Q3.28 fixed point system here, and we want the incoming
       audio signal to be represented as a number between -1 and 1. To do this,
       we need the 16-bit value to map to the 28-bit right-of-decimal field in
@ -79,7 +43,7 @@ fix3_28_t norm_fix3_28_from_s16sample(int16_t a) {
 ///        calculated sample to one that the DAC can understand.
 /// @param a
 /// @return Signed 16-bit integer.
-int16_t norm_fix3_28_to_s16sample(fix3_28_t a) {
+static inline int16_t norm_fix3_28_to_s16sample(fix3_28_t a) {
    // Handle rounding up front, adding one can cause an overflow/underflow

    // It's not clear exactly how this works, so we'll disable it for now.
@ -110,8 +74,13 @@ int16_t norm_fix3_28_to_s16sample(fix3_28_t a) {
    return (a >> 12);
 }

+static inline fix3_28_t fix3_28_from_flt(float a) {
+    float temp = a * fix16_one;
+    temp += ((temp >= 0) ? 0.5f : -0.5f);
+    return (fix3_28_t)temp;
+}

-fix3_28_t fix3_28_from_dbl(double a) {
+static inline fix3_28_t fix3_28_from_dbl(double a) {
    double temp = a * fix16_one;
    temp += (double)((temp >= 0) ? 0.5f : -0.5f);
    return (fix3_28_t)temp;
@ -121,27 +90,22 @@ fix3_28_t fix3_28_from_dbl(double a) {
 /// @param inArg0 Q3.28 format fixed point number.
 /// @param inArg1 Q3.28 format fixed point number.
 /// @return A Q3.28 fixed point number that represents the truncated result of inArg0 x inArg1.
-fix3_28_t fix16_mul(fix3_28_t inArg0, fix3_28_t inArg1) {
-    const int64_t product = (int64_t)inArg0 * inArg1;
+static inline fix3_28_t fix16_mul(fix3_28_t inArg0, fix3_28_t inArg1) {
+    int32_t A = (inArg0 >> 14), C = (inArg1 >> 14);
+    uint32_t B = (inArg0 & 0x3FFF), D = (inArg1 & 0x3FFF);
+    int32_t AC = A*C;
+    int32_t AD_CB = A*D + C*B;
+    int32_t product_hi = AC + (AD_CB >> 14);

-    /* Since we're expecting 2 Q3.28 numbers, the multiplication result should be a Q7.56 number.
-       To bring this number back to the right order of magnitude, we need to shift
-       it to the right by 28. */
-    fix3_28_t result = product >> 28;
+#if HANDLE_CARRY
+	// Handle carry from lower bits to upper part of result.
+    uint32_t BD = B*D;
+	uint32_t ad_cb_temp = AD_CB << 14;
+	uint32_t product_lo = BD + ad_cb_temp;

-    // Handle rounding where we are choppping off low order bits
-    // Disabled for now, too much load. We get crackling when adjusting
-    // the volume.
-    #if 0
-    if (product & 0x4000) {
-        if (result >= 0) {
-            result++;
-        }
-        else {
-            result--;
-        }
-    }
-    #endif
-    return result;
-}
-#endif
+	if (product_lo < BD)
+		product_hi++;
+#endif
+
+    return product_hi;
+}
--- a/firmware/code/i2s.h
+++ b/firmware/code/i2s.h
@ -70,4 +70,4 @@ void feed_dma(i2s_obj_t *, uint8_t *);

 uint32_t copy_userbuf_to_ringbuf(i2s_obj_t *, const uint8_t *, uint);

-#endif
+#endif
--- a/firmware/code/ringbuf.c
+++ b/firmware/code/ringbuf.c
@ -78,4 +78,4 @@ size_t ringbuf_available_data(ring_buf_t *rbuf) {

 size_t ringbuf_available_space(ring_buf_t *rbuf) {
    return rbuf->size - ringbuf_available_data(rbuf) - 1;
-}
+}
--- a/firmware/code/ringbuf.h
+++ b/firmware/code/ringbuf.h
@ -42,4 +42,4 @@ bool ringbuf_is_full(ring_buf_t *);
 size_t ringbuf_available_data(ring_buf_t *);
 size_t ringbuf_available_space(ring_buf_t *);

-#endif
+#endif
--- a/firmware/code/run.c
+++ b/firmware/code/run.c
@ -118,7 +118,7 @@ static void update_volume()
 // PCM data into I2S data that gets shipped out to the PCM3060. It really
 // belongs with the other USB-related code due to its utter indecipherability,
 // but it's placed here to emphasize its importance.
-static void _as_audio_packet(struct usb_endpoint *ep) {
+static void __no_inline_not_in_flash_func(_as_audio_packet)(struct usb_endpoint *ep) {
    struct usb_buffer *usb_buffer = usb_current_out_packet_buffer(ep);
    int16_t *in = (int16_t *) usb_buffer->data;
    int32_t *out = (int32_t *) userbuf;
@ -156,15 +156,22 @@ static void _as_audio_packet(struct usb_endpoint *ep) {
    uint32_t ready = multicore_fifo_pop_blocking();
    multicore_fifo_push_blocking(CORE0_READY);

+    // Update the volume if required. We do this from core1 as
+    // core0 is more heavily loaded, doing this from core0 can
+    // lead to audio crackling.
+    update_volume();
+
+    // Update filters if required
+    apply_config_changes();
+
    // keep on truckin'
    usb_grow_transfer(ep->current_transfer, 1);
    usb_packet_done(ep);
 }

-void core1_entry() {
+void __no_inline_not_in_flash_func(core1_entry)() {
    uint8_t *userbuf = (uint8_t *) multicore_fifo_pop_blocking();
    int32_t *out = (int32_t *) userbuf;
-    int limit_counter = 100;

    // Signal that the thread has started
    multicore_fifo_push_blocking(CORE1_READY);
@ -191,19 +198,6 @@ void core1_entry() {
            }
        }

-        // Update the volume and filter configs if required. We do this from
-        // core1 as core0 is more heavily loaded, doing this from core0 can
-        // lead to audio crackling.
-        // Use of a counter reduces the amount of crackling when changing
-        // volume.
-        if (limit_counter != 0)
-            limit_counter--;
-        else {
-            limit_counter = 100;
-            update_volume();
-            apply_config_changes();
-        }
-
        // Signal to core 0 that the data has all been transformed
        multicore_fifo_push_blocking(CORE1_READY);

@ -253,7 +247,7 @@ void setup() {
    // The PCM3060 supports standard mode (100kbps) or fast mode (400kbps)
    // we run in fast mode so we dont block the core for too long while
    // updating the volume.
-    i2c_init(i2c0, 100000);
+    i2c_init(i2c0, 400000);
    gpio_set_function(PCM3060_SDA_PIN, GPIO_FUNC_I2C);
    gpio_set_function(PCM3060_SCL_PIN, GPIO_FUNC_I2C);
    gpio_pull_up(PCM3060_SDA_PIN);
@ -297,6 +291,7 @@ void setup() {
 * IF YOU DO, YOU COULD BLOW UP YOUR HARDWARE!                               *
 * YOU WERE WARNED!!!!!!!!!!!!!!!!                                           *
 ****************************************************************************/
+ // TODO: roundf will be much faster than round, but it might mess with timings
 void configure_neg_switch_pwm() {
    gpio_set_function(NEG_SWITCH_PWM_PIN, GPIO_FUNC_PWM);
    uint slice_num = pwm_gpio_to_slice_num(NEG_SWITCH_PWM_PIN);