From 1e6896f918383854ec5a74e860dd7ed58671cae5 Mon Sep 17 00:00:00 2001 From: George Norton <30636555+george-norton@users.noreply.github.com> Date: Wed, 23 Aug 2023 20:47:01 +0100 Subject: [PATCH] Enable oratory's 15 band EQ (#23) * Attempt at optimizing so the new filtering runs better. * Additional improvements. * Further optimisations. * Seems to work OK with 10 filters. Just noise with 11. * Increase config buffer size, and make the bqf_transform function inline * Remove extra loop and process input evently across both cores. * Enable 15 band EQ. * Shift some load of core1 * Revert buffer size change * Mark USB transfer as done sooner. * Fast multiply. * Fix build failure. * Rollback changes we dont need. * Fix save config to flash * Increase filter stages to 20. We cant quite run that many though. * Replace a few doubles with floats. According to the raspberry-pi-pico-c-sdk manual, doubles are around 3 times slower than floats. --- firmware/code/CMakeLists.txt | 4 +- firmware/code/bqf.c | 15 ----- firmware/code/bqf.h | 9 +-- firmware/code/bqf.inl | 36 +++++++++++ firmware/code/configuration_manager.c | 37 +++++++----- firmware/code/configuration_types.h | 25 +++++--- firmware/code/fix16.h | 19 ++---- firmware/code/{fix16.c => fix16.inl} | 86 ++++++++------------------- firmware/code/i2s.h | 2 +- firmware/code/ringbuf.c | 2 +- firmware/code/ringbuf.h | 2 +- firmware/code/run.c | 29 ++++----- 12 files changed, 129 insertions(+), 137 deletions(-) create mode 100644 firmware/code/bqf.inl rename firmware/code/{fix16.c => fix16.inl} (66%) diff --git a/firmware/code/CMakeLists.txt b/firmware/code/CMakeLists.txt index aae60e4..7c12e85 100644 --- a/firmware/code/CMakeLists.txt +++ b/firmware/code/CMakeLists.txt @@ -14,7 +14,6 @@ add_executable(ploopy_headphones run.c ringbuf.c i2s.c - fix16.c bqf.c configuration_manager.c ) @@ -63,6 +62,9 @@ target_compile_definitions(ploopy_headphones PRIVATE GIT_HASH="${GIT_HASH}" PICO_XOSC_STARTUP_DELAY_MULTIPLIER=64 + + # Performance, avoid calls to ____wrap___aeabi_lmul_veneer when doing 64bit multiplies + PICO_INT64_OPS_IN_RAM=1 ) pico_enable_stdio_usb(ploopy_headphones 0) diff --git a/firmware/code/bqf.c b/firmware/code/bqf.c index 21b67fb..191137c 100644 --- a/firmware/code/bqf.c +++ b/firmware/code/bqf.c @@ -467,21 +467,6 @@ void bqf_highshelf_config(double fs, double f0, double dBgain, double Q, coefficients->a2 = fix3_28_from_dbl(a2); } -fix3_28_t bqf_transform(fix3_28_t x, bqf_coeff_t *coefficients, bqf_mem_t *memory) { - fix3_28_t y = fix16_mul(coefficients->b0, x) - - fix16_mul(coefficients->a1, memory->y_1) + - fix16_mul(coefficients->b1, memory->x_1) - - fix16_mul(coefficients->a2, memory->y_2) + - fix16_mul(coefficients->b2, memory->x_2); - - memory->x_2 = memory->x_1; - memory->x_1 = x; - memory->y_2 = memory->y_1; - memory->y_1 = y; - - return y; -} - void bqf_memreset(bqf_mem_t *memory) { memory->x_1 = fix16_zero; memory->x_2 = fix16_zero; diff --git a/firmware/code/bqf.h b/firmware/code/bqf.h index dcdc038..5a38ccb 100644 --- a/firmware/code/bqf.h +++ b/firmware/code/bqf.h @@ -41,9 +41,9 @@ typedef struct _bqf_mem_t { fix3_28_t y_2; } bqf_mem_t; -// In reality we do not have enough CPU resource to run 8 filtering -// stages without some optimisation. -#define MAX_FILTER_STAGES 8 +// More filters should be possible, but the config structure +// might grow beyond the current 512 byte size. +#define MAX_FILTER_STAGES 20 extern int filter_stages; extern bqf_coeff_t bqf_filters_left[MAX_FILTER_STAGES]; @@ -65,7 +65,8 @@ void bqf_peaking_config(double, double, double, double, bqf_coeff_t *); void bqf_lowshelf_config(double, double, double, double, bqf_coeff_t *); void bqf_highshelf_config(double, double, double, double, bqf_coeff_t *); -fix3_28_t bqf_transform(fix3_28_t, bqf_coeff_t *, bqf_mem_t *); +static inline fix3_28_t bqf_transform(fix3_28_t, bqf_coeff_t *, bqf_mem_t *); void bqf_memreset(bqf_mem_t *); +#include "bqf.inl" #endif diff --git a/firmware/code/bqf.inl b/firmware/code/bqf.inl new file mode 100644 index 0000000..5fafdf5 --- /dev/null +++ b/firmware/code/bqf.inl @@ -0,0 +1,36 @@ +/** + * Copyright 2022 Colin Lam, Ploopy Corporation + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * SPECIAL THANKS TO: + * Robert Bristow-Johnson, a.k.a. RBJ + * for his exceptional work on biquad formulae as applied to digital + * audio filtering, summarised in his pamphlet, "Audio EQ Cookbook". + */ + +static inline fix3_28_t bqf_transform(fix3_28_t x, bqf_coeff_t *coefficients, bqf_mem_t *memory) { + fix3_28_t y = fix16_mul(coefficients->b0, x) - + fix16_mul(coefficients->a1, memory->y_1) + + fix16_mul(coefficients->b1, memory->x_1) - + fix16_mul(coefficients->a2, memory->y_2) + + fix16_mul(coefficients->b2, memory->x_2); + + memory->x_2 = memory->x_1; + memory->x_1 = x; + memory->y_2 = memory->y_1; + memory->y_1 = y; + + return y; +} \ No newline at end of file diff --git a/firmware/code/configuration_manager.c b/firmware/code/configuration_manager.c index f030521..e17728a 100644 --- a/firmware/code/configuration_manager.c +++ b/firmware/code/configuration_manager.c @@ -52,16 +52,23 @@ static const default_configuration default_config = { .set_configuration = { SET_CONFIGURATION, sizeof(default_config) }, .filters = { .filter = { FILTER_CONFIGURATION, sizeof(default_config.filters) }, - .f1 = { PEAKING, {0}, 40, -20, 1.4 }, - .f2 = { LOWSHELF, {0}, 105, 2.5, 0.7 }, - .f3 = { PEAKING, {0}, 450, 7, 1.8 }, - .f4 = { PEAKING, {0}, 2100, 8, 3.0 }, - .f5 = { PEAKING, {0}, 3500, -7.5, 2.9 }, - .f6 = { PEAKING, {0}, 5200, 5.5, 3.0 }, - .f7 = { PEAKING, {0}, 6400, -19, 4.0 }, - .f8 = { PEAKING, {0}, 9000, 3.0, 2.0 } + .f1 = { PEAKING, {0}, 38.5, -21.0, 1.4 }, + .f2 = { PEAKING, {0}, 60, -6.7, 0.5 }, + .f3 = { LOWSHELF, {0}, 105, 5.5, 0.71 }, + .f4 = { PEAKING, {0}, 280, -3.5, 1.1 }, + .f5 = { PEAKING, {0}, 350, -1.6, 6.0 }, + .f6 = { PEAKING, {0}, 425, 7.8, 1.3 }, + .f7 = { PEAKING, {0}, 500, -2.0, 7.0 }, + .f8 = { PEAKING, {0}, 690, -5.5, 3.0 }, + .f9 = { PEAKING, {0}, 1000, -2.2, 5.0 }, + .f10 = { PEAKING, {0}, 1530, -4.0, 2.5 }, + .f11 = { PEAKING, {0}, 2250, 6.0, 2.0 }, + .f12 = { PEAKING, {0}, 3430, -12.2, 2.0 }, + .f13 = { PEAKING, {0}, 4800, 4.0, 2.0 }, + .f14 = { PEAKING, {0}, 6200, -15.0, 3.0 }, + .f15 = { HIGHSHELF, {0}, 12000, -6.0, 0.71 } }, - .preprocessing = { .header = { PREPROCESSING_CONFIGURATION, sizeof(default_config.preprocessing) }, -0.16f, true, {0} } + .preprocessing = { .header = { PREPROCESSING_CONFIGURATION, sizeof(default_config.preprocessing) }, -0.08f, true, {0} } }; // Grab the last 4k page of flash for our configuration strutures. @@ -74,7 +81,7 @@ const uint8_t *user_configuration = (const uint8_t *) (XIP_BASE + USER_CONFIGURA * should handle merging configurations where, for example, only a new * filter_configuration_tlv was received. */ -#define CFG_BUFFER_SIZE 256 +#define CFG_BUFFER_SIZE 512 static uint8_t working_configuration[2][CFG_BUFFER_SIZE]; static uint8_t inactive_working_configuration = 0; static uint8_t result_buffer[CFG_BUFFER_SIZE] = { U16_TO_U8S_LE(NOK), U16_TO_U8S_LE(0) }; @@ -129,7 +136,7 @@ bool validate_filter_configuration(filter_configuration_tlv *filters) printf("Error! Not enough data left for filter6 (%d)\n", remaining); return false; } - if (args->a0 == 0.0) { + if (args->a0 == 0.0f) { printf("Error! The a0 co-efficient of an IIR filter must not be 0.\n"); return false; } @@ -182,7 +189,7 @@ void apply_filter_configuration(filter_configuration_tlv *filters) { uint32_t checksum = 0; for (int i = 0; i < sizeof(filter6) / 4; i++) checksum ^= ((uint32_t*) args)[i]; if (checksum != bqf_filter_checksum[filter_stages]) { - bqf_filters_left[filter_stages].a0 = fix3_28_from_dbl(1.0); + bqf_filters_left[filter_stages].a0 = fix16_one; bqf_filters_left[filter_stages].a1 = fix3_28_from_dbl(args->a1/args->a0); bqf_filters_left[filter_stages].a2 = fix3_28_from_dbl(args->a2/args->a0); bqf_filters_left[filter_stages].b0 = fix3_28_from_dbl(args->b0/args->a0); @@ -308,7 +315,7 @@ bool apply_configuration(tlv_header *config) { #ifndef TEST_TARGET case PREPROCESSING_CONFIGURATION: { preprocessing_configuration_tlv* preprocessing_config = (preprocessing_configuration_tlv*) tlv; - preprocessing.preamp = fix3_28_from_dbl(1.0 + preprocessing_config->preamp); + preprocessing.preamp = fix3_28_from_flt(1.0f + preprocessing_config->preamp); preprocessing.reverse_stereo = preprocessing_config->reverse_stereo; break; } @@ -352,7 +359,7 @@ bool __no_inline_not_in_flash_func(save_configuration)() { const size_t config_length = config->length - ((size_t)config->value - (size_t)config); // Write data to flash - uint8_t flash_buffer[FLASH_PAGE_SIZE]; + uint8_t flash_buffer[CFG_BUFFER_SIZE]; flash_header_tlv* flash_header = (flash_header_tlv*) flash_buffer; flash_header->header.type = FLASH_HEADER; flash_header->header.length = sizeof(flash_header_tlv) + config_length; @@ -362,7 +369,7 @@ bool __no_inline_not_in_flash_func(save_configuration)() { uint32_t ints = save_and_disable_interrupts(); flash_range_erase(USER_CONFIGURATION_OFFSET, FLASH_SECTOR_SIZE); - flash_range_program(USER_CONFIGURATION_OFFSET, flash_buffer, FLASH_PAGE_SIZE); + flash_range_program(USER_CONFIGURATION_OFFSET, flash_buffer, CFG_BUFFER_SIZE); restore_interrupts(ints); power_up_dac(); diff --git a/firmware/code/configuration_types.h b/firmware/code/configuration_types.h index a501246..051edd4 100644 --- a/firmware/code/configuration_types.h +++ b/firmware/code/configuration_types.h @@ -17,8 +17,8 @@ #include #define FLASH_MAGIC 0x2E8AFEDD -#define CONFIG_VERSION 2 -#define MINIMUM_CONFIG_VERSION 1 +#define CONFIG_VERSION 3 +#define MINIMUM_CONFIG_VERSION 3 enum structure_types { // Commands/Responses, these are container TLVs. The Value will be a set of TLV structures. @@ -53,18 +53,20 @@ typedef struct __attribute__((__packed__)) _tlv_header { typedef struct __attribute__((__packed__)) _filter2 { uint8_t type; uint8_t reserved[3]; - double f0; - double Q; + float f0; + float Q; } filter2; typedef struct __attribute__((__packed__)) _filter3 { uint8_t type; uint8_t reserved[3]; - double f0; - double db_gain; - double Q; + float f0; + float db_gain; + float Q; } filter3; +// WARNING: We wont be able to support more than 8 of these filters +// due to the config structure size. typedef struct __attribute__((__packed__)) _filter6 { uint8_t type; uint8_t reserved[3]; @@ -98,7 +100,7 @@ typedef struct __attribute__((__packed__)) _flash_header_tlv { typedef struct __attribute__((__packed__)) _preprocessing_configuration_tlv { tlv_header header; - double preamp; + float preamp; uint8_t reverse_stereo; uint8_t reserved[3]; } preprocessing_configuration_tlv; @@ -137,6 +139,13 @@ typedef struct __attribute__((__packed__)) _default_configuration { filter3 f6; filter3 f7; filter3 f8; + filter3 f9; + filter3 f10; + filter3 f11; + filter3 f12; + filter3 f13; + filter3 f14; + filter3 f15; } filters; preprocessing_configuration_tlv preprocessing; } default_configuration; diff --git a/firmware/code/fix16.h b/firmware/code/fix16.h index 40acced..66799ad 100644 --- a/firmware/code/fix16.h +++ b/firmware/code/fix16.h @@ -25,13 +25,6 @@ #include #include -// During development, it can be useful to run with real double values for reference. -//#define USE_DOUBLE -#ifdef USE_DOUBLE -typedef double fix16_t; -static const fix16_t fix16_zero = 0; -static const fix16_t fix16_one = 1; -#else /// @brief Fixed point math type, in format Q3.28. One sign bit, 3 bits for left-of-decimal ///and 28 for right-of-decimal. This arrangment works because we normalize the incoming USB @@ -46,15 +39,15 @@ static const fix3_28_t fix16_one = 0x10000000; /// @brief Represents zero in fixed point world. static const fix3_28_t fix16_zero = 0x00000000; -#endif +static inline fix3_28_t norm_fix3_28_from_s16sample(int16_t); +static inline int16_t norm_fix3_28_to_s16sample(fix3_28_t); -fix3_28_t norm_fix3_28_from_s16sample(int16_t); +static inline fix3_28_t fix3_28_from_flt(float); -int16_t norm_fix3_28_to_s16sample(fix3_28_t); +static inline fix3_28_t fix3_28_from_dbl(double); -fix3_28_t fix3_28_from_dbl(double); - -fix3_28_t fix16_mul(fix3_28_t, fix3_28_t); +static inline fix3_28_t fix16_mul(fix3_28_t, fix3_28_t); +#include "fix16.inl" #endif \ No newline at end of file diff --git a/firmware/code/fix16.c b/firmware/code/fix16.inl similarity index 66% rename from firmware/code/fix16.c rename to firmware/code/fix16.inl index 1695ed6..ca45a2e 100644 --- a/firmware/code/fix16.c +++ b/firmware/code/fix16.inl @@ -25,46 +25,10 @@ #include #include "fix16.h" -#ifdef USE_DOUBLE -fix16_t fix16_from_s16sample(int16_t a) { - return a; -} - -int16_t fix16_to_s16sample(fix16_t a) { - // Handle rounding up front, adding one can cause an overflow/underflow - if (a < 0) { - a -= 0.5; - } else { - a += 0.5; - } - - // Saturate the value if an overflow has occurred - if (a < SHRT_MIN) { - return SHRT_MIN; - } - if (a < SHRT_MAX) { - return SHRT_MAX; - } - return a; -} - -fix16_t fix16_from_dbl(double a) { - return a; -} - -double fix16_to_dbl(fix16_t a) { - return a; -} - -fix16_t fix16_mul(fix16_t inArg0, fix16_t inArg1) { - return inArg0 * inArg1; -} -#else - /// @brief Produces a fixed point number from a 16-bit signed integer, normalized to ]-1,1[. /// @param a Signed 16-bit integer. /// @return A fixed point number in Q3.28 format, with input normalized to ]-1,1[. -fix3_28_t norm_fix3_28_from_s16sample(int16_t a) { +static inline fix3_28_t norm_fix3_28_from_s16sample(int16_t a) { /* So, we're using a Q3.28 fixed point system here, and we want the incoming audio signal to be represented as a number between -1 and 1. To do this, we need the 16-bit value to map to the 28-bit right-of-decimal field in @@ -79,7 +43,7 @@ fix3_28_t norm_fix3_28_from_s16sample(int16_t a) { /// calculated sample to one that the DAC can understand. /// @param a /// @return Signed 16-bit integer. -int16_t norm_fix3_28_to_s16sample(fix3_28_t a) { +static inline int16_t norm_fix3_28_to_s16sample(fix3_28_t a) { // Handle rounding up front, adding one can cause an overflow/underflow // It's not clear exactly how this works, so we'll disable it for now. @@ -110,8 +74,13 @@ int16_t norm_fix3_28_to_s16sample(fix3_28_t a) { return (a >> 12); } +static inline fix3_28_t fix3_28_from_flt(float a) { + float temp = a * fix16_one; + temp += ((temp >= 0) ? 0.5f : -0.5f); + return (fix3_28_t)temp; +} -fix3_28_t fix3_28_from_dbl(double a) { +static inline fix3_28_t fix3_28_from_dbl(double a) { double temp = a * fix16_one; temp += (double)((temp >= 0) ? 0.5f : -0.5f); return (fix3_28_t)temp; @@ -121,27 +90,22 @@ fix3_28_t fix3_28_from_dbl(double a) { /// @param inArg0 Q3.28 format fixed point number. /// @param inArg1 Q3.28 format fixed point number. /// @return A Q3.28 fixed point number that represents the truncated result of inArg0 x inArg1. -fix3_28_t fix16_mul(fix3_28_t inArg0, fix3_28_t inArg1) { - const int64_t product = (int64_t)inArg0 * inArg1; +static inline fix3_28_t fix16_mul(fix3_28_t inArg0, fix3_28_t inArg1) { + int32_t A = (inArg0 >> 14), C = (inArg1 >> 14); + uint32_t B = (inArg0 & 0x3FFF), D = (inArg1 & 0x3FFF); + int32_t AC = A*C; + int32_t AD_CB = A*D + C*B; + int32_t product_hi = AC + (AD_CB >> 14); - /* Since we're expecting 2 Q3.28 numbers, the multiplication result should be a Q7.56 number. - To bring this number back to the right order of magnitude, we need to shift - it to the right by 28. */ - fix3_28_t result = product >> 28; +#if HANDLE_CARRY + // Handle carry from lower bits to upper part of result. + uint32_t BD = B*D; + uint32_t ad_cb_temp = AD_CB << 14; + uint32_t product_lo = BD + ad_cb_temp; - // Handle rounding where we are choppping off low order bits - // Disabled for now, too much load. We get crackling when adjusting - // the volume. - #if 0 - if (product & 0x4000) { - if (result >= 0) { - result++; - } - else { - result--; - } - } - #endif - return result; -} -#endif \ No newline at end of file + if (product_lo < BD) + product_hi++; +#endif + + return product_hi; +} \ No newline at end of file diff --git a/firmware/code/i2s.h b/firmware/code/i2s.h index ca30353..432b8e9 100644 --- a/firmware/code/i2s.h +++ b/firmware/code/i2s.h @@ -70,4 +70,4 @@ void feed_dma(i2s_obj_t *, uint8_t *); uint32_t copy_userbuf_to_ringbuf(i2s_obj_t *, const uint8_t *, uint); -#endif \ No newline at end of file +#endif diff --git a/firmware/code/ringbuf.c b/firmware/code/ringbuf.c index b6399f3..4c417c5 100644 --- a/firmware/code/ringbuf.c +++ b/firmware/code/ringbuf.c @@ -78,4 +78,4 @@ size_t ringbuf_available_data(ring_buf_t *rbuf) { size_t ringbuf_available_space(ring_buf_t *rbuf) { return rbuf->size - ringbuf_available_data(rbuf) - 1; -} \ No newline at end of file +} diff --git a/firmware/code/ringbuf.h b/firmware/code/ringbuf.h index 5e1cbcc..8543179 100644 --- a/firmware/code/ringbuf.h +++ b/firmware/code/ringbuf.h @@ -42,4 +42,4 @@ bool ringbuf_is_full(ring_buf_t *); size_t ringbuf_available_data(ring_buf_t *); size_t ringbuf_available_space(ring_buf_t *); -#endif \ No newline at end of file +#endif diff --git a/firmware/code/run.c b/firmware/code/run.c index 0920b1b..3a33f89 100644 --- a/firmware/code/run.c +++ b/firmware/code/run.c @@ -118,7 +118,7 @@ static void update_volume() // PCM data into I2S data that gets shipped out to the PCM3060. It really // belongs with the other USB-related code due to its utter indecipherability, // but it's placed here to emphasize its importance. -static void _as_audio_packet(struct usb_endpoint *ep) { +static void __no_inline_not_in_flash_func(_as_audio_packet)(struct usb_endpoint *ep) { struct usb_buffer *usb_buffer = usb_current_out_packet_buffer(ep); int16_t *in = (int16_t *) usb_buffer->data; int32_t *out = (int32_t *) userbuf; @@ -156,15 +156,22 @@ static void _as_audio_packet(struct usb_endpoint *ep) { uint32_t ready = multicore_fifo_pop_blocking(); multicore_fifo_push_blocking(CORE0_READY); + // Update the volume if required. We do this from core1 as + // core0 is more heavily loaded, doing this from core0 can + // lead to audio crackling. + update_volume(); + + // Update filters if required + apply_config_changes(); + // keep on truckin' usb_grow_transfer(ep->current_transfer, 1); usb_packet_done(ep); } -void core1_entry() { +void __no_inline_not_in_flash_func(core1_entry)() { uint8_t *userbuf = (uint8_t *) multicore_fifo_pop_blocking(); int32_t *out = (int32_t *) userbuf; - int limit_counter = 100; // Signal that the thread has started multicore_fifo_push_blocking(CORE1_READY); @@ -191,19 +198,6 @@ void core1_entry() { } } - // Update the volume and filter configs if required. We do this from - // core1 as core0 is more heavily loaded, doing this from core0 can - // lead to audio crackling. - // Use of a counter reduces the amount of crackling when changing - // volume. - if (limit_counter != 0) - limit_counter--; - else { - limit_counter = 100; - update_volume(); - apply_config_changes(); - } - // Signal to core 0 that the data has all been transformed multicore_fifo_push_blocking(CORE1_READY); @@ -253,7 +247,7 @@ void setup() { // The PCM3060 supports standard mode (100kbps) or fast mode (400kbps) // we run in fast mode so we dont block the core for too long while // updating the volume. - i2c_init(i2c0, 100000); + i2c_init(i2c0, 400000); gpio_set_function(PCM3060_SDA_PIN, GPIO_FUNC_I2C); gpio_set_function(PCM3060_SCL_PIN, GPIO_FUNC_I2C); gpio_pull_up(PCM3060_SDA_PIN); @@ -297,6 +291,7 @@ void setup() { * IF YOU DO, YOU COULD BLOW UP YOUR HARDWARE! * * YOU WERE WARNED!!!!!!!!!!!!!!!! * ****************************************************************************/ + // TODO: roundf will be much faster than round, but it might mess with timings void configure_neg_switch_pwm() { gpio_set_function(NEG_SWITCH_PWM_PIN, GPIO_FUNC_PWM); uint slice_num = pwm_gpio_to_slice_num(NEG_SWITCH_PWM_PIN);