From fb9a0a6d4b969287e34e99400256b9c633b7370b Mon Sep 17 00:00:00 2001
From: M Hightower <27247790+mhightower83@users.noreply.github.com>
Date: Mon, 4 Jan 2021 11:38:42 -0800
Subject: [PATCH] Extended ASM got fussy when using different optimizations,
 i.e., HWDT resets.

It seems you should not use input registers as scratch registers. Add an
extra output register instead. No code size increase.

Light refactoring for readability.
Added "C" reference code for the Extended ASM.
Save two cycles by loading a0 early in exc-c-wrapper-handler.S.
Use optimization O2.

Net change in size: 0 bytes with optimization; saves 4 bytes without
optimization. With these changes and "O2", saves 3 cycles on write and 6
cycles on read.
---
 cores/esp8266/core_esp8266_non32xfer.cpp | 141 ++++++++++++++---------
 cores/esp8266/exc-c-wrapper-handler.S    |   4 +-
 2 files changed, 90 insertions(+), 55 deletions(-)

diff --git a/cores/esp8266/core_esp8266_non32xfer.cpp b/cores/esp8266/core_esp8266_non32xfer.cpp
index eb4741e1ed..0d7bb19196 100644
--- a/cores/esp8266/core_esp8266_non32xfer.cpp
+++ b/cores/esp8266/core_esp8266_non32xfer.cpp
@@ -18,10 +18,10 @@
  */
 
 /*
- * This exception handler, allows for byte or short accesses to iRAM or PROGMEM
- * to succeed without causing a crash. It is still preferred to use the xxx_P
- * macros whenever possible, since they are probably 30x faster than this
- * exception handler method.
+ * This exception handler handles EXCCAUSE_LOAD_STORE_ERROR. It allows a
+ * byte or short access to iRAM or PROGMEM to succeed without causing a crash.
+ * When reading, it is still preferred to use the xxx_P macros when possible,
+ * since they are probably 30x faster than this exception handler method.
  *
  * Code taken directly from @pvvx's public domain code in
  * https://github.com/pvvx/esp8266web/blob/master/app/sdklib/system/app_main.c
@@ -37,6 +37,16 @@
 #include
 #include
 
+// All of these optimizations were tried and now work.
+// These results were from irammem.ino using GCC 10.2
+//   DRAM reference:                 uint16,   9 AVG cycles/transfer
+// #pragma GCC optimize("O0")     // uint16, 289 AVG cycles/transfer, IRAM: +180
+// #pragma GCC optimize("O1")     // uint16, 241 AVG cycles/transfer, IRAM: +16
+#pragma GCC optimize("O2")        // uint16, 230 AVG cycles/transfer, IRAM: +4
+// #pragma GCC optimize("O3")     // uint16, 230 AVG cycles/transfer, IRAM: +4
+// #pragma GCC optimize("Ofast")  // uint16, 230 AVG cycles/transfer, IRAM: +4
+// #pragma GCC optimize("Os")     // uint16, 233 AVG cycles/transfer, IRAM: 27556 +0
+
 extern "C" {
 
 #define LOAD_MASK   0x00f00fu
@@ -50,32 +60,55 @@ extern "C" {
 
 static fn_c_exception_handler_t old_c_handler = NULL;
 
-static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, int cause)
+static
+IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef, int cause)
 {
   do {
     /*
-      Had to split out some of the asm, compiler was reusing a register that it
-      needed later. A crash would come or go away with the slightest unrelated
-      changes elsewhere in the function.
-
-      Register a15 was used for epc1, then clobbered for rsr. Maybe an
-      __asm("":::"memory") before starting the asm would help for these cases.
-      For this instance moved setting epc1 closer to where it was used.
-      Edit. "&" on output register would have resolved the problem.
-      Refactored to reduce and consolidate register usage.
+      In adapting the public domain version, a crash would come or go away with
+      the slightest unrelated changes elsewhere in the function. Observed that
+      register a15 was used for epc1, then clobbered by `rsr`. I now believe a
+      "&" on the output register would have resolved the problem.
+
+      However, I have refactored the Extended ASM to reduce and consolidate
+      register usage, which also corrects the issue.
+
+      The positioning of the Extended ASM block (as early as possible in the
+      compiled function) is in part controlled by the immediate need for the
+      output variable `insn`. This placement aids in getting excvaddr read as
+      early as possible.
     */
-    uint32_t insn;
-    __asm(
-      "movi %0, ~3\n\t"          /* prepare a mask for the EPC */
-      "and %0, %0, %1\n\t"       /* apply mask for 32bit aligned base */
-      "ssa8l %1\n\t"             /* set up shift register for src op */
-      "l32i %1, %0, 0\n\t"       /* load part 1 */
-      "l32i %0, %0, 4\n\t"       /* load part 2 */
-      "src %0, %0, %1\n\t"       /* right shift to get faulting instruction */
-      :"=&r"(insn)
-      :"r"(ef->epc)
-      :
-    );
+    uint32_t insn, excvaddr;
+#if 1
+    {
+      uint32_t tmp;
+      __asm__ (
+        "rsr.excvaddr %[vaddr]\n\t"        /* read faulting address as early as possible */
+        "movi.n %[tmp], ~3\n\t"            /* prepare a mask for the EPC */
+        "and %[tmp], %[tmp], %[epc]\n\t"   /* apply mask for 32-bit aligned base */
+        "ssa8l %[epc]\n\t"                 /* set up shift register for src op */
+        "l32i %[insn], %[tmp], 0\n\t"      /* load part 1 */
+        "l32i %[tmp], %[tmp], 4\n\t"       /* load part 2 */
+        "src %[insn], %[tmp], %[insn]\n\t" /* right shift to get faulting instruction */
+        : [vaddr]"=&r"(excvaddr), [insn]"=&r"(insn), [tmp]"=&r"(tmp)
+        : [epc]"r"(ef->epc)
+        :);
+    }
+
+#else
+    {
+      __asm__ __volatile__ ("rsr.excvaddr %0;" : "=r"(excvaddr) :: "memory");
+      /*
+        "C" reference code for the ASM to document intent.
+        May also prove useful when isolating possible issues with Extended ASM,
+        optimizations, new compilers, etc.
+      */
+      uint32_t epc = ef->epc;
+      uint32_t *pWord = (uint32_t *)(epc & ~3);
+      uint64_t big_word = ((uint64_t)pWord[1] << 32) | pWord[0];
+      uint32_t pos = (epc & 3) * 8;
+      insn = (uint32_t)(big_word >> pos);
+    }
+#endif
 
     uint32_t what = insn & LOAD_MASK;
     uint32_t valmask = 0;
@@ -102,10 +135,6 @@ static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef,
       --regno;  /* account for skipped a1 in exception_frame */
     }
 
-    uint32_t excvaddr;
-    /* read out the faulting address */
-    __asm("rsr %0, EXCVADDR;" :"=r"(excvaddr)::);
-
 #ifdef DEBUG_ESP_MMU
     /* debug option to validate address so we don't hide memory access bugs in APP */
     if (mmu_is_iram((void *)excvaddr) || (is_read && mmu_is_icache((void *)excvaddr))) {
@@ -114,31 +143,34 @@ static IRAM_ATTR void non32xfer_exception_handler(struct __exception_frame *ef,
       continue;  /* fail */
     }
 #endif
-
-    if (is_read) {
-      /* Load, shift and mask down to correct size */
-      uint32_t val = (*(uint32_t *)(excvaddr & ~0x3));
-      val >>= (excvaddr & 0x3) * 8;
-      val &= valmask;
-
-      /* Sign-extend for L16SI, if applicable */
-      if (what == L16SI_MATCH && (val & 0x8000)) {
-        val |= 0xffff0000;
+    {
+      uint32_t *pWord = (uint32_t *)(excvaddr & ~0x3);
+      uint32_t pos = (excvaddr & 0x3) * 8;
+      uint32_t mem_val = *pWord;
+
+      if (is_read) {
+        /* shift and mask down to correct size */
+        mem_val >>= pos;
+        mem_val &= valmask;
+
+        /* Sign-extend for L16SI, if applicable */
+        if (what == L16SI_MATCH && (mem_val & 0x8000)) {
+          mem_val |= 0xffff0000;
+        }
+
+        ef->a_reg[regno] = mem_val;  /* carry out the load */
+
+      } else { /* is write */
+        uint32_t val = ef->a_reg[regno];  /* get value to store from register */
+        val <<= pos;
+        valmask <<= pos;
+        val &= valmask;
+
+        /* mask out field, and merge */
+        mem_val &= (~valmask);
+        mem_val |= val;
+        *pWord = mem_val;  /* carry out the store */
       }
-
-      ef->a_reg[regno] = val;  /* carry out the load */
-
-    } else { /* is write */
-      uint32_t val = ef->a_reg[regno];  /* get value to store from register */
-      val <<= (excvaddr & 0x3) * 8;
-      valmask <<= (excvaddr & 0x3) * 8;
-      val &= valmask;
-
-      /* Load, mask out field, and merge */
-      uint32_t dst_val = (*(uint32_t *)(excvaddr & ~0x3));
-      dst_val &= (~valmask);
-      dst_val |= val;
-      (*(uint32_t *)(excvaddr & ~0x3)) = dst_val;  /* carry out the store */
     }
 
     ef->epc += 3;  /* resume at following instruction */
@@ -201,6 +233,7 @@ static void _set_exception_handler_wrapper(int cause) {
   }
 }
 
+void install_non32xfer_exception_handler(void) __attribute__((weak));
 void install_non32xfer_exception_handler(void) {
   if (NULL == old_c_handler) {
     // Set the "C" exception handler the wrapper will call
diff --git a/cores/esp8266/exc-c-wrapper-handler.S b/cores/esp8266/exc-c-wrapper-handler.S
index f1ec1c391e..872702aa2d 100644
--- a/cores/esp8266/exc-c-wrapper-handler.S
+++ b/cores/esp8266/exc-c-wrapper-handler.S
@@ -189,6 +189,9 @@ _xtos_c_wrapper_handler:
 	// Restore special registers
 	l32i	a14, a1, UEXC_sar
+	// load early - saves two cycles - @mhightower83
+	movi	a0, _xtos_return_from_exc
+
 	// For compatibility with Arduino interrupt architecture, we keep interrupts 100%
 	// disabled.
 	//
 	/*
@@ -201,7 +204,6 @@ _xtos_c_wrapper_handler:
 //	rsil	a12, 1	// XCHAL_EXCM_LEVEL
 	rsil	a12, 15	// All levels blocked.
 	wsr	a14, SAR
-	movi	a0, _xtos_return_from_exc
 	jx	a0
 
 /* FIXME: what about _GeneralException ? */
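
For reference, below is a minimal, host-compilable sketch of the write path
added above (the shift/mask/merge sequence). This is illustration only, not
part of the patch: the function name emulated_s16i and the ram[] test buffer
are invented for the example, vaddr is a byte offset here rather than an
absolute address, and little-endian byte order (as on the ESP8266) is assumed.

/* Sketch of the handler's emulated 16-bit store: one aligned 32-bit
   read-modify-write, mirroring pWord/pos/valmask/mem_val above. */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static void emulated_s16i(uint32_t *base, uint32_t vaddr, uint16_t val)
{
  uint32_t *pWord = &base[(vaddr & ~3u) >> 2]; /* 32-bit aligned word */
  uint32_t pos = (vaddr & 3u) * 8;             /* bit offset of the halfword */
  uint32_t valmask = 0xffffu << pos;           /* field mask, pre-shifted */
  uint32_t mem_val = *pWord;                   /* single 32-bit load */
  mem_val &= ~valmask;                         /* mask out the old field */
  mem_val |= ((uint32_t)val << pos) & valmask; /* merge in the new value */
  *pWord = mem_val;                            /* single 32-bit store */
}

int main(void)
{
  uint32_t ram[2] = { 0x11223344u, 0x55667788u };
  emulated_s16i(ram, 2, 0xbeefu);  /* store halfword at byte offset 2 */
  assert(ram[0] == 0xbeef3344u);   /* upper halfword replaced, lower intact */
  printf("ram[0] = 0x%08x\n", (unsigned)ram[0]);
  return 0;
}

The point of the single load and single store is that iRAM tolerates only
32-bit accesses: the handler touches each affected word exactly once, at its
aligned address, and does all byte/halfword positioning in registers.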