
Commit 65579d2

Make precache() cleaner and more efficient (#8903)
No need to issue a MEMW instruction per load from each cache line; issuing it once after the last load is sufficient. "MEMW ensures that all previous load, store, acquire, release, prefetch, and cache instructions perform before performing any subsequent load, store, acquire, release, prefetch, or cache instructions." -- MEMW (Memory Wait), 6. Instruction Descriptions, Xtensa ISA Reference Manual (p.409)
1 parent a76ef29 commit 65579d2
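
In other words, a batch of cache-warming loads only needs a single barrier after the last of them. A minimal sketch of the idiom the diff below adopts (the names, loop bounds, and helper here are illustrative only, not the project's code):

  #include <stdint.h>

  // The empty asm with *addr as an input operand forces the compiler to
  // actually perform the load, without marking the pointer volatile.
  static inline void touch(const uint32_t *addr) {
      __asm__ volatile ("" : : "r"(*addr));
  }

  void warm(const uint32_t *start, uint32_t words, uint32_t stride) {
      for (uint32_t i = 0; i < words; i += stride) {
          touch(start + i);      // plain loads, no per-load barrier
      }
      __sync_synchronize();      // one full barrier (MEMW on Xtensa) orders them all
  }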

1 file changed: +7 -7 lines changed


cores/esp8266/core_esp8266_features.cpp

@@ -38,13 +38,13 @@ void precache(void *f, uint32_t bytes) {
   // page (ie 1 word in 8) for this to work.
   #define CACHE_PAGE_SIZE 32
 
-  uint32_t a0;
-  __asm__("mov.n %0, a0" : "=r"(a0));
-  uint32_t lines = (bytes/CACHE_PAGE_SIZE)+2;
-  volatile uint32_t *p = (uint32_t*)((f ? (uint32_t)f : a0) & ~0x03);
-  uint32_t x;
-  for (uint32_t i=0; i<lines; i++, p+=CACHE_PAGE_SIZE/sizeof(uint32_t)) x=*p;
-  (void)x;
+  uint32_t lines = (bytes / CACHE_PAGE_SIZE) + 2;
+  uint32_t *p = (uint32_t*)((uint32_t)(f ? f : __builtin_return_address(0)) & ~0x03);
+  do {
+    __asm__ volatile ("" : : "r"(*p)); // guarantee that the value of *p will be in some register (forced load)
+    p += CACHE_PAGE_SIZE / sizeof(uint32_t);
+  } while (--lines);
+  __sync_synchronize(); // full memory barrier, mapped to MEMW in Xtensa
 }
 
 /** based on efuse data, we could determine what type of chip this is
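
For reference, this is how the function body reads once the hunk is applied, assembled from the context and added lines above (lines outside the hunk are omitted, not guessed):

  void precache(void *f, uint32_t bytes) {
    // ...comment lines above the hunk omitted...
    // page (ie 1 word in 8) for this to work.
    #define CACHE_PAGE_SIZE 32

    // Touch one word per cache page, plus a couple of extra pages as slack
    // for alignment of the start address and length truncation.
    uint32_t lines = (bytes / CACHE_PAGE_SIZE) + 2;
    // Start at f, or at the caller's return address when f is NULL,
    // rounded down to a word boundary.
    uint32_t *p = (uint32_t*)((uint32_t)(f ? f : __builtin_return_address(0)) & ~0x03);
    do {
      __asm__ volatile ("" : : "r"(*p)); // forced load, no volatile needed
      p += CACHE_PAGE_SIZE / sizeof(uint32_t);
    } while (--lines);
    __sync_synchronize(); // single full memory barrier, mapped to MEMW in Xtensa
  }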

0 commit comments
