|
19 | 19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335, USA |
20 | 20 | */ |
21 | 21 | #include <stdint.h> |
22 | | -#include <string.h> |
23 | 22 | #include "target.h" |
24 | 23 | #include "printf.h" |
25 | 24 | #include "image.h" /* for RAMFUNCTION */ |
@@ -374,59 +373,105 @@ static void hal_cpld_init(void) |
374 | 373 | } |
375 | 374 |
|
376 | 375 | #ifdef ENABLE_DDR |
377 | | -/* Relocate stack from CPC SRAM to DDR for more stack space. |
378 | | - * Call this after DDR is initialized and verified working. |
379 | | - * This allows signature verification (ECC P384) which needs ~20-30KB stack. */ |
380 | | -static void hal_relocate_stack_to_ddr(void) |
| 376 | +/* Release CPC SRAM back to cache mode. |
| 377 | + * Call after stack is relocated to DDR (done in boot_entry_C). |
| 378 | + * This gives us the full 2MB CPC as L3 cache for better performance. |
| 379 | + * |
| 380 | + * Before releasing CPC SRAM, .ramcode (RAMFUNCTION) is copied to DDR |
| 381 | + * and TLB9 is remapped: VA 0xF8F00000 -> PA DDR_RAMCODE_ADDR so that |
| 382 | + * RAMFUNCTION code (memcpy, wolfBoot_start, etc.) continues to work. */ |
| 383 | +static void hal_reconfigure_cpc_as_cache(void) |
381 | 384 | { |
382 | | - uint32_t new_sp = DDR_STACK_TOP - 64; /* 64-byte alignment, room for frame */ |
| 385 | + volatile uint32_t *cpc_csr0 = (volatile uint32_t *)(CPC_BASE + CPCCSR0); |
| 386 | + volatile uint32_t *cpc_srcr0 = (volatile uint32_t *)(CPC_BASE + CPCSRCR0); |
| 387 | + uint32_t reg; |
383 | 388 |
|
384 | | - /* Zero the DDR stack area for clean operation */ |
385 | | - memset((void*)DDR_STACK_BASE, 0, DDR_STACK_SIZE); |
| 389 | + /* Linker symbols for .ramcode section boundaries */ |
| 390 | + extern unsigned int _start_ramcode; |
| 391 | + extern unsigned int _end_ramcode; |
| 392 | + uint32_t ramcode_size = (uint32_t)&_end_ramcode - (uint32_t)&_start_ramcode; |
| 393 | + |
| 394 | + /* Step 1: Copy .ramcode from CPC SRAM to DDR. |
| 395 | + * Must use volatile loop — memcpy itself is in .ramcode! */ |
| 396 | + if (ramcode_size > 0) { |
| 397 | + volatile const uint32_t *src = (volatile const uint32_t *)&_start_ramcode; |
| 398 | + volatile uint32_t *dst = (volatile uint32_t *)DDR_RAMCODE_ADDR; |
| 399 | + volatile uint32_t *end = (volatile uint32_t *)(DDR_RAMCODE_ADDR + |
| 400 | + ramcode_size); |
| 401 | + while (dst < end) { |
| 402 | + *dst++ = *src++; |
| 403 | + } |
| 404 | + |
| 405 | + /* Flush D-cache and invalidate I-cache for the DDR copy */ |
| 406 | + flush_cache(DDR_RAMCODE_ADDR, ramcode_size); |
386 | 407 |
|
387 | | - /* Switch stack pointer from CPC SRAM to DDR. |
388 | | - * r1 is the stack pointer in PowerPC ABI. */ |
389 | | - __asm__ __volatile__( |
390 | | - "mr 1, %0\n" /* Move new stack address to r1 */ |
391 | | - "sync\n" |
392 | | - : |
393 | | - : "r" (new_sp) |
394 | | - : "memory" |
395 | | - ); |
| 408 | + /* Step 2: Remap TLB9: same VA (0xF8F00000) -> DDR physical address. |
| 409 | + * All .ramcode references use VA 0xF8F00000, so this makes them |
| 410 | + * transparently access the DDR copy instead of CPC SRAM. */ |
| 411 | + set_tlb(1, 9, |
| 412 | + L2SRAM_ADDR, DDR_RAMCODE_ADDR, 0, |
| 413 | + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_M, 0, |
| 414 | + INITIAL_SRAM_BOOKE_SZ, 1); |
| 415 | + |
| 416 | + /* Ensure TLB update and I-cache pick up new mapping */ |
| 417 | + invalidate_icache(); |
| 418 | + } |
396 | 419 |
|
397 | 420 | #ifdef DEBUG_UART |
398 | | - wolfBoot_printf("Stack relocated to DDR (SP=0x%x)\n", new_sp); |
| 421 | + wolfBoot_printf("Ramcode: copied %d bytes to DDR, TLB9 remapped\n", |
| 422 | + ramcode_size); |
399 | 423 | #endif |
400 | | -} |
401 | | - |
402 | | -/* Release CPC SRAM back to L2 cache mode after stack is relocated to DDR. |
403 | | - * This gives us the full 2MB CPC as instruction/data cache for better performance. */ |
404 | | -static void hal_reconfigure_cpc_as_cache(void) |
405 | | -{ |
406 | | - volatile uint32_t *cpc_csr0 = (volatile uint32_t *)(CPC_BASE + CPCCSR0); |
407 | | - volatile uint32_t *cpc_srcr0 = (volatile uint32_t *)(CPC_BASE + CPCSRCR0); |
408 | | - uint32_t reg; |
409 | 424 |
|
410 | | - /* Step 1: Flush the CPC to ensure no stale SRAM data. |
411 | | - * IMPORTANT: Read-modify-write to preserve CPCE/CPCPE enable bits! */ |
| 425 | + /* Step 3: Flush the CPC to push any dirty SRAM data out. |
| 426 | + * Read-modify-write to preserve CPCE/CPCPE enable bits. */ |
412 | 427 | reg = *cpc_csr0; |
413 | 428 | reg |= CPCCSR0_CPCFL; |
414 | 429 | *cpc_csr0 = reg; |
415 | 430 | __asm__ __volatile__("sync; isync" ::: "memory"); |
416 | 431 |
|
417 | | - /* Step 2: Poll until flush completes (CPCFL clears) */ |
418 | | - do { |
419 | | - reg = *cpc_csr0; |
420 | | - } while (reg & CPCCSR0_CPCFL); |
| 432 | + /* Step 4: Poll until flush completes (CPCFL clears) */ |
| 433 | + while (*cpc_csr0 & CPCCSR0_CPCFL); |
421 | 434 |
|
422 | | - /* Step 3: Disable SRAM mode - release ways back to cache */ |
423 | | - *cpc_srcr0 = 0; /* Clear SRAMEN and SRAMSZ */ |
| 435 | + /* Step 5: Disable SRAM mode - release all ways back to cache */ |
| 436 | + *cpc_srcr0 = 0; |
424 | 437 | __asm__ __volatile__("sync; isync" ::: "memory"); |
425 | 438 |
|
426 | | - /* CPC remains enabled (CPCE/CPCPE preserved), now with all ways as cache */ |
| 439 | + /* Step 6: Disable CPC SRAM LAW (no longer needed — TLB9 now routes |
| 440 | + * to DDR via LAW4, not CPC SRAM via LAW2). |
| 441 | + * Keep TLB9 — it's remapped to DDR and still in use. */ |
| 442 | + set32(LAWAR(2), 0); |
| 443 | + |
| 444 | + /* Step 7: Flash invalidate CPC to start fresh as cache */ |
| 445 | + reg = *cpc_csr0; |
| 446 | + reg |= CPCCSR0_CPCFI; |
| 447 | + *cpc_csr0 = reg; |
| 448 | + __asm__ __volatile__("sync; isync" ::: "memory"); |
| 449 | + while (*cpc_csr0 & CPCCSR0_CPCFI); |
| 450 | + |
| 451 | + /* CPC remains enabled (CPCE/CPCPE preserved), now all 2MB as cache */ |
427 | 452 |
|
428 | 453 | #ifdef DEBUG_UART |
429 | | - wolfBoot_printf("CPC: Released SRAM, full L2 cache enabled\n"); |
| 454 | + wolfBoot_printf("CPC: Released SRAM, full 2MB L2 cache enabled\n"); |
| 455 | +#endif |
| 456 | +} |
| 457 | + |
| 458 | +/* Make flash TLB cacheable for XIP code performance. |
| 459 | + * Changes TLB Entry 2 (flash) from MAS2_I|MAS2_G to MAS2_M. |
| 460 | + * This enables L1 I-cache + L2 + CPC to cache flash instructions. */ |
| 461 | +static void hal_flash_enable_caching(void) |
| 462 | +{ |
| 463 | + /* Rewrite flash TLB entry with cacheable attributes. |
| 464 | + * MAS2_M = memory coherence required; dropping MAS2_I|MAS2_G is what enables caching */ |
| 465 | + set_tlb(1, 2, |
| 466 | + FLASH_BASE_ADDR, FLASH_BASE_ADDR, FLASH_BASE_PHYS_HIGH, |
| 467 | + MAS3_SX | MAS3_SW | MAS3_SR, MAS2_M, 0, |
| 468 | + FLASH_TLB_PAGESZ, 1); |
| 469 | + |
| 470 | + /* Invalidate L1 I-cache so new TLB attributes take effect */ |
| 471 | + invalidate_icache(); |
| 472 | + |
| 473 | +#ifdef DEBUG_UART |
| 474 | + wolfBoot_printf("Flash: caching enabled (L1+L2+CPC)\n"); |
430 | 475 | #endif |
431 | 476 | } |
432 | 477 | #endif /* ENABLE_DDR */ |
@@ -594,22 +639,17 @@ void hal_init(void) |
594 | 639 | #ifdef DEBUG_UART |
595 | 640 | hal_ddr_test(); |
596 | 641 | #endif |
597 | | - /* TODO: Implement proper assembly-based stack relocation to DDR. |
598 | | - * The current C-based approach corrupts return addresses because: |
599 | | - * 1. hal_init's return address is saved on CPC SRAM stack |
600 | | - * 2. Stack switch changes SP to DDR (zeroed area) |
601 | | - * 3. CPC release makes old stack contents invalid |
602 | | - * 4. Function returns read garbage addresses |
| 642 | + |
| 643 | + /* Stack is already in DDR (relocated in boot_entry_C via |
| 644 | + * ddr_call_with_stack trampoline before main() was called). |
603 | 645 | * |
604 | | - * For now, keep using CPC SRAM stack (1MB should be enough for P384). |
605 | | - * Stack relocation needs to be done in assembly with proper LR handling. |
606 | | - */ |
607 | | -#if 0 /* Disabled until proper assembly implementation */ |
608 | | - { |
609 | | - hal_relocate_stack_to_ddr(); |
610 | | - hal_reconfigure_cpc_as_cache(); |
611 | | - } |
612 | | -#endif |
| 646 | + * Now release CPC SRAM back to cache mode and enable flash caching. |
| 647 | + * This dramatically improves ECC signature verification performance: |
| 648 | + * - CPC (2MB) becomes L3 cache for all memory accesses |
| 649 | + * - Flash code is cached by L1 I-cache + L2 + CPC |
| 650 | + * - Stack/data in DDR is cached by L1 D-cache + L2 + CPC */ |
| 651 | + hal_reconfigure_cpc_as_cache(); |
| 652 | + hal_flash_enable_caching(); |
613 | 653 | #endif |
614 | 654 | } |
615 | 655 |
|
|
0 commit comments