Skip to content

Commit e142332

Browse files
committed
Progress with IFC Flash driver and multi-core support
1 parent 3397332 commit e142332

4 files changed

Lines changed: 171 additions & 82 deletions

File tree

arch.mk

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1001,6 +1001,7 @@ ifeq ($(TARGET),nxp_t2080)
10011001
LDFLAGS+=$(ARCH_FLAGS)
10021002
LDFLAGS+=-Wl,--hash-style=both # generate both sysv and gnu symbol hash table
10031003
LDFLAGS+=-Wl,--as-needed # remove weak functions not used
1004+
CFLAGS+=-DTEST_FLASH # enable flash erase/write/read test
10041005
OBJS+=src/boot_ppc_mp.o # support for spin table
10051006
UPDATE_OBJS:=src/update_ram.o
10061007
OBJS+=src/fdt.o

hal/nxp_t2080.c

Lines changed: 160 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,13 @@ static void udelay(uint32_t delay_us)
172172
wait_ticks(delay_us * DELAY_US);
173173
}
174174

175+
/* Pre-computed DELAY_US value for ram_udelay() (RAMFUNCTION).
176+
* Must be initialized before any flash operations begin.
177+
* ram_udelay() cannot call DELAY_US macro because it expands to
178+
* hal_get_plat_clk() which lives in flash .text — during flash command
179+
* mode, instruction fetch from flash returns status data, not code. */
180+
static uint32_t ram_delay_ticks;
181+
175182
#if defined(ENABLE_IFC) && !defined(BUILD_LOADER_STAGE1)
176183
static int hal_flash_getid(void)
177184
{
@@ -228,6 +235,12 @@ static void hal_flash_init(void)
228235
* Flash write/erase use RAMFUNCTION to execute from DDR during
229236
* flash command mode (after .ramcode relocation in hal_init). */
230237
#endif /* ENABLE_IFC */
238+
239+
/* Pre-compute timebase ticks per microsecond for ram_udelay().
240+
* Must be done while flash is in read mode — DELAY_US macro expands to
241+
* hal_get_plat_clk() which is in flash .text and cannot be called from
242+
* RAMFUNCTION code during flash command mode. */
243+
ram_delay_ticks = DELAY_US;
231244
}
232245

233246
void hal_ddr_init(void)
@@ -688,46 +701,103 @@ void hal_init(void)
688701
#endif
689702
}
690703

704+
/* Direct UART character output from RAMFUNCTION code.
705+
* Uses inline get8/set8 to access UART registers in CCSR space (own TLB).
706+
* Safe to call during flash command mode — no flash code involved. */
707+
static void RAMFUNCTION ram_putchar(char c)
708+
{
709+
while (!(get8(UART_LSR(UART_SEL)) & UART_LSR_THRE))
710+
;
711+
set8(UART_THR(UART_SEL), c);
712+
}
713+
static void RAMFUNCTION ram_puthex4(uint8_t val)
714+
{
715+
val &= 0xF;
716+
ram_putchar(val < 10 ? '0' + val : 'A' + val - 10);
717+
}
718+
static void RAMFUNCTION ram_puthex16(uint16_t val)
719+
{
720+
ram_puthex4(val >> 12);
721+
ram_puthex4(val >> 8);
722+
ram_puthex4(val >> 4);
723+
ram_puthex4(val);
724+
}
725+
691726
/* RAM-resident microsecond delay using inline timebase reads.
692-
* Cannot call wait_ticks() (in flash .text) from RAMFUNCTION code
693-
* while flash is in command mode — instruction fetch would return garbage. */
727+
* Uses pre-computed ram_delay_ticks (initialized before flash ops).
728+
* Cannot call DELAY_US macro — it expands to hal_get_plat_clk() in flash
729+
* .text. The linker generates a trampoline that jumps to flash; during
730+
* flash command mode, instruction fetch returns status data → crash. */
694731
static void RAMFUNCTION ram_udelay(uint32_t delay_us)
695732
{
696733
uint32_t tbl_start, tbl_now;
697-
uint32_t ticks = delay_us * DELAY_US;
734+
uint32_t ticks = delay_us * ram_delay_ticks;
698735
__asm__ __volatile__("mfspr %0,268" : "=r"(tbl_start));
699736
do {
700737
__asm__ __volatile__("mfspr %0,268" : "=r"(tbl_now));
701738
} while ((tbl_now - tbl_start) < ticks);
702739
}
703740

741+
/* Inline TLB write — all RAMFUNCTION flash code must avoid calling set_tlb(),
742+
* invalidate_dcache(), invalidate_icache() because those live in flash .text.
743+
* During flash command mode, I-cache misses to flash return status data instead
744+
* of instructions → PIL exception. Using mtspr/mfspr macros (inline asm) keeps
745+
* everything in DDR .ramcode. */
746+
static void RAMFUNCTION ram_write_tlb(uint32_t mas0, uint32_t mas1,
747+
uint32_t mas2, uint32_t mas3, uint32_t mas7)
748+
{
749+
mtspr(MAS0, mas0);
750+
mtspr(MAS1, mas1);
751+
mtspr(MAS2, mas2);
752+
mtspr(MAS3, mas3);
753+
mtspr(MAS7, mas7);
754+
__asm__ __volatile__("isync; msync; tlbwe; isync");
755+
}
756+
704757
/* Switch flash TLB to cache-inhibited + guarded for direct flash chip access.
705758
* AMD flash commands require writes to reach the chip immediately and status
706759
* reads to come directly from the chip. With MAS2_M (cacheable), stores go
707760
* through the CPC coherency fabric; IFC does not support coherent writes and
708-
* returns a bus error (DSI). tlbre/tlbwe only modifies MAS2 and is unreliable
709-
* when the entry has IPROT=1; use set_tlb() to rewrite all MAS fields.
710-
* Must be called while flash is still in read-array mode (set_tlb lives in
711-
* flash .text, reachable via longcall while TLB is still M/cacheable). */
761+
* returns a bus error (DSI).
762+
* Fully inlined — no calls to flash-resident set_tlb(). */
712763
static void RAMFUNCTION hal_flash_cache_disable(void)
713764
{
714-
set_tlb(1, 2,
715-
FLASH_BASE_ADDR, FLASH_BASE_ADDR, FLASH_BASE_PHYS_HIGH,
716-
MAS3_SX | MAS3_SW | MAS3_SR, MAS2_I | MAS2_G, 0,
717-
FLASH_TLB_PAGESZ, 1);
765+
ram_write_tlb(
766+
BOOKE_MAS0(1, 2, 0),
767+
BOOKE_MAS1(1, 1, 0, 0, FLASH_TLB_PAGESZ),
768+
BOOKE_MAS2(FLASH_BASE_ADDR, MAS2_I | MAS2_G),
769+
BOOKE_MAS3(FLASH_BASE_ADDR, 0, MAS3_SX | MAS3_SW | MAS3_SR),
770+
BOOKE_MAS7(FLASH_BASE_PHYS_HIGH));
718771
}
719772

720773
/* Restore flash TLB to cacheable mode after flash operation.
721774
* Flash must be back in read-array mode before calling (AMD_CMD_RESET sent).
722-
* Invalidate caches afterward so stale pre-erase data is not served. */
775+
* Invalidate caches afterward so stale pre-erase data is not served.
776+
* Fully inlined — no calls to flash-resident functions. */
723777
static void RAMFUNCTION hal_flash_cache_enable(void)
724778
{
725-
set_tlb(1, 2,
726-
FLASH_BASE_ADDR, FLASH_BASE_ADDR, FLASH_BASE_PHYS_HIGH,
727-
MAS3_SX | MAS3_SW | MAS3_SR, MAS2_M, 0,
728-
FLASH_TLB_PAGESZ, 1);
729-
invalidate_dcache();
730-
invalidate_icache();
779+
uint32_t val;
780+
781+
ram_write_tlb(
782+
BOOKE_MAS0(1, 2, 0),
783+
BOOKE_MAS1(1, 1, 0, 0, FLASH_TLB_PAGESZ),
784+
BOOKE_MAS2(FLASH_BASE_ADDR, MAS2_M),
785+
BOOKE_MAS3(FLASH_BASE_ADDR, 0, MAS3_SX | MAS3_SW | MAS3_SR),
786+
BOOKE_MAS7(FLASH_BASE_PHYS_HIGH));
787+
788+
/* Inline invalidate_dcache() — L1CSR0.CFI */
789+
val = mfspr(L1CSR0);
790+
val |= L1CSR_CFI;
791+
__asm__ __volatile__("msync; isync");
792+
mtspr(L1CSR0, val);
793+
__asm__ __volatile__("isync");
794+
795+
/* Inline invalidate_icache() — L1CSR1.CFI */
796+
val = mfspr(L1CSR1);
797+
val |= L1CSR_CFI;
798+
__asm__ __volatile__("msync; isync");
799+
mtspr(L1CSR1, val);
800+
__asm__ __volatile__("isync");
731801
}
732802

733803
/* Clear IFC write-protect. T2080RM says IFC_CSPR should only be written
@@ -847,38 +917,52 @@ static int RAMFUNCTION hal_flash_status_wait(uint32_t sector, uint16_t mask,
847917
uint32_t timeout = 0;
848918
uint16_t read1, read2;
849919

850-
/* Replicate 8-bit AMD status mask to both bytes for parallel chips */
920+
/* Replicate 8-bit AMD toggle bit to both bytes for parallel chips */
851921
#if FLASH_CFI_WIDTH == 16
852-
uint16_t mask16 = (mask << 8) | mask;
853922
uint16_t toggle16 = (AMD_STATUS_TOGGLE << 8) | AMD_STATUS_TOGGLE;
854923
#else
855-
uint16_t mask16 = mask;
856924
uint16_t toggle16 = AMD_STATUS_TOGGLE;
857925
#endif
926+
(void)mask; /* mask parameter reserved for future DQ7 data polling */
858927

859928
do {
860-
/* detection of completion happens when reading status bits
861-
* DQ6 and DQ2 stop toggling (0x44) */
929+
/* AMD toggle detection: DQ6 toggles on consecutive reads during
930+
* program/erase. When the operation completes, DQ6 reflects actual
931+
* data and consecutive reads return the same value.
932+
* NOTE: Do NOT check programmed data bits against a mask here —
933+
* after write completes, the data depends on what was written, not
934+
* on any fixed status bits. Only erase guarantees 0xFF data. */
862935
#if FLASH_CFI_WIDTH == 16
863936
read1 = FLASH_IO16_READ(sector, 0);
864-
if ((read1 & toggle16) == 0)
865-
read1 = FLASH_IO16_READ(sector, 0);
866937
read2 = FLASH_IO16_READ(sector, 0);
867-
if ((read2 & toggle16) == 0)
868-
read2 = FLASH_IO16_READ(sector, 0);
869938
#else
870939
read1 = FLASH_IO8_READ(sector, 0);
871-
if ((read1 & toggle16) == 0)
872-
read1 = FLASH_IO8_READ(sector, 0);
873940
read2 = FLASH_IO8_READ(sector, 0);
874-
if ((read2 & toggle16) == 0)
875-
read2 = FLASH_IO8_READ(sector, 0);
876941
#endif
942+
/* Print first iteration reads for diagnostics */
943+
if (timeout == 0) {
944+
ram_putchar('[');
945+
ram_puthex16(read1);
946+
ram_putchar(':');
947+
ram_puthex16(read2);
948+
ram_putchar(']');
949+
}
877950
#ifdef DEBUG_FLASH
878951
wolfBoot_printf("Wait toggle %x -> %x\n", read1, read2);
879952
#endif
880-
if (read1 == read2 && ((read1 & mask16) == mask16))
953+
/* DQ6 stopped toggling → operation complete */
954+
if (((read1 ^ read2) & toggle16) == 0)
881955
break;
956+
/* Check DQ5 (error) on both chips while still toggling */
957+
if (read1 & ((AMD_STATUS_ERROR << 8) | AMD_STATUS_ERROR)) {
958+
/* Read one more time to confirm it's not a false DQ5 */
959+
read1 = FLASH_IO16_READ(sector, 0);
960+
read2 = FLASH_IO16_READ(sector, 0);
961+
if (((read1 ^ read2) & toggle16) == 0)
962+
break; /* toggle stopped — was a race, not an error */
963+
ret = -2; /* DQ5 error — program/erase failed */
964+
break;
965+
}
882966
ram_udelay(1);
883967
} while (timeout++ < timeout_us);
884968
if (timeout >= timeout_us) {
@@ -914,13 +998,15 @@ int RAMFUNCTION hal_flash_write(uint32_t address, const uint8_t *data, int len)
914998
#endif
915999

9161000
/* Disable flash caching — AMD commands must reach the chip directly */
1001+
ram_putchar('w'); /* checkpoint: entering write */
9171002
hal_flash_cache_disable();
9181003
hal_flash_clear_wp();
9191004

9201005
/* Reset flash to read-array mode in case previous operation left it
9211006
* in command mode (e.g. after a timeout or incomplete operation) */
9221007
FLASH_IO8_WRITE(0, 0, AMD_CMD_RESET);
9231008
ram_udelay(50);
1009+
ram_putchar('r'); /* checkpoint: reset done, starting loop */
9241010

9251011
pos = 0;
9261012
while (len > 0) {
@@ -939,9 +1025,10 @@ int RAMFUNCTION hal_flash_write(uint32_t address, const uint8_t *data, int len)
9391025
#endif
9401026

9411027
hal_flash_unlock_sector(sector);
1028+
ram_putchar('u'); /* checkpoint: unlock done */
9421029
FLASH_IO8_WRITE(sector, offset, AMD_CMD_WRITE_TO_BUFFER);
943-
/* Word count (N-1) must be replicated to both chips */
9441030
FLASH_IO8_WRITE(sector, offset, (nwords-1));
1031+
ram_putchar('b'); /* checkpoint: buffer cmd + count sent */
9451032

9461033
for (i=0; i<nwords; i++) {
9471034
const uint8_t* ptr = &data[pos];
@@ -952,23 +1039,43 @@ int RAMFUNCTION hal_flash_write(uint32_t address, const uint8_t *data, int len)
9521039
#endif
9531040
pos += (FLASH_CFI_WIDTH/8);
9541041
}
1042+
ram_putchar('l'); /* checkpoint: data loaded */
1043+
/* Ensure all data stores reach IFC before confirm */
1044+
__asm__ __volatile__("sync" ::: "memory");
9551045
FLASH_IO8_WRITE(sector, offset, AMD_CMD_WRITE_BUFFER_CONFIRM);
956-
/* Typical 410us */
1046+
/* Ensure confirm write reaches flash before polling */
1047+
__asm__ __volatile__("sync; isync" ::: "memory");
1048+
ram_putchar('c'); /* checkpoint: confirm sent */
9571049

958-
/* poll for program completion - max 200ms */
1050+
/* poll for program completion - max 200ms (typical 410us) */
9591051
ret = hal_flash_status_wait(sector, 0x44, 200*1000);
1052+
if (ret == 0) {
1053+
ram_putchar('p'); /* pass */
1054+
} else if (ret == -2) {
1055+
ram_putchar('E'); /* DQ5 error */
1056+
} else {
1057+
ram_putchar('T'); /* timeout */
1058+
}
9601059
if (ret != 0) {
961-
/* Reset flash to read-array mode BEFORE calling printf */
1060+
uint16_t readback;
1061+
/* Reset flash to read-array mode BEFORE reading back data */
9621062
FLASH_IO8_WRITE(sector, 0, AMD_CMD_RESET);
9631063
ram_udelay(50);
964-
wolfBoot_printf("Flash Write: Timeout at sector %d\n", sector);
1064+
/* Read back offset 0 to see if data was actually written */
1065+
readback = FLASH_IO16_READ(sector, 0);
1066+
ram_putchar('=');
1067+
ram_puthex16(readback);
1068+
wolfBoot_printf("\nFlash Write: %s at sector %d (ret %d)\n",
1069+
ret == -2 ? "DQ5 error" : "Timeout", sector, ret);
9651070
break;
9661071
}
1072+
ram_putchar('.'); /* checkpoint: page write OK */
9671073

9681074
address += xfer;
9691075
len -= xfer;
9701076
}
9711077

1078+
ram_putchar('d'); /* checkpoint: write loop done */
9721079
/* Restore flash caching — flash is back in read-array mode */
9731080
hal_flash_cache_enable();
9741081
return ret;
@@ -1118,15 +1225,24 @@ static void hal_mp_up(uint32_t bootpg, uint32_t spin_table_ddr)
11181225
wolfBoot_printf("MP: Timeout enabling additional cores!\n");
11191226
}
11201227

1121-
/* Disable all timebases */
1122-
set32(RCPM_PCTBENR, 0);
1228+
/* Synchronize and reset timebase across all cores.
1229+
* On e6500, mtspr to TBL/TBU (SPR 284/285) may cause an illegal
1230+
* instruction exception — skip timebase reset if secondary cores
1231+
* did not start (timebase sync only matters for multi-core). */
1232+
if ((active_cores & all_cores) == all_cores) {
1233+
/* Disable all timebases */
1234+
set32(RCPM_PCTBENR, 0);
11231235

1124-
/* Reset our timebase */
1125-
mtspr(SPRN_TBWU, 0);
1126-
mtspr(SPRN_TBWL, 0);
1236+
/* Reset our timebase */
1237+
mtspr(SPRN_TBWU, 0);
1238+
mtspr(SPRN_TBWL, 0);
11271239

1128-
/* Enable timebase for all cores */
1129-
set32(RCPM_PCTBENR, all_cores);
1240+
/* Enable timebase for all cores */
1241+
set32(RCPM_PCTBENR, all_cores);
1242+
} else {
1243+
/* Only re-enable timebase for boot core */
1244+
set32(RCPM_PCTBENR, (1 << whoami));
1245+
}
11301246
}
11311247

11321248
static void hal_mp_init(void)

src/boot_ppc.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -185,7 +185,7 @@ void boot_entry_C(void)
185185
}
186186

187187
/* Run wolfBoot! */
188-
#ifdef ENABLE_DDR
188+
#if defined(ENABLE_DDR) && defined(DDR_STACK_TOP)
189189
/* DDR is initialized, .data and .bss are set up.
190190
* Switch stack from CPC SRAM to DDR for:
191191
* 1. Better performance (DDR stack is cacheable by L1/L2/CPC)

src/boot_ppc_mp.S

Lines changed: 9 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -132,43 +132,15 @@ branch_prediction:
132132
mtspr L1CSR2, r8
133133

134134
#if defined(CORE_E6500) /* --- L2 E6500 --- */
135-
ccsr_tlb_mp:
136-
/* e6500 L2 uses memory-mapped CCSR registers (L2_CLUSTER_BASE).
137-
* Secondary cores have no TLBs on entry — only the boot page
138-
* translation provides initial access. Add a temporary CCSR
139-
* mapping (TLB1 entry 2) so L2 setup can access the registers. */
140-
set_tlb(1, 2,
141-
CCSRBAR, CCSRBAR, CCSRBAR_PHYS_HIGH,
142-
MAS3_SW | MAS3_SR, MAS2_I | MAS2_G, 0,
143-
CCSRBAR_SIZE, 0, r11)
144-
l2_setup_cache:
145-
/* E6500CORERM: 11.7 L2 cache state */
146-
/* R5 = L2 cluster 1 base */
147-
lis r5, L2_CLUSTER_BASE(0)@h
148-
ori r5, r5, L2_CLUSTER_BASE(0)@l
149-
/* Invalidate and clear locks */
150-
lis r1, (L2CSR0_L2FI | L2CSR0_L2LFC)@h
151-
ori r1, r1, (L2CSR0_L2FI | L2CSR0_L2LFC)@l
152-
sync
153-
stw r1, L2CSR0(r5)
154-
155-
/* poll till invalidate and lock bits are cleared */
156-
l2_poll_invclear:
157-
lwz r4, L2CSR0(r5)
158-
and. r4, r1, r4
159-
bne l2_poll_invclear
160-
isync
161-
162-
/* set stash id to (coreID * 2) + 32 + L2 (1) */
163-
addi r3, r8,1
164-
stw r3, L2CSR1(r5)
165-
166-
/* enable L2 with parity */
167-
sync
168-
isync
169-
lis r4, (L2CSR0_L2E | L2CSR0_L2PE)@h
170-
stw r4, L2CSR0(r5)
171-
isync
135+
/* e6500 L2 is per-cluster (shared by all cores in the cluster).
136+
* The primary core already invalidated and enabled L2 during boot.
137+
* Secondary cores must NOT do L2FI (flash invalidate) — it discards
138+
* ALL dirty L2 lines including the primary core's stack, return
139+
* addresses, and cached code, causing the primary core to crash
140+
* (typically SRR0=0 from corrupted return address).
141+
* L1 stash ID (set above via L1CSR2 SPR) is per-core and sufficient.
142+
* L2CSR1 (stash ID) is per-cluster and already set by core 0.
143+
* No CCSR TLB mapping needed since we skip L2 register access. */
172144

173145
#elif defined(CORE_E5500) /* --- L2 E5500 --- */
174146
l2_setup_cache:

0 commit comments

Comments
 (0)